├── .editorconfig ├── .gitignore ├── .travis.yml ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── benches └── rulinalg_vs_native.rs ├── examples ├── chapter10.rs ├── chapter11.rs ├── chapter12.rs ├── chapter13.rs ├── chapter14.rs ├── chapter15.rs ├── chapter3.rs ├── chapter4.rs ├── chapter5.rs ├── chapter6.rs ├── chapter8.rs └── chapter9.rs └── src ├── activations.rs ├── layers.rs ├── lib.rs ├── losses.rs ├── optimizers.rs └── tensor.rs /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | indent_style = space 5 | indent_size = 4 6 | charset = utf-8 7 | trim_trailing_whitespace = true 8 | insert_final_newline = true 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | **/*.rs.bk 3 | Cargo.lock 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: rust 2 | 3 | rust: 4 | - stable 5 | - beta 6 | - nightly 7 | 8 | cache: cargo 9 | 10 | script: 11 | - cargo test 12 | 13 | after_success: 14 | - if [ "$TRAVIS_RUST_VERSION" == "nightly" ]; then 15 | cargo bench; 16 | fi 17 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "grokking-deep-learning-rs" 3 | version = "0.1.0" 4 | authors = ["Suyash "] 5 | edition = "2018" 6 | 7 | [dependencies] 8 | datasets = { git = "https://github.com/suyash/datasets" } 9 | rand = "0.6.4" 10 | rulinalg = "0.4.2" 11 | 12 | [dev-dependencies] 13 | indicatif = "0.11.0" 14 | # paillier = { version = "0.2.0", default-features = false, features = ["usegmp", "keygen"] } 15 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2018 Suyash 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Suyash 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Grokking Deep Learning Rust 2 | 3 | [![Build Status](https://travis-ci.com/suyash/grokking-deep-learning-rs.svg?branch=master)](https://travis-ci.com/suyash/grokking-deep-learning-rs) 4 | 5 | The exercises from the [@iamtrask](https://iamtrask.github.io) book [Grokking Deep Learning](https://manning.com/books/grokking-deep-learning), implemented in Rust. 6 | 7 | This crate isn't published, because ideally you'd do this on your own, but if you insist: 8 | 9 | ``` 10 | cargo add grokking_deep_learning_rs --git https://github.com/suyash/grokking-deep-learning-rs 11 | ``` 12 | 13 | This crate is structured as a library: the core library provides the common primitives used throughout, and the individual chapters are implemented as examples. To run the exercises for a particular chapter, for example chapter 12: 14 | 15 | ``` 16 | cargo run --example chapter12 17 | ``` 18 | 19 | Currently this uses [rulinalg](https://docs.rs/rulinalg) for matrix operations, which uses a Rust implementation of `dgemm` and provides roughly a 3x speedup over naive ijk multiplication (see the included benchmark). However, it still isn't as fast as numpy because it isn't multi-threaded. I'm currently working on an alternative of my own. 20 | 21 | The __datasets__ are extracted into a [separate library crate](https://github.com/suyash/datasets), which currently provides functions for loading 4 datasets and an iterator for batching and shuffling. More are planned. It can be added using 22 | 23 | ``` 24 | cargo add datasets --git https://github.com/suyash/datasets 25 | ``` 26 | 27 | As a result of the slower matmul, certain examples from chapter 8 onwards are smaller in size than the Python versions. 28 | 29 | The Chapter 13 core components were extracted into the core library, so they could be used in later chapters.
30 | 31 | So, something like this: 32 | 33 | ```rust 34 | use rulinalg::matrix::Matrix; 35 | 36 | use grokking_deep_learning_rs::activations::{Sigmoid, Tanh}; 37 | use grokking_deep_learning_rs::layers::{Layer, Linear, Sequential}; 38 | use grokking_deep_learning_rs::losses::{Loss, MSELoss}; 39 | use grokking_deep_learning_rs::optimizers::{Optimizer, SGDOptimizer}; 40 | use grokking_deep_learning_rs::tensor::Tensor; 41 | 42 | let data = Tensor::new_const(Matrix::new( 43 | 4, 44 | 2, 45 | vec![0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0], 46 | )); 47 | 48 | let target = Tensor::new_const(Matrix::new(4, 1, vec![0.0, 1.0, 0.0, 1.0])); 49 | 50 | let model = Sequential::new(vec![ 51 | Box::new(Linear::new(2, 3, false)), 52 | Box::new(Tanh), 53 | Box::new(Linear::new(3, 1, false)), 54 | Box::new(Sigmoid), 55 | ]); 56 | 57 | let criterion = MSELoss; 58 | let optim = SGDOptimizer::new(model.parameters(), 0.5); 59 | 60 | for _ in 0..10 { 61 | let pred = model.forward(&[&data]); 62 | 63 | // compare 64 | let loss = criterion.forward(&pred[0], &target); 65 | 66 | println!("Loss: {:?}", loss.0.borrow().data.data()); 67 | 68 | // calculate difference 69 | loss.backward(Tensor::grad(Matrix::ones(1, 1))); 70 | 71 | // learn 72 | optim.step(true); 73 | } 74 | ``` 75 | 76 | In Chapter 14, the RNN and LSTM examples suffer from vanishing gradients and the loss keeps going to NaN. There seems to be a logic error somewhere, where something is not doing what I think it does; I am still investigating. I tried reproducing the problem in the chapter 13 final exercise and also implemented [min-char-rnn.py](https://gist.github.com/karpathy/d4dee566867f8291f086) in [Rust](https://gist.github.com/suyash/07b2ae4822f717d3edadb09a0f79ec57), but no luck so far. 77 | 78 | For Chapter 15, the encrypted federated learning exercise is not implemented. [There is a crate](https://crates.io/crates/paillier) for Paillier homomorphic encryption, but its current implementation only works with integers and BigInts, not floating-point numbers. I will look into how to get it to work. 79 | 80 | # License 81 | 82 | This project is licensed under either of 83 | 84 | * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or 85 | http://www.apache.org/licenses/LICENSE-2.0) 86 | * MIT license ([LICENSE-MIT](LICENSE-MIT) or 87 | http://opensource.org/licenses/MIT) 88 | 89 | at your option. 90 | 91 | ### Contribution 92 | 93 | Unless you explicitly state otherwise, any contribution intentionally submitted 94 | for inclusion in this work by you, as defined in the Apache-2.0 license, shall be 95 | dual licensed as above, without any additional terms or conditions.
96 | -------------------------------------------------------------------------------- /benches/rulinalg_vs_native.rs: -------------------------------------------------------------------------------- 1 | #![feature(test)] 2 | 3 | extern crate test; 4 | 5 | use test::{black_box, Bencher}; 6 | 7 | use std::ops::Mul; 8 | 9 | use rulinalg::matrix::Matrix; 10 | 11 | use grokking_deep_learning_rs::matrix_matrix_dot; 12 | 13 | #[bench] 14 | fn bench_normal(b: &mut Bencher) { 15 | b.iter(|| { 16 | let m1 = vec![vec![1.0, 2.0], vec![3.0, 4.0], vec![5.0, 6.0]]; 17 | let m2 = vec![vec![1.0, 2.0, 3.0], vec![4.0, 5.0, 6.0]]; 18 | black_box(matrix_matrix_dot(&m1, &m2)); 19 | }); 20 | } 21 | 22 | #[bench] 23 | fn bench_rulinalg(b: &mut Bencher) { 24 | b.iter(|| { 25 | let m1 = Matrix::new(3, 2, vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]); 26 | let m2 = Matrix::new(2, 3, vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]); 27 | black_box(m1.mul(m2)); 28 | }); 29 | } 30 | -------------------------------------------------------------------------------- /examples/chapter10.rs: -------------------------------------------------------------------------------- 1 | //! Chapter10 - Intro to Convolutional Neural Networks - Learning Edges and Corners.ipynb 2 | //! 3 | //! https://github.com/iamtrask/Grokking-Deep-Learning/blob/master/Chapter10%20-%20Intro%20to%20Convolutional%20Neural%20Networks%20-%20Learning%20Edges%20and%20Corners.ipynb 4 | 5 | use std::error::Error; 6 | use std::ops::Mul; 7 | 8 | use datasets::image::mnist; 9 | use datasets::Dataset; 10 | use indicatif::{ProgressBar, ProgressStyle}; 11 | use rand::distributions::Standard; 12 | use rulinalg::matrix::{BaseMatrix, Matrix, MatrixSlice}; 13 | 14 | use grokking_deep_learning_rs::{ 15 | argmax, generate_random_vector, sample_bernoulli_trials, softmax_mut, tanh_derivative, tanh_mut, 16 | }; 17 | 18 | fn main() { 19 | println!("\nUpgrading our MNIST Network\n"); 20 | mnist_tanh(0.5).unwrap(); 21 | } 22 | 23 | #[allow(unused_doc_comments)] 24 | fn mnist_tanh(keep_probability: f64) -> Result<(), Box> { 25 | let (train_data, test_data) = mnist()?; 26 | 27 | let train_dataset_size = 1024; 28 | let test_dataset_size = 1024; 29 | 30 | let batch_size = 64; // 128 in the numpy version 31 | 32 | let (kernel_rows, kernel_cols) = (3, 3); 33 | let num_kernels = 4; // 16 in the numpy version 34 | 35 | let (train_images, train_labels) = process_mnist_filtered_dataset( 36 | train_data, 37 | train_dataset_size, 38 | batch_size, 39 | kernel_rows, 40 | kernel_cols, 41 | ); 42 | 43 | let (test_images, test_labels) = process_mnist_filtered_dataset( 44 | test_data, 45 | test_dataset_size, 46 | batch_size, 47 | kernel_rows, 48 | kernel_cols, 49 | ); 50 | 51 | let mut kernels = Matrix::new( 52 | kernel_rows * kernel_cols, 53 | num_kernels, 54 | generate_random_vector( 55 | kernel_rows * kernel_cols * num_kernels, 56 | 0.02, 57 | -0.01, 58 | &Standard, 59 | ), 60 | ); 61 | 62 | let mut weights_1_2 = Matrix::new( 63 | (28 - kernel_rows) * (28 - kernel_cols) * num_kernels, 64 | 10, 65 | generate_random_vector( 66 | (28 - kernel_rows) * (28 - kernel_cols) * num_kernels * 10, 67 | 0.2, 68 | -0.1, 69 | &Standard, 70 | ), 71 | ); 72 | 73 | let alpha = 2.0; 74 | 75 | let iterations = 100; 76 | let progress = ProgressBar::new(iterations as u64); 77 | progress.set_style( 78 | ProgressStyle::default_bar() 79 | .template("{msg} {bar:40.cyan/blue} {pos:>7}/{len:7} [{elapsed_precise}]"), 80 | ); 81 | 82 | for it in 0..iterations { 83 | let mut accuracy = 0.0; 84 | 85 | for (images, labels) in 
train_images.iter().zip(train_labels.iter()) { 86 | let labels = 87 | unsafe { MatrixSlice::from_raw_parts(labels.as_ptr(), batch_size, 10, 10) }; 88 | 89 | let expanded_input_batch_size = images.len() * images[0].len(); 90 | 91 | let expanded_input: Vec = images 92 | .iter() 93 | .flat_map(|kernel_inputs| kernel_inputs.iter()) 94 | .flat_map(|kernel_inputs| kernel_inputs.iter().cloned()) 95 | .collect(); 96 | 97 | // [batch_size * 625, 9] 98 | let expanded_input = Matrix::new( 99 | expanded_input_batch_size, 100 | kernel_rows * kernel_cols, 101 | expanded_input, 102 | ); 103 | 104 | // [batch_size * 625, 16] 105 | let kernel_output = (&expanded_input).mul(&kernels); 106 | 107 | // [batch_size, 625 * 16] 108 | // NOTE: this is the flatten step 109 | let mut hidden_layer = Matrix::new( 110 | batch_size, 111 | (28 - kernel_rows) * (28 - kernel_cols) * num_kernels, 112 | kernel_output.into_vec(), 113 | ); 114 | 115 | /// Activation 116 | tanh_mut(&mut hidden_layer); 117 | 118 | /// Dropout 119 | let dropout_mask: Vec = sample_bernoulli_trials( 120 | keep_probability, 121 | batch_size * (28 - kernel_rows) * (28 - kernel_cols) * num_kernels, 122 | ) 123 | .into_iter() 124 | .map(|v| v * (1.0 / keep_probability)) 125 | .collect(); 126 | 127 | let dropout_mask = Matrix::new( 128 | batch_size, 129 | (28 - kernel_rows) * (28 - kernel_cols) * num_kernels, 130 | dropout_mask, 131 | ); 132 | 133 | let hidden_layer = hidden_layer.elemul(&dropout_mask); 134 | 135 | /// Final Outputs 136 | // [batch_size, 10] 137 | let mut predictions = (&hidden_layer).mul(&weights_1_2); 138 | softmax_mut(&mut predictions); 139 | 140 | /// NOTE: no error calculation still 141 | 142 | /// Accuracy 143 | for (r1, r2) in predictions.row_iter().zip(labels.row_iter()) { 144 | accuracy += if argmax(r1.raw_slice()) == argmax(r2.raw_slice()) { 145 | 1.0 146 | } else { 147 | 0.0 148 | } 149 | } 150 | 151 | /// delta_2_1 152 | let mut delta_2_1 = Matrix::new(batch_size, 10, vec![0.0; batch_size * 10]); 153 | for i in 0..batch_size { 154 | for j in 0..10 { 155 | delta_2_1[[i, j]] = 156 | (predictions[[i, j]] - labels[[i, j]]) / ((batch_size * batch_size) as f64); 157 | } 158 | } 159 | 160 | /// delta_1_0 161 | let mut delta_1_0 = (&delta_2_1) 162 | .mul(weights_1_2.transpose()) 163 | .elemul(&tanh_derivative(&hidden_layer)); 164 | 165 | for i in 0..batch_size { 166 | for j in 0..((28 - kernel_rows) * (28 - kernel_cols) * num_kernels) { 167 | delta_1_0[[i, j]] *= dropout_mask[[i, j]]; 168 | } 169 | } 170 | 171 | /// update weights_1_2 172 | let weight_delta_1_2 = hidden_layer.transpose().mul(delta_2_1); 173 | for i in 0..((28 - kernel_rows) * (28 - kernel_cols) * num_kernels) { 174 | for j in 0..10 { 175 | weights_1_2[[i, j]] -= alpha * weight_delta_1_2[[i, j]]; 176 | } 177 | } 178 | 179 | /// update weights_0_1 180 | // reorient delta_1_0 181 | let delta_1_0 = Matrix::new( 182 | batch_size * (28 - kernel_rows) * (28 - kernel_cols), 183 | num_kernels, 184 | delta_1_0.into_vec(), 185 | ); 186 | 187 | let weight_delta_0_1 = expanded_input.transpose().mul(delta_1_0); 188 | for i in 0..(kernel_rows * kernel_cols) { 189 | for j in 0..num_kernels { 190 | kernels[[i, j]] -= alpha * weight_delta_0_1[[i, j]]; 191 | } 192 | } 193 | } 194 | 195 | let mut test_accuracy = 0.0; 196 | 197 | if (it + 1) % 10 == 0 { 198 | for (images, labels) in test_images.iter().zip(test_labels.iter()) { 199 | let labels = 200 | unsafe { MatrixSlice::from_raw_parts(labels.as_ptr(), batch_size, 10, 10) }; 201 | 202 | let expanded_input_batch_size = images.len() * 
images[0].len(); 203 | 204 | let expanded_input: Vec = images 205 | .iter() 206 | .flat_map(|kernel_inputs| kernel_inputs.iter()) 207 | .flat_map(|kernel_inputs| kernel_inputs.iter().cloned()) 208 | .collect(); 209 | 210 | // [batch_size * 625, 9] 211 | let expanded_input = Matrix::new( 212 | expanded_input_batch_size, 213 | kernel_rows * kernel_cols, 214 | expanded_input, 215 | ); 216 | 217 | // [batch_size * 625, 16] 218 | let kernel_output = expanded_input.mul(&kernels); 219 | 220 | // [batch_size, 625 * 16] 221 | // NOTE: this is the flatten step 222 | let mut hidden_layer = Matrix::new( 223 | batch_size, 224 | (28 - kernel_rows) * (28 - kernel_cols) * num_kernels, 225 | kernel_output.into_vec(), 226 | ); 227 | 228 | /// Activation 229 | tanh_mut(&mut hidden_layer); 230 | 231 | /// Dropout 232 | let dropout_mask: Vec = sample_bernoulli_trials( 233 | keep_probability, 234 | batch_size * (28 - kernel_rows) * (28 - kernel_cols) * num_kernels, 235 | ) 236 | .into_iter() 237 | .map(|v| v * (1.0 / keep_probability)) 238 | .collect(); 239 | 240 | let dropout_mask = Matrix::new( 241 | batch_size, 242 | (28 - kernel_rows) * (28 - kernel_cols) * num_kernels, 243 | dropout_mask, 244 | ); 245 | 246 | let hidden_layer = hidden_layer.elemul(&dropout_mask); 247 | 248 | /// Final Outputs 249 | // [batch_size, 10] 250 | let mut predictions = hidden_layer.mul(&weights_1_2); 251 | softmax_mut(&mut predictions); 252 | 253 | /// NOTE: no error calculation still 254 | 255 | /// Accuracy 256 | for (r1, r2) in predictions.row_iter().zip(labels.row_iter()) { 257 | test_accuracy += if argmax(r1.raw_slice()) == argmax(r2.raw_slice()) { 258 | 1.0 259 | } else { 260 | 0.0 261 | } 262 | } 263 | } 264 | 265 | progress.println(format!( 266 | "Iteration: {}, Train Accuracy: {}, Test Accuracy: {}", 267 | it + 1, 268 | accuracy / (train_dataset_size as f64), 269 | test_accuracy / (test_dataset_size as f64), 270 | )); 271 | } 272 | 273 | progress.inc(1); 274 | progress.set_message(&format!( 275 | "Train Accuracy: {}", 276 | accuracy / (train_dataset_size as f64), 277 | )); 278 | } 279 | 280 | Ok(()) 281 | } 282 | 283 | #[allow(clippy::type_complexity)] 284 | fn process_mnist_filtered_dataset( 285 | dataset: impl Dataset, u8)>, 286 | dataset_size: usize, 287 | batch_size: usize, 288 | kernel_rows: usize, 289 | kernel_cols: usize, 290 | ) -> (Vec>>>, Vec>) { 291 | let (images, labels): (Vec>, Vec) = dataset.take(dataset_size).unzip(); 292 | 293 | // extract kernel sized image sections from images 294 | // [_, batch, kernels, kernel_image] 295 | let images = images 296 | .into_iter() 297 | .map(|img| { 298 | // convert each image into a vectors of kernel inputs of size 3x3 299 | 300 | let mut kernel_inputs = Vec::with_capacity((28 - kernel_rows) * (28 - kernel_cols)); 301 | 302 | for i in 0..(28 - kernel_rows) { 303 | for j in 0..(28 - kernel_cols) { 304 | let mut kernel_input = vec![0.0; kernel_rows * kernel_cols]; 305 | 306 | for k in 0..kernel_rows { 307 | for l in 0..kernel_cols { 308 | kernel_input[k * kernel_cols + l] = 309 | f64::from(img[(i + k) * 28 + (j + l)]); 310 | } 311 | } 312 | 313 | kernel_inputs.push(kernel_input); 314 | } 315 | } 316 | 317 | kernel_inputs 318 | }) 319 | .batch(batch_size, false) 320 | .collect(); 321 | 322 | // [_, batch, label] 323 | let labels = labels 324 | .into_iter() 325 | .map(|l| { 326 | let mut v = vec![0.0; 10]; 327 | v[l as usize] = 1.0; 328 | v 329 | }) 330 | .batch(batch_size, false) 331 | // flatten each batch so it can be converted to MatrixSlice easily 332 | .map(|b| 
b.into_iter().flat_map(|v| v.into_iter()).collect()) 333 | .collect(); 334 | 335 | (images, labels) 336 | } 337 | -------------------------------------------------------------------------------- /examples/chapter11.rs: -------------------------------------------------------------------------------- 1 | //! Chapter11 - Intro to Word Embeddings - Neural Networks that Understand Language.ipynb 2 | //! 3 | //! https://github.com/iamtrask/Grokking-Deep-Learning/blob/master/Chapter11%20-%20Intro%20to%20Word%20Embeddings%20-%20Neural%20Networks%20that%20Understand%20Language.ipynb 4 | 5 | use std::cmp::{max, min, Ordering}; 6 | use std::collections::{BTreeMap, BTreeSet}; 7 | use std::error::Error; 8 | use std::iter::FromIterator; 9 | use std::ops::Mul; 10 | 11 | use datasets::text::imdb_reviews; 12 | use datasets::Dataset; 13 | use indicatif::{ProgressBar, ProgressStyle}; 14 | use rand::distributions::Standard; 15 | use rulinalg::matrix::{BaseMatrix, Matrix}; 16 | 17 | use grokking_deep_learning_rs::{generate_random_vector, sigmoid_mut}; 18 | 19 | fn main() -> Result<(), Box> { 20 | let (train_dataset, test_dataset) = imdb_reviews()?; 21 | let train_dataset_size = 2000; 22 | let test_dataset_size = 2000; 23 | 24 | let (train_reviews, train_labels): (Vec<_>, Vec<_>) = train_dataset 25 | .shuffle(25000, 0) 26 | .map(|(s, l): (String, u8)| (s.to_lowercase(), l)) 27 | // currently only considering alphabets and nothing else. 28 | .map(|(s, l)| { 29 | ( 30 | s.chars() 31 | .map(|c| if c >= 'a' && c <= 'z' { c } else { ' ' }) 32 | .collect(), 33 | l, 34 | ) 35 | }) 36 | .take(train_dataset_size) 37 | .unzip(); 38 | 39 | let (test_reviews, test_labels): (Vec, Vec<_>) = test_dataset 40 | .shuffle(25000, 0) 41 | .map(|(s, l)| (s.to_lowercase(), l)) 42 | .take(test_dataset_size) 43 | .unzip(); 44 | 45 | // can't immutably borrow here 46 | let words = train_reviews 47 | .iter() 48 | .flat_map(|s: &String| s.split_whitespace().filter(|w| !w.is_empty())); 49 | 50 | let words = BTreeSet::from_iter(words); 51 | 52 | let len = words.len(); 53 | // 0 => UNK, 1 => PAD 54 | let word_index = BTreeMap::from_iter(words.into_iter().zip(2..(len + 2))); 55 | println!("Found {} words", word_index.len()); 56 | 57 | let train_reviews = encode_sentences(&train_reviews, &word_index); 58 | let train_labels: Vec<_> = encode_labels(train_labels); 59 | 60 | let test_reviews = encode_sentences(&test_reviews, &word_index); 61 | let test_labels: Vec<_> = encode_labels(test_labels); 62 | 63 | let embeddings = net_with_embedding_layer( 64 | (&train_reviews, &train_labels), 65 | (&test_reviews, &test_labels), 66 | len + 2, 67 | ); 68 | 69 | show_similar_embeddings("beautiful", &word_index, &embeddings); 70 | show_similar_embeddings("terrible", &word_index, &embeddings); 71 | 72 | let embeddings = filling_in_the_blank(&train_reviews, &word_index); 73 | 74 | show_similar_embeddings("beautiful", &word_index, &embeddings); 75 | show_similar_embeddings("terrible", &word_index, &embeddings); 76 | 77 | analogies(["terrible", "good"], "bad", &word_index, &embeddings); 78 | analogies(["elizabeth", "he"], "she", &word_index, &embeddings); 79 | 80 | Ok(()) 81 | } 82 | 83 | fn encode_sentences(v: &[String], word_index: &BTreeMap<&str, usize>) -> Vec> { 84 | v.iter() 85 | .map(|s| { 86 | let mut encoding = Vec::new();; 87 | 88 | for word in s.split_whitespace() { 89 | if word_index.contains_key(word) { 90 | encoding.push(word_index[word]); 91 | } else { 92 | encoding.push(0); 93 | } 94 | } 95 | 96 | encoding 97 | }) 98 | .collect() 99 | } 100 | 101 
| fn encode_labels(labels: Vec) -> Vec { 102 | labels 103 | .into_iter() 104 | .map(|l| if l > 5 { 1.0 } else { 0.0 }) 105 | .collect() 106 | } 107 | 108 | #[allow(clippy::float_cmp)] 109 | fn net_with_embedding_layer( 110 | (train_reviews, train_labels): (&[Vec], &[f64]), 111 | (test_reviews, test_labels): (&[Vec], &[f64]), 112 | vocab_size: usize, 113 | ) -> Matrix { 114 | let hidden_size = 100; 115 | 116 | let mut embeddings = Matrix::new( 117 | vocab_size, 118 | hidden_size, 119 | generate_random_vector(vocab_size * hidden_size, 0.2, -0.1, &Standard), 120 | ); 121 | 122 | let mut weights_1_2 = Matrix::new( 123 | hidden_size, 124 | 1, 125 | generate_random_vector(hidden_size, 0.2, -0.1, &Standard), 126 | ); 127 | 128 | let alpha = 0.01; 129 | 130 | let iterations = 15; 131 | 132 | for _ in 0..iterations { 133 | let mut train_accuracy = 0.0; 134 | let mut total = 0.0; 135 | 136 | let progress = ProgressBar::new(train_reviews.len() as u64); 137 | progress.set_style( 138 | ProgressStyle::default_bar() 139 | .template("{msg} {bar:40.cyan/blue} {pos:>7}/{len:7} [{elapsed_precise}]"), 140 | ); 141 | 142 | for (review, label) in train_reviews.iter().zip(train_labels.iter()) { 143 | // take embeddings 144 | let mut hidden_layer = Matrix::new(1, hidden_size, vec![0.0; hidden_size]); 145 | for ix in review.iter() { 146 | for j in 0..hidden_size { 147 | hidden_layer[[0, j]] += embeddings[[*ix, j]]; 148 | } 149 | } 150 | sigmoid_mut(&mut hidden_layer); 151 | 152 | let mut prediction = (&hidden_layer).mul(&weights_1_2); 153 | sigmoid_mut(&mut prediction); 154 | 155 | let delta_2_1 = Matrix::new(1, 1, vec![prediction[[0, 0]] - label]); 156 | let delta_1_0 = (&delta_2_1).mul(weights_1_2.transpose()); 157 | 158 | if prediction[[0, 0]].round() == *label { 159 | train_accuracy += 1.0; 160 | } 161 | 162 | total += 1.0; 163 | 164 | let weight_deltas_1_2 = hidden_layer.transpose().mul(delta_2_1); 165 | 166 | for i in 0..hidden_size { 167 | weights_1_2[[i, 0]] -= alpha * weight_deltas_1_2[[i, 0]]; 168 | } 169 | 170 | for ix in review.iter() { 171 | for j in 0..hidden_size { 172 | embeddings[[*ix, j]] -= alpha * delta_1_0[[0, j]]; 173 | } 174 | } 175 | 176 | progress.inc(1); 177 | progress.set_message(&format!("Train Accuracy: {}", train_accuracy / total)); 178 | } 179 | 180 | progress.finish(); 181 | } 182 | 183 | println!("\nEvaluating on Test Dataset\n"); 184 | 185 | let progress = ProgressBar::new(test_reviews.len() as u64); 186 | progress.set_style( 187 | ProgressStyle::default_bar() 188 | .template("{msg} {bar:40.cyan/blue} {pos:>7}/{len:7} [{elapsed_precise}]"), 189 | ); 190 | 191 | let mut test_accuracy = 0.0; 192 | let mut total = 0.0; 193 | 194 | for (review, label) in test_reviews.iter().zip(test_labels.iter()) { 195 | // take embeddings 196 | let mut hidden_layer = Matrix::new(1, hidden_size, vec![0.0; hidden_size]); 197 | for ix in review.iter() { 198 | for j in 0..hidden_size { 199 | hidden_layer[[0, j]] += embeddings[[*ix, j]]; 200 | } 201 | } 202 | sigmoid_mut(&mut hidden_layer); 203 | 204 | let mut prediction = (&hidden_layer).mul(&weights_1_2); 205 | sigmoid_mut(&mut prediction); 206 | 207 | if prediction[[0, 0]].round() == *label { 208 | test_accuracy += 1.0; 209 | } 210 | 211 | total += 1.0; 212 | 213 | progress.inc(1); 214 | progress.set_message(&format!("Test Accuracy: {}", test_accuracy / total)); 215 | } 216 | 217 | progress.finish(); 218 | 219 | embeddings 220 | } 221 | 222 | fn show_similar_embeddings( 223 | word: &str, 224 | word_index: &BTreeMap<&str, usize>, 225 | embeddings: 
&Matrix, 226 | ) { 227 | if !word_index.contains_key(word) { 228 | println!("index does not have {}", word); 229 | } else { 230 | let ix = word_index[word]; 231 | let word_embeddings = embeddings.row(ix); 232 | 233 | let sims = get_similar_embeddings(word_embeddings.raw_slice(), word_index, embeddings); 234 | 235 | println!("\nWords Similar to {}:\n", word); 236 | for i in sims.iter().take(10) { 237 | println!("{}: {}", i.0, i.1); 238 | } 239 | } 240 | } 241 | 242 | fn get_similar_embeddings<'a>( 243 | row: &[f64], 244 | word_index: &'a BTreeMap<&str, usize>, 245 | embeddings: &'a Matrix, 246 | ) -> Vec<(&'a str, f64)> { 247 | let mut sims = Vec::with_capacity(word_index.len()); 248 | 249 | for (word, ix) in word_index.iter() { 250 | let mut distance = 0.0; 251 | 252 | for (a, b) in row.iter().zip(embeddings.row(*ix).iter()) { 253 | distance += (a - b).powi(2); 254 | } 255 | 256 | sims.push((word.to_owned(), distance.sqrt())); 257 | } 258 | 259 | sims.sort_by(|a: &(&str, f64), b: &(&str, f64)| { 260 | if a.1 < b.1 { 261 | Ordering::Less 262 | } else if a.1 > b.1 { 263 | Ordering::Greater 264 | } else { 265 | Ordering::Equal 266 | } 267 | }); 268 | 269 | sims 270 | } 271 | 272 | fn filling_in_the_blank( 273 | train_reviews: &[Vec], 274 | word_index: &BTreeMap<&str, usize>, 275 | ) -> Matrix { 276 | let concatenated: Vec = train_reviews.iter().flat_map(|v| v).cloned().collect(); 277 | 278 | // NOTE: inputs are already shuffled 279 | 280 | let hidden_size = 50; 281 | let (negative_samples, window_size) = (5, 2); 282 | let alpha = 0.05; 283 | 284 | let iterations = 2; 285 | 286 | let mut weights_0_1 = Matrix::new( 287 | word_index.len() + 2, 288 | hidden_size, 289 | generate_random_vector((word_index.len() + 2) * hidden_size, 0.2, -0.1, &Standard), 290 | ); 291 | 292 | let mut weights_1_2: Matrix = Matrix::zeros(word_index.len() + 2, hidden_size); 293 | 294 | let mut outputs = Matrix::new(1, negative_samples + 1, vec![0.0; negative_samples + 1]); 295 | outputs[[0, 0]] = 1.0; 296 | 297 | for _ in 0..iterations { 298 | let progress = ProgressBar::new(train_reviews.len() as u64); 299 | progress.set_style( 300 | ProgressStyle::default_bar() 301 | .template("{bar:40.cyan/blue} {pos:>7}/{len:7} [{elapsed_precise}]"), 302 | ); 303 | 304 | for review in train_reviews.iter() { 305 | for target_ix in 0..review.len() { 306 | let mut target_samples = vec![review[target_ix]]; 307 | target_samples.append( 308 | &mut generate_random_vector(negative_samples, 1.0, 0.0, &Standard) 309 | .into_iter() 310 | .map(|x| (x * (concatenated.len() as f64)) as usize) 311 | .map(|ix| concatenated[ix]) 312 | .collect(), 313 | ); 314 | 315 | let left_window_start = 316 | max(0, (target_ix as isize) - (window_size as isize)) as usize; 317 | let right_window_end = min(target_ix + window_size, review.len()); 318 | 319 | let left_window: Vec = (left_window_start..target_ix) 320 | .map(|ix| review[ix]) 321 | .collect(); 322 | let right_window: Vec = ((target_ix + 1)..right_window_end) 323 | .map(|ix| review[ix]) 324 | .collect(); 325 | 326 | let total_window_size = left_window.len() + right_window.len(); 327 | 328 | let mut hidden_layer: Matrix = Matrix::zeros(1, hidden_size); 329 | 330 | for ix in left_window.iter().chain(right_window.iter()) { 331 | for (i, x) in weights_0_1.row(*ix).iter().enumerate() { 332 | hidden_layer[[0, i]] += x; 333 | } 334 | } 335 | 336 | for i in 0..total_window_size { 337 | hidden_layer[[0, i]] /= total_window_size as f64; 338 | } 339 | 340 | let mut predictions = 341 | 
(&hidden_layer).mul(select_rows(&weights_1_2, &target_samples).transpose()); 342 | sigmoid_mut(&mut predictions); 343 | 344 | // [1, target_size] 345 | let layer_2_delta = predictions - (&outputs); 346 | 347 | // [1, hidden_size] 348 | let layer_1_delta = 349 | (&layer_2_delta).mul(select_rows(&weights_1_2, &target_samples)); 350 | 351 | // [target_size, hidden_size] 352 | // NOTE: we have initialized weights_1_2 in reverse order of traditional init 353 | // normally we'd do hidden_layer.transpose().mul(layer_2_delta) 354 | let weight_delta_1_2 = layer_2_delta.transpose().mul(hidden_layer); 355 | 356 | for ix in target_samples.into_iter() { 357 | for v in 0..hidden_size { 358 | weights_1_2[[ix, v]] -= alpha * weight_delta_1_2[[0, v]]; 359 | } 360 | } 361 | 362 | for ix in left_window.into_iter().chain(right_window.into_iter()) { 363 | for v in 0..hidden_size { 364 | weights_0_1[[ix, v]] -= alpha * layer_1_delta[[0, v]]; 365 | } 366 | } 367 | } 368 | 369 | progress.inc(1); 370 | } 371 | 372 | progress.finish(); 373 | } 374 | 375 | weights_0_1 376 | } 377 | 378 | fn select_rows(m: &Matrix, rows: &[usize]) -> Matrix { 379 | Matrix::new( 380 | rows.len(), 381 | m.cols(), 382 | rows.iter().fold(Vec::new(), |mut acc, i| { 383 | acc.append(&mut Vec::from(m.row(*i).raw_slice())); 384 | acc 385 | }), 386 | ) 387 | } 388 | 389 | fn analogies( 390 | positive: [&str; 2], 391 | negative: &str, 392 | word_index: &BTreeMap<&str, usize>, 393 | embeddings: &Matrix, 394 | ) { 395 | if !word_index.contains_key(positive[0]) 396 | || !word_index.contains_key(positive[1]) 397 | || !word_index.contains_key(negative) 398 | { 399 | println!("did not find all words in index"); 400 | return; 401 | } 402 | 403 | let (pix1, pix2) = (word_index[positive[0]], word_index[positive[1]]); 404 | let nix = word_index[negative]; 405 | 406 | let mut target_row = vec![0.0; embeddings.cols()]; 407 | for i in 0..embeddings.cols() { 408 | target_row[i] += embeddings[[pix1, i]]; 409 | target_row[i] -= embeddings[[nix, i]]; 410 | target_row[i] += embeddings[[pix2, i]]; 411 | } 412 | 413 | let sims = get_similar_embeddings(&target_row, word_index, embeddings); 414 | 415 | println!("\n{} - {} + {}:\n", positive[0], negative, positive[1]); 416 | for i in sims.iter().take(10) { 417 | println!("{}: {}", i.0, i.1); 418 | } 419 | } 420 | -------------------------------------------------------------------------------- /examples/chapter12.rs: -------------------------------------------------------------------------------- 1 | //! Chapter 12 - Introduction to Recurrence - Predicting the Next Word 2 | //! https://github.com/iamtrask/Grokking-Deep-Learning/blob/master/Chapter12%20-%20Intro%20to%20Recurrence%20-%20Predicting%20the%20Next%20Word.ipynb 3 | //! 4 | //! This is *significantly* different from the python version. 5 | //! 6 | //! 1. The dataset is cleaned to remove all whitespaces including tabs, and as a result contains only 19 words. This results in a lower perplexity than 7 | //! the python version from the beginning. 8 | //! 9 | //! 2. The Forward Propagation, Back Propagation and Weight Update steps are implemented in a single function. 10 | //! 11 | //! 3. The gradients explode more rapidly, because of extremely low embeddings to match. Alleviated this by lowering the alpha from 0.001 to 0.0005 12 | //! and increasing embedding size from 10 to 100. Another measure would be to cap the gradients. 
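//!
//! For illustration, a minimal sketch of such gradient capping (not wired into the code
//! below; the `cap` threshold and `clip_gradient` helper are hypothetical) would clamp
//! every element of a delta matrix into `[-cap, cap]` before applying the weight update:
//!
//! ```ignore
//! use rulinalg::matrix::Matrix;
//!
//! fn clip_gradient(delta: &mut Matrix<f64>, cap: f64) {
//!     // clamp each element of the gradient/delta matrix into [-cap, cap]
//!     for v in delta.mut_data().iter_mut() {
//!         *v = v.max(-cap).min(cap);
//!     }
//! }
//! ```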
13 | 14 | use std::collections::{BTreeMap, BTreeSet}; 15 | use std::error::Error; 16 | use std::iter::FromIterator; 17 | use std::ops::Mul; 18 | 19 | use datasets::text::babi_en_single_supporting_fact_task; 20 | use indicatif::{ProgressBar, ProgressStyle}; 21 | use rand::distributions::Uniform; 22 | use rulinalg::matrix::{BaseMatrix, Matrix}; 23 | 24 | use grokking_deep_learning_rs::{argmax, generate_random_vector, softmax_mut}; 25 | 26 | fn main() -> Result<(), Box> { 27 | embeddings_forward_propagation(); 28 | 29 | let (train_data, _) = babi_en_single_supporting_fact_task()?; 30 | 31 | let train_data: Vec> = train_data 32 | .map(|v| vec![v.0, v.1, (v.2).0]) 33 | .flat_map(|v| v.into_iter()) 34 | .map(|s| { 35 | s.split_whitespace() 36 | .map(|w| { 37 | w.chars() 38 | .filter(|c| (*c >= 'a' && *c <= 'z') || (*c >= 'A' && *c <= 'Z')) 39 | .collect() 40 | }) 41 | .collect() 42 | }) 43 | .collect(); 44 | 45 | let words = BTreeSet::from_iter(train_data.iter().flat_map(|v| v.iter())); 46 | 47 | let word_count = words.len(); 48 | let word_index = BTreeMap::from_iter(words.into_iter().zip(0..word_count)); 49 | let inverted_word_index = 50 | BTreeMap::from_iter(word_index.clone().into_iter().map(|(k, v)| (v, k))); 51 | 52 | let (start_state, embeddings, recurrent_weights, state_to_prediction_weights) = 53 | training_with_arbitrary_length(&train_data, &word_index)?; 54 | 55 | let sentence = &train_data[0]; 56 | 57 | let mut current_state = start_state.clone(); 58 | 59 | for (i, word) in sentence.iter().take(sentence.len() - 1).enumerate() { 60 | let mut prediction = (¤t_state).mul(&state_to_prediction_weights); 61 | softmax_mut(&mut prediction); 62 | 63 | let pred_ix = argmax(prediction.row(0).raw_slice()); 64 | let predicted_word = inverted_word_index[&pred_ix]; 65 | 66 | println!( 67 | "Input: {}, Expected: {}, Predicted: {}", 68 | word, 69 | sentence[i + 1], 70 | predicted_word 71 | ); 72 | current_state = 73 | current_state.mul(&recurrent_weights) + embeddings.row(word_index[word]).into_matrix(); 74 | } 75 | 76 | Ok(()) 77 | } 78 | 79 | fn embeddings_forward_propagation() { 80 | let mut word_vectors = BTreeMap::new(); 81 | word_vectors.insert("yankees", Matrix::new(1, 3, vec![0.0; 3])); 82 | word_vectors.insert("bears", Matrix::new(1, 3, vec![0.0; 3])); 83 | word_vectors.insert("braves", Matrix::new(1, 3, vec![0.0; 3])); 84 | word_vectors.insert("red", Matrix::new(1, 3, vec![0.0; 3])); 85 | word_vectors.insert("socks", Matrix::new(1, 3, vec![0.0; 3])); 86 | word_vectors.insert("lose", Matrix::new(1, 3, vec![0.0; 3])); 87 | word_vectors.insert("defeat", Matrix::new(1, 3, vec![0.0; 3])); 88 | word_vectors.insert("beat", Matrix::new(1, 3, vec![0.0; 3])); 89 | word_vectors.insert("tie", Matrix::new(1, 3, vec![0.0; 3])); 90 | 91 | let sent_to_output_weights = 92 | Matrix::new(3, word_vectors.len(), vec![0.0; 3 * word_vectors.len()]); 93 | 94 | let weights: Matrix = Matrix::identity(3); 95 | 96 | let layer_0 = &word_vectors["red"]; 97 | let layer_1 = layer_0.mul(&weights) + &word_vectors["socks"]; 98 | let layer_2 = layer_1.mul(&weights) + &word_vectors["defeat"]; 99 | 100 | let mut prediction = layer_2.mul(&sent_to_output_weights); 101 | softmax_mut(&mut prediction); 102 | 103 | println!("{}", prediction); 104 | } 105 | 106 | #[allow(clippy::type_complexity)] 107 | fn training_with_arbitrary_length( 108 | train_data: &[Vec], 109 | word_index: &BTreeMap<&String, usize>, 110 | ) -> Result<(Matrix, Matrix, Matrix, Matrix), Box> { 111 | let word_count = word_index.len(); 112 | 113 | let embedding_size 
= 50; 114 | 115 | let distribution = Uniform::new(0.0, 1.0); 116 | 117 | let mut embeddings = Matrix::new( 118 | word_count, 119 | embedding_size, 120 | generate_random_vector(word_count * embedding_size, 0.1, -0.05, &distribution), 121 | ); 122 | 123 | let mut recurrent_weights = Matrix::identity(embedding_size); 124 | 125 | let mut state_to_prediction_weights = Matrix::new( 126 | embedding_size, 127 | word_count, 128 | generate_random_vector(embedding_size * word_count, 0.1, -0.05, &distribution), 129 | ); 130 | 131 | let word_target_embeddings = Matrix::identity(word_count); 132 | 133 | let mut start_state = Matrix::zeros(1, embedding_size); 134 | 135 | let alpha = 0.0004; 136 | 137 | for _ in 0..10 { 138 | let progress = ProgressBar::new(train_data.len() as u64); 139 | progress.set_style( 140 | ProgressStyle::default_bar() 141 | .template("{msg} {bar:40.cyan/blue} {pos:>7}/{len:7} [{elapsed_precise}]"), 142 | ); 143 | 144 | for sentence in train_data.iter() { 145 | // forward prop 146 | 147 | let mut current_state = start_state.clone(); 148 | let mut loss = 0.0; 149 | 150 | let mut cells = Vec::with_capacity(sentence.len()); 151 | cells.push((None, current_state.clone())); 152 | 153 | for word in sentence.iter().skip(1) { 154 | let mut prediction = (¤t_state).mul(&state_to_prediction_weights); 155 | softmax_mut(&mut prediction); 156 | 157 | loss += -(prediction[[0, word_index[word]]]).ln(); 158 | 159 | let mut next_state = (¤t_state).mul(&recurrent_weights); 160 | 161 | for i in 0..embedding_size { 162 | next_state[[0, i]] += embeddings[[word_index[word], i]]; 163 | } 164 | 165 | cells.push((Some(prediction), next_state.clone())); 166 | 167 | current_state = next_state; 168 | } 169 | 170 | loss /= (sentence.len() - 1) as f64; 171 | 172 | // backward prop 173 | 174 | let mut deltas: Vec<(Option>, Matrix)> = Vec::new(); 175 | 176 | let mut current_state_delta: Matrix = Matrix::identity(1); 177 | 178 | for (i, (prediction, _)) in cells.iter().enumerate().rev() { 179 | let prediction_delta = match prediction { 180 | Some(prediction) => Some( 181 | prediction 182 | - (word_target_embeddings 183 | .row(word_index[&sentence[i]]) 184 | .into_matrix()), 185 | ), 186 | None => None, 187 | }; 188 | 189 | let mut state_delta_from_predictions = match &prediction_delta { 190 | Some(prediction_delta) => { 191 | Some(prediction_delta.mul(state_to_prediction_weights.transpose())) 192 | } 193 | None => None, 194 | }; 195 | 196 | let mut state_delta_from_next_state = if i == cells.len() - 1 { 197 | None 198 | } else { 199 | Some(current_state_delta.mul(recurrent_weights.transpose())) 200 | }; 201 | 202 | current_state_delta = match ( 203 | state_delta_from_predictions.take(), 204 | state_delta_from_next_state.take(), 205 | ) { 206 | (Some(m1), Some(m2)) => m1 + m2, 207 | (Some(m1), None) => m1, 208 | (None, Some(m2)) => m2, 209 | _ => panic!("this is broken"), 210 | }; 211 | 212 | deltas.push((prediction_delta, current_state_delta.clone())); 213 | } 214 | 215 | // weights update 216 | 217 | // align deltas with cells 218 | deltas.reverse(); 219 | 220 | let (_, start_delta) = &deltas[0]; 221 | for i in 0..embedding_size { 222 | start_state[[0, i]] -= 223 | (alpha * start_delta[[0, i]]) / ((sentence.len() - 1) as f64); 224 | } 225 | 226 | for i in 1..cells.len() { 227 | let (_, state) = &cells[i]; 228 | let (prediction_delta, state_delta) = &deltas[i]; 229 | // let (_, prev_state) = &cells[i - 1]; 230 | 231 | let prediction_delta = prediction_delta.as_ref().unwrap(); 232 | 233 | let 
state_to_prediction_weights_delta = state.transpose().mul(prediction_delta); 234 | for j in 0..embedding_size { 235 | for k in 0..word_count { 236 | state_to_prediction_weights[[j, k]] -= (alpha 237 | * state_to_prediction_weights_delta[[j, k]]) 238 | / ((sentence.len() - 1) as f64); 239 | } 240 | } 241 | 242 | for j in 0..embedding_size { 243 | embeddings[[word_index[&sentence[i]], j]] -= 244 | (alpha * state_delta[[0, j]]) / ((sentence.len() - 1) as f64); 245 | } 246 | 247 | let recurrent_weights_delta = state.transpose().mul(state_delta); 248 | for j in 0..embedding_size { 249 | for k in 0..embedding_size { 250 | recurrent_weights[[j, k]] -= (alpha * recurrent_weights_delta[[j, k]]) 251 | / ((sentence.len() - 1) as f64); 252 | } 253 | } 254 | } 255 | 256 | progress.set_message(&format!("Perplexity: {}", loss.exp())); 257 | progress.inc(1); 258 | } 259 | 260 | progress.finish(); 261 | } 262 | 263 | Ok(( 264 | start_state, 265 | embeddings, 266 | recurrent_weights, 267 | state_to_prediction_weights, 268 | )) 269 | } 270 | -------------------------------------------------------------------------------- /examples/chapter13.rs: -------------------------------------------------------------------------------- 1 | //! Chapter13 - Intro to Automatic Differentiation - Let's Build A Deep Learning Framework 2 | //! 3 | //! https://github.com/iamtrask/Grokking-Deep-Learning/blob/master/Chapter13%20-%20Intro%20to%20Automatic%20Differentiation%20-%20Let's%20Build%20A%20Deep%20Learning%20Framework.ipynb 4 | 5 | use std::collections::{BTreeMap, BTreeSet}; 6 | use std::error::Error; 7 | use std::iter::FromIterator; 8 | use std::ops::Add; 9 | 10 | use datasets::text::babi_en_single_supporting_fact_task; 11 | use datasets::Dataset; 12 | use rand::distributions::Uniform; 13 | use rulinalg::matrix::{BaseMatrix, Matrix}; 14 | 15 | use grokking_deep_learning_rs::activations::{Sigmoid, Tanh}; 16 | use grokking_deep_learning_rs::layers::{Embedding, Layer, Linear, RNNCell, Sequential}; 17 | use grokking_deep_learning_rs::losses::{CrossEntropyLoss, Loss, MSELoss}; 18 | use grokking_deep_learning_rs::optimizers::{Optimizer, SGDOptimizer}; 19 | use grokking_deep_learning_rs::tensor::{Dot, Sum, Tensor}; 20 | use grokking_deep_learning_rs::{argmax, generate_random_vector}; 21 | 22 | fn main() { 23 | println!("\nIntroduction to Tensors\n"); 24 | introduction_to_tensors(); 25 | 26 | println!("\nIntroduction to Autograd\n"); 27 | introduction_to_autograd(); 28 | introduction_to_autograd_2(); 29 | 30 | println!("\nAutograd with multiple tensors\n"); 31 | autograd_with_multiple_tensors(); 32 | autograd_neg(); 33 | 34 | println!("\nUsing Autograd to train a Neural Network\n"); 35 | training_using_autograd(); 36 | 37 | println!("\nAdding Automatic Optimization\n"); 38 | training_with_automatic_optimization(); 39 | 40 | println!("\nLayers Which Contain Layers\n"); 41 | layers_which_contain_layers(); 42 | 43 | println!("\nLoss Function Layers\n"); 44 | loss_function_layers(); 45 | 46 | println!("\nNonLinearity Layers\n"); 47 | nonlinearity_layers(); 48 | 49 | println!("\nEmbedding Layers\n"); 50 | embedding_layer(); 51 | 52 | println!("\nCross Entropy Loss\n"); 53 | cross_entropy_loss(); 54 | 55 | println!("\nRecurrent Neural Network\n"); 56 | recurrent_neural_network().unwrap(); 57 | } 58 | 59 | fn introduction_to_tensors() { 60 | let t1 = BasicTensor1 { data: vec![0.0] }; 61 | let t2 = BasicTensor1 { data: vec![1.0] }; 62 | println!("{:?}", t1 + t2); 63 | } 64 | 65 | #[derive(Debug)] 66 | struct BasicTensor1 { 67 | data: 
Vec, 68 | } 69 | 70 | impl Add for BasicTensor1 { 71 | type Output = BasicTensor1; 72 | 73 | fn add(self, other: BasicTensor1) -> Self::Output { 74 | BasicTensor1 { 75 | data: self 76 | .data 77 | .into_iter() 78 | .zip(other.data.into_iter()) 79 | .map(|(a, b)| a + b) 80 | .collect(), 81 | } 82 | } 83 | } 84 | 85 | fn introduction_to_autograd() { 86 | let x = BasicTensor2::new(vec![1.0, 2.0, 3.0, 4.0, 5.0]); 87 | let y = BasicTensor2::new(vec![2.0; 5]); 88 | 89 | let mut z = x + y; 90 | println!("{:?}", z); 91 | 92 | z.backward(BasicTensor2::new(vec![1.0, 1.0, 1.0, 1.0, 1.0])); 93 | 94 | let xy = z.creators.unwrap(); 95 | 96 | println!("{:?}", xy[0].grad); 97 | println!("{:?}", xy[1].grad); 98 | } 99 | 100 | #[derive(Debug, Clone)] 101 | enum BasicOperation { 102 | Add, 103 | Const, 104 | } 105 | 106 | #[derive(Debug, Clone)] 107 | struct BasicTensor2 { 108 | data: Vec, 109 | grad: Option>, 110 | creation_op: BasicOperation, 111 | creators: Option>, 112 | } 113 | 114 | impl BasicTensor2 { 115 | fn new(data: Vec) -> Self { 116 | BasicTensor2 { 117 | data, 118 | grad: None, 119 | creation_op: BasicOperation::Const, 120 | creators: None, 121 | } 122 | } 123 | 124 | fn backward(&mut self, grad: BasicTensor2) { 125 | match self.creation_op { 126 | BasicOperation::Add => { 127 | for c in self.creators.as_mut().unwrap().iter_mut() { 128 | c.backward(grad.clone()); 129 | } 130 | } 131 | _ => { 132 | self.grad = Some(Box::new(grad)); 133 | } 134 | }; 135 | } 136 | } 137 | 138 | impl Add for BasicTensor2 { 139 | type Output = BasicTensor2; 140 | 141 | fn add(self, other: Self) -> BasicTensor2 { 142 | BasicTensor2 { 143 | data: self 144 | .data 145 | .iter() 146 | .zip(other.data.iter()) 147 | .map(|(a, b)| a + b) 148 | .collect(), 149 | grad: None, 150 | creation_op: BasicOperation::Add, 151 | creators: Some(vec![self, other]), 152 | } 153 | } 154 | } 155 | 156 | #[allow(clippy::many_single_char_names)] 157 | fn introduction_to_autograd_2() { 158 | let a = BasicTensor2::new(vec![1.0, 2.0, 3.0, 4.0, 5.0]); 159 | let b = BasicTensor2::new(vec![2.0; 5]); 160 | let c = BasicTensor2::new(vec![5.0, 4.0, 3.0, 2.0, 1.0]); 161 | let d = BasicTensor2::new(vec![-1.0, -2.0, -3.0, -4.0, -5.0]); 162 | 163 | let e = a + b; 164 | let f = c + d; 165 | let mut g = e + f; 166 | 167 | g.backward(BasicTensor2::new(vec![1.0, 1.0, 1.0, 1.0, 1.0])); 168 | println!("{:?}", g); 169 | 170 | let ef = g.creators.as_ref().unwrap(); 171 | let ab = ef[0].creators.as_ref().unwrap(); 172 | 173 | let a = &ab[0]; 174 | println!("{:?}", a.grad); 175 | } 176 | 177 | #[allow(clippy::many_single_char_names)] 178 | fn autograd_with_multiple_tensors() { 179 | let a = Tensor::new_const(Matrix::new(1, 5, vec![1.0, 2.0, 3.0, 4.0, 5.0])); 180 | let b = Tensor::new_const(Matrix::new(1, 5, vec![2.0, 2.0, 2.0, 2.0, 2.0])); 181 | let c = Tensor::new_const(Matrix::new(1, 5, vec![5.0, 4.0, 3.0, 2.0, 1.0])); 182 | 183 | let d = &a + &b; 184 | let e = &b + &c; 185 | let f = &d + &e; 186 | 187 | // println!("{:#?}", f); 188 | f.backward(Tensor::grad(Matrix::new( 189 | 1, 190 | 5, 191 | vec![1.0, 1.0, 1.0, 1.0, 1.0], 192 | ))); 193 | println!("{:?}", b.0.borrow().grad); 194 | } 195 | 196 | #[allow(clippy::many_single_char_names)] 197 | fn autograd_neg() { 198 | let a = Tensor::new_const(Matrix::new(1, 5, vec![1.0, 2.0, 3.0, 4.0, 5.0])); 199 | let b = Tensor::new_const(Matrix::new(1, 5, vec![2.0, 2.0, 2.0, 2.0, 2.0])); 200 | let c = Tensor::new_const(Matrix::new(1, 5, vec![5.0, 4.0, 3.0, 2.0, 1.0])); 201 | 202 | let d = &a + &(-&b); 203 | let e = &(-&b) 
+ &c; 204 | let f = &d + &e; 205 | 206 | f.backward(Tensor::grad(Matrix::new( 207 | 1, 208 | 5, 209 | vec![1.0, 1.0, 1.0, 1.0, 1.0], 210 | ))); 211 | println!("{:?}", b.0.borrow().grad); 212 | } 213 | 214 | /// Using Autograd to train a Neural Network 215 | 216 | fn training_using_autograd() { 217 | let data = Tensor::new_const(Matrix::new( 218 | 4, 219 | 2, 220 | vec![0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0], 221 | )); 222 | let target = Tensor::new_const(Matrix::new(4, 1, vec![0.0, 1.0, 0.0, 1.0])); 223 | 224 | let distribution = Uniform::new(0.0, 1.0); 225 | 226 | let w1 = Tensor::new_const(Matrix::new( 227 | 2, 228 | 3, 229 | generate_random_vector(2 * 3, 1.0, 0.0, &distribution), 230 | )); 231 | let w2 = Tensor::new_const(Matrix::new( 232 | 3, 233 | 1, 234 | generate_random_vector(3, 1.0, 0.0, &distribution), 235 | )); 236 | 237 | let alpha = 0.1; 238 | 239 | for _ in 0..10 { 240 | let pred = data.dot(&w1).dot(&w2); 241 | let loss = (&(&pred - &target) * &(&pred - &target)).sum(0); 242 | let (loss_rows, loss_cols) = (1, 1); 243 | 244 | println!("Loss: {:?}", loss.0.borrow().data); 245 | 246 | loss.backward(Tensor::grad(Matrix::ones(loss_rows, loss_cols))); 247 | 248 | { 249 | let mut w1 = w1.0.borrow_mut(); 250 | let grad = w1.grad.take(); 251 | w1.grad = None; 252 | 253 | let grad = grad.unwrap(); 254 | let grad = &grad.borrow().data; 255 | 256 | for i in 0..w1.data.rows() { 257 | for j in 0..w1.data.cols() { 258 | w1.data[[i, j]] -= alpha * grad[[i, j]]; 259 | } 260 | } 261 | } 262 | 263 | { 264 | let mut w2 = w2.0.borrow_mut(); 265 | let grad = w2.grad.take(); 266 | w2.grad = None; 267 | 268 | let grad = grad.unwrap(); 269 | let grad = &grad.borrow().data; 270 | 271 | for i in 0..w2.data.rows() { 272 | for j in 0..w2.data.cols() { 273 | w2.data[[i, j]] -= alpha * grad[[i, j]]; 274 | } 275 | } 276 | } 277 | } 278 | } 279 | 280 | /// Adding Automatic Optimization 281 | 282 | fn training_with_automatic_optimization() { 283 | let data = Tensor::new_const(Matrix::new( 284 | 4, 285 | 2, 286 | vec![0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0], 287 | )); 288 | let target = Tensor::new_const(Matrix::new(4, 1, vec![0.0, 1.0, 0.0, 1.0])); 289 | 290 | let distribution = Uniform::new(0.0, 1.0); 291 | 292 | let w1 = Tensor::new_const(Matrix::new( 293 | 2, 294 | 3, 295 | generate_random_vector(2 * 3, 1.0, 0.0, &distribution), 296 | )); 297 | 298 | let w2 = Tensor::new_const(Matrix::new( 299 | 3, 300 | 1, 301 | generate_random_vector(3, 1.0, 0.0, &distribution), 302 | )); 303 | 304 | let alpha = 0.1; 305 | 306 | let optimizer = SGDOptimizer::new(vec![&w1, &w2], alpha); 307 | 308 | for _ in 0..10 { 309 | // predict 310 | let pred = data.dot(&w1).dot(&w2); 311 | 312 | // compare 313 | let loss = (&(&pred - &target) * &(&pred - &target)).sum(0); 314 | let (loss_rows, loss_cols) = (1, 1); 315 | 316 | println!("Loss: {:?}", loss.0.borrow().data.data()); 317 | 318 | // calculate difference 319 | loss.backward(Tensor::grad(Matrix::ones(loss_rows, loss_cols))); 320 | 321 | // learn 322 | optimizer.step(true); 323 | } 324 | } 325 | 326 | /// Layers Which Contain Layers 327 | 328 | fn layers_which_contain_layers() { 329 | let data = Tensor::new_const(Matrix::new( 330 | 4, 331 | 2, 332 | vec![0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0], 333 | )); 334 | 335 | let target = Tensor::new_const(Matrix::new(4, 1, vec![0.0, 1.0, 0.0, 1.0])); 336 | 337 | let model = Sequential::new(vec![ 338 | Box::new(Linear::new(2, 3, false)), 339 | Box::new(Linear::new(3, 1, false)), 340 | ]); 341 | 342 | let optim = 
SGDOptimizer::new(model.parameters(), 0.05); 343 | 344 | for _ in 0..10 { 345 | let pred = model.forward(&[&data]); 346 | 347 | // compare 348 | let loss = (&(&pred[0] - &target) * &(&pred[0] - &target)).sum(0); 349 | 350 | println!("Loss: {:?}", loss.0.borrow().data.data()); 351 | 352 | // calculate difference 353 | loss.backward(Tensor::grad(Matrix::ones(1, 1))); 354 | 355 | // learn 356 | optim.step(true); 357 | } 358 | } 359 | 360 | fn loss_function_layers() { 361 | let data = Tensor::new_const(Matrix::new( 362 | 4, 363 | 2, 364 | vec![0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0], 365 | )); 366 | 367 | let target = Tensor::new_const(Matrix::new(4, 1, vec![0.0, 1.0, 0.0, 1.0])); 368 | 369 | let model = Sequential::new(vec![ 370 | Box::new(Linear::new(2, 3, false)), 371 | Box::new(Linear::new(3, 1, false)), 372 | ]); 373 | 374 | let criterion = MSELoss; 375 | let optim = SGDOptimizer::new(model.parameters(), 0.05); 376 | 377 | for _ in 0..10 { 378 | let pred = model.forward(&[&data]); 379 | 380 | // compare 381 | let loss = criterion.forward(&pred[0], &target); 382 | 383 | println!("Loss: {:?}", loss.0.borrow().data.data()); 384 | 385 | // calculate difference 386 | loss.backward(Tensor::grad(Matrix::ones(1, 1))); 387 | 388 | // learn 389 | optim.step(true); 390 | } 391 | } 392 | 393 | /// NonLinearity Layers 394 | 395 | fn nonlinearity_layers() { 396 | let data = Tensor::new_const(Matrix::new( 397 | 4, 398 | 2, 399 | vec![0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0], 400 | )); 401 | 402 | let target = Tensor::new_const(Matrix::new(4, 1, vec![0.0, 1.0, 0.0, 1.0])); 403 | 404 | let model = Sequential::new(vec![ 405 | Box::new(Linear::new(2, 3, false)), 406 | Box::new(Tanh), 407 | Box::new(Linear::new(3, 1, false)), 408 | Box::new(Sigmoid), 409 | ]); 410 | 411 | let criterion = MSELoss; 412 | let optim = SGDOptimizer::new(model.parameters(), 0.5); 413 | 414 | for _ in 0..10 { 415 | let pred = model.forward(&[&data]); 416 | 417 | // compare 418 | let loss = criterion.forward(&pred[0], &target); 419 | 420 | println!("Loss: {:?}", loss.0.borrow().data.data()); 421 | 422 | // calculate difference 423 | loss.backward(Tensor::grad(Matrix::ones(1, 1))); 424 | 425 | // learn 426 | optim.step(true); 427 | } 428 | } 429 | 430 | /// The Embedding Layer 431 | 432 | fn embedding_layer() { 433 | let data = Tensor::new_const(Matrix::new(1, 4, vec![1.0, 2.0, 1.0, 2.0])); 434 | let target = Tensor::new_const(Matrix::new(4, 1, vec![0.0, 1.0, 0.0, 1.0])); 435 | 436 | let model = Sequential::new(vec![ 437 | Box::new(Embedding::new(5, 3)), 438 | Box::new(Tanh), 439 | Box::new(Linear::new(3, 1, true)), 440 | Box::new(Sigmoid), 441 | ]); 442 | 443 | let criterion = MSELoss; 444 | let optim = SGDOptimizer::new(model.parameters(), 0.07); 445 | 446 | for _ in 0..10 { 447 | let pred = model.forward(&[&data]); 448 | 449 | // compare 450 | let loss = criterion.forward(&pred[0], &target); 451 | 452 | println!("Loss: {:?}", loss.0.borrow().data.data()); 453 | 454 | // calculate difference 455 | loss.backward(Tensor::grad(Matrix::ones(1, 1))); 456 | 457 | // learn 458 | optim.step(true); 459 | } 460 | } 461 | 462 | /// The Cross Entropy Layer 463 | 464 | fn cross_entropy_loss() { 465 | let data = Tensor::new_const(Matrix::new(1, 4, vec![1.0, 2.0, 1.0, 2.0])); 466 | let target = Tensor::new_const(Matrix::new(4, 1, vec![0.0, 1.0, 0.0, 1.0])); 467 | 468 | let model = Sequential::new(vec![ 469 | Box::new(Embedding::new(3, 3)), 470 | Box::new(Tanh), 471 | Box::new(Linear::new(3, 4, true)), 472 | ]); 473 | 474 | let criterion = 
CrossEntropyLoss; 475 | let optim = SGDOptimizer::new(model.parameters(), 0.1); 476 | 477 | for _ in 0..10 { 478 | let pred = model.forward(&[&data]); 479 | // println!("pred {}", pred.0.borrow().data); 480 | 481 | // compare 482 | let loss = criterion.forward(&pred[0], &target); 483 | 484 | println!("Loss: {:?}", loss.0.borrow().data.data()); 485 | 486 | // calculate difference 487 | loss.backward(Tensor::grad(Matrix::ones(1, 1))); 488 | 489 | // learn 490 | optim.step(true); 491 | } 492 | } 493 | 494 | #[allow(clippy::needless_range_loop)] 495 | fn recurrent_neural_network() -> Result<(), Box> { 496 | let (train_data, _) = babi_en_single_supporting_fact_task()?; 497 | 498 | let train_data: Vec> = train_data 499 | .map(|v| vec![v.0, v.1 /*, (v.2).0*/]) 500 | .flat_map(|v| v.into_iter()) 501 | .map(|s| { 502 | s.split_whitespace() 503 | .map(|w| { 504 | w.chars() 505 | .filter(|c| (*c >= 'a' && *c <= 'z') || (*c >= 'A' && *c <= 'Z')) 506 | .collect() 507 | }) 508 | .collect() 509 | }) 510 | .collect(); 511 | 512 | let total_data_size = train_data.len(); 513 | 514 | let words = BTreeSet::from_iter(train_data.iter().flat_map(|v| v.iter())); 515 | 516 | let word_count = words.len(); 517 | let word_index = BTreeMap::from_iter(words.into_iter().zip(0..word_count)); 518 | let inverted_word_index = 519 | BTreeMap::from_iter(word_index.clone().into_iter().map(|(k, v)| (v, k))); 520 | 521 | let train_data: Vec> = train_data 522 | .iter() 523 | .map(|s| s.iter().map(|w| word_index[w] as f64).collect()) 524 | .collect(); 525 | 526 | let max_len = train_data.iter().map(|s| s.len()).max().unwrap(); 527 | let pad = word_index.len() + 1; 528 | 529 | let batch_size = 250; 530 | 531 | let train_data: Vec<_> = train_data 532 | .into_iter() 533 | .batch(batch_size, true) 534 | .map(|v: Vec>| { 535 | let mut ans = vec![vec![0.0; batch_size]; max_len]; 536 | for i in 0..batch_size { 537 | for j in 0..v[i].len() { 538 | ans[j][i] = v[i][j]; 539 | } 540 | 541 | for j in v[i].len()..max_len { 542 | ans[j][i] = pad as f64; 543 | } 544 | } 545 | 546 | ans 547 | }) 548 | .collect(); 549 | 550 | let embedding_size = 16; 551 | 552 | // net 553 | let embed = Embedding::new(word_index.len() + 2, embedding_size); 554 | let model = RNNCell::new(embedding_size, 16, word_index.len() + 2, Box::new(Sigmoid)); 555 | 556 | let criterion = CrossEntropyLoss; 557 | let mut parameters = embed.parameters(); 558 | parameters.append(&mut model.parameters()); 559 | 560 | let optim = SGDOptimizer::new(parameters, 0.01); 561 | 562 | for _ in 0..10 { 563 | let mut total_loss = 0.0; 564 | let mut total_accuracy = 0.0; 565 | 566 | for batch in train_data.iter() { 567 | let mut hidden = model.create_start_state(batch_size); 568 | let mut output = None; 569 | 570 | let len = batch.len(); 571 | 572 | for row in batch.iter().take(len - 1) { 573 | let input = Tensor::new_const(Matrix::new(1, batch_size, row.clone())); 574 | let rnn_input = embed.forward(&[&input]).remove(0); 575 | let mut outputs = model.forward(&[&rnn_input, &hidden]); 576 | output = Some(outputs.remove(0)); 577 | hidden = outputs.remove(0); 578 | } 579 | 580 | let output = output.unwrap(); 581 | 582 | let target = Tensor::new_const(Matrix::new(batch_size, 1, batch[len - 1].clone())); 583 | 584 | let loss = criterion.forward(&output, &target); 585 | loss.backward(Tensor::new_const(Matrix::ones(1, 1))); 586 | 587 | optim.step(true); 588 | 589 | let current_loss = loss.0.borrow().data.data()[0]; 590 | total_loss += current_loss; 591 | 592 | let current_accuracy: f64 = output 593 
| .0 594 | .borrow() 595 | .data 596 | .row_iter() 597 | .zip(batch[len - 1].iter()) 598 | .map(|(row, ix)| { 599 | if argmax(row.raw_slice()) == (*ix) as usize { 600 | 1.0 601 | } else { 602 | 0.0 603 | } 604 | }) 605 | .sum(); 606 | 607 | total_accuracy += current_accuracy; 608 | } 609 | 610 | println!( 611 | "Loss: {}, Accuracy: {}", 612 | total_loss, 613 | total_accuracy / (total_data_size as f64) 614 | ); 615 | } 616 | 617 | let batch = vec![ 618 | vec![word_index[&"Mary".to_owned()] as f64], 619 | vec![word_index[&"moved".to_owned()] as f64], 620 | vec![word_index[&"to".to_owned()] as f64], 621 | vec![word_index[&"the".to_owned()] as f64], 622 | ]; 623 | 624 | let mut hidden = model.create_start_state(1); 625 | let mut output = None; 626 | for row in batch.iter() { 627 | let input = Tensor::new_const(Matrix::new(1, 1, row.clone())); 628 | let rnn_input = embed.forward(&[&input]).remove(0); 629 | let mut outputs = model.forward(&[&rnn_input, &hidden]); 630 | output = Some(outputs.remove(0)); 631 | hidden = outputs.remove(0); 632 | } 633 | 634 | let output = argmax(output.unwrap().0.borrow().data.row(0).raw_slice()); 635 | println!("Prediction: {}", inverted_word_index[&output]); 636 | 637 | Ok(()) 638 | } 639 | -------------------------------------------------------------------------------- /examples/chapter14.rs: -------------------------------------------------------------------------------- 1 | //! Chapter 14 - Learning to Write Like Shakespeare: Long-Short Term Memory 2 | //! 3 | //! https://github.com/iamtrask/Grokking-Deep-Learning/blob/master/Chapter14%20-%20Exploding%20Gradients%20Examples.ipynb 4 | //! https://github.com/iamtrask/Grokking-Deep-Learning/blob/master/Chapter14%20-%20Intro%20to%20LSTMs%20-%20Learn%20to%20Write%20Like%20Shakespeare.ipynb 5 | //! 
https://github.com/iamtrask/Grokking-Deep-Learning/blob/master/Chapter14%20-%20Intro%20to%20LSTMs%20-%20Part%202%20-%20Learn%20to%20Write%20Like%20Shakespeare.ipynb 6 | 7 | use std::collections::{BTreeMap, BTreeSet}; 8 | use std::error::Error; 9 | use std::iter::FromIterator; 10 | use std::ops::Mul; 11 | 12 | use datasets::text::shakespeare_100000; 13 | use indicatif::{ProgressBar, ProgressStyle}; 14 | use rulinalg::matrix::{BaseMatrix, Matrix}; 15 | 16 | use grokking_deep_learning_rs::activations::Sigmoid; 17 | use grokking_deep_learning_rs::layers::{Embedding, LSTMCell, Layer, RNNCell}; 18 | use grokking_deep_learning_rs::losses::{CrossEntropyLoss, Loss}; 19 | use grokking_deep_learning_rs::optimizers::{Optimizer, SGDOptimizer}; 20 | use grokking_deep_learning_rs::tensor::Tensor; 21 | 22 | fn main() -> Result<(), Box> { 23 | println!("\nTraining Shakespeare using RNN Cells\n"); 24 | shakespeare_rnn_cell()?; 25 | 26 | println!("\nVanishing and Exploding Gradients\n"); 27 | vanishing_and_exploding_gradients(); 28 | 29 | println!("\nTraining Shakespeare using LSTM Cells\n"); 30 | shakespeare_lstm_cell()?; 31 | 32 | Ok(()) 33 | } 34 | 35 | fn shakespeare_rnn_cell() -> Result<(), Box> { 36 | let embedding_size = 64; 37 | let rnn_state_size = 512; 38 | let alpha = 0.05; 39 | let batch_size = 16; 40 | let bptt = 25; 41 | 42 | let n_iterations = 1; 43 | 44 | let data = shakespeare_100000()?; 45 | 46 | let characters = BTreeSet::from_iter(data.chars()); 47 | let len = characters.len(); 48 | let word_index = BTreeMap::from_iter(characters.iter().zip(0..len)); 49 | 50 | let indices: Vec<_> = data.chars().map(|c| word_index[&c]).collect(); 51 | 52 | let embed = Embedding::new(len, embedding_size); 53 | let cell = RNNCell::new(embedding_size, rnn_state_size, len, Box::new(Sigmoid)); 54 | 55 | let criterion = CrossEntropyLoss; 56 | 57 | let mut params = embed.parameters(); 58 | params.append(&mut cell.parameters()); 59 | 60 | let optimizer = SGDOptimizer::new(params, alpha); 61 | 62 | let n_batches = (indices.len() as f64 / batch_size as f64).floor() as usize; 63 | 64 | let mut batched_data = Matrix::zeros(n_batches, batch_size); 65 | for (i, c) in indices.into_iter().enumerate() { 66 | if i >= batched_data.data().len() { 67 | break; 68 | } 69 | 70 | let row = i / n_batches; 71 | let col = i % n_batches; 72 | 73 | batched_data[[col, row]] = c as f64; 74 | } 75 | 76 | dbg!(n_batches); 77 | 78 | let n_batches = 100 + 1; 79 | 80 | let steps = (n_batches - 1) / bptt; 81 | 82 | for _ in 0..n_iterations { 83 | let progress = ProgressBar::new((n_batches - 1) as u64); 84 | progress.set_style( 85 | ProgressStyle::default_bar() 86 | .template("{msg} {bar:40.cyan/blue} {pos:>7}/{len:7} [{elapsed_precise}]"), 87 | ); 88 | 89 | for j in 0..steps { 90 | let start = bptt * j; 91 | 92 | let mut state = cell.create_start_state(batch_size); 93 | 94 | let mut loss = None; 95 | 96 | for k in 0..bptt { 97 | let input = batched_data.row(start + k).raw_slice(); 98 | let target = batched_data.row(start + k + 1).raw_slice(); 99 | 100 | let input = Tensor::new_const(Matrix::new(1, batch_size, Vec::from(input))); 101 | let target = Tensor::new_const(Matrix::new(batch_size, 1, Vec::from(target))); 102 | 103 | let rnn_input = &embed.forward(&[&input])[0]; 104 | let mut outputs = cell.forward(&[rnn_input, &state]); 105 | 106 | let output = outputs.remove(0); 107 | state = outputs.remove(0); 108 | 109 | let current_loss = criterion.forward(&output, &target); 110 | progress.set_message(&format!( 111 | "Batch Loss: {:?}", 112 | 
current_loss.0.borrow().data.data() 113 | )); 114 | 115 | loss = match loss.take() { 116 | None => Some(current_loss), 117 | Some(existing_loss) => Some(&existing_loss + ¤t_loss), 118 | }; 119 | 120 | progress.inc(1); 121 | } 122 | 123 | loss.unwrap().backward(Tensor::grad(Matrix::ones(1, 1))); 124 | optimizer.step(true); 125 | } 126 | 127 | progress.finish(); 128 | } 129 | 130 | Ok(()) 131 | } 132 | 133 | fn vanishing_and_exploding_gradients() { 134 | let weights = Matrix::new(2, 2, vec![1.0, 4.0, 4.0, 1.0]); 135 | let mut activation = sigmoid(Matrix::new(1, 2, vec![1.0, 0.01])); 136 | 137 | println!("Sigmoid Activations"); 138 | let mut activations = Vec::new(); 139 | for _ in 0..10 { 140 | activation = sigmoid(activation.mul(&weights)); 141 | activations.push(activation.clone()); 142 | println!("{}", activation); 143 | } 144 | 145 | println!("\nSigmoid Gradients"); 146 | let mut gradient = Matrix::ones(1, 2); 147 | for activation in activations.into_iter().rev() { 148 | gradient = activation 149 | .elemul(&(Matrix::ones(1, 2) - &activation)) 150 | .elemul(&gradient); 151 | gradient = gradient.mul(weights.transpose()); 152 | println!("{}", gradient); 153 | } 154 | 155 | println!("\nrelu Activations"); 156 | let mut activations = Vec::new(); 157 | for _ in 0..10 { 158 | activation = relu(activation.mul(&weights)); 159 | activations.push(activation.clone()); 160 | println!("{}", activation); 161 | } 162 | 163 | println!("\nrelu Gradients"); 164 | let mut gradient = Matrix::ones(1, 2); 165 | for activation in activations.into_iter().rev() { 166 | gradient = gradient.elemul(&Matrix::new( 167 | 1, 168 | 2, 169 | activation 170 | .data() 171 | .iter() 172 | .map(|v| if v > &0.0 { *v } else { 0.0 }) 173 | .collect::>(), 174 | )); 175 | gradient = gradient.mul(weights.transpose()); 176 | println!("{}", gradient); 177 | } 178 | } 179 | 180 | fn sigmoid(mut m: Matrix) -> Matrix { 181 | for i in 0..m.rows() { 182 | for j in 0..m.cols() { 183 | m[[i, j]] = 1.0 / (1.0 + (-m[[i, j]]).exp()); 184 | } 185 | } 186 | 187 | m 188 | } 189 | 190 | fn relu(mut m: Matrix) -> Matrix { 191 | for i in 0..m.rows() { 192 | for j in 0..m.cols() { 193 | m[[i, j]] = if m[[i, j]] > 0.0 { m[[i, j]] } else { 0.0 }; 194 | } 195 | } 196 | 197 | m 198 | } 199 | 200 | fn shakespeare_lstm_cell() -> Result<(), Box> { 201 | let embedding_size = 64; 202 | let rnn_state_size = 512; 203 | let alpha = 0.05; 204 | let batch_size = 16; 205 | let bptt = 25; 206 | 207 | let n_iterations = 1; 208 | 209 | let data = shakespeare_100000()?; 210 | 211 | let characters = BTreeSet::from_iter(data.chars()); 212 | let len = characters.len(); 213 | let word_index = BTreeMap::from_iter(characters.iter().zip(0..len)); 214 | 215 | let indices: Vec<_> = data.chars().map(|c| word_index[&c]).collect(); 216 | 217 | let embed = Embedding::new(len, embedding_size); 218 | let cell = LSTMCell::new(embedding_size, rnn_state_size, len); 219 | 220 | let criterion = CrossEntropyLoss; 221 | 222 | let optimizer = SGDOptimizer::new( 223 | embed 224 | .parameters() 225 | .into_iter() 226 | .chain(cell.parameters().into_iter()) 227 | .collect(), 228 | alpha, 229 | ); 230 | 231 | let n_batches = (indices.len() as f64 / batch_size as f64).floor() as usize; 232 | 233 | let mut batched_data = Matrix::zeros(n_batches, batch_size); 234 | for (i, c) in indices.into_iter().enumerate() { 235 | if i >= batched_data.data().len() { 236 | break; 237 | } 238 | 239 | let row = i / n_batches; 240 | let col = i % n_batches; 241 | 242 | batched_data[[col, row]] = c as f64; 243 | } 
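// Layout note for the loop above: `batched_data` is n_batches rows by batch_size columns, and
// column `c` receives the contiguous character slice c*n_batches..(c+1)*n_batches, so each of the
// `batch_size` columns is an independent text stream. During the truncated-BPTT loop below, row `k`
// is the input and row `k + 1` the target across all streams. (The `row`/`col` variable names above
// are swapped relative to how they index the matrix.)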
244 | 245 | dbg!(n_batches); 246 | 247 | let n_batches = 100 + 1; 248 | 249 | let steps = (n_batches - 1) / bptt; 250 | 251 | for _ in 0..n_iterations { 252 | let progress = ProgressBar::new((n_batches - 1) as u64); 253 | progress.set_style( 254 | ProgressStyle::default_bar() 255 | .template("{msg} {bar:40.cyan/blue} {pos:>7}/{len:7} [{elapsed_precise}]"), 256 | ); 257 | 258 | for j in 0..steps { 259 | let start = bptt * j; 260 | 261 | let (mut h, mut c) = cell.create_start_state(batch_size); 262 | 263 | let mut loss = None; 264 | 265 | for k in 0..bptt { 266 | let input = batched_data.row(start + k).raw_slice(); 267 | let target = batched_data.row(start + k + 1).raw_slice(); 268 | 269 | let input = Tensor::new_const(Matrix::new(1, batch_size, Vec::from(input))); 270 | let target = Tensor::new_const(Matrix::new(batch_size, 1, Vec::from(target))); 271 | 272 | let rnn_input = &embed.forward(&[&input])[0]; 273 | let mut outputs = cell.forward(&[rnn_input, &h, &c]); 274 | 275 | let output = outputs.remove(0); 276 | h = outputs.remove(0); 277 | c = outputs.remove(0); 278 | 279 | let current_loss = criterion.forward(&output, &target); 280 | progress.set_message(&format!( 281 | "Batch Loss: {:?}", 282 | current_loss.0.borrow().data.data() 283 | )); 284 | 285 | loss = match loss.take() { 286 | None => Some(current_loss), 287 | Some(existing_loss) => Some(&existing_loss + ¤t_loss), 288 | }; 289 | 290 | progress.inc(1); 291 | } 292 | 293 | loss.unwrap().backward(Tensor::grad(Matrix::ones(1, 1))); 294 | optimizer.step(true); 295 | } 296 | 297 | progress.finish(); 298 | } 299 | 300 | Ok(()) 301 | } 302 | -------------------------------------------------------------------------------- /examples/chapter15.rs: -------------------------------------------------------------------------------- 1 | //! Chapter 15: Introduction to Federated Learning 2 | //! 3 | //! 
https://github.com/iamtrask/Grokking-Deep-Learning/blob/master/Chapter15%20-%20Intro%20to%20Federated%20Learning%20-%20Deep%20Learning%20on%20Unseen%20Data.ipynb 4 | 5 | use std::collections::{BTreeMap, BTreeSet}; 6 | use std::error::Error; 7 | use std::iter::FromIterator; 8 | 9 | use datasets::text::enron_spam; 10 | use datasets::Dataset; 11 | use indicatif::{ProgressBar, ProgressStyle}; 12 | // use paillier::traits::{Add, Decrypt, Encrypt, KeyGeneration, Mul}; 13 | // use paillier::{EncryptionKey, Paillier}; 14 | use rulinalg::matrix::Matrix; 15 | 16 | use grokking_deep_learning_rs::layers::{Embedding, Layer}; 17 | use grokking_deep_learning_rs::losses::{Loss, MSELoss}; 18 | use grokking_deep_learning_rs::optimizers::{Optimizer, SGDOptimizer}; 19 | use grokking_deep_learning_rs::tensor::{Sum, Tensor}; 20 | 21 | fn main() -> Result<(), Box> { 22 | println!("\nRegular Deep Learning\n"); 23 | regular_deep_learning()?; 24 | 25 | println!("\nFederated Deep Learning\n"); 26 | federated_deep_learning()?; 27 | 28 | Ok(()) 29 | } 30 | 31 | /// Regular Deep Learning 32 | 33 | fn regular_deep_learning() -> Result<(), Box> { 34 | let (spam, ham) = enron_spam()?; 35 | 36 | let dataset_size = 3000; 37 | let max_sentence_len = 100; 38 | 39 | let (spam, ham) = ( 40 | parse_dataset(spam, dataset_size, max_sentence_len), 41 | parse_dataset(ham, dataset_size, max_sentence_len), 42 | ); 43 | 44 | let word_index = { 45 | let words = BTreeSet::from_iter(spam.iter().chain(ham.iter()).flat_map(|v| v.iter())); 46 | let word_count = words.len(); 47 | BTreeMap::from_iter(words.into_iter().zip(0..word_count)) 48 | }; 49 | 50 | let word_count = word_index.len(); 51 | 52 | dbg!(word_count); 53 | 54 | let spam = spam 55 | .iter() 56 | .map(|sentence| { 57 | sentence 58 | .iter() 59 | .map(|word| word_index[&word] as f64) 60 | .collect::>() 61 | }) 62 | .collect::>(); 63 | 64 | let ham = ham 65 | .iter() 66 | .map(|sentence| { 67 | sentence 68 | .iter() 69 | .map(|word| word_index[&word] as f64) 70 | .collect::>() 71 | }) 72 | .collect::>(); 73 | 74 | let train_data = spam 75 | .iter() 76 | .take(dataset_size / 2) 77 | .cloned() 78 | .zip(vec![1.0; dataset_size / 2]) 79 | .chain( 80 | ham.iter() 81 | .take(dataset_size / 2) 82 | .cloned() 83 | .zip(vec![0.0; dataset_size / 2]), 84 | ) 85 | .shuffle(dataset_size, 0) 86 | .collect::>(); 87 | 88 | let test_data = spam 89 | .iter() 90 | .skip(dataset_size / 2) 91 | .cloned() 92 | .zip(vec![1.0; dataset_size / 2]) 93 | .chain( 94 | ham.iter() 95 | .skip(dataset_size / 2) 96 | .cloned() 97 | .zip(vec![0.0; dataset_size / 2]), 98 | ) 99 | .shuffle(dataset_size, 0) 100 | .collect::>(); 101 | 102 | let model = Embedding::new(word_count, 1); 103 | 104 | { 105 | model.weights.0.borrow_mut().data *= 0.0; 106 | } 107 | 108 | let n_iterations = 10; 109 | let batch_size = 200; 110 | let n_batches = dataset_size / batch_size; 111 | 112 | let model = train( 113 | model, 114 | train_data, 115 | dataset_size, 116 | &word_index, 117 | max_sentence_len, 118 | n_iterations, 119 | n_batches, 120 | batch_size, 121 | ); 122 | 123 | let accuracy = test(&model, &test_data, dataset_size, max_sentence_len); 124 | 125 | println!("Test Accuracy: {}", accuracy); 126 | 127 | Ok(()) 128 | } 129 | 130 | #[allow(clippy::needless_range_loop, clippy::too_many_arguments)] 131 | fn train( 132 | model: Embedding, 133 | data: Vec<(Vec, f64)>, 134 | dataset_size: usize, 135 | word_index: &BTreeMap<&String, usize>, 136 | max_sentence_len: usize, 137 | n_iterations: usize, 138 | n_batches: usize, 139 | 
batch_size: usize, 140 | ) -> Embedding { 141 | // NOTE: Unlike the Python version, cannot do batching as cannot support 3D operations 142 | // so running stochastic gradient descent in batch_size iterations and accumulating loss 143 | let criterion = MSELoss; 144 | let optim = SGDOptimizer::new(model.parameters(), 0.01); 145 | 146 | for _ in 0..n_iterations { 147 | let progress = ProgressBar::new(n_batches as u64); 148 | progress.set_style( 149 | ProgressStyle::default_bar() 150 | .template("{msg} {bar:40.cyan/blue} {pos:>7}/{len:7} [{elapsed_precise}]"), 151 | ); 152 | 153 | let mut total_loss = 0.0; 154 | 155 | for bi in 0..n_batches { 156 | let mut current_loss = 0.0; 157 | 158 | { 159 | model.weights.0.borrow_mut().data[[word_index[&"".to_owned()], 0]] *= 0.0; 160 | } 161 | 162 | for i in (batch_size * bi)..(batch_size * (bi + 1)) { 163 | let (input, output) = &data[i]; 164 | let input = Tensor::new_const(Matrix::new(1, max_sentence_len, input.clone())); 165 | let prediction = model.forward(&[&input]).remove(0); 166 | let prediction = prediction.sum(0); 167 | let prediction = prediction.sigmoid(); 168 | 169 | let target = Tensor::new_const(Matrix::new(1, 1, vec![*output])); 170 | 171 | let loss = criterion.forward(&prediction, &target); 172 | 173 | current_loss += loss.0.borrow().data.data()[0]; 174 | 175 | loss.backward(Tensor::grad(Matrix::ones(1, 1))); 176 | optim.step(true); 177 | } 178 | 179 | total_loss += current_loss; 180 | 181 | progress.set_message(&format!("Loss: {}", current_loss / (batch_size as f64))); 182 | progress.inc(1); 183 | } 184 | 185 | progress.finish_with_message(&format!("Loss: {}", total_loss / (dataset_size as f64))); 186 | } 187 | 188 | model 189 | } 190 | 191 | fn test( 192 | model: &Embedding, 193 | data: &[(Vec, f64)], 194 | dataset_size: usize, 195 | max_sentence_len: usize, 196 | ) -> f64 { 197 | let mut accuracy = 0.0; 198 | 199 | for item in data.iter().take(dataset_size / 2) { 200 | let (input, output) = item; 201 | let input = Tensor::new_const(Matrix::new(1, max_sentence_len, input.clone())); 202 | let prediction = model.forward(&[&input]).remove(0); 203 | let prediction = prediction.sum(0); 204 | let prediction = prediction.sigmoid(); 205 | 206 | if (prediction.0.borrow().data.data()[0] >= 0.5 && (output - 1.0).abs() < std::f64::EPSILON) 207 | || (prediction.0.borrow().data.data()[0] < 0.5 && (output - 0.0).abs() < std::f64::EPSILON) 208 | { 209 | accuracy += 1.0; 210 | } 211 | } 212 | 213 | accuracy / ((dataset_size / 2) as f64) 214 | } 215 | 216 | fn parse_dataset( 217 | dataset: impl Dataset, 218 | dataset_size: usize, 219 | max_sentence_len: usize, 220 | ) -> Vec> { 221 | dataset 222 | .take(dataset_size) 223 | .map(|email| { 224 | email 225 | .split('\n') 226 | .map(|line| line.split_whitespace()) 227 | .flat_map(|v| v) 228 | .map(|v| { 229 | v.chars() 230 | .filter(|c| (*c >= 'a' && *c <= 'z') || (*c >= 'A' && *c <= 'Z')) 231 | .collect::() 232 | }) 233 | .collect::>() 234 | }) 235 | .map(|mut email| { 236 | if email.len() >= max_sentence_len { 237 | email.drain(max_sentence_len..email.len()); 238 | } else { 239 | for _ in 0..(max_sentence_len - email.len()) { 240 | email.push("".to_owned()); 241 | } 242 | } 243 | 244 | email 245 | }) 246 | .collect() 247 | } 248 | 249 | fn federated_deep_learning() -> Result<(), Box> { 250 | let (spam, ham) = enron_spam()?; 251 | 252 | let dataset_size = 4000; 253 | let train_dataset_size = 3000; 254 | let test_dataset_size = dataset_size - train_dataset_size; 255 | let max_sentence_len = 100; 256 | 257 | 
let (spam, ham) = ( 258 | parse_dataset(spam, dataset_size, max_sentence_len), 259 | parse_dataset(ham, dataset_size, max_sentence_len), 260 | ); 261 | 262 | let word_index = { 263 | let words = BTreeSet::from_iter(spam.iter().chain(ham.iter()).flat_map(|v| v.iter())); 264 | let word_count = words.len(); 265 | BTreeMap::from_iter(words.into_iter().zip(0..word_count)) 266 | }; 267 | 268 | let word_count = word_index.len(); 269 | 270 | dbg!(word_count); 271 | 272 | let spam = spam 273 | .iter() 274 | .map(|sentence| { 275 | sentence 276 | .iter() 277 | .map(|word| word_index[&word] as f64) 278 | .collect::>() 279 | }) 280 | .collect::>(); 281 | 282 | let ham = ham 283 | .iter() 284 | .map(|sentence| { 285 | sentence 286 | .iter() 287 | .map(|word| word_index[&word] as f64) 288 | .collect::>() 289 | }) 290 | .collect::>(); 291 | 292 | let train_data = spam 293 | .iter() 294 | .take(train_dataset_size) 295 | .cloned() 296 | .zip(vec![1.0; train_dataset_size]) 297 | .chain( 298 | ham.iter() 299 | .take(train_dataset_size) 300 | .cloned() 301 | .zip(vec![0.0; train_dataset_size]), 302 | ) 303 | .shuffle(2 * train_dataset_size, 0) 304 | .collect::>(); 305 | 306 | let test_data = spam 307 | .iter() 308 | .skip(train_dataset_size) 309 | .cloned() 310 | .zip(vec![1.0; test_dataset_size]) 311 | .chain( 312 | ham.iter() 313 | .skip(train_dataset_size) 314 | .cloned() 315 | .zip(vec![0.0; test_dataset_size]), 316 | ) 317 | .shuffle(2 * test_dataset_size, 0) 318 | .collect::>(); 319 | 320 | let alice: Vec<_> = train_data 321 | .iter() 322 | .take(train_dataset_size / 3) 323 | .cloned() 324 | .collect(); 325 | let bob: Vec<_> = train_data 326 | .iter() 327 | .skip(train_dataset_size / 3) 328 | .take(train_dataset_size / 3) 329 | .cloned() 330 | .collect(); 331 | let charlie: Vec<_> = train_data 332 | .iter() 333 | .skip(2 * train_dataset_size / 3) 334 | .cloned() 335 | .collect(); 336 | 337 | let alice_model = Embedding::new(word_count, 1); 338 | let bob_model = Embedding::new(word_count, 1); 339 | let charlie_model = Embedding::new(word_count, 1); 340 | 341 | { 342 | alice_model.weights.0.borrow_mut().data *= 0.0; 343 | bob_model.weights.0.borrow_mut().data *= 0.0; 344 | charlie_model.weights.0.borrow_mut().data *= 0.0; 345 | } 346 | 347 | let n_iterations = 10; 348 | let batch_size = 200; 349 | let n_batches = train_dataset_size / (3 * batch_size); 350 | 351 | println!("Training Alice"); 352 | let alice_model = train( 353 | alice_model, 354 | alice, 355 | train_dataset_size / 3, 356 | &word_index, 357 | max_sentence_len, 358 | n_iterations, 359 | n_batches, 360 | batch_size, 361 | ); 362 | 363 | println!("Training Bob"); 364 | let bob_model = train( 365 | bob_model, 366 | bob, 367 | train_dataset_size / 3, 368 | &word_index, 369 | max_sentence_len, 370 | n_iterations, 371 | n_batches, 372 | batch_size, 373 | ); 374 | 375 | println!("Training Charlie"); 376 | let charlie_model = train( 377 | charlie_model, 378 | charlie, 379 | train_dataset_size / 3, 380 | &word_index, 381 | max_sentence_len, 382 | n_iterations, 383 | n_batches, 384 | batch_size, 385 | ); 386 | 387 | let alice_weights = &alice_model.weights.0.borrow().data; 388 | let bob_weights = &bob_model.weights.0.borrow().data; 389 | let charlie_weights = &charlie_model.weights.0.borrow().data; 390 | 391 | let weights = alice_weights + bob_weights + charlie_weights; 392 | let weights = weights / 3.0; 393 | 394 | let model = Embedding::from_weights(weights); 395 | 396 | let accuracy = test(&model, &test_data, dataset_size, max_sentence_len); 397 | 
398 | println!("Test Accuracy: {}", accuracy); 399 | 400 | Ok(()) 401 | } 402 | 403 | // fn train_and_encrypt( 404 | // model: Embedding, 405 | // data: Vec<(Vec, f64)>, 406 | // dataset_size: usize, 407 | // word_index: &BTreeMap<&String, usize>, 408 | // max_sentence_len: usize, 409 | // n_iterations: usize, 410 | // n_batches: usize, 411 | // batch_size: usize, 412 | // encryption_key: &EncryptionKey, 413 | // ) -> Vec> { 414 | // let model = train( 415 | // model, 416 | // data, 417 | // dataset_size, 418 | // word_index, 419 | // max_sentence_len, 420 | // n_iterations, 421 | // n_batches, 422 | // batch_size, 423 | // ); 424 | // 425 | // model 426 | // .weights 427 | // .0 428 | // .borrow() 429 | // .data 430 | // .data() 431 | // .iter() 432 | // .map(|v| Paillier::encrypt(&encryption_key, *v)) 433 | // .collect() 434 | // } 435 | -------------------------------------------------------------------------------- /examples/chapter3.rs: -------------------------------------------------------------------------------- 1 | //! Chapter3 - Forward Propagation - Intro to Neural Prediction 2 | //! 3 | //! https://github.com/iamtrask/Grokking-Deep-Learning/blob/master/Chapter3%20-%20%20Forward%20Propagation%20-%20Intro%20to%20Neural%20Prediction.ipynb 4 | 5 | use grokking_deep_learning_rs::{ 6 | dot, elementwise_scalar_multiplication, matrix_vector_dot, Matrix, Vector, 7 | }; 8 | 9 | fn main() { 10 | // different sections of the chapter in order. 11 | what_is_a_neural_network(); 12 | making_a_prediction_with_multiple_inputs(); 13 | making_a_prediction_with_multiple_outputs(); 14 | predicting_with_multiple_inputs_and_outputs(); 15 | predicting_on_predictions(); 16 | } 17 | 18 | /// A Simple Neural Network making a prediction 19 | /// 20 | /// What is a neural network? 
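// The simplest possible network: one input scaled by one weight, so the prediction is just
// `input * weight`, as computed by `neural_network_1` below.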
21 | 22 | fn what_is_a_neural_network() { 23 | let number_of_toes = vec![8.5, 9.5, 10.0, 9.0]; 24 | 25 | let input = number_of_toes[0]; 26 | let weight = 0.1; 27 | 28 | let prediction = neural_network_1(input, weight); 29 | println!("prediction: {}", prediction); 30 | } 31 | 32 | #[allow(clippy::let_and_return)] 33 | fn neural_network_1(input: f64, weight: f64) -> f64 { 34 | let prediction = input * weight; 35 | prediction 36 | } 37 | 38 | /// Making a prediction with multiple inputs 39 | 40 | fn making_a_prediction_with_multiple_inputs() { 41 | let toes = vec![8.5, 9.5, 9.9, 9.0]; 42 | let wlrec = vec![0.65, 0.8, 0.8, 0.9]; 43 | let nfans = vec![1.2, 1.3, 0.5, 1.0]; 44 | 45 | let input = vec![toes[0], wlrec[0], nfans[0]]; 46 | let weights = vec![0.1, 0.2, 0.0]; 47 | 48 | let pred = neural_network_2(input, weights); 49 | println!("prediction: {}", pred); 50 | } 51 | 52 | fn neural_network_2(input: Vec, weights: Vec) -> f64 { 53 | dot(&input, &weights) 54 | } 55 | 56 | /// Making a prediction with multiple outputs 57 | 58 | fn making_a_prediction_with_multiple_outputs() { 59 | let wlrec = vec![0.65, 0.8, 0.8, 0.9]; 60 | 61 | let input = wlrec[0]; 62 | let weights = vec![0.3, 0.2, 0.9]; 63 | 64 | let pred = neural_network_3(input, weights); 65 | println!("predictions: {:?}", pred); 66 | } 67 | 68 | fn neural_network_3(input: f64, weights: Vec) -> Vec { 69 | elementwise_scalar_multiplication(&weights, input) 70 | } 71 | 72 | /// Predicting with multiple inputs and outputs 73 | 74 | fn predicting_with_multiple_inputs_and_outputs() { 75 | let toes = vec![8.5, 9.5, 9.9, 9.0]; 76 | let wlrec = vec![0.65, 0.8, 0.8, 0.9]; 77 | let nfans = vec![1.2, 1.3, 0.5, 1.0]; 78 | 79 | let input = vec![toes[0], wlrec[0], nfans[0]]; 80 | let weights = vec![ 81 | vec![0.1, 0.1, -0.3], 82 | vec![0.1, 0.2, 0.0], 83 | vec![0.0, 1.3, 0.1], 84 | ]; 85 | 86 | let pred = neural_network_4(input, weights); 87 | println!("predictions: {:?}", pred); 88 | } 89 | 90 | fn neural_network_4(input: Vector, weights: Matrix) -> Vector { 91 | matrix_vector_dot(&weights, &input) 92 | } 93 | 94 | /// Predicting on Predictions 95 | 96 | fn predicting_on_predictions() { 97 | let toes = vec![8.5, 9.5, 9.9, 9.0]; 98 | let wlrec = vec![0.65, 0.8, 0.8, 0.9]; 99 | let nfans = vec![1.2, 1.3, 0.5, 1.0]; 100 | 101 | let input = vec![toes[0], wlrec[0], nfans[0]]; 102 | let input_weights = vec![ 103 | vec![0.1, 0.2, -0.1], 104 | vec![-0.1, 0.1, 0.9], 105 | vec![0.1, 0.4, 0.1], 106 | ]; 107 | let hidden1_weights = vec![ 108 | vec![0.3, 1.1, -0.3], 109 | vec![0.1, 0.2, 0.0], 110 | vec![0.0, 1.3, 0.1], 111 | ]; 112 | 113 | let pred = neural_network_5(input, input_weights, hidden1_weights); 114 | println!("predictions: {:?}", pred); 115 | } 116 | 117 | fn neural_network_5(input: Vector, input_weights: Matrix, hidden1_weights: Matrix) -> Vector { 118 | matrix_vector_dot(&hidden1_weights, &matrix_vector_dot(&input_weights, &input)) 119 | } 120 | -------------------------------------------------------------------------------- /examples/chapter4.rs: -------------------------------------------------------------------------------- 1 | //! Chapter4 - Gradient Descent - Intro to Neural Learning 2 | //! 3 | //! 
https://github.com/iamtrask/Grokking-Deep-Learning/blob/master/Chapter4%20-%20Gradient%20Descent%20-%20Intro%20to%20Neural%20Learning.ipynb 4 | 5 | fn main() { 6 | println!("\nLearning using hot and cold method\n"); 7 | hot_and_cold_method(); 8 | 9 | println!("\nHot and Cold Learning\n"); 10 | hot_and_cold_learning(); 11 | 12 | println!("\nCalculating both direction and amount from error.\n"); 13 | hot_and_cold_learning_with_direction_and_amount(); 14 | 15 | println!("\nOne Iteration of Gradient Descent\n"); 16 | gradient_descent_method(); 17 | 18 | println!("\nLearning is just reducing error\n"); 19 | gradient_descent(); 20 | 21 | println!("\nLet's watch several steps of learning\n"); 22 | gradient_descent_2(); 23 | 24 | println!("\nWhy does this work? What really is weight delta?\n"); 25 | gradient_descent_3(); 26 | 27 | println!("\nBreaking Gradient Descent\n"); 28 | gradient_descent_working(); 29 | println!(); 30 | gradient_descent_breaking(); 31 | 32 | println!("\nAlpha\n"); 33 | gradient_descent_working_again(); 34 | } 35 | 36 | /// Learning using hot and cold method 37 | 38 | #[allow(unused_assignments)] 39 | fn hot_and_cold_method() { 40 | let (mut weight, lr) = (0.1, 0.01); 41 | let (number_of_toes, win_or_lose_binary) = ([8.5], [1.0]); 42 | 43 | let (input, truth) = (number_of_toes[0], win_or_lose_binary[0]); 44 | 45 | let pred = neural_network(input, weight); 46 | 47 | let err = (pred - truth).powf(2.0); 48 | println!("error: {}", err); 49 | 50 | let (pred_up, pred_down) = ( 51 | neural_network(input, weight + lr), 52 | neural_network(input, weight - lr), 53 | ); 54 | let (err_up, err_down) = ((pred_up - truth).powf(2.0), (pred_down - truth).powf(2.0)); 55 | println!("error up: {}, error down: {}", err_up, err_down); 56 | 57 | if err_up < err_down { 58 | weight += lr; 59 | } else { 60 | weight -= lr; 61 | } 62 | } 63 | 64 | /// Hot and Cold Learning 65 | 66 | fn hot_and_cold_learning() { 67 | let mut weight = 0.5; 68 | 69 | let (input, truth) = (0.5, 0.8); 70 | 71 | let n_iterations = 20; 72 | let lr = 0.001; 73 | 74 | for _ in 0..n_iterations { 75 | let pred = neural_network(input, weight); 76 | 77 | let err = (pred - truth).powf(2.0); 78 | println!("Error: {}, Prediction: {}", err, pred); 79 | 80 | let (pred_up, pred_down) = ( 81 | neural_network(input, weight + lr), 82 | neural_network(input, weight - lr), 83 | ); 84 | let (err_up, err_down) = ((pred_up - truth).powf(2.0), (pred_down - truth).powf(2.0)); 85 | 86 | if err_up < err_down { 87 | weight += lr; 88 | } else if err_up > err_down { 89 | weight -= lr; 90 | } 91 | } 92 | } 93 | 94 | /// Calculating both direction and amount from error. 
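// Instead of probing the error on both sides of the weight (hot and cold), the update below is read
// straight off the error: for pred = input * weight and error = (pred - truth)^2, the slope with
// respect to the weight is 2 * (pred - truth) * input, so (pred - truth) * input gives both the
// direction and the amount to move. Worked step with the values used below (weight = 0.5,
// input = 0.5, truth = 0.8): pred = 0.25, direction_and_amount = (0.25 - 0.8) * 0.5 = -0.275,
// the weight becomes 0.775, and the error drops from 0.3025 to about 0.17 on the next pass.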
95 | 96 | fn hot_and_cold_learning_with_direction_and_amount() { 97 | let mut weight = 0.5; 98 | 99 | let (input, truth) = (0.5, 0.8); 100 | 101 | let n_iterations = 1101; 102 | 103 | for _ in 0..n_iterations { 104 | let pred = neural_network(input, weight); 105 | 106 | let err = (pred - truth).powf(2.0); 107 | println!("Error: {}, Prediction: {}", err, pred); 108 | 109 | let direction_and_amount = (pred - truth) * input; 110 | weight -= direction_and_amount; 111 | } 112 | } 113 | 114 | /// One Iteration of Gradient Descent 115 | 116 | #[allow(unused_variables, unused_assignments)] 117 | fn gradient_descent_method() { 118 | let (mut weight, alpha) = (0.1, 0.01); 119 | let (number_of_toes, win_or_lose_binary) = ([8.5], [1.0]); 120 | 121 | let (input, truth) = (number_of_toes[0], win_or_lose_binary[0]); 122 | 123 | let pred = neural_network(input, truth); 124 | let err = (pred - truth).powf(2.0); 125 | 126 | let delta = pred - truth; 127 | let weight_delta = input * delta; 128 | 129 | let alpha = 0.01; 130 | weight -= weight_delta * alpha; 131 | } 132 | 133 | fn neural_network(input: f64, weight: f64) -> f64 { 134 | input * weight 135 | } 136 | 137 | /// Learning is just reducing error 138 | 139 | fn gradient_descent() { 140 | let (mut weight, truth, input) = (0.0, 0.8, 0.5); 141 | for _ in 0..4 { 142 | let pred = neural_network(input, weight); 143 | let err = (pred - truth).powf(2.0); 144 | println!("Error: {}, Prediction: {}", err, pred); 145 | 146 | let delta = pred - truth; 147 | let weight_delta = delta * input; 148 | weight -= weight_delta; 149 | } 150 | } 151 | 152 | /// Let's watch several steps of learning. 153 | 154 | fn gradient_descent_2() { 155 | let (mut weight, truth, input) = (0.0, 0.8, 1.1); 156 | for _ in 0..4 { 157 | println!("------\nWeight: {}", weight); 158 | 159 | let pred = neural_network(input, weight); 160 | let err = (pred - truth).powf(2.0); 161 | println!("Error: {}, Prediction: {}", err, pred); 162 | 163 | let delta = pred - truth; 164 | let weight_delta = delta * input; 165 | weight -= weight_delta; 166 | println!("Delta: {}, Weight Delta: {}", delta, weight_delta); 167 | } 168 | } 169 | 170 | /// Why does this work? What really is weight delta? 
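// weight_delta is the derivative of the error with respect to the weight: with
// error = (input * weight - truth)^2, d(error)/d(weight) = 2 * input * (pred - truth) = 2 * input * delta.
// Stepping the weight against this slope (the constant factor 2 only scales the step size) is what
// shrinks the error in the loop below.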
171 | 172 | fn gradient_descent_3() { 173 | let (mut weight, truth, input) = (0.0, 0.8, 1.1); 174 | for _ in 0..20 { 175 | let pred = neural_network(input, weight); 176 | let err = (pred - truth).powf(2.0); 177 | println!("Error: {}, Prediction: {}", err, pred); 178 | 179 | let delta = pred - truth; 180 | let weight_delta = delta * input; 181 | weight -= weight_delta; 182 | } 183 | } 184 | 185 | /// Breaking Gradient Descent 186 | 187 | fn gradient_descent_working() { 188 | let (mut weight, truth, input) = (0.5, 0.8, 0.5); 189 | for _ in 0..20 { 190 | let pred = neural_network(input, weight); 191 | let err = (pred - truth).powf(2.0); 192 | println!("Error: {}, Prediction: {}", err, pred); 193 | 194 | let delta = pred - truth; 195 | let weight_delta = delta * input; 196 | weight -= weight_delta; 197 | } 198 | } 199 | 200 | fn gradient_descent_breaking() { 201 | let (mut weight, truth, input) = (0.5, 0.8, 2.0); 202 | for _ in 0..20 { 203 | let pred = neural_network(input, weight); 204 | let err = (pred - truth).powf(2.0); 205 | println!("Error: {}, Prediction: {}", err, pred); 206 | 207 | let delta = pred - truth; 208 | let weight_delta = delta * input; 209 | weight -= weight_delta; 210 | } 211 | } 212 | 213 | /// Alpha 214 | 215 | fn gradient_descent_working_again() { 216 | let (mut weight, truth, input) = (0.5, 0.8, 2.0); 217 | let alpha = 0.1; 218 | 219 | for _ in 0..20 { 220 | let pred = neural_network(input, weight); 221 | let err = (pred - truth).powf(2.0); 222 | println!("Error: {}, Prediction: {}", err, pred); 223 | 224 | let delta = pred - truth; 225 | let weight_delta = delta * input; 226 | weight -= alpha * weight_delta; 227 | } 228 | } 229 | -------------------------------------------------------------------------------- /examples/chapter5.rs: -------------------------------------------------------------------------------- 1 | //! Chapter5 - Generalizing Gradient Descent - Learning Multiple Weights at a Time.ipynb 2 | //! 3 | //! https://github.com/iamtrask/Grokking-Deep-Learning/blob/master/Chapter5%20-%20Generalizing%20Gradient%20Descent%20-%20Learning%20Multiple%20Weights%20at%20a%20Time.ipynb 4 | 5 | use grokking_deep_learning_rs::{ 6 | dot, elementwise_scalar_multiplication, matrix_vector_dot, Matrix, Vector, 7 | }; 8 | 9 | fn main() { 10 | println!("\nGradient Descent Learning with Multiple Inputs.\n"); 11 | gradient_descent_with_multiple_inputs(); 12 | 13 | println!("\nLet's Watch Several Steps of Learning\n"); 14 | gradient_descent_with_multiple_inputs_iterations(); 15 | 16 | println!("\nFreezing one weight, What does it do?\n"); 17 | gradient_descent_with_multiple_inputs_frozen_weights(); 18 | 19 | println!("\nGradient Descent Learning with multiple outputs\n"); 20 | gradient_descent_with_multiple_outputs(); 21 | 22 | println!("\nGradient Descent with multiple inputs and outputs\n"); 23 | gradient_descent_with_multiple_inputs_and_outputs(); 24 | } 25 | 26 | /// Gradient Descent Learning with Multiple Inputs. 
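// With several inputs the prediction is a dot product, pred = sum_i(input[i] * weights[i]), and the
// partial derivative of the squared error with respect to each weight is delta * input[i]. For the
// first game below (input = [8.5, 0.65, 1.2], weights = [0.1, 0.2, -0.1], truth = 1.0):
// pred = 0.86, delta = -0.14, weight_deltas = [-1.19, -0.091, -0.168], and with alpha = 0.01 the
// weights move to [0.1119, 0.20091, -0.09832].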
27 | 28 | fn gradient_descent_with_multiple_inputs() { 29 | let mut weights: Vector = vec![0.1, 0.2, -0.1]; 30 | 31 | let toes = vec![8.5, 9.5, 9.9, 9.0]; 32 | let wlrec = vec![0.65, 0.8, 0.8, 0.9]; 33 | let nfans = vec![1.2, 1.3, 0.5, 1.0]; 34 | 35 | let input = vec![toes[0], wlrec[0], nfans[0]]; 36 | 37 | let win_or_lose_binary = [1.0, 1.0, 0.0, 1.0]; 38 | let truth = win_or_lose_binary[0]; 39 | 40 | let pred = neural_network_1(&input, &weights); 41 | let error = (pred - truth).powf(2.0); 42 | println!("Error: {}, Prediction: {}", error, pred); 43 | 44 | let delta = pred - truth; 45 | let weight_delta = elementwise_scalar_multiplication(&input, delta); 46 | 47 | let alpha = 0.01; 48 | for i in 0..3 { 49 | weights[i] -= alpha * weight_delta[i]; 50 | } 51 | println!("Weights: {:?}, Weight Deltas: {:?}", weights, weight_delta); 52 | } 53 | 54 | #[allow(clippy::ptr_arg)] 55 | fn neural_network_1(input: &Vector, weights: &Vector) -> f64 { 56 | dot(input, weights) 57 | } 58 | 59 | /// Let's Watch Several Steps of Learning 60 | 61 | fn gradient_descent_with_multiple_inputs_iterations() { 62 | let mut weights: Vector = vec![0.1, 0.2, -0.1]; 63 | 64 | let toes = vec![8.5, 9.5, 9.9, 9.0]; 65 | let wlrec = vec![0.65, 0.8, 0.8, 0.9]; 66 | let nfans = vec![1.2, 1.3, 0.5, 1.0]; 67 | 68 | let input = vec![toes[0], wlrec[0], nfans[0]]; 69 | 70 | let win_or_lose_binary = [1.0, 1.0, 0.0, 1.0]; 71 | let truth = win_or_lose_binary[0]; 72 | 73 | let alpha = 0.01; 74 | 75 | for i in 0..3 { 76 | println!("Iteration {}", i + 1); 77 | 78 | let pred = neural_network_1(&input, &weights); 79 | let error = (pred - truth).powf(2.0); 80 | println!("Error: {}, Prediction: {}", error, pred); 81 | 82 | let delta = pred - truth; 83 | let weight_delta = elementwise_scalar_multiplication(&input, delta); 84 | 85 | for i in 0..3 { 86 | weights[i] -= alpha * weight_delta[i]; 87 | } 88 | println!( 89 | "Weights: {:?}, Weight Deltas: {:?}\n", 90 | weights, weight_delta 91 | ); 92 | } 93 | } 94 | 95 | /// Freezing one weight, What does it do? 
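// Here weight_delta[0] is zeroed before each update, so the first weight never moves. Because
// delta = pred - truth is shared by every weight, the two remaining weights absorb the whole error
// themselves (alpha is raised to 0.3 so this is visible within the three iterations); the network
// still fits the example, it just does so without the frozen input's weight changing.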
96 | 97 | fn gradient_descent_with_multiple_inputs_frozen_weights() { 98 | let mut weights: Vector = vec![0.1, 0.2, -0.1]; 99 | 100 | let toes = vec![8.5, 9.5, 9.9, 9.0]; 101 | let wlrec = vec![0.65, 0.8, 0.8, 0.9]; 102 | let nfans = vec![1.2, 1.3, 0.5, 1.0]; 103 | 104 | let input = vec![toes[0], wlrec[0], nfans[0]]; 105 | 106 | let win_or_lose_binary = [1.0, 1.0, 0.0, 1.0]; 107 | let truth = win_or_lose_binary[0]; 108 | 109 | let alpha = 0.3; 110 | 111 | for i in 0..3 { 112 | println!("Iteration {}", i + 1); 113 | 114 | let pred = neural_network_1(&input, &weights); 115 | let error = (pred - truth).powf(2.0); 116 | println!("Error: {}, Prediction: {}", error, pred); 117 | 118 | let delta = pred - truth; 119 | let mut weight_delta = elementwise_scalar_multiplication(&input, delta); 120 | weight_delta[0] = 0.0; 121 | 122 | for i in 0..3 { 123 | weights[i] -= alpha * weight_delta[i]; 124 | } 125 | println!( 126 | "Weights: {:?}, Weight Deltas: {:?}\n", 127 | weights, weight_delta 128 | ); 129 | } 130 | } 131 | 132 | /// Gradient Descent Learning with multiple outputs 133 | 134 | fn gradient_descent_with_multiple_outputs() { 135 | let mut weights = vec![0.3, 0.2, 0.9]; 136 | 137 | let wlrec = vec![0.65, 1.0, 1.0, 0.9]; 138 | 139 | let hurt = vec![0.1, 0.0, 0.0, 0.1]; 140 | let win = vec![1.0, 1.0, 0.0, 1.0]; 141 | let sad = vec![0.1, 0.0, 0.1, 0.2]; 142 | 143 | let input = wlrec[0]; 144 | let truth = vec![hurt[0], win[0], sad[0]]; 145 | 146 | let alpha = 0.1; 147 | 148 | let pred = neural_network_2(input, &weights); 149 | let error: Vector = pred 150 | .iter() 151 | .zip(truth.iter()) 152 | .map(|(x, y)| (x - y).powf(2.0)) 153 | .collect(); 154 | println!("Prediction: {:?}, Error: {:?}", pred, error); 155 | 156 | let deltas: Vector = pred.iter().zip(truth.iter()).map(|(x, y)| x - y).collect(); 157 | 158 | // NOTE: mistake in book. 
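// With a single (scalar) input and several outputs, the gradient for each weight is
// delta[i] * input, i.e. the delta vector scaled by the scalar input, which is what the next line
// computes.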
159 | let weight_deltas: Vector = elementwise_scalar_multiplication(&deltas, input); 160 | 161 | for i in 0..weight_deltas.len() { 162 | weights[i] -= weight_deltas[i] * alpha; 163 | } 164 | 165 | println!("Weights: {:?}, Weight Deltas: {:?}", weights, weight_deltas); 166 | } 167 | 168 | #[allow(clippy::ptr_arg)] 169 | fn neural_network_2(input: f64, weights: &Vector) -> Vector { 170 | elementwise_scalar_multiplication(weights, input) 171 | } 172 | 173 | /// Gradient Descent with multiple inputs and outputs 174 | 175 | fn gradient_descent_with_multiple_inputs_and_outputs() { 176 | let toes = vec![8.5, 9.5, 9.9, 9.0]; 177 | let wlrec = vec![0.65, 0.8, 0.8, 0.9]; 178 | let nfans = vec![1.2, 1.3, 0.5, 1.0]; 179 | 180 | let hurt = vec![0.1, 0.0, 0.0, 0.1]; 181 | let win = vec![1.0, 1.0, 0.0, 1.0]; 182 | let sad = vec![0.1, 0.0, 0.1, 0.2]; 183 | 184 | let inputs = vec![toes[0], wlrec[0], nfans[0]]; 185 | let mut weights = vec![ 186 | vec![0.1, 0.1, -0.3], 187 | vec![0.1, 0.2, 0.0], 188 | vec![0.0, 1.3, 0.1], 189 | ]; 190 | let truth = vec![hurt[0], win[0], sad[0]]; 191 | 192 | let alpha = 0.01; 193 | 194 | let pred = neural_network_3(&inputs, &weights); 195 | let errors: Vector = pred 196 | .iter() 197 | .zip(truth.iter()) 198 | .map(|(x, y)| (x - y).powf(2.0)) 199 | .collect(); 200 | 201 | println!("Prediction: {:?}, Error: {:?}", pred, errors); 202 | 203 | let deltas: Vector = pred.iter().zip(truth.iter()).map(|(p, t)| p - t).collect(); 204 | let weight_deltas: Matrix = deltas 205 | .iter() 206 | .map(|i| elementwise_scalar_multiplication(&inputs, *i)) 207 | .collect(); 208 | 209 | for i in 0..weights.len() { 210 | for j in 0..weights[i].len() { 211 | weights[i][j] -= alpha * weight_deltas[i][j]; 212 | } 213 | } 214 | 215 | // NOTE: the saved weights output in the notebook is wrong. 216 | println!("Weights: {:?}, Weight Deltas: {:?}", weights, weight_deltas); 217 | } 218 | 219 | #[allow(clippy::ptr_arg)] 220 | fn neural_network_3(inputs: &Vector, weights: &Matrix) -> Vector { 221 | matrix_vector_dot(weights, inputs) 222 | } 223 | -------------------------------------------------------------------------------- /examples/chapter6.rs: -------------------------------------------------------------------------------- 1 | //! Chapter6 - Intro to Backpropagation - Building Your First DEEP Neural Network.ipynb 2 | //! 3 | //! 
https://github.com/iamtrask/Grokking-Deep-Learning/blob/master/Chapter6%20-%20Intro%20to%20Backpropagation%20-%20Building%20Your%20First%20DEEP%20Neural%20Network.ipynb 4 | 5 | use rand::distributions::{Distribution, Standard}; 6 | use rand::{thread_rng, Rng}; 7 | 8 | use grokking_deep_learning_rs::{ 9 | dot, matrix_matrix_dot, relu_matrix, relu_vector, relu_vector_derivative, vector_matrix_dot, 10 | vector_vector_multiplication, Matrix, 11 | }; 12 | 13 | fn main() { 14 | println!("\nCreating a Matrix or Two in Python\n"); 15 | creating_a_matrix_or_two(); 16 | 17 | println!("\nLearning the whole dataset!\n"); 18 | learning_the_whole_dataset(); 19 | 20 | println!("\nOur First \"Deep\" Neural Network\n"); 21 | first_deep_neural_network(); 22 | 23 | println!("\nBackpropagation\n"); 24 | backpropagation(); 25 | } 26 | 27 | /// Creating a Matrix or Two 28 | 29 | fn creating_a_matrix_or_two() { 30 | let streetlights = vec![ 31 | vec![1.0, 0.0, 1.0], 32 | vec![0.0, 1.0, 1.0], 33 | vec![0.0, 0.0, 1.0], 34 | vec![1.0, 1.0, 1.0], 35 | vec![0.0, 1.0, 1.0], 36 | vec![1.0, 0.0, 1.0], 37 | ]; 38 | 39 | let walk_vs_stop = vec![0.0, 1.0, 0.0, 1.0, 1.0, 0.0]; 40 | 41 | let mut weights = vec![0.5, 0.48, -0.7]; 42 | 43 | let input = &streetlights[0]; 44 | let goal_prediction = walk_vs_stop[0]; 45 | 46 | let alpha = 0.1; 47 | 48 | for _ in 0..20 { 49 | let prediction = dot(input, &weights); 50 | let error = (goal_prediction - prediction).powi(2); 51 | println!("Prediction: {}, Error: {}", prediction, error); 52 | 53 | let delta = prediction - goal_prediction; 54 | for i in 0..3 { 55 | weights[i] -= alpha * (input[i] * delta); 56 | } 57 | } 58 | } 59 | 60 | /// Learning the whole dataset! 61 | 62 | fn learning_the_whole_dataset() { 63 | let streetlights = vec![ 64 | vec![1.0, 0.0, 1.0], 65 | vec![0.0, 1.0, 1.0], 66 | vec![0.0, 0.0, 1.0], 67 | vec![1.0, 1.0, 1.0], 68 | vec![0.0, 1.0, 1.0], 69 | vec![1.0, 0.0, 1.0], 70 | ]; 71 | 72 | let walk_vs_stop = vec![0.0, 1.0, 0.0, 1.0, 1.0, 0.0]; 73 | 74 | let mut weights = vec![0.5, 0.48, -0.7]; 75 | 76 | let alpha = 0.1; 77 | 78 | for i in 0..40 { 79 | let mut total_error = 0.0; 80 | 81 | for r in 0..streetlights.len() { 82 | let input = &streetlights[r]; 83 | let goal_prediction = walk_vs_stop[r]; 84 | 85 | let prediction = dot(input, &weights); 86 | println!("Prediction: {}", prediction); 87 | 88 | let error = (goal_prediction - prediction).powi(2); 89 | 90 | total_error += error; 91 | 92 | let delta = prediction - goal_prediction; 93 | for i in 0..3 { 94 | weights[i] -= alpha * (input[i] * delta); 95 | } 96 | } 97 | 98 | println!("Error after iteration {} = {}\n", i + 1, total_error); 99 | } 100 | 101 | println!("Learned Weights: {:?}", weights); 102 | } 103 | 104 | /// Our first "Deep" Neural Network 105 | 106 | #[allow(unused_variables, unused_assignments, unused_mut)] 107 | fn first_deep_neural_network() { 108 | let inputs = vec![ 109 | vec![1.0, 0.0, 1.0], 110 | vec![0.0, 1.0, 1.0], 111 | vec![0.0, 0.0, 1.0], 112 | vec![1.0, 1.0, 1.0], 113 | ]; 114 | 115 | let outputs = vec![vec![1.0], vec![1.0], vec![0.0], vec![0.0]]; 116 | 117 | let (alpha, hidden_size) = (0.2, 4); 118 | 119 | let mut weights_1: Matrix = random_matrix(3, hidden_size, &Standard); 120 | let mut weights_2: Matrix = random_matrix(hidden_size, 1, &Standard); 121 | 122 | let hidden_layer = relu_matrix(matrix_matrix_dot(&inputs, &weights_1)); 123 | let output = matrix_matrix_dot(&hidden_layer, &weights_2); 124 | } 125 | 126 | /// Backpropagation 127 | 128 | fn backpropagation() { 129 | let inputs = 
vec![ 130 | vec![1.0, 0.0, 1.0], 131 | vec![0.0, 1.0, 1.0], 132 | vec![0.0, 0.0, 1.0], 133 | vec![1.0, 1.0, 1.0], 134 | ]; 135 | 136 | let outputs = vec![vec![1.0], vec![1.0], vec![0.0], vec![0.0]]; 137 | 138 | let alpha = 0.2; 139 | 140 | // Weight values taken from the python notebooks for reproducing results. 141 | let mut weights_0_1: Matrix = vec![ 142 | vec![-0.165_955_99, 0.440_648_99, -0.999_771_25, -0.395_334_85], 143 | vec![-0.706_488_22, -0.815_322_81, -0.627_479_58, -0.308_878_55], 144 | vec![-0.206_465_05, 0.077_633_47, -0.161_610_97, 0.370_439], 145 | ]; 146 | 147 | let mut weights_1_2: Matrix = vec![ 148 | vec![-0.591_095_5], 149 | vec![0.756_234_87], 150 | vec![-0.945_224_81], 151 | vec![0.340_935_02], 152 | ]; 153 | 154 | for it in 0..60 { 155 | let mut total_error = 0.0; 156 | 157 | for i in 0..4 { 158 | let hidden_layer = relu_vector(vector_matrix_dot(&inputs[i], &weights_0_1)); 159 | let prediction = vector_matrix_dot(&hidden_layer, &weights_1_2)[0]; 160 | 161 | let error: f64 = (prediction - outputs[i][0]).powi(2); 162 | total_error += error; 163 | 164 | let delta_2_1 = prediction - outputs[i][0]; 165 | let delta_1_0 = vector_vector_multiplication( 166 | &weights_1_2.iter().map(|v| v[0] * delta_2_1).collect(), 167 | &relu_vector_derivative(hidden_layer.clone()), 168 | ); 169 | 170 | let weight_deltas_1_2: Matrix = 171 | hidden_layer.iter().map(|v| vec![v * delta_2_1]).collect(); 172 | 173 | let weight_deltas_0_1: Matrix = inputs[i] 174 | .iter() 175 | .map(|v| delta_1_0.iter().map(|v2| v * v2).collect()) 176 | .collect(); 177 | 178 | for i in 0..weights_1_2.len() { 179 | for j in 0..weights_1_2[i].len() { 180 | weights_1_2[i][j] -= alpha * weight_deltas_1_2[i][j]; 181 | } 182 | } 183 | 184 | for i in 0..weights_0_1.len() { 185 | for j in 0..weights_0_1[i].len() { 186 | weights_0_1[i][j] -= alpha * weight_deltas_0_1[i][j]; 187 | } 188 | } 189 | } 190 | 191 | if (it + 1) % 10 == 0 { 192 | println!("Error: {}", total_error); 193 | } 194 | } 195 | } 196 | 197 | fn random_matrix(rows: usize, columns: usize, dist: &impl Distribution) -> Matrix { 198 | (0..rows) 199 | .map(|_| { 200 | (0..columns) 201 | .map(|_| 2.0 * thread_rng().sample(dist) - 1.0) 202 | .collect() 203 | }) 204 | .collect() 205 | } 206 | -------------------------------------------------------------------------------- /examples/chapter8.rs: -------------------------------------------------------------------------------- 1 | //! Chapter8 - Intro to Regularization - Learning Signal and Ignoring Noise.ipynb 2 | //! 3 | //! 
https://github.com/iamtrask/Grokking-Deep-Learning/blob/master/Chapter8%20-%20Intro%20to%20Regularization%20-%20Learning%20Signal%20and%20Ignoring%20Noise.ipynb 4 | 5 | use std::error::Error; 6 | use std::ops::Mul; 7 | 8 | use datasets::image::mnist; 9 | use indicatif::{ProgressBar, ProgressStyle}; 10 | use rand::distributions::Standard; 11 | use rulinalg::matrix::{BaseMatrix, Matrix, MatrixSlice}; 12 | 13 | use grokking_deep_learning_rs::{ 14 | argmax, generate_random_vector, process_mnist_batch_dataset, relu_derivative, relu_mut, 15 | sample_bernoulli_trials, 16 | }; 17 | 18 | fn main() { 19 | println!("\n3 Layer Network on MNIST\n"); 20 | three_layer_mnist().unwrap(); 21 | 22 | println!("\n3 Layer Network on MNIST with validation every 10 iterations\n"); 23 | three_layer_mnist_with_validation().unwrap(); 24 | 25 | println!("\nDropout\n"); 26 | three_layer_mnist_with_validation_and_dropout(0.3).unwrap(); 27 | 28 | println!("\nBatched Gradient Descent with Dropout\n"); 29 | batched_gradient_descent_with_dropout(0.5).unwrap(); 30 | } 31 | 32 | fn three_layer_mnist() -> Result<(), Box> { 33 | let dataset_size = 100; // 1000 in notebook with numpy 34 | let test_dataset_size = 10000; 35 | 36 | let (train_data, test_data) = mnist()?; 37 | 38 | let (images, labels): (Vec<_>, Vec<_>) = train_data.take(dataset_size).unzip(); 39 | 40 | let images: Vec> = images 41 | .iter() 42 | .map(|img| img.iter().map(|v| f64::from(*v) / 255.0).collect()) 43 | .collect(); 44 | 45 | let labels: Vec> = labels 46 | .iter() 47 | .map(|l| { 48 | let mut v = vec![0.0; 10]; 49 | v[*l as usize] = 1.0; 50 | v 51 | }) 52 | .collect(); 53 | 54 | let (alpha, hidden_size) = (0.005, 40); 55 | 56 | let iterations = 100; // NOTE: cannot run this for 350 iterations because of slower matrix multiplication. 
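    // The loop below trains a plain 784 -> hidden_size (40) -> 10 network: ReLU on the
    // hidden layer, squared error on each of the 10 outputs, and per-example SGD with
    // learning rate `alpha`. `generate_random_vector(n, 0.2, -0.1, &Standard)` draws each
    // weight uniformly from [-0.1, 0.1), since it computes 0.2 * sample + (-0.1) for
    // samples in [0, 1).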
57 | let progress = ProgressBar::new(iterations as u64); 58 | progress.set_style( 59 | ProgressStyle::default_bar() 60 | .template("{msg} {bar:40.cyan/blue} {pos:>7}/{len:7} [{elapsed_precise}]"), 61 | ); 62 | 63 | let mut weights_0_1 = Matrix::new( 64 | 784, 65 | hidden_size, 66 | generate_random_vector(784 * hidden_size, 0.2, -0.1, &Standard), 67 | ); 68 | let mut weights_1_2 = Matrix::new( 69 | hidden_size, 70 | 10, 71 | generate_random_vector(hidden_size * 10, 0.2, -0.1, &Standard), 72 | ); 73 | 74 | // Training 75 | 76 | for it in 0..iterations { 77 | let mut total_error = 0.0; 78 | let mut accuracy = 0.0; 79 | 80 | for (image, label) in images.iter().zip(labels.iter()) { 81 | let image = unsafe { MatrixSlice::from_raw_parts(image.as_ptr(), 1, 784, 1) }; 82 | 83 | let mut hidden_layer = (&image).mul(&weights_0_1); 84 | for j in 0..hidden_size { 85 | if hidden_layer[[0, j]] < 0.0 { 86 | hidden_layer[[0, j]] = 0.0; 87 | } 88 | } 89 | 90 | let output = (&hidden_layer).mul(&weights_1_2); 91 | 92 | accuracy += if argmax(&label) == argmax(output.data()) { 93 | 1.0 94 | } else { 95 | 0.0 96 | }; 97 | 98 | let error: f64 = output 99 | .data() 100 | .iter() 101 | .zip(label.iter()) 102 | .map(|(p, t)| (p - t).powi(2)) 103 | .sum(); 104 | 105 | total_error += error; 106 | 107 | let delta_2_1 = output - Matrix::new(1, 10, label.clone()); 108 | 109 | let mut relu_deriv = Matrix::new(1, hidden_size, vec![0.0; hidden_size]); 110 | for i in 0..hidden_size { 111 | if hidden_layer[[0, i]] >= 0.0 { 112 | relu_deriv[[0, i]] = 1.0; 113 | } 114 | } 115 | 116 | let delta_1_0 = (&delta_2_1) 117 | .mul(weights_1_2.transpose()) 118 | .elemul(&relu_deriv); 119 | 120 | let weight_delta_1_2 = hidden_layer.transpose().mul(delta_2_1); 121 | 122 | // avoid another clone of image 123 | let weight_delta_0_1 = image.transpose().mul(delta_1_0); 124 | 125 | for (i, x) in weights_0_1.mut_data().iter_mut().enumerate() { 126 | *x -= alpha * weight_delta_0_1.data()[i]; 127 | } 128 | 129 | for (i, x) in weights_1_2.mut_data().iter_mut().enumerate() { 130 | *x -= alpha * weight_delta_1_2.data()[i]; 131 | } 132 | } 133 | 134 | progress.inc(1); 135 | progress.set_message(&format!( 136 | "Train Accuracy: {}, Train Error: {}", 137 | accuracy / (dataset_size as f64), 138 | total_error / (dataset_size as f64) 139 | )); 140 | 141 | if (it + 1) % 10 == 0 { 142 | progress.println(format!( 143 | "Iteration: {}, Train Accuracy: {}, Train Error: {}", 144 | it + 1, 145 | accuracy / (dataset_size as f64), 146 | total_error / (dataset_size as f64) 147 | )); 148 | } 149 | } 150 | 151 | progress.finish_and_clear(); 152 | 153 | // Inference 154 | 155 | println!("Evaluating on the test dataset"); 156 | 157 | let (images, labels): (Vec<_>, Vec<_>) = test_data.take(test_dataset_size).unzip(); 158 | 159 | let images: Vec> = images 160 | .into_iter() 161 | .map(|img| img.into_iter().map(|v| f64::from(v) / 255.0).collect()) 162 | .collect(); 163 | 164 | let labels: Vec> = labels 165 | .into_iter() 166 | .map(|l| { 167 | let mut v = vec![0.0; 10]; 168 | v[l as usize] = 1.0; 169 | v 170 | }) 171 | .collect(); 172 | 173 | let mut total_error = 0.0; 174 | let mut accuracy = 0.0; 175 | 176 | let progress = ProgressBar::new(test_dataset_size as u64); 177 | 178 | for (image, label) in images.into_iter().zip(labels.into_iter()) { 179 | let image = Matrix::new(1, 784, image); 180 | 181 | let mut hidden_layer = image.mul(&weights_0_1); 182 | 183 | // relu 184 | for j in 0..hidden_size { 185 | if hidden_layer[[0, j]] < 0.0 { 186 | hidden_layer[[0, j]] = 0.0; 
187 | } 188 | } 189 | 190 | let output = hidden_layer.mul(&weights_1_2); 191 | 192 | accuracy += if argmax(&label) == argmax(output.data()) { 193 | 1.0 194 | } else { 195 | 0.0 196 | }; 197 | 198 | let error: f64 = output 199 | .iter() 200 | .zip(label.iter()) 201 | .map(|(p, t)| (p - t).powi(2)) 202 | .sum(); 203 | 204 | total_error += error; 205 | 206 | progress.inc(1); 207 | } 208 | 209 | progress.finish_and_clear(); 210 | 211 | println!( 212 | "Test Accuracy: {}, Test Error: {}", 213 | accuracy / (test_dataset_size as f64), 214 | total_error / (test_dataset_size as f64), 215 | ); 216 | 217 | Ok(()) 218 | } 219 | 220 | fn three_layer_mnist_with_validation() -> Result<(), Box> { 221 | let dataset_size = 100; // 1000 in notebook with numpy 222 | let test_dataset_size = 1000; 223 | 224 | let (train_data, test_data) = mnist()?; 225 | 226 | let (images, labels): (Vec<_>, Vec<_>) = train_data.take(dataset_size).unzip(); 227 | 228 | let images: Vec> = images 229 | .iter() 230 | .map(|img| img.iter().map(|v| f64::from(*v) / 255.0).collect()) 231 | .collect(); 232 | 233 | let labels: Vec> = labels 234 | .iter() 235 | .map(|l| { 236 | let mut v = vec![0.0; 10]; 237 | v[*l as usize] = 1.0; 238 | v 239 | }) 240 | .collect(); 241 | 242 | let (alpha, hidden_size) = (0.005, 40); 243 | 244 | let (test_images, test_labels): (Vec<_>, Vec<_>) = test_data.take(test_dataset_size).unzip(); 245 | 246 | let test_images: Vec> = test_images 247 | .into_iter() 248 | .map(|img| img.into_iter().map(|v| f64::from(v) / 255.0).collect()) 249 | .collect(); 250 | 251 | let test_labels: Vec> = test_labels 252 | .into_iter() 253 | .map(|l| { 254 | let mut v = vec![0.0; 10]; 255 | v[l as usize] = 1.0; 256 | v 257 | }) 258 | .collect(); 259 | 260 | let iterations = 100; // NOTE: cannot run this for 350 iterations because of slower matrix multiplication. 
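    // Same training loop as three_layer_mnist(); additionally, every 10 iterations the
    // current weights are run over the 1000 held-out test images (forward pass only, no
    // weight updates) so train and test accuracy/error can be compared side by side.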
261 | let progress = ProgressBar::new(iterations as u64); 262 | progress.set_style( 263 | ProgressStyle::default_bar() 264 | .template("{msg} {bar:40.cyan/blue} {pos:>7}/{len:7} [{elapsed_precise}]"), 265 | ); 266 | 267 | let mut weights_0_1 = Matrix::new( 268 | 784, 269 | hidden_size, 270 | generate_random_vector(784 * hidden_size, 0.2, -0.1, &Standard), 271 | ); 272 | let mut weights_1_2 = Matrix::new( 273 | hidden_size, 274 | 10, 275 | generate_random_vector(hidden_size * 10, 0.2, -0.1, &Standard), 276 | ); 277 | 278 | // Training 279 | 280 | for it in 0..iterations { 281 | let mut total_error = 0.0; 282 | let mut accuracy = 0.0; 283 | 284 | for (image, label) in images.iter().zip(labels.iter()) { 285 | let image = unsafe { MatrixSlice::from_raw_parts(image.as_ptr(), 1, 784, 1) }; 286 | 287 | let mut hidden_layer = (&image).mul(&weights_0_1); 288 | for j in 0..hidden_size { 289 | if hidden_layer[[0, j]] < 0.0 { 290 | hidden_layer[[0, j]] = 0.0; 291 | } 292 | } 293 | 294 | let output = (&hidden_layer).mul(&weights_1_2); 295 | 296 | accuracy += if argmax(&label) == argmax(output.data()) { 297 | 1.0 298 | } else { 299 | 0.0 300 | }; 301 | 302 | let error: f64 = output 303 | .data() 304 | .iter() 305 | .zip(label.iter()) 306 | .map(|(p, t)| (p - t).powi(2)) 307 | .sum(); 308 | 309 | total_error += error; 310 | 311 | let delta_2_1 = output - Matrix::new(1, 10, label.clone()); 312 | 313 | let mut relu_deriv = Matrix::new(1, hidden_size, vec![0.0; hidden_size]); 314 | for i in 0..hidden_size { 315 | if hidden_layer[[0, i]] >= 0.0 { 316 | relu_deriv[[0, i]] = 1.0; 317 | } 318 | } 319 | 320 | let delta_1_0 = (&delta_2_1) 321 | .mul(weights_1_2.transpose()) 322 | .elemul(&relu_deriv); 323 | 324 | let weight_delta_1_2 = hidden_layer.transpose().mul(delta_2_1); 325 | 326 | // avoid another clone of image 327 | let weight_delta_0_1 = image.transpose().mul(delta_1_0); 328 | 329 | for (i, x) in weights_0_1.mut_data().iter_mut().enumerate() { 330 | *x -= alpha * weight_delta_0_1.data()[i]; 331 | } 332 | 333 | for (i, x) in weights_1_2.mut_data().iter_mut().enumerate() { 334 | *x -= alpha * weight_delta_1_2.data()[i]; 335 | } 336 | } 337 | 338 | if (it + 1) % 10 == 0 { 339 | // Inference 340 | 341 | let mut total_test_error = 0.0; 342 | let mut test_accuracy = 0.0; 343 | 344 | for (image, label) in test_images.iter().zip(test_labels.iter()) { 345 | let image = unsafe { MatrixSlice::from_raw_parts(image.as_ptr(), 1, 784, 1) }; 346 | 347 | let mut hidden_layer = image.mul(&weights_0_1); 348 | 349 | // relu 350 | for j in 0..hidden_size { 351 | if hidden_layer[[0, j]] < 0.0 { 352 | hidden_layer[[0, j]] = 0.0; 353 | } 354 | } 355 | 356 | let output = hidden_layer.mul(&weights_1_2); 357 | 358 | test_accuracy += if argmax(&label) == argmax(output.data()) { 359 | 1.0 360 | } else { 361 | 0.0 362 | }; 363 | 364 | let error: f64 = output 365 | .iter() 366 | .zip(label.iter()) 367 | .map(|(p, t)| (p - t).powi(2)) 368 | .sum(); 369 | 370 | total_test_error += error; 371 | } 372 | 373 | progress.println(format!( 374 | "Iteration: {}, Train Accuracy: {}, Train Error: {}, Test Accuracy: {}, Test Error: {}", 375 | it + 1, 376 | accuracy / (dataset_size as f64), 377 | total_error / (dataset_size as f64), 378 | test_accuracy / (test_dataset_size as f64), 379 | total_test_error / (test_dataset_size as f64), 380 | )); 381 | } 382 | 383 | progress.inc(1); 384 | progress.set_message(&format!( 385 | "Train Accuracy: {}, Train Error: {}", 386 | accuracy / (dataset_size as f64), 387 | total_error / (dataset_size as f64) 388 | 
)); 389 | } 390 | 391 | Ok(()) 392 | } 393 | 394 | fn three_layer_mnist_with_validation_and_dropout( 395 | keep_probability: f64, 396 | ) -> Result<(), Box> { 397 | let dataset_size = 1000; // 1000 in notebook with numpy 398 | let test_dataset_size = 1000; 399 | 400 | let (train_data, test_data) = mnist()?; 401 | 402 | let (images, labels): (Vec<_>, Vec<_>) = train_data.take(dataset_size).unzip(); 403 | 404 | let images: Vec> = images 405 | .iter() 406 | .map(|img| img.iter().map(|v| f64::from(*v) / 255.0).collect()) 407 | .collect(); 408 | 409 | let labels: Vec> = labels 410 | .iter() 411 | .map(|l| { 412 | let mut v = vec![0.0; 10]; 413 | v[*l as usize] = 1.0; 414 | v 415 | }) 416 | .collect(); 417 | 418 | let (alpha, hidden_size) = (0.005, 40); 419 | 420 | let (test_images, test_labels): (Vec<_>, Vec<_>) = test_data.take(test_dataset_size).unzip(); 421 | 422 | let test_images: Vec> = test_images 423 | .into_iter() 424 | .map(|img| img.into_iter().map(|v| f64::from(v) / 255.0).collect()) 425 | .collect(); 426 | 427 | let test_labels: Vec> = test_labels 428 | .into_iter() 429 | .map(|l| { 430 | let mut v = vec![0.0; 10]; 431 | v[l as usize] = 1.0; 432 | v 433 | }) 434 | .collect(); 435 | 436 | let iterations = 100; // NOTE: cannot run this for 350 iterations because of slower matrix multiplication. 437 | let progress = ProgressBar::new(iterations as u64); 438 | progress.set_style( 439 | ProgressStyle::default_bar() 440 | .template("{msg} {bar:40.cyan/blue} {pos:>7}/{len:7} [{elapsed_precise}]"), 441 | ); 442 | 443 | let mut weights_0_1 = Matrix::new( 444 | 784, 445 | hidden_size, 446 | generate_random_vector(784 * hidden_size, 0.2, -0.1, &Standard), 447 | ); 448 | let mut weights_1_2 = Matrix::new( 449 | hidden_size, 450 | 10, 451 | generate_random_vector(hidden_size * 10, 0.2, -0.1, &Standard), 452 | ); 453 | 454 | // Training 455 | 456 | for it in 0..iterations { 457 | let mut total_error = 0.0; 458 | let mut accuracy = 0.0; 459 | 460 | for (image, label) in images.iter().zip(labels.iter()) { 461 | let image = unsafe { MatrixSlice::from_raw_parts(image.as_ptr(), 1, 784, 1) }; 462 | 463 | let mut hidden_layer = (&image).mul(&weights_0_1); 464 | for j in 0..hidden_size { 465 | if hidden_layer[[0, j]] < 0.0 { 466 | hidden_layer[[0, j]] = 0.0; 467 | } 468 | } 469 | 470 | let dropout_mask_data: Vec = 471 | sample_bernoulli_trials(keep_probability, hidden_size); 472 | 473 | let dropout_mask = Matrix::new(1, hidden_size, dropout_mask_data); 474 | 475 | for j in 0..hidden_size { 476 | hidden_layer[[0, j]] *= dropout_mask[[0, j]] * (1.0 / keep_probability); 477 | } 478 | 479 | let output = (&hidden_layer).mul(&weights_1_2); 480 | 481 | accuracy += if argmax(&label) == argmax(output.data()) { 482 | 1.0 483 | } else { 484 | 0.0 485 | }; 486 | 487 | let error: f64 = output 488 | .data() 489 | .iter() 490 | .zip(label.iter()) 491 | .map(|(p, t)| (p - t).powi(2)) 492 | .sum(); 493 | 494 | total_error += error; 495 | 496 | let delta_2_1 = output - Matrix::new(1, 10, label.clone()); 497 | 498 | let mut relu_deriv = Matrix::new(1, hidden_size, vec![0.0; hidden_size]); 499 | for i in 0..hidden_size { 500 | if hidden_layer[[0, i]] >= 0.0 { 501 | relu_deriv[[0, i]] = 1.0; 502 | } 503 | } 504 | 505 | let mut delta_1_0 = (&delta_2_1) 506 | .mul(weights_1_2.transpose()) 507 | .elemul(&relu_deriv); 508 | 509 | for j in 0..hidden_size { 510 | delta_1_0[[0, j]] *= dropout_mask[[0, j]] * (1.0 / keep_probability); 511 | } 512 | 513 | let weight_delta_1_2 = hidden_layer.transpose().mul(delta_2_1); 514 | 515 | // 
avoid another clone of image 516 | let weight_delta_0_1 = image.transpose().mul(delta_1_0); 517 | 518 | for (i, x) in weights_0_1.mut_data().iter_mut().enumerate() { 519 | *x -= alpha * weight_delta_0_1.data()[i]; 520 | } 521 | 522 | for (i, x) in weights_1_2.mut_data().iter_mut().enumerate() { 523 | *x -= alpha * weight_delta_1_2.data()[i]; 524 | } 525 | } 526 | 527 | progress.inc(1); 528 | progress.set_message(&format!( 529 | "Train Accuracy: {}, Train Error: {}", 530 | accuracy / (dataset_size as f64), 531 | total_error / (dataset_size as f64) 532 | )); 533 | 534 | if (it + 1) % 10 == 0 { 535 | // Inference 536 | 537 | let mut total_test_error = 0.0; 538 | let mut test_accuracy = 0.0; 539 | 540 | for (image, label) in test_images.iter().zip(test_labels.iter()) { 541 | let image = unsafe { MatrixSlice::from_raw_parts(image.as_ptr(), 1, 784, 1) }; 542 | 543 | let mut hidden_layer = image.mul(&weights_0_1); 544 | 545 | // relu 546 | for j in 0..hidden_size { 547 | if hidden_layer[[0, j]] < 0.0 { 548 | hidden_layer[[0, j]] = 0.0; 549 | } 550 | } 551 | 552 | let output = hidden_layer.mul(&weights_1_2); 553 | 554 | test_accuracy += if argmax(&label) == argmax(output.data()) { 555 | 1.0 556 | } else { 557 | 0.0 558 | }; 559 | 560 | let error: f64 = output 561 | .iter() 562 | .zip(label.iter()) 563 | .map(|(p, t)| (p - t).powi(2)) 564 | .sum(); 565 | 566 | total_test_error += error; 567 | } 568 | 569 | progress.println(format!( 570 | "Iteration: {}, Train Accuracy: {}, Train Error: {}, Test Accuracy: {}, Test Error: {}", 571 | it + 1, 572 | accuracy / (dataset_size as f64), 573 | total_error / (dataset_size as f64), 574 | test_accuracy / (test_dataset_size as f64), 575 | total_test_error / (test_dataset_size as f64), 576 | )); 577 | } 578 | } 579 | 580 | progress.finish_and_clear(); 581 | 582 | Ok(()) 583 | } 584 | 585 | fn batched_gradient_descent_with_dropout(keep_probability: f64) -> Result<(), Box> { 586 | let dataset_size = 1000; // 1000 in notebook with numpy 587 | let test_dataset_size = 1000; 588 | 589 | let batch_size = 100; 590 | 591 | let (train_data, test_data) = mnist()?; 592 | 593 | let (images, labels) = process_mnist_batch_dataset(train_data, dataset_size, batch_size); 594 | let (test_images, test_labels) = 595 | process_mnist_batch_dataset(test_data, test_dataset_size, batch_size); 596 | 597 | let (alpha, hidden_size) = (0.001, 40); 598 | 599 | let iterations = 100; // NOTE: cannot run this for 350 iterations because of slower matrix multiplication. 
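    // Batched variant: each update now processes a batch of 100 examples as a single
    // (100 x 784) matrix multiply instead of one row at a time. Dropout is the "inverted"
    // form: hidden activations are multiplied by a Bernoulli(keep_probability) mask and
    // scaled by 1 / keep_probability during training, so no rescaling is needed at test
    // time; the same mask and scale are applied to delta_1_0 on the way back so dropped
    // units receive no gradient.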
600 | let progress = ProgressBar::new(iterations as u64); 601 | progress.set_style( 602 | ProgressStyle::default_bar() 603 | .template("{msg} {bar:40.cyan/blue} {pos:>7}/{len:7} [{elapsed_precise}]"), 604 | ); 605 | 606 | let mut weights_0_1 = Matrix::new( 607 | 784, 608 | hidden_size, 609 | generate_random_vector(784 * hidden_size, 0.2, -0.1, &Standard), 610 | ); 611 | let mut weights_1_2 = Matrix::new( 612 | hidden_size, 613 | 10, 614 | generate_random_vector(hidden_size * 10, 0.2, -0.1, &Standard), 615 | ); 616 | 617 | // Training 618 | 619 | for it in 0..iterations { 620 | let mut total_error = 0.0; 621 | let mut accuracy = 0.0; 622 | 623 | for (image, label) in images.iter().zip(labels.iter()) { 624 | let image = 625 | unsafe { MatrixSlice::from_raw_parts(image.as_ptr(), batch_size, 784, 784) }; 626 | let label = unsafe { MatrixSlice::from_raw_parts(label.as_ptr(), batch_size, 10, 10) }; 627 | 628 | let mut hidden_layer = (&image).mul(&weights_0_1); 629 | relu_mut(&mut hidden_layer); 630 | 631 | let dropout_mask_data: Vec = 632 | sample_bernoulli_trials(keep_probability, batch_size * hidden_size); 633 | 634 | let dropout_mask = Matrix::new(batch_size, hidden_size, dropout_mask_data); 635 | 636 | for i in 0..batch_size { 637 | for j in 0..hidden_size { 638 | hidden_layer[[i, j]] *= dropout_mask[[i, j]] * (1.0 / keep_probability); 639 | } 640 | } 641 | 642 | let outputs = (&hidden_layer).mul(&weights_1_2); 643 | 644 | for (output, l) in outputs.row_iter().zip(label.row_iter()) { 645 | if argmax(output.raw_slice()) == argmax(l.raw_slice()) { 646 | accuracy += 1.0; 647 | } 648 | } 649 | 650 | for (output, l) in outputs.row_iter().zip(label.row_iter()) { 651 | let err: f64 = output 652 | .raw_slice() 653 | .iter() 654 | .zip(l.raw_slice().iter()) 655 | .map(|(p, t)| (p - t).powi(2)) 656 | .sum(); 657 | total_error += err; 658 | } 659 | 660 | let mut delta_2_1 = Matrix::new(batch_size, 10, vec![0.0; batch_size * 10]); 661 | for i in 0..batch_size { 662 | for j in 0..10 { 663 | delta_2_1[[i, j]] = outputs[[i, j]] - label[[i, j]]; 664 | } 665 | } 666 | 667 | let relu_deriv = relu_derivative(&hidden_layer); 668 | 669 | let mut delta_1_0 = (&delta_2_1) 670 | .mul(weights_1_2.transpose()) 671 | .elemul(&relu_deriv); 672 | 673 | for i in 0..batch_size { 674 | for j in 0..hidden_size { 675 | delta_1_0[[i, j]] *= dropout_mask[[i, j]] * (1.0 / keep_probability); 676 | } 677 | } 678 | 679 | let weight_delta_1_2 = hidden_layer.transpose().mul(delta_2_1); 680 | let weight_delta_0_1 = image.transpose().mul(delta_1_0); 681 | 682 | for (i, x) in weights_0_1.mut_data().iter_mut().enumerate() { 683 | *x -= alpha * weight_delta_0_1.data()[i]; 684 | } 685 | 686 | for (i, x) in weights_1_2.mut_data().iter_mut().enumerate() { 687 | *x -= alpha * weight_delta_1_2.data()[i]; 688 | } 689 | } 690 | 691 | progress.inc(1); 692 | progress.set_message(&format!( 693 | "Train Accuracy: {}, Train Error: {}", 694 | accuracy / (dataset_size as f64), 695 | total_error / (dataset_size as f64) 696 | )); 697 | 698 | if (it + 1) % 10 == 0 { 699 | // Inference 700 | 701 | let mut total_test_error = 0.0; 702 | let mut test_accuracy = 0.0; 703 | 704 | for (image, label) in test_images.iter().zip(test_labels.iter()) { 705 | let image = 706 | unsafe { MatrixSlice::from_raw_parts(image.as_ptr(), batch_size, 784, 784) }; 707 | let label = 708 | unsafe { MatrixSlice::from_raw_parts(label.as_ptr(), batch_size, 10, 10) }; 709 | 710 | let mut hidden_layer = image.mul(&weights_0_1); 711 | for i in 0..batch_size { 712 | for j in 
0..hidden_size { 713 | if hidden_layer[[i, j]] < 0.0 { 714 | hidden_layer[[i, j]] = 0.0; 715 | } 716 | } 717 | } 718 | 719 | let outputs = hidden_layer.mul(&weights_1_2); 720 | 721 | for (output, l) in outputs.row_iter().zip(label.row_iter()) { 722 | if argmax(output.raw_slice()) == argmax(l.raw_slice()) { 723 | test_accuracy += 1.0; 724 | } 725 | } 726 | 727 | for (output, l) in outputs.row_iter().zip(label.row_iter()) { 728 | let err: f64 = output 729 | .raw_slice() 730 | .iter() 731 | .zip(l.raw_slice().iter()) 732 | .map(|(p, t)| (p - t).powi(2)) 733 | .sum(); 734 | 735 | total_test_error += err; 736 | } 737 | } 738 | 739 | progress.println(format!( 740 | "Iteration: {}, Train Accuracy: {}, Train Error: {}, Test Accuracy: {}, Test Error: {}", 741 | it + 1, 742 | accuracy / (dataset_size as f64), 743 | total_error / (dataset_size as f64), 744 | test_accuracy / (test_dataset_size as f64), 745 | total_test_error / (test_dataset_size as f64), 746 | )); 747 | } 748 | } 749 | 750 | progress.finish_and_clear(); 751 | 752 | Ok(()) 753 | } 754 | -------------------------------------------------------------------------------- /examples/chapter9.rs: -------------------------------------------------------------------------------- 1 | //! Chapter9 - Intro to Activation Functions - Modeling Probabilities.ipynb 2 | //! 3 | //! https://github.com/iamtrask/Grokking-Deep-Learning/blob/master/Chapter9%20-%20Intro%20to%20Activation%20Functions%20-%20Modeling%20Probabilities.ipynb 4 | 5 | use std::error::Error; 6 | use std::ops::Mul; 7 | 8 | use datasets::image::mnist; 9 | use indicatif::{ProgressBar, ProgressStyle}; 10 | use rand::distributions::Standard; 11 | use rulinalg::matrix::{BaseMatrix, Matrix, MatrixSlice}; 12 | 13 | use grokking_deep_learning_rs::{ 14 | argmax, generate_random_vector, process_mnist_batch_dataset, sample_bernoulli_trials, 15 | softmax_mut, tanh_derivative, tanh_mut, 16 | }; 17 | 18 | fn main() { 19 | println!("\nUpgrading our MNIST Network\n"); 20 | mnist_tanh(0.5).unwrap(); 21 | } 22 | 23 | fn mnist_tanh(keep_probability: f64) -> Result<(), Box> { 24 | let (train_data, test_data) = mnist()?; 25 | 26 | let train_data_size = 1000; 27 | let test_data_size = 1000; 28 | let batch_size = 100; 29 | 30 | let (train_images, train_labels) = 31 | process_mnist_batch_dataset(train_data, train_data_size, batch_size); 32 | let (test_images, test_labels) = 33 | process_mnist_batch_dataset(test_data, test_data_size, batch_size); 34 | 35 | let (alpha, hidden_size) = (2.0, 100); 36 | 37 | let mut weights_0_1 = Matrix::new( 38 | 784, 39 | hidden_size, 40 | generate_random_vector(784 * hidden_size, 0.02, -0.01, &Standard), 41 | ); 42 | let mut weights_1_2 = Matrix::new( 43 | hidden_size, 44 | 10, 45 | generate_random_vector(hidden_size * 10, 0.2, -0.1, &Standard), 46 | ); 47 | 48 | let iterations = 100; 49 | let progress = ProgressBar::new(iterations as u64); 50 | progress.set_style( 51 | ProgressStyle::default_bar() 52 | .template("{msg} {bar:40.cyan/blue} {pos:>7}/{len:7} [{elapsed_precise}]"), 53 | ); 54 | 55 | for it in 0..iterations { 56 | let mut accuracy = 0.0; 57 | 58 | for (images, labels) in train_images.iter().zip(train_labels.iter()) { 59 | let images = 60 | unsafe { MatrixSlice::from_raw_parts(images.as_ptr(), batch_size, 784, 784) }; 61 | let labels = 62 | unsafe { MatrixSlice::from_raw_parts(labels.as_ptr(), batch_size, 10, 10) }; 63 | 64 | let mut hidden_layer = (&images).mul(&weights_0_1); 65 | tanh_mut(&mut hidden_layer); 66 | 67 | let dropout_mask = Matrix::new( 68 | batch_size, 
69 | hidden_size, 70 | sample_bernoulli_trials(keep_probability, batch_size * hidden_size), 71 | ); 72 | 73 | for i in 0..batch_size { 74 | for j in 0..hidden_size { 75 | hidden_layer[[i, j]] *= dropout_mask[[i, j]] * (1.0 / keep_probability); 76 | } 77 | } 78 | 79 | let mut outputs = (&hidden_layer).mul(&weights_1_2); 80 | softmax_mut(&mut outputs); 81 | 82 | for (r, l) in (&outputs).row_iter().zip(labels.row_iter()) { 83 | accuracy += if argmax(r.raw_slice()) == argmax(l.raw_slice()) { 84 | 1.0 85 | } else { 86 | 0.0 87 | } 88 | } 89 | 90 | // NOTE: no error calc here 91 | // just taking on faith that the derivative for the final layer = (value - true_value) / (batch_size^2) 92 | 93 | let mut delta_2_1 = Matrix::zeros(batch_size, 10); 94 | for i in 0..batch_size { 95 | for j in 0..10 { 96 | delta_2_1[[i, j]] = 97 | (outputs[[i, j]] - labels[[i, j]]) / ((batch_size * batch_size) as f64); 98 | } 99 | } 100 | 101 | let mut delta_1_0 = (&delta_2_1) 102 | .mul(weights_1_2.transpose()) 103 | .elemul(&tanh_derivative(&hidden_layer)); 104 | 105 | for i in 0..batch_size { 106 | for j in 0..hidden_size { 107 | delta_1_0[[i, j]] *= dropout_mask[[i, j]]; 108 | } 109 | } 110 | 111 | let weight_delta_1_2 = hidden_layer.transpose().mul(delta_2_1); 112 | let weight_delta_0_1 = images.transpose().mul(delta_1_0); 113 | 114 | for i in 0..hidden_size { 115 | for k in 0..10 { 116 | weights_1_2[[i, k]] -= alpha * weight_delta_1_2[[i, k]]; 117 | } 118 | } 119 | 120 | for i in 0..784 { 121 | for k in 0..hidden_size { 122 | weights_0_1[[i, k]] -= alpha * weight_delta_0_1[[i, k]]; 123 | } 124 | } 125 | } 126 | 127 | if (it + 1) % 10 == 0 { 128 | let mut test_accuracy = 0.0; 129 | 130 | for (images, labels) in test_images.iter().zip(test_labels.iter()) { 131 | let images = 132 | unsafe { MatrixSlice::from_raw_parts(images.as_ptr(), batch_size, 784, 784) }; 133 | let labels = 134 | unsafe { MatrixSlice::from_raw_parts(labels.as_ptr(), batch_size, 10, 10) }; 135 | 136 | let mut hidden_layer = images.mul(&weights_0_1); 137 | tanh_mut(&mut hidden_layer); 138 | 139 | let mut outputs = hidden_layer.mul(&weights_1_2); 140 | softmax_mut(&mut outputs); 141 | 142 | for (r, l) in (&outputs).row_iter().zip(labels.row_iter()) { 143 | test_accuracy += if argmax(r.raw_slice()) == argmax(l.raw_slice()) { 144 | 1.0 145 | } else { 146 | 0.0 147 | } 148 | } 149 | } 150 | 151 | progress.println(format!( 152 | "Iteration: {}, Train Accuracy: {}, Test Accuracy: {}", 153 | it + 1, 154 | accuracy / (train_data_size as f64), 155 | test_accuracy / (test_data_size as f64), 156 | )); 157 | } 158 | 159 | progress.inc(1); 160 | progress.set_message(&format!( 161 | "Train Accuracy: {}", 162 | accuracy / (train_data_size as f64), 163 | )); 164 | } 165 | 166 | progress.finish_and_clear(); 167 | 168 | Ok(()) 169 | } 170 | -------------------------------------------------------------------------------- /src/activations.rs: -------------------------------------------------------------------------------- 1 | //! This was extracted from the Chapter 13 exercises and moved into the core library so it could be used in later chapters. 
2 | 3 | use crate::layers::Layer; 4 | use crate::tensor::Tensor; 5 | 6 | #[derive(Debug)] 7 | pub struct Sigmoid; 8 | 9 | impl Layer for Sigmoid { 10 | fn forward(&self, inputs: &[&Tensor]) -> Vec { 11 | vec![inputs[0].sigmoid()] 12 | } 13 | } 14 | 15 | #[derive(Debug)] 16 | pub struct Tanh; 17 | 18 | impl Layer for Tanh { 19 | fn forward(&self, inputs: &[&Tensor]) -> Vec { 20 | vec![inputs[0].tanh()] 21 | } 22 | } 23 | 24 | #[derive(Debug)] 25 | pub struct Relu; 26 | 27 | impl Layer for Relu { 28 | fn forward(&self, inputs: &[&Tensor]) -> Vec { 29 | vec![inputs[0].relu()] 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/layers.rs: -------------------------------------------------------------------------------- 1 | //! This was extracted from the Chapter 13 exercises and moved into the core library so it could be used in later chapters. 2 | 3 | use std::fmt; 4 | use std::iter::FromIterator; 5 | 6 | use rand::distributions::Uniform; 7 | use rulinalg::matrix::{BaseMatrix, Matrix}; 8 | use std::rc::Rc; 9 | 10 | use crate::generate_random_vector; 11 | use crate::tensor::{Dot, Expand, Tensor}; 12 | 13 | pub trait Layer { 14 | fn forward(&self, inputs: &[&Tensor]) -> Vec; 15 | 16 | fn parameters(&self) -> Vec<&Tensor> { 17 | vec![] 18 | } 19 | } 20 | 21 | #[derive(Debug)] 22 | pub struct Linear { 23 | weights: Tensor, 24 | bias: Option, 25 | } 26 | 27 | impl Linear { 28 | pub fn new(n_inputs: usize, n_outputs: usize, bias: bool) -> Linear { 29 | let distribution = Uniform::new(0.0, 1.0); 30 | 31 | let weights = Tensor::new_const(Matrix::new( 32 | n_inputs, 33 | n_outputs, 34 | generate_random_vector(n_inputs * n_outputs, 0.5, 0.0, &distribution), 35 | )); 36 | 37 | let bias = if bias { 38 | Some(Tensor::new_const(Matrix::zeros(1, n_outputs))) 39 | } else { 40 | None 41 | }; 42 | 43 | Linear { weights, bias } 44 | } 45 | } 46 | 47 | impl Layer for Linear { 48 | fn forward(&self, inputs: &[&Tensor]) -> Vec { 49 | let rows = inputs[0].0.borrow().data.rows(); 50 | match &self.bias { 51 | None => vec![inputs[0].dot(&self.weights)], 52 | Some(bias) => vec![&inputs[0].dot(&self.weights) + &bias.expand(0, rows)], 53 | } 54 | } 55 | 56 | fn parameters(&self) -> Vec<&Tensor> { 57 | match &self.bias { 58 | None => vec![&self.weights], 59 | Some(bias) => vec![&self.weights, bias], 60 | } 61 | } 62 | } 63 | 64 | pub struct Sequential { 65 | layers: Vec>, 66 | } 67 | 68 | impl fmt::Debug for Sequential { 69 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 70 | write!(f, "Sequential {{ }}") 71 | } 72 | } 73 | 74 | impl Sequential { 75 | pub fn new(layers: Vec>) -> Self { 76 | Sequential { layers } 77 | } 78 | 79 | #[allow(dead_code)] 80 | fn add(&mut self, layer: Box) { 81 | self.layers.push(layer); 82 | } 83 | } 84 | 85 | impl Layer for Sequential { 86 | fn forward(&self, inputs: &[&Tensor]) -> Vec { 87 | // TODO: can this be avoided 88 | let mut input = Tensor(Rc::clone(&inputs[0].0)); 89 | 90 | for layer in self.layers.iter() { 91 | input = layer.forward(&[&input]).remove(0); 92 | } 93 | 94 | vec![input] 95 | } 96 | 97 | fn parameters(&self) -> Vec<&Tensor> { 98 | self.layers 99 | .iter() 100 | .map(|l| l.parameters()) 101 | .flat_map(|v| v.into_iter()) 102 | .collect() 103 | } 104 | } 105 | 106 | #[derive(Debug)] 107 | pub struct Embedding { 108 | pub weights: Tensor, 109 | } 110 | 111 | impl Embedding { 112 | pub fn new(vocab_size: usize, embedding_size: usize) -> Embedding { 113 | let distribution = Uniform::new(0.0, 1.0); 114 | Embedding { 
115 | weights: Tensor::new_const(Matrix::new( 116 | vocab_size, 117 | embedding_size, 118 | generate_random_vector( 119 | vocab_size * embedding_size, 120 | 1.0 / (embedding_size as f64), 121 | -0.5 / (embedding_size as f64), 122 | &distribution, 123 | ), 124 | )), 125 | } 126 | } 127 | 128 | pub fn from_weights(weights: Matrix) -> Embedding { 129 | Embedding { 130 | weights: Tensor::new_const(weights), 131 | } 132 | } 133 | } 134 | 135 | impl Clone for Embedding { 136 | fn clone(&self) -> Embedding { 137 | Embedding { 138 | weights: Tensor::new_const(self.weights.0.borrow().data.clone()), 139 | } 140 | } 141 | } 142 | 143 | impl Layer for Embedding { 144 | fn forward(&self, inputs: &[&Tensor]) -> Vec { 145 | let data = Vec::from_iter( 146 | inputs[0] 147 | .0 148 | .borrow() 149 | .data 150 | .row(0) 151 | .raw_slice() 152 | .iter() 153 | .map(|v| (*v as usize)), 154 | ); 155 | 156 | vec![self.weights.index_select(data)] 157 | } 158 | 159 | fn parameters(&self) -> Vec<&Tensor> { 160 | vec![&self.weights] 161 | } 162 | } 163 | 164 | pub struct RNNCell { 165 | n_hidden: usize, 166 | w_ih: Linear, 167 | w_hh: Linear, 168 | w_ho: Linear, 169 | activation: Box, 170 | } 171 | 172 | impl fmt::Debug for RNNCell { 173 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 174 | write!( 175 | f, 176 | "RNNCell {{ n_hidden: {:?}, w_ih: {:?}, w_hh: {:?}, w_ho: {:?} }}", 177 | self.n_hidden, self.w_ih, self.w_hh, self.w_ho 178 | ) 179 | } 180 | } 181 | 182 | impl RNNCell { 183 | pub fn new( 184 | n_inputs: usize, 185 | n_hidden: usize, 186 | n_outputs: usize, 187 | activation: Box, 188 | ) -> RNNCell { 189 | let w_ih = Linear::new(n_inputs, n_hidden, true); 190 | let w_hh = Linear::new(n_hidden, n_hidden, true); 191 | let w_ho = Linear::new(n_hidden, n_outputs, true); 192 | 193 | RNNCell { 194 | n_hidden, 195 | w_ih, 196 | w_hh, 197 | w_ho, 198 | activation, 199 | } 200 | } 201 | 202 | pub fn create_start_state(&self, batch_size: usize) -> Tensor { 203 | Tensor::new_const(Matrix::zeros(batch_size, self.n_hidden)) 204 | } 205 | } 206 | 207 | impl Layer for RNNCell { 208 | fn forward(&self, inputs: &[&Tensor]) -> Vec { 209 | let (input, hidden) = (inputs[0], inputs[1]); 210 | 211 | let state_part = self.w_hh.forward(&[hidden]); 212 | let input_part = self.w_ih.forward(&[input]); 213 | 214 | let mut new_state = self 215 | .activation 216 | .forward(&[&(&input_part[0] + &state_part[0])]); 217 | let mut output = self.w_ho.forward(&[&new_state[0]]); 218 | 219 | vec![output.remove(0), new_state.remove(0)] 220 | } 221 | 222 | fn parameters(&self) -> Vec<&Tensor> { 223 | let mut ans = self.w_ih.parameters(); 224 | ans.append(&mut self.w_hh.parameters()); 225 | ans.append(&mut self.w_ho.parameters()); 226 | ans 227 | } 228 | } 229 | 230 | #[derive(Debug)] 231 | pub struct LSTMCell { 232 | xf: Linear, 233 | xi: Linear, 234 | xo: Linear, 235 | xc: Linear, 236 | 237 | hf: Linear, 238 | hi: Linear, 239 | ho: Linear, 240 | hc: Linear, 241 | 242 | w_ho: Linear, 243 | 244 | n_hidden: usize, 245 | } 246 | 247 | impl LSTMCell { 248 | pub fn new(n_inputs: usize, n_hidden: usize, n_outputs: usize) -> LSTMCell { 249 | LSTMCell { 250 | xf: Linear::new(n_inputs, n_hidden, true), 251 | xi: Linear::new(n_inputs, n_hidden, true), 252 | xo: Linear::new(n_inputs, n_hidden, true), 253 | xc: Linear::new(n_inputs, n_hidden, true), 254 | 255 | hf: Linear::new(n_hidden, n_hidden, false), 256 | hi: Linear::new(n_hidden, n_hidden, false), 257 | ho: Linear::new(n_hidden, n_hidden, false), 258 | hc: Linear::new(n_hidden, n_hidden, 
false),
259 | 
260 |             w_ho: Linear::new(n_hidden, n_outputs, false),
261 | 
262 |             n_hidden,
263 |         }
264 |     }
265 | 
266 |     pub fn create_start_state(&self, batch_size: usize) -> (Tensor, Tensor) {
267 |         let mut h = Matrix::zeros(batch_size, self.n_hidden);
268 |         let mut c = Matrix::zeros(batch_size, self.n_hidden);
269 | 
270 |         for i in 0..batch_size {
271 |             h[[i, 0]] = 1.0;
272 |             c[[i, 0]] = 1.0;
273 |         }
274 | 
275 |         (Tensor::new_const(h), Tensor::new_const(c))
276 |     }
277 | }
278 | 
279 | impl Layer for LSTMCell {
280 |     #[allow(clippy::many_single_char_names)]
281 |     fn forward(&self, inputs: &[&Tensor]) -> Vec<Tensor> {
282 |         let (input, prev_hidden, prev_cell) = (inputs[0], inputs[1], inputs[2]);
283 | 
284 |         let f = (&self.xf.forward(&[input])[0] + &self.hf.forward(&[prev_hidden])[0]).sigmoid();
285 |         let i = (&self.xi.forward(&[input])[0] + &self.hi.forward(&[prev_hidden])[0]).sigmoid();
286 |         let o = (&self.xo.forward(&[input])[0] + &self.ho.forward(&[prev_hidden])[0]).sigmoid();
287 | 
288 |         let g = (&self.xc.forward(&[input])[0] + &self.hc.forward(&[prev_hidden])[0]).tanh();
289 | 
290 |         let c = &(&f * prev_cell) + &(&i * &g);
291 |         let h = &o * &c.tanh();
292 | 
293 |         let output = self.w_ho.forward(&[&h]).remove(0);
294 | 
295 |         vec![output, h, c]
296 |     }
297 | 
298 |     fn parameters(&self) -> Vec<&Tensor> {
299 |         self.xf
300 |             .parameters()
301 |             .into_iter()
302 |             .chain(self.xi.parameters().into_iter())
303 |             .chain(self.xo.parameters().into_iter())
304 |             .chain(self.xc.parameters().into_iter())
305 |             .chain(self.hf.parameters().into_iter())
306 |             .chain(self.hi.parameters().into_iter())
307 |             .chain(self.ho.parameters().into_iter())
308 |             .chain(self.hc.parameters().into_iter())
309 |             .chain(self.w_ho.parameters().into_iter())
310 |             .collect()
311 |     }
312 | }
313 | 
--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
1 | #![deny(missing_debug_implementations)]
2 | 
3 | use datasets::Dataset;
4 | use rand::distributions::{Bernoulli, Distribution};
5 | use rand::{thread_rng, Rng};
6 | use rulinalg::matrix::{BaseMatrix, BaseMatrixMut, Matrix as RulinalgMatrix};
7 | 
8 | pub mod activations;
9 | pub mod layers;
10 | pub mod losses;
11 | pub mod optimizers;
12 | pub mod tensor;
13 | 
14 | pub type Vector = Vec<f64>;
15 | pub type Matrix = Vec<Vec<f64>>;
16 | 
17 | #[allow(clippy::ptr_arg)]
18 | pub fn elementwise_multiplication(vec_a: &Vector, vec_b: &Vector) -> Vector {
19 |     vec_a.iter().zip(vec_b.iter()).map(|(a, b)| a * b).collect()
20 | }
21 | 
22 | pub fn argmax(vec: &[f64]) -> usize {
23 |     let mut max = vec[0];
24 |     let mut ans = 0;
25 | 
26 |     for (i, x) in vec.iter().enumerate().skip(1) {
27 |         if x > &max {
28 |             max = *x;
29 |             ans = i;
30 |         }
31 |     }
32 | 
33 |     ans
34 | }
35 | 
36 | pub fn vector_sum(vec: Vector) -> f64 {
37 |     vec.iter().sum()
38 | }
39 | 
40 | #[allow(clippy::ptr_arg)]
41 | pub fn dot(vec_a: &Vector, vec_b: &Vector) -> f64 {
42 |     vec_a.iter().zip(vec_b.iter()).map(|(a, b)| a * b).sum()
43 | }
44 | 
45 | #[allow(clippy::ptr_arg)]
46 | pub fn elementwise_scalar_multiplication(vec: &Vector, n: f64) -> Vector {
47 |     vec.iter().map(|x| x * n).collect()
48 | }
49 | 
50 | #[allow(clippy::ptr_arg)]
51 | pub fn elementwise_addition(vec_a: &Vector, vec_b: &Vector) -> Vector {
52 |     vec_a.iter().zip(vec_b.iter()).map(|(a, b)| a + b).collect()
53 | }
54 | 
55 | #[allow(clippy::ptr_arg)]
56 | pub fn vector_average(vec: &Vector) -> f64 {
57 |     let len = vec.len() as f64;
58 |     vec.iter().sum::<f64>() / len
59 | }
60 | 
61 | #[allow(clippy::ptr_arg)]
62 | pub fn vector_vector_subtraction(v1: &Vector, v2: &Vector) -> Vector {
63 |     v1.iter().zip(v2.iter()).map(|(a, b)| a - b).collect()
64 | }
65 | 
66 | #[allow(clippy::ptr_arg)]
67 | pub fn vector_vector_multiplication(v1: &Vector, v2: &Vector) -> Vector {
68 |     v1.iter().zip(v2.iter()).map(|(a, b)| a * b).collect()
69 | }
70 | 
71 | #[allow(clippy::ptr_arg)]
72 | pub fn vector_vector_dot(vec1: &Vector, vec2: &Vector) -> Matrix {
73 |     vec1.iter()
74 |         .map(|i| vec2.iter().map(|j| i * j).collect())
75 |         .collect()
76 | }
77 | 
78 | #[allow(clippy::ptr_arg)]
79 | pub fn vector_matrix_dot(vec: &Vector, mat: &Matrix) -> Vector {
80 |     matrix_vector_dot(&transpose(mat), vec)
81 | }
82 | 
83 | #[allow(clippy::ptr_arg)]
84 | pub fn matrix_vector_dot(mat: &Matrix, vec: &Vector) -> Vector {
85 |     mat.iter().map(|w| dot(w, vec)).collect()
86 | }
87 | 
88 | #[allow(clippy::ptr_arg)]
89 | pub fn matrix_matrix_subtraction(mat1: &Matrix, mat2: &Matrix) -> Matrix {
90 |     mat1.iter()
91 |         .zip(mat2.iter())
92 |         .map(|(v1, v2)| vector_vector_subtraction(v1, v2))
93 |         .collect()
94 | }
95 | 
96 | #[allow(clippy::ptr_arg)]
97 | pub fn matrix_matrix_multiplication(mat1: &Matrix, mat2: &Matrix) -> Matrix {
98 |     mat1.iter()
99 |         .zip(mat2.iter())
100 |         .map(|(v1, v2)| vector_vector_multiplication(v1, v2))
101 |         .collect()
102 | }
103 | 
104 | #[allow(clippy::ptr_arg, clippy::needless_range_loop)]
105 | pub fn matrix_matrix_dot(mat1: &Matrix, mat2: &Matrix) -> Matrix {
106 |     assert_eq!(mat1[0].len(), mat2.len());
107 | 
108 |     let mut ans = vec![vec![0.0; mat2[0].len()]; mat1.len()];
109 | 
110 |     for i in 0..mat1.len() {
111 |         for j in 0..mat2[0].len() {
112 |             for k in 0..mat2.len() {
113 |                 ans[i][j] += mat1[i][k] * mat2[k][j];
114 |             }
115 |         }
116 |     }
117 | 
118 |     ans
119 | }
120 | 
121 | pub fn relu_vector(v: Vector) -> Vector {
122 |     v.into_iter()
123 |         .map(|a| if a > 0.0 { a } else { 0.0 })
124 |         .collect()
125 | }
126 | 
127 | pub fn relu_vector_derivative(v: Vector) -> Vector {
128 |     v.into_iter()
129 |         .map(|a| if a > 0.0 { 1.0 } else { 0.0 })
130 |         .collect()
131 | }
132 | 
133 | pub fn relu_matrix(m: Matrix) -> Matrix {
134 |     m.into_iter().map(relu_vector).collect()
135 | }
136 | 
137 | pub fn relu_matrix_derivative(m: Matrix) -> Matrix {
138 |     m.into_iter().map(relu_vector_derivative).collect()
139 | }
140 | 
141 | #[allow(clippy::ptr_arg, clippy::needless_range_loop)]
142 | pub fn transpose(m: &Matrix) -> Matrix {
143 |     let mut ans = vec![vec![0.0; m.len()]; m[0].len()];
144 | 
145 |     for i in 0..m.len() {
146 |         for j in 0..m[0].len() {
147 |             ans[j][i] = m[i][j];
148 |         }
149 |     }
150 | 
151 |     ans
152 | }
153 | 
154 | pub fn generate_random_vector(
155 |     size: usize,
156 |     scale_factor: f64,
157 |     add_factor: f64,
158 |     dist: &impl Distribution<f64>,
159 | ) -> Vec<f64> {
160 |     let mut rng = thread_rng();
161 |     (0..size)
162 |         .map(|_| scale_factor * rng.sample(dist) + add_factor)
163 |         .collect()
164 | }
165 | 
166 | pub fn process_mnist_batch_dataset(
167 |     dataset: impl Dataset<Item = (Vec<u8>, u8)>,
168 |     dataset_size: usize,
169 |     batch_size: usize,
170 | ) -> (Vec<Vec<f64>>, Vec<Vec<f64>>) {
171 |     let normalize_image = |img: Vec<u8>| img.iter().map(|v| f64::from(*v) / 255.0).collect();
172 |     let encode_label = |l| {
173 |         let mut v = vec![0.0; 10];
174 |         v[l as usize] = 1.0;
175 |         v
176 |     };
177 | 
178 |     let (images, labels): (Vec<_>, Vec<_>) = dataset
179 |         .take(dataset_size)
180 |         .map(|(i, l)| (normalize_image(i), encode_label(l)))
181 |         .unzip();
182 | 
183 |     let images = images
184 |         .into_iter()
185 |         .batch(batch_size,
false) 186 | .map(|v| { 187 | v.into_iter() 188 | .fold(Vec::with_capacity(batch_size * 784), |mut acc, mut img| { 189 | acc.append(&mut img); 190 | acc 191 | }) 192 | }) 193 | .collect(); 194 | 195 | let labels = labels 196 | .into_iter() 197 | .batch(batch_size, false) 198 | .map(|v| { 199 | v.into_iter() 200 | .fold(Vec::with_capacity(batch_size * 10), |mut acc, mut l| { 201 | acc.append(&mut l); 202 | acc 203 | }) 204 | }) 205 | .collect(); 206 | 207 | (images, labels) 208 | } 209 | 210 | pub fn sample_bernoulli_trials(p: f64, length: usize) -> Vec { 211 | let dist = Bernoulli::new(p); 212 | thread_rng() 213 | .sample_iter(&dist) 214 | .take(length) 215 | .map(|v| if v { 1.0 } else { 0.0 }) 216 | .collect() 217 | } 218 | 219 | pub fn relu_mut(m: &mut RulinalgMatrix) { 220 | for x in m.iter_mut() { 221 | *x = if (*x) > 0.0 { *x } else { 0.0 }; 222 | } 223 | } 224 | 225 | pub fn relu_derivative(m: &RulinalgMatrix) -> RulinalgMatrix { 226 | let mut ans = RulinalgMatrix::zeros(m.rows(), m.cols()); 227 | for i in 0..m.rows() { 228 | for j in 0..m.cols() { 229 | if m[[i, j]] >= 0.0 { 230 | ans[[i, j]] = 1.0; 231 | } 232 | } 233 | } 234 | 235 | ans 236 | } 237 | 238 | pub fn sigmoid_mut(m: &mut RulinalgMatrix) { 239 | for x in m.iter_mut() { 240 | *x = 1.0 / (1.0 + (-(*x)).exp()); 241 | } 242 | } 243 | 244 | pub fn tanh_mut(m: &mut RulinalgMatrix) { 245 | for x in m.iter_mut() { 246 | *x = (*x).tanh(); 247 | } 248 | } 249 | 250 | pub fn tanh_derivative(m: &RulinalgMatrix) -> RulinalgMatrix { 251 | let mut ans = RulinalgMatrix::zeros(m.rows(), m.cols()); 252 | for i in 0..m.rows() { 253 | for j in 0..m.cols() { 254 | ans[[i, j]] = 1.0 - (m[[i, j]] * m[[i, j]]); 255 | } 256 | } 257 | ans 258 | } 259 | 260 | pub fn softmax_mut(m: &mut RulinalgMatrix) { 261 | for i in 0..m.rows() { 262 | let mut s = 0.0; 263 | 264 | for j in 0..m.cols() { 265 | m[[i, j]] = m[[i, j]].exp(); 266 | s += m[[i, j]]; 267 | } 268 | 269 | for j in 0..m.cols() { 270 | m[[i, j]] /= s; 271 | } 272 | } 273 | } 274 | 275 | #[cfg(test)] 276 | mod tests { 277 | use super::*; 278 | 279 | #[test] 280 | fn test_elementwise_multiplication() { 281 | assert_eq!( 282 | vec![6.0, 14.0, 24.0, 36.0, 0.0], 283 | elementwise_multiplication( 284 | &vec![1.0, 2.0, 3.0, 4.0, 5.0], 285 | &vec![6.0, 7.0, 8.0, 9.0, 0.0], 286 | ), 287 | ); 288 | } 289 | 290 | #[test] 291 | fn test_vector_sum() { 292 | assert_eq!(15.0, vector_sum(vec![1.0, 2.0, 3.0, 4.0, 5.0])); 293 | } 294 | 295 | #[test] 296 | fn test_elementwise_addition() { 297 | assert_eq!( 298 | vec![7.0, 9.0, 11.0, 13.0, 5.0], 299 | elementwise_addition( 300 | &vec![1.0, 2.0, 3.0, 4.0, 5.0], 301 | &vec![6.0, 7.0, 8.0, 9.0, 0.0], 302 | ), 303 | ) 304 | } 305 | 306 | #[test] 307 | fn test_vector_average() { 308 | assert_eq!(3.0, vector_average(&vec![1.0, 2.0, 3.0, 4.0, 5.0])); 309 | } 310 | 311 | #[test] 312 | fn test_dot() { 313 | assert_eq!( 314 | 80.0, 315 | dot( 316 | &vec![1.0, 2.0, 3.0, 4.0, 5.0], 317 | &vec![6.0, 7.0, 8.0, 9.0, 0.0], 318 | ), 319 | ); 320 | } 321 | 322 | #[test] 323 | fn test_elementwise_scalar_multiplication() { 324 | assert_eq!( 325 | vec![2.0, 4.0, 6.0, 8.0, 10.0], 326 | elementwise_scalar_multiplication(&vec![1.0, 2.0, 3.0, 4.0, 5.0], 2.0,) 327 | ) 328 | } 329 | 330 | #[test] 331 | fn test_matrix_vector_dot() { 332 | assert_eq!( 333 | vec![55.0, 45.0, 40.0, 40.0, 35.0], 334 | matrix_vector_dot( 335 | &vec![ 336 | vec![1.0, 2.0, 3.0, 4.0, 5.0], 337 | vec![2.0, 3.0, 4.0, 5.0, 1.0], 338 | vec![3.0, 4.0, 5.0, 1.0, 2.0], 339 | vec![4.0, 5.0, 1.0, 2.0, 3.0], 340 | 
vec![5.0, 4.0, 3.0, 2.0, 1.0], 341 | ], 342 | &vec![1.0, 2.0, 3.0, 4.0, 5.0], 343 | ), 344 | ); 345 | } 346 | 347 | #[test] 348 | fn test_relu_vector() { 349 | assert_eq!( 350 | vec![1.0, 0.0, 2.0, 0.0, 4.0], 351 | relu_vector(vec![1.0, -1.0, 2.0, -2.0, 4.0]), 352 | ); 353 | } 354 | } 355 | -------------------------------------------------------------------------------- /src/losses.rs: -------------------------------------------------------------------------------- 1 | //! This was extracted from the Chapter 13 exercises and moved into the core library so it could be used in later chapters. 2 | 3 | use crate::tensor::{Sum, Tensor}; 4 | 5 | pub trait Loss { 6 | fn forward(&self, pred: &Tensor, target: &Tensor) -> Tensor; 7 | } 8 | 9 | #[derive(Debug)] 10 | pub struct MSELoss; 11 | 12 | impl Loss for MSELoss { 13 | fn forward(&self, pred: &Tensor, target: &Tensor) -> Tensor { 14 | (&(pred - target) * &(pred - target)).sum(0) 15 | } 16 | } 17 | 18 | #[derive(Debug)] 19 | pub struct CrossEntropyLoss; 20 | 21 | impl Loss for CrossEntropyLoss { 22 | fn forward(&self, pred: &Tensor, target_indices: &Tensor) -> Tensor { 23 | pred.cross_entropy(target_indices) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/optimizers.rs: -------------------------------------------------------------------------------- 1 | //! This was extracted from the Chapter 13 exercises and moved into the core library so it could be used in later chapters. 2 | 3 | use rulinalg::matrix::BaseMatrix; 4 | 5 | use crate::tensor::Tensor; 6 | 7 | pub trait Optimizer { 8 | fn step(&self, zero: bool); 9 | } 10 | 11 | #[derive(Debug)] 12 | pub struct SGDOptimizer<'a> { 13 | parameters: Vec<&'a Tensor>, 14 | alpha: f64, 15 | } 16 | 17 | impl<'a> SGDOptimizer<'a> { 18 | pub fn new(parameters: Vec<&'a Tensor>, alpha: f64) -> SGDOptimizer { 19 | SGDOptimizer { parameters, alpha } 20 | } 21 | 22 | fn step_parameter(&self, parameter: &'a Tensor, zero: bool) { 23 | let mut w = parameter.0.borrow_mut(); 24 | let grad = w.grad.take(); 25 | 26 | if zero { 27 | w.grad = None; 28 | } 29 | 30 | let grad = grad.unwrap(); 31 | let grad = &grad.borrow().data; 32 | 33 | for i in 0..w.data.rows() { 34 | for j in 0..w.data.cols() { 35 | w.data[[i, j]] -= self.alpha * grad[[i, j]]; 36 | } 37 | } 38 | } 39 | } 40 | 41 | impl<'a> Optimizer for SGDOptimizer<'a> { 42 | fn step(&self, zero: bool) { 43 | for p in self.parameters.iter() { 44 | self.step_parameter(p, zero); 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/tensor.rs: -------------------------------------------------------------------------------- 1 | //! This was extracted from the Chapter 13 exercises and moved into the core library so it could be used in later chapters. 2 | 3 | use std::cell::RefCell; 4 | use std::collections::BTreeMap; 5 | use std::ops::{Add, Mul, Neg, Sub}; 6 | use std::rc::Rc; 7 | 8 | use rand::{thread_rng, RngCore}; 9 | use rulinalg::matrix::{BaseMatrix, Matrix}; 10 | 11 | pub type TensorRef = Rc>; 12 | 13 | #[derive(Debug, Clone)] 14 | pub enum Operation { 15 | Const, 16 | Add, 17 | Neg, 18 | Sub, 19 | Mul, 20 | Dot, 21 | Transpose, 22 | Sigmoid, 23 | Tanh, 24 | Relu, 25 | Sum(usize), 26 | Expand(usize), 27 | // This is not generic as implemented for python 28 | // and can only select indices on the 0th axis. Hence, only a vector. 
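    // The variant stores the selected row indices so that backward() can scatter the
    // incoming gradient back: row i of the output gradient is accumulated into row
    // indices[i] of the original tensor.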
29 |     IndexSelect(Vec<usize>),
30 |     CrossEntropy(Matrix<f64>, Matrix<f64>),
31 | }
32 | 
33 | #[derive(Debug)]
34 | pub struct TensorImpl {
35 |     id: u64,
36 |     pub data: Matrix<f64>,
37 |     pub grad: Option<TensorRef>,
38 |     creation_op: Operation,
39 |     creators: Option<Vec<TensorRef>>,
40 |     autograd: bool,
41 |     children: BTreeMap<u64, usize>,
42 | }
43 | 
44 | impl TensorImpl {
45 |     fn grad(data: Matrix<f64>) -> Self {
46 |         TensorImpl {
47 |             id: thread_rng().next_u64(),
48 |             data,
49 |             grad: None,
50 |             creation_op: Operation::Const,
51 |             creators: None,
52 |             autograd: false,
53 |             children: BTreeMap::new(),
54 |         }
55 |     }
56 | 
57 |     fn all_children_grads_accounted_for(&self) -> bool {
58 |         self.children.iter().all(|(_, c)| c == &0)
59 |     }
60 | 
61 |     #[allow(clippy::cyclomatic_complexity)]
62 |     fn backward(&mut self, grad: TensorRef, grad_from: Option<u64>) {
63 |         if self.autograd {
64 |             if let Some(grad_from) = &grad_from {
65 |                 if self.children[&grad_from] == 0 {
66 |                     panic!("Can only Backpropagate through a tensor once");
67 |                 } else {
68 |                     self.children
69 |                         .insert(*grad_from, self.children[grad_from] - 1);
70 |                 }
71 |             }
72 | 
73 |             self.grad = match self.grad.take() {
74 |                 None => Some(Rc::clone(&grad)),
75 |                 Some(current_grad) => {
76 |                     let new_grad_data = {
77 |                         let current_grad_data = &current_grad.borrow().data;
78 |                         let grad_data = &grad.borrow().data;
79 |                         current_grad_data + grad_data
80 |                     };
81 | 
82 |                     Some(Rc::new(RefCell::new(TensorImpl::grad(new_grad_data))))
83 |                 }
84 |             };
85 | 
86 |             if self.creators.is_some()
87 |                 && (self.all_children_grads_accounted_for() || grad_from.is_none())
88 |             {
89 |                 let grad = self.grad.as_ref().unwrap();
90 |                 let creators = self.creators.as_ref().unwrap();
91 | 
92 |                 match &self.creation_op {
93 |                     Operation::Add => {
94 |                         creators[0]
95 |                             .borrow_mut()
96 |                             .backward(Rc::clone(grad), Some(self.id));
97 |                         creators[1]
98 |                             .borrow_mut()
99 |                             .backward(Rc::clone(grad), Some(self.id));
100 |                     }
101 |                     Operation::Neg => {
102 |                         let data = &grad.borrow().data;
103 |                         let data_data: Vec<f64> = data.data().iter().map(|v| -v).collect();
104 |                         creators[0].borrow_mut().backward(
105 |                             Rc::new(RefCell::new(TensorImpl::grad(Matrix::new(
106 |                                 data.rows(),
107 |                                 data.cols(),
108 |                                 data_data,
109 |                             )))),
110 |                             Some(self.id),
111 |                         );
112 |                     }
113 |                     Operation::Sub => {
114 |                         creators[0]
115 |                             .borrow_mut()
116 |                             .backward(Rc::clone(grad), Some(self.id));
117 |                         {
118 |                             let data = &grad.borrow().data;
119 |                             creators[1].borrow_mut().backward(
120 |                                 Rc::new(RefCell::new(TensorImpl::grad(-data))),
121 |                                 Some(self.id),
122 |                             );
123 |                         }
124 |                     }
125 |                     Operation::Mul => {
126 |                         let grad = &grad.borrow().data;
127 | 
128 |                         let grad0 = {
129 |                             let grad0 = &creators[1].borrow().data;
130 |                             let grad0 = grad0.elemul(grad);
131 |                             Rc::new(RefCell::new(TensorImpl::grad(grad0)))
132 |                         };
133 | 
134 |                         let grad1 = {
135 |                             let grad1 = &creators[0].borrow().data;
136 |                             let grad1 = grad1.elemul(grad);
137 |                             Rc::new(RefCell::new(TensorImpl::grad(grad1)))
138 |                         };
139 | 
140 |                         creators[0].borrow_mut().backward(grad0, Some(self.id));
141 |                         creators[1].borrow_mut().backward(grad1, Some(self.id));
142 |                     }
143 |                     Operation::Transpose => {
144 |                         let grad = &grad.borrow().data;
145 |                         let data = grad.transpose();
146 |                         creators[0]
147 |                             .borrow_mut()
148 |                             .backward(Rc::new(RefCell::new(TensorImpl::grad(data))), Some(self.id));
149 |                     }
150 |                     Operation::Dot => {
151 |                         let grad = &grad.borrow().data;
152 | 
153 |                         let act_delta = {
154 |                             let weights = &creators[1].borrow().data;
155 |                             grad.mul(weights.transpose())
156 |                         };
157 | 
158 |                         let weights_delta = {
159 |                             let act =
&creators[0].borrow().data; 160 | act.transpose().mul(grad) 161 | }; 162 | 163 | creators[0].borrow_mut().backward( 164 | Rc::new(RefCell::new(TensorImpl::grad(act_delta))), 165 | Some(self.id), 166 | ); 167 | 168 | creators[1].borrow_mut().backward( 169 | Rc::new(RefCell::new(TensorImpl::grad(weights_delta))), 170 | Some(self.id), 171 | ); 172 | } 173 | Operation::Sum(axis) => { 174 | let new_grad = { 175 | let data = &creators[0].borrow().data; 176 | let grad = &grad.borrow().data; 177 | let mut new_grad = Matrix::zeros(data.rows(), data.cols()); 178 | 179 | if axis == &0 { 180 | for i in 0..data.rows() { 181 | for j in 0..data.cols() { 182 | new_grad[[i, j]] = grad[[0, j]]; 183 | } 184 | } 185 | } else if axis == &1 { 186 | for i in 0..data.rows() { 187 | for j in 0..data.cols() { 188 | new_grad[[i, j]] = grad[[i, 0]]; 189 | } 190 | } 191 | } else { 192 | panic!("this is broken"); 193 | } 194 | 195 | new_grad 196 | }; 197 | 198 | creators[0].borrow_mut().backward( 199 | Rc::new(RefCell::new(TensorImpl::grad(new_grad))), 200 | Some(self.id), 201 | ); 202 | } 203 | Operation::Expand(dim) => { 204 | let new_grad = { 205 | let data = &creators[0].borrow().data; 206 | let grad = &grad.borrow().data; 207 | let mut new_grad = Matrix::zeros(data.rows(), data.cols()); 208 | 209 | if dim == &0 { 210 | for i in 0..grad.rows() { 211 | for j in 0..grad.cols() { 212 | new_grad[[0, j]] += grad[[i, j]]; 213 | } 214 | } 215 | } else { 216 | panic!("this is broken"); 217 | } 218 | 219 | new_grad 220 | }; 221 | 222 | creators[0].borrow_mut().backward( 223 | Rc::new(RefCell::new(TensorImpl::grad(new_grad))), 224 | Some(self.id), 225 | ); 226 | } 227 | Operation::Sigmoid => { 228 | let new_grad = { 229 | let data = &self.data; 230 | let grad = &grad.borrow().data; 231 | 232 | let mut new_grad = Matrix::zeros(grad.rows(), grad.cols()); 233 | for i in 0..grad.rows() { 234 | for j in 0..grad.cols() { 235 | new_grad[[i, j]] = 236 | grad[[i, j]] * (data[[i, j]] * (1.0 - data[[i, j]])); 237 | } 238 | } 239 | 240 | new_grad 241 | }; 242 | 243 | creators[0].borrow_mut().backward( 244 | Rc::new(RefCell::new(TensorImpl::grad(new_grad))), 245 | Some(self.id), 246 | ); 247 | } 248 | Operation::Tanh => { 249 | let new_grad = { 250 | let data = &self.data; 251 | let grad = &grad.borrow().data; 252 | 253 | let mut new_grad = Matrix::zeros(grad.rows(), grad.cols()); 254 | for i in 0..grad.rows() { 255 | for j in 0..grad.cols() { 256 | new_grad[[i, j]] = 257 | grad[[i, j]] * (1.0 - (data[[i, j]] * data[[i, j]])); 258 | } 259 | } 260 | 261 | new_grad 262 | }; 263 | 264 | creators[0].borrow_mut().backward( 265 | Rc::new(RefCell::new(TensorImpl::grad(new_grad))), 266 | Some(self.id), 267 | ); 268 | } 269 | Operation::Relu => { 270 | let new_grad = { 271 | let data = &self.data; 272 | let grad = &grad.borrow().data; 273 | 274 | let mut new_grad = Matrix::zeros(grad.rows(), grad.cols()); 275 | for i in 0..grad.rows() { 276 | for j in 0..grad.cols() { 277 | new_grad[[i, j]] = 278 | grad[[i, j]] * if data[[i, j]] > 0.0 { 1.0 } else { 0.0 }; 279 | } 280 | } 281 | 282 | new_grad 283 | }; 284 | 285 | creators[0].borrow_mut().backward( 286 | Rc::new(RefCell::new(TensorImpl::grad(new_grad))), 287 | Some(self.id), 288 | ); 289 | } 290 | Operation::IndexSelect(indices) => { 291 | let new_grad = { 292 | let data = &creators[0].borrow().data; 293 | let grad = &grad.borrow().data; 294 | 295 | let mut new_grad = Matrix::zeros(data.rows(), data.cols()); 296 | for (i, ix) in indices.iter().enumerate() { 297 | for j in 0..data.cols() { 298 | 
new_grad[[*ix, j]] += grad[[i, j]]; 299 | } 300 | } 301 | 302 | new_grad 303 | }; 304 | 305 | creators[0].borrow_mut().backward( 306 | Rc::new(RefCell::new(TensorImpl::grad(new_grad))), 307 | Some(self.id), 308 | ) 309 | } 310 | Operation::CrossEntropy(predictions, targets) => { 311 | creators[0].borrow_mut().backward( 312 | Rc::new(RefCell::new(TensorImpl::grad(predictions - targets))), 313 | Some(self.id), 314 | ) 315 | } 316 | Operation::Const => {} 317 | } 318 | } 319 | } 320 | } 321 | } 322 | 323 | /// Tensor implements "shallow" clones, primarily so that they can be put inside enum variants. 324 | #[derive(Debug)] 325 | pub struct Tensor(pub TensorRef); 326 | 327 | impl Clone for Tensor { 328 | fn clone(&self) -> Self { 329 | Tensor(Rc::clone(&self.0)) 330 | } 331 | } 332 | 333 | impl Tensor { 334 | pub fn new_const(data: Matrix) -> Self { 335 | Self::new(data, Operation::Const, None) 336 | } 337 | 338 | pub fn grad(data: Matrix) -> Self { 339 | let tensor_impl = TensorImpl::grad(data); 340 | Tensor(Rc::new(RefCell::new(tensor_impl))) 341 | } 342 | 343 | pub fn new( 344 | data: Matrix, 345 | creation_op: Operation, 346 | creators: Option>, 347 | ) -> Self { 348 | let tensor_impl = TensorImpl { 349 | id: thread_rng().next_u64(), 350 | data, 351 | grad: None, 352 | creation_op, 353 | creators, 354 | autograd: true, 355 | children: BTreeMap::new(), 356 | }; 357 | 358 | if let Some(creators) = &tensor_impl.creators { 359 | for c in creators.iter() { 360 | let children = &mut c.borrow_mut().children; 361 | let e = children.entry(tensor_impl.id).or_insert(0); 362 | *e += 1; 363 | } 364 | } 365 | 366 | Tensor(Rc::new(RefCell::new(tensor_impl))) 367 | } 368 | 369 | pub fn backward(&self, grad: Tensor) { 370 | self.0.borrow_mut().backward(grad.0, None); 371 | } 372 | 373 | /// higher order ops 374 | 375 | pub fn sigmoid(&self) -> Tensor { 376 | let result = { 377 | let data = &self.0.borrow().data; 378 | let mut ans = Matrix::zeros(data.rows(), data.cols()); 379 | 380 | for i in 0..data.rows() { 381 | for j in 0..data.cols() { 382 | ans[[i, j]] = 1.0 / (1.0 + (-data[[i, j]]).exp()); 383 | } 384 | } 385 | 386 | ans 387 | }; 388 | 389 | if self.0.borrow().autograd { 390 | Tensor::new(result, Operation::Sigmoid, Some(vec![Rc::clone(&self.0)])) 391 | } else { 392 | Tensor::grad(result) 393 | } 394 | } 395 | 396 | pub fn tanh(&self) -> Tensor { 397 | let result = { 398 | let data = &self.0.borrow().data; 399 | let mut ans = Matrix::zeros(data.rows(), data.cols()); 400 | 401 | for i in 0..data.rows() { 402 | for j in 0..data.cols() { 403 | ans[[i, j]] = data[[i, j]].tanh(); 404 | } 405 | } 406 | 407 | ans 408 | }; 409 | 410 | if self.0.borrow().autograd { 411 | Tensor::new(result, Operation::Tanh, Some(vec![Rc::clone(&self.0)])) 412 | } else { 413 | Tensor::grad(result) 414 | } 415 | } 416 | 417 | pub fn relu(&self) -> Tensor { 418 | let result = { 419 | let data = &self.0.borrow().data; 420 | let mut ans = Matrix::zeros(data.rows(), data.cols()); 421 | 422 | for i in 0..data.rows() { 423 | for j in 0..data.cols() { 424 | ans[[i, j]] = if data[[i, j]] > 0.0 { 425 | data[[i, j]] 426 | } else { 427 | 0.0 428 | }; 429 | } 430 | } 431 | 432 | ans 433 | }; 434 | 435 | if self.0.borrow().autograd { 436 | Tensor::new(result, Operation::Relu, Some(vec![Rc::clone(&self.0)])) 437 | } else { 438 | Tensor::grad(result) 439 | } 440 | } 441 | 442 | pub fn index_select(&self, indices: Vec) -> Tensor { 443 | let result = { 444 | let data = &self.0.borrow().data; 445 | let mut ans = Matrix::zeros(indices.len(), 
446 |
447 |             for (i, ix) in indices.iter().enumerate() {
448 |                 for j in 0..data.cols() {
449 |                     ans[[i, j]] = data[[*ix, j]];
450 |                 }
451 |             }
452 |
453 |             ans
454 |         };
455 |
456 |         if self.0.borrow().autograd {
457 |             Tensor::new(
458 |                 result,
459 |                 Operation::IndexSelect(indices),
460 |                 Some(vec![Rc::clone(&self.0)]),
461 |             )
462 |         } else {
463 |             Tensor::grad(result)
464 |         }
465 |     }
466 |
467 |     /// `target_indices` is expected to be a column vector holding one class index per row of the current tensor
468 |     pub fn cross_entropy(&self, target_indices: &Tensor) -> Tensor {
469 |         let (m, target_dist, loss) = {
470 |             let data = &self.0.borrow().data;
471 |             let target_indices = &target_indices.0.borrow().data;
472 |
473 |             let mut rs = vec![0.0; data.rows()];
474 |
475 |             let mut m = Matrix::zeros(data.rows(), data.cols());
476 |
477 |             for i in 0..data.rows() {
478 |                 for j in 0..data.cols() {
479 |                     m[[i, j]] = data[[i, j]].exp();
480 |                     rs[i] += m[[i, j]];
481 |                 }
482 |             }
483 |
484 |             for i in 0..data.rows() {
485 |                 for j in 0..data.cols() {
486 |                     m[[i, j]] /= rs[i];
487 |                 }
488 |             }
489 |
490 |             let mut target_dist = Matrix::zeros(data.rows(), data.cols());
491 |
492 |             let mut loss = 0.0;
493 |             for i in 0..target_indices.rows() {
494 |                 let index = target_indices[[i, 0]] as usize;
495 |                 target_dist[[i, index]] = 1.0;
496 |
497 |                 let current_loss = m[[i, index]].ln(); // log of the softmax probability for the target class
498 |                 loss += -current_loss;
499 |             }
500 |
501 |             loss /= data.rows() as f64;
502 |
503 |             (m, target_dist, loss)
504 |         };
505 |
506 |         if self.0.borrow().autograd {
507 |             Tensor::new(
508 |                 Matrix::new(1, 1, vec![loss]),
509 |                 Operation::CrossEntropy(m, target_dist),
510 |                 Some(vec![Rc::clone(&self.0)]),
511 |             )
512 |         } else {
513 |             Tensor::grad(Matrix::new(1, 1, vec![loss]))
514 |         }
515 |     }
516 | }
517 |
518 | impl Add for &Tensor {
519 |     type Output = Tensor;
520 |
521 |     fn add(self, other: Self) -> Self::Output {
522 |         let data = &self.0.borrow().data + &other.0.borrow().data;
523 |
524 |         if self.0.borrow().autograd {
525 |             Tensor::new(
526 |                 data,
527 |                 Operation::Add,
528 |                 Some(vec![Rc::clone(&self.0), Rc::clone(&other.0)]),
529 |             )
530 |         } else {
531 |             Tensor::grad(data)
532 |         }
533 |     }
534 | }
535 |
536 | impl Neg for &Tensor {
537 |     type Output = Tensor;
538 |
539 |     fn neg(self) -> Self::Output {
540 |         let data = -&self.0.borrow().data;
541 |         if self.0.borrow().autograd {
542 |             Tensor::new(data, Operation::Neg, Some(vec![Rc::clone(&self.0)]))
543 |         } else {
544 |             Tensor::grad(data)
545 |         }
546 |     }
547 | }
548 |
549 | impl Sub for &Tensor {
550 |     type Output = Tensor;
551 |
552 |     fn sub(self, other: Self) -> Self::Output {
553 |         let data = &self.0.borrow().data - &other.0.borrow().data;
554 |
555 |         if self.0.borrow().autograd {
556 |             Tensor::new(
557 |                 data,
558 |                 Operation::Sub,
559 |                 Some(vec![Rc::clone(&self.0), Rc::clone(&other.0)]),
560 |             )
561 |         } else {
562 |             Tensor::grad(data)
563 |         }
564 |     }
565 | }
566 |
567 | impl Mul for &Tensor {
568 |     type Output = Tensor;
569 |
570 |     fn mul(self, other: Self) -> Self::Output {
571 |         let data = self.0.borrow().data.elemul(&other.0.borrow().data);
572 |
573 |         if self.0.borrow().autograd {
574 |             Tensor::new(
575 |                 data,
576 |                 Operation::Mul,
577 |                 Some(vec![Rc::clone(&self.0), Rc::clone(&other.0)]),
578 |             )
579 |         } else {
580 |             Tensor::grad(data)
581 |         }
582 |     }
583 | }
584 |
585 | pub trait Sum {
586 |     type Output;
587 |     fn sum(self, dim: usize) -> Self::Output;
588 | }
589 |
590 | impl Sum for &Tensor {
591 |     type Output = Tensor;
592 |
593 |     fn sum(self, axis: usize) -> Self::Output {
594 |         if axis > 1 {
595 |             unimplemented!();
596 |         }
597 |
598 |         let ans = if axis == 0 {
599 |             let data = &self.0.borrow().data;
600 |             let mut summed_data = Matrix::zeros(1, data.cols());
601 |             for i in 0..data.cols() {
602 |                 for j in 0..data.rows() {
603 |                     summed_data[[0, i]] += data[[j, i]];
604 |                 }
605 |             }
606 |             summed_data
607 |         } else {
608 |             let data = &self.0.borrow().data;
609 |             let mut summed_data = Matrix::zeros(data.rows(), 1);
610 |             for i in 0..data.rows() {
611 |                 for j in 0..data.cols() {
612 |                     summed_data[[i, 0]] += data[[i, j]];
613 |                 }
614 |             }
615 |             summed_data
616 |         };
617 |
618 |         if self.0.borrow().autograd {
619 |             Tensor::new(ans, Operation::Sum(axis), Some(vec![Rc::clone(&self.0)]))
620 |         } else {
621 |             Tensor::grad(ans)
622 |         }
623 |     }
624 | }
625 |
626 | pub trait Expand {
627 |     type Output;
628 |     fn expand(self, dim: usize, copies: usize) -> Self::Output;
629 | }
630 |
631 | impl Expand for &Tensor {
632 |     type Output = Tensor;
633 |
634 |     fn expand(self, dim: usize, copies: usize) -> Self::Output {
635 |         if dim == 0 {
636 |             let new_data = {
637 |                 let data = &self.0.borrow().data;
638 |                 if data.rows() != 1 {
639 |                     unimplemented!()
640 |                 }
641 |
642 |                 let mut new_data = Matrix::zeros(copies, data.cols());
643 |                 for i in 0..copies {
644 |                     for j in 0..data.cols() {
645 |                         new_data[[i, j]] = data[[0, j]];
646 |                     }
647 |                 }
648 |
649 |                 new_data
650 |             };
651 |
652 |             if self.0.borrow().autograd {
653 |                 Tensor::new(
654 |                     new_data,
655 |                     Operation::Expand(dim),
656 |                     Some(vec![Rc::clone(&self.0)]),
657 |                 )
658 |             } else {
659 |                 Tensor::grad(new_data)
660 |             }
661 |         } else {
662 |             unimplemented!()
663 |         }
664 |     }
665 | }
666 |
667 | pub trait Transpose {
668 |     type Output;
669 |     fn transpose(self) -> Self::Output;
670 | }
671 |
672 | impl Transpose for &Tensor {
673 |     type Output = Tensor;
674 |
675 |     fn transpose(self) -> Self::Output {
676 |         let res = {
677 |             let data = &self.0.borrow().data;
678 |             data.transpose()
679 |         };
680 |
681 |         if self.0.borrow().autograd {
682 |             Tensor::new(res, Operation::Transpose, Some(vec![Rc::clone(&self.0)]))
683 |         } else {
684 |             Tensor::grad(res)
685 |         }
686 |     }
687 | }
688 |
689 | pub trait Dot {
690 |     type Output;
691 |     fn dot(self, other: Self) -> Self::Output;
692 | }
693 |
694 | impl Dot for &Tensor {
695 |     type Output = Tensor;
696 |
697 |     fn dot(self, other: &Tensor) -> Self::Output {
698 |         let result = {
699 |             let data = &self.0.borrow().data;
700 |             let other_data = &other.0.borrow().data;
701 |             data.mul(other_data)
702 |         };
703 |
704 |         if self.0.borrow().autograd {
705 |             Tensor::new(
706 |                 result,
707 |                 Operation::Dot,
708 |                 Some(vec![Rc::clone(&self.0), Rc::clone(&other.0)]),
709 |             )
710 |         } else {
711 |             Tensor::grad(result)
712 |         }
713 |     }
714 | }
715 |
--------------------------------------------------------------------------------
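A minimal usage sketch of the autograd Tensor API above (a hypothetical example, not a file from the repository; it assumes lib.rs declares `pub mod tensor;` so that `Tensor` and the `Dot` trait are reachable at `grokking_deep_learning_rs::tensor`, and it seeds the backward pass with a 1x1 gradient of ones):

use rulinalg::matrix::Matrix;

use grokking_deep_learning_rs::tensor::{Dot, Tensor};

fn main() {
    // Wrap plain rulinalg matrices in autograd-tracking tensors.
    let x = Tensor::new_const(Matrix::new(1, 2, vec![1.0, 2.0]));
    let w = Tensor::new_const(Matrix::new(2, 1, vec![0.5, -0.5]));

    // Forward pass: y = sigmoid(x . w); each op records its creators,
    // so the resulting graph is x, w -> Dot -> Sigmoid -> y.
    let y = x.dot(&w).sigmoid();

    // Backward pass: seed the output gradient, which propagates through
    // the Sigmoid and Dot backward rules down to x and w.
    y.backward(Tensor::grad(Matrix::new(1, 1, vec![1.0])));
}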