├── .editorconfig ├── .gitignore ├── .travis.yml ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── benches └── rulinalg_vs_native.rs ├── examples ├── chapter10.rs ├── chapter11.rs ├── chapter12.rs ├── chapter13.rs ├── chapter14.rs ├── chapter15.rs ├── chapter3.rs ├── chapter4.rs ├── chapter5.rs ├── chapter6.rs ├── chapter8.rs └── chapter9.rs └── src ├── activations.rs ├── layers.rs ├── lib.rs ├── losses.rs ├── optimizers.rs └── tensor.rs /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | indent_style = space 5 | indent_size = 4 6 | charset = utf-8 7 | trim_trailing_whitespace = true 8 | insert_final_newline = true 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | **/*.rs.bk 3 | Cargo.lock 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: rust 2 | 3 | rust: 4 | - stable 5 | - beta 6 | - nightly 7 | 8 | cache: cargo 9 | 10 | script: 11 | - cargo test 12 | 13 | after_success: 14 | - if [ "$TRAVIS_RUST_VERSION" == "nightly" ]; then 15 | cargo bench; 16 | fi 17 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "grokking-deep-learning-rs" 3 | version = "0.1.0" 4 | authors = ["Suyash "] 5 | edition = "2018" 6 | 7 | [dependencies] 8 | datasets = { git = "https://github.com/suyash/datasets" } 9 | rand = "0.6.4" 10 | rulinalg = "0.4.2" 11 | 12 | [dev-dependencies] 13 | indicatif = "0.11.0" 14 | # paillier = { version = "0.2.0", default-features = false, features = ["usegmp", "keygen"] } 15 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2018 Suyash 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Suyash 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Grokking Deep Learning Rust 2 | 3 | [![Build Status](https://travis-ci.com/suyash/grokking-deep-learning-rs.svg?branch=master)](https://travis-ci.com/suyash/grokking-deep-learning-rs) 4 | 5 | The exercises from the [@iamtrask](https://iamtrask.github.io) book [Grokking Deep Learning](https://manning.com/books/grokking-deep-learning), implemented in Rust. 6 | 7 | This crate isn't published, because ideally you'd do this on your own, but if you insist: 8 | 9 | ``` 10 | cargo add grokking_deep_learning_rs --git https://github.com/suyash/grokking-deep-learning-rs 11 | ``` 12 | 13 | This crate is structured as a library: the core library provides the common primitives used throughout, and the individual chapters are implemented as examples. To run the exercises for a particular chapter, for example chapter 12: 14 | 15 | ``` 16 | cargo run --example chapter12 17 | ``` 18 | 19 | Currently this uses [rulinalg](https://docs.rs/rulinalg) for matrix operations, which uses a Rust implementation of `dgemm` and provides roughly a 3x speedup over naive ijk multiplication (see the included benchmark). However, it still isn't as fast as numpy because it isn't multi-threaded. I'm currently working on an alternative of my own. 20 | 21 | The __datasets__ are extracted into a [separate library crate](https://github.com/suyash/datasets), which currently provides functions for loading 4 datasets and an iterator for batching and shuffling. More are planned. It can be added using 22 | 23 | ``` 24 | cargo add datasets --git https://github.com/suyash/datasets 25 | ``` 26 | 27 | As a result of the slower matmul, certain examples from chapter 8 onwards are smaller in size than the Python versions. 28 | 29 | The Chapter 13 core components were extracted into the core library, so they could be used in later chapters.
30 | 31 | So, something like this: 32 | 33 | ```rust 34 | use rulinalg::matrix::Matrix; 35 | 36 | use grokking_deep_learning_rs::activations::{Sigmoid, Tanh}; 37 | use grokking_deep_learning_rs::layers::{Layer, Linear, Sequential}; 38 | use grokking_deep_learning_rs::losses::{Loss, MSELoss}; 39 | use grokking_deep_learning_rs::optimizers::{Optimizer, SGDOptimizer}; 40 | use grokking_deep_learning_rs::tensor::Tensor; 41 | 42 | let data = Tensor::new_const(Matrix::new( 43 | 4, 44 | 2, 45 | vec![0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0], 46 | )); 47 | 48 | let target = Tensor::new_const(Matrix::new(4, 1, vec![0.0, 1.0, 0.0, 1.0])); 49 | 50 | let model = Sequential::new(vec![ 51 | Box::new(Linear::new(2, 3, false)), 52 | Box::new(Tanh), 53 | Box::new(Linear::new(3, 1, false)), 54 | Box::new(Sigmoid), 55 | ]); 56 | 57 | let criterion = MSELoss; 58 | let optim = SGDOptimizer::new(model.parameters(), 0.5); 59 | 60 | for _ in 0..10 { 61 | let pred = model.forward(&[&data]); 62 | 63 | // compare 64 | let loss = criterion.forward(&pred[0], &target); 65 | 66 | println!("Loss: {:?}", loss.0.borrow().data.data()); 67 | 68 | // calculate difference 69 | loss.backward(Tensor::grad(Matrix::ones(1, 1))); 70 | 71 | // learn 72 | optim.step(true); 73 | } 74 | ``` 75 | 76 | In Chapter 14, the RNN and LSTM examples suffer from vanishing gradients and the loss keeps going to NaN. There seems to be a logic error somewhere, where something is not doing what I think it does; I am still investigating. I tried reproducing the problem in the chapter 13 final exercise and also implemented [min-char-rnn.py](https://gist.github.com/karpathy/d4dee566867f8291f086) in [Rust](https://gist.github.com/suyash/07b2ae4822f717d3edadb09a0f79ec57), but no luck so far. 77 | 78 | For Chapter 15, the encrypted federated learning exercise is not implemented. [There is a crate](https://crates.io/crates/paillier) for Paillier homomorphic encryption, but its current implementation only works with integers and BigInts, not floating-point numbers. I will look into how to get it to work. 79 | 80 | # License 81 | 82 | This project is licensed under either of 83 | 84 | * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or 85 | http://www.apache.org/licenses/LICENSE-2.0) 86 | * MIT license ([LICENSE-MIT](LICENSE-MIT) or 87 | http://opensource.org/licenses/MIT) 88 | 89 | at your option. 90 | 91 | ### Contribution 92 | 93 | Unless you explicitly state otherwise, any contribution intentionally submitted 94 | for inclusion in this work by you, as defined in the Apache-2.0 license, shall be 95 | dual licensed as above, without any additional terms or conditions.
96 | -------------------------------------------------------------------------------- /benches/rulinalg_vs_native.rs: -------------------------------------------------------------------------------- 1 | #![feature(test)] 2 | 3 | extern crate test; 4 | 5 | use test::{black_box, Bencher}; 6 | 7 | use std::ops::Mul; 8 | 9 | use rulinalg::matrix::Matrix; 10 | 11 | use grokking_deep_learning_rs::matrix_matrix_dot; 12 | 13 | #[bench] 14 | fn bench_normal(b: &mut Bencher) { 15 | b.iter(|| { 16 | let m1 = vec![vec![1.0, 2.0], vec![3.0, 4.0], vec![5.0, 6.0]]; 17 | let m2 = vec![vec![1.0, 2.0, 3.0], vec![4.0, 5.0, 6.0]]; 18 | black_box(matrix_matrix_dot(&m1, &m2)); 19 | }); 20 | } 21 | 22 | #[bench] 23 | fn bench_rulinalg(b: &mut Bencher) { 24 | b.iter(|| { 25 | let m1 = Matrix::new(3, 2, vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]); 26 | let m2 = Matrix::new(2, 3, vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]); 27 | black_box(m1.mul(m2)); 28 | }); 29 | } 30 | -------------------------------------------------------------------------------- /examples/chapter10.rs: -------------------------------------------------------------------------------- 1 | //! Chapter10 - Intro to Convolutional Neural Networks - Learning Edges and Corners.ipynb 2 | //! 3 | //! https://github.com/iamtrask/Grokking-Deep-Learning/blob/master/Chapter10%20-%20Intro%20to%20Convolutional%20Neural%20Networks%20-%20Learning%20Edges%20and%20Corners.ipynb 4 | 5 | use std::error::Error; 6 | use std::ops::Mul; 7 | 8 | use datasets::image::mnist; 9 | use datasets::Dataset; 10 | use indicatif::{ProgressBar, ProgressStyle}; 11 | use rand::distributions::Standard; 12 | use rulinalg::matrix::{BaseMatrix, Matrix, MatrixSlice}; 13 | 14 | use grokking_deep_learning_rs::{ 15 | argmax, generate_random_vector, sample_bernoulli_trials, softmax_mut, tanh_derivative, tanh_mut, 16 | }; 17 | 18 | fn main() { 19 | println!("\nUpgrading our MNIST Network\n"); 20 | mnist_tanh(0.5).unwrap(); 21 | } 22 | 23 | #[allow(unused_doc_comments)] 24 | fn mnist_tanh(keep_probability: f64) -> Result<(), Box> { 25 | let (train_data, test_data) = mnist()?; 26 | 27 | let train_dataset_size = 1024; 28 | let test_dataset_size = 1024; 29 | 30 | let batch_size = 64; // 128 in the numpy version 31 | 32 | let (kernel_rows, kernel_cols) = (3, 3); 33 | let num_kernels = 4; // 16 in the numpy version 34 | 35 | let (train_images, train_labels) = process_mnist_filtered_dataset( 36 | train_data, 37 | train_dataset_size, 38 | batch_size, 39 | kernel_rows, 40 | kernel_cols, 41 | ); 42 | 43 | let (test_images, test_labels) = process_mnist_filtered_dataset( 44 | test_data, 45 | test_dataset_size, 46 | batch_size, 47 | kernel_rows, 48 | kernel_cols, 49 | ); 50 | 51 | let mut kernels = Matrix::new( 52 | kernel_rows * kernel_cols, 53 | num_kernels, 54 | generate_random_vector( 55 | kernel_rows * kernel_cols * num_kernels, 56 | 0.02, 57 | -0.01, 58 | &Standard, 59 | ), 60 | ); 61 | 62 | let mut weights_1_2 = Matrix::new( 63 | (28 - kernel_rows) * (28 - kernel_cols) * num_kernels, 64 | 10, 65 | generate_random_vector( 66 | (28 - kernel_rows) * (28 - kernel_cols) * num_kernels * 10, 67 | 0.2, 68 | -0.1, 69 | &Standard, 70 | ), 71 | ); 72 | 73 | let alpha = 2.0; 74 | 75 | let iterations = 100; 76 | let progress = ProgressBar::new(iterations as u64); 77 | progress.set_style( 78 | ProgressStyle::default_bar() 79 | .template("{msg} {bar:40.cyan/blue} {pos:>7}/{len:7} [{elapsed_precise}]"), 80 | ); 81 | 82 | for it in 0..iterations { 83 | let mut accuracy = 0.0; 84 | 85 | for (images, labels) in 
train_images.iter().zip(train_labels.iter()) { 86 | let labels = 87 | unsafe { MatrixSlice::from_raw_parts(labels.as_ptr(), batch_size, 10, 10) }; 88 | 89 | let expanded_input_batch_size = images.len() * images[0].len(); 90 | 91 | let expanded_input: Vec = images 92 | .iter() 93 | .flat_map(|kernel_inputs| kernel_inputs.iter()) 94 | .flat_map(|kernel_inputs| kernel_inputs.iter().cloned()) 95 | .collect(); 96 | 97 | // [batch_size * 625, 9] 98 | let expanded_input = Matrix::new( 99 | expanded_input_batch_size, 100 | kernel_rows * kernel_cols, 101 | expanded_input, 102 | ); 103 | 104 | // [batch_size * 625, 16] 105 | let kernel_output = (&expanded_input).mul(&kernels); 106 | 107 | // [batch_size, 625 * 16] 108 | // NOTE: this is the flatten step 109 | let mut hidden_layer = Matrix::new( 110 | batch_size, 111 | (28 - kernel_rows) * (28 - kernel_cols) * num_kernels, 112 | kernel_output.into_vec(), 113 | ); 114 | 115 | /// Activation 116 | tanh_mut(&mut hidden_layer); 117 | 118 | /// Dropout 119 | let dropout_mask: Vec = sample_bernoulli_trials( 120 | keep_probability, 121 | batch_size * (28 - kernel_rows) * (28 - kernel_cols) * num_kernels, 122 | ) 123 | .into_iter() 124 | .map(|v| v * (1.0 / keep_probability)) 125 | .collect(); 126 | 127 | let dropout_mask = Matrix::new( 128 | batch_size, 129 | (28 - kernel_rows) * (28 - kernel_cols) * num_kernels, 130 | dropout_mask, 131 | ); 132 | 133 | let hidden_layer = hidden_layer.elemul(&dropout_mask); 134 | 135 | /// Final Outputs 136 | // [batch_size, 10] 137 | let mut predictions = (&hidden_layer).mul(&weights_1_2); 138 | softmax_mut(&mut predictions); 139 | 140 | /// NOTE: no error calculation still 141 | 142 | /// Accuracy 143 | for (r1, r2) in predictions.row_iter().zip(labels.row_iter()) { 144 | accuracy += if argmax(r1.raw_slice()) == argmax(r2.raw_slice()) { 145 | 1.0 146 | } else { 147 | 0.0 148 | } 149 | } 150 | 151 | /// delta_2_1 152 | let mut delta_2_1 = Matrix::new(batch_size, 10, vec![0.0; batch_size * 10]); 153 | for i in 0..batch_size { 154 | for j in 0..10 { 155 | delta_2_1[[i, j]] = 156 | (predictions[[i, j]] - labels[[i, j]]) / ((batch_size * batch_size) as f64); 157 | } 158 | } 159 | 160 | /// delta_1_0 161 | let mut delta_1_0 = (&delta_2_1) 162 | .mul(weights_1_2.transpose()) 163 | .elemul(&tanh_derivative(&hidden_layer)); 164 | 165 | for i in 0..batch_size { 166 | for j in 0..((28 - kernel_rows) * (28 - kernel_cols) * num_kernels) { 167 | delta_1_0[[i, j]] *= dropout_mask[[i, j]]; 168 | } 169 | } 170 | 171 | /// update weights_1_2 172 | let weight_delta_1_2 = hidden_layer.transpose().mul(delta_2_1); 173 | for i in 0..((28 - kernel_rows) * (28 - kernel_cols) * num_kernels) { 174 | for j in 0..10 { 175 | weights_1_2[[i, j]] -= alpha * weight_delta_1_2[[i, j]]; 176 | } 177 | } 178 | 179 | /// update weights_0_1 180 | // reorient delta_1_0 181 | let delta_1_0 = Matrix::new( 182 | batch_size * (28 - kernel_rows) * (28 - kernel_cols), 183 | num_kernels, 184 | delta_1_0.into_vec(), 185 | ); 186 | 187 | let weight_delta_0_1 = expanded_input.transpose().mul(delta_1_0); 188 | for i in 0..(kernel_rows * kernel_cols) { 189 | for j in 0..num_kernels { 190 | kernels[[i, j]] -= alpha * weight_delta_0_1[[i, j]]; 191 | } 192 | } 193 | } 194 | 195 | let mut test_accuracy = 0.0; 196 | 197 | if (it + 1) % 10 == 0 { 198 | for (images, labels) in test_images.iter().zip(test_labels.iter()) { 199 | let labels = 200 | unsafe { MatrixSlice::from_raw_parts(labels.as_ptr(), batch_size, 10, 10) }; 201 | 202 | let expanded_input_batch_size = images.len() * 
images[0].len(); 203 | 204 | let expanded_input: Vec = images 205 | .iter() 206 | .flat_map(|kernel_inputs| kernel_inputs.iter()) 207 | .flat_map(|kernel_inputs| kernel_inputs.iter().cloned()) 208 | .collect(); 209 | 210 | // [batch_size * 625, 9] 211 | let expanded_input = Matrix::new( 212 | expanded_input_batch_size, 213 | kernel_rows * kernel_cols, 214 | expanded_input, 215 | ); 216 | 217 | // [batch_size * 625, 16] 218 | let kernel_output = expanded_input.mul(&kernels); 219 | 220 | // [batch_size, 625 * 16] 221 | // NOTE: this is the flatten step 222 | let mut hidden_layer = Matrix::new( 223 | batch_size, 224 | (28 - kernel_rows) * (28 - kernel_cols) * num_kernels, 225 | kernel_output.into_vec(), 226 | ); 227 | 228 | /// Activation 229 | tanh_mut(&mut hidden_layer); 230 | 231 | /// Dropout 232 | let dropout_mask: Vec = sample_bernoulli_trials( 233 | keep_probability, 234 | batch_size * (28 - kernel_rows) * (28 - kernel_cols) * num_kernels, 235 | ) 236 | .into_iter() 237 | .map(|v| v * (1.0 / keep_probability)) 238 | .collect(); 239 | 240 | let dropout_mask = Matrix::new( 241 | batch_size, 242 | (28 - kernel_rows) * (28 - kernel_cols) * num_kernels, 243 | dropout_mask, 244 | ); 245 | 246 | let hidden_layer = hidden_layer.elemul(&dropout_mask); 247 | 248 | /// Final Outputs 249 | // [batch_size, 10] 250 | let mut predictions = hidden_layer.mul(&weights_1_2); 251 | softmax_mut(&mut predictions); 252 | 253 | /// NOTE: no error calculation still 254 | 255 | /// Accuracy 256 | for (r1, r2) in predictions.row_iter().zip(labels.row_iter()) { 257 | test_accuracy += if argmax(r1.raw_slice()) == argmax(r2.raw_slice()) { 258 | 1.0 259 | } else { 260 | 0.0 261 | } 262 | } 263 | } 264 | 265 | progress.println(format!( 266 | "Iteration: {}, Train Accuracy: {}, Test Accuracy: {}", 267 | it + 1, 268 | accuracy / (train_dataset_size as f64), 269 | test_accuracy / (test_dataset_size as f64), 270 | )); 271 | } 272 | 273 | progress.inc(1); 274 | progress.set_message(&format!( 275 | "Train Accuracy: {}", 276 | accuracy / (train_dataset_size as f64), 277 | )); 278 | } 279 | 280 | Ok(()) 281 | } 282 | 283 | #[allow(clippy::type_complexity)] 284 | fn process_mnist_filtered_dataset( 285 | dataset: impl Dataset, u8)>, 286 | dataset_size: usize, 287 | batch_size: usize, 288 | kernel_rows: usize, 289 | kernel_cols: usize, 290 | ) -> (Vec>>>, Vec>) { 291 | let (images, labels): (Vec>, Vec) = dataset.take(dataset_size).unzip(); 292 | 293 | // extract kernel sized image sections from images 294 | // [_, batch, kernels, kernel_image] 295 | let images = images 296 | .into_iter() 297 | .map(|img| { 298 | // convert each image into a vectors of kernel inputs of size 3x3 299 | 300 | let mut kernel_inputs = Vec::with_capacity((28 - kernel_rows) * (28 - kernel_cols)); 301 | 302 | for i in 0..(28 - kernel_rows) { 303 | for j in 0..(28 - kernel_cols) { 304 | let mut kernel_input = vec![0.0; kernel_rows * kernel_cols]; 305 | 306 | for k in 0..kernel_rows { 307 | for l in 0..kernel_cols { 308 | kernel_input[k * kernel_cols + l] = 309 | f64::from(img[(i + k) * 28 + (j + l)]); 310 | } 311 | } 312 | 313 | kernel_inputs.push(kernel_input); 314 | } 315 | } 316 | 317 | kernel_inputs 318 | }) 319 | .batch(batch_size, false) 320 | .collect(); 321 | 322 | // [_, batch, label] 323 | let labels = labels 324 | .into_iter() 325 | .map(|l| { 326 | let mut v = vec![0.0; 10]; 327 | v[l as usize] = 1.0; 328 | v 329 | }) 330 | .batch(batch_size, false) 331 | // flatten each batch so it can be converted to MatrixSlice easily 332 | .map(|b| 
b.into_iter().flat_map(|v| v.into_iter()).collect()) 333 | .collect(); 334 | 335 | (images, labels) 336 | } 337 | -------------------------------------------------------------------------------- /examples/chapter11.rs: -------------------------------------------------------------------------------- 1 | //! Chapter11 - Intro to Word Embeddings - Neural Networks that Understand Language.ipynb 2 | //! 3 | //! https://github.com/iamtrask/Grokking-Deep-Learning/blob/master/Chapter11%20-%20Intro%20to%20Word%20Embeddings%20-%20Neural%20Networks%20that%20Understand%20Language.ipynb 4 | 5 | use std::cmp::{max, min, Ordering}; 6 | use std::collections::{BTreeMap, BTreeSet}; 7 | use std::error::Error; 8 | use std::iter::FromIterator; 9 | use std::ops::Mul; 10 | 11 | use datasets::text::imdb_reviews; 12 | use datasets::Dataset; 13 | use indicatif::{ProgressBar, ProgressStyle}; 14 | use rand::distributions::Standard; 15 | use rulinalg::matrix::{BaseMatrix, Matrix}; 16 | 17 | use grokking_deep_learning_rs::{generate_random_vector, sigmoid_mut}; 18 | 19 | fn main() -> Result<(), Box> { 20 | let (train_dataset, test_dataset) = imdb_reviews()?; 21 | let train_dataset_size = 2000; 22 | let test_dataset_size = 2000; 23 | 24 | let (train_reviews, train_labels): (Vec<_>, Vec<_>) = train_dataset 25 | .shuffle(25000, 0) 26 | .map(|(s, l): (String, u8)| (s.to_lowercase(), l)) 27 | // currently only considering alphabets and nothing else. 28 | .map(|(s, l)| { 29 | ( 30 | s.chars() 31 | .map(|c| if c >= 'a' && c <= 'z' { c } else { ' ' }) 32 | .collect(), 33 | l, 34 | ) 35 | }) 36 | .take(train_dataset_size) 37 | .unzip(); 38 | 39 | let (test_reviews, test_labels): (Vec, Vec<_>) = test_dataset 40 | .shuffle(25000, 0) 41 | .map(|(s, l)| (s.to_lowercase(), l)) 42 | .take(test_dataset_size) 43 | .unzip(); 44 | 45 | // can't immutably borrow here 46 | let words = train_reviews 47 | .iter() 48 | .flat_map(|s: &String| s.split_whitespace().filter(|w| !w.is_empty())); 49 | 50 | let words = BTreeSet::from_iter(words); 51 | 52 | let len = words.len(); 53 | // 0 => UNK, 1 => PAD 54 | let word_index = BTreeMap::from_iter(words.into_iter().zip(2..(len + 2))); 55 | println!("Found {} words", word_index.len()); 56 | 57 | let train_reviews = encode_sentences(&train_reviews, &word_index); 58 | let train_labels: Vec<_> = encode_labels(train_labels); 59 | 60 | let test_reviews = encode_sentences(&test_reviews, &word_index); 61 | let test_labels: Vec<_> = encode_labels(test_labels); 62 | 63 | let embeddings = net_with_embedding_layer( 64 | (&train_reviews, &train_labels), 65 | (&test_reviews, &test_labels), 66 | len + 2, 67 | ); 68 | 69 | show_similar_embeddings("beautiful", &word_index, &embeddings); 70 | show_similar_embeddings("terrible", &word_index, &embeddings); 71 | 72 | let embeddings = filling_in_the_blank(&train_reviews, &word_index); 73 | 74 | show_similar_embeddings("beautiful", &word_index, &embeddings); 75 | show_similar_embeddings("terrible", &word_index, &embeddings); 76 | 77 | analogies(["terrible", "good"], "bad", &word_index, &embeddings); 78 | analogies(["elizabeth", "he"], "she", &word_index, &embeddings); 79 | 80 | Ok(()) 81 | } 82 | 83 | fn encode_sentences(v: &[String], word_index: &BTreeMap<&str, usize>) -> Vec> { 84 | v.iter() 85 | .map(|s| { 86 | let mut encoding = Vec::new();; 87 | 88 | for word in s.split_whitespace() { 89 | if word_index.contains_key(word) { 90 | encoding.push(word_index[word]); 91 | } else { 92 | encoding.push(0); 93 | } 94 | } 95 | 96 | encoding 97 | }) 98 | .collect() 99 | } 100 | 101 
| fn encode_labels(labels: Vec) -> Vec { 102 | labels 103 | .into_iter() 104 | .map(|l| if l > 5 { 1.0 } else { 0.0 }) 105 | .collect() 106 | } 107 | 108 | #[allow(clippy::float_cmp)] 109 | fn net_with_embedding_layer( 110 | (train_reviews, train_labels): (&[Vec], &[f64]), 111 | (test_reviews, test_labels): (&[Vec], &[f64]), 112 | vocab_size: usize, 113 | ) -> Matrix { 114 | let hidden_size = 100; 115 | 116 | let mut embeddings = Matrix::new( 117 | vocab_size, 118 | hidden_size, 119 | generate_random_vector(vocab_size * hidden_size, 0.2, -0.1, &Standard), 120 | ); 121 | 122 | let mut weights_1_2 = Matrix::new( 123 | hidden_size, 124 | 1, 125 | generate_random_vector(hidden_size, 0.2, -0.1, &Standard), 126 | ); 127 | 128 | let alpha = 0.01; 129 | 130 | let iterations = 15; 131 | 132 | for _ in 0..iterations { 133 | let mut train_accuracy = 0.0; 134 | let mut total = 0.0; 135 | 136 | let progress = ProgressBar::new(train_reviews.len() as u64); 137 | progress.set_style( 138 | ProgressStyle::default_bar() 139 | .template("{msg} {bar:40.cyan/blue} {pos:>7}/{len:7} [{elapsed_precise}]"), 140 | ); 141 | 142 | for (review, label) in train_reviews.iter().zip(train_labels.iter()) { 143 | // take embeddings 144 | let mut hidden_layer = Matrix::new(1, hidden_size, vec![0.0; hidden_size]); 145 | for ix in review.iter() { 146 | for j in 0..hidden_size { 147 | hidden_layer[[0, j]] += embeddings[[*ix, j]]; 148 | } 149 | } 150 | sigmoid_mut(&mut hidden_layer); 151 | 152 | let mut prediction = (&hidden_layer).mul(&weights_1_2); 153 | sigmoid_mut(&mut prediction); 154 | 155 | let delta_2_1 = Matrix::new(1, 1, vec![prediction[[0, 0]] - label]); 156 | let delta_1_0 = (&delta_2_1).mul(weights_1_2.transpose()); 157 | 158 | if prediction[[0, 0]].round() == *label { 159 | train_accuracy += 1.0; 160 | } 161 | 162 | total += 1.0; 163 | 164 | let weight_deltas_1_2 = hidden_layer.transpose().mul(delta_2_1); 165 | 166 | for i in 0..hidden_size { 167 | weights_1_2[[i, 0]] -= alpha * weight_deltas_1_2[[i, 0]]; 168 | } 169 | 170 | for ix in review.iter() { 171 | for j in 0..hidden_size { 172 | embeddings[[*ix, j]] -= alpha * delta_1_0[[0, j]]; 173 | } 174 | } 175 | 176 | progress.inc(1); 177 | progress.set_message(&format!("Train Accuracy: {}", train_accuracy / total)); 178 | } 179 | 180 | progress.finish(); 181 | } 182 | 183 | println!("\nEvaluating on Test Dataset\n"); 184 | 185 | let progress = ProgressBar::new(test_reviews.len() as u64); 186 | progress.set_style( 187 | ProgressStyle::default_bar() 188 | .template("{msg} {bar:40.cyan/blue} {pos:>7}/{len:7} [{elapsed_precise}]"), 189 | ); 190 | 191 | let mut test_accuracy = 0.0; 192 | let mut total = 0.0; 193 | 194 | for (review, label) in test_reviews.iter().zip(test_labels.iter()) { 195 | // take embeddings 196 | let mut hidden_layer = Matrix::new(1, hidden_size, vec![0.0; hidden_size]); 197 | for ix in review.iter() { 198 | for j in 0..hidden_size { 199 | hidden_layer[[0, j]] += embeddings[[*ix, j]]; 200 | } 201 | } 202 | sigmoid_mut(&mut hidden_layer); 203 | 204 | let mut prediction = (&hidden_layer).mul(&weights_1_2); 205 | sigmoid_mut(&mut prediction); 206 | 207 | if prediction[[0, 0]].round() == *label { 208 | test_accuracy += 1.0; 209 | } 210 | 211 | total += 1.0; 212 | 213 | progress.inc(1); 214 | progress.set_message(&format!("Test Accuracy: {}", test_accuracy / total)); 215 | } 216 | 217 | progress.finish(); 218 | 219 | embeddings 220 | } 221 | 222 | fn show_similar_embeddings( 223 | word: &str, 224 | word_index: &BTreeMap<&str, usize>, 225 | embeddings: 
&Matrix, 226 | ) { 227 | if !word_index.contains_key(word) { 228 | println!("index does not have {}", word); 229 | } else { 230 | let ix = word_index[word]; 231 | let word_embeddings = embeddings.row(ix); 232 | 233 | let sims = get_similar_embeddings(word_embeddings.raw_slice(), word_index, embeddings); 234 | 235 | println!("\nWords Similar to {}:\n", word); 236 | for i in sims.iter().take(10) { 237 | println!("{}: {}", i.0, i.1); 238 | } 239 | } 240 | } 241 | 242 | fn get_similar_embeddings<'a>( 243 | row: &[f64], 244 | word_index: &'a BTreeMap<&str, usize>, 245 | embeddings: &'a Matrix, 246 | ) -> Vec<(&'a str, f64)> { 247 | let mut sims = Vec::with_capacity(word_index.len()); 248 | 249 | for (word, ix) in word_index.iter() { 250 | let mut distance = 0.0; 251 | 252 | for (a, b) in row.iter().zip(embeddings.row(*ix).iter()) { 253 | distance += (a - b).powi(2); 254 | } 255 | 256 | sims.push((word.to_owned(), distance.sqrt())); 257 | } 258 | 259 | sims.sort_by(|a: &(&str, f64), b: &(&str, f64)| { 260 | if a.1 < b.1 { 261 | Ordering::Less 262 | } else if a.1 > b.1 { 263 | Ordering::Greater 264 | } else { 265 | Ordering::Equal 266 | } 267 | }); 268 | 269 | sims 270 | } 271 | 272 | fn filling_in_the_blank( 273 | train_reviews: &[Vec], 274 | word_index: &BTreeMap<&str, usize>, 275 | ) -> Matrix { 276 | let concatenated: Vec = train_reviews.iter().flat_map(|v| v).cloned().collect(); 277 | 278 | // NOTE: inputs are already shuffled 279 | 280 | let hidden_size = 50; 281 | let (negative_samples, window_size) = (5, 2); 282 | let alpha = 0.05; 283 | 284 | let iterations = 2; 285 | 286 | let mut weights_0_1 = Matrix::new( 287 | word_index.len() + 2, 288 | hidden_size, 289 | generate_random_vector((word_index.len() + 2) * hidden_size, 0.2, -0.1, &Standard), 290 | ); 291 | 292 | let mut weights_1_2: Matrix = Matrix::zeros(word_index.len() + 2, hidden_size); 293 | 294 | let mut outputs = Matrix::new(1, negative_samples + 1, vec![0.0; negative_samples + 1]); 295 | outputs[[0, 0]] = 1.0; 296 | 297 | for _ in 0..iterations { 298 | let progress = ProgressBar::new(train_reviews.len() as u64); 299 | progress.set_style( 300 | ProgressStyle::default_bar() 301 | .template("{bar:40.cyan/blue} {pos:>7}/{len:7} [{elapsed_precise}]"), 302 | ); 303 | 304 | for review in train_reviews.iter() { 305 | for target_ix in 0..review.len() { 306 | let mut target_samples = vec![review[target_ix]]; 307 | target_samples.append( 308 | &mut generate_random_vector(negative_samples, 1.0, 0.0, &Standard) 309 | .into_iter() 310 | .map(|x| (x * (concatenated.len() as f64)) as usize) 311 | .map(|ix| concatenated[ix]) 312 | .collect(), 313 | ); 314 | 315 | let left_window_start = 316 | max(0, (target_ix as isize) - (window_size as isize)) as usize; 317 | let right_window_end = min(target_ix + window_size, review.len()); 318 | 319 | let left_window: Vec = (left_window_start..target_ix) 320 | .map(|ix| review[ix]) 321 | .collect(); 322 | let right_window: Vec = ((target_ix + 1)..right_window_end) 323 | .map(|ix| review[ix]) 324 | .collect(); 325 | 326 | let total_window_size = left_window.len() + right_window.len(); 327 | 328 | let mut hidden_layer: Matrix = Matrix::zeros(1, hidden_size); 329 | 330 | for ix in left_window.iter().chain(right_window.iter()) { 331 | for (i, x) in weights_0_1.row(*ix).iter().enumerate() { 332 | hidden_layer[[0, i]] += x; 333 | } 334 | } 335 | 336 | for i in 0..total_window_size { 337 | hidden_layer[[0, i]] /= total_window_size as f64; 338 | } 339 | 340 | let mut predictions = 341 | 
(&hidden_layer).mul(select_rows(&weights_1_2, &target_samples).transpose()); 342 | sigmoid_mut(&mut predictions); 343 | 344 | // [1, target_size] 345 | let layer_2_delta = predictions - (&outputs); 346 | 347 | // [1, hidden_size] 348 | let layer_1_delta = 349 | (&layer_2_delta).mul(select_rows(&weights_1_2, &target_samples)); 350 | 351 | // [target_size, hidden_size] 352 | // NOTE: we have initialized weights_1_2 in reverse order of traditional init 353 | // normally we'd do hidden_layer.transpose().mul(layer_2_delta) 354 | let weight_delta_1_2 = layer_2_delta.transpose().mul(hidden_layer); 355 | 356 | for ix in target_samples.into_iter() { 357 | for v in 0..hidden_size { 358 | weights_1_2[[ix, v]] -= alpha * weight_delta_1_2[[0, v]]; 359 | } 360 | } 361 | 362 | for ix in left_window.into_iter().chain(right_window.into_iter()) { 363 | for v in 0..hidden_size { 364 | weights_0_1[[ix, v]] -= alpha * layer_1_delta[[0, v]]; 365 | } 366 | } 367 | } 368 | 369 | progress.inc(1); 370 | } 371 | 372 | progress.finish(); 373 | } 374 | 375 | weights_0_1 376 | } 377 | 378 | fn select_rows(m: &Matrix, rows: &[usize]) -> Matrix { 379 | Matrix::new( 380 | rows.len(), 381 | m.cols(), 382 | rows.iter().fold(Vec::new(), |mut acc, i| { 383 | acc.append(&mut Vec::from(m.row(*i).raw_slice())); 384 | acc 385 | }), 386 | ) 387 | } 388 | 389 | fn analogies( 390 | positive: [&str; 2], 391 | negative: &str, 392 | word_index: &BTreeMap<&str, usize>, 393 | embeddings: &Matrix, 394 | ) { 395 | if !word_index.contains_key(positive[0]) 396 | || !word_index.contains_key(positive[1]) 397 | || !word_index.contains_key(negative) 398 | { 399 | println!("did not find all words in index"); 400 | return; 401 | } 402 | 403 | let (pix1, pix2) = (word_index[positive[0]], word_index[positive[1]]); 404 | let nix = word_index[negative]; 405 | 406 | let mut target_row = vec![0.0; embeddings.cols()]; 407 | for i in 0..embeddings.cols() { 408 | target_row[i] += embeddings[[pix1, i]]; 409 | target_row[i] -= embeddings[[nix, i]]; 410 | target_row[i] += embeddings[[pix2, i]]; 411 | } 412 | 413 | let sims = get_similar_embeddings(&target_row, word_index, embeddings); 414 | 415 | println!("\n{} - {} + {}:\n", positive[0], negative, positive[1]); 416 | for i in sims.iter().take(10) { 417 | println!("{}: {}", i.0, i.1); 418 | } 419 | } 420 | -------------------------------------------------------------------------------- /examples/chapter12.rs: -------------------------------------------------------------------------------- 1 | //! Chapter 12 - Introduction to Recurrence - Predicting the Next Word 2 | //! https://github.com/iamtrask/Grokking-Deep-Learning/blob/master/Chapter12%20-%20Intro%20to%20Recurrence%20-%20Predicting%20the%20Next%20Word.ipynb 3 | //! 4 | //! This is *significantly* different from the python version. 5 | //! 6 | //! 1. The dataset is cleaned to remove all whitespaces including tabs, and as a result contains only 19 words. This results in a lower perplexity than 7 | //! the python version from the beginning. 8 | //! 9 | //! 2. The Forward Propagation, Back Propagation and Weight Update steps are implemented in a single function. 10 | //! 11 | //! 3. The gradients explode more rapidly, because of extremely low embeddings to match. Alleviated this by lowering the alpha from 0.001 to 0.0005 12 | //! and increasing embedding size from 10 to 100. Another measure would be to cap the gradients. 
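//!
//! For illustration, a minimal sketch of such gradient capping (not wired into the code
//! below; the `cap` threshold and `clip_gradient` helper are hypothetical) would clamp
//! every element of a delta matrix into `[-cap, cap]` before applying the weight update:
//!
//! ```ignore
//! use rulinalg::matrix::Matrix;
//!
//! fn clip_gradient(delta: &mut Matrix<f64>, cap: f64) {
//!     // clamp each element of the gradient/delta matrix into [-cap, cap]
//!     for v in delta.mut_data().iter_mut() {
//!         *v = v.max(-cap).min(cap);
//!     }
//! }
//! ```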
13 | 14 | use std::collections::{BTreeMap, BTreeSet}; 15 | use std::error::Error; 16 | use std::iter::FromIterator; 17 | use std::ops::Mul; 18 | 19 | use datasets::text::babi_en_single_supporting_fact_task; 20 | use indicatif::{ProgressBar, ProgressStyle}; 21 | use rand::distributions::Uniform; 22 | use rulinalg::matrix::{BaseMatrix, Matrix}; 23 | 24 | use grokking_deep_learning_rs::{argmax, generate_random_vector, softmax_mut}; 25 | 26 | fn main() -> Result<(), Box> { 27 | embeddings_forward_propagation(); 28 | 29 | let (train_data, _) = babi_en_single_supporting_fact_task()?; 30 | 31 | let train_data: Vec> = train_data 32 | .map(|v| vec![v.0, v.1, (v.2).0]) 33 | .flat_map(|v| v.into_iter()) 34 | .map(|s| { 35 | s.split_whitespace() 36 | .map(|w| { 37 | w.chars() 38 | .filter(|c| (*c >= 'a' && *c <= 'z') || (*c >= 'A' && *c <= 'Z')) 39 | .collect() 40 | }) 41 | .collect() 42 | }) 43 | .collect(); 44 | 45 | let words = BTreeSet::from_iter(train_data.iter().flat_map(|v| v.iter())); 46 | 47 | let word_count = words.len(); 48 | let word_index = BTreeMap::from_iter(words.into_iter().zip(0..word_count)); 49 | let inverted_word_index = 50 | BTreeMap::from_iter(word_index.clone().into_iter().map(|(k, v)| (v, k))); 51 | 52 | let (start_state, embeddings, recurrent_weights, state_to_prediction_weights) = 53 | training_with_arbitrary_length(&train_data, &word_index)?; 54 | 55 | let sentence = &train_data[0]; 56 | 57 | let mut current_state = start_state.clone(); 58 | 59 | for (i, word) in sentence.iter().take(sentence.len() - 1).enumerate() { 60 | let mut prediction = (¤t_state).mul(&state_to_prediction_weights); 61 | softmax_mut(&mut prediction); 62 | 63 | let pred_ix = argmax(prediction.row(0).raw_slice()); 64 | let predicted_word = inverted_word_index[&pred_ix]; 65 | 66 | println!( 67 | "Input: {}, Expected: {}, Predicted: {}", 68 | word, 69 | sentence[i + 1], 70 | predicted_word 71 | ); 72 | current_state = 73 | current_state.mul(&recurrent_weights) + embeddings.row(word_index[word]).into_matrix(); 74 | } 75 | 76 | Ok(()) 77 | } 78 | 79 | fn embeddings_forward_propagation() { 80 | let mut word_vectors = BTreeMap::new(); 81 | word_vectors.insert("yankees", Matrix::new(1, 3, vec![0.0; 3])); 82 | word_vectors.insert("bears", Matrix::new(1, 3, vec![0.0; 3])); 83 | word_vectors.insert("braves", Matrix::new(1, 3, vec![0.0; 3])); 84 | word_vectors.insert("red", Matrix::new(1, 3, vec![0.0; 3])); 85 | word_vectors.insert("socks", Matrix::new(1, 3, vec![0.0; 3])); 86 | word_vectors.insert("lose", Matrix::new(1, 3, vec![0.0; 3])); 87 | word_vectors.insert("defeat", Matrix::new(1, 3, vec![0.0; 3])); 88 | word_vectors.insert("beat", Matrix::new(1, 3, vec![0.0; 3])); 89 | word_vectors.insert("tie", Matrix::new(1, 3, vec![0.0; 3])); 90 | 91 | let sent_to_output_weights = 92 | Matrix::new(3, word_vectors.len(), vec![0.0; 3 * word_vectors.len()]); 93 | 94 | let weights: Matrix = Matrix::identity(3); 95 | 96 | let layer_0 = &word_vectors["red"]; 97 | let layer_1 = layer_0.mul(&weights) + &word_vectors["socks"]; 98 | let layer_2 = layer_1.mul(&weights) + &word_vectors["defeat"]; 99 | 100 | let mut prediction = layer_2.mul(&sent_to_output_weights); 101 | softmax_mut(&mut prediction); 102 | 103 | println!("{}", prediction); 104 | } 105 | 106 | #[allow(clippy::type_complexity)] 107 | fn training_with_arbitrary_length( 108 | train_data: &[Vec], 109 | word_index: &BTreeMap<&String, usize>, 110 | ) -> Result<(Matrix, Matrix, Matrix, Matrix), Box> { 111 | let word_count = word_index.len(); 112 | 113 | let embedding_size 
= 50; 114 | 115 | let distribution = Uniform::new(0.0, 1.0); 116 | 117 | let mut embeddings = Matrix::new( 118 | word_count, 119 | embedding_size, 120 | generate_random_vector(word_count * embedding_size, 0.1, -0.05, &distribution), 121 | ); 122 | 123 | let mut recurrent_weights = Matrix::identity(embedding_size); 124 | 125 | let mut state_to_prediction_weights = Matrix::new( 126 | embedding_size, 127 | word_count, 128 | generate_random_vector(embedding_size * word_count, 0.1, -0.05, &distribution), 129 | ); 130 | 131 | let word_target_embeddings = Matrix::identity(word_count); 132 | 133 | let mut start_state = Matrix::zeros(1, embedding_size); 134 | 135 | let alpha = 0.0004; 136 | 137 | for _ in 0..10 { 138 | let progress = ProgressBar::new(train_data.len() as u64); 139 | progress.set_style( 140 | ProgressStyle::default_bar() 141 | .template("{msg} {bar:40.cyan/blue} {pos:>7}/{len:7} [{elapsed_precise}]"), 142 | ); 143 | 144 | for sentence in train_data.iter() { 145 | // forward prop 146 | 147 | let mut current_state = start_state.clone(); 148 | let mut loss = 0.0; 149 | 150 | let mut cells = Vec::with_capacity(sentence.len()); 151 | cells.push((None, current_state.clone())); 152 | 153 | for word in sentence.iter().skip(1) { 154 | let mut prediction = (¤t_state).mul(&state_to_prediction_weights); 155 | softmax_mut(&mut prediction); 156 | 157 | loss += -(prediction[[0, word_index[word]]]).ln(); 158 | 159 | let mut next_state = (¤t_state).mul(&recurrent_weights); 160 | 161 | for i in 0..embedding_size { 162 | next_state[[0, i]] += embeddings[[word_index[word], i]]; 163 | } 164 | 165 | cells.push((Some(prediction), next_state.clone())); 166 | 167 | current_state = next_state; 168 | } 169 | 170 | loss /= (sentence.len() - 1) as f64; 171 | 172 | // backward prop 173 | 174 | let mut deltas: Vec<(Option>, Matrix)> = Vec::new(); 175 | 176 | let mut current_state_delta: Matrix = Matrix::identity(1); 177 | 178 | for (i, (prediction, _)) in cells.iter().enumerate().rev() { 179 | let prediction_delta = match prediction { 180 | Some(prediction) => Some( 181 | prediction 182 | - (word_target_embeddings 183 | .row(word_index[&sentence[i]]) 184 | .into_matrix()), 185 | ), 186 | None => None, 187 | }; 188 | 189 | let mut state_delta_from_predictions = match &prediction_delta { 190 | Some(prediction_delta) => { 191 | Some(prediction_delta.mul(state_to_prediction_weights.transpose())) 192 | } 193 | None => None, 194 | }; 195 | 196 | let mut state_delta_from_next_state = if i == cells.len() - 1 { 197 | None 198 | } else { 199 | Some(current_state_delta.mul(recurrent_weights.transpose())) 200 | }; 201 | 202 | current_state_delta = match ( 203 | state_delta_from_predictions.take(), 204 | state_delta_from_next_state.take(), 205 | ) { 206 | (Some(m1), Some(m2)) => m1 + m2, 207 | (Some(m1), None) => m1, 208 | (None, Some(m2)) => m2, 209 | _ => panic!("this is broken"), 210 | }; 211 | 212 | deltas.push((prediction_delta, current_state_delta.clone())); 213 | } 214 | 215 | // weights update 216 | 217 | // align deltas with cells 218 | deltas.reverse(); 219 | 220 | let (_, start_delta) = &deltas[0]; 221 | for i in 0..embedding_size { 222 | start_state[[0, i]] -= 223 | (alpha * start_delta[[0, i]]) / ((sentence.len() - 1) as f64); 224 | } 225 | 226 | for i in 1..cells.len() { 227 | let (_, state) = &cells[i]; 228 | let (prediction_delta, state_delta) = &deltas[i]; 229 | // let (_, prev_state) = &cells[i - 1]; 230 | 231 | let prediction_delta = prediction_delta.as_ref().unwrap(); 232 | 233 | let 
state_to_prediction_weights_delta = state.transpose().mul(prediction_delta); 234 | for j in 0..embedding_size { 235 | for k in 0..word_count { 236 | state_to_prediction_weights[[j, k]] -= (alpha 237 | * state_to_prediction_weights_delta[[j, k]]) 238 | / ((sentence.len() - 1) as f64); 239 | } 240 | } 241 | 242 | for j in 0..embedding_size { 243 | embeddings[[word_index[&sentence[i]], j]] -= 244 | (alpha * state_delta[[0, j]]) / ((sentence.len() - 1) as f64); 245 | } 246 | 247 | let recurrent_weights_delta = state.transpose().mul(state_delta); 248 | for j in 0..embedding_size { 249 | for k in 0..embedding_size { 250 | recurrent_weights[[j, k]] -= (alpha * recurrent_weights_delta[[j, k]]) 251 | / ((sentence.len() - 1) as f64); 252 | } 253 | } 254 | } 255 | 256 | progress.set_message(&format!("Perplexity: {}", loss.exp())); 257 | progress.inc(1); 258 | } 259 | 260 | progress.finish(); 261 | } 262 | 263 | Ok(( 264 | start_state, 265 | embeddings, 266 | recurrent_weights, 267 | state_to_prediction_weights, 268 | )) 269 | } 270 | -------------------------------------------------------------------------------- /examples/chapter13.rs: -------------------------------------------------------------------------------- 1 | //! Chapter13 - Intro to Automatic Differentiation - Let's Build A Deep Learning Framework 2 | //! 3 | //! https://github.com/iamtrask/Grokking-Deep-Learning/blob/master/Chapter13%20-%20Intro%20to%20Automatic%20Differentiation%20-%20Let's%20Build%20A%20Deep%20Learning%20Framework.ipynb 4 | 5 | use std::collections::{BTreeMap, BTreeSet}; 6 | use std::error::Error; 7 | use std::iter::FromIterator; 8 | use std::ops::Add; 9 | 10 | use datasets::text::babi_en_single_supporting_fact_task; 11 | use datasets::Dataset; 12 | use rand::distributions::Uniform; 13 | use rulinalg::matrix::{BaseMatrix, Matrix}; 14 | 15 | use grokking_deep_learning_rs::activations::{Sigmoid, Tanh}; 16 | use grokking_deep_learning_rs::layers::{Embedding, Layer, Linear, RNNCell, Sequential}; 17 | use grokking_deep_learning_rs::losses::{CrossEntropyLoss, Loss, MSELoss}; 18 | use grokking_deep_learning_rs::optimizers::{Optimizer, SGDOptimizer}; 19 | use grokking_deep_learning_rs::tensor::{Dot, Sum, Tensor}; 20 | use grokking_deep_learning_rs::{argmax, generate_random_vector}; 21 | 22 | fn main() { 23 | println!("\nIntroduction to Tensors\n"); 24 | introduction_to_tensors(); 25 | 26 | println!("\nIntroduction to Autograd\n"); 27 | introduction_to_autograd(); 28 | introduction_to_autograd_2(); 29 | 30 | println!("\nAutograd with multiple tensors\n"); 31 | autograd_with_multiple_tensors(); 32 | autograd_neg(); 33 | 34 | println!("\nUsing Autograd to train a Neural Network\n"); 35 | training_using_autograd(); 36 | 37 | println!("\nAdding Automatic Optimization\n"); 38 | training_with_automatic_optimization(); 39 | 40 | println!("\nLayers Which Contain Layers\n"); 41 | layers_which_contain_layers(); 42 | 43 | println!("\nLoss Function Layers\n"); 44 | loss_function_layers(); 45 | 46 | println!("\nNonLinearity Layers\n"); 47 | nonlinearity_layers(); 48 | 49 | println!("\nEmbedding Layers\n"); 50 | embedding_layer(); 51 | 52 | println!("\nCross Entropy Loss\n"); 53 | cross_entropy_loss(); 54 | 55 | println!("\nRecurrent Neural Network\n"); 56 | recurrent_neural_network().unwrap(); 57 | } 58 | 59 | fn introduction_to_tensors() { 60 | let t1 = BasicTensor1 { data: vec![0.0] }; 61 | let t2 = BasicTensor1 { data: vec![1.0] }; 62 | println!("{:?}", t1 + t2); 63 | } 64 | 65 | #[derive(Debug)] 66 | struct BasicTensor1 { 67 | data: 
Vec, 68 | } 69 | 70 | impl Add for BasicTensor1 { 71 | type Output = BasicTensor1; 72 | 73 | fn add(self, other: BasicTensor1) -> Self::Output { 74 | BasicTensor1 { 75 | data: self 76 | .data 77 | .into_iter() 78 | .zip(other.data.into_iter()) 79 | .map(|(a, b)| a + b) 80 | .collect(), 81 | } 82 | } 83 | } 84 | 85 | fn introduction_to_autograd() { 86 | let x = BasicTensor2::new(vec![1.0, 2.0, 3.0, 4.0, 5.0]); 87 | let y = BasicTensor2::new(vec![2.0; 5]); 88 | 89 | let mut z = x + y; 90 | println!("{:?}", z); 91 | 92 | z.backward(BasicTensor2::new(vec![1.0, 1.0, 1.0, 1.0, 1.0])); 93 | 94 | let xy = z.creators.unwrap(); 95 | 96 | println!("{:?}", xy[0].grad); 97 | println!("{:?}", xy[1].grad); 98 | } 99 | 100 | #[derive(Debug, Clone)] 101 | enum BasicOperation { 102 | Add, 103 | Const, 104 | } 105 | 106 | #[derive(Debug, Clone)] 107 | struct BasicTensor2 { 108 | data: Vec, 109 | grad: Option>, 110 | creation_op: BasicOperation, 111 | creators: Option>, 112 | } 113 | 114 | impl BasicTensor2 { 115 | fn new(data: Vec) -> Self { 116 | BasicTensor2 { 117 | data, 118 | grad: None, 119 | creation_op: BasicOperation::Const, 120 | creators: None, 121 | } 122 | } 123 | 124 | fn backward(&mut self, grad: BasicTensor2) { 125 | match self.creation_op { 126 | BasicOperation::Add => { 127 | for c in self.creators.as_mut().unwrap().iter_mut() { 128 | c.backward(grad.clone()); 129 | } 130 | } 131 | _ => { 132 | self.grad = Some(Box::new(grad)); 133 | } 134 | }; 135 | } 136 | } 137 | 138 | impl Add for BasicTensor2 { 139 | type Output = BasicTensor2; 140 | 141 | fn add(self, other: Self) -> BasicTensor2 { 142 | BasicTensor2 { 143 | data: self 144 | .data 145 | .iter() 146 | .zip(other.data.iter()) 147 | .map(|(a, b)| a + b) 148 | .collect(), 149 | grad: None, 150 | creation_op: BasicOperation::Add, 151 | creators: Some(vec![self, other]), 152 | } 153 | } 154 | } 155 | 156 | #[allow(clippy::many_single_char_names)] 157 | fn introduction_to_autograd_2() { 158 | let a = BasicTensor2::new(vec![1.0, 2.0, 3.0, 4.0, 5.0]); 159 | let b = BasicTensor2::new(vec![2.0; 5]); 160 | let c = BasicTensor2::new(vec![5.0, 4.0, 3.0, 2.0, 1.0]); 161 | let d = BasicTensor2::new(vec![-1.0, -2.0, -3.0, -4.0, -5.0]); 162 | 163 | let e = a + b; 164 | let f = c + d; 165 | let mut g = e + f; 166 | 167 | g.backward(BasicTensor2::new(vec![1.0, 1.0, 1.0, 1.0, 1.0])); 168 | println!("{:?}", g); 169 | 170 | let ef = g.creators.as_ref().unwrap(); 171 | let ab = ef[0].creators.as_ref().unwrap(); 172 | 173 | let a = &ab[0]; 174 | println!("{:?}", a.grad); 175 | } 176 | 177 | #[allow(clippy::many_single_char_names)] 178 | fn autograd_with_multiple_tensors() { 179 | let a = Tensor::new_const(Matrix::new(1, 5, vec![1.0, 2.0, 3.0, 4.0, 5.0])); 180 | let b = Tensor::new_const(Matrix::new(1, 5, vec![2.0, 2.0, 2.0, 2.0, 2.0])); 181 | let c = Tensor::new_const(Matrix::new(1, 5, vec![5.0, 4.0, 3.0, 2.0, 1.0])); 182 | 183 | let d = &a + &b; 184 | let e = &b + &c; 185 | let f = &d + &e; 186 | 187 | // println!("{:#?}", f); 188 | f.backward(Tensor::grad(Matrix::new( 189 | 1, 190 | 5, 191 | vec![1.0, 1.0, 1.0, 1.0, 1.0], 192 | ))); 193 | println!("{:?}", b.0.borrow().grad); 194 | } 195 | 196 | #[allow(clippy::many_single_char_names)] 197 | fn autograd_neg() { 198 | let a = Tensor::new_const(Matrix::new(1, 5, vec![1.0, 2.0, 3.0, 4.0, 5.0])); 199 | let b = Tensor::new_const(Matrix::new(1, 5, vec![2.0, 2.0, 2.0, 2.0, 2.0])); 200 | let c = Tensor::new_const(Matrix::new(1, 5, vec![5.0, 4.0, 3.0, 2.0, 1.0])); 201 | 202 | let d = &a + &(-&b); 203 | let e = &(-&b) 
+ &c; 204 | let f = &d + &e; 205 | 206 | f.backward(Tensor::grad(Matrix::new( 207 | 1, 208 | 5, 209 | vec![1.0, 1.0, 1.0, 1.0, 1.0], 210 | ))); 211 | println!("{:?}", b.0.borrow().grad); 212 | } 213 | 214 | /// Using Autograd to train a Neural Network 215 | 216 | fn training_using_autograd() { 217 | let data = Tensor::new_const(Matrix::new( 218 | 4, 219 | 2, 220 | vec![0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0], 221 | )); 222 | let target = Tensor::new_const(Matrix::new(4, 1, vec![0.0, 1.0, 0.0, 1.0])); 223 | 224 | let distribution = Uniform::new(0.0, 1.0); 225 | 226 | let w1 = Tensor::new_const(Matrix::new( 227 | 2, 228 | 3, 229 | generate_random_vector(2 * 3, 1.0, 0.0, &distribution), 230 | )); 231 | let w2 = Tensor::new_const(Matrix::new( 232 | 3, 233 | 1, 234 | generate_random_vector(3, 1.0, 0.0, &distribution), 235 | )); 236 | 237 | let alpha = 0.1; 238 | 239 | for _ in 0..10 { 240 | let pred = data.dot(&w1).dot(&w2); 241 | let loss = (&(&pred - &target) * &(&pred - &target)).sum(0); 242 | let (loss_rows, loss_cols) = (1, 1); 243 | 244 | println!("Loss: {:?}", loss.0.borrow().data); 245 | 246 | loss.backward(Tensor::grad(Matrix::ones(loss_rows, loss_cols))); 247 | 248 | { 249 | let mut w1 = w1.0.borrow_mut(); 250 | let grad = w1.grad.take(); 251 | w1.grad = None; 252 | 253 | let grad = grad.unwrap(); 254 | let grad = &grad.borrow().data; 255 | 256 | for i in 0..w1.data.rows() { 257 | for j in 0..w1.data.cols() { 258 | w1.data[[i, j]] -= alpha * grad[[i, j]]; 259 | } 260 | } 261 | } 262 | 263 | { 264 | let mut w2 = w2.0.borrow_mut(); 265 | let grad = w2.grad.take(); 266 | w2.grad = None; 267 | 268 | let grad = grad.unwrap(); 269 | let grad = &grad.borrow().data; 270 | 271 | for i in 0..w2.data.rows() { 272 | for j in 0..w2.data.cols() { 273 | w2.data[[i, j]] -= alpha * grad[[i, j]]; 274 | } 275 | } 276 | } 277 | } 278 | } 279 | 280 | /// Adding Automatic Optimization 281 | 282 | fn training_with_automatic_optimization() { 283 | let data = Tensor::new_const(Matrix::new( 284 | 4, 285 | 2, 286 | vec![0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0], 287 | )); 288 | let target = Tensor::new_const(Matrix::new(4, 1, vec![0.0, 1.0, 0.0, 1.0])); 289 | 290 | let distribution = Uniform::new(0.0, 1.0); 291 | 292 | let w1 = Tensor::new_const(Matrix::new( 293 | 2, 294 | 3, 295 | generate_random_vector(2 * 3, 1.0, 0.0, &distribution), 296 | )); 297 | 298 | let w2 = Tensor::new_const(Matrix::new( 299 | 3, 300 | 1, 301 | generate_random_vector(3, 1.0, 0.0, &distribution), 302 | )); 303 | 304 | let alpha = 0.1; 305 | 306 | let optimizer = SGDOptimizer::new(vec![&w1, &w2], alpha); 307 | 308 | for _ in 0..10 { 309 | // predict 310 | let pred = data.dot(&w1).dot(&w2); 311 | 312 | // compare 313 | let loss = (&(&pred - &target) * &(&pred - &target)).sum(0); 314 | let (loss_rows, loss_cols) = (1, 1); 315 | 316 | println!("Loss: {:?}", loss.0.borrow().data.data()); 317 | 318 | // calculate difference 319 | loss.backward(Tensor::grad(Matrix::ones(loss_rows, loss_cols))); 320 | 321 | // learn 322 | optimizer.step(true); 323 | } 324 | } 325 | 326 | /// Layers Which Contain Layers 327 | 328 | fn layers_which_contain_layers() { 329 | let data = Tensor::new_const(Matrix::new( 330 | 4, 331 | 2, 332 | vec![0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0], 333 | )); 334 | 335 | let target = Tensor::new_const(Matrix::new(4, 1, vec![0.0, 1.0, 0.0, 1.0])); 336 | 337 | let model = Sequential::new(vec![ 338 | Box::new(Linear::new(2, 3, false)), 339 | Box::new(Linear::new(3, 1, false)), 340 | ]); 341 | 342 | let optim = 
SGDOptimizer::new(model.parameters(), 0.05); 343 | 344 | for _ in 0..10 { 345 | let pred = model.forward(&[&data]); 346 | 347 | // compare 348 | let loss = (&(&pred[0] - &target) * &(&pred[0] - &target)).sum(0); 349 | 350 | println!("Loss: {:?}", loss.0.borrow().data.data()); 351 | 352 | // calculate difference 353 | loss.backward(Tensor::grad(Matrix::ones(1, 1))); 354 | 355 | // learn 356 | optim.step(true); 357 | } 358 | } 359 | 360 | fn loss_function_layers() { 361 | let data = Tensor::new_const(Matrix::new( 362 | 4, 363 | 2, 364 | vec![0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0], 365 | )); 366 | 367 | let target = Tensor::new_const(Matrix::new(4, 1, vec![0.0, 1.0, 0.0, 1.0])); 368 | 369 | let model = Sequential::new(vec![ 370 | Box::new(Linear::new(2, 3, false)), 371 | Box::new(Linear::new(3, 1, false)), 372 | ]); 373 | 374 | let criterion = MSELoss; 375 | let optim = SGDOptimizer::new(model.parameters(), 0.05); 376 | 377 | for _ in 0..10 { 378 | let pred = model.forward(&[&data]); 379 | 380 | // compare 381 | let loss = criterion.forward(&pred[0], &target); 382 | 383 | println!("Loss: {:?}", loss.0.borrow().data.data()); 384 | 385 | // calculate difference 386 | loss.backward(Tensor::grad(Matrix::ones(1, 1))); 387 | 388 | // learn 389 | optim.step(true); 390 | } 391 | } 392 | 393 | /// NonLinearity Layers 394 | 395 | fn nonlinearity_layers() { 396 | let data = Tensor::new_const(Matrix::new( 397 | 4, 398 | 2, 399 | vec![0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0], 400 | )); 401 | 402 | let target = Tensor::new_const(Matrix::new(4, 1, vec![0.0, 1.0, 0.0, 1.0])); 403 | 404 | let model = Sequential::new(vec![ 405 | Box::new(Linear::new(2, 3, false)), 406 | Box::new(Tanh), 407 | Box::new(Linear::new(3, 1, false)), 408 | Box::new(Sigmoid), 409 | ]); 410 | 411 | let criterion = MSELoss; 412 | let optim = SGDOptimizer::new(model.parameters(), 0.5); 413 | 414 | for _ in 0..10 { 415 | let pred = model.forward(&[&data]); 416 | 417 | // compare 418 | let loss = criterion.forward(&pred[0], &target); 419 | 420 | println!("Loss: {:?}", loss.0.borrow().data.data()); 421 | 422 | // calculate difference 423 | loss.backward(Tensor::grad(Matrix::ones(1, 1))); 424 | 425 | // learn 426 | optim.step(true); 427 | } 428 | } 429 | 430 | /// The Embedding Layer 431 | 432 | fn embedding_layer() { 433 | let data = Tensor::new_const(Matrix::new(1, 4, vec![1.0, 2.0, 1.0, 2.0])); 434 | let target = Tensor::new_const(Matrix::new(4, 1, vec![0.0, 1.0, 0.0, 1.0])); 435 | 436 | let model = Sequential::new(vec![ 437 | Box::new(Embedding::new(5, 3)), 438 | Box::new(Tanh), 439 | Box::new(Linear::new(3, 1, true)), 440 | Box::new(Sigmoid), 441 | ]); 442 | 443 | let criterion = MSELoss; 444 | let optim = SGDOptimizer::new(model.parameters(), 0.07); 445 | 446 | for _ in 0..10 { 447 | let pred = model.forward(&[&data]); 448 | 449 | // compare 450 | let loss = criterion.forward(&pred[0], &target); 451 | 452 | println!("Loss: {:?}", loss.0.borrow().data.data()); 453 | 454 | // calculate difference 455 | loss.backward(Tensor::grad(Matrix::ones(1, 1))); 456 | 457 | // learn 458 | optim.step(true); 459 | } 460 | } 461 | 462 | /// The Cross Entropy Layer 463 | 464 | fn cross_entropy_loss() { 465 | let data = Tensor::new_const(Matrix::new(1, 4, vec![1.0, 2.0, 1.0, 2.0])); 466 | let target = Tensor::new_const(Matrix::new(4, 1, vec![0.0, 1.0, 0.0, 1.0])); 467 | 468 | let model = Sequential::new(vec![ 469 | Box::new(Embedding::new(3, 3)), 470 | Box::new(Tanh), 471 | Box::new(Linear::new(3, 4, true)), 472 | ]); 473 | 474 | let criterion = 
CrossEntropyLoss; 475 | let optim = SGDOptimizer::new(model.parameters(), 0.1); 476 | 477 | for _ in 0..10 { 478 | let pred = model.forward(&[&data]); 479 | // println!("pred {}", pred.0.borrow().data); 480 | 481 | // compare 482 | let loss = criterion.forward(&pred[0], &target); 483 | 484 | println!("Loss: {:?}", loss.0.borrow().data.data()); 485 | 486 | // calculate difference 487 | loss.backward(Tensor::grad(Matrix::ones(1, 1))); 488 | 489 | // learn 490 | optim.step(true); 491 | } 492 | } 493 | 494 | #[allow(clippy::needless_range_loop)] 495 | fn recurrent_neural_network() -> Result<(), Box> { 496 | let (train_data, _) = babi_en_single_supporting_fact_task()?; 497 | 498 | let train_data: Vec> = train_data 499 | .map(|v| vec![v.0, v.1 /*, (v.2).0*/]) 500 | .flat_map(|v| v.into_iter()) 501 | .map(|s| { 502 | s.split_whitespace() 503 | .map(|w| { 504 | w.chars() 505 | .filter(|c| (*c >= 'a' && *c <= 'z') || (*c >= 'A' && *c <= 'Z')) 506 | .collect() 507 | }) 508 | .collect() 509 | }) 510 | .collect(); 511 | 512 | let total_data_size = train_data.len(); 513 | 514 | let words = BTreeSet::from_iter(train_data.iter().flat_map(|v| v.iter())); 515 | 516 | let word_count = words.len(); 517 | let word_index = BTreeMap::from_iter(words.into_iter().zip(0..word_count)); 518 | let inverted_word_index = 519 | BTreeMap::from_iter(word_index.clone().into_iter().map(|(k, v)| (v, k))); 520 | 521 | let train_data: Vec> = train_data 522 | .iter() 523 | .map(|s| s.iter().map(|w| word_index[w] as f64).collect()) 524 | .collect(); 525 | 526 | let max_len = train_data.iter().map(|s| s.len()).max().unwrap(); 527 | let pad = word_index.len() + 1; 528 | 529 | let batch_size = 250; 530 | 531 | let train_data: Vec<_> = train_data 532 | .into_iter() 533 | .batch(batch_size, true) 534 | .map(|v: Vec>| { 535 | let mut ans = vec![vec![0.0; batch_size]; max_len]; 536 | for i in 0..batch_size { 537 | for j in 0..v[i].len() { 538 | ans[j][i] = v[i][j]; 539 | } 540 | 541 | for j in v[i].len()..max_len { 542 | ans[j][i] = pad as f64; 543 | } 544 | } 545 | 546 | ans 547 | }) 548 | .collect(); 549 | 550 | let embedding_size = 16; 551 | 552 | // net 553 | let embed = Embedding::new(word_index.len() + 2, embedding_size); 554 | let model = RNNCell::new(embedding_size, 16, word_index.len() + 2, Box::new(Sigmoid)); 555 | 556 | let criterion = CrossEntropyLoss; 557 | let mut parameters = embed.parameters(); 558 | parameters.append(&mut model.parameters()); 559 | 560 | let optim = SGDOptimizer::new(parameters, 0.01); 561 | 562 | for _ in 0..10 { 563 | let mut total_loss = 0.0; 564 | let mut total_accuracy = 0.0; 565 | 566 | for batch in train_data.iter() { 567 | let mut hidden = model.create_start_state(batch_size); 568 | let mut output = None; 569 | 570 | let len = batch.len(); 571 | 572 | for row in batch.iter().take(len - 1) { 573 | let input = Tensor::new_const(Matrix::new(1, batch_size, row.clone())); 574 | let rnn_input = embed.forward(&[&input]).remove(0); 575 | let mut outputs = model.forward(&[&rnn_input, &hidden]); 576 | output = Some(outputs.remove(0)); 577 | hidden = outputs.remove(0); 578 | } 579 | 580 | let output = output.unwrap(); 581 | 582 | let target = Tensor::new_const(Matrix::new(batch_size, 1, batch[len - 1].clone())); 583 | 584 | let loss = criterion.forward(&output, &target); 585 | loss.backward(Tensor::new_const(Matrix::ones(1, 1))); 586 | 587 | optim.step(true); 588 | 589 | let current_loss = loss.0.borrow().data.data()[0]; 590 | total_loss += current_loss; 591 | 592 | let current_accuracy: f64 = output 593 
| .0 594 | .borrow() 595 | .data 596 | .row_iter() 597 | .zip(batch[len - 1].iter()) 598 | .map(|(row, ix)| { 599 | if argmax(row.raw_slice()) == (*ix) as usize { 600 | 1.0 601 | } else { 602 | 0.0 603 | } 604 | }) 605 | .sum(); 606 | 607 | total_accuracy += current_accuracy; 608 | } 609 | 610 | println!( 611 | "Loss: {}, Accuracy: {}", 612 | total_loss, 613 | total_accuracy / (total_data_size as f64) 614 | ); 615 | } 616 | 617 | let batch = vec![ 618 | vec![word_index[&"Mary".to_owned()] as f64], 619 | vec![word_index[&"moved".to_owned()] as f64], 620 | vec![word_index[&"to".to_owned()] as f64], 621 | vec![word_index[&"the".to_owned()] as f64], 622 | ]; 623 | 624 | let mut hidden = model.create_start_state(1); 625 | let mut output = None; 626 | for row in batch.iter() { 627 | let input = Tensor::new_const(Matrix::new(1, 1, row.clone())); 628 | let rnn_input = embed.forward(&[&input]).remove(0); 629 | let mut outputs = model.forward(&[&rnn_input, &hidden]); 630 | output = Some(outputs.remove(0)); 631 | hidden = outputs.remove(0); 632 | } 633 | 634 | let output = argmax(output.unwrap().0.borrow().data.row(0).raw_slice()); 635 | println!("Prediction: {}", inverted_word_index[&output]); 636 | 637 | Ok(()) 638 | } 639 | -------------------------------------------------------------------------------- /examples/chapter14.rs: -------------------------------------------------------------------------------- 1 | //! Chapter 14 - Learning to Write Like Shakespeare: Long-Short Term Memory 2 | //! 3 | //! https://github.com/iamtrask/Grokking-Deep-Learning/blob/master/Chapter14%20-%20Exploding%20Gradients%20Examples.ipynb 4 | //! https://github.com/iamtrask/Grokking-Deep-Learning/blob/master/Chapter14%20-%20Intro%20to%20LSTMs%20-%20Learn%20to%20Write%20Like%20Shakespeare.ipynb 5 | //! 
https://github.com/iamtrask/Grokking-Deep-Learning/blob/master/Chapter14%20-%20Intro%20to%20LSTMs%20-%20Part%202%20-%20Learn%20to%20Write%20Like%20Shakespeare.ipynb 6 | 7 | use std::collections::{BTreeMap, BTreeSet}; 8 | use std::error::Error; 9 | use std::iter::FromIterator; 10 | use std::ops::Mul; 11 | 12 | use datasets::text::shakespeare_100000; 13 | use indicatif::{ProgressBar, ProgressStyle}; 14 | use rulinalg::matrix::{BaseMatrix, Matrix}; 15 | 16 | use grokking_deep_learning_rs::activations::Sigmoid; 17 | use grokking_deep_learning_rs::layers::{Embedding, LSTMCell, Layer, RNNCell}; 18 | use grokking_deep_learning_rs::losses::{CrossEntropyLoss, Loss}; 19 | use grokking_deep_learning_rs::optimizers::{Optimizer, SGDOptimizer}; 20 | use grokking_deep_learning_rs::tensor::Tensor; 21 | 22 | fn main() -> Result<(), Box> { 23 | println!("\nTraining Shakespeare using RNN Cells\n"); 24 | shakespeare_rnn_cell()?; 25 | 26 | println!("\nVanishing and Exploding Gradients\n"); 27 | vanishing_and_exploding_gradients(); 28 | 29 | println!("\nTraining Shakespeare using LSTM Cells\n"); 30 | shakespeare_lstm_cell()?; 31 | 32 | Ok(()) 33 | } 34 | 35 | fn shakespeare_rnn_cell() -> Result<(), Box> { 36 | let embedding_size = 64; 37 | let rnn_state_size = 512; 38 | let alpha = 0.05; 39 | let batch_size = 16; 40 | let bptt = 25; 41 | 42 | let n_iterations = 1; 43 | 44 | let data = shakespeare_100000()?; 45 | 46 | let characters = BTreeSet::from_iter(data.chars()); 47 | let len = characters.len(); 48 | let word_index = BTreeMap::from_iter(characters.iter().zip(0..len)); 49 | 50 | let indices: Vec<_> = data.chars().map(|c| word_index[&c]).collect(); 51 | 52 | let embed = Embedding::new(len, embedding_size); 53 | let cell = RNNCell::new(embedding_size, rnn_state_size, len, Box::new(Sigmoid)); 54 | 55 | let criterion = CrossEntropyLoss; 56 | 57 | let mut params = embed.parameters(); 58 | params.append(&mut cell.parameters()); 59 | 60 | let optimizer = SGDOptimizer::new(params, alpha); 61 | 62 | let n_batches = (indices.len() as f64 / batch_size as f64).floor() as usize; 63 | 64 | let mut batched_data = Matrix::zeros(n_batches, batch_size); 65 | for (i, c) in indices.into_iter().enumerate() { 66 | if i >= batched_data.data().len() { 67 | break; 68 | } 69 | 70 | let row = i / n_batches; 71 | let col = i % n_batches; 72 | 73 | batched_data[[col, row]] = c as f64; 74 | } 75 | 76 | dbg!(n_batches); 77 | 78 | let n_batches = 100 + 1; 79 | 80 | let steps = (n_batches - 1) / bptt; 81 | 82 | for _ in 0..n_iterations { 83 | let progress = ProgressBar::new((n_batches - 1) as u64); 84 | progress.set_style( 85 | ProgressStyle::default_bar() 86 | .template("{msg} {bar:40.cyan/blue} {pos:>7}/{len:7} [{elapsed_precise}]"), 87 | ); 88 | 89 | for j in 0..steps { 90 | let start = bptt * j; 91 | 92 | let mut state = cell.create_start_state(batch_size); 93 | 94 | let mut loss = None; 95 | 96 | for k in 0..bptt { 97 | let input = batched_data.row(start + k).raw_slice(); 98 | let target = batched_data.row(start + k + 1).raw_slice(); 99 | 100 | let input = Tensor::new_const(Matrix::new(1, batch_size, Vec::from(input))); 101 | let target = Tensor::new_const(Matrix::new(batch_size, 1, Vec::from(target))); 102 | 103 | let rnn_input = &embed.forward(&[&input])[0]; 104 | let mut outputs = cell.forward(&[rnn_input, &state]); 105 | 106 | let output = outputs.remove(0); 107 | state = outputs.remove(0); 108 | 109 | let current_loss = criterion.forward(&output, &target); 110 | progress.set_message(&format!( 111 | "Batch Loss: {:?}", 112 | 
current_loss.0.borrow().data.data() 113 | )); 114 | 115 | loss = match loss.take() { 116 | None => Some(current_loss), 117 | Some(existing_loss) => Some(&existing_loss + ¤t_loss), 118 | }; 119 | 120 | progress.inc(1); 121 | } 122 | 123 | loss.unwrap().backward(Tensor::grad(Matrix::ones(1, 1))); 124 | optimizer.step(true); 125 | } 126 | 127 | progress.finish(); 128 | } 129 | 130 | Ok(()) 131 | } 132 | 133 | fn vanishing_and_exploding_gradients() { 134 | let weights = Matrix::new(2, 2, vec![1.0, 4.0, 4.0, 1.0]); 135 | let mut activation = sigmoid(Matrix::new(1, 2, vec![1.0, 0.01])); 136 | 137 | println!("Sigmoid Activations"); 138 | let mut activations = Vec::new(); 139 | for _ in 0..10 { 140 | activation = sigmoid(activation.mul(&weights)); 141 | activations.push(activation.clone()); 142 | println!("{}", activation); 143 | } 144 | 145 | println!("\nSigmoid Gradients"); 146 | let mut gradient = Matrix::ones(1, 2); 147 | for activation in activations.into_iter().rev() { 148 | gradient = activation 149 | .elemul(&(Matrix::ones(1, 2) - &activation)) 150 | .elemul(&gradient); 151 | gradient = gradient.mul(weights.transpose()); 152 | println!("{}", gradient); 153 | } 154 | 155 | println!("\nrelu Activations"); 156 | let mut activations = Vec::new(); 157 | for _ in 0..10 { 158 | activation = relu(activation.mul(&weights)); 159 | activations.push(activation.clone()); 160 | println!("{}", activation); 161 | } 162 | 163 | println!("\nrelu Gradients"); 164 | let mut gradient = Matrix::ones(1, 2); 165 | for activation in activations.into_iter().rev() { 166 | gradient = gradient.elemul(&Matrix::new( 167 | 1, 168 | 2, 169 | activation 170 | .data() 171 | .iter() 172 | .map(|v| if v > &0.0 { *v } else { 0.0 }) 173 | .collect::>(), 174 | )); 175 | gradient = gradient.mul(weights.transpose()); 176 | println!("{}", gradient); 177 | } 178 | } 179 | 180 | fn sigmoid(mut m: Matrix) -> Matrix { 181 | for i in 0..m.rows() { 182 | for j in 0..m.cols() { 183 | m[[i, j]] = 1.0 / (1.0 + (-m[[i, j]]).exp()); 184 | } 185 | } 186 | 187 | m 188 | } 189 | 190 | fn relu(mut m: Matrix) -> Matrix { 191 | for i in 0..m.rows() { 192 | for j in 0..m.cols() { 193 | m[[i, j]] = if m[[i, j]] > 0.0 { m[[i, j]] } else { 0.0 }; 194 | } 195 | } 196 | 197 | m 198 | } 199 | 200 | fn shakespeare_lstm_cell() -> Result<(), Box> { 201 | let embedding_size = 64; 202 | let rnn_state_size = 512; 203 | let alpha = 0.05; 204 | let batch_size = 16; 205 | let bptt = 25; 206 | 207 | let n_iterations = 1; 208 | 209 | let data = shakespeare_100000()?; 210 | 211 | let characters = BTreeSet::from_iter(data.chars()); 212 | let len = characters.len(); 213 | let word_index = BTreeMap::from_iter(characters.iter().zip(0..len)); 214 | 215 | let indices: Vec<_> = data.chars().map(|c| word_index[&c]).collect(); 216 | 217 | let embed = Embedding::new(len, embedding_size); 218 | let cell = LSTMCell::new(embedding_size, rnn_state_size, len); 219 | 220 | let criterion = CrossEntropyLoss; 221 | 222 | let optimizer = SGDOptimizer::new( 223 | embed 224 | .parameters() 225 | .into_iter() 226 | .chain(cell.parameters().into_iter()) 227 | .collect(), 228 | alpha, 229 | ); 230 | 231 | let n_batches = (indices.len() as f64 / batch_size as f64).floor() as usize; 232 | 233 | let mut batched_data = Matrix::zeros(n_batches, batch_size); 234 | for (i, c) in indices.into_iter().enumerate() { 235 | if i >= batched_data.data().len() { 236 | break; 237 | } 238 | 239 | let row = i / n_batches; 240 | let col = i % n_batches; 241 | 242 | batched_data[[col, row]] = c as f64; 243 | } 
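// Layout note for the loop above: `batched_data` is n_batches rows by batch_size columns, and
// column `c` receives the contiguous character slice c*n_batches..(c+1)*n_batches, so each of the
// `batch_size` columns is an independent text stream. During the truncated-BPTT loop below, row `k`
// is the input and row `k + 1` the target across all streams. (The `row`/`col` variable names above
// are swapped relative to how they index the matrix.)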
244 | 245 | dbg!(n_batches); 246 | 247 | let n_batches = 100 + 1; 248 | 249 | let steps = (n_batches - 1) / bptt; 250 | 251 | for _ in 0..n_iterations { 252 | let progress = ProgressBar::new((n_batches - 1) as u64); 253 | progress.set_style( 254 | ProgressStyle::default_bar() 255 | .template("{msg} {bar:40.cyan/blue} {pos:>7}/{len:7} [{elapsed_precise}]"), 256 | ); 257 | 258 | for j in 0..steps { 259 | let start = bptt * j; 260 | 261 | let (mut h, mut c) = cell.create_start_state(batch_size); 262 | 263 | let mut loss = None; 264 | 265 | for k in 0..bptt { 266 | let input = batched_data.row(start + k).raw_slice(); 267 | let target = batched_data.row(start + k + 1).raw_slice(); 268 | 269 | let input = Tensor::new_const(Matrix::new(1, batch_size, Vec::from(input))); 270 | let target = Tensor::new_const(Matrix::new(batch_size, 1, Vec::from(target))); 271 | 272 | let rnn_input = &embed.forward(&[&input])[0]; 273 | let mut outputs = cell.forward(&[rnn_input, &h, &c]); 274 | 275 | let output = outputs.remove(0); 276 | h = outputs.remove(0); 277 | c = outputs.remove(0); 278 | 279 | let current_loss = criterion.forward(&output, &target); 280 | progress.set_message(&format!( 281 | "Batch Loss: {:?}", 282 | current_loss.0.borrow().data.data() 283 | )); 284 | 285 | loss = match loss.take() { 286 | None => Some(current_loss), 287 | Some(existing_loss) => Some(&existing_loss + ¤t_loss), 288 | }; 289 | 290 | progress.inc(1); 291 | } 292 | 293 | loss.unwrap().backward(Tensor::grad(Matrix::ones(1, 1))); 294 | optimizer.step(true); 295 | } 296 | 297 | progress.finish(); 298 | } 299 | 300 | Ok(()) 301 | } 302 | -------------------------------------------------------------------------------- /examples/chapter15.rs: -------------------------------------------------------------------------------- 1 | //! Chapter 15: Introduction to Federated Learning 2 | //! 3 | //! 
https://github.com/iamtrask/Grokking-Deep-Learning/blob/master/Chapter15%20-%20Intro%20to%20Federated%20Learning%20-%20Deep%20Learning%20on%20Unseen%20Data.ipynb 4 | 5 | use std::collections::{BTreeMap, BTreeSet}; 6 | use std::error::Error; 7 | use std::iter::FromIterator; 8 | 9 | use datasets::text::enron_spam; 10 | use datasets::Dataset; 11 | use indicatif::{ProgressBar, ProgressStyle}; 12 | // use paillier::traits::{Add, Decrypt, Encrypt, KeyGeneration, Mul}; 13 | // use paillier::{EncryptionKey, Paillier}; 14 | use rulinalg::matrix::Matrix; 15 | 16 | use grokking_deep_learning_rs::layers::{Embedding, Layer}; 17 | use grokking_deep_learning_rs::losses::{Loss, MSELoss}; 18 | use grokking_deep_learning_rs::optimizers::{Optimizer, SGDOptimizer}; 19 | use grokking_deep_learning_rs::tensor::{Sum, Tensor}; 20 | 21 | fn main() -> Result<(), Box> { 22 | println!("\nRegular Deep Learning\n"); 23 | regular_deep_learning()?; 24 | 25 | println!("\nFederated Deep Learning\n"); 26 | federated_deep_learning()?; 27 | 28 | Ok(()) 29 | } 30 | 31 | /// Regular Deep Learning 32 | 33 | fn regular_deep_learning() -> Result<(), Box> { 34 | let (spam, ham) = enron_spam()?; 35 | 36 | let dataset_size = 3000; 37 | let max_sentence_len = 100; 38 | 39 | let (spam, ham) = ( 40 | parse_dataset(spam, dataset_size, max_sentence_len), 41 | parse_dataset(ham, dataset_size, max_sentence_len), 42 | ); 43 | 44 | let word_index = { 45 | let words = BTreeSet::from_iter(spam.iter().chain(ham.iter()).flat_map(|v| v.iter())); 46 | let word_count = words.len(); 47 | BTreeMap::from_iter(words.into_iter().zip(0..word_count)) 48 | }; 49 | 50 | let word_count = word_index.len(); 51 | 52 | dbg!(word_count); 53 | 54 | let spam = spam 55 | .iter() 56 | .map(|sentence| { 57 | sentence 58 | .iter() 59 | .map(|word| word_index[&word] as f64) 60 | .collect::>() 61 | }) 62 | .collect::>(); 63 | 64 | let ham = ham 65 | .iter() 66 | .map(|sentence| { 67 | sentence 68 | .iter() 69 | .map(|word| word_index[&word] as f64) 70 | .collect::>() 71 | }) 72 | .collect::>(); 73 | 74 | let train_data = spam 75 | .iter() 76 | .take(dataset_size / 2) 77 | .cloned() 78 | .zip(vec![1.0; dataset_size / 2]) 79 | .chain( 80 | ham.iter() 81 | .take(dataset_size / 2) 82 | .cloned() 83 | .zip(vec![0.0; dataset_size / 2]), 84 | ) 85 | .shuffle(dataset_size, 0) 86 | .collect::>(); 87 | 88 | let test_data = spam 89 | .iter() 90 | .skip(dataset_size / 2) 91 | .cloned() 92 | .zip(vec![1.0; dataset_size / 2]) 93 | .chain( 94 | ham.iter() 95 | .skip(dataset_size / 2) 96 | .cloned() 97 | .zip(vec![0.0; dataset_size / 2]), 98 | ) 99 | .shuffle(dataset_size, 0) 100 | .collect::>(); 101 | 102 | let model = Embedding::new(word_count, 1); 103 | 104 | { 105 | model.weights.0.borrow_mut().data *= 0.0; 106 | } 107 | 108 | let n_iterations = 10; 109 | let batch_size = 200; 110 | let n_batches = dataset_size / batch_size; 111 | 112 | let model = train( 113 | model, 114 | train_data, 115 | dataset_size, 116 | &word_index, 117 | max_sentence_len, 118 | n_iterations, 119 | n_batches, 120 | batch_size, 121 | ); 122 | 123 | let accuracy = test(&model, &test_data, dataset_size, max_sentence_len); 124 | 125 | println!("Test Accuracy: {}", accuracy); 126 | 127 | Ok(()) 128 | } 129 | 130 | #[allow(clippy::needless_range_loop, clippy::too_many_arguments)] 131 | fn train( 132 | model: Embedding, 133 | data: Vec<(Vec, f64)>, 134 | dataset_size: usize, 135 | word_index: &BTreeMap<&String, usize>, 136 | max_sentence_len: usize, 137 | n_iterations: usize, 138 | n_batches: usize, 139 | 
batch_size: usize, 140 | ) -> Embedding { 141 | // NOTE: Unlike the Python version, cannot do batching as cannot support 3D operations 142 | // so running stochastic gradient descent in batch_size iterations and accumulating loss 143 | let criterion = MSELoss; 144 | let optim = SGDOptimizer::new(model.parameters(), 0.01); 145 | 146 | for _ in 0..n_iterations { 147 | let progress = ProgressBar::new(n_batches as u64); 148 | progress.set_style( 149 | ProgressStyle::default_bar() 150 | .template("{msg} {bar:40.cyan/blue} {pos:>7}/{len:7} [{elapsed_precise}]"), 151 | ); 152 | 153 | let mut total_loss = 0.0; 154 | 155 | for bi in 0..n_batches { 156 | let mut current_loss = 0.0; 157 | 158 | { 159 | model.weights.0.borrow_mut().data[[word_index[&"".to_owned()], 0]] *= 0.0; 160 | } 161 | 162 | for i in (batch_size * bi)..(batch_size * (bi + 1)) { 163 | let (input, output) = &data[i]; 164 | let input = Tensor::new_const(Matrix::new(1, max_sentence_len, input.clone())); 165 | let prediction = model.forward(&[&input]).remove(0); 166 | let prediction = prediction.sum(0); 167 | let prediction = prediction.sigmoid(); 168 | 169 | let target = Tensor::new_const(Matrix::new(1, 1, vec![*output])); 170 | 171 | let loss = criterion.forward(&prediction, &target); 172 | 173 | current_loss += loss.0.borrow().data.data()[0]; 174 | 175 | loss.backward(Tensor::grad(Matrix::ones(1, 1))); 176 | optim.step(true); 177 | } 178 | 179 | total_loss += current_loss; 180 | 181 | progress.set_message(&format!("Loss: {}", current_loss / (batch_size as f64))); 182 | progress.inc(1); 183 | } 184 | 185 | progress.finish_with_message(&format!("Loss: {}", total_loss / (dataset_size as f64))); 186 | } 187 | 188 | model 189 | } 190 | 191 | fn test( 192 | model: &Embedding, 193 | data: &[(Vec, f64)], 194 | dataset_size: usize, 195 | max_sentence_len: usize, 196 | ) -> f64 { 197 | let mut accuracy = 0.0; 198 | 199 | for item in data.iter().take(dataset_size / 2) { 200 | let (input, output) = item; 201 | let input = Tensor::new_const(Matrix::new(1, max_sentence_len, input.clone())); 202 | let prediction = model.forward(&[&input]).remove(0); 203 | let prediction = prediction.sum(0); 204 | let prediction = prediction.sigmoid(); 205 | 206 | if (prediction.0.borrow().data.data()[0] >= 0.5 && (output - 1.0).abs() < std::f64::EPSILON) 207 | || (prediction.0.borrow().data.data()[0] < 0.5 && (output - 0.0).abs() < std::f64::EPSILON) 208 | { 209 | accuracy += 1.0; 210 | } 211 | } 212 | 213 | accuracy / ((dataset_size / 2) as f64) 214 | } 215 | 216 | fn parse_dataset( 217 | dataset: impl Dataset, 218 | dataset_size: usize, 219 | max_sentence_len: usize, 220 | ) -> Vec> { 221 | dataset 222 | .take(dataset_size) 223 | .map(|email| { 224 | email 225 | .split('\n') 226 | .map(|line| line.split_whitespace()) 227 | .flat_map(|v| v) 228 | .map(|v| { 229 | v.chars() 230 | .filter(|c| (*c >= 'a' && *c <= 'z') || (*c >= 'A' && *c <= 'Z')) 231 | .collect::() 232 | }) 233 | .collect::>() 234 | }) 235 | .map(|mut email| { 236 | if email.len() >= max_sentence_len { 237 | email.drain(max_sentence_len..email.len()); 238 | } else { 239 | for _ in 0..(max_sentence_len - email.len()) { 240 | email.push("".to_owned()); 241 | } 242 | } 243 | 244 | email 245 | }) 246 | .collect() 247 | } 248 | 249 | fn federated_deep_learning() -> Result<(), Box> { 250 | let (spam, ham) = enron_spam()?; 251 | 252 | let dataset_size = 4000; 253 | let train_dataset_size = 3000; 254 | let test_dataset_size = dataset_size - train_dataset_size; 255 | let max_sentence_len = 100; 256 | 257 | 
let (spam, ham) = ( 258 | parse_dataset(spam, dataset_size, max_sentence_len), 259 | parse_dataset(ham, dataset_size, max_sentence_len), 260 | ); 261 | 262 | let word_index = { 263 | let words = BTreeSet::from_iter(spam.iter().chain(ham.iter()).flat_map(|v| v.iter())); 264 | let word_count = words.len(); 265 | BTreeMap::from_iter(words.into_iter().zip(0..word_count)) 266 | }; 267 | 268 | let word_count = word_index.len(); 269 | 270 | dbg!(word_count); 271 | 272 | let spam = spam 273 | .iter() 274 | .map(|sentence| { 275 | sentence 276 | .iter() 277 | .map(|word| word_index[&word] as f64) 278 | .collect::>() 279 | }) 280 | .collect::>(); 281 | 282 | let ham = ham 283 | .iter() 284 | .map(|sentence| { 285 | sentence 286 | .iter() 287 | .map(|word| word_index[&word] as f64) 288 | .collect::>() 289 | }) 290 | .collect::>(); 291 | 292 | let train_data = spam 293 | .iter() 294 | .take(train_dataset_size) 295 | .cloned() 296 | .zip(vec![1.0; train_dataset_size]) 297 | .chain( 298 | ham.iter() 299 | .take(train_dataset_size) 300 | .cloned() 301 | .zip(vec![0.0; train_dataset_size]), 302 | ) 303 | .shuffle(2 * train_dataset_size, 0) 304 | .collect::>(); 305 | 306 | let test_data = spam 307 | .iter() 308 | .skip(train_dataset_size) 309 | .cloned() 310 | .zip(vec![1.0; test_dataset_size]) 311 | .chain( 312 | ham.iter() 313 | .skip(train_dataset_size) 314 | .cloned() 315 | .zip(vec![0.0; test_dataset_size]), 316 | ) 317 | .shuffle(2 * test_dataset_size, 0) 318 | .collect::>(); 319 | 320 | let alice: Vec<_> = train_data 321 | .iter() 322 | .take(train_dataset_size / 3) 323 | .cloned() 324 | .collect(); 325 | let bob: Vec<_> = train_data 326 | .iter() 327 | .skip(train_dataset_size / 3) 328 | .take(train_dataset_size / 3) 329 | .cloned() 330 | .collect(); 331 | let charlie: Vec<_> = train_data 332 | .iter() 333 | .skip(2 * train_dataset_size / 3) 334 | .cloned() 335 | .collect(); 336 | 337 | let alice_model = Embedding::new(word_count, 1); 338 | let bob_model = Embedding::new(word_count, 1); 339 | let charlie_model = Embedding::new(word_count, 1); 340 | 341 | { 342 | alice_model.weights.0.borrow_mut().data *= 0.0; 343 | bob_model.weights.0.borrow_mut().data *= 0.0; 344 | charlie_model.weights.0.borrow_mut().data *= 0.0; 345 | } 346 | 347 | let n_iterations = 10; 348 | let batch_size = 200; 349 | let n_batches = train_dataset_size / (3 * batch_size); 350 | 351 | println!("Training Alice"); 352 | let alice_model = train( 353 | alice_model, 354 | alice, 355 | train_dataset_size / 3, 356 | &word_index, 357 | max_sentence_len, 358 | n_iterations, 359 | n_batches, 360 | batch_size, 361 | ); 362 | 363 | println!("Training Bob"); 364 | let bob_model = train( 365 | bob_model, 366 | bob, 367 | train_dataset_size / 3, 368 | &word_index, 369 | max_sentence_len, 370 | n_iterations, 371 | n_batches, 372 | batch_size, 373 | ); 374 | 375 | println!("Training Charlie"); 376 | let charlie_model = train( 377 | charlie_model, 378 | charlie, 379 | train_dataset_size / 3, 380 | &word_index, 381 | max_sentence_len, 382 | n_iterations, 383 | n_batches, 384 | batch_size, 385 | ); 386 | 387 | let alice_weights = &alice_model.weights.0.borrow().data; 388 | let bob_weights = &bob_model.weights.0.borrow().data; 389 | let charlie_weights = &charlie_model.weights.0.borrow().data; 390 | 391 | let weights = alice_weights + bob_weights + charlie_weights; 392 | let weights = weights / 3.0; 393 | 394 | let model = Embedding::from_weights(weights); 395 | 396 | let accuracy = test(&model, &test_data, dataset_size, max_sentence_len); 397 | 
398 | println!("Test Accuracy: {}", accuracy); 399 | 400 | Ok(()) 401 | } 402 | 403 | // fn train_and_encrypt( 404 | // model: Embedding, 405 | // data: Vec<(Vec, f64)>, 406 | // dataset_size: usize, 407 | // word_index: &BTreeMap<&String, usize>, 408 | // max_sentence_len: usize, 409 | // n_iterations: usize, 410 | // n_batches: usize, 411 | // batch_size: usize, 412 | // encryption_key: &EncryptionKey, 413 | // ) -> Vec> { 414 | // let model = train( 415 | // model, 416 | // data, 417 | // dataset_size, 418 | // word_index, 419 | // max_sentence_len, 420 | // n_iterations, 421 | // n_batches, 422 | // batch_size, 423 | // ); 424 | // 425 | // model 426 | // .weights 427 | // .0 428 | // .borrow() 429 | // .data 430 | // .data() 431 | // .iter() 432 | // .map(|v| Paillier::encrypt(&encryption_key, *v)) 433 | // .collect() 434 | // } 435 | -------------------------------------------------------------------------------- /examples/chapter3.rs: -------------------------------------------------------------------------------- 1 | //! Chapter3 - Forward Propagation - Intro to Neural Prediction 2 | //! 3 | //! https://github.com/iamtrask/Grokking-Deep-Learning/blob/master/Chapter3%20-%20%20Forward%20Propagation%20-%20Intro%20to%20Neural%20Prediction.ipynb 4 | 5 | use grokking_deep_learning_rs::{ 6 | dot, elementwise_scalar_multiplication, matrix_vector_dot, Matrix, Vector, 7 | }; 8 | 9 | fn main() { 10 | // different sections of the chapter in order. 11 | what_is_a_neural_network(); 12 | making_a_prediction_with_multiple_inputs(); 13 | making_a_prediction_with_multiple_outputs(); 14 | predicting_with_multiple_inputs_and_outputs(); 15 | predicting_on_predictions(); 16 | } 17 | 18 | /// A Simple Neural Network making a prediction 19 | /// 20 | /// What is a neural network? 
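// The simplest possible network: one input scaled by one weight, so the prediction is just
// `input * weight`, as computed by `neural_network_1` below.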
21 | 22 | fn what_is_a_neural_network() { 23 | let number_of_toes = vec![8.5, 9.5, 10.0, 9.0]; 24 | 25 | let input = number_of_toes[0]; 26 | let weight = 0.1; 27 | 28 | let prediction = neural_network_1(input, weight); 29 | println!("prediction: {}", prediction); 30 | } 31 | 32 | #[allow(clippy::let_and_return)] 33 | fn neural_network_1(input: f64, weight: f64) -> f64 { 34 | let prediction = input * weight; 35 | prediction 36 | } 37 | 38 | /// Making a prediction with multiple inputs 39 | 40 | fn making_a_prediction_with_multiple_inputs() { 41 | let toes = vec![8.5, 9.5, 9.9, 9.0]; 42 | let wlrec = vec![0.65, 0.8, 0.8, 0.9]; 43 | let nfans = vec![1.2, 1.3, 0.5, 1.0]; 44 | 45 | let input = vec![toes[0], wlrec[0], nfans[0]]; 46 | let weights = vec![0.1, 0.2, 0.0]; 47 | 48 | let pred = neural_network_2(input, weights); 49 | println!("prediction: {}", pred); 50 | } 51 | 52 | fn neural_network_2(input: Vec, weights: Vec) -> f64 { 53 | dot(&input, &weights) 54 | } 55 | 56 | /// Making a prediction with multiple outputs 57 | 58 | fn making_a_prediction_with_multiple_outputs() { 59 | let wlrec = vec![0.65, 0.8, 0.8, 0.9]; 60 | 61 | let input = wlrec[0]; 62 | let weights = vec![0.3, 0.2, 0.9]; 63 | 64 | let pred = neural_network_3(input, weights); 65 | println!("predictions: {:?}", pred); 66 | } 67 | 68 | fn neural_network_3(input: f64, weights: Vec) -> Vec { 69 | elementwise_scalar_multiplication(&weights, input) 70 | } 71 | 72 | /// Predicting with multiple inputs and outputs 73 | 74 | fn predicting_with_multiple_inputs_and_outputs() { 75 | let toes = vec![8.5, 9.5, 9.9, 9.0]; 76 | let wlrec = vec![0.65, 0.8, 0.8, 0.9]; 77 | let nfans = vec![1.2, 1.3, 0.5, 1.0]; 78 | 79 | let input = vec![toes[0], wlrec[0], nfans[0]]; 80 | let weights = vec![ 81 | vec![0.1, 0.1, -0.3], 82 | vec![0.1, 0.2, 0.0], 83 | vec![0.0, 1.3, 0.1], 84 | ]; 85 | 86 | let pred = neural_network_4(input, weights); 87 | println!("predictions: {:?}", pred); 88 | } 89 | 90 | fn neural_network_4(input: Vector, weights: Matrix) -> Vector { 91 | matrix_vector_dot(&weights, &input) 92 | } 93 | 94 | /// Predicting on Predictions 95 | 96 | fn predicting_on_predictions() { 97 | let toes = vec![8.5, 9.5, 9.9, 9.0]; 98 | let wlrec = vec![0.65, 0.8, 0.8, 0.9]; 99 | let nfans = vec![1.2, 1.3, 0.5, 1.0]; 100 | 101 | let input = vec![toes[0], wlrec[0], nfans[0]]; 102 | let input_weights = vec![ 103 | vec![0.1, 0.2, -0.1], 104 | vec![-0.1, 0.1, 0.9], 105 | vec![0.1, 0.4, 0.1], 106 | ]; 107 | let hidden1_weights = vec![ 108 | vec![0.3, 1.1, -0.3], 109 | vec![0.1, 0.2, 0.0], 110 | vec![0.0, 1.3, 0.1], 111 | ]; 112 | 113 | let pred = neural_network_5(input, input_weights, hidden1_weights); 114 | println!("predictions: {:?}", pred); 115 | } 116 | 117 | fn neural_network_5(input: Vector, input_weights: Matrix, hidden1_weights: Matrix) -> Vector { 118 | matrix_vector_dot(&hidden1_weights, &matrix_vector_dot(&input_weights, &input)) 119 | } 120 | -------------------------------------------------------------------------------- /examples/chapter4.rs: -------------------------------------------------------------------------------- 1 | //! Chapter4 - Gradient Descent - Intro to Neural Learning 2 | //! 3 | //! 
https://github.com/iamtrask/Grokking-Deep-Learning/blob/master/Chapter4%20-%20Gradient%20Descent%20-%20Intro%20to%20Neural%20Learning.ipynb 4 | 5 | fn main() { 6 | println!("\nLearning using hot and cold method\n"); 7 | hot_and_cold_method(); 8 | 9 | println!("\nHot and Cold Learning\n"); 10 | hot_and_cold_learning(); 11 | 12 | println!("\nCalculating both direction and amount from error.\n"); 13 | hot_and_cold_learning_with_direction_and_amount(); 14 | 15 | println!("\nOne Iteration of Gradient Descent\n"); 16 | gradient_descent_method(); 17 | 18 | println!("\nLearning is just reducing error\n"); 19 | gradient_descent(); 20 | 21 | println!("\nLet's watch several steps of learning\n"); 22 | gradient_descent_2(); 23 | 24 | println!("\nWhy does this work? What really is weight delta?\n"); 25 | gradient_descent_3(); 26 | 27 | println!("\nBreaking Gradient Descent\n"); 28 | gradient_descent_working(); 29 | println!(); 30 | gradient_descent_breaking(); 31 | 32 | println!("\nAlpha\n"); 33 | gradient_descent_working_again(); 34 | } 35 | 36 | /// Learning using hot and cold method 37 | 38 | #[allow(unused_assignments)] 39 | fn hot_and_cold_method() { 40 | let (mut weight, lr) = (0.1, 0.01); 41 | let (number_of_toes, win_or_lose_binary) = ([8.5], [1.0]); 42 | 43 | let (input, truth) = (number_of_toes[0], win_or_lose_binary[0]); 44 | 45 | let pred = neural_network(input, weight); 46 | 47 | let err = (pred - truth).powf(2.0); 48 | println!("error: {}", err); 49 | 50 | let (pred_up, pred_down) = ( 51 | neural_network(input, weight + lr), 52 | neural_network(input, weight - lr), 53 | ); 54 | let (err_up, err_down) = ((pred_up - truth).powf(2.0), (pred_down - truth).powf(2.0)); 55 | println!("error up: {}, error down: {}", err_up, err_down); 56 | 57 | if err_up < err_down { 58 | weight += lr; 59 | } else { 60 | weight -= lr; 61 | } 62 | } 63 | 64 | /// Hot and Cold Learning 65 | 66 | fn hot_and_cold_learning() { 67 | let mut weight = 0.5; 68 | 69 | let (input, truth) = (0.5, 0.8); 70 | 71 | let n_iterations = 20; 72 | let lr = 0.001; 73 | 74 | for _ in 0..n_iterations { 75 | let pred = neural_network(input, weight); 76 | 77 | let err = (pred - truth).powf(2.0); 78 | println!("Error: {}, Prediction: {}", err, pred); 79 | 80 | let (pred_up, pred_down) = ( 81 | neural_network(input, weight + lr), 82 | neural_network(input, weight - lr), 83 | ); 84 | let (err_up, err_down) = ((pred_up - truth).powf(2.0), (pred_down - truth).powf(2.0)); 85 | 86 | if err_up < err_down { 87 | weight += lr; 88 | } else if err_up > err_down { 89 | weight -= lr; 90 | } 91 | } 92 | } 93 | 94 | /// Calculating both direction and amount from error. 
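// Instead of probing the error on both sides of the weight (hot and cold), the update below is read
// straight off the error: for pred = input * weight and error = (pred - truth)^2, the slope with
// respect to the weight is 2 * (pred - truth) * input, so (pred - truth) * input gives both the
// direction and the amount to move. Worked step with the values used below (weight = 0.5,
// input = 0.5, truth = 0.8): pred = 0.25, direction_and_amount = (0.25 - 0.8) * 0.5 = -0.275,
// the weight becomes 0.775, and the error drops from 0.3025 to about 0.17 on the next pass.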
95 | 96 | fn hot_and_cold_learning_with_direction_and_amount() { 97 | let mut weight = 0.5; 98 | 99 | let (input, truth) = (0.5, 0.8); 100 | 101 | let n_iterations = 1101; 102 | 103 | for _ in 0..n_iterations { 104 | let pred = neural_network(input, weight); 105 | 106 | let err = (pred - truth).powf(2.0); 107 | println!("Error: {}, Prediction: {}", err, pred); 108 | 109 | let direction_and_amount = (pred - truth) * input; 110 | weight -= direction_and_amount; 111 | } 112 | } 113 | 114 | /// One Iteration of Gradient Descent 115 | 116 | #[allow(unused_variables, unused_assignments)] 117 | fn gradient_descent_method() { 118 | let (mut weight, alpha) = (0.1, 0.01); 119 | let (number_of_toes, win_or_lose_binary) = ([8.5], [1.0]); 120 | 121 | let (input, truth) = (number_of_toes[0], win_or_lose_binary[0]); 122 | 123 | let pred = neural_network(input, truth); 124 | let err = (pred - truth).powf(2.0); 125 | 126 | let delta = pred - truth; 127 | let weight_delta = input * delta; 128 | 129 | let alpha = 0.01; 130 | weight -= weight_delta * alpha; 131 | } 132 | 133 | fn neural_network(input: f64, weight: f64) -> f64 { 134 | input * weight 135 | } 136 | 137 | /// Learning is just reducing error 138 | 139 | fn gradient_descent() { 140 | let (mut weight, truth, input) = (0.0, 0.8, 0.5); 141 | for _ in 0..4 { 142 | let pred = neural_network(input, weight); 143 | let err = (pred - truth).powf(2.0); 144 | println!("Error: {}, Prediction: {}", err, pred); 145 | 146 | let delta = pred - truth; 147 | let weight_delta = delta * input; 148 | weight -= weight_delta; 149 | } 150 | } 151 | 152 | /// Let's watch several steps of learning. 153 | 154 | fn gradient_descent_2() { 155 | let (mut weight, truth, input) = (0.0, 0.8, 1.1); 156 | for _ in 0..4 { 157 | println!("------\nWeight: {}", weight); 158 | 159 | let pred = neural_network(input, weight); 160 | let err = (pred - truth).powf(2.0); 161 | println!("Error: {}, Prediction: {}", err, pred); 162 | 163 | let delta = pred - truth; 164 | let weight_delta = delta * input; 165 | weight -= weight_delta; 166 | println!("Delta: {}, Weight Delta: {}", delta, weight_delta); 167 | } 168 | } 169 | 170 | /// Why does this work? What really is weight delta? 
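// weight_delta is the derivative of the error with respect to the weight: with
// error = (input * weight - truth)^2, d(error)/d(weight) = 2 * input * (pred - truth) = 2 * input * delta.
// Stepping the weight against this slope (the constant factor 2 only scales the step size) is what
// shrinks the error in the loop below.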
171 | 172 | fn gradient_descent_3() { 173 | let (mut weight, truth, input) = (0.0, 0.8, 1.1); 174 | for _ in 0..20 { 175 | let pred = neural_network(input, weight); 176 | let err = (pred - truth).powf(2.0); 177 | println!("Error: {}, Prediction: {}", err, pred); 178 | 179 | let delta = pred - truth; 180 | let weight_delta = delta * input; 181 | weight -= weight_delta; 182 | } 183 | } 184 | 185 | /// Breaking Gradient Descent 186 | 187 | fn gradient_descent_working() { 188 | let (mut weight, truth, input) = (0.5, 0.8, 0.5); 189 | for _ in 0..20 { 190 | let pred = neural_network(input, weight); 191 | let err = (pred - truth).powf(2.0); 192 | println!("Error: {}, Prediction: {}", err, pred); 193 | 194 | let delta = pred - truth; 195 | let weight_delta = delta * input; 196 | weight -= weight_delta; 197 | } 198 | } 199 | 200 | fn gradient_descent_breaking() { 201 | let (mut weight, truth, input) = (0.5, 0.8, 2.0); 202 | for _ in 0..20 { 203 | let pred = neural_network(input, weight); 204 | let err = (pred - truth).powf(2.0); 205 | println!("Error: {}, Prediction: {}", err, pred); 206 | 207 | let delta = pred - truth; 208 | let weight_delta = delta * input; 209 | weight -= weight_delta; 210 | } 211 | } 212 | 213 | /// Alpha 214 | 215 | fn gradient_descent_working_again() { 216 | let (mut weight, truth, input) = (0.5, 0.8, 2.0); 217 | let alpha = 0.1; 218 | 219 | for _ in 0..20 { 220 | let pred = neural_network(input, weight); 221 | let err = (pred - truth).powf(2.0); 222 | println!("Error: {}, Prediction: {}", err, pred); 223 | 224 | let delta = pred - truth; 225 | let weight_delta = delta * input; 226 | weight -= alpha * weight_delta; 227 | } 228 | } 229 | -------------------------------------------------------------------------------- /examples/chapter5.rs: -------------------------------------------------------------------------------- 1 | //! Chapter5 - Generalizing Gradient Descent - Learning Multiple Weights at a Time.ipynb 2 | //! 3 | //! https://github.com/iamtrask/Grokking-Deep-Learning/blob/master/Chapter5%20-%20Generalizing%20Gradient%20Descent%20-%20Learning%20Multiple%20Weights%20at%20a%20Time.ipynb 4 | 5 | use grokking_deep_learning_rs::{ 6 | dot, elementwise_scalar_multiplication, matrix_vector_dot, Matrix, Vector, 7 | }; 8 | 9 | fn main() { 10 | println!("\nGradient Descent Learning with Multiple Inputs.\n"); 11 | gradient_descent_with_multiple_inputs(); 12 | 13 | println!("\nLet's Watch Several Steps of Learning\n"); 14 | gradient_descent_with_multiple_inputs_iterations(); 15 | 16 | println!("\nFreezing one weight, What does it do?\n"); 17 | gradient_descent_with_multiple_inputs_frozen_weights(); 18 | 19 | println!("\nGradient Descent Learning with multiple outputs\n"); 20 | gradient_descent_with_multiple_outputs(); 21 | 22 | println!("\nGradient Descent with multiple inputs and outputs\n"); 23 | gradient_descent_with_multiple_inputs_and_outputs(); 24 | } 25 | 26 | /// Gradient Descent Learning with Multiple Inputs. 
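// With several inputs the prediction is a dot product, pred = sum_i(input[i] * weights[i]), and the
// partial derivative of the squared error with respect to each weight is delta * input[i]. For the
// first game below (input = [8.5, 0.65, 1.2], weights = [0.1, 0.2, -0.1], truth = 1.0):
// pred = 0.86, delta = -0.14, weight_deltas = [-1.19, -0.091, -0.168], and with alpha = 0.01 the
// weights move to [0.1119, 0.20091, -0.09832].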
27 | 28 | fn gradient_descent_with_multiple_inputs() { 29 | let mut weights: Vector = vec![0.1, 0.2, -0.1]; 30 | 31 | let toes = vec![8.5, 9.5, 9.9, 9.0]; 32 | let wlrec = vec![0.65, 0.8, 0.8, 0.9]; 33 | let nfans = vec![1.2, 1.3, 0.5, 1.0]; 34 | 35 | let input = vec![toes[0], wlrec[0], nfans[0]]; 36 | 37 | let win_or_lose_binary = [1.0, 1.0, 0.0, 1.0]; 38 | let truth = win_or_lose_binary[0]; 39 | 40 | let pred = neural_network_1(&input, &weights); 41 | let error = (pred - truth).powf(2.0); 42 | println!("Error: {}, Prediction: {}", error, pred); 43 | 44 | let delta = pred - truth; 45 | let weight_delta = elementwise_scalar_multiplication(&input, delta); 46 | 47 | let alpha = 0.01; 48 | for i in 0..3 { 49 | weights[i] -= alpha * weight_delta[i]; 50 | } 51 | println!("Weights: {:?}, Weight Deltas: {:?}", weights, weight_delta); 52 | } 53 | 54 | #[allow(clippy::ptr_arg)] 55 | fn neural_network_1(input: &Vector, weights: &Vector) -> f64 { 56 | dot(input, weights) 57 | } 58 | 59 | /// Let's Watch Several Steps of Learning 60 | 61 | fn gradient_descent_with_multiple_inputs_iterations() { 62 | let mut weights: Vector = vec![0.1, 0.2, -0.1]; 63 | 64 | let toes = vec![8.5, 9.5, 9.9, 9.0]; 65 | let wlrec = vec![0.65, 0.8, 0.8, 0.9]; 66 | let nfans = vec![1.2, 1.3, 0.5, 1.0]; 67 | 68 | let input = vec![toes[0], wlrec[0], nfans[0]]; 69 | 70 | let win_or_lose_binary = [1.0, 1.0, 0.0, 1.0]; 71 | let truth = win_or_lose_binary[0]; 72 | 73 | let alpha = 0.01; 74 | 75 | for i in 0..3 { 76 | println!("Iteration {}", i + 1); 77 | 78 | let pred = neural_network_1(&input, &weights); 79 | let error = (pred - truth).powf(2.0); 80 | println!("Error: {}, Prediction: {}", error, pred); 81 | 82 | let delta = pred - truth; 83 | let weight_delta = elementwise_scalar_multiplication(&input, delta); 84 | 85 | for i in 0..3 { 86 | weights[i] -= alpha * weight_delta[i]; 87 | } 88 | println!( 89 | "Weights: {:?}, Weight Deltas: {:?}\n", 90 | weights, weight_delta 91 | ); 92 | } 93 | } 94 | 95 | /// Freezing one weight, What does it do? 
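// Here weight_delta[0] is zeroed before each update, so the first weight never moves. Because
// delta = pred - truth is shared by every weight, the two remaining weights absorb the whole error
// themselves (alpha is raised to 0.3 so this is visible within the three iterations); the network
// still fits the example, it just does so without the frozen input's weight changing.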
96 | 97 | fn gradient_descent_with_multiple_inputs_frozen_weights() { 98 | let mut weights: Vector = vec![0.1, 0.2, -0.1]; 99 | 100 | let toes = vec![8.5, 9.5, 9.9, 9.0]; 101 | let wlrec = vec![0.65, 0.8, 0.8, 0.9]; 102 | let nfans = vec![1.2, 1.3, 0.5, 1.0]; 103 | 104 | let input = vec![toes[0], wlrec[0], nfans[0]]; 105 | 106 | let win_or_lose_binary = [1.0, 1.0, 0.0, 1.0]; 107 | let truth = win_or_lose_binary[0]; 108 | 109 | let alpha = 0.3; 110 | 111 | for i in 0..3 { 112 | println!("Iteration {}", i + 1); 113 | 114 | let pred = neural_network_1(&input, &weights); 115 | let error = (pred - truth).powf(2.0); 116 | println!("Error: {}, Prediction: {}", error, pred); 117 | 118 | let delta = pred - truth; 119 | let mut weight_delta = elementwise_scalar_multiplication(&input, delta); 120 | weight_delta[0] = 0.0; 121 | 122 | for i in 0..3 { 123 | weights[i] -= alpha * weight_delta[i]; 124 | } 125 | println!( 126 | "Weights: {:?}, Weight Deltas: {:?}\n", 127 | weights, weight_delta 128 | ); 129 | } 130 | } 131 | 132 | /// Gradient Descent Learning with multiple outputs 133 | 134 | fn gradient_descent_with_multiple_outputs() { 135 | let mut weights = vec![0.3, 0.2, 0.9]; 136 | 137 | let wlrec = vec![0.65, 1.0, 1.0, 0.9]; 138 | 139 | let hurt = vec![0.1, 0.0, 0.0, 0.1]; 140 | let win = vec![1.0, 1.0, 0.0, 1.0]; 141 | let sad = vec![0.1, 0.0, 0.1, 0.2]; 142 | 143 | let input = wlrec[0]; 144 | let truth = vec![hurt[0], win[0], sad[0]]; 145 | 146 | let alpha = 0.1; 147 | 148 | let pred = neural_network_2(input, &weights); 149 | let error: Vector = pred 150 | .iter() 151 | .zip(truth.iter()) 152 | .map(|(x, y)| (x - y).powf(2.0)) 153 | .collect(); 154 | println!("Prediction: {:?}, Error: {:?}", pred, error); 155 | 156 | let deltas: Vector = pred.iter().zip(truth.iter()).map(|(x, y)| x - y).collect(); 157 | 158 | // NOTE: mistake in book. 
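// With a single (scalar) input and several outputs, the gradient for each weight is
// delta[i] * input, i.e. the delta vector scaled by the scalar input, which is what the next line
// computes.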
159 | let weight_deltas: Vector = elementwise_scalar_multiplication(&deltas, input); 160 | 161 | for i in 0..weight_deltas.len() { 162 | weights[i] -= weight_deltas[i] * alpha; 163 | } 164 | 165 | println!("Weights: {:?}, Weight Deltas: {:?}", weights, weight_deltas); 166 | } 167 | 168 | #[allow(clippy::ptr_arg)] 169 | fn neural_network_2(input: f64, weights: &Vector) -> Vector { 170 | elementwise_scalar_multiplication(weights, input) 171 | } 172 | 173 | /// Gradient Descent with multiple inputs and outputs 174 | 175 | fn gradient_descent_with_multiple_inputs_and_outputs() { 176 | let toes = vec![8.5, 9.5, 9.9, 9.0]; 177 | let wlrec = vec![0.65, 0.8, 0.8, 0.9]; 178 | let nfans = vec![1.2, 1.3, 0.5, 1.0]; 179 | 180 | let hurt = vec![0.1, 0.0, 0.0, 0.1]; 181 | let win = vec![1.0, 1.0, 0.0, 1.0]; 182 | let sad = vec![0.1, 0.0, 0.1, 0.2]; 183 | 184 | let inputs = vec![toes[0], wlrec[0], nfans[0]]; 185 | let mut weights = vec![ 186 | vec![0.1, 0.1, -0.3], 187 | vec![0.1, 0.2, 0.0], 188 | vec![0.0, 1.3, 0.1], 189 | ]; 190 | let truth = vec![hurt[0], win[0], sad[0]]; 191 | 192 | let alpha = 0.01; 193 | 194 | let pred = neural_network_3(&inputs, &weights); 195 | let errors: Vector = pred 196 | .iter() 197 | .zip(truth.iter()) 198 | .map(|(x, y)| (x - y).powf(2.0)) 199 | .collect(); 200 | 201 | println!("Prediction: {:?}, Error: {:?}", pred, errors); 202 | 203 | let deltas: Vector = pred.iter().zip(truth.iter()).map(|(p, t)| p - t).collect(); 204 | let weight_deltas: Matrix = deltas 205 | .iter() 206 | .map(|i| elementwise_scalar_multiplication(&inputs, *i)) 207 | .collect(); 208 | 209 | for i in 0..weights.len() { 210 | for j in 0..weights[i].len() { 211 | weights[i][j] -= alpha * weight_deltas[i][j]; 212 | } 213 | } 214 | 215 | // NOTE: the saved weights output in the notebook is wrong. 216 | println!("Weights: {:?}, Weight Deltas: {:?}", weights, weight_deltas); 217 | } 218 | 219 | #[allow(clippy::ptr_arg)] 220 | fn neural_network_3(inputs: &Vector, weights: &Matrix) -> Vector { 221 | matrix_vector_dot(weights, inputs) 222 | } 223 | -------------------------------------------------------------------------------- /examples/chapter6.rs: -------------------------------------------------------------------------------- 1 | //! Chapter6 - Intro to Backpropagation - Building Your First DEEP Neural Network.ipynb 2 | //! 3 | //! 
https://github.com/iamtrask/Grokking-Deep-Learning/blob/master/Chapter6%20-%20Intro%20to%20Backpropagation%20-%20Building%20Your%20First%20DEEP%20Neural%20Network.ipynb 4 | 5 | use rand::distributions::{Distribution, Standard}; 6 | use rand::{thread_rng, Rng}; 7 | 8 | use grokking_deep_learning_rs::{ 9 | dot, matrix_matrix_dot, relu_matrix, relu_vector, relu_vector_derivative, vector_matrix_dot, 10 | vector_vector_multiplication, Matrix, 11 | }; 12 | 13 | fn main() { 14 | println!("\nCreating a Matrix or Two in Python\n"); 15 | creating_a_matrix_or_two(); 16 | 17 | println!("\nLearning the whole dataset!\n"); 18 | learning_the_whole_dataset(); 19 | 20 | println!("\nOur First \"Deep\" Neural Network\n"); 21 | first_deep_neural_network(); 22 | 23 | println!("\nBackpropagation\n"); 24 | backpropagation(); 25 | } 26 | 27 | /// Creating a Matrix or Two 28 | 29 | fn creating_a_matrix_or_two() { 30 | let streetlights = vec![ 31 | vec![1.0, 0.0, 1.0], 32 | vec![0.0, 1.0, 1.0], 33 | vec![0.0, 0.0, 1.0], 34 | vec![1.0, 1.0, 1.0], 35 | vec![0.0, 1.0, 1.0], 36 | vec![1.0, 0.0, 1.0], 37 | ]; 38 | 39 | let walk_vs_stop = vec![0.0, 1.0, 0.0, 1.0, 1.0, 0.0]; 40 | 41 | let mut weights = vec![0.5, 0.48, -0.7]; 42 | 43 | let input = &streetlights[0]; 44 | let goal_prediction = walk_vs_stop[0]; 45 | 46 | let alpha = 0.1; 47 | 48 | for _ in 0..20 { 49 | let prediction = dot(input, &weights); 50 | let error = (goal_prediction - prediction).powi(2); 51 | println!("Prediction: {}, Error: {}", prediction, error); 52 | 53 | let delta = prediction - goal_prediction; 54 | for i in 0..3 { 55 | weights[i] -= alpha * (input[i] * delta); 56 | } 57 | } 58 | } 59 | 60 | /// Learning the whole dataset! 61 | 62 | fn learning_the_whole_dataset() { 63 | let streetlights = vec![ 64 | vec![1.0, 0.0, 1.0], 65 | vec![0.0, 1.0, 1.0], 66 | vec![0.0, 0.0, 1.0], 67 | vec![1.0, 1.0, 1.0], 68 | vec![0.0, 1.0, 1.0], 69 | vec![1.0, 0.0, 1.0], 70 | ]; 71 | 72 | let walk_vs_stop = vec![0.0, 1.0, 0.0, 1.0, 1.0, 0.0]; 73 | 74 | let mut weights = vec![0.5, 0.48, -0.7]; 75 | 76 | let alpha = 0.1; 77 | 78 | for i in 0..40 { 79 | let mut total_error = 0.0; 80 | 81 | for r in 0..streetlights.len() { 82 | let input = &streetlights[r]; 83 | let goal_prediction = walk_vs_stop[r]; 84 | 85 | let prediction = dot(input, &weights); 86 | println!("Prediction: {}", prediction); 87 | 88 | let error = (goal_prediction - prediction).powi(2); 89 | 90 | total_error += error; 91 | 92 | let delta = prediction - goal_prediction; 93 | for i in 0..3 { 94 | weights[i] -= alpha * (input[i] * delta); 95 | } 96 | } 97 | 98 | println!("Error after iteration {} = {}\n", i + 1, total_error); 99 | } 100 | 101 | println!("Learned Weights: {:?}", weights); 102 | } 103 | 104 | /// Our first "Deep" Neural Network 105 | 106 | #[allow(unused_variables, unused_assignments, unused_mut)] 107 | fn first_deep_neural_network() { 108 | let inputs = vec![ 109 | vec![1.0, 0.0, 1.0], 110 | vec![0.0, 1.0, 1.0], 111 | vec![0.0, 0.0, 1.0], 112 | vec![1.0, 1.0, 1.0], 113 | ]; 114 | 115 | let outputs = vec![vec![1.0], vec![1.0], vec![0.0], vec![0.0]]; 116 | 117 | let (alpha, hidden_size) = (0.2, 4); 118 | 119 | let mut weights_1: Matrix = random_matrix(3, hidden_size, &Standard); 120 | let mut weights_2: Matrix = random_matrix(hidden_size, 1, &Standard); 121 | 122 | let hidden_layer = relu_matrix(matrix_matrix_dot(&inputs, &weights_1)); 123 | let output = matrix_matrix_dot(&hidden_layer, &weights_2); 124 | } 125 | 126 | /// Backpropagation 127 | 128 | fn backpropagation() { 129 | let inputs = 
vec![ 130 | vec![1.0, 0.0, 1.0], 131 | vec![0.0, 1.0, 1.0], 132 | vec![0.0, 0.0, 1.0], 133 | vec![1.0, 1.0, 1.0], 134 | ]; 135 | 136 | let outputs = vec![vec![1.0], vec![1.0], vec![0.0], vec![0.0]]; 137 | 138 | let alpha = 0.2; 139 | 140 | // Weight values taken from the python notebooks for reproducing results. 141 | let mut weights_0_1: Matrix = vec![ 142 | vec![-0.165_955_99, 0.440_648_99, -0.999_771_25, -0.395_334_85], 143 | vec![-0.706_488_22, -0.815_322_81, -0.627_479_58, -0.308_878_55], 144 | vec![-0.206_465_05, 0.077_633_47, -0.161_610_97, 0.370_439], 145 | ]; 146 | 147 | let mut weights_1_2: Matrix = vec![ 148 | vec![-0.591_095_5], 149 | vec![0.756_234_87], 150 | vec![-0.945_224_81], 151 | vec![0.340_935_02], 152 | ]; 153 | 154 | for it in 0..60 { 155 | let mut total_error = 0.0; 156 | 157 | for i in 0..4 { 158 | let hidden_layer = relu_vector(vector_matrix_dot(&inputs[i], &weights_0_1)); 159 | let prediction = vector_matrix_dot(&hidden_layer, &weights_1_2)[0]; 160 | 161 | let error: f64 = (prediction - outputs[i][0]).powi(2); 162 | total_error += error; 163 | 164 | let delta_2_1 = prediction - outputs[i][0]; 165 | let delta_1_0 = vector_vector_multiplication( 166 | &weights_1_2.iter().map(|v| v[0] * delta_2_1).collect(), 167 | &relu_vector_derivative(hidden_layer.clone()), 168 | ); 169 | 170 | let weight_deltas_1_2: Matrix = 171 | hidden_layer.iter().map(|v| vec![v * delta_2_1]).collect(); 172 | 173 | let weight_deltas_0_1: Matrix = inputs[i] 174 | .iter() 175 | .map(|v| delta_1_0.iter().map(|v2| v * v2).collect()) 176 | .collect(); 177 | 178 | for i in 0..weights_1_2.len() { 179 | for j in 0..weights_1_2[i].len() { 180 | weights_1_2[i][j] -= alpha * weight_deltas_1_2[i][j]; 181 | } 182 | } 183 | 184 | for i in 0..weights_0_1.len() { 185 | for j in 0..weights_0_1[i].len() { 186 | weights_0_1[i][j] -= alpha * weight_deltas_0_1[i][j]; 187 | } 188 | } 189 | } 190 | 191 | if (it + 1) % 10 == 0 { 192 | println!("Error: {}", total_error); 193 | } 194 | } 195 | } 196 | 197 | fn random_matrix(rows: usize, columns: usize, dist: &impl Distribution) -> Matrix { 198 | (0..rows) 199 | .map(|_| { 200 | (0..columns) 201 | .map(|_| 2.0 * thread_rng().sample(dist) - 1.0) 202 | .collect() 203 | }) 204 | .collect() 205 | } 206 | -------------------------------------------------------------------------------- /examples/chapter8.rs: -------------------------------------------------------------------------------- 1 | //! Chapter8 - Intro to Regularization - Learning Signal and Ignoring Noise.ipynb 2 | //! 3 | //! 
https://github.com/iamtrask/Grokking-Deep-Learning/blob/master/Chapter8%20-%20Intro%20to%20Regularization%20-%20Learning%20Signal%20and%20Ignoring%20Noise.ipynb 4 | 5 | use std::error::Error; 6 | use std::ops::Mul; 7 | 8 | use datasets::image::mnist; 9 | use indicatif::{ProgressBar, ProgressStyle}; 10 | use rand::distributions::Standard; 11 | use rulinalg::matrix::{BaseMatrix, Matrix, MatrixSlice}; 12 | 13 | use grokking_deep_learning_rs::{ 14 | argmax, generate_random_vector, process_mnist_batch_dataset, relu_derivative, relu_mut, 15 | sample_bernoulli_trials, 16 | }; 17 | 18 | fn main() { 19 | println!("\n3 Layer Network on MNIST\n"); 20 | three_layer_mnist().unwrap(); 21 | 22 | println!("\n3 Layer Network on MNIST with validation every 10 iterations\n"); 23 | three_layer_mnist_with_validation().unwrap(); 24 | 25 | println!("\nDropout\n"); 26 | three_layer_mnist_with_validation_and_dropout(0.3).unwrap(); 27 | 28 | println!("\nBatched Gradient Descent with Dropout\n"); 29 | batched_gradient_descent_with_dropout(0.5).unwrap(); 30 | } 31 | 32 | fn three_layer_mnist() -> Result<(), Box> { 33 | let dataset_size = 100; // 1000 in notebook with numpy 34 | let test_dataset_size = 10000; 35 | 36 | let (train_data, test_data) = mnist()?; 37 | 38 | let (images, labels): (Vec<_>, Vec<_>) = train_data.take(dataset_size).unzip(); 39 | 40 | let images: Vec> = images 41 | .iter() 42 | .map(|img| img.iter().map(|v| f64::from(*v) / 255.0).collect()) 43 | .collect(); 44 | 45 | let labels: Vec> = labels 46 | .iter() 47 | .map(|l| { 48 | let mut v = vec![0.0; 10]; 49 | v[*l as usize] = 1.0; 50 | v 51 | }) 52 | .collect(); 53 | 54 | let (alpha, hidden_size) = (0.005, 40); 55 | 56 | let iterations = 100; // NOTE: cannot run this for 350 iterations because of slower matrix multiplication. 
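    // The loop below trains a plain 784 -> hidden_size (40) -> 10 network: ReLU on the
    // hidden layer, squared error on each of the 10 outputs, and per-example SGD with
    // learning rate `alpha`. `generate_random_vector(n, 0.2, -0.1, &Standard)` draws each
    // weight uniformly from [-0.1, 0.1), since it computes 0.2 * sample + (-0.1) for
    // samples in [0, 1).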
57 | let progress = ProgressBar::new(iterations as u64); 58 | progress.set_style( 59 | ProgressStyle::default_bar() 60 | .template("{msg} {bar:40.cyan/blue} {pos:>7}/{len:7} [{elapsed_precise}]"), 61 | ); 62 | 63 | let mut weights_0_1 = Matrix::new( 64 | 784, 65 | hidden_size, 66 | generate_random_vector(784 * hidden_size, 0.2, -0.1, &Standard), 67 | ); 68 | let mut weights_1_2 = Matrix::new( 69 | hidden_size, 70 | 10, 71 | generate_random_vector(hidden_size * 10, 0.2, -0.1, &Standard), 72 | ); 73 | 74 | // Training 75 | 76 | for it in 0..iterations { 77 | let mut total_error = 0.0; 78 | let mut accuracy = 0.0; 79 | 80 | for (image, label) in images.iter().zip(labels.iter()) { 81 | let image = unsafe { MatrixSlice::from_raw_parts(image.as_ptr(), 1, 784, 1) }; 82 | 83 | let mut hidden_layer = (&image).mul(&weights_0_1); 84 | for j in 0..hidden_size { 85 | if hidden_layer[[0, j]] < 0.0 { 86 | hidden_layer[[0, j]] = 0.0; 87 | } 88 | } 89 | 90 | let output = (&hidden_layer).mul(&weights_1_2); 91 | 92 | accuracy += if argmax(&label) == argmax(output.data()) { 93 | 1.0 94 | } else { 95 | 0.0 96 | }; 97 | 98 | let error: f64 = output 99 | .data() 100 | .iter() 101 | .zip(label.iter()) 102 | .map(|(p, t)| (p - t).powi(2)) 103 | .sum(); 104 | 105 | total_error += error; 106 | 107 | let delta_2_1 = output - Matrix::new(1, 10, label.clone()); 108 | 109 | let mut relu_deriv = Matrix::new(1, hidden_size, vec![0.0; hidden_size]); 110 | for i in 0..hidden_size { 111 | if hidden_layer[[0, i]] >= 0.0 { 112 | relu_deriv[[0, i]] = 1.0; 113 | } 114 | } 115 | 116 | let delta_1_0 = (&delta_2_1) 117 | .mul(weights_1_2.transpose()) 118 | .elemul(&relu_deriv); 119 | 120 | let weight_delta_1_2 = hidden_layer.transpose().mul(delta_2_1); 121 | 122 | // avoid another clone of image 123 | let weight_delta_0_1 = image.transpose().mul(delta_1_0); 124 | 125 | for (i, x) in weights_0_1.mut_data().iter_mut().enumerate() { 126 | *x -= alpha * weight_delta_0_1.data()[i]; 127 | } 128 | 129 | for (i, x) in weights_1_2.mut_data().iter_mut().enumerate() { 130 | *x -= alpha * weight_delta_1_2.data()[i]; 131 | } 132 | } 133 | 134 | progress.inc(1); 135 | progress.set_message(&format!( 136 | "Train Accuracy: {}, Train Error: {}", 137 | accuracy / (dataset_size as f64), 138 | total_error / (dataset_size as f64) 139 | )); 140 | 141 | if (it + 1) % 10 == 0 { 142 | progress.println(format!( 143 | "Iteration: {}, Train Accuracy: {}, Train Error: {}", 144 | it + 1, 145 | accuracy / (dataset_size as f64), 146 | total_error / (dataset_size as f64) 147 | )); 148 | } 149 | } 150 | 151 | progress.finish_and_clear(); 152 | 153 | // Inference 154 | 155 | println!("Evaluating on the test dataset"); 156 | 157 | let (images, labels): (Vec<_>, Vec<_>) = test_data.take(test_dataset_size).unzip(); 158 | 159 | let images: Vec> = images 160 | .into_iter() 161 | .map(|img| img.into_iter().map(|v| f64::from(v) / 255.0).collect()) 162 | .collect(); 163 | 164 | let labels: Vec> = labels 165 | .into_iter() 166 | .map(|l| { 167 | let mut v = vec![0.0; 10]; 168 | v[l as usize] = 1.0; 169 | v 170 | }) 171 | .collect(); 172 | 173 | let mut total_error = 0.0; 174 | let mut accuracy = 0.0; 175 | 176 | let progress = ProgressBar::new(test_dataset_size as u64); 177 | 178 | for (image, label) in images.into_iter().zip(labels.into_iter()) { 179 | let image = Matrix::new(1, 784, image); 180 | 181 | let mut hidden_layer = image.mul(&weights_0_1); 182 | 183 | // relu 184 | for j in 0..hidden_size { 185 | if hidden_layer[[0, j]] < 0.0 { 186 | hidden_layer[[0, j]] = 0.0; 
187 | } 188 | } 189 | 190 | let output = hidden_layer.mul(&weights_1_2); 191 | 192 | accuracy += if argmax(&label) == argmax(output.data()) { 193 | 1.0 194 | } else { 195 | 0.0 196 | }; 197 | 198 | let error: f64 = output 199 | .iter() 200 | .zip(label.iter()) 201 | .map(|(p, t)| (p - t).powi(2)) 202 | .sum(); 203 | 204 | total_error += error; 205 | 206 | progress.inc(1); 207 | } 208 | 209 | progress.finish_and_clear(); 210 | 211 | println!( 212 | "Test Accuracy: {}, Test Error: {}", 213 | accuracy / (test_dataset_size as f64), 214 | total_error / (test_dataset_size as f64), 215 | ); 216 | 217 | Ok(()) 218 | } 219 | 220 | fn three_layer_mnist_with_validation() -> Result<(), Box> { 221 | let dataset_size = 100; // 1000 in notebook with numpy 222 | let test_dataset_size = 1000; 223 | 224 | let (train_data, test_data) = mnist()?; 225 | 226 | let (images, labels): (Vec<_>, Vec<_>) = train_data.take(dataset_size).unzip(); 227 | 228 | let images: Vec> = images 229 | .iter() 230 | .map(|img| img.iter().map(|v| f64::from(*v) / 255.0).collect()) 231 | .collect(); 232 | 233 | let labels: Vec> = labels 234 | .iter() 235 | .map(|l| { 236 | let mut v = vec![0.0; 10]; 237 | v[*l as usize] = 1.0; 238 | v 239 | }) 240 | .collect(); 241 | 242 | let (alpha, hidden_size) = (0.005, 40); 243 | 244 | let (test_images, test_labels): (Vec<_>, Vec<_>) = test_data.take(test_dataset_size).unzip(); 245 | 246 | let test_images: Vec> = test_images 247 | .into_iter() 248 | .map(|img| img.into_iter().map(|v| f64::from(v) / 255.0).collect()) 249 | .collect(); 250 | 251 | let test_labels: Vec> = test_labels 252 | .into_iter() 253 | .map(|l| { 254 | let mut v = vec![0.0; 10]; 255 | v[l as usize] = 1.0; 256 | v 257 | }) 258 | .collect(); 259 | 260 | let iterations = 100; // NOTE: cannot run this for 350 iterations because of slower matrix multiplication. 
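    // Same training loop as three_layer_mnist(); additionally, every 10 iterations the
    // current weights are run over the 1000 held-out test images (forward pass only, no
    // weight updates) so train and test accuracy/error can be compared side by side.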
261 | let progress = ProgressBar::new(iterations as u64); 262 | progress.set_style( 263 | ProgressStyle::default_bar() 264 | .template("{msg} {bar:40.cyan/blue} {pos:>7}/{len:7} [{elapsed_precise}]"), 265 | ); 266 | 267 | let mut weights_0_1 = Matrix::new( 268 | 784, 269 | hidden_size, 270 | generate_random_vector(784 * hidden_size, 0.2, -0.1, &Standard), 271 | ); 272 | let mut weights_1_2 = Matrix::new( 273 | hidden_size, 274 | 10, 275 | generate_random_vector(hidden_size * 10, 0.2, -0.1, &Standard), 276 | ); 277 | 278 | // Training 279 | 280 | for it in 0..iterations { 281 | let mut total_error = 0.0; 282 | let mut accuracy = 0.0; 283 | 284 | for (image, label) in images.iter().zip(labels.iter()) { 285 | let image = unsafe { MatrixSlice::from_raw_parts(image.as_ptr(), 1, 784, 1) }; 286 | 287 | let mut hidden_layer = (&image).mul(&weights_0_1); 288 | for j in 0..hidden_size { 289 | if hidden_layer[[0, j]] < 0.0 { 290 | hidden_layer[[0, j]] = 0.0; 291 | } 292 | } 293 | 294 | let output = (&hidden_layer).mul(&weights_1_2); 295 | 296 | accuracy += if argmax(&label) == argmax(output.data()) { 297 | 1.0 298 | } else { 299 | 0.0 300 | }; 301 | 302 | let error: f64 = output 303 | .data() 304 | .iter() 305 | .zip(label.iter()) 306 | .map(|(p, t)| (p - t).powi(2)) 307 | .sum(); 308 | 309 | total_error += error; 310 | 311 | let delta_2_1 = output - Matrix::new(1, 10, label.clone()); 312 | 313 | let mut relu_deriv = Matrix::new(1, hidden_size, vec![0.0; hidden_size]); 314 | for i in 0..hidden_size { 315 | if hidden_layer[[0, i]] >= 0.0 { 316 | relu_deriv[[0, i]] = 1.0; 317 | } 318 | } 319 | 320 | let delta_1_0 = (&delta_2_1) 321 | .mul(weights_1_2.transpose()) 322 | .elemul(&relu_deriv); 323 | 324 | let weight_delta_1_2 = hidden_layer.transpose().mul(delta_2_1); 325 | 326 | // avoid another clone of image 327 | let weight_delta_0_1 = image.transpose().mul(delta_1_0); 328 | 329 | for (i, x) in weights_0_1.mut_data().iter_mut().enumerate() { 330 | *x -= alpha * weight_delta_0_1.data()[i]; 331 | } 332 | 333 | for (i, x) in weights_1_2.mut_data().iter_mut().enumerate() { 334 | *x -= alpha * weight_delta_1_2.data()[i]; 335 | } 336 | } 337 | 338 | if (it + 1) % 10 == 0 { 339 | // Inference 340 | 341 | let mut total_test_error = 0.0; 342 | let mut test_accuracy = 0.0; 343 | 344 | for (image, label) in test_images.iter().zip(test_labels.iter()) { 345 | let image = unsafe { MatrixSlice::from_raw_parts(image.as_ptr(), 1, 784, 1) }; 346 | 347 | let mut hidden_layer = image.mul(&weights_0_1); 348 | 349 | // relu 350 | for j in 0..hidden_size { 351 | if hidden_layer[[0, j]] < 0.0 { 352 | hidden_layer[[0, j]] = 0.0; 353 | } 354 | } 355 | 356 | let output = hidden_layer.mul(&weights_1_2); 357 | 358 | test_accuracy += if argmax(&label) == argmax(output.data()) { 359 | 1.0 360 | } else { 361 | 0.0 362 | }; 363 | 364 | let error: f64 = output 365 | .iter() 366 | .zip(label.iter()) 367 | .map(|(p, t)| (p - t).powi(2)) 368 | .sum(); 369 | 370 | total_test_error += error; 371 | } 372 | 373 | progress.println(format!( 374 | "Iteration: {}, Train Accuracy: {}, Train Error: {}, Test Accuracy: {}, Test Error: {}", 375 | it + 1, 376 | accuracy / (dataset_size as f64), 377 | total_error / (dataset_size as f64), 378 | test_accuracy / (test_dataset_size as f64), 379 | total_test_error / (test_dataset_size as f64), 380 | )); 381 | } 382 | 383 | progress.inc(1); 384 | progress.set_message(&format!( 385 | "Train Accuracy: {}, Train Error: {}", 386 | accuracy / (dataset_size as f64), 387 | total_error / (dataset_size as f64) 388 | 
)); 389 | } 390 | 391 | Ok(()) 392 | } 393 | 394 | fn three_layer_mnist_with_validation_and_dropout( 395 | keep_probability: f64, 396 | ) -> Result<(), Box> { 397 | let dataset_size = 1000; // 1000 in notebook with numpy 398 | let test_dataset_size = 1000; 399 | 400 | let (train_data, test_data) = mnist()?; 401 | 402 | let (images, labels): (Vec<_>, Vec<_>) = train_data.take(dataset_size).unzip(); 403 | 404 | let images: Vec> = images 405 | .iter() 406 | .map(|img| img.iter().map(|v| f64::from(*v) / 255.0).collect()) 407 | .collect(); 408 | 409 | let labels: Vec> = labels 410 | .iter() 411 | .map(|l| { 412 | let mut v = vec![0.0; 10]; 413 | v[*l as usize] = 1.0; 414 | v 415 | }) 416 | .collect(); 417 | 418 | let (alpha, hidden_size) = (0.005, 40); 419 | 420 | let (test_images, test_labels): (Vec<_>, Vec<_>) = test_data.take(test_dataset_size).unzip(); 421 | 422 | let test_images: Vec> = test_images 423 | .into_iter() 424 | .map(|img| img.into_iter().map(|v| f64::from(v) / 255.0).collect()) 425 | .collect(); 426 | 427 | let test_labels: Vec> = test_labels 428 | .into_iter() 429 | .map(|l| { 430 | let mut v = vec![0.0; 10]; 431 | v[l as usize] = 1.0; 432 | v 433 | }) 434 | .collect(); 435 | 436 | let iterations = 100; // NOTE: cannot run this for 350 iterations because of slower matrix multiplication. 437 | let progress = ProgressBar::new(iterations as u64); 438 | progress.set_style( 439 | ProgressStyle::default_bar() 440 | .template("{msg} {bar:40.cyan/blue} {pos:>7}/{len:7} [{elapsed_precise}]"), 441 | ); 442 | 443 | let mut weights_0_1 = Matrix::new( 444 | 784, 445 | hidden_size, 446 | generate_random_vector(784 * hidden_size, 0.2, -0.1, &Standard), 447 | ); 448 | let mut weights_1_2 = Matrix::new( 449 | hidden_size, 450 | 10, 451 | generate_random_vector(hidden_size * 10, 0.2, -0.1, &Standard), 452 | ); 453 | 454 | // Training 455 | 456 | for it in 0..iterations { 457 | let mut total_error = 0.0; 458 | let mut accuracy = 0.0; 459 | 460 | for (image, label) in images.iter().zip(labels.iter()) { 461 | let image = unsafe { MatrixSlice::from_raw_parts(image.as_ptr(), 1, 784, 1) }; 462 | 463 | let mut hidden_layer = (&image).mul(&weights_0_1); 464 | for j in 0..hidden_size { 465 | if hidden_layer[[0, j]] < 0.0 { 466 | hidden_layer[[0, j]] = 0.0; 467 | } 468 | } 469 | 470 | let dropout_mask_data: Vec = 471 | sample_bernoulli_trials(keep_probability, hidden_size); 472 | 473 | let dropout_mask = Matrix::new(1, hidden_size, dropout_mask_data); 474 | 475 | for j in 0..hidden_size { 476 | hidden_layer[[0, j]] *= dropout_mask[[0, j]] * (1.0 / keep_probability); 477 | } 478 | 479 | let output = (&hidden_layer).mul(&weights_1_2); 480 | 481 | accuracy += if argmax(&label) == argmax(output.data()) { 482 | 1.0 483 | } else { 484 | 0.0 485 | }; 486 | 487 | let error: f64 = output 488 | .data() 489 | .iter() 490 | .zip(label.iter()) 491 | .map(|(p, t)| (p - t).powi(2)) 492 | .sum(); 493 | 494 | total_error += error; 495 | 496 | let delta_2_1 = output - Matrix::new(1, 10, label.clone()); 497 | 498 | let mut relu_deriv = Matrix::new(1, hidden_size, vec![0.0; hidden_size]); 499 | for i in 0..hidden_size { 500 | if hidden_layer[[0, i]] >= 0.0 { 501 | relu_deriv[[0, i]] = 1.0; 502 | } 503 | } 504 | 505 | let mut delta_1_0 = (&delta_2_1) 506 | .mul(weights_1_2.transpose()) 507 | .elemul(&relu_deriv); 508 | 509 | for j in 0..hidden_size { 510 | delta_1_0[[0, j]] *= dropout_mask[[0, j]] * (1.0 / keep_probability); 511 | } 512 | 513 | let weight_delta_1_2 = hidden_layer.transpose().mul(delta_2_1); 514 | 515 | // 
avoid another clone of image 516 | let weight_delta_0_1 = image.transpose().mul(delta_1_0); 517 | 518 | for (i, x) in weights_0_1.mut_data().iter_mut().enumerate() { 519 | *x -= alpha * weight_delta_0_1.data()[i]; 520 | } 521 | 522 | for (i, x) in weights_1_2.mut_data().iter_mut().enumerate() { 523 | *x -= alpha * weight_delta_1_2.data()[i]; 524 | } 525 | } 526 | 527 | progress.inc(1); 528 | progress.set_message(&format!( 529 | "Train Accuracy: {}, Train Error: {}", 530 | accuracy / (dataset_size as f64), 531 | total_error / (dataset_size as f64) 532 | )); 533 | 534 | if (it + 1) % 10 == 0 { 535 | // Inference 536 | 537 | let mut total_test_error = 0.0; 538 | let mut test_accuracy = 0.0; 539 | 540 | for (image, label) in test_images.iter().zip(test_labels.iter()) { 541 | let image = unsafe { MatrixSlice::from_raw_parts(image.as_ptr(), 1, 784, 1) }; 542 | 543 | let mut hidden_layer = image.mul(&weights_0_1); 544 | 545 | // relu 546 | for j in 0..hidden_size { 547 | if hidden_layer[[0, j]] < 0.0 { 548 | hidden_layer[[0, j]] = 0.0; 549 | } 550 | } 551 | 552 | let output = hidden_layer.mul(&weights_1_2); 553 | 554 | test_accuracy += if argmax(&label) == argmax(output.data()) { 555 | 1.0 556 | } else { 557 | 0.0 558 | }; 559 | 560 | let error: f64 = output 561 | .iter() 562 | .zip(label.iter()) 563 | .map(|(p, t)| (p - t).powi(2)) 564 | .sum(); 565 | 566 | total_test_error += error; 567 | } 568 | 569 | progress.println(format!( 570 | "Iteration: {}, Train Accuracy: {}, Train Error: {}, Test Accuracy: {}, Test Error: {}", 571 | it + 1, 572 | accuracy / (dataset_size as f64), 573 | total_error / (dataset_size as f64), 574 | test_accuracy / (test_dataset_size as f64), 575 | total_test_error / (test_dataset_size as f64), 576 | )); 577 | } 578 | } 579 | 580 | progress.finish_and_clear(); 581 | 582 | Ok(()) 583 | } 584 | 585 | fn batched_gradient_descent_with_dropout(keep_probability: f64) -> Result<(), Box> { 586 | let dataset_size = 1000; // 1000 in notebook with numpy 587 | let test_dataset_size = 1000; 588 | 589 | let batch_size = 100; 590 | 591 | let (train_data, test_data) = mnist()?; 592 | 593 | let (images, labels) = process_mnist_batch_dataset(train_data, dataset_size, batch_size); 594 | let (test_images, test_labels) = 595 | process_mnist_batch_dataset(test_data, test_dataset_size, batch_size); 596 | 597 | let (alpha, hidden_size) = (0.001, 40); 598 | 599 | let iterations = 100; // NOTE: cannot run this for 350 iterations because of slower matrix multiplication. 
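    // Batched variant: each update now processes a batch of 100 examples as a single
    // (100 x 784) matrix multiply instead of one row at a time. Dropout is the "inverted"
    // form: hidden activations are multiplied by a Bernoulli(keep_probability) mask and
    // scaled by 1 / keep_probability during training, so no rescaling is needed at test
    // time; the same mask and scale are applied to delta_1_0 on the way back so dropped
    // units receive no gradient.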
600 | let progress = ProgressBar::new(iterations as u64); 601 | progress.set_style( 602 | ProgressStyle::default_bar() 603 | .template("{msg} {bar:40.cyan/blue} {pos:>7}/{len:7} [{elapsed_precise}]"), 604 | ); 605 | 606 | let mut weights_0_1 = Matrix::new( 607 | 784, 608 | hidden_size, 609 | generate_random_vector(784 * hidden_size, 0.2, -0.1, &Standard), 610 | ); 611 | let mut weights_1_2 = Matrix::new( 612 | hidden_size, 613 | 10, 614 | generate_random_vector(hidden_size * 10, 0.2, -0.1, &Standard), 615 | ); 616 | 617 | // Training 618 | 619 | for it in 0..iterations { 620 | let mut total_error = 0.0; 621 | let mut accuracy = 0.0; 622 | 623 | for (image, label) in images.iter().zip(labels.iter()) { 624 | let image = 625 | unsafe { MatrixSlice::from_raw_parts(image.as_ptr(), batch_size, 784, 784) }; 626 | let label = unsafe { MatrixSlice::from_raw_parts(label.as_ptr(), batch_size, 10, 10) }; 627 | 628 | let mut hidden_layer = (&image).mul(&weights_0_1); 629 | relu_mut(&mut hidden_layer); 630 | 631 | let dropout_mask_data: Vec = 632 | sample_bernoulli_trials(keep_probability, batch_size * hidden_size); 633 | 634 | let dropout_mask = Matrix::new(batch_size, hidden_size, dropout_mask_data); 635 | 636 | for i in 0..batch_size { 637 | for j in 0..hidden_size { 638 | hidden_layer[[i, j]] *= dropout_mask[[i, j]] * (1.0 / keep_probability); 639 | } 640 | } 641 | 642 | let outputs = (&hidden_layer).mul(&weights_1_2); 643 | 644 | for (output, l) in outputs.row_iter().zip(label.row_iter()) { 645 | if argmax(output.raw_slice()) == argmax(l.raw_slice()) { 646 | accuracy += 1.0; 647 | } 648 | } 649 | 650 | for (output, l) in outputs.row_iter().zip(label.row_iter()) { 651 | let err: f64 = output 652 | .raw_slice() 653 | .iter() 654 | .zip(l.raw_slice().iter()) 655 | .map(|(p, t)| (p - t).powi(2)) 656 | .sum(); 657 | total_error += err; 658 | } 659 | 660 | let mut delta_2_1 = Matrix::new(batch_size, 10, vec![0.0; batch_size * 10]); 661 | for i in 0..batch_size { 662 | for j in 0..10 { 663 | delta_2_1[[i, j]] = outputs[[i, j]] - label[[i, j]]; 664 | } 665 | } 666 | 667 | let relu_deriv = relu_derivative(&hidden_layer); 668 | 669 | let mut delta_1_0 = (&delta_2_1) 670 | .mul(weights_1_2.transpose()) 671 | .elemul(&relu_deriv); 672 | 673 | for i in 0..batch_size { 674 | for j in 0..hidden_size { 675 | delta_1_0[[i, j]] *= dropout_mask[[i, j]] * (1.0 / keep_probability); 676 | } 677 | } 678 | 679 | let weight_delta_1_2 = hidden_layer.transpose().mul(delta_2_1); 680 | let weight_delta_0_1 = image.transpose().mul(delta_1_0); 681 | 682 | for (i, x) in weights_0_1.mut_data().iter_mut().enumerate() { 683 | *x -= alpha * weight_delta_0_1.data()[i]; 684 | } 685 | 686 | for (i, x) in weights_1_2.mut_data().iter_mut().enumerate() { 687 | *x -= alpha * weight_delta_1_2.data()[i]; 688 | } 689 | } 690 | 691 | progress.inc(1); 692 | progress.set_message(&format!( 693 | "Train Accuracy: {}, Train Error: {}", 694 | accuracy / (dataset_size as f64), 695 | total_error / (dataset_size as f64) 696 | )); 697 | 698 | if (it + 1) % 10 == 0 { 699 | // Inference 700 | 701 | let mut total_test_error = 0.0; 702 | let mut test_accuracy = 0.0; 703 | 704 | for (image, label) in test_images.iter().zip(test_labels.iter()) { 705 | let image = 706 | unsafe { MatrixSlice::from_raw_parts(image.as_ptr(), batch_size, 784, 784) }; 707 | let label = 708 | unsafe { MatrixSlice::from_raw_parts(label.as_ptr(), batch_size, 10, 10) }; 709 | 710 | let mut hidden_layer = image.mul(&weights_0_1); 711 | for i in 0..batch_size { 712 | for j in 
0..hidden_size { 713 | if hidden_layer[[i, j]] < 0.0 { 714 | hidden_layer[[i, j]] = 0.0; 715 | } 716 | } 717 | } 718 | 719 | let outputs = hidden_layer.mul(&weights_1_2); 720 | 721 | for (output, l) in outputs.row_iter().zip(label.row_iter()) { 722 | if argmax(output.raw_slice()) == argmax(l.raw_slice()) { 723 | test_accuracy += 1.0; 724 | } 725 | } 726 | 727 | for (output, l) in outputs.row_iter().zip(label.row_iter()) { 728 | let err: f64 = output 729 | .raw_slice() 730 | .iter() 731 | .zip(l.raw_slice().iter()) 732 | .map(|(p, t)| (p - t).powi(2)) 733 | .sum(); 734 | 735 | total_test_error += err; 736 | } 737 | } 738 | 739 | progress.println(format!( 740 | "Iteration: {}, Train Accuracy: {}, Train Error: {}, Test Accuracy: {}, Test Error: {}", 741 | it + 1, 742 | accuracy / (dataset_size as f64), 743 | total_error / (dataset_size as f64), 744 | test_accuracy / (test_dataset_size as f64), 745 | total_test_error / (test_dataset_size as f64), 746 | )); 747 | } 748 | } 749 | 750 | progress.finish_and_clear(); 751 | 752 | Ok(()) 753 | } 754 | -------------------------------------------------------------------------------- /examples/chapter9.rs: -------------------------------------------------------------------------------- 1 | //! Chapter9 - Intro to Activation Functions - Modeling Probabilities.ipynb 2 | //! 3 | //! https://github.com/iamtrask/Grokking-Deep-Learning/blob/master/Chapter9%20-%20Intro%20to%20Activation%20Functions%20-%20Modeling%20Probabilities.ipynb 4 | 5 | use std::error::Error; 6 | use std::ops::Mul; 7 | 8 | use datasets::image::mnist; 9 | use indicatif::{ProgressBar, ProgressStyle}; 10 | use rand::distributions::Standard; 11 | use rulinalg::matrix::{BaseMatrix, Matrix, MatrixSlice}; 12 | 13 | use grokking_deep_learning_rs::{ 14 | argmax, generate_random_vector, process_mnist_batch_dataset, sample_bernoulli_trials, 15 | softmax_mut, tanh_derivative, tanh_mut, 16 | }; 17 | 18 | fn main() { 19 | println!("\nUpgrading our MNIST Network\n"); 20 | mnist_tanh(0.5).unwrap(); 21 | } 22 | 23 | fn mnist_tanh(keep_probability: f64) -> Result<(), Box> { 24 | let (train_data, test_data) = mnist()?; 25 | 26 | let train_data_size = 1000; 27 | let test_data_size = 1000; 28 | let batch_size = 100; 29 | 30 | let (train_images, train_labels) = 31 | process_mnist_batch_dataset(train_data, train_data_size, batch_size); 32 | let (test_images, test_labels) = 33 | process_mnist_batch_dataset(test_data, test_data_size, batch_size); 34 | 35 | let (alpha, hidden_size) = (2.0, 100); 36 | 37 | let mut weights_0_1 = Matrix::new( 38 | 784, 39 | hidden_size, 40 | generate_random_vector(784 * hidden_size, 0.02, -0.01, &Standard), 41 | ); 42 | let mut weights_1_2 = Matrix::new( 43 | hidden_size, 44 | 10, 45 | generate_random_vector(hidden_size * 10, 0.2, -0.1, &Standard), 46 | ); 47 | 48 | let iterations = 100; 49 | let progress = ProgressBar::new(iterations as u64); 50 | progress.set_style( 51 | ProgressStyle::default_bar() 52 | .template("{msg} {bar:40.cyan/blue} {pos:>7}/{len:7} [{elapsed_precise}]"), 53 | ); 54 | 55 | for it in 0..iterations { 56 | let mut accuracy = 0.0; 57 | 58 | for (images, labels) in train_images.iter().zip(train_labels.iter()) { 59 | let images = 60 | unsafe { MatrixSlice::from_raw_parts(images.as_ptr(), batch_size, 784, 784) }; 61 | let labels = 62 | unsafe { MatrixSlice::from_raw_parts(labels.as_ptr(), batch_size, 10, 10) }; 63 | 64 | let mut hidden_layer = (&images).mul(&weights_0_1); 65 | tanh_mut(&mut hidden_layer); 66 | 67 | let dropout_mask = Matrix::new( 68 | batch_size, 
69 | hidden_size, 70 | sample_bernoulli_trials(keep_probability, batch_size * hidden_size), 71 | ); 72 | 73 | for i in 0..batch_size { 74 | for j in 0..hidden_size { 75 | hidden_layer[[i, j]] *= dropout_mask[[i, j]] * (1.0 / keep_probability); 76 | } 77 | } 78 | 79 | let mut outputs = (&hidden_layer).mul(&weights_1_2); 80 | softmax_mut(&mut outputs); 81 | 82 | for (r, l) in (&outputs).row_iter().zip(labels.row_iter()) { 83 | accuracy += if argmax(r.raw_slice()) == argmax(l.raw_slice()) { 84 | 1.0 85 | } else { 86 | 0.0 87 | } 88 | } 89 | 90 | // NOTE: no error calc here 91 | // just taking on faith that the derivative for the final layer = (value - true_value) / (batch_size^2) 92 | 93 | let mut delta_2_1 = Matrix::zeros(batch_size, 10); 94 | for i in 0..batch_size { 95 | for j in 0..10 { 96 | delta_2_1[[i, j]] = 97 | (outputs[[i, j]] - labels[[i, j]]) / ((batch_size * batch_size) as f64); 98 | } 99 | } 100 | 101 | let mut delta_1_0 = (&delta_2_1) 102 | .mul(weights_1_2.transpose()) 103 | .elemul(&tanh_derivative(&hidden_layer)); 104 | 105 | for i in 0..batch_size { 106 | for j in 0..hidden_size { 107 | delta_1_0[[i, j]] *= dropout_mask[[i, j]]; 108 | } 109 | } 110 | 111 | let weight_delta_1_2 = hidden_layer.transpose().mul(delta_2_1); 112 | let weight_delta_0_1 = images.transpose().mul(delta_1_0); 113 | 114 | for i in 0..hidden_size { 115 | for k in 0..10 { 116 | weights_1_2[[i, k]] -= alpha * weight_delta_1_2[[i, k]]; 117 | } 118 | } 119 | 120 | for i in 0..784 { 121 | for k in 0..hidden_size { 122 | weights_0_1[[i, k]] -= alpha * weight_delta_0_1[[i, k]]; 123 | } 124 | } 125 | } 126 | 127 | if (it + 1) % 10 == 0 { 128 | let mut test_accuracy = 0.0; 129 | 130 | for (images, labels) in test_images.iter().zip(test_labels.iter()) { 131 | let images = 132 | unsafe { MatrixSlice::from_raw_parts(images.as_ptr(), batch_size, 784, 784) }; 133 | let labels = 134 | unsafe { MatrixSlice::from_raw_parts(labels.as_ptr(), batch_size, 10, 10) }; 135 | 136 | let mut hidden_layer = images.mul(&weights_0_1); 137 | tanh_mut(&mut hidden_layer); 138 | 139 | let mut outputs = hidden_layer.mul(&weights_1_2); 140 | softmax_mut(&mut outputs); 141 | 142 | for (r, l) in (&outputs).row_iter().zip(labels.row_iter()) { 143 | test_accuracy += if argmax(r.raw_slice()) == argmax(l.raw_slice()) { 144 | 1.0 145 | } else { 146 | 0.0 147 | } 148 | } 149 | } 150 | 151 | progress.println(format!( 152 | "Iteration: {}, Train Accuracy: {}, Test Accuracy: {}", 153 | it + 1, 154 | accuracy / (train_data_size as f64), 155 | test_accuracy / (test_data_size as f64), 156 | )); 157 | } 158 | 159 | progress.inc(1); 160 | progress.set_message(&format!( 161 | "Train Accuracy: {}", 162 | accuracy / (train_data_size as f64), 163 | )); 164 | } 165 | 166 | progress.finish_and_clear(); 167 | 168 | Ok(()) 169 | } 170 | -------------------------------------------------------------------------------- /src/activations.rs: -------------------------------------------------------------------------------- 1 | //! This was extracted from the Chapter 13 exercises and moved into the core library so it could be used in later chapters. 
2 | 3 | use crate::layers::Layer; 4 | use crate::tensor::Tensor; 5 | 6 | #[derive(Debug)] 7 | pub struct Sigmoid; 8 | 9 | impl Layer for Sigmoid { 10 | fn forward(&self, inputs: &[&Tensor]) -> Vec { 11 | vec![inputs[0].sigmoid()] 12 | } 13 | } 14 | 15 | #[derive(Debug)] 16 | pub struct Tanh; 17 | 18 | impl Layer for Tanh { 19 | fn forward(&self, inputs: &[&Tensor]) -> Vec { 20 | vec![inputs[0].tanh()] 21 | } 22 | } 23 | 24 | #[derive(Debug)] 25 | pub struct Relu; 26 | 27 | impl Layer for Relu { 28 | fn forward(&self, inputs: &[&Tensor]) -> Vec { 29 | vec![inputs[0].relu()] 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/layers.rs: -------------------------------------------------------------------------------- 1 | //! This was extracted from the Chapter 13 exercises and moved into the core library so it could be used in later chapters. 2 | 3 | use std::fmt; 4 | use std::iter::FromIterator; 5 | 6 | use rand::distributions::Uniform; 7 | use rulinalg::matrix::{BaseMatrix, Matrix}; 8 | use std::rc::Rc; 9 | 10 | use crate::generate_random_vector; 11 | use crate::tensor::{Dot, Expand, Tensor}; 12 | 13 | pub trait Layer { 14 | fn forward(&self, inputs: &[&Tensor]) -> Vec; 15 | 16 | fn parameters(&self) -> Vec<&Tensor> { 17 | vec![] 18 | } 19 | } 20 | 21 | #[derive(Debug)] 22 | pub struct Linear { 23 | weights: Tensor, 24 | bias: Option, 25 | } 26 | 27 | impl Linear { 28 | pub fn new(n_inputs: usize, n_outputs: usize, bias: bool) -> Linear { 29 | let distribution = Uniform::new(0.0, 1.0); 30 | 31 | let weights = Tensor::new_const(Matrix::new( 32 | n_inputs, 33 | n_outputs, 34 | generate_random_vector(n_inputs * n_outputs, 0.5, 0.0, &distribution), 35 | )); 36 | 37 | let bias = if bias { 38 | Some(Tensor::new_const(Matrix::zeros(1, n_outputs))) 39 | } else { 40 | None 41 | }; 42 | 43 | Linear { weights, bias } 44 | } 45 | } 46 | 47 | impl Layer for Linear { 48 | fn forward(&self, inputs: &[&Tensor]) -> Vec { 49 | let rows = inputs[0].0.borrow().data.rows(); 50 | match &self.bias { 51 | None => vec![inputs[0].dot(&self.weights)], 52 | Some(bias) => vec![&inputs[0].dot(&self.weights) + &bias.expand(0, rows)], 53 | } 54 | } 55 | 56 | fn parameters(&self) -> Vec<&Tensor> { 57 | match &self.bias { 58 | None => vec![&self.weights], 59 | Some(bias) => vec![&self.weights, bias], 60 | } 61 | } 62 | } 63 | 64 | pub struct Sequential { 65 | layers: Vec>, 66 | } 67 | 68 | impl fmt::Debug for Sequential { 69 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 70 | write!(f, "Sequential {{ }}") 71 | } 72 | } 73 | 74 | impl Sequential { 75 | pub fn new(layers: Vec>) -> Self { 76 | Sequential { layers } 77 | } 78 | 79 | #[allow(dead_code)] 80 | fn add(&mut self, layer: Box) { 81 | self.layers.push(layer); 82 | } 83 | } 84 | 85 | impl Layer for Sequential { 86 | fn forward(&self, inputs: &[&Tensor]) -> Vec { 87 | // TODO: can this be avoided 88 | let mut input = Tensor(Rc::clone(&inputs[0].0)); 89 | 90 | for layer in self.layers.iter() { 91 | input = layer.forward(&[&input]).remove(0); 92 | } 93 | 94 | vec![input] 95 | } 96 | 97 | fn parameters(&self) -> Vec<&Tensor> { 98 | self.layers 99 | .iter() 100 | .map(|l| l.parameters()) 101 | .flat_map(|v| v.into_iter()) 102 | .collect() 103 | } 104 | } 105 | 106 | #[derive(Debug)] 107 | pub struct Embedding { 108 | pub weights: Tensor, 109 | } 110 | 111 | impl Embedding { 112 | pub fn new(vocab_size: usize, embedding_size: usize) -> Embedding { 113 | let distribution = Uniform::new(0.0, 1.0); 114 | Embedding { 
115 | weights: Tensor::new_const(Matrix::new( 116 | vocab_size, 117 | embedding_size, 118 | generate_random_vector( 119 | vocab_size * embedding_size, 120 | 1.0 / (embedding_size as f64), 121 | -0.5 / (embedding_size as f64), 122 | &distribution, 123 | ), 124 | )), 125 | } 126 | } 127 | 128 | pub fn from_weights(weights: Matrix) -> Embedding { 129 | Embedding { 130 | weights: Tensor::new_const(weights), 131 | } 132 | } 133 | } 134 | 135 | impl Clone for Embedding { 136 | fn clone(&self) -> Embedding { 137 | Embedding { 138 | weights: Tensor::new_const(self.weights.0.borrow().data.clone()), 139 | } 140 | } 141 | } 142 | 143 | impl Layer for Embedding { 144 | fn forward(&self, inputs: &[&Tensor]) -> Vec { 145 | let data = Vec::from_iter( 146 | inputs[0] 147 | .0 148 | .borrow() 149 | .data 150 | .row(0) 151 | .raw_slice() 152 | .iter() 153 | .map(|v| (*v as usize)), 154 | ); 155 | 156 | vec![self.weights.index_select(data)] 157 | } 158 | 159 | fn parameters(&self) -> Vec<&Tensor> { 160 | vec![&self.weights] 161 | } 162 | } 163 | 164 | pub struct RNNCell { 165 | n_hidden: usize, 166 | w_ih: Linear, 167 | w_hh: Linear, 168 | w_ho: Linear, 169 | activation: Box, 170 | } 171 | 172 | impl fmt::Debug for RNNCell { 173 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 174 | write!( 175 | f, 176 | "RNNCell {{ n_hidden: {:?}, w_ih: {:?}, w_hh: {:?}, w_ho: {:?} }}", 177 | self.n_hidden, self.w_ih, self.w_hh, self.w_ho 178 | ) 179 | } 180 | } 181 | 182 | impl RNNCell { 183 | pub fn new( 184 | n_inputs: usize, 185 | n_hidden: usize, 186 | n_outputs: usize, 187 | activation: Box, 188 | ) -> RNNCell { 189 | let w_ih = Linear::new(n_inputs, n_hidden, true); 190 | let w_hh = Linear::new(n_hidden, n_hidden, true); 191 | let w_ho = Linear::new(n_hidden, n_outputs, true); 192 | 193 | RNNCell { 194 | n_hidden, 195 | w_ih, 196 | w_hh, 197 | w_ho, 198 | activation, 199 | } 200 | } 201 | 202 | pub fn create_start_state(&self, batch_size: usize) -> Tensor { 203 | Tensor::new_const(Matrix::zeros(batch_size, self.n_hidden)) 204 | } 205 | } 206 | 207 | impl Layer for RNNCell { 208 | fn forward(&self, inputs: &[&Tensor]) -> Vec { 209 | let (input, hidden) = (inputs[0], inputs[1]); 210 | 211 | let state_part = self.w_hh.forward(&[hidden]); 212 | let input_part = self.w_ih.forward(&[input]); 213 | 214 | let mut new_state = self 215 | .activation 216 | .forward(&[&(&input_part[0] + &state_part[0])]); 217 | let mut output = self.w_ho.forward(&[&new_state[0]]); 218 | 219 | vec![output.remove(0), new_state.remove(0)] 220 | } 221 | 222 | fn parameters(&self) -> Vec<&Tensor> { 223 | let mut ans = self.w_ih.parameters(); 224 | ans.append(&mut self.w_hh.parameters()); 225 | ans.append(&mut self.w_ho.parameters()); 226 | ans 227 | } 228 | } 229 | 230 | #[derive(Debug)] 231 | pub struct LSTMCell { 232 | xf: Linear, 233 | xi: Linear, 234 | xo: Linear, 235 | xc: Linear, 236 | 237 | hf: Linear, 238 | hi: Linear, 239 | ho: Linear, 240 | hc: Linear, 241 | 242 | w_ho: Linear, 243 | 244 | n_hidden: usize, 245 | } 246 | 247 | impl LSTMCell { 248 | pub fn new(n_inputs: usize, n_hidden: usize, n_outputs: usize) -> LSTMCell { 249 | LSTMCell { 250 | xf: Linear::new(n_inputs, n_hidden, true), 251 | xi: Linear::new(n_inputs, n_hidden, true), 252 | xo: Linear::new(n_inputs, n_hidden, true), 253 | xc: Linear::new(n_inputs, n_hidden, true), 254 | 255 | hf: Linear::new(n_hidden, n_hidden, false), 256 | hi: Linear::new(n_hidden, n_hidden, false), 257 | ho: Linear::new(n_hidden, n_hidden, false), 258 | hc: Linear::new(n_hidden, n_hidden, 
false),
259 | 
260 |             w_ho: Linear::new(n_hidden, n_outputs, false),
261 | 
262 |             n_hidden,
263 |         }
264 |     }
265 | 
266 |     pub fn create_start_state(&self, batch_size: usize) -> (Tensor, Tensor) {
267 |         let mut h = Matrix::zeros(batch_size, self.n_hidden);
268 |         let mut c = Matrix::zeros(batch_size, self.n_hidden);
269 | 
270 |         for i in 0..batch_size {
271 |             h[[i, 0]] = 1.0;
272 |             c[[i, 0]] = 1.0;
273 |         }
274 | 
275 |         (Tensor::new_const(h), Tensor::new_const(c))
276 |     }
277 | }
278 | 
279 | impl Layer for LSTMCell {
280 |     #[allow(clippy::many_single_char_names)]
281 |     fn forward(&self, inputs: &[&Tensor]) -> Vec<Tensor> {
282 |         let (input, prev_hidden, prev_cell) = (inputs[0], inputs[1], inputs[2]);
283 | 
284 |         let f = (&self.xf.forward(&[input])[0] + &self.hf.forward(&[prev_hidden])[0]).sigmoid();
285 |         let i = (&self.xi.forward(&[input])[0] + &self.hi.forward(&[prev_hidden])[0]).sigmoid();
286 |         let o = (&self.xo.forward(&[input])[0] + &self.ho.forward(&[prev_hidden])[0]).sigmoid();
287 | 
288 |         let g = (&self.xc.forward(&[input])[0] + &self.hc.forward(&[prev_hidden])[0]).tanh();
289 | 
290 |         let c = &(&f * prev_cell) + &(&i * &g);
291 |         let h = &o * &c.tanh();
292 | 
293 |         let output = self.w_ho.forward(&[&h]).remove(0);
294 | 
295 |         vec![output, h, c]
296 |     }
297 | 
298 |     fn parameters(&self) -> Vec<&Tensor> {
299 |         self.xf
300 |             .parameters()
301 |             .into_iter()
302 |             .chain(self.xi.parameters().into_iter())
303 |             .chain(self.xo.parameters().into_iter())
304 |             .chain(self.xc.parameters().into_iter())
305 |             .chain(self.hf.parameters().into_iter())
306 |             .chain(self.hi.parameters().into_iter())
307 |             .chain(self.ho.parameters().into_iter())
308 |             .chain(self.hc.parameters().into_iter())
309 |             .chain(self.w_ho.parameters().into_iter())
310 |             .collect()
311 |     }
312 | }
313 | 
--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
1 | #![deny(missing_debug_implementations)]
2 | 
3 | use datasets::Dataset;
4 | use rand::distributions::{Bernoulli, Distribution};
5 | use rand::{thread_rng, Rng};
6 | use rulinalg::matrix::{BaseMatrix, BaseMatrixMut, Matrix as RulinalgMatrix};
7 | 
8 | pub mod activations;
9 | pub mod layers;
10 | pub mod losses;
11 | pub mod optimizers;
12 | pub mod tensor;
13 | 
14 | pub type Vector = Vec<f64>;
15 | pub type Matrix = Vec<Vec<f64>>;
16 | 
17 | #[allow(clippy::ptr_arg)]
18 | pub fn elementwise_multiplication(vec_a: &Vector, vec_b: &Vector) -> Vector {
19 |     vec_a.iter().zip(vec_b.iter()).map(|(a, b)| a * b).collect()
20 | }
21 | 
22 | pub fn argmax(vec: &[f64]) -> usize {
23 |     let mut max = vec[0];
24 |     let mut ans = 0;
25 | 
26 |     for (i, x) in vec.iter().enumerate().skip(1) {
27 |         if x > &max {
28 |             max = *x;
29 |             ans = i;
30 |         }
31 |     }
32 | 
33 |     ans
34 | }
35 | 
36 | pub fn vector_sum(vec: Vector) -> f64 {
37 |     vec.iter().sum()
38 | }
39 | 
40 | #[allow(clippy::ptr_arg)]
41 | pub fn dot(vec_a: &Vector, vec_b: &Vector) -> f64 {
42 |     vec_a.iter().zip(vec_b.iter()).map(|(a, b)| a * b).sum()
43 | }
44 | 
45 | #[allow(clippy::ptr_arg)]
46 | pub fn elementwise_scalar_multiplication(vec: &Vector, n: f64) -> Vector {
47 |     vec.iter().map(|x| x * n).collect()
48 | }
49 | 
50 | #[allow(clippy::ptr_arg)]
51 | pub fn elementwise_addition(vec_a: &Vector, vec_b: &Vector) -> Vector {
52 |     vec_a.iter().zip(vec_b.iter()).map(|(a, b)| a + b).collect()
53 | }
54 | 
55 | #[allow(clippy::ptr_arg)]
56 | pub fn vector_average(vec: &Vector) -> f64 {
57 |     let len = vec.len() as f64;
58 |     vec.iter().sum::<f64>() / len
59 | }
60 | 
61 | #[allow(clippy::ptr_arg)]
62 | pub fn vector_vector_subtraction(v1: &Vector, v2: &Vector) -> Vector {
63 |     v1.iter().zip(v2.iter()).map(|(a, b)| a - b).collect()
64 | }
65 | 
66 | #[allow(clippy::ptr_arg)]
67 | pub fn vector_vector_multiplication(v1: &Vector, v2: &Vector) -> Vector {
68 |     v1.iter().zip(v2.iter()).map(|(a, b)| a * b).collect()
69 | }
70 | 
71 | #[allow(clippy::ptr_arg)]
72 | pub fn vector_vector_dot(vec1: &Vector, vec2: &Vector) -> Matrix {
73 |     vec1.iter()
74 |         .map(|i| vec2.iter().map(|j| i * j).collect())
75 |         .collect()
76 | }
77 | 
78 | #[allow(clippy::ptr_arg)]
79 | pub fn vector_matrix_dot(vec: &Vector, mat: &Matrix) -> Vector {
80 |     matrix_vector_dot(&transpose(mat), vec)
81 | }
82 | 
83 | #[allow(clippy::ptr_arg)]
84 | pub fn matrix_vector_dot(mat: &Matrix, vec: &Vector) -> Vector {
85 |     mat.iter().map(|w| dot(w, vec)).collect()
86 | }
87 | 
88 | #[allow(clippy::ptr_arg)]
89 | pub fn matrix_matrix_subtraction(mat1: &Matrix, mat2: &Matrix) -> Matrix {
90 |     mat1.iter()
91 |         .zip(mat2.iter())
92 |         .map(|(v1, v2)| vector_vector_subtraction(v1, v2))
93 |         .collect()
94 | }
95 | 
96 | #[allow(clippy::ptr_arg)]
97 | pub fn matrix_matrix_multiplication(mat1: &Matrix, mat2: &Matrix) -> Matrix {
98 |     mat1.iter()
99 |         .zip(mat2.iter())
100 |         .map(|(v1, v2)| vector_vector_multiplication(v1, v2))
101 |         .collect()
102 | }
103 | 
104 | #[allow(clippy::ptr_arg, clippy::needless_range_loop)]
105 | pub fn matrix_matrix_dot(mat1: &Matrix, mat2: &Matrix) -> Matrix {
106 |     assert_eq!(mat1[0].len(), mat2.len());
107 | 
108 |     let mut ans = vec![vec![0.0; mat2[0].len()]; mat1.len()];
109 | 
110 |     for i in 0..mat1.len() {
111 |         for j in 0..mat2[0].len() {
112 |             for k in 0..mat2.len() {
113 |                 ans[i][j] += mat1[i][k] * mat2[k][j];
114 |             }
115 |         }
116 |     }
117 | 
118 |     ans
119 | }
120 | 
121 | pub fn relu_vector(v: Vector) -> Vector {
122 |     v.into_iter()
123 |         .map(|a| if a > 0.0 { a } else { 0.0 })
124 |         .collect()
125 | }
126 | 
127 | pub fn relu_vector_derivative(v: Vector) -> Vector {
128 |     v.into_iter()
129 |         .map(|a| if a > 0.0 { 1.0 } else { 0.0 })
130 |         .collect()
131 | }
132 | 
133 | pub fn relu_matrix(m: Matrix) -> Matrix {
134 |     m.into_iter().map(relu_vector).collect()
135 | }
136 | 
137 | pub fn relu_matrix_derivative(m: Matrix) -> Matrix {
138 |     m.into_iter().map(relu_vector_derivative).collect()
139 | }
140 | 
141 | #[allow(clippy::ptr_arg, clippy::needless_range_loop)]
142 | pub fn transpose(m: &Matrix) -> Matrix {
143 |     let mut ans = vec![vec![0.0; m.len()]; m[0].len()];
144 | 
145 |     for i in 0..m.len() {
146 |         for j in 0..m[0].len() {
147 |             ans[j][i] = m[i][j];
148 |         }
149 |     }
150 | 
151 |     ans
152 | }
153 | 
154 | pub fn generate_random_vector(
155 |     size: usize,
156 |     scale_factor: f64,
157 |     add_factor: f64,
158 |     dist: &impl Distribution<f64>,
159 | ) -> Vec<f64> {
160 |     let mut rng = thread_rng();
161 |     (0..size)
162 |         .map(|_| scale_factor * rng.sample(dist) + add_factor)
163 |         .collect()
164 | }
165 | 
166 | pub fn process_mnist_batch_dataset(
167 |     dataset: impl Dataset<Item = (Vec<u8>, u8)>,
168 |     dataset_size: usize,
169 |     batch_size: usize,
170 | ) -> (Vec<Vec<f64>>, Vec<Vec<f64>>) {
171 |     let normalize_image = |img: Vec<u8>| img.iter().map(|v| f64::from(*v) / 255.0).collect();
172 |     let encode_label = |l| {
173 |         let mut v = vec![0.0; 10];
174 |         v[l as usize] = 1.0;
175 |         v
176 |     };
177 | 
178 |     let (images, labels): (Vec<_>, Vec<_>) = dataset
179 |         .take(dataset_size)
180 |         .map(|(i, l)| (normalize_image(i), encode_label(l)))
181 |         .unzip();
182 | 
183 |     let images = images
184 |         .into_iter()
185 |         .batch(batch_size,
false) 186 | .map(|v| { 187 | v.into_iter() 188 | .fold(Vec::with_capacity(batch_size * 784), |mut acc, mut img| { 189 | acc.append(&mut img); 190 | acc 191 | }) 192 | }) 193 | .collect(); 194 | 195 | let labels = labels 196 | .into_iter() 197 | .batch(batch_size, false) 198 | .map(|v| { 199 | v.into_iter() 200 | .fold(Vec::with_capacity(batch_size * 10), |mut acc, mut l| { 201 | acc.append(&mut l); 202 | acc 203 | }) 204 | }) 205 | .collect(); 206 | 207 | (images, labels) 208 | } 209 | 210 | pub fn sample_bernoulli_trials(p: f64, length: usize) -> Vec { 211 | let dist = Bernoulli::new(p); 212 | thread_rng() 213 | .sample_iter(&dist) 214 | .take(length) 215 | .map(|v| if v { 1.0 } else { 0.0 }) 216 | .collect() 217 | } 218 | 219 | pub fn relu_mut(m: &mut RulinalgMatrix) { 220 | for x in m.iter_mut() { 221 | *x = if (*x) > 0.0 { *x } else { 0.0 }; 222 | } 223 | } 224 | 225 | pub fn relu_derivative(m: &RulinalgMatrix) -> RulinalgMatrix { 226 | let mut ans = RulinalgMatrix::zeros(m.rows(), m.cols()); 227 | for i in 0..m.rows() { 228 | for j in 0..m.cols() { 229 | if m[[i, j]] >= 0.0 { 230 | ans[[i, j]] = 1.0; 231 | } 232 | } 233 | } 234 | 235 | ans 236 | } 237 | 238 | pub fn sigmoid_mut(m: &mut RulinalgMatrix) { 239 | for x in m.iter_mut() { 240 | *x = 1.0 / (1.0 + (-(*x)).exp()); 241 | } 242 | } 243 | 244 | pub fn tanh_mut(m: &mut RulinalgMatrix) { 245 | for x in m.iter_mut() { 246 | *x = (*x).tanh(); 247 | } 248 | } 249 | 250 | pub fn tanh_derivative(m: &RulinalgMatrix) -> RulinalgMatrix { 251 | let mut ans = RulinalgMatrix::zeros(m.rows(), m.cols()); 252 | for i in 0..m.rows() { 253 | for j in 0..m.cols() { 254 | ans[[i, j]] = 1.0 - (m[[i, j]] * m[[i, j]]); 255 | } 256 | } 257 | ans 258 | } 259 | 260 | pub fn softmax_mut(m: &mut RulinalgMatrix) { 261 | for i in 0..m.rows() { 262 | let mut s = 0.0; 263 | 264 | for j in 0..m.cols() { 265 | m[[i, j]] = m[[i, j]].exp(); 266 | s += m[[i, j]]; 267 | } 268 | 269 | for j in 0..m.cols() { 270 | m[[i, j]] /= s; 271 | } 272 | } 273 | } 274 | 275 | #[cfg(test)] 276 | mod tests { 277 | use super::*; 278 | 279 | #[test] 280 | fn test_elementwise_multiplication() { 281 | assert_eq!( 282 | vec![6.0, 14.0, 24.0, 36.0, 0.0], 283 | elementwise_multiplication( 284 | &vec![1.0, 2.0, 3.0, 4.0, 5.0], 285 | &vec![6.0, 7.0, 8.0, 9.0, 0.0], 286 | ), 287 | ); 288 | } 289 | 290 | #[test] 291 | fn test_vector_sum() { 292 | assert_eq!(15.0, vector_sum(vec![1.0, 2.0, 3.0, 4.0, 5.0])); 293 | } 294 | 295 | #[test] 296 | fn test_elementwise_addition() { 297 | assert_eq!( 298 | vec![7.0, 9.0, 11.0, 13.0, 5.0], 299 | elementwise_addition( 300 | &vec![1.0, 2.0, 3.0, 4.0, 5.0], 301 | &vec![6.0, 7.0, 8.0, 9.0, 0.0], 302 | ), 303 | ) 304 | } 305 | 306 | #[test] 307 | fn test_vector_average() { 308 | assert_eq!(3.0, vector_average(&vec![1.0, 2.0, 3.0, 4.0, 5.0])); 309 | } 310 | 311 | #[test] 312 | fn test_dot() { 313 | assert_eq!( 314 | 80.0, 315 | dot( 316 | &vec![1.0, 2.0, 3.0, 4.0, 5.0], 317 | &vec![6.0, 7.0, 8.0, 9.0, 0.0], 318 | ), 319 | ); 320 | } 321 | 322 | #[test] 323 | fn test_elementwise_scalar_multiplication() { 324 | assert_eq!( 325 | vec![2.0, 4.0, 6.0, 8.0, 10.0], 326 | elementwise_scalar_multiplication(&vec![1.0, 2.0, 3.0, 4.0, 5.0], 2.0,) 327 | ) 328 | } 329 | 330 | #[test] 331 | fn test_matrix_vector_dot() { 332 | assert_eq!( 333 | vec![55.0, 45.0, 40.0, 40.0, 35.0], 334 | matrix_vector_dot( 335 | &vec![ 336 | vec![1.0, 2.0, 3.0, 4.0, 5.0], 337 | vec![2.0, 3.0, 4.0, 5.0, 1.0], 338 | vec![3.0, 4.0, 5.0, 1.0, 2.0], 339 | vec![4.0, 5.0, 1.0, 2.0, 3.0], 340 | 
vec![5.0, 4.0, 3.0, 2.0, 1.0], 341 | ], 342 | &vec![1.0, 2.0, 3.0, 4.0, 5.0], 343 | ), 344 | ); 345 | } 346 | 347 | #[test] 348 | fn test_relu_vector() { 349 | assert_eq!( 350 | vec![1.0, 0.0, 2.0, 0.0, 4.0], 351 | relu_vector(vec![1.0, -1.0, 2.0, -2.0, 4.0]), 352 | ); 353 | } 354 | } 355 | -------------------------------------------------------------------------------- /src/losses.rs: -------------------------------------------------------------------------------- 1 | //! This was extracted from the Chapter 13 exercises and moved into the core library so it could be used in later chapters. 2 | 3 | use crate::tensor::{Sum, Tensor}; 4 | 5 | pub trait Loss { 6 | fn forward(&self, pred: &Tensor, target: &Tensor) -> Tensor; 7 | } 8 | 9 | #[derive(Debug)] 10 | pub struct MSELoss; 11 | 12 | impl Loss for MSELoss { 13 | fn forward(&self, pred: &Tensor, target: &Tensor) -> Tensor { 14 | (&(pred - target) * &(pred - target)).sum(0) 15 | } 16 | } 17 | 18 | #[derive(Debug)] 19 | pub struct CrossEntropyLoss; 20 | 21 | impl Loss for CrossEntropyLoss { 22 | fn forward(&self, pred: &Tensor, target_indices: &Tensor) -> Tensor { 23 | pred.cross_entropy(target_indices) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/optimizers.rs: -------------------------------------------------------------------------------- 1 | //! This was extracted from the Chapter 13 exercises and moved into the core library so it could be used in later chapters. 2 | 3 | use rulinalg::matrix::BaseMatrix; 4 | 5 | use crate::tensor::Tensor; 6 | 7 | pub trait Optimizer { 8 | fn step(&self, zero: bool); 9 | } 10 | 11 | #[derive(Debug)] 12 | pub struct SGDOptimizer<'a> { 13 | parameters: Vec<&'a Tensor>, 14 | alpha: f64, 15 | } 16 | 17 | impl<'a> SGDOptimizer<'a> { 18 | pub fn new(parameters: Vec<&'a Tensor>, alpha: f64) -> SGDOptimizer { 19 | SGDOptimizer { parameters, alpha } 20 | } 21 | 22 | fn step_parameter(&self, parameter: &'a Tensor, zero: bool) { 23 | let mut w = parameter.0.borrow_mut(); 24 | let grad = w.grad.take(); 25 | 26 | if zero { 27 | w.grad = None; 28 | } 29 | 30 | let grad = grad.unwrap(); 31 | let grad = &grad.borrow().data; 32 | 33 | for i in 0..w.data.rows() { 34 | for j in 0..w.data.cols() { 35 | w.data[[i, j]] -= self.alpha * grad[[i, j]]; 36 | } 37 | } 38 | } 39 | } 40 | 41 | impl<'a> Optimizer for SGDOptimizer<'a> { 42 | fn step(&self, zero: bool) { 43 | for p in self.parameters.iter() { 44 | self.step_parameter(p, zero); 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/tensor.rs: -------------------------------------------------------------------------------- 1 | //! This was extracted from the Chapter 13 exercises and moved into the core library so it could be used in later chapters. 2 | 3 | use std::cell::RefCell; 4 | use std::collections::BTreeMap; 5 | use std::ops::{Add, Mul, Neg, Sub}; 6 | use std::rc::Rc; 7 | 8 | use rand::{thread_rng, RngCore}; 9 | use rulinalg::matrix::{BaseMatrix, Matrix}; 10 | 11 | pub type TensorRef = Rc>; 12 | 13 | #[derive(Debug, Clone)] 14 | pub enum Operation { 15 | Const, 16 | Add, 17 | Neg, 18 | Sub, 19 | Mul, 20 | Dot, 21 | Transpose, 22 | Sigmoid, 23 | Tanh, 24 | Relu, 25 | Sum(usize), 26 | Expand(usize), 27 | // This is not generic as implemented for python 28 | // and can only select indices on the 0th axis. Hence, only a vector. 
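    // The variant stores the selected row indices so that backward() can scatter the
    // incoming gradient back: row i of the output gradient is accumulated into row
    // indices[i] of the original tensor.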
29 |     IndexSelect(Vec<usize>),
30 |     CrossEntropy(Matrix<f64>, Matrix<f64>),
31 | }
32 | 
33 | #[derive(Debug)]
34 | pub struct TensorImpl {
35 |     id: u64,
36 |     pub data: Matrix<f64>,
37 |     pub grad: Option<TensorRef>,
38 |     creation_op: Operation,
39 |     creators: Option<Vec<TensorRef>>,
40 |     autograd: bool,
41 |     children: BTreeMap<u64, usize>,
42 | }
43 | 
44 | impl TensorImpl {
45 |     fn grad(data: Matrix<f64>) -> Self {
46 |         TensorImpl {
47 |             id: thread_rng().next_u64(),
48 |             data,
49 |             grad: None,
50 |             creation_op: Operation::Const,
51 |             creators: None,
52 |             autograd: false,
53 |             children: BTreeMap::new(),
54 |         }
55 |     }
56 | 
57 |     fn all_children_grads_accounted_for(&self) -> bool {
58 |         self.children.iter().all(|(_, c)| c == &0)
59 |     }
60 | 
61 |     #[allow(clippy::cyclomatic_complexity)]
62 |     fn backward(&mut self, grad: TensorRef, grad_from: Option<u64>) {
63 |         if self.autograd {
64 |             if let Some(grad_from) = &grad_from {
65 |                 if self.children[&grad_from] == 0 {
66 |                     panic!("Can only Backpropagate through a tensor once");
67 |                 } else {
68 |                     self.children
69 |                         .insert(*grad_from, self.children[grad_from] - 1);
70 |                 }
71 |             }
72 | 
73 |             self.grad = match self.grad.take() {
74 |                 None => Some(Rc::clone(&grad)),
75 |                 Some(current_grad) => {
76 |                     let new_grad_data = {
77 |                         let current_grad_data = &current_grad.borrow().data;
78 |                         let grad_data = &grad.borrow().data;
79 |                         current_grad_data + grad_data
80 |                     };
81 | 
82 |                     Some(Rc::new(RefCell::new(TensorImpl::grad(new_grad_data))))
83 |                 }
84 |             };
85 | 
86 |             if self.creators.is_some()
87 |                 && (self.all_children_grads_accounted_for() || grad_from.is_none())
88 |             {
89 |                 let grad = self.grad.as_ref().unwrap();
90 |                 let creators = self.creators.as_ref().unwrap();
91 | 
92 |                 match &self.creation_op {
93 |                     Operation::Add => {
94 |                         creators[0]
95 |                             .borrow_mut()
96 |                             .backward(Rc::clone(grad), Some(self.id));
97 |                         creators[1]
98 |                             .borrow_mut()
99 |                             .backward(Rc::clone(grad), Some(self.id));
100 |                     }
101 |                     Operation::Neg => {
102 |                         let data = &grad.borrow().data;
103 |                         let data_data: Vec<f64> = data.data().iter().map(|v| -v).collect();
104 |                         creators[0].borrow_mut().backward(
105 |                             Rc::new(RefCell::new(TensorImpl::grad(Matrix::new(
106 |                                 data.rows(),
107 |                                 data.cols(),
108 |                                 data_data,
109 |                             )))),
110 |                             Some(self.id),
111 |                         );
112 |                     }
113 |                     Operation::Sub => {
114 |                         creators[0]
115 |                             .borrow_mut()
116 |                             .backward(Rc::clone(grad), Some(self.id));
117 |                         {
118 |                             let data = &grad.borrow().data;
119 |                             creators[1].borrow_mut().backward(
120 |                                 Rc::new(RefCell::new(TensorImpl::grad(-data))),
121 |                                 Some(self.id),
122 |                             );
123 |                         }
124 |                     }
125 |                     Operation::Mul => {
126 |                         let grad = &grad.borrow().data;
127 | 
128 |                         let grad0 = {
129 |                             let grad0 = &creators[1].borrow().data;
130 |                             let grad0 = grad0.elemul(grad);
131 |                             Rc::new(RefCell::new(TensorImpl::grad(grad0)))
132 |                         };
133 | 
134 |                         let grad1 = {
135 |                             let grad1 = &creators[0].borrow().data;
136 |                             let grad1 = grad1.elemul(grad);
137 |                             Rc::new(RefCell::new(TensorImpl::grad(grad1)))
138 |                         };
139 | 
140 |                         creators[0].borrow_mut().backward(grad0, Some(self.id));
141 |                         creators[1].borrow_mut().backward(grad1, Some(self.id));
142 |                     }
143 |                     Operation::Transpose => {
144 |                         let grad = &grad.borrow().data;
145 |                         let data = grad.transpose();
146 |                         creators[0]
147 |                             .borrow_mut()
148 |                             .backward(Rc::new(RefCell::new(TensorImpl::grad(data))), Some(self.id));
149 |                     }
150 |                     Operation::Dot => {
151 |                         let grad = &grad.borrow().data;
152 | 
153 |                         let act_delta = {
154 |                             let weights = &creators[1].borrow().data;
155 |                             grad.mul(weights.transpose())
156 |                         };
157 | 
158 |                         let weights_delta = {
159 |                             let act =
&creators[0].borrow().data; 160 | act.transpose().mul(grad) 161 | }; 162 | 163 | creators[0].borrow_mut().backward( 164 | Rc::new(RefCell::new(TensorImpl::grad(act_delta))), 165 | Some(self.id), 166 | ); 167 | 168 | creators[1].borrow_mut().backward( 169 | Rc::new(RefCell::new(TensorImpl::grad(weights_delta))), 170 | Some(self.id), 171 | ); 172 | } 173 | Operation::Sum(axis) => { 174 | let new_grad = { 175 | let data = &creators[0].borrow().data; 176 | let grad = &grad.borrow().data; 177 | let mut new_grad = Matrix::zeros(data.rows(), data.cols()); 178 | 179 | if axis == &0 { 180 | for i in 0..data.rows() { 181 | for j in 0..data.cols() { 182 | new_grad[[i, j]] = grad[[0, j]]; 183 | } 184 | } 185 | } else if axis == &1 { 186 | for i in 0..data.rows() { 187 | for j in 0..data.cols() { 188 | new_grad[[i, j]] = grad[[i, 0]]; 189 | } 190 | } 191 | } else { 192 | panic!("this is broken"); 193 | } 194 | 195 | new_grad 196 | }; 197 | 198 | creators[0].borrow_mut().backward( 199 | Rc::new(RefCell::new(TensorImpl::grad(new_grad))), 200 | Some(self.id), 201 | ); 202 | } 203 | Operation::Expand(dim) => { 204 | let new_grad = { 205 | let data = &creators[0].borrow().data; 206 | let grad = &grad.borrow().data; 207 | let mut new_grad = Matrix::zeros(data.rows(), data.cols()); 208 | 209 | if dim == &0 { 210 | for i in 0..grad.rows() { 211 | for j in 0..grad.cols() { 212 | new_grad[[0, j]] += grad[[i, j]]; 213 | } 214 | } 215 | } else { 216 | panic!("this is broken"); 217 | } 218 | 219 | new_grad 220 | }; 221 | 222 | creators[0].borrow_mut().backward( 223 | Rc::new(RefCell::new(TensorImpl::grad(new_grad))), 224 | Some(self.id), 225 | ); 226 | } 227 | Operation::Sigmoid => { 228 | let new_grad = { 229 | let data = &self.data; 230 | let grad = &grad.borrow().data; 231 | 232 | let mut new_grad = Matrix::zeros(grad.rows(), grad.cols()); 233 | for i in 0..grad.rows() { 234 | for j in 0..grad.cols() { 235 | new_grad[[i, j]] = 236 | grad[[i, j]] * (data[[i, j]] * (1.0 - data[[i, j]])); 237 | } 238 | } 239 | 240 | new_grad 241 | }; 242 | 243 | creators[0].borrow_mut().backward( 244 | Rc::new(RefCell::new(TensorImpl::grad(new_grad))), 245 | Some(self.id), 246 | ); 247 | } 248 | Operation::Tanh => { 249 | let new_grad = { 250 | let data = &self.data; 251 | let grad = &grad.borrow().data; 252 | 253 | let mut new_grad = Matrix::zeros(grad.rows(), grad.cols()); 254 | for i in 0..grad.rows() { 255 | for j in 0..grad.cols() { 256 | new_grad[[i, j]] = 257 | grad[[i, j]] * (1.0 - (data[[i, j]] * data[[i, j]])); 258 | } 259 | } 260 | 261 | new_grad 262 | }; 263 | 264 | creators[0].borrow_mut().backward( 265 | Rc::new(RefCell::new(TensorImpl::grad(new_grad))), 266 | Some(self.id), 267 | ); 268 | } 269 | Operation::Relu => { 270 | let new_grad = { 271 | let data = &self.data; 272 | let grad = &grad.borrow().data; 273 | 274 | let mut new_grad = Matrix::zeros(grad.rows(), grad.cols()); 275 | for i in 0..grad.rows() { 276 | for j in 0..grad.cols() { 277 | new_grad[[i, j]] = 278 | grad[[i, j]] * if data[[i, j]] > 0.0 { 1.0 } else { 0.0 }; 279 | } 280 | } 281 | 282 | new_grad 283 | }; 284 | 285 | creators[0].borrow_mut().backward( 286 | Rc::new(RefCell::new(TensorImpl::grad(new_grad))), 287 | Some(self.id), 288 | ); 289 | } 290 | Operation::IndexSelect(indices) => { 291 | let new_grad = { 292 | let data = &creators[0].borrow().data; 293 | let grad = &grad.borrow().data; 294 | 295 | let mut new_grad = Matrix::zeros(data.rows(), data.cols()); 296 | for (i, ix) in indices.iter().enumerate() { 297 | for j in 0..data.cols() { 298 | 
new_grad[[*ix, j]] += grad[[i, j]]; 299 | } 300 | } 301 | 302 | new_grad 303 | }; 304 | 305 | creators[0].borrow_mut().backward( 306 | Rc::new(RefCell::new(TensorImpl::grad(new_grad))), 307 | Some(self.id), 308 | ) 309 | } 310 | Operation::CrossEntropy(predictions, targets) => { 311 | creators[0].borrow_mut().backward( 312 | Rc::new(RefCell::new(TensorImpl::grad(predictions - targets))), 313 | Some(self.id), 314 | ) 315 | } 316 | Operation::Const => {} 317 | } 318 | } 319 | } 320 | } 321 | } 322 | 323 | /// Tensor implements "shallow" clones, primarily so that they can be put inside enum variants. 324 | #[derive(Debug)] 325 | pub struct Tensor(pub TensorRef); 326 | 327 | impl Clone for Tensor { 328 | fn clone(&self) -> Self { 329 | Tensor(Rc::clone(&self.0)) 330 | } 331 | } 332 | 333 | impl Tensor { 334 | pub fn new_const(data: Matrix) -> Self { 335 | Self::new(data, Operation::Const, None) 336 | } 337 | 338 | pub fn grad(data: Matrix) -> Self { 339 | let tensor_impl = TensorImpl::grad(data); 340 | Tensor(Rc::new(RefCell::new(tensor_impl))) 341 | } 342 | 343 | pub fn new( 344 | data: Matrix, 345 | creation_op: Operation, 346 | creators: Option>, 347 | ) -> Self { 348 | let tensor_impl = TensorImpl { 349 | id: thread_rng().next_u64(), 350 | data, 351 | grad: None, 352 | creation_op, 353 | creators, 354 | autograd: true, 355 | children: BTreeMap::new(), 356 | }; 357 | 358 | if let Some(creators) = &tensor_impl.creators { 359 | for c in creators.iter() { 360 | let children = &mut c.borrow_mut().children; 361 | let e = children.entry(tensor_impl.id).or_insert(0); 362 | *e += 1; 363 | } 364 | } 365 | 366 | Tensor(Rc::new(RefCell::new(tensor_impl))) 367 | } 368 | 369 | pub fn backward(&self, grad: Tensor) { 370 | self.0.borrow_mut().backward(grad.0, None); 371 | } 372 | 373 | /// higher order ops 374 | 375 | pub fn sigmoid(&self) -> Tensor { 376 | let result = { 377 | let data = &self.0.borrow().data; 378 | let mut ans = Matrix::zeros(data.rows(), data.cols()); 379 | 380 | for i in 0..data.rows() { 381 | for j in 0..data.cols() { 382 | ans[[i, j]] = 1.0 / (1.0 + (-data[[i, j]]).exp()); 383 | } 384 | } 385 | 386 | ans 387 | }; 388 | 389 | if self.0.borrow().autograd { 390 | Tensor::new(result, Operation::Sigmoid, Some(vec![Rc::clone(&self.0)])) 391 | } else { 392 | Tensor::grad(result) 393 | } 394 | } 395 | 396 | pub fn tanh(&self) -> Tensor { 397 | let result = { 398 | let data = &self.0.borrow().data; 399 | let mut ans = Matrix::zeros(data.rows(), data.cols()); 400 | 401 | for i in 0..data.rows() { 402 | for j in 0..data.cols() { 403 | ans[[i, j]] = data[[i, j]].tanh(); 404 | } 405 | } 406 | 407 | ans 408 | }; 409 | 410 | if self.0.borrow().autograd { 411 | Tensor::new(result, Operation::Tanh, Some(vec![Rc::clone(&self.0)])) 412 | } else { 413 | Tensor::grad(result) 414 | } 415 | } 416 | 417 | pub fn relu(&self) -> Tensor { 418 | let result = { 419 | let data = &self.0.borrow().data; 420 | let mut ans = Matrix::zeros(data.rows(), data.cols()); 421 | 422 | for i in 0..data.rows() { 423 | for j in 0..data.cols() { 424 | ans[[i, j]] = if data[[i, j]] > 0.0 { 425 | data[[i, j]] 426 | } else { 427 | 0.0 428 | }; 429 | } 430 | } 431 | 432 | ans 433 | }; 434 | 435 | if self.0.borrow().autograd { 436 | Tensor::new(result, Operation::Relu, Some(vec![Rc::clone(&self.0)])) 437 | } else { 438 | Tensor::grad(result) 439 | } 440 | } 441 | 442 | pub fn index_select(&self, indices: Vec) -> Tensor { 443 | let result = { 444 | let data = &self.0.borrow().data; 445 | let mut ans = Matrix::zeros(indices.len(), 
446 |
447 |             for (i, ix) in indices.iter().enumerate() {
448 |                 for j in 0..data.cols() {
449 |                     ans[[i, j]] = data[[*ix, j]];
450 |                 }
451 |             }
452 |
453 |             ans
454 |         };
455 |
456 |         if self.0.borrow().autograd {
457 |             Tensor::new(
458 |                 result,
459 |                 Operation::IndexSelect(indices),
460 |                 Some(vec![Rc::clone(&self.0)]),
461 |             )
462 |         } else {
463 |             Tensor::grad(result)
464 |         }
465 |     }
466 |
467 |     /// `target_indices` is expected to be a column vector holding one class index per row of the current tensor
468 |     pub fn cross_entropy(&self, target_indices: &Tensor) -> Tensor {
469 |         let (m, target_dist, loss) = {
470 |             let data = &self.0.borrow().data;
471 |             let target_indices = &target_indices.0.borrow().data;
472 |
473 |             let mut rs = vec![0.0; data.rows()];
474 |
475 |             let mut m = Matrix::zeros(data.rows(), data.cols());
476 |
477 |             for i in 0..data.rows() {
478 |                 for j in 0..data.cols() {
479 |                     m[[i, j]] = data[[i, j]].exp();
480 |                     rs[i] += m[[i, j]];
481 |                 }
482 |             }
483 |
484 |             for i in 0..data.rows() {
485 |                 for j in 0..data.cols() {
486 |                     m[[i, j]] /= rs[i];
487 |                 }
488 |             }
489 |
490 |             let mut target_dist = Matrix::zeros(data.rows(), data.cols());
491 |
492 |             let mut loss = 0.0;
493 |             for i in 0..target_indices.rows() {
494 |                 let index = target_indices[[i, 0]] as usize;
495 |                 target_dist[[i, index]] = 1.0;
496 |
497 |                 let current_loss = m[[i, index]].ln(); // log of the softmax probability for the target class
498 |                 loss += -current_loss;
499 |             }
500 |
501 |             loss /= data.rows() as f64;
502 |
503 |             (m, target_dist, loss)
504 |         };
505 |
506 |         if self.0.borrow().autograd {
507 |             Tensor::new(
508 |                 Matrix::new(1, 1, vec![loss]),
509 |                 Operation::CrossEntropy(m, target_dist),
510 |                 Some(vec![Rc::clone(&self.0)]),
511 |             )
512 |         } else {
513 |             Tensor::grad(Matrix::new(1, 1, vec![loss]))
514 |         }
515 |     }
516 | }
517 |
518 | impl Add for &Tensor {
519 |     type Output = Tensor;
520 |
521 |     fn add(self, other: Self) -> Self::Output {
522 |         let data = &self.0.borrow().data + &other.0.borrow().data;
523 |
524 |         if self.0.borrow().autograd {
525 |             Tensor::new(
526 |                 data,
527 |                 Operation::Add,
528 |                 Some(vec![Rc::clone(&self.0), Rc::clone(&other.0)]),
529 |             )
530 |         } else {
531 |             Tensor::grad(data)
532 |         }
533 |     }
534 | }
535 |
536 | impl Neg for &Tensor {
537 |     type Output = Tensor;
538 |
539 |     fn neg(self) -> Self::Output {
540 |         let data = -&self.0.borrow().data;
541 |         if self.0.borrow().autograd {
542 |             Tensor::new(data, Operation::Neg, Some(vec![Rc::clone(&self.0)]))
543 |         } else {
544 |             Tensor::grad(data)
545 |         }
546 |     }
547 | }
548 |
549 | impl Sub for &Tensor {
550 |     type Output = Tensor;
551 |
552 |     fn sub(self, other: Self) -> Self::Output {
553 |         let data = &self.0.borrow().data - &other.0.borrow().data;
554 |
555 |         if self.0.borrow().autograd {
556 |             Tensor::new(
557 |                 data,
558 |                 Operation::Sub,
559 |                 Some(vec![Rc::clone(&self.0), Rc::clone(&other.0)]),
560 |             )
561 |         } else {
562 |             Tensor::grad(data)
563 |         }
564 |     }
565 | }
566 |
567 | impl Mul for &Tensor {
568 |     type Output = Tensor;
569 |
570 |     fn mul(self, other: Self) -> Self::Output {
571 |         let data = self.0.borrow().data.elemul(&other.0.borrow().data);
572 |
573 |         if self.0.borrow().autograd {
574 |             Tensor::new(
575 |                 data,
576 |                 Operation::Mul,
577 |                 Some(vec![Rc::clone(&self.0), Rc::clone(&other.0)]),
578 |             )
579 |         } else {
580 |             Tensor::grad(data)
581 |         }
582 |     }
583 | }
584 |
585 | pub trait Sum {
586 |     type Output;
587 |     fn sum(self, dim: usize) -> Self::Output;
588 | }
589 |
590 | impl Sum for &Tensor {
591 |     type Output = Tensor;
592 |
593 |     fn sum(self, axis: usize) -> Self::Output {
594 |         if axis > 1 {
595 |             unimplemented!();
596 |         }
597 |
598 |         let ans = if axis == 0 {
599 |             let data = &self.0.borrow().data;
600 |             let mut summed_data = Matrix::zeros(1, data.cols());
601 |             for i in 0..data.cols() {
602 |                 for j in 0..data.rows() {
603 |                     summed_data[[0, i]] += data[[j, i]];
604 |                 }
605 |             }
606 |             summed_data
607 |         } else {
608 |             let data = &self.0.borrow().data;
609 |             let mut summed_data = Matrix::zeros(data.rows(), 1);
610 |             for i in 0..data.rows() {
611 |                 for j in 0..data.cols() {
612 |                     summed_data[[i, 0]] += data[[i, j]];
613 |                 }
614 |             }
615 |             summed_data
616 |         };
617 |
618 |         if self.0.borrow().autograd {
619 |             Tensor::new(ans, Operation::Sum(axis), Some(vec![Rc::clone(&self.0)]))
620 |         } else {
621 |             Tensor::grad(ans)
622 |         }
623 |     }
624 | }
625 |
626 | pub trait Expand {
627 |     type Output;
628 |     fn expand(self, dim: usize, copies: usize) -> Self::Output;
629 | }
630 |
631 | impl Expand for &Tensor {
632 |     type Output = Tensor;
633 |
634 |     fn expand(self, dim: usize, copies: usize) -> Self::Output {
635 |         if dim == 0 {
636 |             let new_data = {
637 |                 let data = &self.0.borrow().data;
638 |                 if data.rows() != 1 {
639 |                     unimplemented!()
640 |                 }
641 |
642 |                 let mut new_data = Matrix::zeros(copies, data.cols());
643 |                 for i in 0..copies {
644 |                     for j in 0..data.cols() {
645 |                         new_data[[i, j]] = data[[0, j]];
646 |                     }
647 |                 }
648 |
649 |                 new_data
650 |             };
651 |
652 |             if self.0.borrow().autograd {
653 |                 Tensor::new(
654 |                     new_data,
655 |                     Operation::Expand(dim),
656 |                     Some(vec![Rc::clone(&self.0)]),
657 |                 )
658 |             } else {
659 |                 Tensor::grad(new_data)
660 |             }
661 |         } else {
662 |             unimplemented!()
663 |         }
664 |     }
665 | }
666 |
667 | pub trait Transpose {
668 |     type Output;
669 |     fn transpose(self) -> Self::Output;
670 | }
671 |
672 | impl Transpose for &Tensor {
673 |     type Output = Tensor;
674 |
675 |     fn transpose(self) -> Self::Output {
676 |         let res = {
677 |             let data = &self.0.borrow().data;
678 |             data.transpose()
679 |         };
680 |
681 |         if self.0.borrow().autograd {
682 |             Tensor::new(res, Operation::Transpose, Some(vec![Rc::clone(&self.0)]))
683 |         } else {
684 |             Tensor::grad(res)
685 |         }
686 |     }
687 | }
688 |
689 | pub trait Dot {
690 |     type Output;
691 |     fn dot(self, other: Self) -> Self::Output;
692 | }
693 |
694 | impl Dot for &Tensor {
695 |     type Output = Tensor;
696 |
697 |     fn dot(self, other: &Tensor) -> Self::Output {
698 |         let result = {
699 |             let data = &self.0.borrow().data;
700 |             let other_data = &other.0.borrow().data;
701 |             data.mul(other_data)
702 |         };
703 |
704 |         if self.0.borrow().autograd {
705 |             Tensor::new(
706 |                 result,
707 |                 Operation::Dot,
708 |                 Some(vec![Rc::clone(&self.0), Rc::clone(&other.0)]),
709 |             )
710 |         } else {
711 |             Tensor::grad(result)
712 |         }
713 |     }
714 | }
715 |
--------------------------------------------------------------------------------
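A minimal usage sketch of the autograd Tensor API above (a hypothetical example, not a file from the repository; it assumes lib.rs declares `pub mod tensor;` so that `Tensor` and the `Dot` trait are reachable at `grokking_deep_learning_rs::tensor`, and it seeds the backward pass with a 1x1 gradient of ones):

use rulinalg::matrix::Matrix;

use grokking_deep_learning_rs::tensor::{Dot, Tensor};

fn main() {
    // Wrap plain rulinalg matrices in autograd-tracking tensors.
    let x = Tensor::new_const(Matrix::new(1, 2, vec![1.0, 2.0]));
    let w = Tensor::new_const(Matrix::new(2, 1, vec![0.5, -0.5]));

    // Forward pass: y = sigmoid(x . w); each op records its creators,
    // so the resulting graph is x, w -> Dot -> Sigmoid -> y.
    let y = x.dot(&w).sigmoid();

    // Backward pass: seed the output gradient, which propagates through
    // the Sigmoid and Dot backward rules down to x and w.
    y.backward(Tensor::grad(Matrix::new(1, 1, vec![1.0])));
}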