├── .github ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── FUNDING.yml ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── PULL_REQUEST_TEMPLATE.md ├── SECURITY.md └── workflows │ └── mojo.yml ├── .gitignore ├── LICENSE ├── README.md ├── datasets ├── __init__.mojo └── mnist │ └── __init__.mojo ├── test_conv.mojo ├── test_functional.mojo ├── test_layer.mojo └── voodoo ├── __init__.mojo ├── autograd ├── __init__.mojo ├── graph.mojo ├── kernels │ ├── __init__.mojo │ ├── activations.mojo │ ├── arithmetic.mojo │ ├── conv.mojo │ ├── generics.mojo │ ├── kernels.mojo │ ├── losses.mojo │ ├── matmul.mojo │ ├── maxpool.mojo │ └── operations.mojo └── node.mojo ├── constants.mojo ├── core ├── __init__.mojo ├── constraints.mojo ├── initializers.mojo ├── layers │ ├── __init__.mojo │ ├── activation.mojo │ ├── conv1D.mojo │ ├── conv2D.mojo │ ├── dense.mojo │ ├── dropout.mojo │ ├── flatten.mojo │ ├── leakyRelu.mojo │ ├── maxPool1D.mojo │ ├── maxPool2D.mojo │ └── reshape.mojo ├── optimizers.mojo └── tensor.mojo └── utils ├── __init__.mojo ├── array.mojo ├── broadcast.mojo ├── code_lookup.mojo ├── console.mojo └── operator_codes.mojo /.github/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 
50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | Email. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 
129 | -------------------------------------------------------------------------------- /.github/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Voodoo 2 | 3 | I want to make contributing to this project as easy and transparent as 4 | possible. 5 | 6 | ## Pull Requests 7 | 8 | I actively welcome your pull requests. 9 | 10 | ### For new examples 11 | 12 | 0. Create a GitHub issue proposing a new example and make sure it's substantially different from an existing one. 13 | 1. Fork the repo and create your branch from `main`. 14 | 2. If you've added code that should be tested, add tests. 15 | 3. Create a `README.md`. 16 | 4. Ensure your tests pass locally. 17 | 5. Address any feedback in code review promptly. 18 | 19 | ### For bug fixes 20 | 21 | 1. Fork the repo and create your branch from `main`. 22 | 2. Make your code change. 23 | 3. Address any feedback in code review promptly. 24 | 25 | ## Issues 26 | 27 | I use GitHub issues to track public bugs. Please ensure your description is 28 | clear and includes enough detail to reproduce the issue. 29 | 30 | ## License 31 | 32 | By contributing to Voodoo, you agree that your contributions will be licensed 33 | under the LICENSE file in the root directory of this source tree. 34 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [Benny-Nottonson] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry 13 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 14 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 
39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Describe your changes 2 | 3 | ## Issue ticket number and link 4 | 5 | ## Checklist before requesting a review 6 | - [ ] I have performed a self-review of my code 7 | - [ ] If it is a core feature, I have added thorough tests. 8 | - [ ] Do we need to implement analytics? 9 | - [ ] Will this be part of a product update? If yes, please write one phrase about this update. 10 | 11 | -------------------------------------------------------------------------------- /.github/SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | The codebase relies on no external code sources; all code is based solely on the Mojo language. Because of this, any vulnerabilities in the language should be treated as issues with the language itself, not with this project. Voodoo itself is a standalone project, and end users are responsible for their own data when using it. 
4 | -------------------------------------------------------------------------------- /.github/workflows/mojo.yml: -------------------------------------------------------------------------------- 1 | name: MojoCodeTest 2 | 3 | env: 4 | MOJO_HOME: /home/runner/.modular/pkg/packages.modular.com_mojo/bin 5 | 6 | on: 7 | push: 8 | branches: [ "main" ] 9 | pull_request: 10 | branches: [ "main" ] 11 | workflow_dispatch: 12 | 13 | jobs: 14 | build: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v3 18 | - uses: actions/cache@v4 19 | with: 20 | path: /home/runner/.modular 21 | key: ${{ runner.os }}-modular-${{ hashFiles('**/modular.lock') }} 22 | restore-keys: | 23 | ${{ runner.os }}-modular- 24 | - name: Download and Install Mojo 25 | run: | 26 | curl https://get.modular.com | sh - && 27 | modular auth ${{ secrets.MOJO_TOKEN }} 28 | modular install mojo 29 | - name: Run Mojo and Check for Errors 30 | run: | 31 | ${{ env.MOJO_HOME }}/mojo ${{ vars.TEST_FILE }} 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | profile.sh 2 | flamegraph.svg 3 | /temp 4 | /datasets/mnist/*.csv -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. 
For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # This project has been merged and is now [Basalt](https://github.com/basalt-org/basalt) 2 | ![BasaltPreview](https://github.com/Benny-Nottonson/voodoo/assets/112336374/9d41aaeb-936d-437e-9734-fa81a4f60d8a) 3 | -------------------------------------------------------------------------------- /datasets/__init__.mojo: -------------------------------------------------------------------------------- 1 | from .mnist import MNist 2 | -------------------------------------------------------------------------------- /datasets/mnist/__init__.mojo: -------------------------------------------------------------------------------- 1 | from voodoo.utils import info 2 | 3 | alias train_datapath = "./datasets/mnist/MNIST_train.csv" 4 | alias test_datapath = "./datasets/mnist/MNIST_test.csv" 5 | alias NELTS = simdwidthof[DType.int8]() 6 | 7 | # Data from https://github.com/halimb/MNIST-txt 8 | 9 | 10 | struct MNist: 11 | var train_labels: DTypePointer[DType.int8] 12 | var train_images: Pointer[DTypePointer[DType.int8]] 13 | var test_labels: DTypePointer[DType.int8] 14 | var test_images: Pointer[DTypePointer[DType.int8]] 15 | 16 | fn __init__(inout self) raises: 17 | info("Loading MNIST dataset...\n") 18 | 19 | var train_data = open(train_datapath, "r").read().split("\n") 20 | var train_size = len(train_data) - 1 21 | 22 | self.train_labels = DTypePointer[DType.int8].alloc(train_size) 23 | self.train_images = Pointer[DTypePointer[DType.int8]].alloc(train_size) 24 | 25 | for i in range(train_size): 26 | var line = train_data[i].strip().split(",") 27 | self.train_labels[i] = atol(line[0]) 28 | self.train_images[i] = DTypePointer[DType.int8].alloc(784) 29 | for j in range(1, len(line)): 30 | self.train_images[i][j - 1] = atol(line[j]) 31 | 32 | var test_data = open(test_datapath, "r").read().split("\n") 33 | var test_size = len(test_data) - 1 34 | 35 | self.test_labels = DTypePointer[DType.int8].alloc(test_size) 36 | self.test_images = Pointer[DTypePointer[DType.int8]].alloc(test_size) 37 | 38 | for i in range(test_size): 39 | var line = test_data[i].strip().split(",") 40 | self.test_labels[i] = atol(line[0]) 41 | self.test_images[i] = DTypePointer[DType.int8].alloc(784) 42 | for j in range(1, len(line)): 43 | self.test_images[i][j - 1] = atol(line[j]) 44 | 45 | info("MNIST dataset loaded.\n") 46 | print( 47 | "There are ", 48 | train_size, 49 | " training samples and ", 50 | test_size, 51 | " test samples.\n", 52 | ) 53 | -------------------------------------------------------------------------------- /test_conv.mojo: -------------------------------------------------------------------------------- 1 | from time.time import now 2 | from tensor import TensorShape 3 | 4 | from voodoo.core import Tensor, HeUniform, HeUniform, RandomUniform, SGD, Zeros 5 | from voodoo.core.layers import Conv2D, MaxPool2D, Flatten, Dense 6 | from voodoo.utils import ( 7 | info, 8 | clear, 9 | ) 10 | 11 | from datasets import MNist 12 | 13 | 14 | fn nanoseconds_to_seconds(t: Int) -> Float64: 15 | return t / 1_000_000_000.0 16 | 17 | 18 | alias batches = 32 19 | alias channels = 1 20 | alias width = 28 21 | alias height = 28 22 | 23 | alias data_shape = TensorShape(batches, channels, width, height) 24 | 25 | 26 | fn main() raises: 27 | var dataset = MNist() 28 | 29 | var conv_layer_one = Conv2D[ 30 | in_channels=1, 31 | kernel_width=3, 32 | 
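# Shape walk-through for the 28x28 MNIST inputs defined by data_shape above:
#   conv (3x3 kernel, stride 1, no padding): (28 - 3) // 1 + 1 = 26 -> 26x26 per channel
#   max-pool (2x2, stride 2):                (26 - 2) // 2 + 1 = 13 -> 13x13 per channel
#   flatten:                                 13 * 13 = 169, matching dense1's in_neurons=169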
kernel_height=3, 33 | stride=1, 34 | padding=0, 35 | weight_initializer = HeUniform[1], 36 | activation="relu", 37 | ]() 38 | var max_pool_one = MaxPool2D[ 39 | kernel_width=2, 40 | kernel_height=2, 41 | stride=2, 42 | ]() 43 | var flatten = Flatten[]() 44 | var dense1 = Dense[ 45 | in_neurons=169, 46 | out_neurons=100, 47 | weight_initializer = HeUniform[169], 48 | activation="relu", 49 | ]() 50 | var dense2 = Dense[ 51 | in_neurons=100, 52 | out_neurons=10, 53 | activation="sigmoid", 54 | ]() 55 | 56 | var avg_loss: Float32 = 0.0 57 | var num_epochs = 1800 58 | var every = 100 59 | 60 | var true_vals = Tensor[TensorShape(batches, 10), Zeros]() 61 | var input = Tensor[TensorShape(batches, channels, width, height), Zeros]() 62 | 63 | for i in range(batches): 64 | var image = dataset.train_images[i] 65 | var label = dataset.train_labels[i].to_int() 66 | true_vals[i * 10 + label] = 1.0 67 | for j in range(width): 68 | for k in range(height): 69 | input[i * channels * width * height + j * width + k] = image[ 70 | j * width + k 71 | ].to_int() 72 | 73 | var x0 = conv_layer_one.forward(input) 74 | var x1 = max_pool_one.forward(x0) 75 | var x2 = flatten.forward(x1) 76 | var x3 = dense1.forward(x2) 77 | var x4 = dense2.forward(x3) 78 | var loss = x4.compute_loss["mse"](true_vals) 79 | 80 | var initial_start = now() 81 | var epoch_start = now() 82 | var bar_accuracy = 20 83 | 84 | for epoch in range(1, num_epochs + 1): 85 | for i in range(batches): 86 | var image = dataset.train_images[i + epoch * batches] 87 | var label = dataset.train_labels[i + epoch * batches].to_int() 88 | true_vals[i * 10 + label] = 1.0 89 | for j in range(width): 90 | for k in range(height): 91 | input[i * channels * width * height + j * width + k] = image[ 92 | j * width + k 93 | ].to_int() 94 | 95 | avg_loss += loss.forward_static()[0] 96 | loss.backward() 97 | loss.optimize[SGD[0.01]]() 98 | 99 | if epoch % every == 0: 100 | var bar = String("") 101 | for i in range(bar_accuracy): 102 | if i < ((epoch * bar_accuracy) / num_epochs).to_int(): 103 | bar += "█" 104 | else: 105 | bar += "░" 106 | clear() 107 | print_no_newline("\nEpoch: " + String(epoch) + " ") 108 | info(bar + " ") 109 | print_no_newline(String(((epoch * 100) / num_epochs).to_int()) + "%\n") 110 | print("----------------------------------------\n") 111 | print_no_newline("Average Loss: ") 112 | info(String(avg_loss / every) + "\n") 113 | print_no_newline("Time: ") 114 | info(String(nanoseconds_to_seconds(now() - epoch_start)) + "s\n") 115 | epoch_start = now() 116 | print("\n----------------------------------------\n") 117 | avg_loss = 0.0 118 | 119 | print_no_newline("Total Time: ") 120 | info(String(nanoseconds_to_seconds(now() - initial_start)) + "s\n\n") 121 | -------------------------------------------------------------------------------- /test_functional.mojo: -------------------------------------------------------------------------------- 1 | from time.time import now 2 | from tensor import TensorShape 3 | 4 | from voodoo.core import Tensor, HeNormal, RandomUniform, SGD 5 | from voodoo.utils import ( 6 | info, 7 | clear, 8 | ) 9 | 10 | 11 | fn nanoseconds_to_seconds(t: Int) -> Float64: 12 | return Float64(t) / 1_000_000_000.0 13 | 14 | 15 | fn main() raises: 16 | var W1 = Tensor[TensorShape(1, 32), HeNormal[1]]() 17 | var W2 = Tensor[TensorShape(32, 32), HeNormal[32]]() 18 | var W3 = Tensor[TensorShape(32, 1), HeNormal[32]]() 19 | 20 | var b1 = Tensor[TensorShape(32), HeNormal[32]]() 21 | var b2 = Tensor[TensorShape(32), HeNormal[32]]() 22 | var b3 = 
Tensor[TensorShape(1), HeNormal[1]]() 23 | 24 | var avg_loss: Float32 = 0.0 25 | var every = 1000 26 | var num_epochs = 200000 27 | 28 | var input = Tensor[TensorShape(32, 1), RandomUniform[0, 1]]() 29 | var true_vals = Tensor[TensorShape(32, 1), RandomUniform[0, 1]]() 30 | 31 | var x = (input @ W1 + b1).compute_activation["relu"]() 32 | x = (x @ W2 + b2).compute_activation["relu"]() 33 | x = x @ W3 + b3 34 | var loss = x.compute_loss["mse"](true_vals) 35 | 36 | var initial_start = now() 37 | var epoch_start = now() 38 | var bar_accuracy = 20 39 | 40 | for epoch in range(1, num_epochs + 1): 41 | input.refresh() 42 | for i in range(input.shape.num_elements()): 43 | true_vals[i] = math.sin(15.0 * input[i]) 44 | 45 | var computed_loss = loss.forward_static() 46 | avg_loss += computed_loss[0] 47 | loss.backward() 48 | loss.optimize[SGD[0.01]]() 49 | 50 | if epoch % every == 0: 51 | var bar = String("") 52 | for i in range(bar_accuracy): 53 | if i < ((epoch * bar_accuracy) / num_epochs).to_int(): 54 | bar += "█" 55 | else: 56 | bar += "░" 57 | clear() 58 | print_no_newline("\nEpoch: " + String(epoch) + " ") 59 | info(bar + " ") 60 | print_no_newline(String(((epoch * 100) / num_epochs).to_int()) + "%\n") 61 | print("----------------------------------------\n") 62 | print_no_newline("Average Loss: ") 63 | info(String(avg_loss / every) + "\n") 64 | print_no_newline("Time: ") 65 | info(String(nanoseconds_to_seconds(now() - epoch_start)) + "s\n") 66 | epoch_start = now() 67 | print("\n----------------------------------------\n") 68 | avg_loss = 0.0 69 | 70 | print_no_newline("Total Time: ") 71 | info(String(nanoseconds_to_seconds(now() - initial_start)) + "s\n\n") 72 | -------------------------------------------------------------------------------- /test_layer.mojo: -------------------------------------------------------------------------------- 1 | from time.time import now 2 | from tensor import TensorShape 3 | 4 | from voodoo.core import Tensor, HeNormal, RandomUniform, SGD 5 | from voodoo.core.layers import Dense, LeakyReLu, Dropout 6 | from voodoo.utils import ( 7 | info, 8 | clear, 9 | ) 10 | 11 | 12 | fn nanoseconds_to_seconds(t: Int) -> Float64: 13 | return t / 1_000_000_000.0 14 | 15 | 16 | fn main() raises: 17 | var input_layer = Dense[ 18 | in_neurons=1, 19 | out_neurons=32, 20 | activation="relu", 21 | weight_initializer = HeNormal[1], 22 | bias_initializer = HeNormal[32], 23 | ]() 24 | var dropout = Dropout[dropout_rate=0.01,]() 25 | var leaky_relu = LeakyReLu[ 26 | in_neurons=32, 27 | out_neurons=32, 28 | weight_initializer = HeNormal[32], 29 | bias_initializer = HeNormal[32], 30 | ]() 31 | var output_layer = Dense[ 32 | in_neurons=32, 33 | out_neurons=1, 34 | weight_initializer = HeNormal[32], 35 | bias_initializer = HeNormal[1], 36 | ]() 37 | 38 | var avg_loss: Float32 = 0.0 39 | var every = 1000 40 | var num_epochs = 2000000 41 | 42 | var input = Tensor[TensorShape(32, 1), RandomUniform[0, 1]]() 43 | var true_vals = Tensor[TensorShape(32, 1), RandomUniform[0, 1]]() 44 | 45 | var x0 = input_layer.forward(input) 46 | var x1 = dropout.forward(x0) 47 | var x2 = leaky_relu.forward(x1) 48 | var x3 = output_layer.forward(x2) 49 | var loss = x3.compute_loss["mse"](true_vals) 50 | 51 | var initial_start = now() 52 | var epoch_start = now() 53 | var bar_accuracy = 20 54 | for epoch in range(1, num_epochs + 1): 55 | input.refresh() 56 | for i in range(input.shape.num_elements()): 57 | true_vals[i] = math.sin(15.0 * input[i]) 58 | 59 | var computed_loss = loss.forward_static() 60 | avg_loss += 
computed_loss[0] 61 | loss.backward() 62 | loss.optimize[SGD[0.01]]() 63 | 64 | if epoch % every == 0: 65 | var bar = String("") 66 | for i in range(bar_accuracy): 67 | if i < ((epoch * bar_accuracy) / num_epochs).to_int(): 68 | bar += "█" 69 | else: 70 | bar += "░" 71 | clear() 72 | print_no_newline("\nEpoch: " + String(epoch) + " ") 73 | info(bar + " ") 74 | print_no_newline(String(((epoch * 100) / num_epochs).to_int()) + "%\n") 75 | print("----------------------------------------\n") 76 | print_no_newline("Average Loss: ") 77 | info(String(avg_loss / every) + "\n") 78 | print_no_newline("Time: ") 79 | info(String(nanoseconds_to_seconds(now() - epoch_start)) + "s\n") 80 | epoch_start = now() 81 | print("\n----------------------------------------\n") 82 | avg_loss = 0.0 83 | 84 | print_no_newline("Total Time: ") 85 | info(String(nanoseconds_to_seconds(now() - initial_start)) + "s\n\n") 86 | -------------------------------------------------------------------------------- /voodoo/__init__.mojo: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Benny-Nottonson/voodoo/0c879f06bf28f01d55e8a5b44fe49d4a4f3115d0/voodoo/__init__.mojo -------------------------------------------------------------------------------- /voodoo/autograd/__init__.mojo: -------------------------------------------------------------------------------- 1 | from .graph import Graph 2 | from .node import Node 3 | -------------------------------------------------------------------------------- /voodoo/autograd/graph.mojo: -------------------------------------------------------------------------------- 1 | from memory import memset_zero, memset 2 | from math import log2, exp2, ceil, round 3 | from tensor import TensorShape 4 | 5 | from voodoo.constants import MEMORY_POOL_SIZE, OP_TUPLE, BINARY_OP, UNARY_OP 6 | from voodoo.autograd.kernels import KERNELS 7 | from voodoo.core import Optimizer 8 | from voodoo.utils import ( 9 | Vector, 10 | get_broadcasted_shape_for_ew_op, 11 | warn, 12 | copy_code, 13 | mmul_code, 14 | conv1d_code, 15 | conv2d_code, 16 | maxpool1d_code, 17 | maxpool2d_code, 18 | dropout_code, 19 | reshape_code, 20 | transp_code, 21 | sum_code, 22 | ) 23 | 24 | 25 | @register_passable("trivial") 26 | struct MemoryPool(Sized): 27 | var _memory_pool: Vector[DTypePointer[DType.float32]] 28 | 29 | fn __init__() -> Self: 30 | return MemoryPool {_memory_pool: Vector[DTypePointer[DType.float32]]()} 31 | 32 | fn __getitem__(self, index: Int) -> DTypePointer[DType.float32]: 33 | return self._memory_pool[index] 34 | 35 | fn __setitem__(inout self, index: Int, value: DTypePointer[DType.float32]): 36 | self._memory_pool[index] = value 37 | 38 | fn __len__(self) -> Int: 39 | return len(self._memory_pool) 40 | 41 | fn free(self): 42 | for i in range(len(self._memory_pool)): 43 | self._memory_pool[i].free() 44 | self._memory_pool.free() 45 | 46 | fn push_back(inout self, value: DTypePointer[DType.float32]): 47 | self._memory_pool.push_back(value) 48 | 49 | 50 | @register_passable("trivial") 51 | struct MemoryPoolManager: 52 | var _memory_pool_manager: StaticTuple[MEMORY_POOL_SIZE, Vector[Int]] 53 | 54 | fn __init__() -> Self: 55 | var memory_pool_manager = StaticTuple[MEMORY_POOL_SIZE, Vector[Int]]() 56 | 57 | @unroll 58 | for i in range(MEMORY_POOL_SIZE): 59 | memory_pool_manager[i] = Vector[Int]() 60 | 61 | return MemoryPoolManager {_memory_pool_manager: memory_pool_manager} 62 | 63 | fn __getitem__(self, index: Int) -> Vector[Int]: 64 | return 
self._memory_pool_manager[index] 65 | 66 | fn __setitem__(inout self, index: Int, value: Vector[Int]): 67 | self._memory_pool_manager[index] = value 68 | 69 | fn __len__(self) -> Int: 70 | return MEMORY_POOL_SIZE 71 | 72 | fn free(self): 73 | @unroll 74 | for i in range(MEMORY_POOL_SIZE): 75 | self._memory_pool_manager[i].free() 76 | 77 | 78 | @register_passable("trivial") 79 | struct Graph: 80 | var _nodes: Vector[Node] 81 | var _memory_pool: MemoryPool 82 | var _memory_pool_manager: MemoryPoolManager 83 | var _free_node_ids: Vector[Int] 84 | var _free_data_ids: Vector[Int] 85 | var _last_node_id: Pointer[Int] 86 | var _grad_nodes_order: Vector[Int] 87 | 88 | fn __init__() -> Self: 89 | var last_node_id = Pointer[Int].alloc(1) 90 | last_node_id.store(-1) 91 | 92 | return Graph { 93 | _nodes: Vector[Node](), 94 | _memory_pool: MemoryPool(), 95 | _memory_pool_manager: MemoryPoolManager(), 96 | _free_node_ids: Vector[Int](), 97 | _free_data_ids: Vector[Int](), 98 | _last_node_id: last_node_id, 99 | _grad_nodes_order: Vector[Int](), 100 | } 101 | 102 | fn get_free_node_id(inout self) raises -> Int: 103 | if len(self._free_node_ids) > 0: 104 | return self._free_node_ids.pop_back() 105 | else: 106 | return len(self._nodes) 107 | 108 | fn get_free_data_id(inout self) raises -> Int: 109 | if len(self._free_data_ids) > 0: 110 | return self._free_data_ids.pop_back() 111 | return len(self._memory_pool) 112 | 113 | fn load_ceiled_cap(self, cap: Int) raises -> Int: 114 | return exp2(ceil(log2(Float32(cap)))).to_int() 115 | 116 | fn get_index(self, cap: Int) raises -> Int: 117 | return ceil(log2(Float32(cap))).to_int() 118 | 119 | fn node[ 120 | checkpoint: Bool, 121 | is_static: Bool, 122 | is_single: Bool, 123 | operator_id: Int, 124 | ]( 125 | inout self, 126 | shape: Vector[Int], 127 | other_params: Vector[Int], 128 | *parents: Node, 129 | ) raises -> Node: 130 | var node = Node( 131 | self.get_free_node_id(), 132 | shape, 133 | is_static, 134 | other_params.copy(), 135 | checkpoint, 136 | operator_id, 137 | is_single, 138 | ) 139 | 140 | for i in range(len(parents)): 141 | var parent = parents[i] 142 | node.push_back_parent(parent.get_id()) 143 | parent.push_back_child(node.get_id()) 144 | parent.set_dependencies(parent.get_dependencies() + 1) 145 | 146 | self.get_free_data(node) 147 | 148 | for i in range(len(parents)): 149 | if parents[i].get_dependencies() == 0: 150 | _ = self.forward_recursive(parents[i]) 151 | 152 | var node_id = node.get_id() 153 | if node_id < len(self._nodes): 154 | self._nodes[node_id] = node 155 | else: 156 | self._nodes.push_back(node) 157 | 158 | return node 159 | 160 | fn get_free_data[unique: Bool = False](inout self, node: Node) raises: 161 | if node.get_data_id() != -1: 162 | return 163 | 164 | var idx = -1 165 | var node_parents = node.get_parents() 166 | var node_cap = node.get_cap() 167 | var node_is_static = node.get_is_static() 168 | var node_checkpoint = node.get_checkpoint() 169 | var node_is_single = node.get_is_single() 170 | var node_ceiled_cap = self.load_ceiled_cap(node_cap) 171 | 172 | if ( 173 | not unique 174 | and not node_is_static 175 | and not node_checkpoint 176 | and not node_is_single 177 | ): 178 | for i in range(len(node_parents)): 179 | var parent = self._nodes[node_parents[i]] 180 | if ( 181 | self.load_ceiled_cap(parent.get_cap()) == node_ceiled_cap 182 | and parent.get_dependencies() == 1 183 | and not parent.get_is_static() 184 | and not parent.get_checkpoint() 185 | and not parent.get_is_single() 186 | ): 187 | 
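# Buffer reuse: a parent with the same ceiled capacity, a single remaining dependency,
# and no static/checkpoint/single flags hands its data slot straight to this node,
# so the node can write its output in place instead of allocating a fresh buffer.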
node.set_data_id(parent.get_data_id()) 188 | node.set_data(self._memory_pool[node.get_data_id()]) 189 | idx = i 190 | break 191 | 192 | for i in range(len(node_parents)): 193 | if i == idx: 194 | continue 195 | else: 196 | var parent = self._nodes[node_parents[i]] 197 | parent.set_dependencies(parent.get_dependencies() - 1) 198 | 199 | if idx == -1: 200 | var mem_pool = self._memory_pool_manager[self.get_index(node_cap)] 201 | if len(mem_pool) > 0: 202 | var data_id = mem_pool.pop_back() 203 | node.set_data_id(data_id) 204 | var ceiled_cap = self.load_ceiled_cap(node_cap) 205 | 206 | node.set_data(self._memory_pool[node.get_data_id()]) 207 | memset_zero(node.get_data(), ceiled_cap) 208 | else: 209 | var data_id = self.get_free_data_id() 210 | node.set_data_id(data_id) 211 | var ceiled_cap = self.load_ceiled_cap(node_cap + 1) 212 | var new_data_ptr = DTypePointer[DType.float32].alloc(ceiled_cap) 213 | if data_id == len(self._memory_pool): 214 | self._memory_pool.push_back(new_data_ptr) 215 | else: 216 | self._memory_pool[data_id] = new_data_ptr 217 | 218 | node.set_data(self._memory_pool[node.get_data_id()]) 219 | memset_zero(node.get_data(), ceiled_cap) 220 | 221 | fn get_free_grad(inout self, node: Node) raises: 222 | if node.get_grad_id() != -1: 223 | return 224 | 225 | var index = self.get_index(node.get_cap()) 226 | var mem_pool = self._memory_pool_manager[index] 227 | if len(mem_pool) > 0: 228 | var grad_id = mem_pool.pop_back() 229 | node.set_grad_id(grad_id) 230 | var ceiled_cap = self.load_ceiled_cap(node.get_cap()) 231 | 232 | node.set_grad(self._memory_pool[node.get_grad_id()]) 233 | memset_zero(node.get_grad(), ceiled_cap) 234 | else: 235 | var grad_id = self.get_free_data_id() 236 | node.set_grad_id(grad_id) 237 | var ceiled_cap = self.load_ceiled_cap(node.get_cap()) 238 | var new_grad_ptr = DTypePointer[DType.float32].alloc(ceiled_cap) 239 | if grad_id == len(self._memory_pool): 240 | self._memory_pool.push_back(new_grad_ptr) 241 | else: 242 | self._memory_pool[grad_id] = new_grad_ptr 243 | 244 | node.set_grad(self._memory_pool[node.get_grad_id()]) 245 | memset_zero(node.get_grad(), ceiled_cap) 246 | 247 | fn release_data[forced: Bool = False](self, node: Node) raises: 248 | if node.get_is_static() or node.get_data_id() == -1: 249 | return 250 | 251 | @parameter 252 | if not forced: 253 | if node.get_checkpoint() or node.get_is_single(): 254 | return 255 | 256 | @parameter 257 | if forced: 258 | var index = self.get_index(node.get_cap()) 259 | var data_id = node.get_data_id() 260 | var mem_pool = self._memory_pool_manager[index] 261 | mem_pool.push_back(data_id) 262 | node.set_data_id(-1) 263 | node.set_dependencies(len(node.get_children())) 264 | node.set_computed(False) 265 | return 266 | 267 | if node.get_dependencies() == 0: 268 | var index = self.get_index(node.get_cap()) 269 | var data_id = node.get_data_id() 270 | var mem_pool = self._memory_pool_manager[index] 271 | mem_pool.push_back(data_id) 272 | node.set_data_id(-1) 273 | node.set_dependencies(len(node.get_children())) 274 | node.set_computed(False) 275 | 276 | fn release_grad_forced(self, node: Node) raises: 277 | if node.get_is_static() or node.get_grad_id() == -1: 278 | return 279 | var index = self.get_index(node.get_cap()) 280 | var grad_id = node.get_grad_id() 281 | var mem_pool = self._memory_pool_manager[index] 282 | mem_pool.push_back(grad_id) 283 | node.set_grad_id(-1) 284 | node.set_grad_computed(False) 285 | 286 | fn clear_cache(inout self, reset_static_nodes: Bool = False) raises: 287 | var dt_null = 
DTypePointer[DType.float32].get_null() 288 | 289 | if self._last_node_id.load() != -1: 290 | self.release_data[True](self._nodes[self._last_node_id.load()]) 291 | 292 | for i in range(len(self._nodes) - 1): 293 | if self._nodes[i].get_data_id() == -1: 294 | continue 295 | 296 | for j in range(i + 1, len(self._nodes)): 297 | if self._nodes[i].get_id() == self._nodes[j].get_id(): 298 | self._nodes[i].set_data_id(-1) 299 | self._nodes[i].set_grad_id(-1) 300 | break 301 | 302 | for i in range(len(self._memory_pool) - 1): 303 | for j in range(i + 1, len(self._memory_pool)): 304 | if self._memory_pool[i] == self._memory_pool[j]: 305 | self._memory_pool[i] = dt_null 306 | 307 | var deletable_data = Vector[Bool](len(self._memory_pool)) 308 | memset(deletable_data._data, True, len(deletable_data)) 309 | 310 | for i in range(len(self._nodes)): 311 | var node = self._nodes[i] 312 | var data_id = node.get_data_id() 313 | 314 | if node.get_is_static() and data_id != -1: 315 | deletable_data[data_id] = False 316 | if node.get_grad_id() != -1: 317 | deletable_data[node.get_grad_id()] = False 318 | 319 | for i in range(len(deletable_data)): 320 | if deletable_data[i] and not self._memory_pool[i] == dt_null: 321 | self._memory_pool[i].free() 322 | deletable_data.free() 323 | 324 | for i in range(len(self._nodes) - 1, -1, -1): 325 | var node = self._nodes[i] 326 | if node.get_data_id() == -1: 327 | continue 328 | 329 | if not node.get_is_static(): 330 | self._free_node_ids.push_back(node.get_id()) 331 | node.free() 332 | else: 333 | node.clear_children() 334 | node.clear_parents() 335 | node.set_dependencies(0) 336 | node.set_id(0) 337 | node.set_data_id(0) 338 | node.set_grad_id(0) 339 | 340 | fn free(self): 341 | self._nodes.free() 342 | self._memory_pool.free() 343 | self._memory_pool_manager.free() 344 | self._free_node_ids.free() 345 | self._free_data_ids.free() 346 | self._last_node_id.free() 347 | self._grad_nodes_order.free() 348 | 349 | fn forward_recursive(inout self, node: Node) raises -> Node: 350 | if node.get_computed(): 351 | return node 352 | 353 | var operator_id = node.get_operator_id() 354 | var parents = node.get_parents() 355 | var num_parents = len(parents) 356 | 357 | if num_parents == 1: 358 | var parent_node = self.forward_recursive(self._nodes[parents[0]]) 359 | self.get_free_data(node) 360 | KERNELS.get(operator_id).get[0, UNARY_OP]()(node, parent_node) 361 | self.release_data(parent_node) 362 | else: 363 | var parent1 = self.forward_recursive(self._nodes[parents[0]]) 364 | var parent2 = self.forward_recursive(self._nodes[parents[1]]) 365 | self.get_free_data(node) 366 | KERNELS.get(operator_id).get[1, BINARY_OP]()(node, parent1, parent2) 367 | self.release_data(parent1) 368 | self.release_data(parent2) 369 | 370 | node.set_computed(True) 371 | 372 | return node 373 | 374 | fn forward(inout self, node: Node) raises -> Node: 375 | self._last_node_id.store(node.get_id()) 376 | return self.forward_recursive(node) 377 | 378 | fn forward_static(inout self, node: Node) raises -> Node: 379 | self.release_data[True](node) 380 | 381 | for i in range(len(self._nodes)): 382 | var node = self._nodes[i] 383 | if node.get_is_single(): 384 | continue 385 | 386 | if not node.get_is_static(): 387 | node.set_computed(False) 388 | node.set_grad_id(-1) 389 | node.set_data_id(-1) 390 | node.set_dependencies(len(node.get_children())) 391 | 392 | _ = self.forward_recursive(node) 393 | 394 | return self._nodes[self._last_node_id.load()] 395 | 396 | fn forward_recursive_graph_slice(inout self, node: Node) 
raises -> Node: 397 | if node.get_computed(): 398 | return node 399 | 400 | var operator_id = node.get_operator_id() 401 | var parents = node.get_parents() 402 | var num_parents = len(parents) 403 | 404 | if num_parents == 1: 405 | var parent1 = self.forward_recursive_graph_slice(self._nodes[parents[0]]) 406 | self.get_free_data[True](node) 407 | KERNELS.get(operator_id).get[0, UNARY_OP]()(node, parent1) 408 | else: 409 | var parent1 = self.forward_recursive_graph_slice(self._nodes[parents[0]]) 410 | var parent2 = self.forward_recursive_graph_slice(self._nodes[parents[1]]) 411 | self.get_free_data[True](node) 412 | KERNELS.get(operator_id).get[1, BINARY_OP]()(node, parent1, parent2) 413 | 414 | node.set_computed(True) 415 | 416 | return node 417 | 418 | fn backward_recursive(inout self, node: Node) raises -> Node: 419 | if node.get_grad_computed(): 420 | return node 421 | 422 | var children = node.get_children() 423 | 424 | for i in range(len(children)): 425 | var child = self._nodes[children[i]] 426 | var grad_operator_id = child.get_grad_operator_id() 427 | var child_parents = child.get_parents() 428 | 429 | _ = self.backward_recursive(child) 430 | 431 | if len(child_parents) == 1: 432 | var parent1 = self._nodes[child_parents[0]] 433 | _ = self.forward_recursive_graph_slice(parent1) 434 | 435 | if parent1.get_grad_id() == -1: 436 | self.get_free_grad(parent1) 437 | 438 | parent1.set_grad_computed(True) 439 | 440 | KERNELS.get(grad_operator_id).get[0, UNARY_OP]()(child, parent1) 441 | else: 442 | var parent1 = self._nodes[child_parents[0]] 443 | var parent2 = self._nodes[child_parents[1]] 444 | 445 | _ = self.forward_recursive_graph_slice(parent1) 446 | _ = self.forward_recursive_graph_slice(parent2) 447 | 448 | if parent1.get_grad_id() == -1: 449 | self.get_free_grad(parent1) 450 | if parent2.get_grad_id() == -1: 451 | self.get_free_grad(parent2) 452 | 453 | parent1.set_grad_computed(True) 454 | parent2.set_grad_computed(True) 455 | 456 | KERNELS.get(grad_operator_id).get[1, BINARY_OP]()( 457 | child, parent1, parent2 458 | ) 459 | 460 | if child.get_id() != self._last_node_id.load(): 461 | self.release_data[True](child) 462 | self.release_grad_forced(child) 463 | 464 | return node 465 | 466 | fn find_grad_nodes_order(inout self, node: Node) raises: 467 | self._grad_nodes_order.clear() 468 | for i in range(len(self._nodes)): 469 | self._nodes[i].set_tmp_visited(False) 470 | 471 | var backward = Vector[Int]() 472 | backward.push_back(node.get_id()) 473 | var it = 0 474 | while it < len(backward): 475 | var currId = backward[it] 476 | var curr = self._nodes[currId] 477 | for i in range(len(curr.get_parents())): 478 | var parId = curr.get_parents()[i] 479 | if not self._nodes[parId].get_tmp_visited(): 480 | backward.push_back(parId) 481 | if curr.get_is_static() or curr.get_checkpoint(): 482 | self._grad_nodes_order.push_back(currId) 483 | self._nodes[currId].set_tmp_visited(True) 484 | it += 1 485 | 486 | fn backward(inout self, node: Node) raises: 487 | var new_last_node_id = node.get_id() 488 | 489 | self.find_grad_nodes_order(node) 490 | self._last_node_id.store(new_last_node_id) 491 | 492 | for i in range(len(self._nodes)): 493 | var node = self._nodes[i] 494 | node.set_grad_computed(False) 495 | 496 | if node.get_is_single() or node.get_id() == new_last_node_id: 497 | continue 498 | 499 | if not node.get_is_static(): 500 | node.set_grad_id(-1) 501 | if not node.get_checkpoint(): 502 | node.set_computed(False) 503 | node.set_data_id(-1) 504 | else: 505 | if node.get_grad_id() != -1: 
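# This gradient buffer is kept across passes, so zero it out
# before the new backward pass starts accumulating into it.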
506 | memset_zero( 507 | node.get_grad(), 508 | self.load_ceiled_cap(node.get_cap()), 509 | ) 510 | 511 | self.get_free_grad(node) 512 | node.fill_grad(1.0) 513 | node.set_grad_computed(True) 514 | for i in range(len(self._grad_nodes_order)): 515 | _ = self.backward_recursive(self._nodes[self._grad_nodes_order[i]]) 516 | 517 | fn optimizer_step[optimizer: Optimizer](self): 518 | optimizer.step(self._nodes) 519 | 520 | fn copy(inout self, parent1: Node) raises -> Node: 521 | return self.node[False, True, False, copy_code]( 522 | parent1.get_shape().copy(), 523 | Vector[Int](), 524 | parent1, 525 | ) 526 | 527 | fn mmul(inout self, a: Node, b: Node) raises -> Node: 528 | var shape = get_broadcasted_shape_for_ew_op(a, b) 529 | var a_dims = a.get_num_dims() 530 | var b_dims = b.get_num_dims() 531 | shape[len(shape) - 2] = a.get_shape().copy()[a_dims - 2] 532 | shape[len(shape) - 1] = b.get_shape().copy()[b_dims - 1] 533 | if a.get_shape()[a_dims - 1] != b.get_shape()[b_dims - 2]: 534 | raise "Shapes don't fit for matrix multiplication. Got shapes: " + str( 535 | a.get_shape()[a_dims - 1] 536 | ) + " " + str(b.get_shape()[b_dims - 2]) 537 | 538 | var other_params = Vector[Int]() 539 | 540 | return self.node[True, False, False, mmul_code](shape, other_params, a, b) 541 | 542 | fn conv_1d( 543 | inout self, 544 | a: Node, 545 | b: Node, 546 | padding: Int, 547 | stride: Int, 548 | ) raises -> Node: 549 | var batch_size = a.get_shape()[0] 550 | var channels = a.get_shape()[1] 551 | var input_width = a.get_shape()[2] 552 | var kernel_width = b.get_shape()[1] 553 | 554 | var shape = TensorShape( 555 | batch_size, 556 | channels, 557 | (input_width - kernel_width + 2 * padding) // stride + 1, 558 | ) 559 | 560 | var other_params = Vector[Int]() 561 | other_params.push_back(padding) 562 | other_params.push_back(stride) 563 | 564 | return self.node[True, False, False, conv1d_code](shape, other_params, a, b) 565 | 566 | fn conv_2d( 567 | inout self, 568 | a: Node, 569 | b: Node, 570 | padding: StaticIntTuple[2], 571 | stride: StaticIntTuple[2], 572 | ) raises -> Node: 573 | var batch_size = a.get_shape()[0] 574 | var channels = a.get_shape()[1] 575 | var input_width = a.get_shape()[2] 576 | var input_height = a.get_shape()[3] 577 | var kernel_width = b.get_shape()[1] 578 | var kernel_height = b.get_shape()[2] 579 | 580 | var shape = TensorShape( 581 | batch_size, 582 | channels, 583 | (input_width - kernel_width + 2 * padding[0]) // stride[0] + 1, 584 | (input_height - kernel_height + 2 * padding[1]) // stride[1] + 1, 585 | ) 586 | 587 | var other_params = Vector[Int]() 588 | other_params.push_back(padding[0]) 589 | other_params.push_back(padding[1]) 590 | other_params.push_back(stride[0]) 591 | other_params.push_back(stride[1]) 592 | 593 | return self.node[True, False, False, conv2d_code](shape, other_params, a, b) 594 | 595 | fn maxpool_1d( 596 | inout self, 597 | a: Node, 598 | kernel_size: Int, 599 | stride: Int, 600 | padding: Int, 601 | ) raises -> Node: 602 | var other_params = Vector[Int]() 603 | other_params.push_back(kernel_size) 604 | other_params.push_back(stride) 605 | other_params.push_back(padding) 606 | 607 | var shape = TensorShape( 608 | a.get_shape()[0], 609 | a.get_shape()[1], 610 | (a.get_shape()[2] - kernel_size + 2 * padding) // stride + 1, 611 | ) 612 | 613 | return self.node[True, False, False, maxpool1d_code](shape, other_params, a) 614 | 615 | fn maxpool_2d( 616 | inout self, 617 | a: Node, 618 | kernel_size: StaticIntTuple[2], 619 | stride: Int, 620 | padding: Int, 621 | ) 
raises -> Node: 622 | var other_params = Vector[Int]() 623 | other_params.push_back(kernel_size[0]) 624 | other_params.push_back(kernel_size[1]) 625 | other_params.push_back(stride) 626 | other_params.push_back(padding) 627 | 628 | var shape = TensorShape( 629 | a.get_shape()[0], 630 | a.get_shape()[1], 631 | (a.get_shape()[2] - kernel_size[0] + 2 * padding) // stride + 1, 632 | (a.get_shape()[3] - kernel_size[1] + 2 * padding) // stride + 1, 633 | ) 634 | 635 | return self.node[True, False, False, maxpool2d_code](shape, other_params, a) 636 | 637 | fn dropout( 638 | inout self, a: Node, dropout_rate: Float32, noise_shape: TensorShape 639 | ) raises -> Node: 640 | return self.node[False, False, False, dropout_code]( 641 | a.get_shape().copy(), 642 | Vector[Int](), 643 | a, 644 | ) 645 | 646 | fn reshape(inout self, parent1: Node, shape: Vector[Int]) raises -> Node: 647 | return self.node[False, False, False, reshape_code]( 648 | shape, Vector[Int](), parent1 649 | ) 650 | 651 | fn transp(inout self, parent1: Node) raises -> Node: 652 | var old_shape = parent1.get_shape().copy() 653 | 654 | return self.node[False, False, False, transp_code]( 655 | TensorShape(old_shape[len(old_shape) - 1], old_shape[len(old_shape) - 2]), 656 | Vector[Int](), 657 | parent1, 658 | ) 659 | 660 | fn sum(inout self, parent1: Node) raises -> Node: 661 | return self.node[False, False, False, sum_code]( 662 | TensorShape(1), Vector[Int](), parent1 663 | ) 664 | 665 | fn function_general[operator_id: Int](inout self, parent1: Node) raises -> Node: 666 | return self.node[False, False, False, operator_id]( 667 | parent1.get_shape().copy(), 668 | Vector[Int](), 669 | parent1, 670 | ) 671 | 672 | fn arithmetic_general[ 673 | operator_id: Int 674 | ](inout self, a: Node, b: Node) raises -> Node: 675 | return self.node[False, False, False, operator_id]( 676 | get_broadcasted_shape_for_ew_op(a, b), 677 | Vector[Int](), 678 | a, 679 | b, 680 | ) 681 | 682 | fn activation_general[ 683 | operator_id: Int, 684 | arg1: Float32 = 0.0, 685 | ](inout self, parent1: Node) raises -> Node: 686 | var other_params = Vector[Int]() 687 | other_params.push_back(round(arg1 * 1000000.0).to_int()) 688 | return self.node[False, False, False, operator_id]( 689 | parent1.get_shape().copy(), 690 | other_params, 691 | parent1, 692 | ) 693 | 694 | fn loss_general[ 695 | operator_id: Int 696 | ](inout self, parent1: Node, parent2: Node) raises -> Node: 697 | return self.node[False, False, False, operator_id]( 698 | TensorShape(1), 699 | Vector[Int](), 700 | parent1, 701 | parent2, 702 | ) 703 | 704 | fn fuse_graphs( 705 | inout self: Graph, 706 | other_graph: Graph, 707 | remove_other: Bool = False, 708 | ) raises: 709 | var num_nodes = len(self._nodes) 710 | var memory_pool_len = len(self._memory_pool) 711 | 712 | for i in range(len(other_graph._nodes)): 713 | var node = other_graph._nodes[i] 714 | node.set_id(node.get_id() + num_nodes) 715 | for j in range(len(node.get_children())): 716 | node.get_children()[j] = node.get_children()[j] + num_nodes 717 | for j in range(len(node.get_parents())): 718 | node.get_parents()[j] = node.get_parents()[j] + num_nodes 719 | node.set_data_id(node.get_data_id() + memory_pool_len) 720 | self._nodes.push_back(node) 721 | 722 | for i in range(len(other_graph._memory_pool)): 723 | self._memory_pool.push_back(other_graph._memory_pool[i]) 724 | 725 | @unroll 726 | for i in range(MEMORY_POOL_SIZE): 727 | var mem_pool_len = len(other_graph._memory_pool_manager[i]) 728 | for j in range(mem_pool_len): 729 | 
self._memory_pool_manager[i].push_back( 730 | other_graph._memory_pool_manager[i][j] + memory_pool_len 731 | ) 732 | 733 | var free_node_ids_len = len(self._free_node_ids) 734 | for i in range(free_node_ids_len): 735 | self._free_node_ids.push_back(other_graph._free_node_ids[i] + num_nodes) 736 | 737 | var free_data_ids_len = len(self._free_data_ids) 738 | for i in range(free_data_ids_len): 739 | self._free_data_ids.push_back( 740 | other_graph._free_data_ids[i] + memory_pool_len 741 | ) 742 | 743 | if remove_other: 744 | other_graph.free() 745 | -------------------------------------------------------------------------------- /voodoo/autograd/kernels/__init__.mojo: -------------------------------------------------------------------------------- 1 | from .kernels import KERNELS 2 | -------------------------------------------------------------------------------- /voodoo/autograd/kernels/activations.mojo: -------------------------------------------------------------------------------- 1 | from math import exp, log, abs, tanh, cosh, erf 2 | 3 | from voodoo.constants import F32_MAX, NELTS 4 | from voodoo.autograd.kernels.generics import GenericActivation 5 | 6 | 7 | trait Activation: 8 | ... 9 | 10 | 11 | @register_passable("trivial") 12 | struct Relu[arg1: Float32, arg2: Float32, arg3: Float32](Activation): 13 | alias fw = GenericActivation[relu_fw_vec, relu_bw_vec, arg1, arg2, arg3].fw 14 | alias bw = GenericActivation[relu_fw_vec, relu_bw_vec, arg1, arg2, arg3].bw 15 | 16 | 17 | @register_passable("trivial") 18 | struct Sigmoid[](Activation): 19 | alias fw = GenericActivation[sigmoid_fw_vec, sigmoid_bw_vec, 0.0, 0.0, 0.0].fw 20 | alias bw = GenericActivation[sigmoid_fw_vec, sigmoid_bw_vec, 0.0, 0.0, 0.0].bw 21 | 22 | 23 | @register_passable("trivial") 24 | struct Softplus[](Activation): 25 | alias fw = GenericActivation[softplus_fw_vec, softplus_bw_vec, 0.0, 0.0, 0.0].fw 26 | alias bw = GenericActivation[softplus_fw_vec, softplus_bw_vec, 0.0, 0.0, 0.0].bw 27 | 28 | 29 | @register_passable("trivial") 30 | struct Softsign[](Activation): 31 | alias fw = GenericActivation[softsign_fw_vec, softsign_bw_vec, 0.0, 0.0, 0.0].fw 32 | alias bw = GenericActivation[softsign_fw_vec, softsign_bw_vec, 0.0, 0.0, 0.0].bw 33 | 34 | 35 | @register_passable("trivial") 36 | struct Tanh[](Activation): 37 | alias fw = GenericActivation[tanh_fw_vec, tanh_bw_vec, 0.0, 0.0, 0.0].fw 38 | alias bw = GenericActivation[tanh_fw_vec, tanh_bw_vec, 0.0, 0.0, 0.0].bw 39 | 40 | 41 | @register_passable("trivial") 42 | struct Selu[](Activation): 43 | alias fw = GenericActivation[selu_fw_vec, selu_bw_vec, 0.0, 0.0, 0.0].fw 44 | alias bw = GenericActivation[selu_fw_vec, selu_bw_vec, 0.0, 0.0, 0.0].bw 45 | 46 | 47 | @register_passable("trivial") 48 | struct Elu[alpha: Float32](Activation): 49 | alias fw = GenericActivation[elu_fw_vec, elu_bw_vec, 0.0, 0.0, alpha].fw 50 | alias bw = GenericActivation[elu_fw_vec, elu_bw_vec, 0.0, 0.0, alpha].bw 51 | 52 | 53 | @register_passable("trivial") 54 | struct Exp[](Activation): 55 | alias fw = GenericActivation[exp_vec, exp_vec, 0.0, 0.0, 0.0].fw 56 | alias bw = GenericActivation[exp_vec, exp_vec, 0.0, 0.0, 0.0].bw 57 | 58 | 59 | @register_passable("trivial") 60 | struct LeakyRelu[alpha: Float32](Activation): 61 | alias fw = GenericActivation[relu_fw_vec, relu_bw_vec, alpha, F32_MAX, 0.0].fw 62 | alias bw = GenericActivation[relu_fw_vec, relu_bw_vec, alpha, F32_MAX, 0.0].bw 63 | 64 | 65 | @register_passable("trivial") 66 | struct Relu6[](Activation): 67 | alias fw = 
GenericActivation[relu_fw_vec, relu_bw_vec, 0.0, 6.0, 0.0].fw 68 | alias bw = GenericActivation[relu_fw_vec, relu_bw_vec, 0.0, 6.0, 0.0].bw 69 | 70 | 71 | @register_passable("trivial") 72 | struct Silu[](Activation): 73 | alias fw = GenericActivation[silu_fw_vec, silu_bw_vec, 0.0, 0.0, 0.0].fw 74 | alias bw = GenericActivation[silu_fw_vec, silu_bw_vec, 0.0, 0.0, 0.0].bw 75 | 76 | 77 | @register_passable("trivial") 78 | struct Gelu[approximate: Float32](Activation): 79 | alias fw = GenericActivation[gelu_fw_vec, gelu_bw_vec, approximate, 0.0, 0.0].fw 80 | alias bw = GenericActivation[gelu_fw_vec, gelu_bw_vec, approximate, 0.0, 0.0].bw 81 | 82 | 83 | @register_passable("trivial") 84 | struct HardSigmoid[](Activation): 85 | alias fw = GenericActivation[hsig_fw_vec, hsig_bw_vec, 0.0, 0.0, 0.0].fw 86 | alias bw = GenericActivation[hsig_fw_vec, hsig_bw_vec, 0.0, 0.0, 0.0].bw 87 | 88 | 89 | @register_passable("trivial") 90 | struct Linear[](Activation): 91 | alias fw = GenericActivation[linear_fw_vec, linear_bw_vec, 0.0, 0.0, 0.0].fw 92 | alias bw = GenericActivation[linear_fw_vec, linear_bw_vec, 0.0, 0.0, 0.0].bw 93 | 94 | 95 | @register_passable("trivial") 96 | struct Mish[](Activation): 97 | alias fw = GenericActivation[mish_fw_vec, mish_bw_vec, 0.0, 0.0, 0.0].fw 98 | alias bw = GenericActivation[mish_fw_vec, mish_bw_vec, 0.0, 0.0, 0.0].bw 99 | 100 | 101 | fn relu_fw_vec[ 102 | NELTS: Int, 103 | negative_slope: Float32 = 0.0, 104 | max_value: Float32 = F32_MAX, 105 | threshold: Float32 = 0.0, 106 | ](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 107 | # f(x) = x > threshold ? (x > max_value ? max_value : x) : negative_slope * x 108 | # Best is 4 instructions (compare, select, mul, min), 2 if max == F32_MAX and slope == 0 109 | @parameter 110 | if negative_slope == 0.0 and max_value == F32_MAX: 111 | return (x > threshold).select(x, 0.0) 112 | return (x > threshold).select(x, negative_slope * x).min(max_value) 113 | 114 | 115 | fn relu_bw_vec[ 116 | NELTS: Int, 117 | negative_slope: Float32 = 0.0, 118 | max_value: Float32 = F32_MAX, 119 | threshold: Float32 = 0.0, 120 | ](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 121 | # f'(x) = x > threshold ? (x > max_value ? 
0 : 1) : negative_slope 122 | # Best is 4 instructions (compare, select, compare, select), 2 max == F32_MAX and slope == 0 123 | @parameter 124 | if negative_slope == 0.0 and max_value == F32_MAX: 125 | return (x > threshold).select[DType.float32](1.0, 0.0) 126 | return (x < max_value).select((x > threshold).select(1.0, negative_slope), 0.0) 127 | 128 | 129 | fn sigmoid_fw_vec[ 130 | NELTS: Int, arg1: Float32, arg2: Float32, arg3: Float32 131 | ](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 132 | # f(x) = 1 / (1 + e^-x) 133 | # Best is 3 instructions (exp, add, div) 134 | return 1.0 / (1.0 + exp(-x)) 135 | 136 | 137 | fn sigmoid_bw_vec[ 138 | NELTS: Int, arg1: Float32, arg2: Float32, arg3: Float32 139 | ](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 140 | # f'(x) = e^x / (1 + e^x)^2 141 | # Best is 6 instructions (exp, div, fma, exp, mul, add) 142 | var e_x = (exp(x)) 143 | return e_x / (1.0 + e_x) ** 2 144 | 145 | 146 | fn softplus_fw_vec[ 147 | NELTS: Int, arg1: Float32, arg2: Float32, arg3: Float32 148 | ](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 149 | # f(x) = log(1 + e^x) 150 | # Best is 3 instructions (exp, add, log) 151 | return log(1.0 + exp(x)) 152 | 153 | 154 | fn softplus_bw_vec[ 155 | NELTS: Int, arg1: Float32, arg2: Float32, arg3: Float32 156 | ](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 157 | # f'(x) = e^x / (1 + e^x) 158 | # Best is 3 instructions (exp, add, div) 159 | var e_x = (exp(x)) 160 | return e_x / (1.0 + e_x) 161 | 162 | 163 | fn softsign_fw_vec[ 164 | NELTS: Int, arg1: Float32, arg2: Float32, arg3: Float32 165 | ](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 166 | # f(x) = x / (1 + |x|) 167 | # Best is 3 instructions (abs, add, div) 168 | return x / (1.0 + abs(x)) 169 | 170 | 171 | fn softsign_bw_vec[ 172 | NELTS: Int, arg1: Float32, arg2: Float32, arg3: Float32 173 | ](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 174 | # f'(x) = 1 / (1 + |x|)^2 175 | # Simplifies to 1 / (1 + x^2 + 2|x|) 176 | # Best is 4 instructions (div, abs, fma, fma) 177 | return 1.0 / abs(x).fma(2.0, x.fma(x, 1.0)) 178 | 179 | 180 | fn tanh_fw_vec[ 181 | NELTS: Int, arg1: Float32, arg2: Float32, arg3: Float32 182 | ](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 183 | # f(x) = tanh(x) 184 | # Best is 1 instruction (tanh) 185 | return tanh(x) 186 | 187 | 188 | fn tanh_bw_vec[ 189 | NELTS: Int, arg1: Float32, arg2: Float32, arg3: Float32 190 | ](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 191 | # f'(x) = 1 / cosh(x)^2 192 | # Best is 3 instructions (cosh, pow, div) 193 | return 1.0 / cosh(x) ** 2 194 | 195 | 196 | fn selu_fw_vec[ 197 | NELTS: Int, arg1: Float32, arg2: Float32, arg3: Float32 198 | ](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 199 | # f(x) = x > 0 ? 1.05070098 * x : 1.05070098 * 1.67326324 * (e^x - 1) 200 | # Best is 5 instructions (compare, select, mul, exp, fma) 201 | return (x > 0.0).select(1.05070098 * x, exp(x).fma(1.75809932607, -1.75809932607)) 202 | 203 | 204 | fn selu_bw_vec[ 205 | NELTS: Int, arg1: Float32, arg2: Float32, arg3: Float32 206 | ](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 207 | # f'(x) = x > 0 ? 
1.05070098 : 1.05070098 * 1.67326324 * e^x 208 | # Best is 4 instructions (compare, select, mul, exp) 209 | return (x > 0.0).select(1.05070098, 1.75809932607 * exp(x)) 210 | 211 | 212 | fn elu_fw_vec[ 213 | NELTS: Int, 214 | arg1: Float32, 215 | arg2: Float32, 216 | alpha: Float32 = 1.0, 217 | ](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 218 | # f(x) = x > 0 ? x : alpha * (e^x - 1) 219 | # Best is 5 instructions (compare, select, mul, exp, sub), 4 if alpha == 1 220 | @parameter 221 | if alpha == 1.0: 222 | return (x > 0.0).select(x, exp(x) - 1.0) 223 | return (x > 0.0).select(x, alpha * (exp(x) - 1.0)) 224 | 225 | 226 | fn elu_bw_vec[ 227 | NELTS: Int, 228 | arg1: Float32, 229 | arg2: Float32, 230 | alpha: Float32 = 1.0, 231 | ](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 232 | # f'(x) = x > 0 ? 1 : alpha * e^x 233 | # Best is 4 instructions (compare, select, mul, exp), 3 if alpha == 1 234 | @parameter 235 | if alpha == 1.0: 236 | return (x > 0.0).select(1.0, exp(x)) 237 | return (x > 0.0).select(1.0, alpha * exp(x)) 238 | 239 | 240 | fn exp_vec[ 241 | NELTS: Int, arg1: Float32, arg2: Float32, arg3: Float32 242 | ](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 243 | # f(x) = e^x 244 | # Best is 1 instruction (exp) 245 | return exp(x) 246 | 247 | 248 | fn silu_fw_vec[ 249 | NELTS: Int, arg1: Float32, arg2: Float32, arg3: Float32 250 | ](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 251 | # f(x) = x / (1 + e^-x) 252 | # Best is 4 instructions (div, add, exp, inverse) 253 | return x / (1.0 + exp(-x)) 254 | 255 | 256 | fn silu_bw_vec[ 257 | NELTS: Int, arg1: Float32, arg2: Float32, arg3: Float32 258 | ](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 259 | # f'(x) = (e^x * x + e^x + e^2x) / (e^x + 1)^2 260 | # Best is 8 instructions (exp, fma, add, exp, mul, div, add, pow) 261 | var e_x = exp(x) 262 | return (e_x.fma(x, e_x) + exp(2.0 * x)) / (e_x + 1.0) ** 2 263 | 264 | 265 | fn gelu_fw_vec[ 266 | NELTS: Int, 267 | arg1: Float32, 268 | arg2: Float32, 269 | approximate: Float32 = 0.0, 270 | ](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 271 | # f(x) when approximate == 0.0 = 0.5 * x * (1 + erf(x / sqrt(2))) 272 | # f(x) when approximate != 0.0 = 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3))) 273 | # Best is 6 instructions (mul, tanh, fma, mul, pow, fma), 4 if approximate == 0 274 | var x_05 = x * 0.5 275 | 276 | @parameter 277 | if approximate == 0.0: 278 | return erf(x / 1.4142135623730951).fma(x_05, x_05) 279 | return tanh(x.fma(0.7978845608028654, 0.03567740813 * x**3)).fma(x_05, x_05) 280 | 281 | 282 | fn gelu_bw_vec[ 283 | NELTS: Int, 284 | arg1: Float32, 285 | arg2: Float32, 286 | approximate: Float32 = 0.0, 287 | ](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 288 | # f'(x) when approximate == 0.0 = 0.5 * (erf(0.7071067811865475 * x) + 1) + 0.3989422804014327 * x * exp(-0.5 * x^2) 289 | # f'(x) when approximate != 0.0 = 0.5 * (1 + tanh(0.7978845608028654 * (x + 0.044715 * x^3))^2) + 0.7978845608028654 * x * (1 - tanh(0.7978845608028654 * (x + 0.044715 * x^3))^2) 290 | # Best is 7 instructions (tanh, fma, fma, mul, mul, sub, pow), 7 if approximate == 0 291 | @parameter 292 | if approximate == 0.0: 293 | return x.fma( 294 | 0.3989422804014327 * exp(-0.5 * x**2), 295 | erf(0.7071067811865475 * x).fma(0.5, 0.5), 296 | ) 297 | var tanh_x = tanh(x.fma(0.7978845608028654, 0.03567740813 * x**3)) 298 | return tanh_x.fma(tanh_x, 1.0).fma( 299 | 0.5, 0.7978845608028654 * x 
* (1.0 - tanh_x**2) 300 | ) 301 | 302 | 303 | fn hsig_fw_vec[ 304 | NELTS: Int, 305 | arg1: Float32, 306 | arg2: Float32, 307 | arg3: Float32, 308 | ](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 309 | # f(x) = x > 2.5 ? 1 : x < -2.5 ? 0 : 0.2 * x + 0.5 310 | # Best is 5 instructions (compare, select, compare, select, fma) 311 | return (x > 2.5).select(1.0, (x > -2.5).select(x.fma(0.2, 0.5), 0.0)) 312 | 313 | 314 | fn hsig_bw_vec[ 315 | NELTS: Int, 316 | arg1: Float32, 317 | arg2: Float32, 318 | arg3: Float32, 319 | ](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 320 | # f'(x) = x > -2.5 ? x < 2.5 ? 0.2 : 0 : 0 321 | # Best is 5 instructions (compare, and, compare, cast, mul) 322 | return ((x > -2.5) & (x < 2.5)).select[DType.float32](0.2, 0.0) 323 | 324 | 325 | fn linear_fw_vec[ 326 | NELTS: Int, 327 | arg1: Float32, 328 | arg2: Float32, 329 | arg3: Float32, 330 | ](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 331 | # f(x) = x 332 | # Best is 1 instruction (mov) 333 | return x 334 | 335 | 336 | fn linear_bw_vec[ 337 | NELTS: Int, 338 | arg1: Float32, 339 | arg2: Float32, 340 | arg3: Float32, 341 | ](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 342 | # f'(x) = 1 343 | # Best is 1 instruction (mov) 344 | return 1.0 345 | 346 | 347 | fn mish_fw_vec[ 348 | NELTS: Int, 349 | arg1: Float32, 350 | arg2: Float32, 351 | arg3: Float32, 352 | ](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 353 | # f(x) = x * tanh(log(1 + e^x)) 354 | # Best is 5 instructions (mul, tanh, log, add, exp) 355 | return x * tanh(log(1.0 + exp(x))) 356 | 357 | 358 | fn mish_bw_vec[ 359 | NELTS: Int, 360 | arg1: Float32, 361 | arg2: Float32, 362 | arg3: Float32, 363 | ](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 364 | # f'(x) = tanh(log(exp(x) + 1)) + (x * exp(x) * (1 / cosh(ln(exp(x) + 1)) ^ 2)) / (exp(x) + 1) 365 | # Best is 14 instructions (exp, tanh, log, add, add, mul, mul, div, cosh, log, add, pow, div, add) 366 | var e_x = exp(x) 367 | return tanh(log(e_x + 1)) + (x * e_x * (1 / cosh(log(e_x + 1)) ** 2)) / (e_x + 1) 368 | -------------------------------------------------------------------------------- /voodoo/autograd/kernels/arithmetic.mojo: -------------------------------------------------------------------------------- 1 | from math import sqrt, exp2, exp, log2, sin, cos, tan, log, asin, acos, atan, sinh, cosh 2 | 3 | from voodoo.autograd.kernels.generics import GenericArithmetic, GenericBinaryArithmetic 4 | 5 | 6 | trait Aritmetic: 7 | ... 
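# Each op in this file pairs a forward SIMD kernel with its derivative kernel(s)
# and wraps the pair in GenericArithmetic (unary) or GenericBinaryArithmetic
# (binary), which handle elementwise application and gradient accumulation.
# As a minimal sketch of that extension pattern, a hypothetical negation op
# (`Neg` is not part of this codebase and is shown only for illustration)
# would look like this:

fn neg_fw_vec[NELTS: Int](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]:
    # f(x) = -x
    return -x


fn neg_bw_vec[NELTS: Int](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]:
    # f'(x) = -1
    return -1.0


@register_passable("trivial")
struct Neg[](Aritmetic):
    alias fw = GenericArithmetic[neg_fw_vec, neg_bw_vec].fw
    alias bw = GenericArithmetic[neg_fw_vec, neg_bw_vec].bw

# To be callable from the graph, an op added this way would also need an
# operator code (see voodoo/utils/operator_codes.mojo) and a matching dispatch
# entry in KERNELS.get in kernels.mojo, shown further below.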
8 | 9 | 10 | @register_passable("trivial") 11 | struct Sqrt[](Aritmetic): 12 | alias fw = GenericArithmetic[sqrt_fw_vec, sqrt_bw_vec].fw 13 | alias bw = GenericArithmetic[sqrt_fw_vec, sqrt_bw_vec].bw 14 | 15 | 16 | @register_passable("trivial") 17 | struct Abs[](Aritmetic): 18 | alias fw = GenericArithmetic[abs_fw_vec, abs_bw_vec].fw 19 | alias bw = GenericArithmetic[abs_fw_vec, abs_bw_vec].bw 20 | 21 | 22 | @register_passable("trivial") 23 | struct Exp2[](Aritmetic): 24 | alias fw = GenericArithmetic[exp2_fw_vec, exp2_bw_vec].fw 25 | alias bw = GenericArithmetic[exp2_fw_vec, exp2_bw_vec].bw 26 | 27 | 28 | @register_passable("trivial") 29 | struct Log2[](Aritmetic): 30 | alias fw = GenericArithmetic[log2_fw_vec, log2_bw_vec].fw 31 | alias bw = GenericArithmetic[log2_fw_vec, log2_bw_vec].bw 32 | 33 | 34 | @register_passable("trivial") 35 | struct Log[](Aritmetic): 36 | alias fw = GenericArithmetic[log_fw_vec, log_bw_vec].fw 37 | alias bw = GenericArithmetic[log_fw_vec, log_bw_vec].bw 38 | 39 | 40 | @register_passable("trivial") 41 | struct Sin[](Aritmetic): 42 | alias fw = GenericArithmetic[sin_fw_vec, sin_bw_vec].fw 43 | alias bw = GenericArithmetic[sin_fw_vec, sin_bw_vec].bw 44 | 45 | 46 | @register_passable("trivial") 47 | struct Cos[](Aritmetic): 48 | alias fw = GenericArithmetic[cos_fw_vec, cos_bw_vec].fw 49 | alias bw = GenericArithmetic[cos_fw_vec, cos_bw_vec].bw 50 | 51 | 52 | @register_passable("trivial") 53 | struct Tan[](Aritmetic): 54 | alias fw = GenericArithmetic[tan_fw_vec, tan_bw_vec].fw 55 | alias bw = GenericArithmetic[tan_fw_vec, tan_bw_vec].bw 56 | 57 | 58 | @register_passable("trivial") 59 | struct Asin[](Aritmetic): 60 | alias fw = GenericArithmetic[asin_fw_vec, asin_bw_vec].fw 61 | alias bw = GenericArithmetic[asin_fw_vec, asin_bw_vec].bw 62 | 63 | 64 | @register_passable("trivial") 65 | struct Acos[](Aritmetic): 66 | alias fw = GenericArithmetic[acos_fw_vec, acos_bw_vec].fw 67 | alias bw = GenericArithmetic[acos_fw_vec, acos_bw_vec].bw 68 | 69 | 70 | @register_passable("trivial") 71 | struct Atan[](Aritmetic): 72 | alias fw = GenericArithmetic[atan_fw_vec, atan_bw_vec].fw 73 | alias bw = GenericArithmetic[atan_fw_vec, atan_bw_vec].bw 74 | 75 | 76 | @register_passable("trivial") 77 | struct Sinh[](Aritmetic): 78 | alias fw = GenericArithmetic[sinh_fw_vec, sinh_bw_vec].fw 79 | alias bw = GenericArithmetic[sinh_fw_vec, sinh_bw_vec].bw 80 | 81 | 82 | @register_passable("trivial") 83 | struct Cosh[](Aritmetic): 84 | alias fw = GenericArithmetic[cosh_fw_vec, cosh_bw_vec].fw 85 | alias bw = GenericArithmetic[cosh_fw_vec, cosh_bw_vec].bw 86 | 87 | 88 | @register_passable("trivial") 89 | struct Add[](Aritmetic): 90 | alias fw = GenericBinaryArithmetic[add_fw, add_bw, add_bw].fw 91 | alias bw = GenericBinaryArithmetic[add_fw, add_bw, add_bw].bw 92 | 93 | 94 | @register_passable("trivial") 95 | struct Sub[](Aritmetic): 96 | alias fw = GenericBinaryArithmetic[sub_fw, sub_bw_a, sub_bw_b].fw 97 | alias bw = GenericBinaryArithmetic[sub_fw, sub_bw_a, sub_bw_b].bw 98 | 99 | 100 | @register_passable("trivial") 101 | struct Mul[](Aritmetic): 102 | alias fw = GenericBinaryArithmetic[mul_fw, mul_bw_a, mul_bw_b].fw 103 | alias bw = GenericBinaryArithmetic[mul_fw, mul_bw_a, mul_bw_b].bw 104 | 105 | 106 | @register_passable("trivial") 107 | struct Div[](Aritmetic): 108 | alias fw = GenericBinaryArithmetic[div_fw, div_bw_a, div_bw_b].fw 109 | alias bw = GenericBinaryArithmetic[div_fw, div_bw_a, div_bw_b].bw 110 | 111 | 112 | @register_passable("trivial") 113 | struct Pow[](Aritmetic): 114 | 
alias fw = GenericBinaryArithmetic[pow_fw, pow_bw_a, pow_bw_b].fw 115 | alias bw = GenericBinaryArithmetic[pow_fw, pow_bw_a, pow_bw_b].bw 116 | 117 | 118 | fn sqrt_fw_vec[NELTS: Int](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 119 | return sqrt(x) 120 | 121 | 122 | fn sqrt_bw_vec[NELTS: Int](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 123 | return 0.5 / sqrt(x) 124 | 125 | 126 | fn abs_fw_vec[NELTS: Int](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 127 | return (x > 0).select(x, -x) 128 | 129 | 130 | fn abs_bw_vec[NELTS: Int](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 131 | return (x > 0).select(Float32(1.0), Float32(-1.0)) 132 | 133 | 134 | fn exp2_fw_vec[NELTS: Int](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 135 | return exp2(x) 136 | 137 | 138 | fn exp2_bw_vec[NELTS: Int](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 139 | return x * 0.69314718056 140 | 141 | 142 | fn log2_fw_vec[NELTS: Int](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 143 | return log2(x) 144 | 145 | 146 | fn log2_bw_vec[NELTS: Int](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 147 | return 1.0 / (x * 0.69314718056) 148 | 149 | 150 | fn log_fw_vec[NELTS: Int](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 151 | return log(x) 152 | 153 | 154 | fn log_bw_vec[NELTS: Int](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 155 | return 1.0 / x 156 | 157 | 158 | fn sin_fw_vec[NELTS: Int](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 159 | return sin(x) 160 | 161 | 162 | fn sin_bw_vec[NELTS: Int](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 163 | return cos(x) 164 | 165 | 166 | fn cos_fw_vec[NELTS: Int](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 167 | return cos(x) 168 | 169 | 170 | fn cos_bw_vec[NELTS: Int](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 171 | return -sin(x) 172 | 173 | 174 | fn tan_fw_vec[NELTS: Int](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 175 | return tan(x) 176 | 177 | 178 | fn tan_bw_vec[NELTS: Int](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 179 | return 1.0 / (cos(x) ** 2) 180 | 181 | 182 | fn asin_fw_vec[NELTS: Int](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 183 | return asin(x) 184 | 185 | 186 | fn asin_bw_vec[NELTS: Int](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 187 | return 1.0 / sqrt(1.0 - x**2) 188 | 189 | 190 | fn acos_fw_vec[NELTS: Int](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 191 | return acos(x) 192 | 193 | 194 | fn acos_bw_vec[NELTS: Int](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 195 | return -1.0 / sqrt(1.0 - x**2) 196 | 197 | 198 | fn atan_fw_vec[NELTS: Int](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 199 | return atan(x) 200 | 201 | 202 | fn atan_bw_vec[NELTS: Int](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 203 | return 1.0 / (1.0 + x**2) 204 | 205 | 206 | fn sinh_fw_vec[NELTS: Int](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 207 | return sinh(x) 208 | 209 | 210 | fn sinh_bw_vec[NELTS: Int](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 211 | return cosh(x) 212 | 213 | 214 | fn cosh_fw_vec[NELTS: Int](x: SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS]: 215 | return cosh(x) 216 | 217 | 218 | fn cosh_bw_vec[NELTS: Int](x: SIMD[DType.float32, NELTS]) -> 
SIMD[DType.float32, NELTS]: 219 | return sinh(x) 220 | 221 | 222 | fn add_fw[ 223 | NELTS: Int 224 | ](a: SIMD[DType.float32, NELTS], b: SIMD[DType.float32, NELTS]) -> SIMD[ 225 | DType.float32, NELTS 226 | ]: 227 | # f(x, y) = x + y 228 | return a + b 229 | 230 | 231 | fn add_bw[ 232 | NELTS: Int 233 | ]( 234 | a: SIMD[DType.float32, NELTS], 235 | b: SIMD[DType.float32, NELTS], 236 | ) -> SIMD[ 237 | DType.float32, NELTS 238 | ]: 239 | # f'(x, y) = 1 240 | return 1 241 | 242 | 243 | fn sub_fw[ 244 | NELTS: Int 245 | ](a: SIMD[DType.float32, NELTS], b: SIMD[DType.float32, NELTS]) -> SIMD[ 246 | DType.float32, NELTS 247 | ]: 248 | # f(x, y) = x - y 249 | return a - b 250 | 251 | 252 | fn sub_bw_a[ 253 | NELTS: Int 254 | ]( 255 | a: SIMD[DType.float32, NELTS], 256 | b: SIMD[DType.float32, NELTS], 257 | ) -> SIMD[ 258 | DType.float32, NELTS 259 | ]: 260 | # f'(x, y) with respect to x = 1 261 | return 1 262 | 263 | 264 | fn sub_bw_b[ 265 | NELTS: Int 266 | ]( 267 | a: SIMD[DType.float32, NELTS], 268 | b: SIMD[DType.float32, NELTS], 269 | ) -> SIMD[ 270 | DType.float32, NELTS 271 | ]: 272 | # f'(x, y) with respect to y = -1 273 | return -1 274 | 275 | 276 | fn mul_fw[ 277 | NELTS: Int 278 | ](a: SIMD[DType.float32, NELTS], b: SIMD[DType.float32, NELTS]) -> SIMD[ 279 | DType.float32, NELTS 280 | ]: 281 | # f(x, y) = x * y 282 | return a * b 283 | 284 | 285 | fn mul_bw_a[ 286 | NELTS: Int 287 | ]( 288 | a: SIMD[DType.float32, NELTS], 289 | b: SIMD[DType.float32, NELTS], 290 | ) -> SIMD[ 291 | DType.float32, NELTS 292 | ]: 293 | # f'(x, y) with respect to x = y 294 | return b 295 | 296 | 297 | fn mul_bw_b[ 298 | NELTS: Int 299 | ]( 300 | a: SIMD[DType.float32, NELTS], 301 | b: SIMD[DType.float32, NELTS], 302 | ) -> SIMD[ 303 | DType.float32, NELTS 304 | ]: 305 | # f'(x, y) with respect to y = x 306 | return a 307 | 308 | 309 | fn div_fw[ 310 | NELTS: Int 311 | ](a: SIMD[DType.float32, NELTS], b: SIMD[DType.float32, NELTS]) -> SIMD[ 312 | DType.float32, NELTS 313 | ]: 314 | # f(x, y) = x / y 315 | return a / b 316 | 317 | 318 | fn div_bw_a[ 319 | NELTS: Int 320 | ]( 321 | a: SIMD[DType.float32, NELTS], 322 | b: SIMD[DType.float32, NELTS], 323 | ) -> SIMD[ 324 | DType.float32, NELTS 325 | ]: 326 | # f'(x, y) with respect to x = 1/y 327 | return 1 / b 328 | 329 | 330 | fn div_bw_b[ 331 | NELTS: Int 332 | ]( 333 | a: SIMD[DType.float32, NELTS], 334 | b: SIMD[DType.float32, NELTS], 335 | ) -> SIMD[ 336 | DType.float32, NELTS 337 | ]: 338 | # f'(x, y) with respect to y = -x/y^2 339 | return -a / (b * b) 340 | 341 | 342 | fn pow_fw[ 343 | NELTS: Int 344 | ](a: SIMD[DType.float32, NELTS], b: SIMD[DType.float32, NELTS]) -> SIMD[ 345 | DType.float32, NELTS 346 | ]: 347 | # f(x, y) = x^y 348 | return a**b 349 | 350 | 351 | fn pow_bw_a[ 352 | NELTS: Int 353 | ]( 354 | a: SIMD[DType.float32, NELTS], 355 | b: SIMD[DType.float32, NELTS], 356 | ) -> SIMD[ 357 | DType.float32, NELTS 358 | ]: 359 | # f'(x, y) with respect to x = y * x^(y-1) 360 | return b * (a ** (b - 1.0)) 361 | 362 | 363 | fn pow_bw_b[ 364 | NELTS: Int 365 | ]( 366 | a: SIMD[DType.float32, NELTS], 367 | b: SIMD[DType.float32, NELTS], 368 | ) -> SIMD[ 369 | DType.float32, NELTS 370 | ]: 371 | # f'(x, y) with respect to y = x^y * log(x) 372 | return (a**b) * log(a) 373 | -------------------------------------------------------------------------------- /voodoo/autograd/kernels/conv.mojo: -------------------------------------------------------------------------------- 1 | from algorithm import vectorize, tile 2 | from math import max 3 | 4 | from 
voodoo.autograd import Node 5 | from voodoo.utils import Vector 6 | from voodoo.constants import NELTS, PREFETCH_READ, PREFETCH_WRITE 7 | 8 | alias tile_sizes = VariadicList[Int](32, 16, 8, 4, 2, 1) 9 | 10 | 11 | trait Conv: 12 | ... 13 | 14 | 15 | @register_passable("trivial") 16 | struct Conv1D(Conv): 17 | @staticmethod 18 | fn fw(c: Node, a: Node, b: Node): 19 | var params = c.get_other_params() 20 | 21 | var padding_x = params[0] 22 | var stride_x = params[1] 23 | 24 | var batches = a.get_shape()[0] 25 | var channels = a.get_shape()[1] 26 | var input_width = a.get_shape()[2] 27 | 28 | var kernel_width = b.get_shape()[1] 29 | 30 | var output_width = c.get_shape()[2] 31 | 32 | var im2col = im2col2D( 33 | a.get_data(), 34 | a.get_shape(), 35 | b.get_shape(), 36 | c.get_shape(), 37 | padding_x, 38 | stride_x, 39 | ) 40 | 41 | for batch in range(batches): 42 | for output_x in range(output_width): 43 | for kernel_x in range(kernel_width): 44 | for channel in range(channels): 45 | var kernel_value = b.get_data().load( 46 | channel * kernel_width + kernel_x 47 | ) 48 | 49 | var output_value = c.get_data().load( 50 | batch * output_width * channels 51 | + output_x * channels 52 | + channel 53 | ) 54 | 55 | var im2col_value = im2col.load( 56 | batch * output_width * kernel_width * channels 57 | + output_x * kernel_width * channels 58 | + kernel_x * channels 59 | + channel 60 | ) 61 | 62 | c.get_data().store( 63 | batch * output_width * channels 64 | + output_x * channels 65 | + channel, 66 | output_value + kernel_value * im2col_value, 67 | ) 68 | 69 | im2col.free() 70 | 71 | @staticmethod 72 | fn bw(c: Node, a: Node, b: Node): 73 | var params = c.get_other_params() 74 | 75 | var padding_x = params[0] 76 | var stride_x = params[1] 77 | 78 | var batches = a.get_shape()[0] 79 | var channels = a.get_shape()[1] 80 | var input_width = a.get_shape()[2] 81 | 82 | var kernel_width = b.get_shape()[1] 83 | 84 | var output_width = c.get_shape()[2] 85 | 86 | var im2col = im2col2D( 87 | a.get_data(), 88 | a.get_shape(), 89 | b.get_shape(), 90 | c.get_shape(), 91 | padding_x, 92 | stride_x, 93 | ) 94 | 95 | for batch in range(batches): 96 | for output_x in range(output_width): 97 | for kernel_x in range(kernel_width): 98 | for channel in range(channels): 99 | var kernel_value = b.get_data().load( 100 | channel * kernel_width + kernel_x 101 | ) 102 | 103 | var output_value = c.get_data().load( 104 | batch * output_width * channels 105 | + output_x * channels 106 | + channel 107 | ) 108 | 109 | var im2col_value = im2col.load( 110 | batch * output_width * kernel_width * channels 111 | + output_x * kernel_width * channels 112 | + kernel_x * channels 113 | + channel 114 | ) 115 | 116 | a.get_grad().store( 117 | batch * input_width * channels 118 | + (output_x * stride_x + kernel_x - padding_x) * channels 119 | + channel, 120 | a.get_grad().load( 121 | batch * input_width * channels 122 | + (output_x * stride_x + kernel_x - padding_x) 123 | * channels 124 | + channel 125 | ) 126 | + kernel_value 127 | * c.get_grad().load( 128 | batch * output_width * channels 129 | + output_x * channels 130 | + channel 131 | ), 132 | ) 133 | 134 | b.get_grad().store( 135 | channel * kernel_width + kernel_x, 136 | b.get_grad()[channel * kernel_width + kernel_x] 137 | + output_value * im2col_value, 138 | ) 139 | 140 | im2col.free() 141 | 142 | 143 | @register_passable("trivial") 144 | struct Conv2D(Conv): 145 | @staticmethod 146 | fn fw(c: Node, a: Node, b: Node): 147 | var params = c.get_other_params() 148 | 149 | var padding_x = 
params[0] 150 | var padding_y = params[1] 151 | var stride_x = params[2] 152 | var stride_y = params[3] 153 | 154 | var batches = a.get_shape()[0] 155 | var channels = a.get_shape()[1] 156 | var input_width = a.get_shape()[2] 157 | var input_height = a.get_shape()[3] 158 | 159 | var kernel_width = b.get_shape()[1] 160 | var kernel_height = b.get_shape()[2] 161 | 162 | var output_width = c.get_shape()[2] 163 | var output_height = c.get_shape()[3] 164 | 165 | var im2col = im2col3D( 166 | a.get_data(), 167 | a.get_shape(), 168 | b.get_shape(), 169 | c.get_shape(), 170 | padding_x, 171 | padding_y, 172 | stride_x, 173 | stride_y, 174 | ) 175 | 176 | var a_data = a.get_data() 177 | var b_data = b.get_data() 178 | var c_data = c.get_data() 179 | 180 | DTypePointer[DType.float32].prefetch[PREFETCH_READ](a_data) 181 | DTypePointer[DType.float32].prefetch[PREFETCH_READ](b_data) 182 | DTypePointer[DType.float32].prefetch[PREFETCH_READ](c_data) 183 | DTypePointer[DType.float32].prefetch[PREFETCH_READ](im2col) 184 | 185 | for batch in range(batches): 186 | for output_y in range(output_height): 187 | for output_x in range(output_width): 188 | for kernel_y in range(kernel_height): 189 | 190 | @parameter 191 | fn fw_vec[NELTS: Int](kernel_x: Int): 192 | for channel in range(channels): 193 | var kernel_value = b_data.simd_load[NELTS]( 194 | channel * kernel_width * kernel_height 195 | + kernel_y * kernel_width 196 | + kernel_x 197 | ) 198 | 199 | var output_value = c_data.simd_load[NELTS]( 200 | batch * output_width * output_height * channels 201 | + output_y * output_width * channels 202 | + output_x * channels 203 | + channel 204 | ) 205 | 206 | var im2col_value = im2col.simd_load[NELTS]( 207 | batch 208 | * output_width 209 | * output_height 210 | * kernel_width 211 | * kernel_height 212 | * channels 213 | + output_y 214 | * output_width 215 | * kernel_width 216 | * kernel_height 217 | * channels 218 | + output_x * kernel_width * kernel_height * channels 219 | + kernel_y * kernel_width * channels 220 | + kernel_x * channels 221 | + channel 222 | ) 223 | 224 | c_data.simd_store[NELTS]( 225 | batch * output_width * output_height * channels 226 | + output_y * output_width * channels 227 | + output_x * channels 228 | + channel, 229 | output_value + kernel_value * im2col_value, 230 | ) 231 | 232 | vectorize[fw_vec, NELTS](kernel_width) 233 | 234 | im2col.free() 235 | 236 | @staticmethod 237 | fn bw(c: Node, a: Node, b: Node): 238 | var params = c.get_other_params() 239 | 240 | var padding_x = params[0] 241 | var padding_y = params[1] 242 | var stride_x = params[2] 243 | var stride_y = params[3] 244 | 245 | var batches = a.get_shape()[0] 246 | var channels = a.get_shape()[1] 247 | var input_width = a.get_shape()[2] 248 | var input_height = a.get_shape()[3] 249 | 250 | var kernel_width = b.get_shape()[1] 251 | var kernel_height = b.get_shape()[2] 252 | 253 | var output_width = c.get_shape()[2] 254 | var output_height = c.get_shape()[3] 255 | 256 | var im2col = im2col3D( 257 | a.get_data(), 258 | a.get_shape(), 259 | b.get_shape(), 260 | c.get_shape(), 261 | padding_x, 262 | padding_y, 263 | stride_x, 264 | stride_y, 265 | ) 266 | 267 | var b_data = b.get_data() 268 | var c_data = c.get_data() 269 | var a_grad = a.get_grad() 270 | var b_grad = b.get_grad() 271 | var c_grad = c.get_grad() 272 | 273 | DTypePointer[DType.float32].prefetch[PREFETCH_READ](b_data) 274 | DTypePointer[DType.float32].prefetch[PREFETCH_READ](c_data) 275 | DTypePointer[DType.float32].prefetch[PREFETCH_READ](im2col) 276 | 
DTypePointer[DType.float32].prefetch[PREFETCH_READ](a_grad) 277 | DTypePointer[DType.float32].prefetch[PREFETCH_READ](b_grad) 278 | DTypePointer[DType.float32].prefetch[PREFETCH_READ](c_grad) 279 | DTypePointer[DType.float32].prefetch[PREFETCH_WRITE](a_grad) 280 | DTypePointer[DType.float32].prefetch[PREFETCH_WRITE](b_grad) 281 | 282 | for batch in range(batches): 283 | for output_y in range(output_height): 284 | for output_x in range(output_width): 285 | for kernel_y in range(kernel_height): 286 | 287 | @parameter 288 | fn bw_vec[NELTS: Int](kernel_x: Int): 289 | for channel in range(channels): 290 | var kernel_value = b_data.simd_load[NELTS]( 291 | channel * kernel_width * kernel_height 292 | + kernel_y * kernel_width 293 | + kernel_x 294 | ) 295 | 296 | var output_value = c_data.simd_load[NELTS]( 297 | batch * output_width * output_height * channels 298 | + output_y * output_width * channels 299 | + output_x * channels 300 | + channel 301 | ) 302 | 303 | var im2col_value = im2col.simd_load[NELTS]( 304 | batch 305 | * output_width 306 | * output_height 307 | * kernel_width 308 | * kernel_height 309 | * channels 310 | + output_y 311 | * output_width 312 | * kernel_width 313 | * kernel_height 314 | * channels 315 | + output_x * kernel_width * kernel_height * channels 316 | + kernel_y * kernel_width * channels 317 | + kernel_x * channels 318 | + channel 319 | ) 320 | 321 | a_grad.simd_store[NELTS]( 322 | batch * input_width * input_height * channels 323 | + (output_y * stride_y + kernel_y - padding_y) 324 | * input_width 325 | * channels 326 | + (output_x * stride_x + kernel_x - padding_x) 327 | * channels 328 | + channel, 329 | a_grad.simd_load[NELTS]( 330 | batch * input_width * input_height * channels 331 | + (output_y * stride_y + kernel_y - padding_y) 332 | * input_width 333 | * channels 334 | + (output_x * stride_x + kernel_x - padding_x) 335 | * channels 336 | + channel 337 | ) 338 | + kernel_value 339 | * c_grad.simd_load[NELTS]( 340 | batch * output_width * output_height * channels 341 | + output_y * output_width * channels 342 | + output_x * channels 343 | + channel 344 | ), 345 | ) 346 | 347 | b_grad.simd_store[NELTS]( 348 | channel * kernel_width * kernel_height 349 | + kernel_y * kernel_width 350 | + kernel_x, 351 | b_grad.simd_load[NELTS]( 352 | channel * kernel_width * kernel_height 353 | + kernel_y * kernel_width 354 | + kernel_x 355 | ) 356 | + output_value * im2col_value, 357 | ) 358 | 359 | im2col.free() 360 | 361 | 362 | fn im2col2D( 363 | input: DTypePointer[DType.float32], 364 | input_shape: Vector[Int], 365 | kernel_shape: Vector[Int], 366 | output_shape: Vector[Int], 367 | padding: Int, 368 | stride: Int, 369 | ) -> DTypePointer[DType.float32]: 370 | var batches = input_shape[0] 371 | var channels = input_shape[1] 372 | var input_width = input_shape[2] 373 | 374 | var kernel_width = kernel_shape[1] 375 | 376 | var output_width = output_shape[2] 377 | 378 | var im2col = DTypePointer[DType.float32].alloc( 379 | batches * output_width * kernel_width * channels 380 | ) 381 | 382 | DTypePointer[DType.float32].prefetch[PREFETCH_READ](input) 383 | DTypePointer[DType.float32].prefetch[PREFETCH_WRITE](im2col) 384 | 385 | for batch in range(batches): 386 | for channel in range(channels): 387 | 388 | @parameter 389 | fn workgroup_function[NELTS: Int](output_x: Int): 390 | @parameter 391 | fn fw_vec[NELTS: Int](kernel_x: Int): 392 | var input_x = output_x * stride + kernel_x - padding 393 | 394 | if input_x < 0 or input_x >= input_width: 395 | im2col.simd_store[NELTS]( 396 | 
batch * output_width * kernel_width * channels 397 | + output_x * kernel_width * channels 398 | + kernel_x * channels 399 | + channel, 400 | 0.0, 401 | ) 402 | else: 403 | im2col.simd_store[NELTS]( 404 | batch * output_width * kernel_width * channels 405 | + output_x * kernel_width * channels 406 | + kernel_x * channels 407 | + channel, 408 | input.simd_load[NELTS]( 409 | batch * input_width * channels 410 | + input_x * channels 411 | + channel 412 | ), 413 | ) 414 | 415 | vectorize[fw_vec, NELTS](kernel_width) 416 | 417 | tile[workgroup_function, tile_sizes](0, output_width) 418 | 419 | return im2col 420 | 421 | 422 | fn im2col3D( 423 | input: DTypePointer[DType.float32], 424 | input_shape: Vector[Int], 425 | kernel_shape: Vector[Int], 426 | output_shape: Vector[Int], 427 | padding_x: Int, 428 | padding_y: Int, 429 | stride_x: Int, 430 | stride_y: Int, 431 | ) -> DTypePointer[DType.float32]: 432 | var batches = input_shape[0] 433 | var channels = input_shape[1] 434 | var input_width = input_shape[2] 435 | var input_height = input_shape[3] 436 | 437 | var kernel_width = kernel_shape[1] 438 | var kernel_height = kernel_shape[2] 439 | 440 | var output_width = output_shape[2] 441 | var output_height = output_shape[3] 442 | 443 | var im2col = DTypePointer[DType.float32].alloc( 444 | batches * output_width * output_height * kernel_width * kernel_height * channels 445 | ) 446 | 447 | DTypePointer[DType.float32].prefetch[PREFETCH_READ](input) 448 | DTypePointer[DType.float32].prefetch[PREFETCH_WRITE](im2col) 449 | 450 | for batch in range(batches): 451 | for channel in range(channels): 452 | 453 | @parameter 454 | fn workgroup_function[NELTS: Int](output_y: Int): 455 | for output_x in range(output_width): 456 | var base_index = batch * output_width * output_height * kernel_width * kernel_height * channels + output_y * output_width * kernel_width * kernel_height * channels + output_x * kernel_width * kernel_height * channels + channel 457 | for kernel_y in range(kernel_height): 458 | var input_y = output_y * stride_y + kernel_y - padding_y 459 | var y_index = base_index + kernel_y * kernel_width * channels 460 | if input_y < 0 or input_y >= input_height: 461 | 462 | @parameter 463 | fn fw_vec_zero[NELTS: Int](kernel_x: Int): 464 | im2col.simd_store[NELTS]( 465 | y_index + kernel_x * channels, 0.0 466 | ) 467 | 468 | vectorize[fw_vec_zero, NELTS](kernel_width) 469 | else: 470 | 471 | @parameter 472 | fn fw_vec_one[NELTS: Int](kernel_x: Int): 473 | var input_x = output_x * stride_x + kernel_x - padding_x 474 | if input_x < 0 or input_x >= input_width: 475 | im2col.simd_store[NELTS]( 476 | y_index + kernel_x * channels, 0.0 477 | ) 478 | else: 479 | var input_index = batch * input_width * input_height * channels + input_y * input_width * channels + input_x * channels 480 | im2col.simd_store[NELTS]( 481 | y_index + kernel_x * channels, 482 | input.simd_load[NELTS](input_index), 483 | ) 484 | 485 | vectorize[fw_vec_one, NELTS](kernel_width) 486 | 487 | tile[workgroup_function, tile_sizes](0, output_height) 488 | 489 | return im2col 490 | -------------------------------------------------------------------------------- /voodoo/autograd/kernels/generics.mojo: -------------------------------------------------------------------------------- 1 | from algorithm import vectorize 2 | 3 | from voodoo.utils import ( 4 | shape_a, 5 | shape_b, 6 | strides_a, 7 | strides_b, 8 | recursive_broadcast, 9 | Vector, 10 | ) 11 | from voodoo.constants import NELTS, PREFETCH_READ, PREFETCH_WRITE 12 | 13 | 14 | trait Generic: 
15 | ... 16 | 17 | 18 | alias generic_activation_vectorized = fn[ 19 | NELTS: Int, arg1: Float32, arg2: Float32, arg3: Float32 20 | ] (SIMD[DType.float32, NELTS]) -> SIMD[DType.float32, NELTS] 21 | 22 | alias generic_arithmetic_vectorized = fn[NELTS: Int] ( 23 | SIMD[DType.float32, NELTS] 24 | ) -> SIMD[DType.float32, NELTS] 25 | 26 | alias generic_binary_arithmetic_vectorized = fn[NELTS: Int] ( 27 | SIMD[DType.float32, NELTS], SIMD[DType.float32, NELTS] 28 | ) -> SIMD[DType.float32, NELTS] 29 | 30 | alias generic_loss_vectorized_fw = generic_binary_arithmetic_vectorized 31 | 32 | alias generic_loss_vectorized_bw = fn[NELTS: Int] ( 33 | SIMD[DType.float32, NELTS], SIMD[DType.float32, NELTS], Float32, Int 34 | ) -> SIMD[DType.float32, NELTS] 35 | 36 | alias generic_optimizer_vectorized = fn[NELTS: Int, learning_rate: Float32] ( 37 | SIMD[DType.float32, NELTS] 38 | ) -> SIMD[DType.float32, NELTS] 39 | 40 | 41 | @register_passable("trivial") 42 | struct GenericActivation[ 43 | fw_vec: generic_activation_vectorized, 44 | bw_vec: generic_activation_vectorized, 45 | arg1: Float32, 46 | arg2: Float32, 47 | arg3: Float32, 48 | ](Generic): 49 | @staticmethod 50 | fn fw(node: Node, parent1: Node): 51 | var node_data = node.get_data() 52 | var parent1_data = parent1.get_data() 53 | 54 | DTypePointer[DType.float32].prefetch[PREFETCH_WRITE](node_data) 55 | DTypePointer[DType.float32].prefetch[PREFETCH_READ](parent1_data) 56 | 57 | @parameter 58 | fn vectorized_fw[NELTS: Int](i: Int): 59 | node_data.simd_store[NELTS]( 60 | i, 61 | fw_vec[NELTS, arg1, arg2, arg3](parent1_data.simd_load[NELTS](i)), 62 | ) 63 | 64 | vectorize[vectorized_fw, NELTS](node.get_cap()) 65 | 66 | @staticmethod 67 | fn bw(node: Node, parent1: Node): 68 | var node_data = node.get_data() 69 | var node_grad = node.get_grad() 70 | var parent1_data = parent1.get_data() 71 | var parent1_grad = parent1.get_grad() 72 | 73 | DTypePointer[DType.float32].prefetch[PREFETCH_READ](parent1_grad) 74 | DTypePointer[DType.float32].prefetch[PREFETCH_READ](node_grad) 75 | DTypePointer[DType.float32].prefetch[PREFETCH_READ](parent1_data) 76 | DTypePointer[DType.float32].prefetch[PREFETCH_WRITE](parent1_grad) 77 | 78 | @parameter 79 | fn vectorized_bw[NELTS: Int](i: Int): 80 | parent1_grad.simd_store[NELTS]( 81 | i, 82 | parent1_grad.simd_load[NELTS](i) 83 | + node_grad.simd_load[NELTS](i) 84 | * bw_vec[NELTS, arg1, arg2, arg3](parent1_data.simd_load[NELTS](i)), 85 | ) 86 | 87 | vectorize[vectorized_bw, NELTS](node.get_cap()) 88 | 89 | 90 | @register_passable("trivial") 91 | struct GenericArithmetic[ 92 | fw_vec: generic_arithmetic_vectorized, bw_vec: generic_arithmetic_vectorized 93 | ](Generic): 94 | @staticmethod 95 | fn fw(node: Node, parent1: Node): 96 | var node_data = node.get_data() 97 | var parent1_data = parent1.get_data() 98 | 99 | DTypePointer[DType.float32].prefetch[PREFETCH_WRITE](node_data) 100 | DTypePointer[DType.float32].prefetch[PREFETCH_READ](parent1_data) 101 | 102 | @parameter 103 | fn vectorized_fw[NELTS: Int](i: Int): 104 | node_data.simd_store[NELTS]( 105 | i, 106 | fw_vec[NELTS](parent1_data.simd_load[NELTS](i)), 107 | ) 108 | 109 | vectorize[vectorized_fw, NELTS](node.get_cap()) 110 | 111 | @staticmethod 112 | fn bw(node: Node, parent1: Node): 113 | var node_data = node.get_data() 114 | var node_grad = node.get_grad() 115 | var parent1_data = parent1.get_data() 116 | var parent1_grad = parent1.get_grad() 117 | 118 | DTypePointer[DType.float32].prefetch[PREFETCH_READ](parent1_grad) 119 | 
DTypePointer[DType.float32].prefetch[PREFETCH_READ](node_grad) 120 | DTypePointer[DType.float32].prefetch[PREFETCH_READ](parent1_data) 121 | DTypePointer[DType.float32].prefetch[PREFETCH_WRITE](parent1_grad) 122 | 123 | @parameter 124 | fn vectorized_bw[NELTS: Int](i: Int): 125 | parent1_grad.simd_store[NELTS]( 126 | i, 127 | parent1_grad.simd_load[NELTS](i) 128 | + node_grad.simd_load[NELTS](i) 129 | * bw_vec[NELTS](parent1_data.simd_load[NELTS](i)), 130 | ) 131 | 132 | vectorize[vectorized_bw, NELTS](node.get_cap()) 133 | 134 | 135 | @register_passable("trivial") 136 | struct GenericBinaryArithmetic[ 137 | fw_vec: generic_binary_arithmetic_vectorized, 138 | bw_a_vec: generic_binary_arithmetic_vectorized, 139 | bw_b_vec: generic_binary_arithmetic_vectorized, 140 | ](Generic): 141 | @staticmethod 142 | fn fw(c: Node, a: Node, b: Node): 143 | recursive_broadcast[Self.kernel_fw[fw_vec], True](c, a, b) 144 | 145 | @staticmethod 146 | fn bw(c: Node, a: Node, b: Node): 147 | if not a.get_is_single(): 148 | recursive_broadcast[Self.kernel_bw[bw_a_vec, True], True](c, a, b) 149 | if not b.get_is_single(): 150 | recursive_broadcast[Self.kernel_bw[bw_b_vec, False], True](c, a, b) 151 | 152 | @staticmethod 153 | fn kernel_fw[ 154 | generic_func: generic_binary_arithmetic_vectorized 155 | ]( 156 | c: Node, a: Node, b: Node, a_index: Int, b_index: Int, c_index: Int, depth: Int 157 | ) -> None: 158 | var offset_a = a_index * shape_a(depth, a, b) * strides_a(depth, a, b) 159 | var offset_b = b_index * shape_b(depth, a, b) * strides_b(depth, a, b) 160 | var c_rest = c.get_shape()[depth] * c.get_strides()[depth] 161 | var offset_c = c_index * c_rest 162 | 163 | var a_data = a.get_data() 164 | var b_data = b.get_data() 165 | var c_data = c.get_data() 166 | 167 | DTypePointer[DType.float32].prefetch[PREFETCH_READ](a_data) 168 | DTypePointer[DType.float32].prefetch[PREFETCH_READ](b_data) 169 | DTypePointer[DType.float32].prefetch[PREFETCH_WRITE](c_data) 170 | 171 | @parameter 172 | fn vectorized_fw[NELTS: Int](i: Int): 173 | c_data.simd_store[NELTS]( 174 | offset_c + i, 175 | generic_func( 176 | a_data.simd_load[NELTS](offset_a + i), 177 | b_data.simd_load[NELTS](offset_b + i), 178 | ), 179 | ) 180 | 181 | vectorize[vectorized_fw, NELTS](c_rest) 182 | 183 | @staticmethod 184 | fn kernel_bw[ 185 | generic_func: generic_binary_arithmetic_vectorized, 186 | is_a: Bool, 187 | ]( 188 | c: Node, a: Node, b: Node, a_index: Int, b_index: Int, c_index: Int, depth: Int 189 | ) -> None: 190 | var offset_a = a_index * shape_a(depth, a, b) * strides_a(depth, a, b) 191 | var offset_b = b_index * shape_b(depth, a, b) * strides_b(depth, a, b) 192 | var offset_c = c_index * c.get_shape()[depth] * c.get_strides()[depth] 193 | var c_rest = c.get_shape()[depth] * c.get_strides()[depth] 194 | 195 | @parameter 196 | if is_a: 197 | var a_data = a.get_data() 198 | var b_data = b.get_data() 199 | var a_grad = a.get_grad() 200 | var c_grad = c.get_grad() 201 | 202 | DTypePointer[DType.float32].prefetch[PREFETCH_WRITE](a_grad) 203 | DTypePointer[DType.float32].prefetch[PREFETCH_READ](a_grad) 204 | DTypePointer[DType.float32].prefetch[PREFETCH_READ](a_data) 205 | DTypePointer[DType.float32].prefetch[PREFETCH_READ](b_data) 206 | DTypePointer[DType.float32].prefetch[PREFETCH_READ](c_grad) 207 | 208 | @parameter 209 | fn vectorized_bw_a[NELTS: Int](i: Int): 210 | a_grad.simd_store[NELTS]( 211 | offset_a + i, 212 | a_grad.simd_load[NELTS](offset_a + i) 213 | + generic_func( 214 | a_data.simd_load[NELTS](offset_a + i), 215 | 
b_data.simd_load[NELTS](offset_b + i), 216 | ) 217 | * c_grad.simd_load[NELTS](offset_c + i), 218 | ) 219 | 220 | vectorize[vectorized_bw_a, NELTS](c_rest) 221 | else: 222 | var a_data = a.get_data() 223 | var b_data = b.get_data() 224 | var b_grad = b.get_grad() 225 | var c_grad = c.get_grad() 226 | 227 | DTypePointer[DType.float32].prefetch[PREFETCH_WRITE](b_grad) 228 | DTypePointer[DType.float32].prefetch[PREFETCH_READ](b_grad) 229 | DTypePointer[DType.float32].prefetch[PREFETCH_READ](a_data) 230 | DTypePointer[DType.float32].prefetch[PREFETCH_READ](b_data) 231 | DTypePointer[DType.float32].prefetch[PREFETCH_READ](c_grad) 232 | 233 | @parameter 234 | fn vectorized_bw_b[NELTS: Int](i: Int): 235 | b_grad.simd_store[NELTS]( 236 | offset_b + i, 237 | b_grad.simd_load[NELTS](offset_b + i) 238 | + generic_func( 239 | a_data.simd_load[NELTS](offset_a + i), 240 | b_data.simd_load[NELTS](offset_b + i), 241 | ) 242 | * c_grad.simd_load[NELTS](offset_c + i), 243 | ) 244 | 245 | vectorize[vectorized_bw_b, NELTS](c_rest) 246 | 247 | 248 | @register_passable("trivial") 249 | struct GenericLoss[ 250 | fw_vec: generic_loss_vectorized_fw, 251 | bw_vec: generic_loss_vectorized_bw, 252 | ](Generic): 253 | @staticmethod 254 | fn fw(node: Node, y_pred: Node, y_true: Node): 255 | var num_dims = len(y_pred.get_shape()) 256 | var N = y_pred.get_shape()[num_dims - 1] 257 | var cap = y_pred.get_cap() 258 | var e: Float32 = 0.0 259 | 260 | var y_pred_data = y_pred.get_data() 261 | var y_true_data = y_true.get_data() 262 | 263 | DTypePointer[DType.float32].prefetch[PREFETCH_READ](y_pred_data) 264 | DTypePointer[DType.float32].prefetch[PREFETCH_READ](y_true_data) 265 | 266 | @parameter 267 | fn vectorized_fw[NELTS: Int](i: Int): 268 | node.get_data().store( 269 | 0, 270 | node.get_data()[0] 271 | + fw_vec[NELTS]( 272 | y_true.get_data().simd_load[NELTS](i), 273 | y_pred.get_data().simd_load[NELTS](i), 274 | ).reduce_add(), 275 | ) 276 | 277 | vectorize[vectorized_fw, NELTS](cap) 278 | node.get_data().store(0, node.get_data()[0] / cap / Float32(N)) 279 | 280 | @staticmethod 281 | fn bw(node: Node, y_pred: Node, y_true: Node): 282 | var num_dims = len(y_pred.get_shape()) 283 | var N = y_pred.get_shape()[num_dims - 1] 284 | var cap = y_pred.get_cap() 285 | var scalar = cap / Float32(N) 286 | 287 | var y_pred_data = y_pred.get_data() 288 | var y_pred_grad = y_pred.get_grad() 289 | var y_true_data = y_true.get_data() 290 | var y_true_grad = y_true.get_grad() 291 | 292 | DTypePointer[DType.float32].prefetch[PREFETCH_READ](y_pred_data) 293 | DTypePointer[DType.float32].prefetch[PREFETCH_READ](y_pred_grad) 294 | DTypePointer[DType.float32].prefetch[PREFETCH_READ](y_true_data) 295 | DTypePointer[DType.float32].prefetch[PREFETCH_READ](y_true_grad) 296 | 297 | @parameter 298 | fn vectorized_mae_bw[NELTS: Int](i: Int): 299 | var grad = bw_vec[NELTS]( 300 | y_true_data.simd_load[NELTS](i), y_pred_data.simd_load[NELTS](i), cap, N 301 | ) / scalar 302 | 303 | y_pred_grad.simd_store[NELTS](i, y_pred_grad.simd_load[NELTS](i) + grad) 304 | y_true_grad.simd_store[NELTS](i, y_true_grad.simd_load[NELTS](i) - grad) 305 | 306 | vectorize[vectorized_mae_bw, NELTS](cap) 307 | 308 | 309 | @register_passable("trivial") 310 | struct GenericOptimizer[fw_vec: generic_optimizer_vectorized](Generic): 311 | @staticmethod 312 | fn step[learning_rate: Float32](x: Vector[Node]) raises: 313 | for i in range(len(x)): 314 | var node = x[i] 315 | if node.get_is_static() and node.get_grad_computed(): 316 | var node_data = node.get_data() 317 | var 
node_grad = node.get_grad() 318 | 319 | DTypePointer[DType.float32].prefetch[PREFETCH_READ](node_data) 320 | DTypePointer[DType.float32].prefetch[PREFETCH_READ](node_grad) 321 | DTypePointer[DType.float32].prefetch[PREFETCH_WRITE](node_data) 322 | 323 | @parameter 324 | fn vectorized_update[NELTS: Int](i: Int): 325 | node_data.simd_store[NELTS]( 326 | i, 327 | node_data.simd_load[NELTS](i) 328 | - fw_vec[NELTS, learning_rate](node_grad.simd_load[NELTS](i)), 329 | ) 330 | 331 | vectorize[vectorized_update, NELTS](node.get_cap()) 332 | -------------------------------------------------------------------------------- /voodoo/autograd/kernels/kernels.mojo: -------------------------------------------------------------------------------- 1 | from .operations import ( 2 | Copy, 3 | Reshape, 4 | Transpose, 5 | Sum, 6 | Dropout, 7 | ) 8 | from .matmul import MMul 9 | from .maxpool import MaxPool1D, MaxPool2D 10 | from .conv import Conv1D, Conv2D 11 | from .activations import ( 12 | Relu, 13 | Sigmoid, 14 | Softplus, 15 | Softsign, 16 | Tanh, 17 | Selu, 18 | Elu, 19 | Exp, 20 | LeakyRelu, 21 | Relu6, 22 | Silu, 23 | Gelu, 24 | HardSigmoid, 25 | Linear, 26 | Mish, 27 | ) 28 | from .arithmetic import ( 29 | Sqrt, 30 | Abs, 31 | Exp2, 32 | Log2, 33 | Log, 34 | Sin, 35 | Cos, 36 | Tan, 37 | Asin, 38 | Acos, 39 | Atan, 40 | Sinh, 41 | Cosh, 42 | Add, 43 | Mul, 44 | Sub, 45 | Div, 46 | Pow, 47 | ) 48 | from .losses import MSE, MAE, MAPE, MSLE 49 | 50 | from voodoo.constants import F32_MAX, UNARY_OP, BINARY_OP, OP_TUPLE, NU, NB 51 | from voodoo.utils.operator_codes import * 52 | 53 | 54 | @register_passable("trivial") 55 | struct KERNELS: 56 | @staticmethod 57 | fn get(code: Int) -> OP_TUPLE: 58 | if code == copy_code: 59 | return OP_TUPLE(Copy.fw, NB) 60 | elif code == copy_code + 1: 61 | return OP_TUPLE(Copy.bw, NB) 62 | elif code == reshape_code: 63 | return OP_TUPLE(Reshape.fw, NB) 64 | elif code == reshape_code + 1: 65 | return OP_TUPLE(Reshape.bw, NB) 66 | elif code == transp_code: 67 | return OP_TUPLE(Transpose.fw, NB) 68 | elif code == transp_code + 1: 69 | return OP_TUPLE(Transpose.bw, NB) 70 | elif code == sum_code: 71 | return OP_TUPLE(Sum.fw, NB) 72 | elif code == sum_code + 1: 73 | return OP_TUPLE(Sum.bw, NB) 74 | elif code == dropout_code: 75 | return OP_TUPLE(Dropout.fw, NB) 76 | elif code == dropout_code + 1: 77 | return OP_TUPLE(Dropout.bw, NB) 78 | elif code == mmul_code: 79 | return OP_TUPLE(NU, MMul.fw) 80 | elif code == mmul_code + 1: 81 | return OP_TUPLE(NU, MMul.bw) 82 | elif code == sqrt_code: 83 | return OP_TUPLE(Sqrt.fw, NB) 84 | elif code == sqrt_code + 1: 85 | return OP_TUPLE(Sqrt.bw, NB) 86 | elif code == abs_code: 87 | return OP_TUPLE(Abs.fw, NB) 88 | elif code == abs_code + 1: 89 | return OP_TUPLE(Abs.bw, NB) 90 | elif code == exp2_code: 91 | return OP_TUPLE(Exp2.fw, NB) 92 | elif code == exp2_code + 1: 93 | return OP_TUPLE(Exp2.bw, NB) 94 | elif code == log2_code: 95 | return OP_TUPLE(Log2.fw, NB) 96 | elif code == log2_code + 1: 97 | return OP_TUPLE(Log2.bw, NB) 98 | elif code == log_code: 99 | return OP_TUPLE(Log.fw, NB) 100 | elif code == log_code + 1: 101 | return OP_TUPLE(Log.bw, NB) 102 | elif code == sin_code: 103 | return OP_TUPLE(Sin.fw, NB) 104 | elif code == sin_code + 1: 105 | return OP_TUPLE(Sin.bw, NB) 106 | elif code == cos_code: 107 | return OP_TUPLE(Cos.fw, NB) 108 | elif code == cos_code + 1: 109 | return OP_TUPLE(Cos.bw, NB) 110 | elif code == tan_code: 111 | return OP_TUPLE(Tan.fw, NB) 112 | elif code == tan_code + 1: 113 | return OP_TUPLE(Tan.bw, NB) 114 
| elif code == asin_code: 115 | return OP_TUPLE(Asin.fw, NB) 116 | elif code == asin_code + 1: 117 | return OP_TUPLE(Asin.bw, NB) 118 | elif code == acos_code: 119 | return OP_TUPLE(Acos.fw, NB) 120 | elif code == acos_code + 1: 121 | return OP_TUPLE(Acos.bw, NB) 122 | elif code == atan_code: 123 | return OP_TUPLE(Atan.fw, NB) 124 | elif code == atan_code + 1: 125 | return OP_TUPLE(Atan.bw, NB) 126 | elif code == sinh_code: 127 | return OP_TUPLE(Sinh.fw, NB) 128 | elif code == sinh_code + 1: 129 | return OP_TUPLE(Sinh.bw, NB) 130 | elif code == cosh_code: 131 | return OP_TUPLE(Cosh.fw, NB) 132 | elif code == cosh_code + 1: 133 | return OP_TUPLE(Cosh.bw, NB) 134 | elif code == add_code: 135 | return OP_TUPLE(NU, Add.fw) 136 | elif code == add_code + 1: 137 | return OP_TUPLE(NU, Add.bw) 138 | elif code == mul_code: 139 | return OP_TUPLE(NU, Mul.fw) 140 | elif code == mul_code + 1: 141 | return OP_TUPLE(NU, Mul.bw) 142 | elif code == sub_code: 143 | return OP_TUPLE(NU, Sub.fw) 144 | elif code == sub_code + 1: 145 | return OP_TUPLE(NU, Sub.bw) 146 | elif code == div_code: 147 | return OP_TUPLE(NU, Div.fw) 148 | elif code == div_code + 1: 149 | return OP_TUPLE(NU, Div.bw) 150 | elif code == pow_code: 151 | return OP_TUPLE(NU, Pow.fw) 152 | elif code == pow_code + 1: 153 | return OP_TUPLE(NU, Pow.bw) 154 | elif code == mse_code: 155 | return OP_TUPLE(NU, MSE.fw) 156 | elif code == mse_code + 1: 157 | return OP_TUPLE(NU, MSE.bw) 158 | elif code == mae_code: 159 | return OP_TUPLE(NU, MAE.fw) 160 | elif code == mae_code + 1: 161 | return OP_TUPLE(NU, MAE.bw) 162 | elif code == mape_code: 163 | return OP_TUPLE(NU, MAPE.fw) 164 | elif code == mape_code + 1: 165 | return OP_TUPLE(NU, MAPE.bw) 166 | elif code == msle_code: 167 | return OP_TUPLE(NU, MSLE.fw) 168 | elif code == msle_code + 1: 169 | return OP_TUPLE(NU, MSLE.bw) 170 | elif code == relu_code: 171 | return OP_TUPLE(Relu[0.0, F32_MAX, 0.0].fw, NB) 172 | elif code == relu_code + 1: 173 | return OP_TUPLE(Relu[0.0, F32_MAX, 0.0].bw, NB) 174 | elif code == sigmoid_code: 175 | return OP_TUPLE(Sigmoid.fw, NB) 176 | elif code == sigmoid_code + 1: 177 | return OP_TUPLE(Sigmoid.bw, NB) 178 | elif code == softplus_code: 179 | return OP_TUPLE(Softplus.fw, NB) 180 | elif code == softplus_code + 1: 181 | return OP_TUPLE(Softplus.bw, NB) 182 | elif code == softsign_code: 183 | return OP_TUPLE(Softsign.fw, NB) 184 | elif code == softsign_code + 1: 185 | return OP_TUPLE(Softsign.bw, NB) 186 | elif code == tanh_code: 187 | return OP_TUPLE(Tanh.fw, NB) 188 | elif code == tanh_code + 1: 189 | return OP_TUPLE(Tanh.bw, NB) 190 | elif code == selu_code: 191 | return OP_TUPLE(Selu.fw, NB) 192 | elif code == selu_code + 1: 193 | return OP_TUPLE(Selu.bw, NB) 194 | elif code == elu_code: 195 | return OP_TUPLE(Elu[0.0].fw, NB) 196 | elif code == elu_code + 1: 197 | return OP_TUPLE(Elu[0.0].bw, NB) 198 | elif code == exp_code: 199 | return OP_TUPLE(Exp.fw, NB) 200 | elif code == exp_code + 1: 201 | return OP_TUPLE(Exp.bw, NB) 202 | elif code == lrelu_code: 203 | return OP_TUPLE(LeakyRelu[0.0].fw, NB) 204 | elif code == lrelu_code + 1: 205 | return OP_TUPLE(LeakyRelu[0.0].bw, NB) 206 | elif code == relu6_code: 207 | return OP_TUPLE(Relu6.fw, NB) 208 | elif code == relu6_code + 1: 209 | return OP_TUPLE(Relu6.bw, NB) 210 | elif code == silu_code: 211 | return OP_TUPLE(Silu.fw, NB) 212 | elif code == silu_code + 1: 213 | return OP_TUPLE(Silu.bw, NB) 214 | elif code == gelu_code: 215 | return OP_TUPLE(Gelu[0.0].fw, NB) 216 | elif code == gelu_code + 1: 217 | return 
OP_TUPLE(Gelu[0.0].bw, NB) 218 | elif code == h_sig_code: 219 | return OP_TUPLE(HardSigmoid.fw, NB) 220 | elif code == h_sig_code + 1: 221 | return OP_TUPLE(HardSigmoid.bw, NB) 222 | elif code == linear_code: 223 | return OP_TUPLE(Linear.fw, NB) 224 | elif code == linear_code + 1: 225 | return OP_TUPLE(Linear.bw, NB) 226 | elif code == mish_code: 227 | return OP_TUPLE(Mish.fw, NB) 228 | elif code == mish_code + 1: 229 | return OP_TUPLE(Mish.bw, NB) 230 | elif code == conv1d_code: 231 | return OP_TUPLE(NU, Conv1D.fw) 232 | elif code == conv1d_code + 1: 233 | return OP_TUPLE(NU, Conv1D.bw) 234 | elif code == conv2d_code: 235 | return OP_TUPLE(NU, Conv2D.fw) 236 | elif code == conv2d_code + 1: 237 | return OP_TUPLE(NU, Conv2D.bw) 238 | elif code == maxpool1d_code: 239 | return OP_TUPLE(MaxPool1D.fw, NB) 240 | elif code == maxpool1d_code + 1: 241 | return OP_TUPLE(MaxPool1D.bw, NB) 242 | elif code == maxpool2d_code: 243 | return OP_TUPLE(MaxPool2D.fw, NB) 244 | elif code == maxpool2d_code + 1: 245 | return OP_TUPLE(MaxPool2D.bw, NB) 246 | else: 247 | return OP_TUPLE(NU, NB) 248 | -------------------------------------------------------------------------------- /voodoo/autograd/kernels/losses.mojo: -------------------------------------------------------------------------------- 1 | from math import log, abs 2 | 3 | from voodoo.constants import EPSILON 4 | from voodoo.autograd.kernels.generics import GenericLoss 5 | 6 | 7 | trait Loss: 8 | ... 9 | 10 | 11 | @register_passable("trivial") 12 | struct MSE[](Loss): 13 | alias fw = GenericLoss[mse_error, mse_grad].fw 14 | alias bw = GenericLoss[mse_error, mse_grad].bw 15 | 16 | 17 | @register_passable("trivial") 18 | struct MAE[](Loss): 19 | alias fw = GenericLoss[mae_error, mae_grad].fw 20 | alias bw = GenericLoss[mae_error, mae_grad].bw 21 | 22 | 23 | @register_passable("trivial") 24 | struct MAPE[](Loss): 25 | alias fw = GenericLoss[mape_error, mape_grad].fw 26 | alias bw = GenericLoss[mape_error, mape_grad].bw 27 | 28 | 29 | @register_passable("trivial") 30 | struct MSLE[](Loss): 31 | alias fw = GenericLoss[msle_error, msle_grad].fw 32 | alias bw = GenericLoss[msle_error, msle_grad].bw 33 | 34 | 35 | fn mse_error[ 36 | NELTS: Int 37 | ](y_pred: SIMD[DType.float32, NELTS], y_true: SIMD[DType.float32, NELTS]) -> SIMD[ 38 | DType.float32, NELTS 39 | ]: 40 | # f(x, y) = (x - y)^2 41 | return (y_pred - y_true) ** 2.0 42 | 43 | 44 | fn mse_grad[ 45 | NELTS: Int 46 | ]( 47 | y_pred: SIMD[DType.float32, NELTS], 48 | y_true: SIMD[DType.float32, NELTS], 49 | cap: Float32, 50 | N: Int, 51 | ) -> SIMD[DType.float32, NELTS]: 52 | # f'(x, y) with respect to y = -2(x - y) 53 | return -2.0 * (y_pred - y_true) 54 | 55 | 56 | fn mae_error[ 57 | NELTS: Int 58 | ](y_pred: SIMD[DType.float32, NELTS], y_true: SIMD[DType.float32, NELTS]) -> SIMD[ 59 | DType.float32, NELTS 60 | ]: 61 | # f(x, y) = |x - y| 62 | return abs(y_pred - y_true) 63 | 64 | 65 | fn mae_grad[ 66 | NELTS: Int 67 | ]( 68 | y_pred: SIMD[DType.float32, NELTS], 69 | y_true: SIMD[DType.float32, NELTS], 70 | cap: Float32, 71 | N: Int, 72 | ) -> SIMD[DType.float32, NELTS]: 73 | # f'(x, y) with respect to y = -1 if x > y else 1 74 | return (y_pred > y_true).select(Float32(-1.0), 1.0) 75 | 76 | 77 | fn mape_error[ 78 | NELTS: Int 79 | ](y_pred: SIMD[DType.float32, NELTS], y_true: SIMD[DType.float32, NELTS]) -> SIMD[ 80 | DType.float32, NELTS 81 | ]: 82 | # f(x, y) = |x - y| / y 83 | return abs(y_pred - y_true) / (y_true + EPSILON) 84 | 85 | 86 | fn mape_grad[ 87 | NELTS: Int 88 | ]( 89 | y_pred: 
SIMD[DType.float32, NELTS], 90 | y_true: SIMD[DType.float32, NELTS], 91 | cap: Float32, 92 | N: Int, 93 | ) -> SIMD[DType.float32, NELTS]: 94 | # f'(x, y) with respect to y = -1 if x > y else 1 95 | return (y_pred > y_true).select[DType.float32](-1.0, 1.0) 96 | 97 | 98 | fn msle_error[ 99 | NELTS: Int 100 | ](y_pred: SIMD[DType.float32, NELTS], y_true: SIMD[DType.float32, NELTS]) -> SIMD[ 101 | DType.float32, NELTS 102 | ]: 103 | # f(x, y) = (log(x + 1) - log(y + 1))^2 104 | var y_pred_clipped = (y_pred > 0.0).select[DType.float32](y_pred, 0.0) 105 | var y_true_clipped = (y_true > 0.0).select[DType.float32](y_true, 0.0) 106 | return (log(y_pred_clipped + Float32(1.0)) - log(y_true_clipped + Float32(1.0))) * ( 107 | log(y_pred_clipped + Float32(1.0)) - log(y_true_clipped + Float32(1.0)) 108 | ) 109 | 110 | 111 | fn msle_grad[ 112 | NELTS: Int 113 | ]( 114 | y_pred: SIMD[DType.float32, NELTS], 115 | y_true: SIMD[DType.float32, NELTS], 116 | cap: Float32, 117 | N: Int, 118 | ) -> SIMD[DType.float32, NELTS]: 119 | # f'(x, y) with respect to y = -2(log(x + 1) - log(y + 1)) / (y + 1) 120 | var y_pred_clipped = (y_pred > 0.0).select[DType.float32](y_pred, 0.0) 121 | var y_true_clipped = (y_true > 0.0).select[DType.float32](y_true, 0.0) 122 | return ( 123 | -Float32(2.0) 124 | * (log(y_pred_clipped + Float32(1.0)) - log(y_true_clipped + Float32(1.0))) 125 | / (y_true_clipped + Float32(1.0)) 126 | ) 127 | -------------------------------------------------------------------------------- /voodoo/autograd/kernels/matmul.mojo: -------------------------------------------------------------------------------- 1 | from algorithm import vectorize 2 | from math import max, min 3 | 4 | from voodoo.autograd import Node 5 | from voodoo.utils import recursive_broadcast 6 | from voodoo.constants import PREFETCH_READ, PREFETCH_WRITE, F32_MAX, NELTS 7 | 8 | 9 | alias bw_b_tile_size = 2 10 | 11 | 12 | @register_passable("trivial") 13 | struct MMul: 14 | @staticmethod 15 | fn fw(c: Node, a: Node, b: Node): 16 | recursive_broadcast[Self.kernel_mmul_fw, False](c, a, b) 17 | 18 | @staticmethod 19 | fn bw(c: Node, a: Node, b: Node): 20 | if not a.get_is_single(): 21 | recursive_broadcast[Self.kernel_mmul_bw_a, False](c, a, b) 22 | if not b.get_is_single(): 23 | recursive_broadcast[Self.kernel_mmul_bw_b, False](c, a, b) 24 | 25 | @staticmethod 26 | fn kernel_mmul_fw( 27 | c: Node, a: Node, b: Node, a_index: Int, b_index: Int, c_index: Int, depth: Int 28 | ) -> None: 29 | var a_num_dims = a.get_num_dims() 30 | var b_num_dims = b.get_num_dims() 31 | 32 | var M = a.get_shape()[a_num_dims - 2] 33 | var K = b.get_shape()[b_num_dims - 2] 34 | var N = c.get_shape()[c.get_num_dims() - 1] 35 | 36 | var offset_a = a_index * M * a.get_shape()[a_num_dims - 1] 37 | var offset_b = b_index * K * b.get_shape()[b_num_dims - 1] 38 | var offset_c = c_index * N * N 39 | 40 | var a_data = a.get_data() 41 | var b_data = b.get_data() 42 | var c_data = c.get_data() 43 | 44 | DTypePointer.prefetch[PREFETCH_READ](a_data) 45 | DTypePointer.prefetch[PREFETCH_READ](b_data) 46 | DTypePointer.prefetch[PREFETCH_READ](c_data) 47 | DTypePointer.prefetch[PREFETCH_WRITE](c_data) 48 | 49 | for m in range(0, M): 50 | var start_offset_a = offset_a + m * K 51 | var start_offset_c = offset_c + m * N 52 | for kb in range(0, K, NELTS): 53 | for k in range(kb, min(kb + NELTS, K)): 54 | var start_offset_b = offset_b + k * N 55 | var a_scalar = a_data.load(start_offset_a + k) 56 | 57 | @parameter 58 | fn dot_fw[NELTS: Int](n: Int): 59 | c_data.simd_store[NELTS]( 60 | 
start_offset_c + n, 61 | b_data.simd_load[NELTS](start_offset_b + n).fma( 62 | a_scalar, 63 | c_data.simd_load[NELTS](start_offset_c + n), 64 | ), 65 | ) 66 | 67 | vectorize[dot_fw, NELTS](N) 68 | 69 | @staticmethod 70 | fn kernel_mmul_bw_a( 71 | c: Node, a: Node, b: Node, a_index: Int, b_index: Int, c_index: Int, depth: Int 72 | ) -> None: 73 | var a_num_dims = a.get_num_dims() 74 | var b_num_dims = b.get_num_dims() 75 | 76 | var M = a.get_shape()[a_num_dims - 2] 77 | var K = b.get_shape()[b_num_dims - 2] 78 | var N = c.get_shape()[c.get_num_dims() - 1] 79 | 80 | var offset_a = a_index * M * a.get_shape()[a_num_dims - 1] 81 | var offset_b = b_index * K * b.get_shape()[b_num_dims - 1] 82 | var offset_c = c_index * N * N 83 | 84 | var a_grad = a.get_grad() 85 | var b_data = b.get_data() 86 | var c_grad = c.get_grad() 87 | 88 | DTypePointer.prefetch[PREFETCH_READ](a_grad) 89 | DTypePointer.prefetch[PREFETCH_WRITE](a_grad) 90 | DTypePointer.prefetch[PREFETCH_READ](b_data) 91 | DTypePointer.prefetch[PREFETCH_READ](c_grad) 92 | 93 | for m in range(0, M): 94 | var start_offset_a = offset_a + m * K 95 | var start_offset_c = offset_c + m * N 96 | for nb in range(0, N, NELTS): 97 | for n in range(nb, min(nb + NELTS, N)): 98 | var start_offset_b = offset_b + n * N 99 | var c_grad_scalar = c_grad.load(start_offset_c + n) 100 | 101 | @parameter 102 | fn dot_bw[NELTS: Int](n: Int): 103 | a_grad.simd_store[NELTS]( 104 | start_offset_a + n, 105 | b_data.simd_load[NELTS](start_offset_b + n).fma( 106 | c_grad_scalar, 107 | a_grad.simd_load[NELTS](start_offset_a + n), 108 | ), 109 | ) 110 | 111 | vectorize[dot_bw, NELTS](K) 112 | 113 | @staticmethod 114 | fn kernel_mmul_bw_b( 115 | c: Node, a: Node, b: Node, a_index: Int, b_index: Int, c_index: Int, depth: Int 116 | ) -> None: 117 | var a_num_dims = a.get_num_dims() 118 | var b_num_dims = b.get_num_dims() 119 | 120 | var M = a.get_shape()[a_num_dims - 2] 121 | var K = b.get_shape()[b_num_dims - 2] 122 | var N = c.get_shape()[c.get_num_dims() - 1] 123 | 124 | var offset_a = a_index * M * a.get_shape()[a_num_dims - 1] 125 | var offset_b = b_index * K * b.get_shape()[b_num_dims - 1] 126 | var offset_c = c_index * N * N 127 | 128 | var a_data = a.get_data() 129 | var b_grad = b.get_grad() 130 | var c_grad = c.get_grad() 131 | 132 | DTypePointer.prefetch[PREFETCH_READ](a_data) 133 | DTypePointer.prefetch[PREFETCH_READ](b_grad) 134 | DTypePointer.prefetch[PREFETCH_WRITE](b_grad) 135 | DTypePointer.prefetch[PREFETCH_READ](c_grad) 136 | 137 | for k in range(0, K): 138 | var start_offset_a = offset_a + k 139 | var start_offset_b = offset_b + k * N 140 | 141 | for m in range(M): 142 | var start_offset_c = offset_c + m * N 143 | var a_scalar = a_data.load(start_offset_a + m * K) 144 | 145 | @parameter 146 | fn dot_bw_b[NELTS: Int](n: Int): 147 | b_grad.simd_store[NELTS]( 148 | start_offset_b + n, 149 | c_grad.simd_load[NELTS](start_offset_c + n).fma( 150 | a_scalar, b_grad.simd_load[NELTS](start_offset_b + n) 151 | ), 152 | ) 153 | 154 | vectorize[dot_bw_b, NELTS](N) 155 | -------------------------------------------------------------------------------- /voodoo/autograd/kernels/maxpool.mojo: -------------------------------------------------------------------------------- 1 | from algorithm import vectorize 2 | from math import max 3 | 4 | from voodoo.autograd import Node 5 | from voodoo.constants import PREFETCH_READ, PREFETCH_WRITE, F32_MAX, NELTS 6 | 7 | 8 | trait MaxPool: 9 | ... 
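The pooling kernels below read their hyper-parameters (kernel size, stride, padding) from the output node's other_params vector and rely on the usual sliding-window output-size relation, the same one the MaxPool1D/MaxPool2D layer wrappers later in this tree use to compute their output TensorShape. A minimal standalone sketch of that arithmetic (a hypothetical helper, not part of the repository):

fn pool_output_size(input_size: Int, kernel_size: Int, stride: Int, padding: Int) -> Int:
    # number of window positions: floor((in - k + 2p) / s) + 1
    return (input_size - kernel_size + 2 * padding) // stride + 1

For example, a width-28 input pooled with a width-2 kernel, stride 2 and no padding gives (28 - 2 + 0) // 2 + 1 = 14 output positions, which is the size the kernels read back from c.get_shape().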
10 | 11 | 12 | @register_passable("trivial") 13 | struct MaxPool1D(MaxPool): 14 | @staticmethod 15 | fn fw(c: Node, a: Node): 16 | var params = c.get_other_params() 17 | 18 | var kernel_width = params[0] 19 | var stride = params[1] 20 | var padding = params[2] 21 | 22 | var batches = a.get_shape()[0] 23 | var channels = a.get_shape()[1] 24 | var input_width = a.get_shape()[2] 25 | 26 | var output_width = c.get_shape()[2] 27 | 28 | DTypePointer.prefetch[PREFETCH_READ](a.get_data()) 29 | DTypePointer.prefetch[PREFETCH_WRITE](c.get_data()) 30 | 31 | for batch in range(batches): 32 | var batch_offset = batch * channels * input_width 33 | var output_batch_offset = batch * channels * output_width 34 | for channel in range(channels): 35 | var channel_offset = channel * input_width 36 | var output_channel_offset = channel * output_width 37 | for output_pos in range(output_width): 38 | var input_pos = output_pos * stride - padding 39 | var max_value = -F32_MAX 40 | 41 | @parameter 42 | fn fw_vec[NELTS: Int](kernel_pos: Int): 43 | var input_index = channel_offset + input_pos + kernel_pos 44 | if input_index >= 0 and input_index < input_width: 45 | var value = a.get_data().simd_load[NELTS]( 46 | batch_offset + input_index 47 | ) 48 | max_value = max(max_value, value.reduce_max()) 49 | 50 | vectorize[fw_vec, NELTS](kernel_width) 51 | c.get_data().store( 52 | output_batch_offset + output_channel_offset + output_pos, 53 | max_value, 54 | ) 55 | 56 | @staticmethod 57 | fn bw(c: Node, a: Node): 58 | var params = c.get_other_params() 59 | 60 | var kernel_width = params[0] 61 | var stride = params[1] 62 | var padding = params[2] 63 | 64 | var batches = a.get_shape()[0] 65 | var channels = a.get_shape()[1] 66 | var input_width = a.get_shape()[2] 67 | 68 | var output_width = c.get_shape()[2] 69 | 70 | DTypePointer.prefetch[PREFETCH_READ](a.get_data()) 71 | DTypePointer.prefetch[PREFETCH_READ](c.get_data()) 72 | DTypePointer.prefetch[PREFETCH_READ](c.get_grad()) 73 | DTypePointer.prefetch[PREFETCH_WRITE](a.get_grad()) 74 | 75 | for batch in range(batches): 76 | var batch_offset = batch * channels * input_width 77 | var output_batch_offset = batch * channels * output_width 78 | for channel in range(channels): 79 | var channel_offset = channel * input_width 80 | var output_channel_offset = channel * output_width 81 | for output_pos in range(output_width): 82 | var input_pos = output_pos * stride - padding 83 | var output_index = output_batch_offset + output_channel_offset + output_pos 84 | var max_value = c.get_data()[output_index] 85 | 86 | @parameter 87 | fn bw_vec[NELTS: Int](kernel_pos: Int): 88 | var input_index = channel_offset + input_pos + kernel_pos 89 | if input_index >= 0 and input_index < input_width: 90 | var value = a.get_data().simd_load[NELTS]( 91 | batch_offset + input_index 92 | ) 93 | var grad = c.get_grad().simd_load[NELTS](output_index) 94 | var grad_value = (value == max_value).select(grad, 0) 95 | a.get_grad().simd_store[NELTS]( 96 | batch_offset + input_index, grad_value 97 | ) 98 | 99 | vectorize[bw_vec, NELTS](kernel_width) 100 | 101 | var grad = c.get_grad()[output_index] 102 | a.get_grad().store(batch_offset + input_pos, grad.reduce_add()) 103 | 104 | 105 | @register_passable("trivial") 106 | struct MaxPool2D(MaxPool): 107 | @staticmethod 108 | fn fw(c: Node, a: Node): 109 | var params = c.get_other_params() 110 | 111 | var kernel_width = params[0] 112 | var kernel_height = params[1] 113 | var stride = params[2] 114 | var padding = params[3] 115 | 116 | var batches = a.get_shape()[0] 
117 | var channels = a.get_shape()[1] 118 | var input_height = a.get_shape()[2] 119 | var input_width = a.get_shape()[3] 120 | 121 | var output_height = c.get_shape()[2] 122 | var output_width = c.get_shape()[3] 123 | 124 | DTypePointer.prefetch[PREFETCH_READ](a.get_data()) 125 | DTypePointer.prefetch[PREFETCH_WRITE](c.get_data()) 126 | 127 | for batch in range(batches): 128 | var batch_offset = batch * channels * input_height * input_width 129 | var output_batch_offset = batch * channels * output_height * output_width 130 | for channel in range(channels): 131 | var channel_offset = channel * input_height * input_width 132 | var output_channel_offset = channel * output_height * output_width 133 | for output_y in range(output_height): 134 | var input_y = output_y * stride - padding 135 | for output_x in range(output_width): 136 | var input_x = output_x * stride - padding 137 | var max_value = -F32_MAX 138 | 139 | for kernel_y in range(kernel_height): 140 | 141 | @parameter 142 | fn fw_vec[NELTS: Int](kernel_x: Int): 143 | var input_index = channel_offset + input_y + kernel_y * input_width + input_x + kernel_x 144 | if ( 145 | input_index >= 0 146 | and input_index < input_height * input_width 147 | ): 148 | var value = a.get_data().simd_load[NELTS]( 149 | batch_offset + input_index 150 | ) 151 | max_value = max(max_value, value.reduce_max()) 152 | 153 | vectorize[fw_vec, NELTS](kernel_width) 154 | c.get_data().store( 155 | output_batch_offset 156 | + output_channel_offset 157 | + output_y * output_width 158 | + output_x, 159 | max_value, 160 | ) 161 | 162 | @staticmethod 163 | fn bw(c: Node, a: Node): 164 | var params = c.get_other_params() 165 | 166 | var kernel_width = params[0] 167 | var kernel_height = params[1] 168 | var stride = params[2] 169 | var padding = params[3] 170 | 171 | var batches = a.get_shape()[0] 172 | var channels = a.get_shape()[1] 173 | var input_height = a.get_shape()[2] 174 | var input_width = a.get_shape()[3] 175 | 176 | var output_height = c.get_shape()[2] 177 | var output_width = c.get_shape()[3] 178 | 179 | DTypePointer.prefetch[PREFETCH_READ](a.get_data()) 180 | DTypePointer.prefetch[PREFETCH_READ](c.get_data()) 181 | DTypePointer.prefetch[PREFETCH_READ](c.get_grad()) 182 | DTypePointer.prefetch[PREFETCH_WRITE](a.get_grad()) 183 | 184 | for batch in range(batches): 185 | var batch_offset = batch * channels * input_height * input_width 186 | var output_batch_offset = batch * channels * output_height * output_width 187 | for channel in range(channels): 188 | var channel_offset = channel * input_height * input_width 189 | var output_channel_offset = channel * output_height * output_width 190 | for output_y in range(output_height): 191 | var input_y = output_y * stride - padding 192 | for output_x in range(output_width): 193 | var input_x = output_x * stride - padding 194 | var output_index = ( 195 | output_batch_offset 196 | + output_channel_offset 197 | + output_y * output_width 198 | + output_x 199 | ) 200 | var max_value = c.get_data()[output_index] 201 | 202 | for kernel_y in range(kernel_height): 203 | 204 | @parameter 205 | fn bw_vec[NELTS: Int](kernel_x: Int): 206 | var input_index = channel_offset + input_y + kernel_y * input_width + input_x + kernel_x 207 | if ( 208 | input_index >= 0 209 | and input_index < input_height * input_width 210 | ): 211 | var value = a.get_data().simd_load[NELTS]( 212 | batch_offset + input_index 213 | ) 214 | var grad = c.get_grad().simd_load[NELTS]( 215 | output_index 216 | ) 217 | var grad_value = (value == 
max_value).select( 218 | grad, 0 219 | ) 220 | a.get_grad().simd_store[NELTS]( 221 | batch_offset + input_index, grad_value 222 | ) 223 | 224 | vectorize[bw_vec, NELTS](kernel_width) 225 | 226 | var grad = c.get_grad()[output_index] 227 | a.get_grad().store( 228 | batch_offset + input_y * input_width + input_x, 229 | grad.reduce_add(), 230 | ) 231 | -------------------------------------------------------------------------------- /voodoo/autograd/kernels/operations.mojo: -------------------------------------------------------------------------------- 1 | from random import random_float64 2 | from algorithm import vectorize 3 | 4 | from voodoo.autograd import Node 5 | from voodoo.constants import NELTS 6 | 7 | 8 | trait Operation: 9 | @staticmethod 10 | fn fw(node: Node, parent1: Node): 11 | ... 12 | 13 | @staticmethod 14 | fn bw(node: Node, parent1: Node): 15 | ... 16 | 17 | 18 | @register_passable("trivial") 19 | struct Copy(Operation): 20 | @staticmethod 21 | fn fw(node: Node, parent1: Node): 22 | @parameter 23 | fn vectorized_copy[NELTS: Int](i: Int): 24 | node.get_data().simd_store[NELTS](i, parent1.get_data().simd_load[NELTS](i)) 25 | 26 | vectorize[vectorized_copy, NELTS](node.get_cap()) 27 | 28 | @staticmethod 29 | fn bw(node: Node, parent1: Node): 30 | @parameter 31 | fn vectorized_copy_bw[NELTS: Int](i: Int): 32 | parent1.get_grad().simd_store[NELTS]( 33 | i, parent1.get_grad().simd_load[NELTS](i) 34 | ) 35 | 36 | vectorize[vectorized_copy_bw, NELTS](node.get_cap()) 37 | 38 | 39 | @register_passable("trivial") 40 | struct Sum(Operation): 41 | @staticmethod 42 | fn fw(node: Node, parent1: Node): 43 | var sum: Float32 = 0.0 44 | 45 | @parameter 46 | fn vectorized_sum[NELTS: Int](i: Int): 47 | sum += parent1.get_data().simd_load[NELTS](i).reduce_add() 48 | 49 | vectorize[vectorized_sum, NELTS](parent1.get_cap()) 50 | node.get_data().store(0, sum) 51 | 52 | @staticmethod 53 | fn bw(node: Node, parent1: Node): 54 | @parameter 55 | fn vectorized_sum_bw[NELTS: Int](i: Int): 56 | parent1.get_grad().simd_store[NELTS]( 57 | i, 58 | parent1.get_grad().simd_load[NELTS](i) + node.get_grad()[0], 59 | ) 60 | 61 | vectorize[vectorized_sum_bw, NELTS](parent1.get_cap()) 62 | 63 | 64 | @register_passable("trivial") 65 | struct Reshape(Operation): 66 | @staticmethod 67 | fn fw(node: Node, parent1: Node): 68 | for s in range(node.get_cap() // parent1.get_cap()): 69 | var offset = s * parent1.get_cap() 70 | 71 | @parameter 72 | fn vectorized_reshape[NELTS: Int](i: Int): 73 | node.get_data().simd_store[NELTS]( 74 | i, parent1.get_data().simd_load[NELTS](i) 75 | ) 76 | 77 | vectorize[vectorized_reshape, NELTS](parent1.get_cap()) 78 | 79 | @staticmethod 80 | fn bw(node: Node, parent1: Node): 81 | for s in range(node.get_cap() // parent1.get_cap()): 82 | var offset = s * parent1.get_cap() 83 | 84 | @parameter 85 | fn vectorized_reshape[NELTS: Int](i: Int): 86 | parent1.get_grad().simd_store[NELTS]( 87 | i, 88 | parent1.get_grad().simd_load[NELTS](i) 89 | + node.get_grad().simd_load[NELTS](i), 90 | ) 91 | 92 | vectorize[vectorized_reshape, NELTS](parent1.get_cap()) 93 | 94 | 95 | @register_passable("trivial") 96 | struct Transpose(Operation): 97 | @staticmethod 98 | fn fw(node: Node, parent1: Node): 99 | var num_dims = parent1.get_num_dims() 100 | var M = parent1.get_shape()[num_dims - 2] 101 | var N = parent1.get_shape()[num_dims - 1] 102 | for s in range(node.get_cap() // (M * N)): 103 | var offset = s * M * N 104 | for i in range(M): 105 | 106 | @parameter 107 | fn vectorized_transp[NELTS: Int](j: Int): 
108 | node.get_data().simd_store[NELTS]( 109 | offset + j * M + i, 110 | parent1.get_data().simd_load[NELTS](offset + i * N + j), 111 | ) 112 | 113 | vectorize[vectorized_transp, NELTS](N) 114 | 115 | @staticmethod 116 | fn bw(node: Node, parent1: Node): 117 | var num_dims = parent1.get_num_dims() 118 | var M = parent1.get_shape()[num_dims - 2] 119 | var N = parent1.get_shape()[num_dims - 1] 120 | for s in range(node.get_cap() // (M * N)): 121 | var offset = s * M * N 122 | for i in range(M): 123 | 124 | @parameter 125 | fn vectorized_transp_bw[NELTS: Int](j: Int): 126 | parent1.get_grad().simd_store[NELTS]( 127 | offset + j * M + i, 128 | parent1.get_grad().simd_load[NELTS](offset + j * M + i) 129 | + node.get_grad().simd_load[NELTS](offset + i * N + j), 130 | ) 131 | 132 | vectorize[vectorized_transp_bw, NELTS](N) 133 | 134 | 135 | @register_passable("trivial") 136 | struct Dropout(Operation): 137 | @staticmethod 138 | fn fw(node: Node, parent1: Node): 139 | var params = node.get_other_params() 140 | var keep_prob = 1 - params[0] / 1000000.0 141 | var scale = 1.0 / keep_prob 142 | 143 | @parameter 144 | fn vectorized_dropout[NELTS: Int](i: Int): 145 | var rand = random_float64() 146 | node.get_data().simd_store[NELTS]( 147 | i, 148 | (rand < keep_prob).select[DType.float32](1.0, 0.0) 149 | * parent1.get_data().simd_load[NELTS](i), 150 | ) 151 | 152 | vectorize[vectorized_dropout, NELTS](node.get_cap()) 153 | 154 | @staticmethod 155 | fn bw(node: Node, parent1: Node): 156 | var params = node.get_other_params() 157 | var keep_prob = 1 - params[0] / 1000000.0 158 | var scale = 1.0 / keep_prob 159 | 160 | @parameter 161 | fn vectorized_dropout_bw[NELTS: Int](i: Int): 162 | var previous = node.get_data().simd_load[NELTS](i) 163 | node.get_grad().simd_store[NELTS]( 164 | i, 165 | (previous == 0.0).select[DType.float32]( 166 | parent1.get_grad().simd_load[NELTS](i) * scale, 0.0 167 | ), 168 | ) 169 | 170 | vectorize[vectorized_dropout_bw, NELTS](node.get_cap()) 171 | -------------------------------------------------------------------------------- /voodoo/autograd/node.mojo: -------------------------------------------------------------------------------- 1 | from math import sin, cos, sqrt, log, iota 2 | from random import rand, seed 3 | from memory import memset 4 | from algorithm import sum 5 | 6 | from voodoo.utils import Vector, warn 7 | 8 | 9 | @register_passable("trivial") 10 | struct Node: 11 | var _id_ptr: Pointer[Int] 12 | var _data_id_ptr: Pointer[Int] 13 | var _grad_id_ptr: Pointer[Int] 14 | var _data_ptr: Pointer[DTypePointer[DType.float32]] 15 | var _parents: Vector[Int] 16 | var _children: Vector[Int] 17 | var _dependencies_ptr: Pointer[Int] # Has to be a pointer 18 | var _is_static: Bool 19 | var _computed_ptr: Pointer[Bool] # Has to be a pointer 20 | var _grad_computed_ptr: Pointer[Bool] # Has to be a pointer 21 | var _operator_id: Int 22 | var _tmp_visited: Bool 23 | var _checkpoint: Bool 24 | var _is_single: Bool 25 | var _cap: Int 26 | var _num_dims: Int 27 | var _shape: Vector[Int] 28 | var _strides: Vector[Int] 29 | var _other_params: Vector[Int] 30 | 31 | fn __init__( 32 | id: Int, 33 | shape: Vector[Int], 34 | is_static: Bool = True, 35 | other_params: Vector[Int] = Vector[Int](), 36 | checkpoint: Bool = False, 37 | operator_id: Int = -1, 38 | is_single: Bool = False, 39 | ) raises -> Self: 40 | var id_ptr = Pointer[Int].alloc(1) 41 | id_ptr.store(id) 42 | var data_id_ptr = Pointer[Int].alloc(1) 43 | data_id_ptr.store(-1) 44 | var grad_id_ptr = Pointer[Int].alloc(1) 45 | 
grad_id_ptr.store(-1) 46 | var data_ptr = Pointer[DTypePointer[DType.float32]].alloc(2) 47 | var data = DTypePointer[DType.float32].get_null() 48 | var grad = DTypePointer[DType.float32].get_null() 49 | data_ptr.store(0, data) 50 | data_ptr.store(1, grad) 51 | var parents = Vector[Int]() 52 | var children = Vector[Int]() 53 | var dependencies_ptr = Pointer[Int].alloc(1) 54 | dependencies_ptr.store(0) 55 | var computed_ptr = Pointer[Bool].alloc(1) 56 | computed_ptr.store(is_static) 57 | var grad_computed_ptr = Pointer[Bool].alloc(1) 58 | grad_computed_ptr.store(False) 59 | 60 | var cap = 1 61 | for i in range(len(shape)): 62 | cap *= shape[i] if shape[i] > 0 else 1 63 | 64 | var strides = Vector[Int](len(shape)) 65 | strides[len(shape) - 1] = 1 66 | for i in range(len(shape) - 1): 67 | strides[len(shape) - i - 2] = ( 68 | strides[len(shape) - i - 1] * shape[len(shape) - i - 1] 69 | ) 70 | 71 | return Node { 72 | _id_ptr: id_ptr, 73 | _data_id_ptr: data_id_ptr, 74 | _grad_id_ptr: grad_id_ptr, 75 | _data_ptr: data_ptr, 76 | _parents: parents, 77 | _children: children, 78 | _dependencies_ptr: dependencies_ptr, 79 | _is_static: is_static, 80 | _computed_ptr: computed_ptr, 81 | _grad_computed_ptr: grad_computed_ptr, 82 | _operator_id: operator_id, 83 | _tmp_visited: False, 84 | _checkpoint: checkpoint, 85 | _is_single: is_single, 86 | _cap: cap, 87 | _num_dims: len(shape), 88 | _shape: shape, 89 | _strides: strides, 90 | _other_params: other_params, 91 | } 92 | 93 | fn get_id(self) -> Int: 94 | return self._id_ptr.load() 95 | 96 | fn set_id(self, id: Int): 97 | self._id_ptr.store(id) 98 | 99 | fn get_data_id(self) -> Int: 100 | return self._data_id_ptr.load() 101 | 102 | fn set_data_id(self, id: Int): 103 | self._data_id_ptr.store(id) 104 | 105 | fn get_grad_id(self) -> Int: 106 | return self._grad_id_ptr.load() 107 | 108 | fn set_grad_id(self, id: Int): 109 | self._grad_id_ptr.store(id) 110 | 111 | fn get_data(self) -> DTypePointer[DType.float32]: 112 | return self._data_ptr[0] 113 | 114 | fn set_data(self, data: DTypePointer[DType.float32]): 115 | self._data_ptr.store(0, data) 116 | 117 | fn get_grad(self) -> DTypePointer[DType.float32]: 118 | return self._data_ptr[1] 119 | 120 | fn set_grad(self, grad: DTypePointer[DType.float32]): 121 | self._data_ptr.store(1, grad) 122 | 123 | fn get_parents(self) -> Vector[Int]: 124 | return self._parents 125 | 126 | fn push_back_parent(inout self, parent: Int): 127 | self._parents.push_back(parent) 128 | 129 | fn clear_parents(inout self): 130 | self._parents.clear() 131 | 132 | fn get_children(self) -> Vector[Int]: 133 | return self._children 134 | 135 | fn push_back_child(inout self, child: Int): 136 | self._children.push_back(child) 137 | 138 | fn clear_children(inout self): 139 | self._children.clear() 140 | 141 | fn get_dependencies(self) -> Int: 142 | return self._dependencies_ptr.load() 143 | 144 | fn set_dependencies(self, dependencies: Int): 145 | self._dependencies_ptr.store(dependencies) 146 | 147 | fn get_is_static(self) -> Bool: 148 | return self._is_static 149 | 150 | fn set_is_static(inout self, is_static: Bool): 151 | self._is_static = is_static 152 | 153 | fn get_computed(self) -> Bool: 154 | return self._computed_ptr.load() 155 | 156 | fn set_computed(self, computed: Bool): 157 | self._computed_ptr.store(computed) 158 | 159 | fn get_grad_computed(self) -> Bool: 160 | return self._grad_computed_ptr.load() 161 | 162 | fn set_grad_computed(self, grad_computed: Bool): 163 | self._grad_computed_ptr.store(grad_computed) 164 | 165 | fn 
get_operator_id(self) -> Int: 166 | return self._operator_id 167 | 168 | fn set_operator_id(inout self, operator_id: Int): 169 | self._operator_id = operator_id 170 | 171 | fn get_grad_operator_id(self) -> Int: 172 | return self._operator_id + 1 173 | 174 | fn get_tmp_visited(self) -> Bool: 175 | return self._tmp_visited 176 | 177 | fn set_tmp_visited(inout self, tmp_visited: Bool): 178 | self._tmp_visited = tmp_visited 179 | 180 | fn get_checkpoint(self) -> Bool: 181 | return self._checkpoint 182 | 183 | fn set_checkpoint(inout self, checkpoint: Bool): 184 | self._checkpoint = checkpoint 185 | 186 | fn get_is_single(self) -> Bool: 187 | return self._is_single 188 | 189 | fn set_is_single(inout self, is_single: Bool): 190 | self._is_single = is_single 191 | 192 | fn get_cap(self) -> Int: 193 | return self._cap 194 | 195 | fn get_num_dims(self) -> Int: 196 | return self._num_dims 197 | 198 | fn get_shape(self) -> Vector[Int]: 199 | return self._shape 200 | 201 | fn get_strides(self) -> Vector[Int]: 202 | return self._strides 203 | 204 | fn get_other_params(self) -> Vector[Int]: 205 | return self._other_params 206 | 207 | fn is_zero(self) -> Bool: 208 | return sum(Buffer[DType.float32](self._data_ptr[0], self._cap)) == 0.0 209 | 210 | fn fill(self, val: Float32): 211 | for i in range(self._cap): 212 | self._data_ptr[0].store(i, val) 213 | 214 | fn fill_incr(self): 215 | iota(self._data_ptr[0], self._cap) 216 | 217 | fn fill_grad(self, val: Float32): 218 | for i in range(self._cap): 219 | self._data_ptr[1].store(i, val) 220 | 221 | fn grad_fill_incr(self): 222 | iota(self._data_ptr[1], self._cap) 223 | 224 | fn free(self): 225 | self._id_ptr.free() 226 | self._data_id_ptr.free() 227 | self._grad_id_ptr.free() 228 | self._data_ptr[0].free() 229 | self._data_ptr[1].free() 230 | self._data_ptr.free() 231 | self._parents.free() 232 | self._children.free() 233 | self._dependencies_ptr.free() 234 | self._computed_ptr.free() 235 | self._grad_computed_ptr.free() 236 | self._shape.free() 237 | self._strides.free() 238 | self._other_params.free() 239 | 240 | fn print(self, accuracy: Int = 6) raises: 241 | var row: Int = self._shape[self._num_dims - 2] 242 | var cols: Int = self._shape[self._num_dims - 1] 243 | var col_strides: Int = (self._strides[0] * self._shape[0]) // cols 244 | print(" ") 245 | var times = 1 246 | if self._grad_computed_ptr.load() and self._grad_id_ptr.load() != -1: 247 | times = 2 248 | print_no_newline(" 10 and i > 4 and i < col_strides - 5: 251 | if i == 5: 252 | print(" ... ") 253 | continue 254 | else: 255 | if i > 0: 256 | print_no_newline(" ") 257 | else: 258 | print_no_newline("[ ") 259 | 260 | var indent = 0 261 | for d in range(self._num_dims - 1): 262 | if cols * i % self._strides[d] == 0: 263 | print_no_newline("[ ") 264 | indent += 1 265 | else: 266 | print_no_newline(" ") 267 | 268 | for j in range(cols): 269 | if cols > 10 and j >= 3 and j < cols - 3: 270 | if j == 3: 271 | print_no_newline("... 
, ") 272 | continue 273 | else: 274 | var idx = cols * i + j 275 | print_no_newline( 276 | String(self._data_ptr[0][idx])[:accuracy] if self._data_ptr[ 277 | 0 278 | ][idx] 279 | != 0.0 else String(0.000)[:accuracy] 280 | ) 281 | if j != cols - 1: 282 | print_no_newline(", ") 283 | 284 | for d in range(self._num_dims - 2, -1, -1): 285 | if cols * (i + 1) % self._strides[d] == 0: 286 | print_no_newline(" ]") 287 | 288 | if i < col_strides - 1: 289 | print_no_newline(", ") 290 | put_new_line() 291 | else: 292 | print_no_newline(" ], shape: [") 293 | for i in range(self._num_dims): 294 | print_no_newline(self._shape[i]) 295 | if i < self._num_dims - 1: 296 | print_no_newline(",") 297 | print_no_newline("] ") 298 | print_no_newline(">") 299 | print(" ") 300 | -------------------------------------------------------------------------------- /voodoo/constants.mojo: -------------------------------------------------------------------------------- 1 | from math.limit import inf 2 | from sys.intrinsics import PrefetchOptions 3 | 4 | from voodoo.autograd import Node 5 | 6 | alias PI = 3.141592653589793238462643383279502884197169399375105820974944592307816406286208998628034825342 7 | alias MEMORY_POOL_SIZE = 20 8 | alias EPSILON = 1e-8 9 | alias F32_MAX = inf[DType.float32]() 10 | alias PREFETCH_READ = PrefetchOptions().for_read().high_locality().to_data_cache() 11 | alias PREFETCH_WRITE = PrefetchOptions().for_write().high_locality().to_data_cache() 12 | alias NELTS = simdwidthof[DType.float32]() * 2 13 | 14 | alias UNARY_OP = fn (b: Node, a: Node) -> None 15 | alias BINARY_OP = fn (c: Node, a: Node, b: Node) -> None 16 | alias OP_TUPLE = Tuple[UNARY_OP, BINARY_OP] 17 | 18 | 19 | fn NU(b: Node, a: Node): 20 | ... 21 | 22 | 23 | fn NB(c: Node, a: Node, b: Node): 24 | ... 25 | -------------------------------------------------------------------------------- /voodoo/core/__init__.mojo: -------------------------------------------------------------------------------- 1 | from .tensor import Tensor 2 | from .initializers import ( 3 | Initializer, 4 | Constant, 5 | Zeros, 6 | Ones, 7 | GlorotNormal, 8 | GlorotUniform, 9 | HeNormal, 10 | HeUniform, 11 | LecunNormal, 12 | LecunUniform, 13 | RandomNormal, 14 | RandomUniform, 15 | TruncatedNormal, 16 | NoneInitializer, 17 | ) 18 | from .constraints import ( 19 | Constraint, 20 | MaxNorm, 21 | MinMaxNorm, 22 | NonNeg, 23 | RadialConstraint, 24 | UnitNorm, 25 | NoneConstraint, 26 | ) 27 | from .optimizers import Optimizer, SGD 28 | -------------------------------------------------------------------------------- /voodoo/core/constraints.mojo: -------------------------------------------------------------------------------- 1 | from algorithm import vectorize 2 | from math import sqrt, max, abs 3 | from tensor import TensorShape 4 | 5 | from voodoo.utils import Vector, reduce_vector_mul 6 | from voodoo.constants import NELTS, EPSILON 7 | 8 | 9 | trait Constraint(CollectionElement): 10 | @staticmethod 11 | fn constrain[shape: Vector[Int]](data: DTypePointer[DType.float32]) -> None: 12 | ... 13 | 14 | 15 | @register_passable("trivial") 16 | struct MaxNorm[max_value: Float32](Constraint): 17 | """ 18 | A constraint that enforces the maximum norm of the weights. 
19 | """ 20 | 21 | @staticmethod 22 | fn constrain[shape: Vector[Int]](data: DTypePointer[DType.float32]): 23 | var num_elements = reduce_vector_mul[shape]() 24 | var norms: Float32 = 0.0 25 | 26 | @parameter 27 | fn vec_norm[NELTS: Int](x: Int): 28 | norms += (data.simd_load[NELTS](x) ** 2).reduce_add() 29 | 30 | vectorize[vec_norm, NELTS](num_elements) 31 | norms = sqrt(norms) 32 | var scale = max_value / (norms + EPSILON) 33 | 34 | @parameter 35 | fn vec[NELTS: Int](x: Int): 36 | data.simd_store[NELTS](x, data.simd_load[NELTS](x) * scale) 37 | 38 | vectorize[vec, NELTS](num_elements) 39 | 40 | 41 | @register_passable("trivial") 42 | struct MinMaxNorm[min_value: Float32, max_value: Float32](Constraint): 43 | """ 44 | A constraint that enforces the minimum and maximum norm of the weights. 45 | """ 46 | 47 | @staticmethod 48 | fn constrain[shape: Vector[Int]](data: DTypePointer[DType.float32]): 49 | var num_elements = reduce_vector_mul[shape]() 50 | var norms: Float32 = 0.0 51 | 52 | @parameter 53 | fn vec_norm[NELTS: Int](x: Int): 54 | norms += (data.simd_load[NELTS](x) ** 2).reduce_add() 55 | 56 | vectorize[vec_norm, NELTS](num_elements) 57 | norms = sqrt(norms) 58 | var scaleMax = max_value / (norms + EPSILON) 59 | var scaleMin = min_value / (norms + EPSILON) 60 | 61 | @parameter 62 | fn vec[NELTS: Int](x: Int): 63 | var d = data.simd_load[NELTS](x) 64 | var norm = d * ( 65 | scaleMax if d > max_value else scaleMin if d < min_value else 1.0 66 | ) 67 | data.simd_store[NELTS](x, norm) 68 | 69 | vectorize[vec, NELTS](num_elements) 70 | 71 | 72 | @register_passable("trivial") 73 | struct NonNeg[](Constraint): 74 | """ 75 | A constraint that enforces non-negative weights. 76 | """ 77 | 78 | @staticmethod 79 | fn constrain[shape: Vector[Int]](data: DTypePointer[DType.float32]): 80 | var num_elements = reduce_vector_mul[shape]() 81 | 82 | @parameter 83 | fn vec[NELTS: Int](x: Int): 84 | data.simd_store[NELTS](x, abs(data.simd_load[NELTS](x))) 85 | 86 | vectorize[vec, NELTS](num_elements) 87 | 88 | 89 | @register_passable("trivial") 90 | struct RadialConstraint[](Constraint): 91 | """ 92 | A constraint that enforces the radial constraint on the weights. 93 | """ 94 | 95 | @staticmethod 96 | fn constrain[shape: Vector[Int]](data: DTypePointer[DType.float32]): 97 | var num_elements = reduce_vector_mul[shape]() 98 | var center = shape[0] // 2 99 | 100 | @parameter 101 | fn vec[NELTS: Int](x: Int): 102 | var i = x // shape[1] 103 | var j = x % shape[1] 104 | var d = sqrt((i - center) ** 2 + (j - center) ** 2) 105 | data.simd_store[NELTS]( 106 | x, data.simd_load[NELTS](x) * (1.0 if d <= center else 0.0) 107 | ) 108 | 109 | vectorize[vec, NELTS](num_elements) 110 | 111 | 112 | @register_passable("trivial") 113 | struct UnitNorm[](Constraint): 114 | """ 115 | A constraint that enforces the unit norm of the weights. 116 | """ 117 | 118 | @staticmethod 119 | fn constrain[shape: Vector[Int]](data: DTypePointer[DType.float32]): 120 | var num_elements = reduce_vector_mul[shape]() 121 | 122 | @parameter 123 | fn vec[NELTS: Int](x: Int): 124 | var norm = sqrt((data.simd_load[NELTS](x) ** 2).reduce_add()) 125 | data.simd_store[NELTS](x, data.simd_load[NELTS](x) / (norm + EPSILON)) 126 | 127 | vectorize[vec, NELTS](num_elements) 128 | 129 | 130 | @register_passable("trivial") 131 | struct NoneConstraint[](Constraint): 132 | """ 133 | An constraint that does nothing. 134 | """ 135 | 136 | @staticmethod 137 | fn constrain[shape: Vector[Int]](data: DTypePointer[DType.float32]): 138 | ... 
139 | -------------------------------------------------------------------------------- /voodoo/core/initializers.mojo: -------------------------------------------------------------------------------- 1 | from algorithm import vectorize 2 | from random import ( 3 | seed, 4 | random_float64, 5 | randn_float64, 6 | randn, 7 | ) 8 | 9 | from voodoo.utils import Vector, reduce_vector_mul 10 | from voodoo.constants import NELTS 11 | 12 | 13 | trait Initializer(CollectionElement): 14 | @staticmethod 15 | fn initialize[shape: Vector[Int]](data: DTypePointer[DType.float32]): 16 | ... 17 | 18 | 19 | @register_passable("trivial") 20 | struct Constant[value: Float64](Initializer): 21 | """ 22 | An initializer that fills a Tensor with a constant value. 23 | """ 24 | 25 | @staticmethod 26 | fn initialize[shape: Vector[Int]](data: DTypePointer[DType.float32]): 27 | seed() 28 | 29 | @parameter 30 | fn vec[NELTS: Int](x: Int): 31 | data.simd_store[NELTS](x, value.to_int()) 32 | 33 | vectorize[vec, NELTS](reduce_vector_mul[shape]()) 34 | 35 | 36 | @register_passable("trivial") 37 | struct Zeros[](Initializer): 38 | """ 39 | An initializer that fills a Tensor with zeros. 40 | """ 41 | 42 | @staticmethod 43 | fn initialize[shape: Vector[Int]](data: DTypePointer[DType.float32]): 44 | seed() 45 | 46 | @parameter 47 | fn vec[NELTS: Int](x: Int): 48 | data.simd_store[NELTS](x, 0.0) 49 | 50 | vectorize[vec, NELTS](reduce_vector_mul[shape]()) 51 | 52 | 53 | @register_passable("trivial") 54 | struct Ones[](Initializer): 55 | """ 56 | An initializer that fills a Tensor with ones. 57 | """ 58 | 59 | @staticmethod 60 | fn initialize[shape: Vector[Int]](data: DTypePointer[DType.float32]): 61 | seed() 62 | 63 | @parameter 64 | fn vec[NELTS: Int](x: Int): 65 | data.simd_store[NELTS](x, 1.0) 66 | 67 | vectorize[vec, NELTS](reduce_vector_mul[shape]()) 68 | 69 | 70 | @register_passable("trivial") 71 | struct GlorotNormal[input_units: Float64, output_units: Float64]( 72 | CollectionElement, Initializer 73 | ): 74 | """ 75 | An initializer that fills a Tensor with values from a Glorot normal distribution, also known as Xavier normal distribution. 76 | """ 77 | 78 | @staticmethod 79 | fn initialize[shape: Vector[Int]](data: DTypePointer[DType.float32]): 80 | seed() 81 | randn( 82 | data, 83 | reduce_vector_mul[shape](), 84 | 0.0, 85 | (2.0 / (input_units + output_units)) ** 0.5, 86 | ) 87 | 88 | 89 | @register_passable("trivial") 90 | struct GlorotUniform[input_units: Float64, output_units: Float64]( 91 | CollectionElement, Initializer 92 | ): 93 | """ 94 | An initializer that fills a Tensor with values from a Glorot uniform distribution, also known as Xavier uniform distribution. 95 | """ 96 | 97 | @staticmethod 98 | fn initialize[shape: Vector[Int]](data: DTypePointer[DType.float32]): 99 | seed() 100 | var limit = (6.0 / (input_units + output_units)) ** 0.5 101 | 102 | @parameter 103 | fn vec[NELTS: Int](x: Int): 104 | data.simd_store[NELTS]( 105 | x, random_float64(-limit, limit).cast[DType.float32]() 106 | ) 107 | 108 | vectorize[vec, NELTS](reduce_vector_mul[shape]()) 109 | 110 | 111 | @register_passable("trivial") 112 | struct HeNormal[input_units: Float64](Initializer): 113 | """ 114 | An initializer that fills a Tensor with values from a He normal distribution. 
115 | """ 116 | 117 | @staticmethod 118 | fn initialize[shape: Vector[Int]](data: DTypePointer[DType.float32]): 119 | seed() 120 | randn(data, reduce_vector_mul[shape](), 0.0, (2.0 / input_units) ** 0.5) 121 | 122 | 123 | @register_passable("trivial") 124 | struct HeUniform[input_units: Float64](Initializer): 125 | """ 126 | An initializer that fills a Tensor with values from a He uniform distribution. 127 | """ 128 | 129 | @staticmethod 130 | fn initialize[shape: Vector[Int]](data: DTypePointer[DType.float32]): 131 | seed() 132 | var limit = (6.0 / input_units) ** 0.5 133 | 134 | @parameter 135 | fn vec[NELTS: Int](x: Int): 136 | data.simd_store[NELTS]( 137 | x, random_float64(-limit, limit).cast[DType.float32]() 138 | ) 139 | 140 | vectorize[vec, NELTS](reduce_vector_mul[shape]()) 141 | 142 | 143 | @register_passable("trivial") 144 | struct Identity[](Initializer): 145 | """ 146 | An initializer that fills a Tensor with the identity matrix. Must be a 2D tensor. 147 | """ 148 | 149 | @staticmethod 150 | fn initialize[shape: Vector[Int]](data: DTypePointer[DType.float32]): 151 | seed() 152 | var n = shape[0] 153 | var m = shape[1] 154 | 155 | @parameter 156 | fn vec[NELTS: Int](x: Int): 157 | var i = x / m 158 | var j = x % m 159 | data.simd_store[NELTS](x, 1.0 if i == j else 0.0) 160 | 161 | vectorize[vec, NELTS](n * m) 162 | 163 | 164 | @register_passable("trivial") 165 | struct LecunNormal[input_units: Float64](Initializer): 166 | """ 167 | An initializer that fills a Tensor with values from a Lecun normal distribution. 168 | """ 169 | 170 | @staticmethod 171 | fn initialize[shape: Vector[Int]](data: DTypePointer[DType.float32]): 172 | seed() 173 | randn(data, reduce_vector_mul[shape](), 0.0, (1.0 / input_units) ** 0.5) 174 | 175 | 176 | @register_passable("trivial") 177 | struct LecunUniform[input_units: Float64](Initializer): 178 | """ 179 | An initializer that fills a Tensor with values from a Lecun uniform distribution. 180 | """ 181 | 182 | @staticmethod 183 | fn initialize[shape: Vector[Int]](data: DTypePointer[DType.float32]): 184 | seed() 185 | var limit = (3.0 / input_units) ** 0.5 186 | 187 | @parameter 188 | fn vec[NELTS: Int](x: Int): 189 | data.simd_store[NELTS]( 190 | x, random_float64(-limit, limit).cast[DType.float32]() 191 | ) 192 | 193 | vectorize[vec, NELTS](reduce_vector_mul[shape]()) 194 | 195 | 196 | @register_passable("trivial") 197 | struct RandomNormal[mean: Float64, std: Float64](Initializer): 198 | """ 199 | An initializer that fills a Tensor with values from a normal distribution. 200 | """ 201 | 202 | @staticmethod 203 | fn initialize[shape: Vector[Int]](data: DTypePointer[DType.float32]): 204 | seed() 205 | randn(data, reduce_vector_mul[shape](), mean, std) 206 | 207 | 208 | @register_passable("trivial") 209 | struct RandomUniform[low: Float64, high: Float64](Initializer): 210 | """ 211 | An initializer that fills a Tensor with values from a uniform distribution. 212 | """ 213 | 214 | @staticmethod 215 | fn initialize[shape: Vector[Int]](data: DTypePointer[DType.float32]): 216 | seed() 217 | 218 | @parameter 219 | fn vec[NELTS: Int](x: Int): 220 | data.simd_store[NELTS](x, random_float64(low, high).cast[DType.float32]()) 221 | 222 | vectorize[vec, NELTS](reduce_vector_mul[shape]()) 223 | 224 | 225 | @register_passable("trivial") 226 | struct TruncatedNormal[mean: Float64, std: Float64](Initializer): 227 | """ 228 | An initializer that fills a Tensor with values from a truncated normal distribution. 
229 | """ 230 | 231 | @staticmethod 232 | fn initialize[shape: Vector[Int]](data: DTypePointer[DType.float32]): 233 | seed() 234 | var low = mean - 2.0 * std 235 | var high = mean + 2.0 * std 236 | 237 | @parameter 238 | fn vec[NELTS: Int](x: Int): 239 | var value = randn_float64(mean, std) 240 | while value < low or value > high: 241 | value = randn_float64(mean, std) 242 | data.simd_store[NELTS](x, value.cast[DType.float32]()) 243 | 244 | vectorize[vec, NELTS](reduce_vector_mul[shape]()) 245 | 246 | 247 | @register_passable("trivial") 248 | struct NoneInitializer[](Initializer): 249 | """ 250 | An initializer that does nothing. 251 | """ 252 | 253 | @staticmethod 254 | fn initialize[shape: Vector[Int]](data: DTypePointer[DType.float32]): 255 | ... 256 | -------------------------------------------------------------------------------- /voodoo/core/layers/__init__.mojo: -------------------------------------------------------------------------------- 1 | from .activation import Activation 2 | from .conv1D import Conv1D 3 | from .conv2D import Conv2D 4 | from .dense import Dense 5 | from .dropout import Dropout 6 | from .flatten import Flatten 7 | from .leakyRelu import LeakyReLu 8 | from .maxPool1D import MaxPool1D 9 | from .maxPool2D import MaxPool2D 10 | from .reshape import Reshape 11 | -------------------------------------------------------------------------------- /voodoo/core/layers/activation.mojo: -------------------------------------------------------------------------------- 1 | from voodoo.core import Tensor, NoneInitializer, NoneConstraint 2 | from voodoo.utils import get_activation_code 3 | 4 | 5 | struct Activation[ 6 | activation: String, 7 | ](): 8 | fn __init__( 9 | inout self, 10 | ) raises: 11 | ... 12 | 13 | fn forward( 14 | self, x: Tensor 15 | ) raises -> Tensor[x.shape, NoneInitializer, NoneConstraint, False, False]: 16 | var res = x.compute_activation[get_activation_code[activation]()]() 17 | return res 18 | -------------------------------------------------------------------------------- /voodoo/core/layers/conv1D.mojo: -------------------------------------------------------------------------------- 1 | from tensor import TensorShape 2 | 3 | from voodoo.core import Tensor, Initializer, Constraint, NoneInitializer, NoneConstraint 4 | from voodoo.utils import get_activation_code 5 | 6 | 7 | struct Conv1D[ 8 | in_channels: Int, 9 | kernel_width: Int, 10 | stride: Int, 11 | padding: Int, 12 | use_bias: Bool = False, 13 | weight_initializer: Initializer = NoneInitializer, 14 | weight_constraint: Constraint = NoneConstraint, 15 | bias_initializer: Initializer = NoneInitializer, 16 | bias_constraint: Constraint = NoneConstraint, 17 | ](): 18 | var W: Tensor[ 19 | TensorShape(in_channels, kernel_width), weight_initializer, weight_constraint 20 | ] 21 | var bias: Tensor[TensorShape(in_channels, 1, 1), bias_initializer, bias_constraint] 22 | 23 | fn __init__( 24 | inout self, 25 | ) raises: 26 | self.W = Tensor[ 27 | TensorShape(in_channels, kernel_width), 28 | weight_initializer, 29 | weight_constraint, 30 | ]().requires_grad() 31 | 32 | @parameter 33 | if self.use_bias: 34 | self.bias = Tensor[ 35 | TensorShape(in_channels, 1, 1), 36 | bias_initializer, 37 | bias_constraint, 38 | ]().requires_grad() 39 | else: 40 | self.bias = Tensor[ 41 | TensorShape(in_channels, 1, 1), bias_initializer, bias_constraint 42 | ]() 43 | 44 | fn forward( 45 | self, x: Tensor 46 | ) raises -> Tensor[ 47 | TensorShape( 48 | x.shape[0], 49 | x.shape[1], 50 | (x.shape[2] - kernel_width + 2 * 
padding) // stride + 1, 51 | ), 52 | NoneInitializer, 53 | NoneConstraint, 54 | False, 55 | False, 56 | ]: 57 | var res = x.conv_1d[ 58 | TensorShape( 59 | x.shape[0], 60 | x.shape[1], 61 | (x.shape[2] - kernel_width + 2 * padding) // stride + 1, 62 | ) 63 | ](self.W, self.padding, self.stride) 64 | 65 | @parameter 66 | if self.use_bias: 67 | return res + self.bias 68 | 69 | return res 70 | -------------------------------------------------------------------------------- /voodoo/core/layers/conv2D.mojo: -------------------------------------------------------------------------------- 1 | from tensor import TensorShape 2 | 3 | from voodoo.core import Tensor, Initializer, Constraint, NoneInitializer, NoneConstraint 4 | from voodoo.utils import get_activation_code 5 | 6 | 7 | struct Conv2D[ 8 | in_channels: Int, 9 | kernel_width: Int, 10 | kernel_height: Int, 11 | stride: Int, 12 | padding: Int, 13 | activation: String = "none", 14 | use_bias: Bool = False, 15 | weight_initializer: Initializer = NoneInitializer, 16 | weight_constraint: Constraint = NoneConstraint, 17 | bias_initializer: Initializer = NoneInitializer, 18 | bias_constraint: Constraint = NoneConstraint, 19 | ](): 20 | var W: Tensor[ 21 | TensorShape(in_channels, kernel_width, kernel_height), 22 | weight_initializer, 23 | weight_constraint, 24 | ] 25 | var bias: Tensor[TensorShape(in_channels, 1, 1), bias_initializer, bias_constraint] 26 | 27 | fn __init__( 28 | inout self, 29 | ) raises: 30 | self.W = Tensor[ 31 | TensorShape(in_channels, kernel_width, kernel_height), 32 | weight_initializer, 33 | weight_constraint, 34 | ]().requires_grad() 35 | 36 | @parameter 37 | if self.use_bias: 38 | self.bias = Tensor[ 39 | TensorShape(in_channels, 1, 1), bias_initializer, bias_constraint 40 | ]().requires_grad() 41 | else: 42 | self.bias = Tensor[ 43 | TensorShape(in_channels, 1, 1), bias_initializer, bias_constraint 44 | ]() 45 | 46 | fn forward( 47 | self, x: Tensor 48 | ) raises -> Tensor[ 49 | TensorShape( 50 | x.shape[0], 51 | x.shape[1], 52 | (x.shape[2] - kernel_width + 2 * self.padding) // self.stride + 1, 53 | (x.shape[3] - kernel_height + 2 * self.padding) // self.stride + 1, 54 | ), 55 | NoneInitializer, 56 | NoneConstraint, 57 | False, 58 | False, 59 | ]: 60 | var res = x.conv_2d[ 61 | TensorShape( 62 | x.shape[0], 63 | x.shape[1], 64 | (x.shape[2] - kernel_width + 2 * self.padding) // self.stride + 1, 65 | (x.shape[3] - kernel_height + 2 * self.padding) // self.stride + 1, 66 | ) 67 | ](self.W, self.padding, self.stride).compute_activation[self.activation]() 68 | 69 | @parameter 70 | if self.use_bias: 71 | return res + self.bias 72 | 73 | return res 74 | -------------------------------------------------------------------------------- /voodoo/core/layers/dense.mojo: -------------------------------------------------------------------------------- 1 | from tensor import TensorShape 2 | 3 | from voodoo.core import Tensor, Initializer, Constraint, NoneInitializer, NoneConstraint 4 | from voodoo.utils import get_activation_code 5 | 6 | 7 | struct Dense[ 8 | in_neurons: Int, 9 | out_neurons: Int, 10 | activation: String = "none", 11 | use_bias: Bool = True, 12 | weight_initializer: Initializer = NoneInitializer, 13 | weight_constraint: Constraint = NoneConstraint, 14 | bias_initializer: Initializer = NoneInitializer, 15 | bias_constraint: Constraint = NoneConstraint, 16 | ](): 17 | var W: Tensor[ 18 | TensorShape(in_neurons, out_neurons), weight_initializer, weight_constraint 19 | ] 20 | var bias: Tensor[TensorShape(out_neurons), 
bias_initializer, bias_constraint] 21 | 22 | fn __init__( 23 | inout self, 24 | ) raises: 25 | self.W = Tensor[ 26 | TensorShape(in_neurons, out_neurons), weight_initializer, weight_constraint 27 | ]().requires_grad() 28 | 29 | @parameter 30 | if self.use_bias: 31 | self.bias = Tensor[ 32 | TensorShape(out_neurons), bias_initializer, bias_constraint 33 | ]().requires_grad() 34 | else: 35 | self.bias = Tensor[ 36 | TensorShape(out_neurons), bias_initializer, bias_constraint 37 | ]() 38 | 39 | fn forward( 40 | self, x: Tensor 41 | ) raises -> Tensor[x.shape, NoneInitializer, NoneConstraint, False, False]: 42 | var computed = x @ self.W 43 | 44 | @parameter 45 | if self.use_bias: 46 | computed = computed + self.bias 47 | 48 | @parameter 49 | if self.activation != "none": 50 | return computed.compute_activation[get_activation_code[activation]()]() 51 | 52 | return computed 53 | -------------------------------------------------------------------------------- /voodoo/core/layers/dropout.mojo: -------------------------------------------------------------------------------- 1 | from tensor import TensorShape 2 | 3 | from voodoo.core import Tensor, NoneInitializer, NoneConstraint 4 | 5 | 6 | struct Dropout[ 7 | dropout_rate: Float32 = 0.5, noise_shape: TensorShape = TensorShape(0) 8 | ](): 9 | fn __init__( 10 | inout self, 11 | ) raises: 12 | ... 13 | 14 | fn forward( 15 | self, x: Tensor 16 | ) raises -> Tensor[x.shape, NoneInitializer, NoneConstraint, False, False]: 17 | var res = x.dropout[dropout_rate, noise_shape]() 18 | return res 19 | -------------------------------------------------------------------------------- /voodoo/core/layers/flatten.mojo: -------------------------------------------------------------------------------- 1 | from tensor import TensorShape 2 | 3 | from voodoo.core import Tensor, NoneInitializer, NoneConstraint 4 | 5 | 6 | struct Flatten[](): 7 | fn __init__( 8 | inout self, 9 | ) raises: 10 | ... 
11 | 12 | fn forward( 13 | self, x: Tensor 14 | ) raises -> Tensor[ 15 | TensorShape(x.shape[0], x.shape.num_elements() // x.shape[0]), 16 | NoneInitializer, 17 | NoneConstraint, 18 | False, 19 | False, 20 | ]: 21 | var res = x.flatten() 22 | return res 23 | -------------------------------------------------------------------------------- /voodoo/core/layers/leakyRelu.mojo: -------------------------------------------------------------------------------- 1 | from tensor import TensorShape 2 | 3 | from voodoo.core import Tensor, Initializer, Constraint, NoneInitializer, NoneConstraint 4 | from voodoo.utils import get_activation_code, lrelu_code 5 | 6 | 7 | struct LeakyReLu[ 8 | in_neurons: Int, 9 | out_neurons: Int, 10 | use_bias: Bool = True, 11 | weight_initializer: Initializer = NoneInitializer, 12 | weight_constraint: Constraint = NoneConstraint, 13 | bias_initializer: Initializer = NoneInitializer, 14 | bias_constraint: Constraint = NoneConstraint, 15 | alpha: Float32 = 0.2, 16 | ](): 17 | var W: Tensor[ 18 | TensorShape(in_neurons, out_neurons), weight_initializer, weight_constraint 19 | ] 20 | var bias: Tensor[TensorShape(out_neurons), bias_initializer, bias_constraint] 21 | 22 | fn __init__( 23 | inout self, 24 | ) raises: 25 | self.W = Tensor[ 26 | TensorShape(in_neurons, out_neurons), weight_initializer, weight_constraint 27 | ]().requires_grad() 28 | 29 | @parameter 30 | if self.use_bias: 31 | self.bias = Tensor[ 32 | TensorShape(out_neurons), bias_initializer, bias_constraint 33 | ]().requires_grad() 34 | else: 35 | self.bias = Tensor[ 36 | TensorShape(out_neurons), bias_initializer, bias_constraint 37 | ]() 38 | 39 | fn forward( 40 | self, x: Tensor 41 | ) raises -> Tensor[x.shape, NoneInitializer, NoneConstraint, False, False]: 42 | var computed = x @ self.W 43 | 44 | @parameter 45 | if self.use_bias: 46 | computed = computed + self.bias 47 | 48 | return computed.compute_activation[operator_id=lrelu_code, arg1=alpha]() 49 | -------------------------------------------------------------------------------- /voodoo/core/layers/maxPool1D.mojo: -------------------------------------------------------------------------------- 1 | from tensor import TensorShape 2 | 3 | from voodoo.core import Tensor, NoneInitializer, NoneConstraint 4 | 5 | 6 | struct MaxPool1D[ 7 | kernel_width: Int, 8 | stride: Int = 1, 9 | padding: Int = 0, 10 | ](): 11 | fn __init__( 12 | inout self, 13 | ) raises: 14 | ... 15 | 16 | fn forward( 17 | self, x: Tensor 18 | ) raises -> Tensor[ 19 | TensorShape( 20 | x.shape[0], 21 | (x.shape[1] - kernel_width + 2 * padding) // stride + 1, 22 | x.shape[2], 23 | ), 24 | NoneInitializer, 25 | NoneConstraint, 26 | False, 27 | False, 28 | ]: 29 | var res = x.maxpool_1d[ 30 | TensorShape( 31 | x.shape[0], 32 | (x.shape[1] - kernel_width + 2 * padding) // stride + 1, 33 | x.shape[2], 34 | ) 35 | ]( 36 | kernel_width, 37 | stride, 38 | padding, 39 | ) 40 | return res 41 | -------------------------------------------------------------------------------- /voodoo/core/layers/maxPool2D.mojo: -------------------------------------------------------------------------------- 1 | from tensor import TensorShape 2 | 3 | from voodoo.core import Tensor, NoneInitializer, NoneConstraint 4 | 5 | 6 | struct MaxPool2D[ 7 | kernel_width: Int, 8 | kernel_height: Int, 9 | stride: Int = 1, 10 | padding: Int = 0, 11 | ](): 12 | fn __init__( 13 | inout self, 14 | ) raises: 15 | ... 
16 | 17 | fn forward( 18 | self, x: Tensor 19 | ) raises -> Tensor[ 20 | TensorShape( 21 | x.shape[0], 22 | x.shape[1], 23 | (x.shape[2] - kernel_width + 2 * padding) // stride + 1, 24 | (x.shape[3] - kernel_height + 2 * padding) // stride + 1, 25 | ), 26 | NoneInitializer, 27 | NoneConstraint, 28 | False, 29 | False, 30 | ]: 31 | var res = x.maxpool_2d[ 32 | TensorShape( 33 | x.shape[0], 34 | x.shape[1], 35 | (x.shape[2] - kernel_width + 2 * padding) // stride + 1, 36 | (x.shape[3] - kernel_height + 2 * padding) // stride + 1, 37 | ) 38 | ]( 39 | StaticIntTuple[2](kernel_width, kernel_height), 40 | stride, 41 | padding, 42 | ) 43 | return res 44 | -------------------------------------------------------------------------------- /voodoo/core/layers/reshape.mojo: -------------------------------------------------------------------------------- 1 | from tensor import TensorShape 2 | 3 | from voodoo.core import Tensor, NoneInitializer, NoneConstraint 4 | 5 | 6 | struct Reshape[new_shape: TensorShape](): 7 | fn __init__(inout self) raises: 8 | ... 9 | 10 | fn forward( 11 | self, x: Tensor 12 | ) raises -> Tensor[new_shape, NoneInitializer, NoneConstraint, False, False]: 13 | var res = x.reshape[new_shape]() 14 | return res 15 | -------------------------------------------------------------------------------- /voodoo/core/optimizers.mojo: -------------------------------------------------------------------------------- 1 | from algorithm import vectorize 2 | 3 | from voodoo.constants import NELTS, PREFETCH_READ, PREFETCH_WRITE 4 | from voodoo.autograd import Node 5 | from voodoo.utils import Vector 6 | 7 | 8 | trait Optimizer(CollectionElement): 9 | @staticmethod 10 | fn step(x: Vector[Node]): 11 | ... 12 | 13 | @staticmethod 14 | fn key() -> String: 15 | ... 
16 | 17 | 18 | @register_passable("trivial") 19 | struct SGD[learning_rate: Float32](Optimizer): 20 | @staticmethod 21 | fn step(x: Vector[Node]): 22 | for i in range(len(x)): 23 | var node = x[i] 24 | var node_data = node.get_data() 25 | var node_grad = node.get_grad() 26 | if node.get_is_static() and node.get_grad_id() != 0: 27 | DTypePointer.prefetch[PREFETCH_READ](node_data) 28 | DTypePointer.prefetch[PREFETCH_READ](node_grad) 29 | DTypePointer.prefetch[PREFETCH_WRITE](node_data) 30 | 31 | @parameter 32 | fn vectorized_update[NELTS: Int](i: Int): 33 | node_data.simd_store[NELTS]( 34 | i, 35 | node_data.simd_load[NELTS](i) 36 | - (node_grad.simd_load[NELTS](i) * learning_rate), 37 | ) 38 | 39 | vectorize[vectorized_update, NELTS](node.get_cap()) 40 | 41 | @staticmethod 42 | fn key() -> String: 43 | return "SGD" 44 | -------------------------------------------------------------------------------- /voodoo/core/tensor.mojo: -------------------------------------------------------------------------------- 1 | from tensor import TensorShape 2 | from collections import Optional 3 | 4 | from voodoo.autograd import Node, Graph 5 | from voodoo.utils import ( 6 | Vector, 7 | get_activation_code, 8 | get_loss_code, 9 | add_code, 10 | sub_code, 11 | mul_code, 12 | div_code, 13 | pow_code, 14 | ) 15 | from voodoo.core import ( 16 | Initializer, 17 | NoneInitializer, 18 | Constraint, 19 | NoneConstraint, 20 | Optimizer, 21 | ) 22 | 23 | 24 | struct Tensor[ 25 | shape: TensorShape, 26 | initializer: Initializer = NoneInitializer, 27 | constraint: Constraint = NoneConstraint, 28 | is_static: Bool = True, 29 | is_single: Bool = False, 30 | ]: 31 | var graph: Graph 32 | var node: Node 33 | 34 | fn __init__( 35 | inout self, 36 | ) raises: 37 | self.graph = Graph() 38 | self.node = self.graph.node[False, is_static, is_single, -1]( 39 | shape, Vector[Int]() 40 | ) 41 | self.refresh() 42 | 43 | fn __copyinit__(inout self, other: Self): 44 | self.graph = other.graph 45 | self.node = other.node 46 | 47 | fn load_tensor_for_binary_op[ 48 | new_shape: TensorShape = shape 49 | ](self, other: Tensor) raises -> Tensor[ 50 | new_shape, NoneInitializer, NoneConstraint, False, False 51 | ]: 52 | var self_static_or_single = self.node.get_is_static() or self.node.get_is_single() 53 | var other_static_or_single = other.node.get_is_static() or other.node.get_is_single() 54 | var first_greater = len(self.graph._nodes) < len(other.graph._nodes) 55 | var remove_other = not (self_static_or_single or other_static_or_single) 56 | 57 | var new_tensor = Tensor[ 58 | new_shape, NoneInitializer, NoneConstraint, False, False 59 | ]() 60 | 61 | if self_static_or_single or (not other_static_or_single and first_greater): 62 | new_tensor.graph = other.graph 63 | new_tensor.graph.fuse_graphs(self.graph, remove_other) 64 | else: 65 | new_tensor.graph = self.graph 66 | new_tensor.graph.fuse_graphs(other.graph, remove_other) 67 | 68 | return new_tensor 69 | 70 | fn load_tensor_for_unary_op[ 71 | new_shape: TensorShape = shape 72 | ](self) raises -> Tensor[new_shape, NoneInitializer, NoneConstraint, False, False]: 73 | if self.node.get_is_static() or self.node.get_is_single(): 74 | var new_tensor = Tensor[ 75 | new_shape, NoneInitializer, NoneConstraint, False, False 76 | ]() 77 | new_tensor.graph.fuse_graphs(self.graph) 78 | return new_tensor 79 | else: 80 | var new_tensor = Tensor[ 81 | new_shape, NoneInitializer, NoneConstraint, False, False 82 | ]() 83 | new_tensor.graph = self.graph 84 | return new_tensor 85 | 86 | fn print(inout self, 
accuracy: Int = 6) raises: 87 | if not self.node.get_computed(): 88 | _ = self.forward() 89 | self.node.print(accuracy) 90 | 91 | fn refresh(self) raises: 92 | initializer.initialize[shape](self.node.get_data()) 93 | constraint.constrain[shape](self.node.get_data()) 94 | 95 | fn fill(owned self, val: Float32) -> Self: 96 | self.node.fill(val) 97 | return self ^ 98 | 99 | fn fill_incr(owned self) raises -> Self: 100 | self.node.fill_incr() 101 | return self ^ 102 | 103 | fn grad_fill_incr(owned self) raises -> Self: 104 | self.node.grad_fill_incr() 105 | return self ^ 106 | 107 | fn requires_grad(owned self) raises -> Self: 108 | self.node.set_is_static(True) 109 | return self ^ 110 | 111 | fn static(owned self) raises -> Self: 112 | _ = self.forward() 113 | self.node.set_is_static(True) 114 | return self ^ 115 | 116 | fn store(self, idx: Int, val: Float32): 117 | self.node.get_data().store(idx, val) 118 | 119 | fn free(self) raises: 120 | self.graph.free() 121 | self.node.free() 122 | 123 | fn forward(inout self) raises -> Self: 124 | _ = self.graph.forward(self.node) 125 | return self 126 | 127 | fn forward_static(inout self) raises -> Self: 128 | _ = self.graph.forward_static(self.node) 129 | return self 130 | 131 | fn backward(inout self) raises: 132 | if not self.node.get_computed(): 133 | _ = self.forward() 134 | self.graph.backward(self.node) 135 | 136 | fn optimize[optimizer: Optimizer](self) raises: 137 | self.graph.optimizer_step[optimizer]() 138 | 139 | fn __getitem__(self, idx: Int) raises -> Float32: 140 | return self.node.get_data()[idx] 141 | 142 | fn __setitem__(self, idx: Int, val: Float32) raises: 143 | self.node.get_data().store(idx, val) 144 | 145 | fn copy( 146 | self, 147 | ) raises -> Tensor[shape, NoneInitializer, NoneConstraint, False, False]: 148 | var new_tensor = self.load_tensor_for_unary_op() 149 | new_tensor.node = new_tensor.graph.copy(self.node) 150 | return new_tensor 151 | 152 | fn dropout[ 153 | dropout_rate: Float32, noise_shape: TensorShape 154 | ](self) raises -> Tensor[shape, NoneInitializer, NoneConstraint, False, False]: 155 | var new_tensor = self.load_tensor_for_unary_op() 156 | new_tensor.node = new_tensor.graph.dropout(self.node, dropout_rate, noise_shape) 157 | return new_tensor 158 | 159 | fn _magic_arithmetic_generic[ 160 | operation_code: Int 161 | ](self, other: Tensor) raises -> Tensor[ 162 | shape, NoneInitializer, NoneConstraint, False, False 163 | ]: 164 | var new_tensor = self.load_tensor_for_binary_op(other) 165 | new_tensor.node = new_tensor.graph.arithmetic_general[operation_code]( 166 | self.node, other.node 167 | ) 168 | return new_tensor 169 | 170 | fn __eq__(self, other: Tensor) raises -> Bool: 171 | var new_tensor = self.load_tensor_for_binary_op(other) 172 | new_tensor.node = new_tensor.graph.arithmetic_general[add_code]( 173 | self.node, other.node 174 | ) 175 | return new_tensor.node.is_zero() 176 | 177 | fn __add__( 178 | self, other: Tensor 179 | ) raises -> Tensor[shape, NoneInitializer, NoneConstraint, False, False]: 180 | return self._magic_arithmetic_generic[add_code](other) 181 | 182 | fn __sub__( 183 | self, other: Tensor 184 | ) raises -> Tensor[shape, NoneInitializer, NoneConstraint, False, False]: 185 | return self._magic_arithmetic_generic[sub_code](other) 186 | 187 | fn __mul__( 188 | self, other: Tensor 189 | ) raises -> Tensor[shape, NoneInitializer, NoneConstraint, False, False]: 190 | return self._magic_arithmetic_generic[mul_code](other) 191 | 192 | fn __truediv__( 193 | self, other: Tensor 194 | ) raises 
-> Tensor[shape, NoneInitializer, NoneConstraint, False, False]: 195 | return self._magic_arithmetic_generic[div_code](other) 196 | 197 | fn __pow__( 198 | self, other: Tensor 199 | ) raises -> Tensor[shape, NoneInitializer, NoneConstraint, False, False]: 200 | return self._magic_arithmetic_generic[pow_code](other) 201 | 202 | fn __matmul__( 203 | self, other: Tensor 204 | ) raises -> Tensor[shape, NoneInitializer, NoneConstraint, False, False]: 205 | var new_tensor = self.load_tensor_for_binary_op(other) 206 | new_tensor.node = new_tensor.graph.mmul(self.node, other.node) 207 | return new_tensor 208 | 209 | fn __radd__( 210 | self, other: Tensor 211 | ) raises -> Tensor[shape, NoneInitializer, NoneConstraint, False, False]: 212 | return self.__add__(other) 213 | 214 | fn __rsub__( 215 | self, other: Tensor 216 | ) raises -> Tensor[shape, NoneInitializer, NoneConstraint, False, False]: 217 | return self.__sub__(other) 218 | 219 | fn __rmul__( 220 | self, other: Tensor 221 | ) raises -> Tensor[shape, NoneInitializer, NoneConstraint, False, False]: 222 | return self.__mul__(other) 223 | 224 | fn __rtruediv__( 225 | self, other: Tensor 226 | ) raises -> Tensor[shape, NoneInitializer, NoneConstraint, False, False]: 227 | return self.__truediv__(other) 228 | 229 | fn __rpow__( 230 | self, other: Tensor 231 | ) raises -> Tensor[shape, NoneInitializer, NoneConstraint, False, False]: 232 | return self.__pow__(other) 233 | 234 | fn __rmatmul__( 235 | self, other: Tensor 236 | ) raises -> Tensor[shape, NoneInitializer, NoneConstraint, False, False]: 237 | return self.__matmul__(other) 238 | 239 | fn __iadd__(inout self, other: Tensor) raises: 240 | self.node = self.__add__(other).node 241 | 242 | fn __isub__(inout self, other: Tensor) raises: 243 | self.node = self.__sub__(other).node 244 | 245 | fn __imul__(inout self, other: Tensor) raises: 246 | self.node = self.__mul__(other).node 247 | 248 | fn __itruediv__(inout self, other: Tensor) raises: 249 | self.node = self.__truediv__(other).node 250 | 251 | fn __ipow__(inout self, other: Tensor) raises: 252 | self.node = self.__pow__(other).node 253 | 254 | fn __imatmul__(inout self, other: Tensor) raises: 255 | self.node = self.__matmul__(other).node 256 | 257 | fn _prep_scalar_tensor( 258 | self, number: Float32 259 | ) raises -> Tensor[shape, NoneInitializer, NoneConstraint, False, True]: 260 | var new_tensor = Tensor[ 261 | shape, NoneInitializer, NoneConstraint, False, True 262 | ]().fill(number) 263 | new_tensor.node.set_computed(True) 264 | return new_tensor 265 | 266 | fn __add__( 267 | self, number: Float32 268 | ) raises -> Tensor[shape, NoneInitializer, NoneConstraint, False, False]: 269 | return self.__add__(self._prep_scalar_tensor(number)) 270 | 271 | fn __sub__( 272 | self, number: Float32 273 | ) raises -> Tensor[shape, NoneInitializer, NoneConstraint, False, False]: 274 | return self.__sub__(self._prep_scalar_tensor(number)) 275 | 276 | fn __mul__( 277 | self, number: Float32 278 | ) raises -> Tensor[shape, NoneInitializer, NoneConstraint, False, False]: 279 | return self.__mul__(self._prep_scalar_tensor(number)) 280 | 281 | fn __truediv__( 282 | self, number: Float32 283 | ) raises -> Tensor[shape, NoneInitializer, NoneConstraint, False, False]: 284 | return self.__truediv__(self._prep_scalar_tensor(number)) 285 | 286 | fn __pow__( 287 | self, number: Float32 288 | ) raises -> Tensor[shape, NoneInitializer, NoneConstraint, False, False]: 289 | return self.__pow__(self._prep_scalar_tensor(number)) 290 | 291 | fn __radd__( 292 | self, 
number: Float32 293 | ) raises -> Tensor[shape, NoneInitializer, NoneConstraint, False, False]: 294 | return self.__add__(number) 295 | 296 | fn __rsub__( 297 | self, number: Float32 298 | ) raises -> Tensor[shape, NoneInitializer, NoneConstraint, False, False]: 299 | return self.__sub__(number) 300 | 301 | fn __rmul__( 302 | self, number: Float32 303 | ) raises -> Tensor[shape, NoneInitializer, NoneConstraint, False, False]: 304 | return self.__mul__(number) 305 | 306 | fn __rtruediv__( 307 | self, number: Float32 308 | ) raises -> Tensor[shape, NoneInitializer, NoneConstraint, False, False]: 309 | return self.__truediv__(number) 310 | 311 | fn __rpow__( 312 | self, number: Float32 313 | ) raises -> Tensor[shape, NoneInitializer, NoneConstraint, False, False]: 314 | var other = Tensor[shape, NoneInitializer, NoneConstraint, False, False]().fill( 315 | number 316 | ) 317 | other.node.set_is_single(True) 318 | other.node.set_computed(True) 319 | return other.__pow__(self) 320 | 321 | fn reshape[ 322 | new_shape: TensorShape 323 | ](self) raises -> Tensor[new_shape, NoneInitializer, NoneConstraint, False, False]: 324 | var new_tensor = self.load_tensor_for_unary_op[new_shape]() 325 | new_tensor.node = new_tensor.graph.reshape(self.node, shape) 326 | return new_tensor 327 | 328 | fn flatten( 329 | self, 330 | ) raises -> Tensor[ 331 | TensorShape(self.shape[0], self.shape.num_elements() // self.shape[0]), 332 | NoneInitializer, 333 | NoneConstraint, 334 | False, 335 | False, 336 | ]: 337 | var new_tensor = self.load_tensor_for_unary_op[ 338 | TensorShape(self.shape[0], self.shape.num_elements() // self.shape[0]) 339 | ]() 340 | new_tensor.node = new_tensor.graph.reshape( 341 | self.node, 342 | TensorShape(self.shape[0], self.shape.num_elements() // self.shape[0]), 343 | ) 344 | return new_tensor 345 | 346 | fn transp( 347 | self, 348 | ) raises -> Tensor[shape, NoneInitializer, NoneConstraint, False, False]: 349 | var new_tensor = self.load_tensor_for_unary_op() 350 | new_tensor.node = new_tensor.graph.transp(self.node) 351 | return new_tensor 352 | 353 | fn sum(self) raises -> Tensor[shape, NoneInitializer, NoneConstraint, False, False]: 354 | var new_tensor = self.load_tensor_for_unary_op() 355 | new_tensor.node = new_tensor.graph.sum(self.node) 356 | return new_tensor 357 | 358 | fn compute_function[ 359 | operator_id: Int 360 | ](self) raises -> Tensor[shape, NoneInitializer, NoneConstraint, False, False]: 361 | var new_tensor = self.load_tensor_for_unary_op() 362 | new_tensor.node = new_tensor.graph.function_general[operator_id](self.node) 363 | return new_tensor 364 | 365 | fn compute_loss[ 366 | operator_id: Int 367 | ](self, other: Tensor) raises -> Tensor[ 368 | shape, NoneInitializer, NoneConstraint, False, False 369 | ]: 370 | var new_tensor = self.load_tensor_for_binary_op(other) 371 | new_tensor.node = new_tensor.graph.loss_general[operator_id]( 372 | self.node, other.node 373 | ) 374 | return new_tensor 375 | 376 | fn compute_loss[ 377 | operator_name: String 378 | ](self, other: Tensor) raises -> Tensor[ 379 | shape, NoneInitializer, NoneConstraint, False, False 380 | ]: 381 | var new_tensor = self.load_tensor_for_binary_op(other) 382 | new_tensor.node = new_tensor.graph.loss_general[get_loss_code[operator_name]()]( 383 | self.node, other.node 384 | ) 385 | return new_tensor 386 | 387 | fn compute_activation[ 388 | operator_id: Int, arg1: Float32 = 0.0 389 | ](self) raises -> Tensor[shape, NoneInitializer, NoneConstraint, False, False]: 390 | var new_tensor = 
self.load_tensor_for_unary_op() 391 | new_tensor.node = new_tensor.graph.activation_general[operator_id, arg1]( 392 | self.node 393 | ) 394 | return new_tensor 395 | 396 | fn compute_activation[ 397 | operator_name: String, arg1: Float32 = 0.0 398 | ](self) raises -> Tensor[shape, NoneInitializer, NoneConstraint, False, False]: 399 | var new_tensor = self.load_tensor_for_unary_op() 400 | new_tensor.node = new_tensor.graph.activation_general[ 401 | get_activation_code[operator_name](), arg1 402 | ](self.node) 403 | return new_tensor 404 | 405 | fn conv_1d[ 406 | new_shape: TensorShape = shape 407 | ](self, other: Tensor, padding: Int, stride: Int) raises -> Tensor[ 408 | new_shape, NoneInitializer, NoneConstraint, False, False 409 | ]: 410 | var new_tensor = self.load_tensor_for_binary_op[new_shape](other) 411 | new_tensor.node = new_tensor.graph.conv_1d( 412 | self.node, other.node, padding, stride 413 | ) 414 | return new_tensor 415 | 416 | fn conv_2d[ 417 | new_shape: TensorShape = shape 418 | ]( 419 | self, other: Tensor, padding: StaticIntTuple[2], stride: StaticIntTuple[2] 420 | ) raises -> Tensor[new_shape, NoneInitializer, NoneConstraint, False, False]: 421 | var new_tensor = self.load_tensor_for_binary_op[new_shape](other) 422 | new_tensor.node = new_tensor.graph.conv_2d( 423 | self.node, other.node, padding, stride 424 | ) 425 | return new_tensor 426 | 427 | fn maxpool_1d[ 428 | new_shape: TensorShape = shape 429 | ](self, kernel_size: Int, stride: Int, padding: Int) raises -> Tensor[ 430 | new_shape, NoneInitializer, NoneConstraint, False, False 431 | ]: 432 | var new_tensor = self.load_tensor_for_unary_op[new_shape]() 433 | new_tensor.node = new_tensor.graph.maxpool_1d( 434 | self.node, kernel_size, stride, padding 435 | ) 436 | return new_tensor 437 | 438 | fn maxpool_2d[ 439 | new_shape: TensorShape = shape 440 | ](self, kernel_size: StaticIntTuple[2], stride: Int, padding: Int) raises -> Tensor[ 441 | new_shape, NoneInitializer, NoneConstraint, False, False 442 | ]: 443 | var new_tensor = self.load_tensor_for_unary_op[new_shape]() 444 | new_tensor.node = new_tensor.graph.maxpool_2d( 445 | self.node, kernel_size, stride, padding 446 | ) 447 | return new_tensor 448 | -------------------------------------------------------------------------------- /voodoo/utils/__init__.mojo: -------------------------------------------------------------------------------- 1 | from .array import Vector, reduce_vector_mul 2 | from .broadcast import ( 3 | shape_a, 4 | shape_b, 5 | strides_a, 6 | strides_b, 7 | get_broadcasted_shape_for_ew_op, 8 | recursive_broadcast, 9 | ) 10 | from .console import warn, error, info, success, debug, clear 11 | from .code_lookup import get_activation_code, get_loss_code 12 | from .operator_codes import * 13 | -------------------------------------------------------------------------------- /voodoo/utils/array.mojo: -------------------------------------------------------------------------------- 1 | from memory import memset_zero, memcpy 2 | from tensor import TensorShape 3 | from math import max 4 | 5 | 6 | @register_passable("trivial") 7 | struct Vector[type: AnyRegType](Sized): 8 | var _data: Pointer[type] 9 | var _len: Pointer[Int] 10 | var _cap: Int 11 | 12 | fn __init__(len: Int = 0) -> Self: 13 | var _cap = max(len, 8) 14 | var _data = Pointer[type].alloc(_cap) 15 | var _len = Pointer[Int].alloc(1) 16 | 17 | memset_zero(_data, _cap) 18 | _len.store(len) 19 | 20 | return Vector[type] {_data: _data, _len: _len, _cap: _cap} 21 | 22 | fn __init__(shape: 
TensorShape) -> Self: 23 | var len = shape.rank() 24 | var _data = Pointer[type].alloc(len) 25 | var _len = Pointer[Int].alloc(1) 26 | 27 | for i in range(len): 28 | _data.store(i, shape[i]) 29 | 30 | _len.store(len) 31 | 32 | return Vector[type] {_data: _data, _len: _len, _cap: len} 33 | 34 | fn __len__(self) -> Int: 35 | return self._len.load() 36 | 37 | fn __getitem__(self, idx: Int) -> type: 38 | return self._data.load(idx) 39 | 40 | fn __setitem__(self, idx: Int, value: type): 41 | self._data.store(idx, value) 42 | 43 | fn push_back(inout self, elem: type): 44 | var len = self._len.load() 45 | var curr_cap = self._cap 46 | 47 | if len == curr_cap: 48 | self._resize[True](max(1, curr_cap << 1)) 49 | 50 | self._data.store(len, elem) 51 | self._len.store(len + 1) 52 | 53 | fn pop_back(inout self) -> type: 54 | var new_len = self._len.load() - 1 55 | var curr_cap = self._cap 56 | 57 | self._len.store(new_len) 58 | var tmp = self._data.load(new_len) 59 | 60 | if new_len <= (curr_cap >> 2) and curr_cap > 32: 61 | self._resize[False](curr_cap >> 1) 62 | 63 | return tmp 64 | 65 | fn free(owned self): 66 | self._data.free() 67 | self._len.free() 68 | 69 | fn clear(inout self): 70 | self._resize[False](8) 71 | self._len.store(0) 72 | 73 | memset_zero(self._data, self._cap) 74 | 75 | fn copy(self) -> Self: 76 | var len = self._len.load() 77 | var new_vector = Vector[type](len) 78 | 79 | memcpy(new_vector._data, self._data, len) 80 | 81 | return new_vector 82 | 83 | fn _resize[up: Bool](inout self, new_cap: Int): 84 | var new_data = Pointer[type].alloc(new_cap) 85 | 86 | @parameter 87 | if up: 88 | memset_zero(new_data, new_cap) 89 | memcpy(new_data, self._data, self._cap) 90 | else: 91 | memcpy(new_data, self._data, new_cap) 92 | 93 | self._cap = new_cap 94 | self._data.free() 95 | self._data = new_data 96 | 97 | 98 | fn reduce_vector_mul[v: Vector[Int]]() -> Int: 99 | var result = 1 100 | 101 | for i in range(len(v)): 102 | result *= v[i] 103 | 104 | return result 105 | -------------------------------------------------------------------------------- /voodoo/utils/broadcast.mojo: -------------------------------------------------------------------------------- 1 | from math import max 2 | 3 | from voodoo.autograd import Node 4 | 5 | 6 | fn shape_a(depth: Int, a: Node, b: Node) -> Int: 7 | var diff = max(b.get_num_dims() - a.get_num_dims(), 0) 8 | return a.get_shape()[depth - diff] if depth >= diff else 1 9 | 10 | 11 | fn shape_b(depth: Int, a: Node, b: Node) -> Int: 12 | var diff = max(a.get_num_dims() - b.get_num_dims(), 0) 13 | return b.get_shape()[depth - diff] if depth >= diff else 1 14 | 15 | 16 | fn strides_a(depth: Int, a: Node, b: Node) -> Int: 17 | var diff = max(b.get_num_dims() - a.get_num_dims(), 0) 18 | return a.get_strides()[depth - diff] if depth >= diff else a.get_strides()[0] 19 | 20 | 21 | fn strides_b(depth: Int, a: Node, b: Node) -> Int: 22 | var diff = max(a.get_num_dims() - b.get_num_dims(), 0) 23 | return b.get_strides()[depth - diff] if depth >= diff else b.get_strides()[0] 24 | 25 | 26 | fn get_broadcasted_shape_for_ew_op(parent1: Node, parent2: Node) -> Vector[Int]: 27 | var shape = Vector[Int]() 28 | var target = parent1 if parent1.get_num_dims() - parent2.get_num_dims() > 0 else parent2 29 | for i in range(target.get_num_dims()): 30 | shape.push_back(target.get_shape()[i]) 31 | return shape 32 | 33 | 34 | fn base_case[ 35 | use_strides: Bool 36 | ](depth: Int, a: Node, b: Node, a_b_diff: Int, b_a_diff: Int) -> Bool: 37 | @parameter 38 | if use_strides: 39 | return ( 40 
| a.get_strides()[depth - b_a_diff] if depth 41 | >= b_a_diff else a.get_strides()[0] 42 | ) * (a.get_shape()[depth - b_a_diff] if depth >= b_a_diff else 1) == ( 43 | b.get_strides()[depth - a_b_diff] if depth 44 | >= a_b_diff else b.get_strides()[0] 45 | ) * ( 46 | b.get_shape()[depth - a_b_diff] if depth >= a_b_diff else 1 47 | ) 48 | else: 49 | return depth == max(a.get_num_dims(), b.get_num_dims()) - 2 50 | 51 | 52 | fn precompute_broadcasted_shape( 53 | diff: Int, shape: Pointer[Int], num_dims: Int 54 | ) -> Pointer[Int]: 55 | var precomputed_shape = Pointer[Int].alloc(num_dims) 56 | for i in range(num_dims): 57 | precomputed_shape[i] = 1 if i < diff else shape[i - diff] 58 | return precomputed_shape 59 | 60 | 61 | fn recursive_broadcast[ 62 | kernel: fn ( 63 | c: Node, a: Node, b: Node, a_index: Int, b_index: Int, c_index: Int, depth: Int 64 | ) -> None, 65 | use_strides: Bool, 66 | ]( 67 | c: Node, 68 | a: Node, 69 | b: Node, 70 | a_index: Int = 0, 71 | b_index: Int = 0, 72 | c_index: Int = 0, 73 | depth: Int = 0, 74 | ): 75 | var stack = Vector[Tuple[Int, Int, Int, Int]]() 76 | stack.push_back((a_index, b_index, c_index, depth)) 77 | 78 | var a_b_diff = max(a.get_num_dims() - b.get_num_dims(), 0) 79 | var b_a_diff = max(b.get_num_dims() - a.get_num_dims(), 0) 80 | var a_shape = a.get_shape()._data 81 | var b_shape = b.get_shape()._data 82 | var c_shape = c.get_shape()._data 83 | 84 | var a_shape_precomputed = precompute_broadcasted_shape( 85 | b_a_diff, a_shape, a.get_num_dims() 86 | ) 87 | var b_shape_precomputed = precompute_broadcasted_shape( 88 | a_b_diff, b_shape, b.get_num_dims() 89 | ) 90 | 91 | while len(stack) > 0: 92 | var item = stack.pop_back() 93 | 94 | var item_a_index = item.get[0, Int]() 95 | var item_b_index = item.get[1, Int]() 96 | var item_c_index = item.get[2, Int]() 97 | var item_depth = item.get[3, Int]() 98 | 99 | if base_case[use_strides](item_depth, a, b, a_b_diff, b_a_diff): 100 | kernel(c, a, b, item_a_index, item_b_index, item_c_index, item_depth) 101 | continue 102 | 103 | var a_shape = a_shape_precomputed[item_depth] 104 | var b_shape = b_shape_precomputed[item_depth] 105 | var c_shape_indexed = c_shape[item_depth] * item_c_index 106 | 107 | var scaled_a_index = item_a_index * a_shape 108 | var scaled_b_index = item_b_index * b_shape 109 | var max_shape = max(a_shape, b_shape) 110 | 111 | var a_step = 0 if a_shape == 1 else 1 112 | var b_step = 0 if b_shape == 1 else 1 113 | var new_depth = item_depth + 1 114 | 115 | for s in range(max_shape): 116 | stack.push_back( 117 | ( 118 | scaled_a_index + s * a_step, 119 | scaled_b_index + s * b_step, 120 | c_shape_indexed + s, 121 | new_depth, 122 | ) 123 | ) 124 | 125 | a_shape_precomputed.free() 126 | b_shape_precomputed.free() 127 | stack.free() 128 | -------------------------------------------------------------------------------- /voodoo/utils/code_lookup.mojo: -------------------------------------------------------------------------------- 1 | fn get_activation_code[name: String]() -> Int: 2 | @parameter 3 | if name == "relu": 4 | return relu_code 5 | elif name == "sigmoid": 6 | return sigmoid_code 7 | elif name == "softplus": 8 | return softplus_code 9 | elif name == "softsign": 10 | return softsign_code 11 | elif name == "tanh": 12 | return tanh_code 13 | elif name == "selu": 14 | return selu_code 15 | elif name == "elu": 16 | return elu_code 17 | elif name == "exp": 18 | return exp_code 19 | elif name == "lrelu": 20 | return lrelu_code 21 | elif name == "relu6": 22 | return relu6_code 23 | elif name 
== "silu": 24 | return silu_code 25 | elif name == "gelu": 26 | return gelu_code 27 | elif name == "h_sig": 28 | return h_sig_code 29 | elif name == "linear": 30 | return linear_code 31 | elif name == "mish": 32 | return mish_code 33 | warn("Invalid activation function: " + name + " using linear\n") 34 | return linear_code 35 | 36 | 37 | fn get_loss_code[name: String]() -> Int: 38 | @parameter 39 | if name == "mse": 40 | return mse_code 41 | elif name == "mae": 42 | return mae_code 43 | elif name == "mape": 44 | return mape_code 45 | elif name == "msle": 46 | return msle_code 47 | warn("Invalid loss function: " + name + " using mse\n") 48 | return mse_code 49 | -------------------------------------------------------------------------------- /voodoo/utils/console.mojo: -------------------------------------------------------------------------------- 1 | fn warn(msg: String): 2 | print_no_newline(chr(27) + "[0;33m" + msg + chr(27) + "[0;37m") 3 | 4 | 5 | fn error(msg: String): 6 | print_no_newline(chr(27) + "[0;31m" + msg + chr(27) + "[0;37m") 7 | 8 | 9 | fn info(msg: String): 10 | print_no_newline(chr(27) + "[0;34m" + msg + chr(27) + "[0;37m") 11 | 12 | 13 | fn success(msg: String): 14 | print_no_newline(chr(27) + "[0;32m" + msg + chr(27) + "[0;37m") 15 | 16 | 17 | fn debug(msg: String): 18 | print_no_newline(chr(27) + "[0;35m" + msg + chr(27) + "[0;37m") 19 | 20 | 21 | fn clear(): 22 | print_no_newline(chr(27) + "[2J" + chr(27) + "[0;37m") 23 | -------------------------------------------------------------------------------- /voodoo/utils/operator_codes.mojo: -------------------------------------------------------------------------------- 1 | alias copy_code = 0 2 | alias reshape_code = 2 3 | alias transp_code = 4 4 | alias sum_code = 6 5 | alias dropout_code = 8 6 | alias mmul_code = 10 7 | alias sqrt_code = 12 8 | alias abs_code = 14 9 | alias exp2_code = 16 10 | alias log2_code = 18 11 | alias log_code = 20 12 | alias sin_code = 22 13 | alias cos_code = 24 14 | alias tan_code = 26 15 | alias asin_code = 28 16 | alias acos_code = 30 17 | alias atan_code = 32 18 | alias sinh_code = 34 19 | alias cosh_code = 36 20 | alias add_code = 38 21 | alias mul_code = 40 22 | alias sub_code = 42 23 | alias div_code = 44 24 | alias pow_code = 46 25 | alias mse_code = 48 26 | alias mae_code = 50 27 | alias mape_code = 52 28 | alias msle_code = 54 29 | alias relu_code = 56 30 | alias sigmoid_code = 58 31 | alias softplus_code = 60 32 | alias softsign_code = 62 33 | alias tanh_code = 64 34 | alias selu_code = 66 35 | alias elu_code = 68 36 | alias exp_code = 70 37 | alias lrelu_code = 72 38 | alias relu6_code = 74 39 | alias silu_code = 76 40 | alias gelu_code = 78 41 | alias h_sig_code = 80 42 | alias linear_code = 82 43 | alias mish_code = 84 44 | alias conv1d_code = 86 45 | alias conv2d_code = 88 46 | alias maxpool1d_code = 90 47 | alias maxpool2d_code = 92 48 | --------------------------------------------------------------------------------