├── book ├── .gitignore ├── README.md ├── book.toml └── src │ ├── SUMMARY.md │ ├── tensor │ ├── utils.md │ ├── README.md │ ├── operation.md │ └── ops_padding.md │ ├── device.md │ ├── nn │ ├── dropout.md │ ├── linear.md │ ├── layer.md │ └── convolution.md │ ├── README.md │ └── guide │ └── installation.md ├── benches ├── README.md ├── benches │ └── maidenx_tensor │ │ ├── creation │ │ └── mod.rs │ │ ├── main.rs │ │ └── ops │ │ ├── mod.rs │ │ ├── reduction.rs │ │ └── unary.rs ├── src │ └── lib.rs └── Cargo.toml ├── crates ├── maidenx_cuda │ ├── .gitignore │ ├── src │ │ ├── nn │ │ │ ├── activation │ │ │ │ ├── mod.rs │ │ │ │ └── softmax.rs │ │ │ ├── mod.rs │ │ │ └── conv.rs │ │ └── ops │ │ │ ├── mod.rs │ │ │ ├── matmul.rs │ │ │ ├── reduction.rs │ │ │ └── padding.rs │ ├── Cargo.toml │ ├── kernels │ │ └── cuda_utils.cuh │ └── CMakeLists.txt ├── maidenx_mps │ ├── .gitignore │ ├── src │ │ ├── nn │ │ │ ├── activation │ │ │ │ └── mod.rs │ │ │ └── mod.rs │ │ ├── ops │ │ │ └── mod.rs │ │ └── metal_context.rs │ ├── kernels │ │ ├── README.md │ │ └── metal_utils.metal │ ├── Cargo.toml │ ├── Makefile │ └── build.rs ├── maidenx_cpu │ ├── src │ │ ├── nn │ │ │ ├── activation │ │ │ │ └── mod.rs │ │ │ └── mod.rs │ │ ├── lib.rs │ │ ├── ops │ │ │ └── mod.rs │ │ └── utils.rs │ └── Cargo.toml ├── maidenx_core │ ├── src │ │ ├── be │ │ │ ├── nn │ │ │ │ └── mod.rs │ │ │ ├── ops │ │ │ │ └── mod.rs │ │ │ └── mod.rs │ │ ├── lib.rs │ │ └── device.rs │ └── Cargo.toml ├── maidenx_nn │ ├── src │ │ ├── optimizers │ │ │ ├── mod.rs │ │ │ ├── sgd.rs │ │ │ └── adam.rs │ │ ├── losses │ │ │ ├── mod.rs │ │ │ ├── mae.rs │ │ │ ├── mse.rs │ │ │ ├── crossentropy.rs │ │ │ └── huber.rs │ │ ├── layers │ │ │ ├── mod.rs │ │ │ ├── activation.rs │ │ │ ├── linear.rs │ │ │ └── dropout.rs │ │ ├── lib.rs │ │ ├── optimizer.rs │ │ └── layer.rs │ ├── macros │ │ └── Cargo.toml │ └── Cargo.toml ├── maidenx_tensor_v2 │ ├── src │ │ ├── ops │ │ │ ├── indexing.rs │ │ │ ├── _try_indexing.rs │ │ │ ├── matmul.rs │ │ │ ├── padding.rs │ │ │ ├── transform.rs │ │ │ ├── binary.rs │ │ │ ├── reduction.rs │ │ │ └── unary.rs │ │ ├── utils.rs │ │ ├── constants.rs │ │ ├── prelude.rs │ │ ├── ops.rs │ │ ├── iterator.rs │ │ └── utils │ │ │ ├── tensor.rs │ │ │ ├── broadcast.rs │ │ │ ├── promotion.rs │ │ │ └── graph.rs │ ├── README.md │ ├── Cargo.toml │ └── tests │ │ ├── memory.rs │ │ └── cast.rs ├── maidenx_macro_utils │ ├── src │ │ ├── lib.rs │ │ └── manifest.rs │ └── Cargo.toml ├── maidenx_tensor │ ├── src │ │ ├── utils │ │ │ ├── mod.rs │ │ │ ├── logical.rs │ │ │ ├── broadcast.rs │ │ │ └── promotion.rs │ │ ├── ops │ │ │ └── mod.rs │ │ ├── iterator.rs │ │ ├── f.rs │ │ ├── operators.rs │ │ ├── wt.rs │ │ ├── d.rs │ │ └── vec.rs │ ├── Cargo.toml │ └── tests │ │ ├── tensor.rs │ │ └── wt.rs └── maidenx_internal │ ├── src │ ├── lib.rs │ └── prelude.rs │ └── Cargo.toml ├── .gitignore ├── assets └── serde │ ├── x.bin │ └── x.json ├── README.md ├── rustfmt.toml ├── src └── lib.rs ├── tools └── publish.sh ├── examples ├── serde │ └── serde.rs ├── test.rs └── device │ ├── mps.rs │ ├── cpu.rs │ └── cuda.rs ├── docs ├── neural-networks.md └── supported.md ├── Cargo.toml ├── .github └── workflows │ └── deploy-book.yml ├── LICENSE └── README_.md /book/.gitignore: -------------------------------------------------------------------------------- 1 | book 2 | -------------------------------------------------------------------------------- /book/README.md: -------------------------------------------------------------------------------- 1 | # MaidenX Book 2 | 
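The mdBook sources for the MaidenX guide live in this directory. To build the HTML output locally, run `cd book && mdbook build` from the repository root (the same command the deploy workflow uses); the rendered output lands in `book/book/`, which is git-ignored.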
-------------------------------------------------------------------------------- /benches/README.md: -------------------------------------------------------------------------------- 1 | # MaidenX Benchmark 2 | -------------------------------------------------------------------------------- /crates/maidenx_cuda/.gitignore: -------------------------------------------------------------------------------- 1 | .clangd 2 | -------------------------------------------------------------------------------- /crates/maidenx_mps/.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | -------------------------------------------------------------------------------- /benches/benches/maidenx_tensor/creation/mod.rs: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | Cargo.lock 3 | 4 | # OS 5 | .DS_Store 6 | -------------------------------------------------------------------------------- /crates/maidenx_cpu/src/nn/activation/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod softmax; 2 | -------------------------------------------------------------------------------- /crates/maidenx_cuda/src/nn/activation/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod softmax; 2 | -------------------------------------------------------------------------------- /crates/maidenx_mps/src/nn/activation/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod softmax; 2 | -------------------------------------------------------------------------------- /crates/maidenx_core/src/be/nn/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod activation; 2 | pub mod conv; 3 | -------------------------------------------------------------------------------- /crates/maidenx_cpu/src/nn/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod activation; 2 | pub mod conv; 3 | -------------------------------------------------------------------------------- /crates/maidenx_cuda/src/nn/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod activation; 2 | pub mod conv; 3 | -------------------------------------------------------------------------------- /crates/maidenx_mps/src/nn/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod activation; 2 | pub mod conv; 3 | -------------------------------------------------------------------------------- /crates/maidenx_nn/src/optimizers/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod adam; 2 | pub mod sgd; 3 | -------------------------------------------------------------------------------- /assets/serde/x.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/miniex/maidenx/HEAD/assets/serde/x.bin -------------------------------------------------------------------------------- /crates/maidenx_tensor_v2/src/ops/indexing.rs: -------------------------------------------------------------------------------- 1 | use crate::Tensor; 2 | 3 | impl Tensor {} 4 | 
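// The panicking wrappers for indexing ops have not been filled in yet. The
// pattern used by the sibling modules (see matmul.rs and padding.rs) is a thin
// wrapper that delegates to a `try_*` method from the matching `_try_*` module
// and `expect`s on failure. A sketch, with `try_index_select` as a hypothetical
// counterpart in _try_indexing.rs:
//
// impl Tensor {
//     pub fn index_select(&self, dim: usize, index: &Tensor) -> Tensor {
//         self.try_index_select(dim, index)
//             .expect("failed to perform index_select")
//     }
// }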
-------------------------------------------------------------------------------- /crates/maidenx_tensor_v2/src/ops/_try_indexing.rs: -------------------------------------------------------------------------------- 1 | use crate::Tensor; 2 | 3 | impl Tensor {} 4 | -------------------------------------------------------------------------------- /crates/maidenx_cpu/src/lib.rs: -------------------------------------------------------------------------------- 1 | #[cfg(feature = "nn")] 2 | pub mod nn; 3 | pub mod ops; 4 | 5 | pub mod utils; 6 | -------------------------------------------------------------------------------- /crates/maidenx_nn/src/losses/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod crossentropy; 2 | pub mod huber; 3 | pub mod mae; 4 | pub mod mse; 5 | -------------------------------------------------------------------------------- /crates/maidenx_macro_utils/src/lib.rs: -------------------------------------------------------------------------------- 1 | extern crate proc_macro; 2 | 3 | mod manifest; 4 | 5 | pub use manifest::*; 6 | -------------------------------------------------------------------------------- /crates/maidenx_tensor_v2/src/utils.rs: -------------------------------------------------------------------------------- 1 | pub mod broadcast; 2 | pub mod graph; 3 | pub mod promotion; 4 | pub mod tensor; 5 | -------------------------------------------------------------------------------- /crates/maidenx_tensor/src/utils/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod broadcast; 2 | pub mod indexing; 3 | pub mod logical; 4 | pub mod promotion; 5 | -------------------------------------------------------------------------------- /crates/maidenx_cpu/src/ops/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod binary; 2 | pub mod matmul; 3 | pub mod padding; 4 | pub mod reduction; 5 | pub mod unary; 6 | -------------------------------------------------------------------------------- /crates/maidenx_cuda/src/ops/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod binary; 2 | pub mod matmul; 3 | pub mod padding; 4 | pub mod reduction; 5 | pub mod unary; 6 | -------------------------------------------------------------------------------- /crates/maidenx_mps/src/ops/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod binary; 2 | pub mod matmul; 3 | pub mod padding; 4 | pub mod reduction; 5 | pub mod unary; 6 | -------------------------------------------------------------------------------- /crates/maidenx_core/src/be/ops/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod binary; 2 | pub mod matmul; 3 | pub mod padding; 4 | pub mod reduction; 5 | pub mod unary; 6 | -------------------------------------------------------------------------------- /crates/maidenx_tensor_v2/src/constants.rs: -------------------------------------------------------------------------------- 1 | use crate::{Tensor, TensorId}; 2 | 3 | pub static NULL_TENSOR: Tensor = Tensor(TensorId(0)); 4 | -------------------------------------------------------------------------------- /crates/maidenx_core/src/be/mod.rs: -------------------------------------------------------------------------------- 1 | #[cfg(feature = "nn")] 2 | pub mod nn; 3 | pub mod ops; 4 | 5 | type CleanupFn = Option<Box<dyn Fn()>>; 6 | 
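// `Box<dyn Fn()>` above is an assumed signature: the alias is an optional
// teardown callback a backend can hand back, e.g.
// `let cleanup: CleanupFn = Some(Box::new(|| { /* release a backend buffer */ }));`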
-------------------------------------------------------------------------------- /benches/benches/maidenx_tensor/main.rs: -------------------------------------------------------------------------------- 1 | mod creation; 2 | mod ops; 3 | 4 | use criterion::criterion_main; 5 | 6 | criterion_main!(ops::benches); 7 | -------------------------------------------------------------------------------- /crates/maidenx_tensor_v2/src/prelude.rs: -------------------------------------------------------------------------------- 1 | pub use crate::{constants, constants::*, eager, eager_mode, get_mode, lazy, lazy_mode, Tensor, TensorMode}; 2 | -------------------------------------------------------------------------------- /benches/src/lib.rs: -------------------------------------------------------------------------------- 1 | #[macro_export] 2 | macro_rules! bench { 3 | ($name:literal) => { 4 | concat!(module_path!(), "::", $name) 5 | }; 6 | } 7 | -------------------------------------------------------------------------------- /book/book.toml: -------------------------------------------------------------------------------- 1 | [book] 2 | authors = ["Han Damin "] 3 | language = "en" 4 | multilingual = false 5 | src = "src" 6 | title = "maidenx guide" 7 | -------------------------------------------------------------------------------- /crates/maidenx_nn/src/layers/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod activation; 2 | pub mod conv; 3 | pub mod dropout; 4 | pub mod embedding; 5 | pub mod linear; 6 | pub mod normalization; 7 | -------------------------------------------------------------------------------- /crates/maidenx_tensor/src/ops/mod.rs: -------------------------------------------------------------------------------- 1 | mod binary; 2 | mod indexing; 3 | mod matmul; 4 | mod padding; 5 | mod reduction; 6 | mod transform; 7 | pub(crate) mod unary; 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MaidenX 2 | 3 | This library has been discontinued and this repository will be archived. The successor library is [hodu](https://github.com/hodu-rs/hodu). 4 | -------------------------------------------------------------------------------- /crates/maidenx_mps/kernels/README.md: -------------------------------------------------------------------------------- 1 | # MPS Kernels 2 | 3 | > [!CAUTION] 4 | > This kernel code is Claude and GPT-riddled. 5 | > Only the brave should modify this code 6 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | max_width = 120 2 | tab_spaces = 4 3 | newline_style = "Unix" 4 | edition = "2021" 5 | use_field_init_shorthand = true 6 | match_block_trailing_comma = true 7 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024-present, Han Damin (miniex). All rights reserved. 2 | // See LICENSE file in the project root for full license information. 
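// The root crate is a thin facade; the whole public API below is re-exported
// from the `maidenx_internal` crate (see crates/maidenx_internal/src/lib.rs).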
3 | 4 | pub use maidenx_internal::*; 5 | -------------------------------------------------------------------------------- /benches/benches/maidenx_tensor/ops/mod.rs: -------------------------------------------------------------------------------- 1 | mod binary; 2 | mod reduction; 3 | mod unary; 4 | 5 | use criterion::criterion_group; 6 | 7 | criterion_group!(benches, binary::basic, unary::basic, reduction::basic); 8 | -------------------------------------------------------------------------------- /crates/maidenx_tensor_v2/src/ops/matmul.rs: -------------------------------------------------------------------------------- 1 | use crate::Tensor; 2 | 3 | impl Tensor { 4 | pub fn matmul(&self, rhs: &Self) -> Tensor { 5 | self.try_matmul(rhs).expect("failed to perform matrix multiplication") 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /crates/maidenx_internal/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod prelude; 2 | 3 | pub use maidenx_core as core; 4 | pub use maidenx_nn as nn; 5 | pub use maidenx_tensor as tensor; 6 | pub use maidenx_tensor_v2 as tensor_v2; 7 | 8 | pub use maidenx_core::dtype::*; 9 | -------------------------------------------------------------------------------- /crates/maidenx_tensor/src/utils/logical.rs: -------------------------------------------------------------------------------- 1 | use crate::Tensor; 2 | use maidenx_core::error::Result; 3 | 4 | pub fn any_true(src: &Tensor) -> Result<bool> { 5 | let vector = src.to_flatten_vec::<bool>()?; 6 | 7 | Ok(vector.iter().any(|&x| x)) 8 | } 9 | -------------------------------------------------------------------------------- /crates/maidenx_internal/src/prelude.rs: -------------------------------------------------------------------------------- 1 | pub use crate::core::{ 2 | device::{auto_set_device, get_default_device, set_default_device, Device}, 3 | dtype::*, 4 | scalar::Scalar, 5 | }; 6 | pub use crate::nn::layer::Layer; 7 | pub use crate::tensor::Tensor; 8 | pub use crate::{bf16, f16}; 9 | -------------------------------------------------------------------------------- /crates/maidenx_core/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod be; 2 | pub mod buffer; 3 | pub mod device; 4 | pub mod dtype; 5 | pub mod error; 6 | pub mod layout; 7 | pub mod scalar; 8 | 9 | pub use maidenx_cpu as cpu; 10 | #[cfg(feature = "cuda")] 11 | pub use maidenx_cuda as cuda; 12 | #[cfg(feature = "mps")] 13 | pub use maidenx_mps as mps; 14 | -------------------------------------------------------------------------------- /crates/maidenx_tensor_v2/src/ops.rs: -------------------------------------------------------------------------------- 1 | mod binary; 2 | mod indexing; 3 | mod matmul; 4 | mod padding; 5 | mod reduction; 6 | mod unary; 7 | 8 | mod _try_binary; 9 | mod _try_indexing; 10 | mod _try_matmul; 11 | mod _try_padding; 12 | mod _try_reduction; 13 | mod _try_transform; 14 | mod _try_unary; 15 | 16 | mod transform; 17 | -------------------------------------------------------------------------------- /crates/maidenx_nn/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod layer; 2 | pub mod optimizer; 3 | 4 | pub mod layers; 5 | pub mod losses; 6 | pub mod optimizers; 7 | 8 | pub use crate::{ 9 | layers::{activation::*, conv::*, dropout::*, embedding::*, linear::*, normalization::*}, 10 | losses::{huber::*, mae::*, mse::*}, 11
| optimizers::{adam::*, sgd::*}, 12 | }; 13 | -------------------------------------------------------------------------------- /crates/maidenx_tensor_v2/README.md: -------------------------------------------------------------------------------- 1 | # MaidenX Tensor 2 | 3 | ## Current Limitations 4 | 5 | - Currently only supports a single computation graph 6 | - Sequential execution only (no parallel execution) 7 | 8 | ## Future Plans 9 | 10 | - Support for multiple computation graphs 11 | - Parallel execution of graph operations 12 | - Performance optimizations for computation graph execution 13 | 14 | 15 | -------------------------------------------------------------------------------- /crates/maidenx_nn/src/optimizer.rs: -------------------------------------------------------------------------------- 1 | use maidenx_core::{error::Result, scalar::Scalar}; 2 | pub use maidenx_nn_macros::Optimizer; 3 | use maidenx_tensor::Tensor; 4 | 5 | pub trait Optimizer { 6 | fn step(&mut self, parameters: &mut [&mut Tensor]) -> Result<()>; 7 | fn zero_grad(&mut self, parameters: &mut [&mut Tensor]) -> Result<()>; 8 | fn set_learning_rate(&mut self, learning_rate: impl Into<Scalar>); 9 | } 10 | -------------------------------------------------------------------------------- /crates/maidenx_cpu/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "maidenx_cpu" 3 | version = "0.2.0-dev" 4 | description = "maidenx CPU backend" 5 | license = "BSD-3-Clause" 6 | authors = ["Han Damin "] 7 | edition = "2021" 8 | publish = true 9 | repository = "https://github.com/miniex/maidenx" 10 | 11 | [features] 12 | nn = [] 13 | 14 | [dependencies] 15 | half = { workspace = true } 16 | paste = { workspace = true } 17 | rayon = { workspace = true } 18 | libc = "0.2" 19 | -------------------------------------------------------------------------------- /crates/maidenx_mps/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "maidenx_mps" 3 | version = "0.2.0-dev" 4 | description = "maidenx MPS backend" 5 | license = "BSD-3-Clause" 6 | authors = ["Han Damin "] 7 | edition = "2021" 8 | publish = true 9 | repository = "https://github.com/miniex/maidenx" 10 | 11 | [features] 12 | nn = [] 13 | 14 | [dependencies] 15 | half = { workspace = true } 16 | paste = { workspace = true } 17 | metal = { version = "0.30.0", features = ["mps"] } 18 | -------------------------------------------------------------------------------- /assets/serde/x.json: -------------------------------------------------------------------------------- 1 | {"weight":{"data":{"buffer_data":[92,43,136,63,188,154,39,63,125,19,13,64],"buffer_len":3,"buffer_dtype":"f32","buffer_device":"CPU"},"metadata":{"device":"CPU","dtype":"f32","layout":{"shape":[1,3],"strides":[3,1],"offset":0},"requires_grad":true}},"bias":{"data":{"buffer_data":[27,72,118,62],"buffer_len":1,"buffer_dtype":"f32","buffer_device":"CPU"},"metadata":{"device":"CPU","dtype":"f32","layout":{"shape":[],"strides":[],"offset":0},"requires_grad":true}},"state":{"training":true}} -------------------------------------------------------------------------------- /crates/maidenx_macro_utils/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "maidenx_macro_utils" 3 | version = "0.2.0-dev" 4 | description = "maidenx macro utils" 5 | license = "BSD-3-Clause" 6 | authors = ["Han Damin "] 7 | edition = "2021" 8 | publish = true 9
| repository = "https://github.com/miniex/maidenx" 10 | 11 | [dependencies] 12 | toml_edit = { version = "0.22.7", default-features = false, features = [ 13 | "parse", 14 | ] } 15 | syn = "2.0" 16 | quote = "1.0" 17 | proc-macro2 = "1.0" 18 | -------------------------------------------------------------------------------- /crates/maidenx_cuda/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "maidenx_cuda" 3 | version = "0.2.0-dev" 4 | description = "maidenx CUDA backend" 5 | license = "BSD-3-Clause" 6 | authors = ["Han Damin "] 7 | edition = "2021" 8 | publish = true 9 | repository = "https://github.com/miniex/maidenx" 10 | 11 | [features] 12 | nn = [] 13 | 14 | [dependencies] 15 | half = { workspace = true } 16 | paste = { workspace = true } 17 | libc = "0.2" 18 | 19 | [build-dependencies] 20 | cmake = "0.1" 21 | walkdir = "2.5" 22 | -------------------------------------------------------------------------------- /crates/maidenx_nn/macros/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "maidenx_nn_macros" 3 | version = "0.2.0-dev" 4 | description = "maidenx nn macros" 5 | license = "BSD-3-Clause" 6 | authors = ["Han Damin "] 7 | edition = "2021" 8 | publish = true 9 | repository = "https://github.com/miniex/maidenx" 10 | 11 | [lib] 12 | proc-macro = true 13 | 14 | [features] 15 | serde = [] 16 | 17 | [dependencies] 18 | maidenx_macro_utils = { path = "../../maidenx_macro_utils", version = "0.2.0-dev" } 19 | syn = { version = "2.0", features = ["full"] } 20 | quote = "1.0" 21 | proc-macro2 = "1.0" 22 | -------------------------------------------------------------------------------- /benches/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "benches" 3 | edition = "2021" 4 | description = "Benchmarks that test MaidenX's performance" 5 | publish = false 6 | license = "BSD-3-Clause" 7 | autobenches = false 8 | 9 | [features] 10 | cpu = [] 11 | cuda = ["maidenx_core/cuda", "maidenx_tensor/cuda"] 12 | 13 | [dependencies] 14 | criterion = { version = "0.5", features = ["html_reports"] } 15 | 16 | [dev-dependencies] 17 | maidenx_core = { path = "../crates/maidenx_core" } 18 | maidenx_tensor = { path = "../crates/maidenx_tensor" } 19 | 20 | [lib] 21 | bench = false 22 | 23 | [[bench]] 24 | name = "tensor" 25 | path = "benches/maidenx_tensor/main.rs" 26 | harness = false 27 | -------------------------------------------------------------------------------- /tools/publish.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | crates=( 4 | maidenx_cpu 5 | maidenx_cuda 6 | maidenx_mps 7 | 8 | maidenx_macro_utils 9 | 10 | maidenx_core 11 | maidenx_tensor 12 | 13 | maidenx_nn/macros 14 | maidenx_nn 15 | 16 | maidenx_internal 17 | ) 18 | 19 | if [ -n "$(git status --porcelain)" ]; then 20 | echo "You have local changes!" 
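# abort: publishing from a dirty tree would ship uncommitted changes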
21 | exit 1 22 | fi 23 | 24 | pushd crates 25 | 26 | for crate in "${crates[@]}" 27 | do 28 | echo "Publishing ${crate}" 29 | cp ../LICENSE "$crate" 30 | pushd "$crate" 31 | git add LICENSE 32 | cargo publish --no-verify --allow-dirty 33 | popd 34 | sleep 20 35 | done 36 | 37 | popd 38 | 39 | echo "Publishing root crate" 40 | cargo publish --allow-dirty 41 | 42 | echo "Cleaning local state" 43 | git reset HEAD --hard 44 | 45 | -------------------------------------------------------------------------------- /examples/serde/serde.rs: -------------------------------------------------------------------------------- 1 | use maidenx::nn::*; 2 | use maidenx::prelude::*; 3 | 4 | fn main() -> Result<(), Box<dyn std::error::Error>> { 5 | // set_default_device(Device::CPU); 6 | // set_default_dtype(float32); 7 | // set_default_dtype(DType::F32); 8 | 9 | let tensor_x = Tensor::new(vec![1, 2, 3])?; 10 | let bytes = tensor_x.to_bytes()?; 11 | let tensor_y = Tensor::from_bytes(&bytes)?; 12 | 13 | // let x = Linear::new(3, 1, true)?; 14 | // x.save("assets/serde/x", "bytes")?; 15 | // x.save("assets/serde/x", "bin")?; 16 | // x.save("assets/serde/x", "json")?; 17 | let y = Linear::load("assets/serde/x.bin")?; 18 | let z = Linear::load("assets/serde/x.json")?; 19 | 20 | println!("y: {:?}\nz: {:?}", y.forward(&tensor_y)?, z.forward(&tensor_x)?); 21 | 22 | Ok(()) 23 | } 24 | -------------------------------------------------------------------------------- /crates/maidenx_cuda/src/nn/activation/softmax.rs: -------------------------------------------------------------------------------- 1 | use half::{bf16, f16}; 2 | 3 | #[link(name = "nn")] 4 | extern "C" {} 5 | 6 | #[macro_export] 7 | macro_rules! declare_extern_softmax_ops { 8 | ($($dtype:ident => $ty:ty),*) => { 9 | paste::paste! { 10 | extern "C" { 11 | $( 12 | pub fn [<softmax_ $dtype>]( 13 | num_els: usize, 14 | num_dims: usize, 15 | dim: usize, 16 | metadata: *const usize, 17 | input: *const $ty, 18 | output: *mut $ty, 19 | ); 20 | )* 21 | } 22 | } 23 | } 24 | } 25 | 26 | declare_extern_softmax_ops!
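// Expand the extern kernel bindings once per floating-point dtype. (The
// `softmax_` symbol prefix used above is an assumption; check the CUDA kernel
// sources for the exact exported names.)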
{ 27 | bf16 => bf16, 28 | f16 => f16, 29 | f32 => f32, 30 | f64 => f64 31 | } 32 | -------------------------------------------------------------------------------- /crates/maidenx_tensor/src/iterator.rs: -------------------------------------------------------------------------------- 1 | pub struct TensorIterator { 2 | pub shape: Vec<usize>, 3 | pub current: Vec<usize>, 4 | pub done: bool, 5 | } 6 | 7 | impl Iterator for TensorIterator { 8 | type Item = Vec<usize>; 9 | 10 | fn next(&mut self) -> Option<Self::Item> { 11 | if self.done { 12 | return None; 13 | } 14 | 15 | let result = self.current.clone(); 16 | 17 | let mut dim = self.current.len() - 1; 18 | loop { 19 | self.current[dim] += 1; 20 | if self.current[dim] < self.shape[dim] { 21 | break; 22 | } 23 | 24 | self.current[dim] = 0; 25 | 26 | if dim == 0 { 27 | self.done = true; 28 | break; 29 | } 30 | 31 | dim -= 1; 32 | } 33 | 34 | Some(result) 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /crates/maidenx_core/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "maidenx_core" 3 | version = "0.2.0-dev" 4 | description = "maidenx core" 5 | license = "BSD-3-Clause" 6 | authors = ["Han Damin "] 7 | edition = "2021" 8 | publish = true 9 | repository = "https://github.com/miniex/maidenx" 10 | 11 | [features] 12 | default = [] 13 | 14 | nn = ["maidenx_cpu/nn", "maidenx_cuda?/nn", "maidenx_mps?/nn"] 15 | serde = ["dep:serde"] 16 | 17 | cuda = ["dep:maidenx_cuda"] 18 | mps = ["dep:maidenx_mps"] 19 | 20 | [dependencies] 21 | maidenx_cpu = { path = "../maidenx_cpu", version = "0.2.0-dev" } 22 | maidenx_cuda = { path = "../maidenx_cuda", version = "0.2.0-dev", optional = true } 23 | maidenx_mps = { path = "../maidenx_mps", version = "0.2.0-dev", optional = true } 24 | half = { workspace = true } 25 | paste = { workspace = true } 26 | serde = { workspace = true, optional = true, features = ["derive"] } 27 | -------------------------------------------------------------------------------- /crates/maidenx_internal/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "maidenx_internal" 3 | version = "0.2.0-dev" 4 | description = "maidenx internal" 5 | license = "BSD-3-Clause" 6 | authors = ["Han Damin "] 7 | edition = "2021" 8 | publish = true 9 | repository = "https://github.com/miniex/maidenx" 10 | 11 | [features] 12 | nn = ["maidenx_core/nn", "dep:maidenx_nn"] 13 | serde = ["maidenx_core/serde", "maidenx_tensor/serde", "maidenx_nn?/serde"] 14 | 15 | cuda = ["maidenx_core/cuda", "maidenx_tensor/cuda"] 16 | mps = ["maidenx_core/mps", "maidenx_tensor/mps"] 17 | 18 | [dependencies] 19 | maidenx_core = { path = "../maidenx_core", version = "0.2.0-dev" } 20 | maidenx_nn = { path = "../maidenx_nn", version = "0.2.0-dev", optional = true } 21 | maidenx_tensor = { path = "../maidenx_tensor", version = "0.2.0-dev" } 22 | maidenx_tensor_v2 = { path = "../maidenx_tensor_v2", version = "0.2.0-dev" } 23 | -------------------------------------------------------------------------------- /crates/maidenx_tensor_v2/src/iterator.rs: -------------------------------------------------------------------------------- 1 | pub struct TensorIndexIterator { 2 | pub shape: Vec<usize>, 3 | pub current: Vec<usize>, 4 | pub done: bool, 5 | } 6 | 7 | impl Iterator for TensorIndexIterator { 8 | type Item = Vec<usize>; 9 | 10 | fn next(&mut self) -> Option<Self::Item> { 11 | if self.done { 12 | return None; 13 | } 14 | 15 | let result = self.current.clone(); 16 | 
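// Advance the index odometer-style: bump the last dimension and carry
// leftward on overflow; carrying past dimension 0 marks the iterator done.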
17 | let mut dim = self.current.len() - 1; 18 | loop { 19 | self.current[dim] += 1; 20 | if self.current[dim] < self.shape[dim] { 21 | break; 22 | } 23 | 24 | self.current[dim] = 0; 25 | 26 | if dim == 0 { 27 | self.done = true; 28 | break; 29 | } 30 | 31 | dim -= 1; 32 | } 33 | 34 | Some(result) 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /examples/test.rs: -------------------------------------------------------------------------------- 1 | use maidenx::tensor_v2::prelude::*; 2 | 3 | fn main() { 4 | lazy!(); 5 | 6 | let a = Tensor::new(vec![1.0, 2.0, 3.0]); 7 | let b = Tensor::new(vec![1.0, 2.0, 3.0]); 8 | let c = Tensor::new(vec![1.0, 2.0, 3.0]); 9 | a.enable_grad(); 10 | b.enable_grad(); 11 | c.enable_grad(); 12 | 13 | let d = a.add(&a.mul(&b)).add(&a.mul(&c)).add(&b.mul(&c)); 14 | d.forward(); 15 | d.backward(); 16 | 17 | println!("d: {}", d); 18 | 19 | println!("a.grad: {}", a.grad()); 20 | println!("b.grad: {}", b.grad()); 21 | println!("c.grad: {}", c.grad()); 22 | 23 | let e = a.add(&a.mul(&b).mul(&b)).add(&a.mul(&a)).add(&a.mul(&b).mul(&c)); 24 | e.forward(); 25 | e.backward(); 26 | 27 | println!("e: {}", e); 28 | 29 | println!("a.grad: {}", a.grad()); 30 | println!("b.grad: {}", b.grad()); 31 | println!("c.grad: {}", c.grad()); 32 | } 33 | -------------------------------------------------------------------------------- /crates/maidenx_tensor/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "maidenx_tensor" 3 | version = "0.2.0-dev" 4 | description = "maidenx tensor" 5 | license = "BSD-3-Clause" 6 | authors = ["Han Damin "] 7 | edition = "2021" 8 | publish = true 9 | repository = "https://github.com/miniex/maidenx" 10 | 11 | [features] 12 | serde = ["maidenx_core/serde", "dep:serde", "dep:serde_json", "dep:bincode"] 13 | graph = [] 14 | 15 | cuda = ["maidenx_core/cuda"] 16 | mps = ["maidenx_core/mps"] 17 | 18 | [dependencies] 19 | maidenx_core = { path = "../maidenx_core", version = "0.2.0-dev" } 20 | half = { workspace = true } 21 | paste = { workspace = true } 22 | rand = { version = "0.8.5" } 23 | rand_distr = { version = "0.4.3" } 24 | serde = { workspace = true, optional = true, features = ["derive"] } 25 | serde_json = { workspace = true, optional = true } 26 | bincode = { workspace = true, optional = true, features = ["serde"] } 27 | -------------------------------------------------------------------------------- /crates/maidenx_tensor_v2/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "maidenx_tensor_v2" 3 | version = "0.2.0-dev" 4 | description = "maidenx tensor" 5 | license = "BSD-3-Clause" 6 | authors = ["Han Damin "] 7 | edition = "2021" 8 | publish = true 9 | repository = "https://github.com/miniex/maidenx" 10 | 11 | [features] 12 | serde = ["maidenx_core/serde", "dep:serde", "dep:serde_json", "dep:bincode"] 13 | 14 | cuda = ["maidenx_core/cuda"] 15 | mps = ["maidenx_core/mps"] 16 | 17 | [dependencies] 18 | maidenx_core = { path = "../maidenx_core", version = "0.2.0-dev" } 19 | half = { workspace = true } 20 | paste = { workspace = true } 21 | rand = { version = "0.9.1" } 22 | rand_distr = { version = "0.5.1" } 23 | serde = { workspace = true, optional = true, features = ["derive"] } 24 | serde_json = { workspace = true, optional = true } 25 | bincode = { workspace = true, optional = true, features = ["serde"] } 26 | dashmap = { workspace = true } 27 | 
-------------------------------------------------------------------------------- /crates/maidenx_tensor_v2/src/ops/padding.rs: -------------------------------------------------------------------------------- 1 | use crate::Tensor; 2 | use maidenx_core::scalar::Scalar; 3 | 4 | impl Tensor { 5 | pub fn pad(&self, paddings: &[(usize, usize)], pad_value: impl Into<Scalar>) -> Tensor { 6 | self.try_pad(paddings, pad_value).expect("failed to pad tensor") 7 | } 8 | 9 | pub fn pad_with_constant(&self, paddings: &[(usize, usize)], pad_value: impl Into<Scalar>) -> Tensor { 10 | self.try_pad_with_constant(paddings, pad_value) 11 | .expect("failed to pad tensor with constant") 12 | } 13 | 14 | pub fn pad_with_reflection(&self, paddings: &[(usize, usize)]) -> Tensor { 15 | self.try_pad_with_reflection(paddings) 16 | .expect("failed to pad tensor with reflection") 17 | } 18 | 19 | pub fn pad_with_replication(&self, paddings: &[(usize, usize)]) -> Tensor { 20 | self.try_pad_with_replication(paddings) 21 | .expect("failed to pad tensor with replication") 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /crates/maidenx_tensor_v2/src/utils/tensor.rs: -------------------------------------------------------------------------------- 1 | use crate::{get_metadata_mut, Tensor, TensorId, TensorUpdateStatus}; 2 | use maidenx_core::error::{Error, Result}; 3 | 4 | pub fn update_tensor_status(tid: TensorId, status: TensorUpdateStatus) -> Result<()> { 5 | let metadata_ref = get_metadata_mut(tid).ok_or_else(|| Error::InvalidState("tensor metadata not found".into()))?; 6 | let mut metadata = metadata_ref 7 | .write() 8 | .map_err(|_| Error::InvalidState("failed to acquire metadata lock".into()))?; 9 | metadata.set_update_status(status); 10 | Ok(()) 11 | } 12 | 13 | pub fn share_storage_id(source: &Tensor, dest: &Tensor) -> Result<()> { 14 | if let Some(source_storage_id) = crate::get_storage_id(source.id()) { 15 | crate::link_tensor_to_storage(dest.id(), source_storage_id); 16 | Ok(()) 17 | } else { 18 | Err(maidenx_core::error::Error::InvalidState( 19 | "source tensor has no storage".into(), 20 | )) 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /crates/maidenx_cuda/src/nn/conv.rs: -------------------------------------------------------------------------------- 1 | use half::{bf16, f16}; 2 | 3 | #[link(name = "nn")] 4 | extern "C" {} 5 | 6 | #[macro_export] 7 | macro_rules! declare_extern_conv_ops { 8 | ($($dtype:ident => $ty:ty),*) => { 9 | paste::paste! { 10 | extern "C" { 11 | $( 12 | pub fn [<im2col_ $dtype>]( 13 | num_els: usize, 14 | metadata: *const usize, 15 | input: *const $ty, 16 | col: *mut $ty, 17 | ); 18 | 19 | pub fn [<col2im_ $dtype>]( 20 | num_els: usize, 21 | metadata: *const usize, 22 | col: *const $ty, 23 | output: *mut $ty, 24 | ); 25 | )* 26 | } 27 | } 28 | } 29 | } 30 | 31 | declare_extern_conv_ops!
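// One im2col/col2im extern pair per floating-point dtype. (The `im2col_` and
// `col2im_` symbol prefixes used above are assumptions; check the CUDA kernel
// sources for the exact exported names.)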
{ 32 | bf16 => bf16, 33 | f16 => f16, 34 | f32 => f32, 35 | f64 => f64 36 | } 37 | -------------------------------------------------------------------------------- /crates/maidenx_nn/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "maidenx_nn" 3 | version = "0.2.0-dev" 4 | description = "maidenx nn" 5 | license = "BSD-3-Clause" 6 | authors = ["Han Damin "] 7 | edition = "2021" 8 | publish = true 9 | repository = "https://github.com/miniex/maidenx" 10 | 11 | [features] 12 | default = [] 13 | 14 | serde = [ 15 | "dep:serde", 16 | "dep:serde_json", 17 | "dep:bincode", 18 | "maidenx_nn_macros/serde", 19 | "maidenx_core/serde", 20 | ] 21 | 22 | cuda = ["maidenx_core/cuda", "maidenx_tensor/cuda"] 23 | mps = ["maidenx_core/mps", "maidenx_tensor/mps"] 24 | 25 | [dependencies] 26 | maidenx_nn_macros = { path = "macros", version = "0.2.0-dev" } 27 | 28 | maidenx_core = { path = "../maidenx_core", version = "0.2.0-dev", features = [ 29 | "nn", 30 | ] } 31 | maidenx_tensor = { path = "../maidenx_tensor", version = "0.2.0-dev" } 32 | serde = { workspace = true, optional = true } 33 | serde_json = { workspace = true, optional = true } 34 | bincode = { workspace = true, optional = true, features = ["serde"] } 35 | -------------------------------------------------------------------------------- /crates/maidenx_nn/src/layer.rs: -------------------------------------------------------------------------------- 1 | use maidenx_core::error::Result; 2 | pub use maidenx_nn_macros::Layer; 3 | use maidenx_tensor::Tensor; 4 | #[cfg(feature = "serde")] 5 | use serde::{Deserialize, Serialize}; 6 | 7 | pub trait Layer<I> { 8 | fn forward(&self, input: I) -> Result<Tensor>; 9 | fn parameters(&mut self) -> Vec<&mut Tensor>; 10 | 11 | fn is_training(&self) -> bool; 12 | fn train(&mut self); 13 | fn eval(&mut self); 14 | } 15 | 16 | #[derive(Clone)] 17 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] 18 | pub struct LayerState { 19 | training: bool, 20 | } 21 | 22 | impl Default for LayerState { 23 | fn default() -> Self { 24 | Self { training: true } 25 | } 26 | } 27 | 28 | impl LayerState { 29 | pub fn new() -> Self { 30 | Self::default() 31 | } 32 | 33 | pub fn is_training(&self) -> bool { 34 | self.training 35 | } 36 | 37 | pub fn train(&mut self) { 38 | self.training = true; 39 | } 40 | 41 | pub fn eval(&mut self) { 42 | self.training = false; 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /crates/maidenx_nn/src/optimizers/sgd.rs: -------------------------------------------------------------------------------- 1 | use crate::optimizer::Optimizer; 2 | use maidenx_core::{error::Result, scalar::Scalar}; 3 | use maidenx_tensor::Tensor; 4 | 5 | #[derive(Optimizer)] 6 | pub struct SGD { 7 | learning_rate: Scalar, 8 | } 9 | 10 | impl SGD { 11 | pub fn new(learning_rate: impl Into<Scalar>) -> Self { 12 | Self { 13 | learning_rate: learning_rate.into(), 14 | } 15 | } 16 | 17 | pub fn step(&mut self, parameters: &mut [&mut Tensor]) -> Result<()> { 18 | for param in parameters.iter_mut() { 19 | if let Some(grad) = param.grad()?
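// vanilla SGD update: param <- param - learning_rate * grad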
{ 20 | let lr = self.learning_rate; 21 | 22 | param.sub_(&grad.mul_scalar(lr)?)?; 23 | } 24 | } 25 | Ok(()) 26 | } 27 | 28 | pub fn zero_grad(&mut self, parameters: &mut [&mut Tensor]) -> Result<()> { 29 | for param in parameters.iter_mut() { 30 | param.zero_grad()?; 31 | } 32 | Ok(()) 33 | } 34 | 35 | pub fn set_learning_rate(&mut self, learning_rate: impl Into<Scalar>) { 36 | self.learning_rate = learning_rate.into(); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /crates/maidenx_tensor/src/utils/broadcast.rs: -------------------------------------------------------------------------------- 1 | use crate::Tensor; 2 | use maidenx_core::error::{Error, Result}; 3 | 4 | pub fn broadcast_tensors(a: &Tensor, b: &Tensor) -> Result<(Tensor, Tensor)> { 5 | let a_shape = a.shape(); 6 | let b_shape = b.shape(); 7 | 8 | let a_ndim = a_shape.len(); 9 | let b_ndim = b_shape.len(); 10 | 11 | let output_ndim = a_ndim.max(b_ndim); 12 | let mut output_shape = vec![0; output_ndim]; 13 | 14 | for i in 0..output_ndim { 15 | let a_dim = if i < a_ndim { a_shape[a_ndim - 1 - i] } else { 1 }; 16 | let b_dim = if i < b_ndim { b_shape[b_ndim - 1 - i] } else { 1 }; 17 | 18 | if a_dim == 1 || b_dim == 1 || a_dim == b_dim { 19 | output_shape[output_ndim - 1 - i] = a_dim.max(b_dim); 20 | } else { 21 | return Err(Error::InvalidShape { 22 | message: format!("Cannot broadcast shapes {:?} and {:?}", a_shape, b_shape), 23 | }); 24 | } 25 | } 26 | 27 | let a_broadcasted = a.broadcast(&output_shape)?; 28 | let b_broadcasted = b.broadcast(&output_shape)?; 29 | 30 | Ok((a_broadcasted, b_broadcasted)) 31 | } 32 | -------------------------------------------------------------------------------- /crates/maidenx_tensor_v2/src/utils/broadcast.rs: -------------------------------------------------------------------------------- 1 | use crate::Tensor; 2 | use maidenx_core::error::{Error, Result}; 3 | 4 | pub fn broadcast_tensors(a: &Tensor, b: &Tensor) -> Result<(Tensor, Tensor)> { 5 | let a_shape = a.shape(); 6 | let b_shape = b.shape(); 7 | 8 | let a_ndim = a_shape.len(); 9 | let b_ndim = b_shape.len(); 10 | 11 | let output_ndim = a_ndim.max(b_ndim); 12 | let mut output_shape = vec![0; output_ndim]; 13 | 14 | for i in 0..output_ndim { 15 | let a_dim = if i < a_ndim { a_shape[a_ndim - 1 - i] } else { 1 }; 16 | let b_dim = if i < b_ndim { b_shape[b_ndim - 1 - i] } else { 1 }; 17 | 18 | if a_dim == 1 || b_dim == 1 || a_dim == b_dim { 19 | output_shape[output_ndim - 1 - i] = a_dim.max(b_dim); 20 | } else { 21 | return Err(Error::InvalidShape { 22 | message: format!("Cannot broadcast shapes {:?} and {:?}", a_shape, b_shape), 23 | }); 24 | } 25 | } 26 | 27 | let a_broadcasted = a.try_broadcast(&output_shape)?; 28 | let b_broadcasted = b.try_broadcast(&output_shape)?; 29 | 30 | Ok((a_broadcasted, b_broadcasted)) 31 | } 32 | 33 | -------------------------------------------------------------------------------- /book/src/SUMMARY.md: -------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | [Introduction](./README.md) 4 | 5 | --- 6 | 7 | # User Guide 8 | 9 | - [Installation](./guide/installation.md) 10 | - [Create Tensors](./guide/create-tensors.md) 11 | 12 | --- 13 | 14 | # Documentation 15 | 16 | - [Tensor](./tensor/README.md) 17 | - [Creation](./tensor/creation.md) 18 | - [Operation](./tensor/operation.md) 19 | - [binary](./tensor/ops_binary.md) 20 | - [unary](./tensor/ops_unary.md) 21 | - [reduction](./tensor/ops_reduction.md) 22 | - 
[transform](./tensor/ops_transform.md) 23 | - [padding](./tensor/ops_padding.md) 24 | - [indexing](./tensor/ops_indexing.md) 25 | - [Utils](./tensor/utils.md) 26 | - [Neural Networks](./nn/README.md) 27 | - [Layer](./nn/layer.md) 28 | - [Activation Layer](./nn/activation.md) 29 | - [Convolution Layer](./nn/convolution.md) 30 | - [Dropout Layer](./nn/dropout.md) 31 | - [Embedding Layer](./nn/embedding.md) 32 | - [Linear Layer](./nn/linear.md) 33 | - [Normalization Layer](./nn/normalization.md) 34 | - [Optimizer](./nn/optimizer.md) 35 | - [Device](./device.md) 36 | - [DType](./dtype.md) 37 | -------------------------------------------------------------------------------- /docs/neural-networks.md: -------------------------------------------------------------------------------- 1 | # Neural Networks 2 | 3 | ## Layers 4 | 5 | | Category | Name | Notes | 6 | |---|---|---| 7 | | **Linear Layers** | 8 | | | `Linear` | | 9 | | **Convolution Layers** | 10 | | | `Conv2d` | | 11 | | **Normalization Layers** | 12 | | | `LayerNorm` | | 13 | | **Dropout Layers** | 14 | | | `Dropout` | | 15 | | **Embedding Layers** | 16 | | | `Embedding` | | 17 | | **Activation Layers** | 18 | | | `Softmax` | | 19 | | tensor ops aliases | 20 | | | `ReLU` | | 21 | | | `Sigmoid` | | 22 | | | `Tanh` | | 23 | | | `LeakyReLU` | | 24 | | | `GELU` | | 25 | | | `ELU` | | 26 | 27 | ## Loss Layers 28 | 29 | | Category | Name | Notes | 30 | |---|---|---| 31 | | | `Huber` | | 32 | | | `MAE` | | 33 | | | `MSE` | | 34 | | | `CrossEntropyLoss` | | 35 | 36 | ## Optimizer 37 | 38 | | Category | Name | Notes | 39 | |---|---|---| 40 | | | `Adam` | | 41 | | | `SGD` | | 42 | -------------------------------------------------------------------------------- /crates/maidenx_tensor_v2/src/utils/promotion.rs: -------------------------------------------------------------------------------- 1 | use maidenx_core::dtype::DType; 2 | 3 | pub fn get_promoted_dtype(dtype1: DType, dtype2: DType) -> DType { 4 | let mut dtype1 = dtype1; 5 | let mut dtype2 = dtype2; 6 | 7 | if dtype1 == DType::BOOL { 8 | dtype1 = DType::U8; 9 | } 10 | if dtype2 == DType::BOOL { 11 | dtype2 = DType::U8; 12 | } 13 | 14 | match (dtype1, dtype2) { 15 | (dtype1, dtype2) if dtype1 == dtype2 => dtype1, 16 | 17 | (_, DType::F64) | (DType::F64, _) => DType::F64, 18 | (_, DType::F32) | (DType::F32, _) => DType::F32, 19 | (DType::BF16, DType::F16) | (DType::F16, DType::BF16) => DType::F32, 20 | (_, DType::F16) | (DType::F16, _) => DType::F16, 21 | (_, DType::BF16) | (DType::BF16, _) => DType::BF16, 22 | 23 | (_, DType::I64) | (DType::I64, _) => DType::I64, 24 | (_, DType::I32) | (DType::I32, _) => DType::I32, 25 | (_, DType::I16) | (DType::I16, _) => DType::I16, 26 | (_, DType::I8) | (DType::I8, _) => DType::I8, 27 | (_, DType::U64) | (DType::U64, _) => DType::I64, 28 | (_, DType::U32) | (DType::U32, _) => DType::I64, 29 | (_, DType::U16) | (DType::U16, _) => DType::I32, 30 | (_, DType::U8) | (DType::U8, _) => DType::I32, 31 | 32 | _ => dtype1, 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /crates/maidenx_cpu/src/utils.rs: -------------------------------------------------------------------------------- 1 | #[inline] 2 | pub fn is_contiguous(num_dims: usize, dims: &[usize], strides: &[usize]) -> bool { 3 | let mut acc = 1; 4 | for d in 0..num_dims { 5 | let dim_idx = num_dims - 1 - d; 6 | if dims[dim_idx] > 1 && acc != strides[dim_idx] { 7 | return false; 8 | } 9 | acc *= dims[dim_idx]; 10 | } 11 | true 12 | } 13 | 14 | #[inline] 15 | 
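/// Maps a flat element index to an offset into a strided buffer, peeling off
/// dimensions right-to-left; a stride of 0 (a broadcast dimension) contributes
/// nothing. E.g. with dims [2, 3] and strides [1, 2], flat index 4 is the
/// multi-index [1, 1], giving offset 1 * 1 + 1 * 2 = 3.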
pub fn get_strided_index(idx: usize, num_dims: usize, dims: &[usize], strides: &[usize]) -> usize { 16 | let mut strided_i = 0; 17 | let mut remaining_idx = idx; 18 | 19 | for d in 0..num_dims { 20 | let dim_idx = num_dims - 1 - d; 21 | 22 | if strides[dim_idx] != 0 { 23 | let dim_idx_value = remaining_idx % dims[dim_idx]; 24 | strided_i += dim_idx_value * strides[dim_idx]; 25 | } 26 | 27 | remaining_idx /= dims[dim_idx]; 28 | } 29 | 30 | strided_i 31 | } 32 | 33 | #[inline] 34 | pub fn restrided(strided_i: usize, num_dims: usize, dims: &[usize], strides: &[usize], new_strides: &[usize]) -> usize { 35 | let mut idx = 0; 36 | 37 | for d in 0..num_dims { 38 | idx += if strides[d] == 0 { 39 | 0 40 | } else { 41 | ((strided_i / strides[d]) % dims[d]) * new_strides[d] 42 | }; 43 | } 44 | 45 | idx 46 | } 47 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = ["crates/*", "benches"] 3 | 4 | [package] 5 | name = "maidenx" 6 | version = "0.2.0-dev" 7 | description = "Rust ML Framework for Maiden Engine" 8 | license = "BSD-3-Clause" 9 | authors = ["Han Damin "] 10 | edition = "2021" 11 | publish = true 12 | repository = "https://github.com/miniex/maidenx" 13 | documentation = "https://docs.rs/maidenx" 14 | readme = "README.md" 15 | keywords = ["tensor", "machine-learning", "maidenx"] 16 | categories = ["science"] 17 | 18 | [workspace.dependencies] 19 | half = "2.6.0" 20 | paste = "1.0" 21 | rayon = "1.10" 22 | serde = "1.0.219" 23 | serde_json = "1.0.140" 24 | bincode = "2.0.1" 25 | dashmap = "6.1.0" 26 | 27 | [features] 28 | default = ["nn", "serde"] 29 | 30 | nn = ["maidenx_internal/nn"] 31 | serde = ["maidenx_internal/serde"] 32 | 33 | cuda = ["maidenx_internal/cuda"] 34 | mps = ["maidenx_internal/mps"] 35 | 36 | [dependencies] 37 | maidenx_internal = { path = "crates/maidenx_internal", version = "0.2.0-dev" } 38 | 39 | [[example]] 40 | name = "cpu" 41 | path = "examples/device/cpu.rs" 42 | 43 | [[example]] 44 | name = "cuda" 45 | path = "examples/device/cuda.rs" 46 | 47 | [[example]] 48 | name = "mps" 49 | path = "examples/device/mps.rs" 50 | 51 | [[example]] 52 | name = "serde" 53 | path = "examples/serde/serde.rs" 54 | 55 | [[example]] 56 | name = "test" 57 | path = "examples/test.rs" 58 | -------------------------------------------------------------------------------- /crates/maidenx_tensor/tests/tensor.rs: -------------------------------------------------------------------------------- 1 | mod utils; 2 | 3 | use maidenx_core::error::Result; 4 | use utils::setup_tensor_without_dtype; 5 | 6 | #[test] 7 | fn any() -> Result<()> { 8 | let x = setup_tensor_without_dtype(vec![false, false, true, false, false])?; 9 | assert!(x.any()?); 10 | 11 | let y = setup_tensor_without_dtype(vec![false, false, false, false, false])?; 12 | assert!(!y.any()?); 13 | 14 | let z = setup_tensor_without_dtype(vec![0.0f32, 0.0, 0.0, 1.0, 0.0])?; 15 | assert!(z.any()?); 16 | 17 | let w = setup_tensor_without_dtype(vec![0.0f32, 0.0, 0.0, 0.0, 0.0])?; 18 | assert!(!w.any()?); 19 | 20 | Ok(()) 21 | } 22 | 23 | #[test] 24 | fn get() -> Result<()> { 25 | let x = setup_tensor_without_dtype(vec![3.0f32, 4.0, 5.0, 9.0, 7.0, 3.0])?; 26 | let scalar = x.get(&[4])?; 27 | 28 | assert_eq!(scalar.as_f32(), 7.0f32); 29 | 30 | Ok(()) 31 | } 32 | 33 | #[test] 34 | fn set() -> Result<()> { 35 | let mut x = setup_tensor_without_dtype(vec![3.0f32, 4.0, 5.0, 9.0, 7.0, 3.0])?; 36 | 
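// set() writes through the same index path that get() reads: element [4] of
// the flat 6-element tensor is overwritten in place.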
36 | x.set(&[4], 2.0)?; 37 | 38 | assert_eq!(x.to_flatten_vec::<f32>()?, vec![3.0f32, 4.0, 5.0, 9.0, 2.0, 3.0]); 39 | 40 | Ok(()) 41 | } 42 | 43 | #[test] 44 | fn select() -> Result<()> { 45 | let x = setup_tensor_without_dtype(vec![3.0f32, 4.0, 5.0, 9.0, 7.0, 3.0])?; 46 | let select = x.select(0, 4)?; 47 | 48 | assert_eq!(select.to_flatten_vec::<f32>()?, vec![7.0]); 49 | 50 | Ok(()) 51 | } 52 | -------------------------------------------------------------------------------- /crates/maidenx_cuda/src/ops/matmul.rs: -------------------------------------------------------------------------------- 1 | use half::{bf16, f16}; 2 | 3 | #[link(name = "ops")] 4 | extern "C" {} 5 | 6 | #[macro_export] 7 | macro_rules! declare_extern_matmul_ops { 8 | ($($dtype:ident => $ty:ty),*) => { 9 | paste::paste! { 10 | extern "C" { 11 | $( 12 | pub fn [<matmul_ $dtype>]( 13 | num_els: usize, 14 | metadata: *const usize, 15 | a: *const $ty, 16 | b: *const $ty, 17 | c: *mut $ty, 18 | ); 19 | 20 | pub fn [<matmul_backward_ $dtype>]( 21 | num_els_a: usize, 22 | num_els_b: usize, 23 | metadata: *const usize, 24 | grad_output: *const $ty, 25 | a: *const $ty, 26 | b: *const $ty, 27 | grad_a: *mut $ty, 28 | grad_b: *mut $ty, 29 | ); 30 | )* 31 | } 32 | } 33 | } 34 | } 35 | 36 | declare_extern_matmul_ops! { 37 | bf16 => bf16, 38 | f16 => f16, 39 | f32 => f32, 40 | f64 => f64, 41 | bool => bool, 42 | u8 => u8, 43 | u16 => u16, 44 | u32 => u32, 45 | u64 => u64, 46 | i8 => i8, 47 | i16 => i16, 48 | i32 => i32, 49 | i64 => i64 50 | } 51 | -------------------------------------------------------------------------------- /.github/workflows/deploy-book.yml: -------------------------------------------------------------------------------- 1 | name: Deploy mdBook 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | paths: 8 | - 'book/**' 9 | - '.github/workflows/deploy-book.yml' 10 | 11 | # Allow manual triggering 12 | workflow_dispatch: 13 | 14 | permissions: 15 | contents: write 16 | pages: write 17 | id-token: write 18 | 19 | jobs: 20 | deploy: 21 | runs-on: ubuntu-latest 22 | concurrency: 23 | group: ${{ github.workflow }}-${{ github.ref }} 24 | steps: 25 | - uses: actions/checkout@v4 26 | with: 27 | fetch-depth: 0 28 | 29 | - name: Install latest mdbook 30 | run: | 31 | tag=$(curl 'https://api.github.com/repos/rust-lang/mdbook/releases/latest' | jq -r '.tag_name') 32 | url="https://github.com/rust-lang/mdbook/releases/download/${tag}/mdbook-${tag}-x86_64-unknown-linux-gnu.tar.gz" 33 | mkdir mdbook 34 | curl -sSL $url | tar -xz --directory=./mdbook 35 | echo "$(pwd)/mdbook" >> $GITHUB_PATH 36 | 37 | - name: Build Book 38 | run: cd book && mdbook build 39 | 40 | - name: Setup Pages 41 | uses: actions/configure-pages@v4 42 | 43 | - name: Upload artifact 44 | uses: actions/upload-pages-artifact@v3 45 | with: 46 | path: 'book/book' 47 | 48 | - name: Deploy to GitHub Pages 49 | id: deployment 50 | uses: actions/deploy-pages@v4 -------------------------------------------------------------------------------- /crates/maidenx_tensor_v2/tests/memory.rs: -------------------------------------------------------------------------------- 1 | mod utils; 2 | 3 | use maidenx_core::{device::auto_set_device, error::Result}; 4 | use maidenx_tensor_v2::{Tensor, TensorMode}; 5 | use utils::test_both_modes; 6 | 7 | #[test] 8 | fn contiguous() -> Result<()> { 9 | test_both_modes(|mode| { 10 | auto_set_device(); 11 | 12 | let data = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0]; 13 | let tensor = Tensor::from_flatten_vec(data, &[3, 4]); 14 | assert!(tensor.is_contiguous()); 15 | 16 | let
transposed = tensor.transpose(0, 1); 17 | assert!(!transposed.is_contiguous()); 18 | 19 | let contiguous_tensor = transposed.contiguous(); 20 | 21 | assert!(contiguous_tensor.is_contiguous()); 22 | assert_eq!(contiguous_tensor.mode(), mode); 23 | assert_ne!(transposed.id(), contiguous_tensor.id()); 24 | 25 | match mode { 26 | TensorMode::Eager => { 27 | assert!(contiguous_tensor.is_const()); 28 | assert!(contiguous_tensor.is_storaged()); 29 | 30 | let contiguous_data = contiguous_tensor.to_flatten_vec::<f32>(); 31 | assert_eq!(contiguous_data.len(), 12); 32 | }, 33 | TensorMode::Lazy => { 34 | assert!(!contiguous_tensor.is_const()); 35 | assert!(!contiguous_tensor.is_storaged()); 36 | 37 | contiguous_tensor.forward(); 38 | assert!(contiguous_tensor.is_storaged()); 39 | 40 | let contiguous_data = contiguous_tensor.to_flatten_vec::<f32>(); 41 | assert_eq!(contiguous_data.len(), 12); 42 | }, 43 | } 44 | Ok(()) 45 | }) 46 | } 47 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2024-present, Han Damin (miniex). All rights reserved. 2 | 3 | All contributions by Han Damin (miniex): 4 | Copyright (c) 2024-present, Han Damin (miniex). 5 | All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | 1. Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | 13 | 2. Redistributions in binary form must reproduce the above copyright 14 | notice, this list of conditions and the following disclaimer in the 15 | documentation and/or other materials provided with the distribution. 16 | 17 | 3. Neither the pseudonym "Han Damin" nor the pseudonym "miniex" nor the names of any contributors may 18 | be used to endorse or promote products derived from this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 24 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 26 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 27 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 28 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 | POSSIBILITY OF SUCH DAMAGE.
31 | -------------------------------------------------------------------------------- /crates/maidenx_tensor_v2/src/utils/graph.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | get_current_graph_id, get_graph_mut, utils::tensor::share_storage_id, with_mode, Tensor, TensorId, TensorMode, 3 | TensorNode, 4 | }; 5 | use maidenx_core::error::{Error, Result}; 6 | use std::sync::Arc; 7 | 8 | pub fn add_to_graph<F>(op_name: &'static str, inputs: &[&Tensor], outputs: &[&Tensor], execute_fn: F) -> Result<()> 9 | where 10 | F: Fn(&[Tensor], &[Tensor]) -> Result<()> + Send + Sync + 'static, 11 | { 12 | if inputs.is_empty() { 13 | return Err(Error::InvalidState("No input tensors provided".into())); 14 | } 15 | 16 | let input_tids: Vec<TensorId> = inputs.iter().map(|t| t.id()).collect(); 17 | let output_tids: Vec<TensorId> = outputs.iter().map(|t| t.id()).collect(); 18 | 19 | let compute_fn = Arc::new(execute_fn); 20 | 21 | let node = TensorNode::new(op_name, input_tids, output_tids, Some(compute_fn)); 22 | let graph_id = get_current_graph_id(); 23 | let graph_entry = 24 | get_graph_mut(graph_id).ok_or_else(|| Error::InvalidState(format!("graph {:?} not found", graph_id)))?; 25 | let mut graph = graph_entry.write().map_err(|_| Error::Lock)?; 26 | graph.add_node(node); 27 | 28 | Ok(()) 29 | } 30 | 31 | pub fn accumulate(from: &Tensor, to: &Tensor) -> Result<()> { 32 | add_to_graph("accumulate", &[from], &[to], move |inputs, outputs| { 33 | with_mode(TensorMode::Eager, || { 34 | if outputs[0].is_storaged() { 35 | let temp = outputs[0].try_add(&inputs[0])?; 36 | share_storage_id(&temp, &outputs[0])?; 37 | } else { 38 | share_storage_id(&inputs[0], &outputs[0])?; 39 | } 40 | Ok(()) 41 | }) 42 | }) 43 | } 44 | -------------------------------------------------------------------------------- /crates/maidenx_cuda/kernels/cuda_utils.cuh: -------------------------------------------------------------------------------- 1 | // Reference: candle-rs (https://github.com/huggingface/candle) 2 | 3 | #include "cuda_bf16.h" 4 | #include "cuda_fp16.h" 5 | #include <stddef.h> 6 | #include <stdint.h> 7 | 8 | __device__ __forceinline__ bool is_contiguous(const size_t num_dims, 9 | const size_t *dims, 10 | const size_t *strides) { 11 | size_t acc = 1; 12 | for (unsigned int d = 0; d < num_dims; d++) { 13 | unsigned int dim_idx = num_dims - 1 - d; 14 | if (dims[dim_idx] > 1 && acc != strides[dim_idx]) { 15 | return false; 16 | } 17 | acc *= dims[dim_idx]; 18 | } 19 | return true; 20 | } 21 | 22 | __device__ __forceinline__ unsigned int 23 | get_strided_index(size_t idx, size_t num_dims, const size_t *dims, 24 | const size_t *strides) { 25 | size_t strided_i = 0; 26 | for (int d = num_dims - 1; d >= 0; d--) { 27 | size_t dim_idx_value = idx % dims[d]; 28 | strided_i += (strides[d] == 0 ? 0 : dim_idx_value * strides[d]); 29 | idx /= dims[d]; 30 | } 31 | return strided_i; 32 | } 33 | 34 | __device__ __forceinline__ unsigned int restrided(const unsigned int strided_i, 35 | const size_t num_dims, 36 | const size_t *dims, 37 | const size_t *strides, 38 | const size_t *new_strides) { 39 | unsigned int idx = 0; 40 | for (int d = 0; d < num_dims; d++) { 41 | idx += (strides[d] == 0 ?
0 : (strided_i / strides[d]) % dims[d]) *
42 | new_strides[d];
43 | }
44 | return idx;
45 | }
46 | 
47 | // __half
48 | 
49 | // __nv_bfloat16
50 | 
--------------------------------------------------------------------------------
/crates/maidenx_mps/kernels/metal_utils.metal:
--------------------------------------------------------------------------------
1 | #include <metal_stdlib>
2 | #include
3 | 
4 | using namespace metal;
5 | 
6 | inline bool is_contiguous(const size_t num_dims,
7 | constant size_t *dims,
8 | constant size_t *strides) {
9 | size_t acc = 1;
10 | for (unsigned int d = 0; d < num_dims; d++) {
11 | unsigned int dim_idx = num_dims - 1 - d;
12 | if (dims[dim_idx] > 1 && acc != strides[dim_idx]) {
13 | return false;
14 | }
15 | acc *= dims[dim_idx];
16 | }
17 | return true;
18 | }
19 | 
20 | inline unsigned int
21 | get_strided_index(unsigned int idx, const size_t num_dims, constant size_t *dims,
22 | constant size_t *strides) {
23 | unsigned int strided_i = 0;
24 | for (unsigned int d = 0; d < num_dims; d++) {
25 | unsigned int dim_idx = num_dims - 1 - d;
26 | unsigned int current_dim = dims[dim_idx];
27 | unsigned int current_stride = strides[dim_idx];
28 | 
29 | unsigned int idx_mod_dim = current_dim == 0 ? 0 : (idx % current_dim);
30 | 
31 | if (current_stride != 0) {
32 | strided_i += idx_mod_dim * current_stride;
33 | }
34 | 
35 | idx = current_dim == 0 ? 0 : (idx / current_dim);
36 | }
37 | return strided_i;
38 | }
39 | 
40 | inline unsigned int restrided(const unsigned int strided_i,
41 | const size_t num_dims,
42 | constant size_t *dims,
43 | constant size_t *strides,
44 | constant size_t *new_strides) {
45 | unsigned int idx = 0;
46 | for (size_t d = 0; d < num_dims; d++) {
47 | idx += (strides[d] == 0 ? 0 : (strided_i / strides[d]) % dims[d]) *
48 | new_strides[d];
49 | }
50 | return idx;
51 | }
52 | 
--------------------------------------------------------------------------------
/examples/device/mps.rs:
--------------------------------------------------------------------------------
1 | use maidenx::nn::*;
2 | use maidenx::prelude::*;
3 | use std::time::Instant;
4 | 
5 | fn main() -> Result<(), Box<dyn std::error::Error>> {
6 | set_default_device(Device::MPS);
7 | // set_default_dtype(float32);
8 | // set_default_dtype(DType::F32);
9 | 
10 | let input_data: Vec<Vec<f32>> = (0..10000)
11 | .map(|i| {
12 | vec![
13 | (i % 100) as f32 / 100.0,
14 | ((i % 100) + 1) as f32 / 100.0,
15 | ((i % 100) + 2) as f32 / 100.0,
16 | ]
17 | })
18 | .collect();
19 | let target_data: Vec<Vec<f32>> = (0..10000).map(|i| vec![((i % 100) * 10) as f32 / 1000.0]).collect();
20 | 
21 | let mut input = Tensor::new(input_data)?;
22 | let target = Tensor::new(target_data)?;
23 | input.with_grad()?;
24 | 
25 | let mut linear = Linear::new(3, 1, true)?;
26 | let mse_loss = MSELoss::new();
27 | let mut optimizer = SGD::new(0.01);
28 | let epochs = 1000;
29 | 
30 | let mut hundred_epochs_start = Instant::now();
31 | 
32 | for epoch in 0..epochs {
33 | let pred = linear.forward(&input)?;
34 | let loss = mse_loss.forward((&pred, &target))?;
35 | loss.backward()?;
36 | 
37 | optimizer.step(&mut linear.parameters())?;
38 | optimizer.zero_grad(&mut linear.parameters())?;
39 | 
40 | if (epoch + 1) % 100 == 0 {
41 | let hundred_elapsed = hundred_epochs_start.elapsed();
42 | let params = linear.parameters();
43 | println!(
44 | "Epoch {}: Loss = {}, 100 Epochs Time = {:?}, Weight = {}, Bias = {}",
45 | epoch + 1,
46 | loss,
47 | hundred_elapsed,
48 | params[0],
49 | params.get(1).unwrap()
50 | );
51 | hundred_epochs_start = Instant::now();
52 | }
53 | }
54 | 
55 | Ok(())
56 | 
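// MPS does not support 64-bit dtypes (u64/i64/f64), so this example keeps
// every tensor at the default f32 precision; see docs/supported.md.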
}
57 | 
--------------------------------------------------------------------------------
/examples/device/cpu.rs:
--------------------------------------------------------------------------------
1 | use maidenx::nn::*;
2 | use maidenx::prelude::*;
3 | use std::time::Instant;
4 | 
5 | fn main() -> Result<(), Box<dyn std::error::Error>> {
6 | // set_default_device(Device::CPU);
7 | // set_default_dtype(float32);
8 | // set_default_dtype(DType::F32);
9 | 
10 | let input_data: Vec<Vec<f32>> = (0..10000)
11 | .map(|i| {
12 | vec![
13 | (i % 100) as f32 / 100.0,
14 | ((i % 100) + 1) as f32 / 100.0,
15 | ((i % 100) + 2) as f32 / 100.0,
16 | ]
17 | })
18 | .collect();
19 | let target_data: Vec<Vec<f32>> = (0..10000).map(|i| vec![((i % 100) * 10) as f32 / 1000.0]).collect();
20 | 
21 | let mut input = Tensor::new(input_data)?;
22 | let target = Tensor::new(target_data)?;
23 | input.with_grad()?;
24 | 
25 | let mut linear = Linear::new(3, 1, true)?;
26 | let mse_loss = MSELoss::new();
27 | let mut optimizer = SGD::new(0.01);
28 | let epochs = 1000;
29 | 
30 | let mut hundred_epochs_start = Instant::now();
31 | 
32 | for epoch in 0..epochs {
33 | let pred = linear.forward(&input)?;
34 | let loss = mse_loss.forward((&pred, &target))?;
35 | loss.backward()?;
36 | 
37 | optimizer.step(&mut linear.parameters())?;
38 | optimizer.zero_grad(&mut linear.parameters())?;
39 | 
40 | if (epoch + 1) % 100 == 0 {
41 | let hundred_elapsed = hundred_epochs_start.elapsed();
42 | let params = linear.parameters();
43 | println!(
44 | "Epoch {}: Loss = {}, 100 Epochs Time = {:?}, Weight = {}, Bias = {}",
45 | epoch + 1,
46 | loss,
47 | hundred_elapsed,
48 | params[0],
49 | params.get(1).unwrap()
50 | );
51 | hundred_epochs_start = Instant::now();
52 | }
53 | }
54 | 
55 | Ok(())
56 | }
57 | 
--------------------------------------------------------------------------------
/examples/device/cuda.rs:
--------------------------------------------------------------------------------
1 | use maidenx::nn::*;
2 | use maidenx::prelude::*;
3 | use std::time::Instant;
4 | 
5 | fn main() -> Result<(), Box<dyn std::error::Error>> {
6 | set_default_device(Device::CUDA(0));
7 | // set_default_dtype(float32);
8 | // set_default_dtype(DType::F32);
9 | 
10 | let input_data: Vec<Vec<f32>> = (0..10000)
11 | .map(|i| {
12 | vec![
13 | (i % 100) as f32 / 100.0,
14 | ((i % 100) + 1) as f32 / 100.0,
15 | ((i % 100) + 2) as f32 / 100.0,
16 | ]
17 | })
18 | .collect();
19 | let target_data: Vec<Vec<f32>> = (0..10000).map(|i| vec![((i % 100) * 10) as f32 / 1000.0]).collect();
20 | 
21 | let mut input = Tensor::new(input_data)?;
22 | let target = Tensor::new(target_data)?;
23 | input.with_grad()?;
24 | 
25 | let mut linear = Linear::new(3, 1, true)?;
26 | let mse_loss = MSELoss::new();
27 | let mut optimizer = SGD::new(0.01);
28 | let epochs = 1000;
29 | 
30 | let mut hundred_epochs_start = Instant::now();
31 | 
32 | for epoch in 0..epochs {
33 | let pred = linear.forward(&input)?;
34 | let loss = mse_loss.forward((&pred, &target))?;
35 | loss.backward()?;
36 | 
37 | optimizer.step(&mut linear.parameters())?;
38 | optimizer.zero_grad(&mut linear.parameters())?;
39 | 
40 | if (epoch + 1) % 100 == 0 {
41 | let hundred_elapsed = hundred_epochs_start.elapsed();
42 | let params = linear.parameters();
43 | println!(
44 | "Epoch {}: Loss = {}, 100 Epochs Time = {:?}, Weight = {}, Bias = {}",
45 | epoch + 1,
46 | loss,
47 | hundred_elapsed,
48 | params[0],
49 | params.get(1).unwrap()
50 | );
51 | hundred_epochs_start = Instant::now();
52 | }
53 | }
54 | 
55 | Ok(())
56 | }
57 | 
--------------------------------------------------------------------------------
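The three device examples above are identical apart from the `set_default_device` call. A minimal sketch of the same setup using the feature-aware `auto_set_device` helper from `maidenx_core::device` (assuming, as the examples suggest, that the prelude re-exports it):

```rust
use maidenx::prelude::*;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Prefers CUDA(0) when compiled with the `cuda` feature, then MPS with
    // the `mps` feature, and falls back to CPU otherwise.
    auto_set_device();

    let x = Tensor::new(vec![1.0f32, 2.0, 3.0])?;
    let y = x.add(&x)?;
    println!("{}", y); // [2.00000000, 4.00000000, 6.00000000]
    Ok(())
}
```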
/crates/maidenx_tensor_v2/src/ops/transform.rs:
--------------------------------------------------------------------------------
1 | use crate::Tensor;
2 | use maidenx_core::scalar::Scalar;
3 | 
4 | impl Tensor {
5 | pub fn view<T: Into<Scalar> + Clone>(&self, shape: &[T]) -> Self {
6 | self.try_view(shape).expect("failed to create view")
7 | }
8 | 
9 | pub fn squeeze<T: Into<Scalar> + Clone>(&self, dims: &[T]) -> Self {
10 | self.try_squeeze(dims).expect("failed to squeeze tensor")
11 | }
12 | 
13 | pub fn squeeze_all(&self) -> Self {
14 | self.try_squeeze_all().expect("failed to squeeze all dimensions")
15 | }
16 | 
17 | pub fn unsqueeze<T: Into<Scalar> + Clone>(&self, dims: &[T]) -> Self {
18 | self.try_unsqueeze(dims).expect("failed to unsqueeze tensor")
19 | }
20 | 
21 | pub fn transpose(&self, dim0: impl Into<Scalar>, dim1: impl Into<Scalar>) -> Self {
22 | self.try_transpose(dim0, dim1).expect("failed to transpose tensor")
23 | }
24 | 
25 | pub fn slice(
26 | &self,
27 | dim: impl Into<Scalar>,
28 | start: impl Into<Scalar>,
29 | end: Option<impl Into<Scalar>>,
30 | step: impl Into<Scalar>,
31 | ) -> Self {
32 | self.try_slice(dim, start, end, step).expect("failed to slice tensor")
33 | }
34 | 
35 | pub fn unfold(&self, dim: impl Into<Scalar>, size: impl Into<Scalar>, step: impl Into<Scalar>) -> Self {
36 | self.try_unfold(dim, size, step).expect("failed to unfold tensor")
37 | }
38 | 
39 | pub fn broadcast(&self, shape: &[usize]) -> Self {
40 | self.try_broadcast(shape).expect("failed to broadcast tensor")
41 | }
42 | 
43 | pub fn broadcast_like(&self, other: &Self) -> Self {
44 | self.try_broadcast_like(other)
45 | .expect("failed to broadcast tensor like other")
46 | }
47 | 
48 | pub fn broadcast_left(&self, batch_dims: &[usize]) -> Self {
49 | self.try_broadcast_left(batch_dims)
50 | .expect("failed to broadcast tensor left")
51 | }
52 | 
53 | pub fn reshape<T: Into<Scalar> + Clone>(&self, shape: &[T]) -> Self {
54 | self.try_reshape(shape).expect("failed to reshape tensor")
55 | }
56 | }
57 | 
--------------------------------------------------------------------------------
/docs/supported.md:
--------------------------------------------------------------------------------
1 | # Supported
2 | 
3 | ## Device
4 | 
5 | - CPU
6 | - CUDA
7 | - MPS
8 | - Vulkan (planned)
9 | 
10 | ## DType
11 | 
12 | | Data type | DType | Notes |
13 | |---|---|---|
14 | | 16-bit brain floating point | `maidenx::bfloat16` | |
15 | | 16-bit floating point | `maidenx::float16` | |
16 | | 32-bit floating point | `maidenx::float32` | |
17 | | 64-bit floating point | `maidenx::float64` | Not supported on MPS |
18 | | boolean | `maidenx::bool` | |
19 | | 8-bit integer (unsigned) | `maidenx::uint8` | |
20 | | 16-bit integer (unsigned) | `maidenx::uint16` | |
21 | | 32-bit integer (unsigned) | `maidenx::uint32` | |
22 | | 64-bit integer (unsigned) | `maidenx::uint64` | Not supported on MPS |
23 | | 8-bit integer (signed) | `maidenx::int8` | |
24 | | 16-bit integer (signed) | `maidenx::int16` | |
25 | | 32-bit integer (signed) | `maidenx::int32` | |
26 | | 64-bit integer (signed) | `maidenx::int64` | Not supported on MPS |
27 | 
28 | > [!NOTE]
29 | > The boolean type in the MaidenX framework is promoted based on the type of operation:
30 | > 
31 | > - For logical operations: Remains as maidenx::bool
32 | > - For arithmetic operations: Promoted to maidenx::uint8 (8-bit unsigned integer)
33 | > - For operations involving floating-point numbers: Promoted to maidenx::float32 (32-bit floating point)
34 | > 
35 | > This conversion happens automatically in the framework to ensure type compatibility during different operations, as boolean values (true/false) need to be represented as
numeric values (1/0) when used in mathematical contexts.
36 | 
37 | > [!IMPORTANT]
38 | > Automatic differentiation in MaidenX only supports floating-point types.
39 | > Integer and boolean types cannot be used with gradient computation.
40 | 
41 | > [!IMPORTANT]
42 | > MPS (Metal Performance Shaders) does not support 64-bit data types (uint64, int64, float64).
43 | > When using MPS as your compute device, please use 32-bit or lower precision data types instead.
--------------------------------------------------------------------------------
/crates/maidenx_nn/src/losses/mae.rs:
--------------------------------------------------------------------------------
1 | use crate::layer::{Layer, LayerState};
2 | use maidenx_core::{error::Result, scalar::Scalar};
3 | use maidenx_tensor::Tensor;
4 | #[cfg(feature = "serde")]
5 | use serde::{Deserialize, Serialize};
6 | 
7 | #[derive(Layer, Clone, Default)]
8 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
9 | #[layer(inputs = 2)]
10 | pub struct MAELoss {
11 | state: LayerState,
12 | }
13 | 
14 | impl MAELoss {
15 | pub fn new() -> Self {
16 | Self {
17 | state: LayerState::new(),
18 | }
19 | }
20 | 
21 | pub fn forward(&self, (pred, target): (&Tensor, &Tensor)) -> Result<Tensor> {
22 | let diff = pred.sub(target)?;
23 | let abs_diff = diff.abs()?;
24 | 
25 | let batch_size = Scalar::new(pred.shape()[0]);
26 | let mean = abs_diff.sum_all()?.div_scalar(batch_size)?;
27 | 
28 | Ok(mean)
29 | }
30 | }
31 | 
32 | #[cfg(test)]
33 | mod tests {
34 | use super::*;
35 | use maidenx_core::device::{set_default_device, Device};
36 | 
37 | fn setup_device() {
38 | #[cfg(feature = "cuda")]
39 | set_default_device(Device::CUDA(0));
40 | #[cfg(not(any(feature = "cuda")))]
41 | set_default_device(Device::CPU);
42 | }
43 | 
44 | #[test]
45 | fn forward() -> Result<()> {
46 | setup_device();
47 | 
48 | let pred = Tensor::new(vec![2.0f32, 3.0, 4.0])?;
49 | let target = Tensor::new(vec![1.0f32, 2.0, 3.0])?;
50 | 
51 | let mae_loss = MAELoss::new();
52 | let loss = mae_loss.forward((&pred, &target))?;
53 | 
54 | assert_eq!(loss.to_flatten_vec::<f32>()?, vec![1.0]);
55 | 
56 | Ok(())
57 | }
58 | 
59 | #[test]
60 | fn backward() -> Result<()> {
61 | setup_device();
62 | 
63 | let mut pred = Tensor::new(vec![2.0f32, 3.0, 4.0])?;
64 | pred.with_grad()?;
65 | 
66 | let target = Tensor::new(vec![1.0f32, 2.0, 3.0])?;
67 | 
68 | let mae_loss = MAELoss::new();
69 | let loss = mae_loss.forward((&pred, &target))?;
70 | loss.backward()?;
71 | 
72 | if let Some(grad) = pred.grad()?
{
73 | let pred_grad = grad.to_flatten_vec::<f32>()?;
74 | let expected_grad = vec![1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0];
75 | assert_eq!(pred_grad, expected_grad);
76 | }
77 | 
78 | Ok(())
79 | }
80 | }
81 | 
--------------------------------------------------------------------------------
/crates/maidenx_nn/src/losses/mse.rs:
--------------------------------------------------------------------------------
1 | use crate::layer::{Layer, LayerState};
2 | use maidenx_core::{error::Result, scalar::Scalar};
3 | use maidenx_tensor::Tensor;
4 | #[cfg(feature = "serde")]
5 | use serde::{Deserialize, Serialize};
6 | 
7 | #[derive(Layer, Clone, Default)]
8 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
9 | #[layer(inputs = 2)]
10 | pub struct MSELoss {
11 | state: LayerState,
12 | }
13 | 
14 | impl MSELoss {
15 | pub fn new() -> Self {
16 | Self {
17 | state: LayerState::new(),
18 | }
19 | }
20 | 
21 | pub fn forward(&self, (pred, target): (&Tensor, &Tensor)) -> Result<Tensor> {
22 | let diff = pred.sub(target)?;
23 | let squared = diff.pow(2.0)?;
24 | 
25 | let num_elements = Scalar::new(squared.size());
26 | let mean = squared.sum_all()?.div_scalar(num_elements)?;
27 | 
28 | Ok(mean)
29 | }
30 | }
31 | 
32 | #[cfg(test)]
33 | mod tests {
34 | use super::*;
35 | use maidenx_core::device::{set_default_device, Device};
36 | 
37 | fn setup_device() {
38 | #[cfg(feature = "cuda")]
39 | set_default_device(Device::CUDA(0));
40 | #[cfg(not(any(feature = "cuda")))]
41 | set_default_device(Device::CPU);
42 | }
43 | 
44 | #[test]
45 | fn forward() -> Result<()> {
46 | setup_device();
47 | 
48 | let pred = Tensor::new(vec![2.0f32, 3.0, 4.0])?;
49 | let target = Tensor::new(vec![1.0f32, 2.0, 3.0])?;
50 | let mse_loss = MSELoss::new();
51 | 
52 | let loss = mse_loss.forward((&pred, &target))?;
53 | 
54 | assert_eq!(loss.to_flatten_vec::<f32>()?, vec![1.0]);
55 | 
56 | Ok(())
57 | }
58 | 
59 | #[test]
60 | fn backward() -> Result<()> {
61 | setup_device();
62 | 
63 | let mut pred = Tensor::new(vec![2.0f32, 3.0, 4.0])?;
64 | pred.with_grad()?;
65 | let target = Tensor::new(vec![1.0f32, 2.0, 3.0])?;
66 | let mse_loss = MSELoss::new();
67 | let loss = mse_loss.forward((&pred, &target))?;
68 | 
69 | loss.backward()?;
70 | 
71 | if let Some(grad) = pred.grad()? {
72 | let pred_grad = grad.to_flatten_vec::<f32>()?;
73 | let expected_grad = vec![2.0 / 3.0, 2.0 / 3.0, 2.0 / 3.0];
74 | assert_eq!(pred_grad, expected_grad);
75 | }
76 | 
77 | Ok(())
78 | }
79 | }
80 | 
--------------------------------------------------------------------------------
/crates/maidenx_tensor/src/f.rs:
--------------------------------------------------------------------------------
1 | use std::sync::Arc;
2 | 
3 | use crate::{Tensor, TensorData, TensorMetadata};
4 | use maidenx_core::{
5 | buffer::BufferManager,
6 | device::Device,
7 | error::{Error, Result},
8 | };
9 | 
10 | impl Tensor {
11 | pub fn contiguous(&self) -> Result<Self> {
12 | if self.is_contiguous() {
13 | return Ok(self.clone());
14 | }
15 | 
16 | let mut result = Self::empty_like(self)?;
17 | 
18 | match self.device() {
19 | Device::CPU => {
20 | for indices in self.index_iter()?
{
21 | let value = self.get(&indices)?;
22 | result.set(&indices, value)?;
23 | }
24 | },
25 | #[cfg(feature = "cuda")]
26 | Device::CUDA(device_id) => {
27 | let temp = self.to_device(Device::CPU)?;
28 | let contiguous_temp = temp.contiguous()?;
29 | result = contiguous_temp.to_device(Device::CUDA(device_id))?;
30 | },
31 | #[cfg(feature = "mps")]
32 | Device::MPS => {
33 | let temp = self.to_device(Device::CPU)?;
34 | let contiguous_temp = temp.contiguous()?;
35 | result = contiguous_temp.to_device(Device::MPS)?;
36 | },
37 | }
38 | 
39 | Ok(result)
40 | }
41 | 
42 | pub fn copy(&self) -> Result<Self> {
43 | let device = self.device();
44 | let dtype = self.dtype();
45 | let layout = self.layout().clone();
46 | 
47 | let mut buffer = BufferManager::create(self.buffer().len(), device, dtype)?;
48 | 
49 | {
50 | let buffer_mut = Arc::get_mut(&mut buffer).ok_or(Error::BufferShared)?;
51 | unsafe {
52 | buffer_mut.copy_from(self.buffer(), 0, 0, self.buffer().len())?;
53 | }
54 | }
55 | 
56 | Ok(Self {
57 | data: TensorData { buffer, grad: None },
58 | metadata: TensorMetadata {
59 | device,
60 | dtype,
61 | layout,
62 | requires_grad: false,
63 | },
64 | node: None,
65 | })
66 | }
67 | 
68 | pub fn detach(&self) -> Result<Self> {
69 | let mut result = self.clone();
70 | result.metadata.requires_grad = false;
71 | result.node = None;
72 | 
73 | Ok(result)
74 | }
75 | }
76 | 
--------------------------------------------------------------------------------
/crates/maidenx_tensor_v2/src/ops/binary.rs:
--------------------------------------------------------------------------------
1 | use crate::Tensor;
2 | 
3 | impl Tensor {
4 | pub fn add(&self, rhs: &Self) -> Self {
5 | self.try_add(rhs).expect("failed to add tensors")
6 | }
7 | 
8 | pub fn sub(&self, rhs: &Self) -> Self {
9 | self.try_sub(rhs).expect("failed to subtract tensors")
10 | }
11 | 
12 | pub fn mul(&self, rhs: &Self) -> Self {
13 | self.try_mul(rhs).expect("failed to multiply tensors")
14 | }
15 | 
16 | pub fn div(&self, rhs: &Self) -> Self {
17 | self.try_div(rhs).expect("failed to divide tensors")
18 | }
19 | 
20 | pub fn maximum(&self, rhs: &Self) -> Self {
21 | self.try_maximum(rhs).expect("failed to compute maximum")
22 | }
23 | 
24 | pub fn minimum(&self, rhs: &Self) -> Self {
25 | self.try_minimum(rhs).expect("failed to compute minimum")
26 | }
27 | 
28 | pub fn logical_and(&self, rhs: &Self) -> Self {
29 | self.try_logical_and(rhs).expect("failed to compute logical_and")
30 | }
31 | 
32 | pub fn logical_or(&self, rhs: &Self) -> Self {
33 | self.try_logical_or(rhs).expect("failed to compute logical_or")
34 | }
35 | 
36 | pub fn logical_xor(&self, rhs: &Self) -> Self {
37 | self.try_logical_xor(rhs).expect("failed to compute logical_xor")
38 | }
39 | 
40 | pub fn eq(&self, rhs: &Self) -> Self {
41 | self.try_eq(rhs).expect("failed to compute eq")
42 | }
43 | 
44 | pub fn ne(&self, rhs: &Self) -> Self {
45 | self.try_ne(rhs).expect("failed to compute ne")
46 | }
47 | 
48 | pub fn lt(&self, rhs: &Self) -> Self {
49 | self.try_lt(rhs).expect("failed to compute lt")
50 | }
51 | 
52 | pub fn le(&self, rhs: &Self) -> Self {
53 | self.try_le(rhs).expect("failed to compute le")
54 | }
55 | 
56 | pub fn gt(&self, rhs: &Self) -> Self {
57 | self.try_gt(rhs).expect("failed to compute gt")
58 | }
59 | 
60 | pub fn ge(&self, rhs: &Self) -> Self {
61 | self.try_ge(rhs).expect("failed to compute ge")
62 | }
63 | 
64 | pub fn add_(&mut self, rhs: &Self) {
65 | self.try_add_(rhs).expect("failed to add_ tensors")
66 | }
67 | 
68 | pub fn sub_(&mut self, rhs: &Self) {
69 |
self.try_sub_(rhs).expect("failed to sub_ tensors")
70 | }
71 | 
72 | pub fn mul_(&mut self, rhs: &Self) {
73 | self.try_mul_(rhs).expect("failed to mul_ tensors")
74 | }
75 | 
76 | pub fn div_(&mut self, rhs: &Self) {
77 | self.try_div_(rhs).expect("failed to div_ tensors")
78 | }
79 | }
80 | 
--------------------------------------------------------------------------------
/crates/maidenx_tensor_v2/src/ops/reduction.rs:
--------------------------------------------------------------------------------
1 | use crate::Tensor;
2 | use maidenx_core::scalar::Scalar;
3 | 
4 | impl Tensor {
5 | pub fn sum(&self, dim: impl Into<Scalar>, keep_dim: bool) -> Tensor {
6 | self.try_sum(dim, keep_dim).expect("failed to sum tensor")
7 | }
8 | 
9 | pub fn sum_all(&self) -> Self {
10 | self.try_sum_all().expect("failed to sum all tensor")
11 | }
12 | 
13 | pub fn sum_to_shape(&self, shape: &[usize]) -> Tensor {
14 | self.try_sum_to_shape(shape).expect("failed to sum to shape")
15 | }
16 | 
17 | pub fn mean(&self, dim: impl Into<Scalar>, keep_dim: bool) -> Self {
18 | self.try_mean(dim, keep_dim).expect("failed to mean tensor")
19 | }
20 | 
21 | pub fn mean_all(&self) -> Self {
22 | self.try_mean_all().expect("failed to mean all tensor")
23 | }
24 | 
25 | pub fn fold(&self, dim: impl Into<Scalar>, size: impl Into<Scalar>, step: impl Into<Scalar>) -> Tensor {
26 | self.try_fold(dim, size, step).expect("failed to fold tensor")
27 | }
28 | 
29 | pub fn max(&self, dim: impl Into<Scalar>, keep_dim: bool) -> Tensor {
30 | self.try_max(dim, keep_dim).expect("failed to max tensor")
31 | }
32 | 
33 | pub fn max_all(&self) -> Self {
34 | self.try_max_all().expect("failed to max all tensor")
35 | }
36 | 
37 | pub fn min(&self, dim: impl Into<Scalar>, keep_dim: bool) -> Tensor {
38 | self.try_min(dim, keep_dim).expect("failed to min tensor")
39 | }
40 | 
41 | pub fn min_all(&self) -> Self {
42 | self.try_min_all().expect("failed to min all tensor")
43 | }
44 | 
45 | pub fn norm(&self, p: impl Into<Scalar>, dim: impl Into<Scalar>, keep_dim: bool) -> Self {
46 | self.try_norm(p, dim, keep_dim).expect("failed to norm tensor")
47 | }
48 | 
49 | pub fn norm_all(&self, p: impl Into<Scalar>) -> Tensor {
50 | self.try_norm_all(p).expect("failed to norm all tensor")
51 | }
52 | 
53 | pub fn var(&self, dim: impl Into<Scalar>, keep_dim: bool, unbiased: bool) -> Self {
54 | self.try_var(dim, keep_dim, unbiased).expect("failed to var tensor")
55 | }
56 | 
57 | pub fn var_all(&self, unbiased: bool) -> Tensor {
58 | self.try_var_all(unbiased).expect("failed to var all tensor")
59 | }
60 | 
61 | pub fn std(&self, dim: impl Into<Scalar>, keep_dim: bool, unbiased: bool) -> Tensor {
62 | self.try_std(dim, keep_dim, unbiased).expect("failed to std tensor")
63 | }
64 | 
65 | pub fn std_all(&self, unbiased: bool) -> Tensor {
66 | self.try_std_all(unbiased).expect("failed to std all tensor")
67 | }
68 | }
69 | 
--------------------------------------------------------------------------------
/book/src/tensor/utils.md:
--------------------------------------------------------------------------------
1 | # Tensor Utilities
2 | 
3 | ## Device and Type Conversion
4 | 
5 | These utilities allow you to change a tensor's device or data type.
6 | 
7 | ### with_device / to_device
8 | ```rust
9 | pub fn with_device(&mut self, device: Device) -> Result<()>
10 | pub fn to_device(&self, device: Device) -> Result<Tensor>
11 | ```
12 | Changes the device where a tensor is stored.
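When the device actually changes, a fresh buffer is allocated on the target device and the tensor's contents are copied into it.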
13 | 
14 | - **Parameters**:
15 | - `device`: The target device (CPU, CUDA, MPS)
16 | - **Returns**:
17 | - `with_device`: Modifies the tensor in-place and returns Result
18 | - `to_device`: Returns a new tensor on the specified device
19 | - **Example**:
20 | ```rust
21 | let a = Tensor::new(vec![1.0, 2.0, 3.0])?;
22 | let b = a.to_device(Device::CPU)?; // Copy to CPU
23 | 
24 | // In-place version
25 | let mut c = Tensor::new(vec![1.0, 2.0, 3.0])?;
26 | c.with_device(Device::CPU)?; // Move to CPU in-place
27 | ```
28 | 
29 | ### with_dtype / to_dtype
30 | ```rust
31 | pub fn with_dtype(&mut self, dtype: DType) -> Result<()>
32 | pub fn to_dtype(&self, dtype: DType) -> Result<Tensor>
33 | ```
34 | Changes the data type of a tensor.
35 | 
36 | - **Parameters**:
37 | - `dtype`: The target data type (F32, F64, I32, etc.)
38 | - **Returns**:
39 | - `with_dtype`: Modifies the tensor in-place and returns Result
40 | - `to_dtype`: Returns a new tensor with the specified data type
41 | - **Example**:
42 | ```rust
43 | let a = Tensor::new(vec![1.0, 2.0, 3.0])?;
44 | let b = a.to_dtype(DType::F64)?; // Convert to 64-bit float
45 | 
46 | // In-place version
47 | let mut c = Tensor::new(vec![1.0, 2.0, 3.0])?;
48 | c.with_dtype(DType::I32)?; // Convert to 32-bit int in-place
49 | ```
50 | 
51 | ### with_shape / to_shape
52 | ```rust
53 | pub fn with_shape(&mut self, shape: &[usize]) -> Result<()>
54 | pub fn to_shape(&self, shape: &[usize]) -> Result<Tensor>
55 | ```
56 | Changes the shape of a tensor without modifying the data.
57 | 
58 | - **Parameters**:
59 | - `shape`: The new shape dimensions
60 | - **Returns**:
61 | - `with_shape`: Modifies the tensor in-place and returns Result
62 | - `to_shape`: Returns a new tensor with the specified shape
63 | - **Example**:
64 | ```rust
65 | let a = Tensor::new(vec![1.0, 2.0, 3.0, 4.0])?;
66 | let b = a.to_shape(&[2, 2])?; // Reshape to 2x2
67 | 
68 | // In-place version
69 | let mut c = Tensor::new(vec![1.0, 2.0, 3.0, 4.0])?;
70 | c.with_shape(&[2, 2])?; // Reshape to 2x2 in-place
71 | ```
72 | 
73 | ### with_grad
74 | ```rust
75 | pub fn with_grad(&mut self) -> Result<()>
76 | ```
77 | Enables gradient computation for a tensor.
78 | 79 | - **Returns**: Modifies the tensor in-place and returns Result 80 | - **Example**: 81 | ```rust 82 | let mut a = Tensor::new(vec![1.0, 2.0, 3.0])?; 83 | a.with_grad()?; // Enable gradients 84 | ``` 85 | -------------------------------------------------------------------------------- /crates/maidenx_mps/Makefile: -------------------------------------------------------------------------------- 1 | # Directory settings 2 | KERNEL_DIR = kernels 3 | OUTPUT_DIR = build 4 | INCLUDE_DIR = $(KERNEL_DIR) 5 | 6 | # Output files 7 | OPS_LIB = $(OUTPUT_DIR)/ops.metallib 8 | NN_LIB = $(OUTPUT_DIR)/nn.metallib 9 | 10 | # Intermediate files 11 | OPS_DIR = $(OUTPUT_DIR)/ops 12 | NN_DIR = $(OUTPUT_DIR)/nn 13 | 14 | # Utils file 15 | ATOMICS = $(KERNEL_DIR)/atomics.metal 16 | ATOMICS_AIR = $(OUTPUT_DIR)/atomics.air 17 | UTILS = $(KERNEL_DIR)/metal_utils.metal 18 | UTILS_AIR = $(OUTPUT_DIR)/metal_utils.air 19 | 20 | # Source files 21 | OPS_SOURCES = $(KERNEL_DIR)/ops/binary.metal \ 22 | $(KERNEL_DIR)/ops/matmul.metal \ 23 | $(KERNEL_DIR)/ops/padding.metal \ 24 | $(KERNEL_DIR)/ops/reduction.metal \ 25 | $(KERNEL_DIR)/ops/unary.metal 26 | 27 | NN_SOURCES = $(KERNEL_DIR)/nn/activation/softmax.metal \ 28 | $(KERNEL_DIR)/nn/conv.metal 29 | 30 | # Derived .air file paths 31 | OPS_AIR_FILES = $(patsubst $(KERNEL_DIR)/ops/%.metal,$(OPS_DIR)/%.air,$(OPS_SOURCES)) 32 | NN_AIR_FILES = $(patsubst $(KERNEL_DIR)/nn/%.metal,$(NN_DIR)/%.air,$(NN_SOURCES)) 33 | NN_ACTIVATION_AIR_FILES = $(patsubst $(KERNEL_DIR)/nn/activation/%.metal,$(NN_DIR)/activation/%.air,$(wildcard $(KERNEL_DIR)/nn/activation/*.metal)) 34 | 35 | # Compiler settings 36 | METAL = xcrun -sdk macosx metal 37 | METALLIB = xcrun -sdk macosx metallib 38 | METAL_FLAGS = -I $(INCLUDE_DIR) 39 | 40 | # Default target 41 | all: prepare $(OPS_LIB) $(NN_LIB) 42 | 43 | # Create build directories 44 | prepare: 45 | @mkdir -p $(OUTPUT_DIR) 46 | @mkdir -p $(OPS_DIR) 47 | @mkdir -p $(NN_DIR) 48 | @mkdir -p $(NN_DIR)/activation 49 | 50 | # Compile utils 51 | $(UTILS_AIR): $(UTILS) $(HEADERS) 52 | $(METAL) $(METAL_FLAGS) -c $< -o $@ 53 | 54 | # Compile OPS metal files individually 55 | $(OPS_DIR)/%.air: $(KERNEL_DIR)/ops/%.metal $(HEADERS) $(UTILS_AIR) 56 | $(METAL) $(METAL_FLAGS) -c $< -o $@ 57 | 58 | # Compile NN metal files individually 59 | $(NN_DIR)/%.air: $(KERNEL_DIR)/nn/%.metal $(HEADERS) $(UTILS_AIR) 60 | $(METAL) $(METAL_FLAGS) -c $< -o $@ 61 | 62 | $(NN_DIR)/activation/%.air: $(KERNEL_DIR)/nn/activation/%.metal $(HEADERS) $(UTILS_AIR) 63 | $(METAL) $(METAL_FLAGS) -c $< -o $@ 64 | 65 | # Create OPS metallib from all .air files 66 | $(OPS_LIB): $(UTILS_AIR) $(OPS_AIR_FILES) 67 | $(METALLIB) $^ -o $@ 68 | 69 | # Create NN metallib from all .air files 70 | $(NN_LIB): $(UTILS_AIR) $(NN_AIR_FILES) $(NN_ACTIVATION_AIR_FILES) 71 | $(METALLIB) $^ -o $@ 72 | 73 | # Cleanup 74 | clean: 75 | rm -rf $(OUTPUT_DIR) 76 | 77 | # Debugging targets 78 | debug-paths: 79 | @echo "OPS sources: $(OPS_SOURCES)" 80 | @echo "OPS AIR files: $(OPS_AIR_FILES)" 81 | @echo "NN sources: $(NN_SOURCES)" 82 | @echo "NN AIR files: $(NN_AIR_FILES)" 83 | @echo "NN activation AIR files: $(NN_ACTIVATION_AIR_FILES)" 84 | 85 | # Dependency declarations 86 | .PHONY: all prepare clean debug-paths 87 | -------------------------------------------------------------------------------- /crates/maidenx_nn/src/losses/crossentropy.rs: -------------------------------------------------------------------------------- 1 | use crate::layer::{Layer, LayerState}; 2 | use maidenx_core::{error::Result, 
scalar::Scalar};
3 | use maidenx_tensor::Tensor;
4 | #[cfg(feature = "serde")]
5 | use serde::{Deserialize, Serialize};
6 | 
7 | #[derive(Layer, Clone, Default)]
8 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
9 | #[layer(inputs = 2)]
10 | pub struct CrossEntropyLoss {
11 | state: LayerState,
12 | }
13 | 
14 | impl CrossEntropyLoss {
15 | pub fn new() -> Self {
16 | Self {
17 | state: LayerState::new(),
18 | }
19 | }
20 | 
21 | pub fn forward(&self, (logits, targets): (&Tensor, &Tensor)) -> Result<Tensor> {
22 | let batch_size = Scalar::new(logits.shape()[0]);
23 | 
24 | let softmax = crate::Softmax::new(1)?;
25 | let probs = softmax.forward(&logits)?;
26 | 
27 | let epsilon = Scalar::new(1e-10);
28 | let probs_safe = probs.add_scalar(epsilon)?;
29 | let log_probs = probs_safe.log()?;
30 | 
31 | let targets_reshaped = if targets.ndim() == 1 {
32 | targets.reshape(&[targets.shape()[0], 1])?
33 | } else {
34 | targets.clone()
35 | };
36 | 
37 | let target_log_probs = log_probs.gather(1, &targets_reshaped)?;
38 | let nll_loss = target_log_probs.neg()?.sum_all()?.div_scalar(batch_size)?;
39 | 
40 | Ok(nll_loss)
41 | }
42 | }
43 | 
44 | #[cfg(test)]
45 | mod tests {
46 | use super::*;
47 | use maidenx_core::device::{set_default_device, Device};
48 | 
49 | fn setup_device() {
50 | #[cfg(feature = "cuda")]
51 | set_default_device(Device::CUDA(0));
52 | #[cfg(not(any(feature = "cuda")))]
53 | set_default_device(Device::CPU);
54 | }
55 | 
56 | #[test]
57 | fn forward() -> Result<()> {
58 | setup_device();
59 | 
60 | let logits = Tensor::new(vec![vec![1.0f32, 2.0, 0.1], vec![0.1f32, 2.0, 3.0]])?;
61 | 
62 | let targets = Tensor::new(vec![0i64, 2])?;
63 | 
64 | let ce_loss = CrossEntropyLoss::new();
65 | let loss = ce_loss.forward((&logits, &targets))?;
66 | 
67 | let expected_loss = 0.884864;
68 | 
69 | assert!((loss.to_flatten_vec::<f32>()?[0] - expected_loss).abs() < 1e-2);
70 | 
71 | Ok(())
72 | }
73 | 
74 | #[test]
75 | fn backward() -> Result<()> {
76 | setup_device();
77 | 
78 | let mut logits = Tensor::new(vec![vec![1.0f32, 2.0, 0.1], vec![0.1f32, 2.0, 3.0]])?;
79 | logits.with_grad()?;
80 | 
81 | let targets = Tensor::new(vec![0i64, 2])?;
82 | 
83 | let ce_loss = CrossEntropyLoss::new();
84 | let loss = ce_loss.forward((&logits, &targets))?;
85 | loss.backward()?;
86 | 
87 | if let Some(grad) = logits.grad()? {
88 | let grad_values = grad.to_flatten_vec::<f32>()?;
89 | let expected_grads = [-0.3788, 0.3295, 0.0493, 0.0193, 0.1293, -0.1486];
90 | 
91 | for (g, e) in grad_values.iter().zip(expected_grads.iter()) {
92 | assert!((g - e).abs() < 1e-2);
93 | }
94 | }
95 | 
96 | Ok(())
97 | }
98 | }
99 | 
--------------------------------------------------------------------------------
/book/src/tensor/README.md:
--------------------------------------------------------------------------------
1 | # Tensor
2 | 
3 | This chapter covers MaidenX's core tensor functionality, which provides multi-dimensional array operations with automatic differentiation (autograd) support.
4 | 
5 | ## Core Tensor Features
6 | 
7 | MaidenX tensors provide:
8 | 
9 | - Multi-dimensional array representation
10 | - Support for various data types (F32, F64, I32, I64, etc.)
11 | - Automatic differentiation for gradient-based optimization
12 | - Device support (CPU, CUDA, MPS)
13 | - Broadcasting for performing operations between tensors of different shapes
14 | - Extensive operation library (arithmetic, reduction, transformation, etc.)
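Putting these pieces together, a minimal autograd round trip looks like this (a sketch assuming the default `F32` dtype):

```rust
let mut x = Tensor::new(vec![1.0f32, 2.0, 3.0])?;
x.with_grad()?; // enable gradient tracking

let y = x.mul(&x)?;      // elementwise square
let loss = y.sum_all()?; // reduce to a scalar
loss.backward()?;        // d(loss)/dx = 2x

if let Some(grad) = x.grad()? {
    assert_eq!(grad.to_flatten_vec::<f32>()?, vec![2.0, 4.0, 6.0]);
}
```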
15 | 16 | ## Tensor Structure 17 | 18 | The `Tensor` struct is the primary data structure for representing multi-dimensional arrays: 19 | 20 | ```rust 21 | pub struct Tensor { 22 | data: TensorData, // Holds buffer and gradient information 23 | metadata: TensorMetadata, // Holds device, dtype, layout, requires_grad 24 | node: Option, // Stores computational graph information for autograd 25 | } 26 | ``` 27 | 28 | ## Display and Debug Output 29 | 30 | MaidenX tensors implement both the `Display` and `Debug` traits for convenient printing: 31 | 32 | ### Display Format 33 | 34 | The Display format shows just the tensor's data in a nested array format: 35 | 36 | ```rust 37 | let a = Tensor::new(vec![1.0, 2.0, 3.0])?; 38 | println!("{}", a); // Outputs: [1.00000000, 2.00000000, 3.00000000] 39 | 40 | let b = Tensor::new(vec![1.0, 2.0, 3.0, 4.0])?.reshape(&[2, 2])?; 41 | println!("{}", b); // Outputs: [[1.00000000, 2.00000000], [3.00000000, 4.00000000]] 42 | ``` 43 | 44 | ### Debug Format 45 | 46 | The Debug format provides comprehensive information about the tensor, including shape, device, data type, data values, and gradient information: 47 | 48 | ```rust 49 | let mut a = Tensor::new(vec![1.0, 2.0, 3.0])?; 50 | a.with_grad()?; 51 | println!("{:?}", a); 52 | // Outputs something like: 53 | // Tensor(shape=[3], device=cpu, dtype=f32, data=[1.00000000, 2.00000000, 3.00000000], requires_grad=true, grad=[0.00000000, 0.00000000, 0.00000000]) 54 | ``` 55 | 56 | ## Serialization and Deserialization 57 | 58 | MaidenX supports tensor serialization and deserialization through Serde (when the "serde" feature is enabled): 59 | 60 | ```rust 61 | // Binary serialization 62 | let tensor = Tensor::new(vec![1.0, 2.0, 3.0])?; 63 | let bytes = tensor.to_bytes()?; 64 | let tensor_from_bytes = Tensor::from_bytes(&bytes)?; 65 | 66 | // JSON serialization 67 | let json = tensor.to_json()?; 68 | let tensor_from_json = Tensor::from_json(&json)?; 69 | ``` 70 | 71 | The serialization preserves: 72 | - Tensor data 73 | - Shape and layout information 74 | - Device information 75 | - Data type 76 | - Requires grad flag (but not gradient values) 77 | 78 | However, computational graph information (the `node` field) is not serialized, so autograd history is not preserved. 79 | 80 | ## Getting Started 81 | 82 | For detailed guides on tensor operations, see the following sections: 83 | - [Tensor Creation](./creation.md): Ways to create and initialize tensors 84 | - [Tensor Operations](./operation.md): Overview of tensor operations 85 | - [Tensor Utilities](./utils.md): Utility functions for tensor manipulation 86 | -------------------------------------------------------------------------------- /crates/maidenx_cuda/src/ops/reduction.rs: -------------------------------------------------------------------------------- 1 | use half::{bf16, f16}; 2 | 3 | #[link(name = "ops")] 4 | extern "C" {} 5 | 6 | #[macro_export] 7 | macro_rules! declare_extern_reduction_ops { 8 | ($( 9 | $dtype:ident => { 10 | type: $ty:ty, 11 | standard_ops: [$($std_op:ident),*], 12 | shape_ops: [$($shape_op:ident),*] 13 | } 14 | ),*) => { 15 | paste::paste! 
{ 16 | extern "C" { 17 | $( 18 | $( 19 | pub fn []( 20 | num_els: usize, 21 | num_dims: usize, 22 | num_red_dims: usize, 23 | metadata: *const usize, 24 | inp: *const $ty, 25 | out: *mut $ty, 26 | ); 27 | )* 28 | $( 29 | pub fn []( 30 | num_els: usize, 31 | num_dims: usize, 32 | metadata: *const usize, 33 | inp: *const $ty, 34 | out: *mut $ty, 35 | ); 36 | )* 37 | )* 38 | } 39 | } 40 | } 41 | } 42 | 43 | declare_extern_reduction_ops! { 44 | BF16 => { 45 | type: bf16, 46 | standard_ops: [sum, mean, max, min], 47 | shape_ops: [sum_to_shape, fold] 48 | }, 49 | F16 => { 50 | type: f16, 51 | standard_ops: [sum, mean, max, min], 52 | shape_ops: [sum_to_shape, fold] 53 | }, 54 | F32 => { 55 | type: f32, 56 | standard_ops: [sum, mean, max, min], 57 | shape_ops: [sum_to_shape, fold] 58 | }, 59 | F64 => { 60 | type: f64, 61 | standard_ops: [sum, mean, max, min], 62 | shape_ops: [sum_to_shape, fold] 63 | }, 64 | U8 => { 65 | type: u8, 66 | standard_ops: [sum, max, min], 67 | shape_ops: [sum_to_shape, fold] 68 | }, 69 | U16 => { 70 | type: u16, 71 | standard_ops: [sum, max, min], 72 | shape_ops: [sum_to_shape, fold] 73 | }, 74 | U32 => { 75 | type: u32, 76 | standard_ops: [sum, max, min], 77 | shape_ops: [sum_to_shape, fold] 78 | }, 79 | U64 => { 80 | type: u64, 81 | standard_ops: [sum, max, min], 82 | shape_ops: [sum_to_shape, fold] 83 | }, 84 | I8 => { 85 | type: i8, 86 | standard_ops: [sum, max, min], 87 | shape_ops: [sum_to_shape, fold] 88 | }, 89 | I16 => { 90 | type: i16, 91 | standard_ops: [sum, max, min], 92 | shape_ops: [sum_to_shape, fold] 93 | }, 94 | I32 => { 95 | type: i32, 96 | standard_ops: [sum, max, min], 97 | shape_ops: [sum_to_shape, fold] 98 | }, 99 | I64 => { 100 | type: i64, 101 | standard_ops: [sum, max, min], 102 | shape_ops: [sum_to_shape, fold] 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /crates/maidenx_cuda/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.15) 2 | project(maidenx_be_cuda LANGUAGES CUDA C CXX) 3 | 4 | # Set C++17 as the minimum required standard for all C++ code 5 | set(CMAKE_CXX_STANDARD 17) 6 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 7 | 8 | # CUDA settings 9 | set(CMAKE_CUDA_STANDARD 17) 10 | set(CMAKE_CUDA_STANDARD_REQUIRED ON) 11 | set(CMAKE_POSITION_INDEPENDENT_CODE ON) 12 | set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) 13 | set(CMAKE_CUDA_ARCHITECTURES 70 75 80 86 89) 14 | set(CMAKE_EXPORT_COMPILE_COMMANDS ON) 15 | 16 | # Feature flag for NN (default OFF) 17 | option(BUILD_NN "Build the neural network library" OFF) 18 | 19 | # Optimization flags 20 | set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3") 21 | 22 | # Add kernels and headers directories to include path 23 | include_directories( 24 | ${CMAKE_CURRENT_SOURCE_DIR} 25 | ${CMAKE_CURRENT_SOURCE_DIR}/kernels 26 | ) 27 | 28 | # Modified function to add a CUDA library with improved header handling 29 | function(add_cuda_library LIB_NAME SOURCES) 30 | # Find .cu source files and their corresponding .cuh files 31 | set(ALL_FILES "") 32 | foreach(SOURCE ${SOURCES}) 33 | # Add the source file 34 | list(APPEND ALL_FILES ${SOURCE}) 35 | 36 | # Get the base name without extension 37 | get_filename_component(SOURCE_NAME ${SOURCE} NAME_WE) 38 | get_filename_component(SOURCE_DIR ${SOURCE} DIRECTORY) 39 | 40 | # Look for corresponding .cuh files in the source directory 41 | file(GLOB CUH_FILES 42 | ${SOURCE_DIR}/${SOURCE_NAME}.cuh 43 | ${SOURCE_DIR}/*.cuh 44 | ) 45 | 46 | # Look for 
corresponding .cuh files in the _headers directory
47 | file(GLOB HEADER_CUH_FILES
48 | ${CMAKE_CURRENT_SOURCE_DIR}/kernels/_headers/${SOURCE_NAME}.cuh
49 | ${CMAKE_CURRENT_SOURCE_DIR}/kernels/_headers/*.cuh
50 | )
51 | 
52 | list(APPEND ALL_FILES ${CUH_FILES} ${HEADER_CUH_FILES})
53 | endforeach()
54 | 
55 | # Remove duplicates
56 | list(REMOVE_DUPLICATES ALL_FILES)
57 | 
58 | add_library(${LIB_NAME} STATIC ${ALL_FILES})
59 | set_target_properties(${LIB_NAME} PROPERTIES
60 | CUDA_SEPARABLE_COMPILATION ON
61 | CUDA_RESOLVE_DEVICE_SYMBOLS ON
62 | POSITION_INDEPENDENT_CODE ON
63 | LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib
64 | ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib
65 | )
66 | target_link_libraries(${LIB_NAME}
67 | PRIVATE
68 | ${CUDA_LIBRARIES}
69 | ${CUDA_CUDA_LIBRARY}
70 | ${CUDA_CUDART_LIBRARY}
71 | )
72 | endfunction()
73 | 
74 | # Add "ops" library
75 | file(GLOB OPS_SOURCES
76 | kernels/ops/*.cu
77 | )
78 | add_cuda_library(ops "${OPS_SOURCES}")
79 | set_target_properties(ops PROPERTIES
80 | CUDA_SEPARABLE_COMPILATION OFF
81 | )
82 | 
83 | # Add "nn" library
84 | if(BUILD_NN)
85 | message(STATUS "Building with neural network support")
86 | file(GLOB_RECURSE NN_SOURCES
87 | kernels/nn/*.cu
88 | )
89 | add_cuda_library(nn "${NN_SOURCES}")
90 | set_target_properties(nn PROPERTIES
91 | CUDA_SEPARABLE_COMPILATION OFF
92 | )
93 | else()
94 | message(STATUS "Neural network support disabled")
95 | endif()
96 | 
--------------------------------------------------------------------------------
/crates/maidenx_core/src/device.rs:
--------------------------------------------------------------------------------
1 | #[cfg(feature = "serde")]
2 | use serde::{Deserialize, Deserializer, Serialize, Serializer};
3 | 
4 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
5 | pub enum Device {
6 | CPU,
7 | #[cfg(feature = "cuda")]
8 | CUDA(usize),
9 | #[cfg(feature = "mps")]
10 | MPS,
11 | }
12 | 
13 | #[cfg(feature = "serde")]
14 | impl Serialize for Device {
15 | fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
16 | where
17 | S: Serializer,
18 | {
19 | serializer.serialize_str(&self.name())
20 | }
21 | }
22 | 
23 | #[cfg(feature = "serde")]
24 | impl<'de> Deserialize<'de> for Device {
25 | fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
26 | where
27 | D: Deserializer<'de>,
28 | {
29 | struct DeviceVisitor;
30 | 
31 | impl serde::de::Visitor<'_> for DeviceVisitor {
32 | type Value = Device;
33 | 
34 | fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
35 | formatter.write_str("a string representing a Device (CPU, CUDA|id, or MPS)")
36 | }
37 | 
38 | fn visit_str<E>(self, value: &str) -> Result<Self::Value, E>
39 | where
40 | E: serde::de::Error,
41 | {
42 | if value == "CPU" {
43 | return Ok(Device::CPU);
44 | }
45 | 
46 | #[cfg(feature = "cuda")]
47 | if let Some(cuda_str) = value.strip_prefix("CUDA|") {
48 | if let Ok(id) = cuda_str.parse::<usize>() {
49 | return Ok(Device::CUDA(id));
50 | } else {
51 | return Err(E::custom(format!("invalid CUDA device ID: {}", cuda_str)));
52 | }
53 | }
54 | 
55 | #[cfg(feature = "mps")]
56 | if value == "MPS" {
57 | return Ok(Device::MPS);
58 | }
59 | 
60 | Err(E::custom(format!("unknown device: {}", value)))
61 | }
62 | }
63 | 
64 | deserializer.deserialize_str(DeviceVisitor)
65 | }
66 | }
67 | 
68 | impl Device {
69 | pub fn name(&self) -> String {
70 | match self {
71 | Device::CPU => "CPU".to_string(),
72 | #[cfg(feature = "cuda")]
73 | Device::CUDA(id) => format!("CUDA|{}", id),
74 | #[cfg(feature = "mps")]
75 | Device::MPS => "MPS".to_string(),
76 | }
77 | }
78 | }
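// A small sanity check for the (de)serialization above; a sketch that
// assumes `serde_json` is available as a dev-dependency of this crate.
#[cfg(all(test, feature = "serde"))]
mod serde_tests {
    use super::*;

    #[test]
    fn device_round_trips_through_its_name() {
        // Serialize goes through Device::name(), so CPU becomes the JSON string "CPU".
        let json = serde_json::to_string(&Device::CPU).unwrap();
        assert_eq!(json, "\"CPU\"");
        // Deserialize parses the same string form back into the enum.
        let back: Device = serde_json::from_str(&json).unwrap();
        assert_eq!(back, Device::CPU);
    }
}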
79 | 80 | thread_local! { 81 | static DEFAULT_DEVICE: std::cell::Cell = const { std::cell::Cell::new(Device::CPU) }; 82 | } 83 | 84 | pub fn get_default_device() -> Device { 85 | DEFAULT_DEVICE.with(|d| d.get()) 86 | } 87 | 88 | pub fn set_default_device(device: Device) { 89 | DEFAULT_DEVICE.with(|d| d.set(device)); 90 | } 91 | 92 | pub fn auto_set_device() { 93 | #[cfg(feature = "cuda")] 94 | set_default_device(Device::CUDA(0)); 95 | #[cfg(feature = "mps")] 96 | set_default_device(Device::MPS); 97 | #[cfg(not(any(feature = "cuda", feature = "mps")))] 98 | set_default_device(Device::CPU); 99 | } 100 | -------------------------------------------------------------------------------- /crates/maidenx_tensor/src/operators.rs: -------------------------------------------------------------------------------- 1 | use crate::Tensor; 2 | use std::ops::{Add, Div, Mul, Sub}; 3 | 4 | impl Add<&Tensor> for &Tensor { 5 | type Output = Tensor; 6 | 7 | fn add(self, rhs: &Tensor) -> Self::Output { 8 | Tensor::add(self, rhs).unwrap() 9 | } 10 | } 11 | 12 | impl Add for &Tensor { 13 | type Output = Tensor; 14 | 15 | fn add(self, rhs: Tensor) -> Self::Output { 16 | Tensor::add(self, &rhs).unwrap() 17 | } 18 | } 19 | 20 | impl Add<&Tensor> for Tensor { 21 | type Output = Tensor; 22 | 23 | fn add(self, rhs: &Tensor) -> Self::Output { 24 | Tensor::add(&self, rhs).unwrap() 25 | } 26 | } 27 | 28 | impl Add for Tensor { 29 | type Output = Tensor; 30 | 31 | fn add(self, rhs: Tensor) -> Self::Output { 32 | Tensor::add(&self, &rhs).unwrap() 33 | } 34 | } 35 | 36 | impl Sub<&Tensor> for &Tensor { 37 | type Output = Tensor; 38 | 39 | fn sub(self, rhs: &Tensor) -> Self::Output { 40 | Tensor::sub(self, rhs).unwrap() 41 | } 42 | } 43 | 44 | impl Sub for &Tensor { 45 | type Output = Tensor; 46 | 47 | fn sub(self, rhs: Tensor) -> Self::Output { 48 | Tensor::sub(self, &rhs).unwrap() 49 | } 50 | } 51 | 52 | impl Sub<&Tensor> for Tensor { 53 | type Output = Tensor; 54 | 55 | fn sub(self, rhs: &Tensor) -> Self::Output { 56 | Tensor::sub(&self, rhs).unwrap() 57 | } 58 | } 59 | 60 | impl Sub for Tensor { 61 | type Output = Tensor; 62 | 63 | fn sub(self, rhs: Tensor) -> Self::Output { 64 | Tensor::sub(&self, &rhs).unwrap() 65 | } 66 | } 67 | 68 | impl Mul<&Tensor> for &Tensor { 69 | type Output = Tensor; 70 | 71 | fn mul(self, rhs: &Tensor) -> Self::Output { 72 | Tensor::mul(self, rhs).unwrap() 73 | } 74 | } 75 | 76 | impl Mul for &Tensor { 77 | type Output = Tensor; 78 | 79 | fn mul(self, rhs: Tensor) -> Self::Output { 80 | Tensor::mul(self, &rhs).unwrap() 81 | } 82 | } 83 | 84 | impl Mul<&Tensor> for Tensor { 85 | type Output = Tensor; 86 | 87 | fn mul(self, rhs: &Tensor) -> Self::Output { 88 | Tensor::mul(&self, rhs).unwrap() 89 | } 90 | } 91 | 92 | impl Mul for Tensor { 93 | type Output = Tensor; 94 | 95 | fn mul(self, rhs: Tensor) -> Self::Output { 96 | Tensor::mul(&self, &rhs).unwrap() 97 | } 98 | } 99 | 100 | impl Div<&Tensor> for &Tensor { 101 | type Output = Tensor; 102 | 103 | fn div(self, rhs: &Tensor) -> Self::Output { 104 | Tensor::div(self, rhs).unwrap() 105 | } 106 | } 107 | 108 | impl Div for &Tensor { 109 | type Output = Tensor; 110 | 111 | fn div(self, rhs: Tensor) -> Self::Output { 112 | Tensor::div(self, &rhs).unwrap() 113 | } 114 | } 115 | 116 | impl Div<&Tensor> for Tensor { 117 | type Output = Tensor; 118 | 119 | fn div(self, rhs: &Tensor) -> Self::Output { 120 | Tensor::div(&self, rhs).unwrap() 121 | } 122 | } 123 | 124 | impl Div for Tensor { 125 | type Output = Tensor; 126 | 127 | fn div(self, rhs: Tensor) 
-> Self::Output { 128 | Tensor::div(&self, &rhs).unwrap() 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /crates/maidenx_cuda/src/ops/padding.rs: -------------------------------------------------------------------------------- 1 | use half::{bf16, f16}; 2 | 3 | #[link(name = "ops")] 4 | extern "C" {} 5 | 6 | #[macro_export] 7 | macro_rules! declare_padding_ops { 8 | ($($dtype:ident: $ty:ty),*) => { 9 | paste::paste! { 10 | extern "C" { 11 | $( 12 | // Constant padding (forward) 13 | pub fn []( 14 | num_els_in: usize, 15 | num_els_out: usize, 16 | num_dims: usize, 17 | info: *const usize, 18 | inp: *const $ty, 19 | out: *mut $ty, 20 | pad_value: $ty 21 | ); 22 | 23 | // Reflection padding (forward) 24 | pub fn []( 25 | num_els_in: usize, 26 | num_els_out: usize, 27 | num_dims: usize, 28 | info: *const usize, 29 | inp: *const $ty, 30 | out: *mut $ty 31 | ); 32 | 33 | // Replication padding (forward) 34 | pub fn []( 35 | num_els_in: usize, 36 | num_els_out: usize, 37 | num_dims: usize, 38 | info: *const usize, 39 | inp: *const $ty, 40 | out: *mut $ty 41 | ); 42 | 43 | // Constant padding (backward) 44 | pub fn []( 45 | num_els_in: usize, 46 | num_els_out: usize, 47 | num_dims: usize, 48 | info: *const usize, 49 | grad_out: *const $ty, 50 | grad_in: *mut $ty 51 | ); 52 | 53 | // Reflection padding (backward) 54 | pub fn []( 55 | num_els_in: usize, 56 | num_els_out: usize, 57 | num_dims: usize, 58 | info: *const usize, 59 | grad_out: *const $ty, 60 | grad_in: *mut $ty 61 | ); 62 | 63 | // Replication padding (backward) 64 | pub fn []( 65 | num_els_in: usize, 66 | num_els_out: usize, 67 | num_dims: usize, 68 | info: *const usize, 69 | grad_out: *const $ty, 70 | grad_in: *mut $ty 71 | ); 72 | )* 73 | } 74 | } 75 | } 76 | } 77 | 78 | declare_padding_ops! { 79 | BF16: bf16, 80 | F16: f16, 81 | F32: f32, 82 | F64: f64, 83 | U8: u8, 84 | U16: u16, 85 | U32: u32, 86 | U64: u64, 87 | I8: i8, 88 | I16: i16, 89 | I32: i32, 90 | I64: i64 91 | } 92 | -------------------------------------------------------------------------------- /crates/maidenx_macro_utils/src/manifest.rs: -------------------------------------------------------------------------------- 1 | extern crate proc_macro; 2 | 3 | use proc_macro::TokenStream; 4 | use std::{env, path::PathBuf}; 5 | use syn::parse_str; 6 | use toml_edit::{DocumentMut, Item}; 7 | 8 | pub struct MaidenXManifest { 9 | manifest: DocumentMut, 10 | } 11 | 12 | impl Default for MaidenXManifest { 13 | fn default() -> Self { 14 | Self { 15 | manifest: env::var_os("CARGO_MANIFEST_DIR") 16 | .map(PathBuf::from) 17 | .map(|mut path| { 18 | path.push("Cargo.toml"); 19 | if !path.exists() { 20 | panic!("No Cargo manifest found for crate. 
Expected: {}", path.display());
21 | }
22 | let manifest = std::fs::read_to_string(path.clone())
23 | .unwrap_or_else(|_| panic!("Unable to read cargo manifest: {}", path.display()));
24 | manifest
25 | .parse::<DocumentMut>()
26 | .unwrap_or_else(|_| panic!("Failed to parse cargo manifest: {}", path.display()))
27 | })
28 | .expect("CARGO_MANIFEST_DIR is not defined."),
29 | }
30 | }
31 | }
32 | 
33 | const MAIDENX: &str = "maidenx";
34 | const MAIDENX_INTERNAL: &str = "maidenx_internal";
35 | 
36 | impl MaidenXManifest {
37 | pub fn maybe_get_path(&self, name: &str) -> Option<syn::Path> {
38 | if name == env::var("CARGO_PKG_NAME").expect("CARGO_PKG_NAME is not defined.") {
39 | return Some(parse_str("crate").unwrap());
40 | }
41 | 
42 | fn dep_package(dep: &Item) -> Option<&str> {
43 | if dep.as_str().is_some() {
44 | None
45 | } else {
46 | dep.get("package").map(|name| name.as_str().unwrap())
47 | }
48 | }
49 | 
50 | let find_in_deps = |deps: &Item| -> Option<syn::Path> {
51 | let package = if let Some(dep) = deps.get(name) {
52 | return Some(Self::parse_str(dep_package(dep).unwrap_or(name)));
53 | } else if let Some(dep) = deps.get(MAIDENX) {
54 | dep_package(dep).unwrap_or(MAIDENX)
55 | } else if let Some(dep) = deps.get(MAIDENX_INTERNAL) {
56 | dep_package(dep).unwrap_or(MAIDENX_INTERNAL)
57 | } else {
58 | return None;
59 | };
60 | 
61 | let mut path = Self::parse_str::<syn::Path>(package);
62 | if let Some(module) = name.strip_prefix("maidenx_") {
63 | path.segments.push(Self::parse_str(module));
64 | }
65 | Some(path)
66 | };
67 | 
68 | let deps = self.manifest.get("dependencies");
69 | let deps_dev = self.manifest.get("dev-dependencies");
70 | 
71 | deps.and_then(find_in_deps).or_else(|| deps_dev.and_then(find_in_deps))
72 | }
73 | 
74 | pub fn get_path(&self, name: &str) -> syn::Path {
75 | let sanitized_name = name.replace('-', "_");
76 | 
77 | self.maybe_get_path(&sanitized_name)
78 | .unwrap_or_else(|| Self::parse_str(&sanitized_name))
79 | }
80 | 
81 | pub fn try_parse_str<T: syn::parse::Parse>(path: &str) -> Option<T> {
82 | syn::parse(path.parse::<TokenStream>().ok()?).ok()
83 | }
84 | 
85 | pub fn parse_str<T: syn::parse::Parse>(path: &str) -> T {
86 | Self::try_parse_str(path).unwrap()
87 | }
88 | }
89 | 
--------------------------------------------------------------------------------
/crates/maidenx_nn/src/losses/huber.rs:
--------------------------------------------------------------------------------
1 | use crate::layer::{Layer, LayerState};
2 | use maidenx_core::{error::Result, scalar::Scalar};
3 | use maidenx_tensor::Tensor;
4 | #[cfg(feature = "serde")]
5 | use serde::{Deserialize, Serialize};
6 | 
7 | #[derive(Layer, Clone)]
8 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
9 | #[layer(inputs = 2)]
10 | pub struct HuberLoss {
11 | delta: Scalar,
12 | 
13 | state: LayerState,
14 | }
15 | 
16 | impl HuberLoss {
17 | pub fn new(delta: impl Into<Scalar>) -> Self {
18 | Self {
19 | delta: delta.into(),
20 | state: LayerState::new(),
21 | }
22 | }
23 | 
24 | pub fn forward(&self, (pred, target): (&Tensor, &Tensor)) -> Result<Tensor> {
25 | let diff = pred.sub(target)?;
26 | let abs_diff = diff.abs()?;
27 | 
28 | // Compute quadratic terms for small differences
29 | let quadratic_loss = diff.pow(2.0)?.div_scalar(2.0)?;
30 | 
31 | // Compute linear terms for large differences
32 | let linear_loss = abs_diff
33 | .mul_scalar(self.delta)?
34 | .sub_scalar(self.delta.powi(2) / Scalar::new(2))?;
35 | 
36 | // Create mask for selecting between quadratic and linear terms
37 | let mask = abs_diff.le_scalar(self.delta)?.to_dtype(abs_diff.dtype())?;
38 | 
39 | // Combine losses using mask
40 | let loss = mask
41 | .mul(&quadratic_loss)?
42 | .add(&mask.logical_not()?.to_dtype(mask.dtype())?.mul(&linear_loss)?)?;
43 | 
44 | // Compute mean loss
45 | let batch_size = Scalar::new(pred.shape()[0]);
46 | let mean_loss = loss.sum_all()?.div_scalar(batch_size)?;
47 | 
48 | Ok(mean_loss)
49 | }
50 | }
51 | 
52 | #[cfg(test)]
53 | mod tests {
54 | use super::*;
55 | use maidenx_core::device::{set_default_device, Device};
56 | 
57 | fn setup_device() {
58 | #[cfg(feature = "cuda")]
59 | set_default_device(Device::CUDA(0));
60 | #[cfg(not(any(feature = "cuda")))]
61 | set_default_device(Device::CPU);
62 | }
63 | 
64 | #[test]
65 | fn forward() -> Result<()> {
66 | setup_device();
67 | 
68 | let pred = Tensor::new(vec![2.0f32, 3.0, 4.0])?;
69 | let target = Tensor::new(vec![1.0f32, 2.0, 3.0])?;
70 | let huber_loss = HuberLoss::new(1.0);
71 | let loss = huber_loss.forward((&pred, &target))?;
72 | 
73 | let expected_loss = (0.5 + 0.5 + 0.5) / 3.0;
74 | assert!((loss.to_flatten_vec::<f32>()?[0] - expected_loss).abs() < 1e-6);
75 | 
76 | Ok(())
77 | }
78 | 
79 | #[test]
80 | fn backward() -> Result<()> {
81 | setup_device();
82 | 
83 | let mut pred = Tensor::new(vec![2.0f32, 3.0, 4.0])?;
84 | pred.with_grad()?;
85 | let target = Tensor::new(vec![1.0f32, 2.0, 3.0])?;
86 | let huber_loss = HuberLoss::new(1.0);
87 | let loss = huber_loss.forward((&pred, &target))?;
88 | loss.backward()?;
89 | 
90 | if let Some(grad) = pred.grad()? {
91 | let pred_grad = grad.to_flatten_vec::<f32>()?;
92 | let expected_grad = [1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0];
93 | for (g, e) in pred_grad.iter().zip(expected_grad.iter()) {
94 | assert!((g - e).abs() < 1e-6);
95 | }
96 | }
97 | 
98 | Ok(())
99 | }
100 | }
101 | 
--------------------------------------------------------------------------------
/crates/maidenx_tensor/src/utils/promotion.rs:
--------------------------------------------------------------------------------
1 | use crate::Tensor;
2 | #[cfg(feature = "mps")]
3 | use maidenx_core::device::Device;
4 | use maidenx_core::{dtype::DType, error::Result, scalar::Scalar};
5 | 
6 | pub fn get_signed_dtype(dtype: DType) -> DType {
7 | match dtype {
8 | DType::U8 => DType::I32,
9 | DType::U16 => DType::I32,
10 | DType::U32 => DType::I64,
11 | DType::U64 => DType::I64,
12 | _ => dtype,
13 | }
14 | }
15 | 
16 | pub fn get_promoted_dtype(dtype1: DType, dtype2: DType) -> DType {
17 | let mut dtype1 = dtype1;
18 | let mut dtype2 = dtype2;
19 | 
20 | if dtype1 == DType::BOOL {
21 | dtype1 = DType::U8;
22 | }
23 | if dtype2 == DType::BOOL {
24 | dtype2 = DType::U8;
25 | }
26 | 
27 | match (dtype1, dtype2) {
28 | (dtype1, dtype2) if dtype1 == dtype2 => dtype1,
29 | 
30 | (_, DType::F64) | (DType::F64, _) => DType::F64,
31 | (_, DType::F32) | (DType::F32, _) => DType::F32,
32 | (DType::BF16, DType::F16) | (DType::F16, DType::BF16) => DType::F32,
33 | (_, DType::F16) | (DType::F16, _) => DType::F16,
34 | (_, DType::BF16) | (DType::BF16, _) => DType::BF16,
35 | 
36 | (_, DType::I64) | (DType::I64, _) => DType::I64,
37 | (_, DType::I32) | (DType::I32, _) => DType::I32,
38 | (_, DType::I16) | (DType::I16, _) => DType::I16,
39 | (_, DType::I8) | (DType::I8, _) => DType::I8,
40 | (_, DType::U64) | (DType::U64, _) => DType::I64,
41 | (_, DType::U32) | (DType::U32, _) => DType::I64,
42 | (_, DType::U16) | (DType::U16,
_) => DType::I32,
43 | (_, DType::U8) | (DType::U8, _) => DType::I32,
44 | 
45 | _ => dtype1,
46 | }
47 | }
48 | 
49 | // pub fn get_promoted_dtype_with_scalar(tensor_dtype: DType, scalar_dtype: DType) -> DType {
50 | //     let tensor_dtype = if tensor_dtype == DType::BOOL { DType::U8 } else { tensor_dtype };
51 | //
52 | //     let scalar_dtype = if scalar_dtype == DType::BOOL { DType::U8 } else { scalar_dtype };
53 | //
54 | //     match (tensor_dtype, scalar_dtype) {
55 | //         (tensor_dtype, scalar_dtype) if tensor_dtype == scalar_dtype => tensor_dtype,
56 | //
57 | //         (t_dtype, s_dtype) if t_dtype.is_int() && s_dtype.is_float() => s_dtype,
58 | //         _ => tensor_dtype,
59 | //     }
60 | // }
61 | 
62 | pub fn promote_tensor(src: &Tensor, target_dtype: DType) -> Result<Tensor> {
63 | let src = if src.dtype() != target_dtype {
64 | let mut src = src.clone();
65 | src.with_dtype(target_dtype)?;
66 | 
67 | src
68 | } else {
69 | src.clone()
70 | };
71 | 
72 | Ok(src)
73 | }
74 | 
75 | pub fn promote_scalar_for_tensor(src: Scalar, target_dtype: DType, with_tensor: &Tensor) -> Result<Scalar> {
76 | let src = if src.dtype() != target_dtype {
77 | let src = src.to_dtype(target_dtype);
78 | 
79 | src
80 | } else {
81 | src.clone()
82 | };
83 | 
84 | #[cfg(feature = "mps")]
85 | let src = if with_tensor.device() == Device::MPS {
86 | match src.dtype() {
87 | DType::U64 => src.to_dtype(DType::U32),
88 | DType::I64 => src.to_dtype(DType::I32),
89 | DType::F64 => src.to_dtype(DType::F32),
90 | _ => src,
91 | }
92 | } else {
93 | src
94 | };
95 | 
96 | #[cfg(not(feature = "mps"))]
97 | let _ = with_tensor;
98 | 
99 | Ok(src)
100 | }
101 | 
--------------------------------------------------------------------------------
/crates/maidenx_nn/src/optimizers/adam.rs:
--------------------------------------------------------------------------------
1 | use crate::optimizer::Optimizer;
2 | use maidenx_core::{error::Result, scalar::Scalar};
3 | use maidenx_tensor::Tensor;
4 | 
5 | #[derive(Optimizer)]
6 | pub struct Adam {
7 | learning_rate: Scalar,
8 | beta1: Scalar,
9 | beta2: Scalar,
10 | epsilon: Scalar,
11 | t: usize,
12 | m: Vec<Tensor>,
13 | v: Vec<Tensor>,
14 | }
15 | 
16 | impl Adam {
17 | pub fn new(
18 | learning_rate: impl Into<Scalar>,
19 | beta1: impl Into<Scalar>,
20 | beta2: impl Into<Scalar>,
21 | epsilon: impl Into<Scalar>,
22 | ) -> Self {
23 | Self {
24 | learning_rate: learning_rate.into(),
25 | beta1: beta1.into(),
26 | beta2: beta2.into(),
27 | epsilon: epsilon.into(),
28 | t: 0,
29 | m: Vec::new(),
30 | v: Vec::new(),
31 | }
32 | }
33 | 
34 | pub fn step(&mut self, parameters: &mut [&mut Tensor]) -> Result<()> {
35 | if self.m.is_empty() || self.v.is_empty() {
36 | self.m = parameters
37 | .iter()
38 | .map(|param| Tensor::zeros_like(param))
39 | .collect::<Result<Vec<_>>>()?;
40 | self.v = parameters
41 | .iter()
42 | .map(|param| Tensor::zeros_like(param))
43 | .collect::<Result<Vec<_>>>()?;
44 | }
45 | 
46 | self.t += 1;
47 | 
48 | for ((param, m), v) in parameters.iter_mut().zip(self.m.iter_mut()).zip(self.v.iter_mut()) {
49 | if let Some(grad) = param.grad()?
50 |                 let lr = self.learning_rate;
51 |                 let beta1 = self.beta1;
52 |                 let beta2 = self.beta2;
53 |                 let epsilon = self.epsilon;
54 |                 let one = Scalar::new(1.0);
55 | 
56 |                 let mut beta1_t = beta1;
57 |                 let mut beta2_t = beta2;
58 |                 for _ in 0..self.t {
59 |                     beta1_t = beta1_t * beta1;
60 |                     beta2_t = beta2_t * beta2;
61 |                 }
62 | 
63 |                 let one_minus_beta1_t = one - beta1_t;
64 |                 let one_minus_beta2_t = one - beta2_t;
65 |                 let lr_t = lr * (one_minus_beta2_t.sqrt()) / one_minus_beta1_t;
66 | 
67 |                 let one_minus_beta1 = one - beta1;
68 |                 let one_minus_beta2 = one - beta2;
69 | 
70 |                 // Update biased first moment estimate
71 |                 *m = m.mul_scalar(beta1)?.add(&grad.mul_scalar(one_minus_beta1)?)?;
72 | 
73 |                 // Update biased second raw moment estimate
74 |                 *v = v.mul_scalar(beta2)?.add(&grad.square()?.mul_scalar(one_minus_beta2)?)?;
75 | 
76 |                 // Compute bias-corrected estimates
77 |                 let m_hat = m.div_scalar(one_minus_beta1_t)?;
78 |                 let v_hat = v.div_scalar(one_minus_beta2_t)?;
79 | 
80 |                 // Update parameters
81 |                 let step = m_hat.div(&v_hat.sqrt()?.add_scalar(epsilon)?)?;
82 |                 param.sub_(&step.mul_scalar(lr_t)?)?;
83 |             }
84 |         }
85 |         Ok(())
86 |     }
87 | 
88 |     pub fn zero_grad(&mut self, parameters: &mut [&mut Tensor]) -> Result<()> {
89 |         for param in parameters.iter_mut() {
90 |             param.zero_grad()?;
91 |         }
92 |         Ok(())
93 |     }
94 | 
95 |     pub fn set_learning_rate(&mut self, learning_rate: impl Into<Scalar>) {
96 |         self.learning_rate = learning_rate.into();
97 |     }
98 | }
--------------------------------------------------------------------------------
/crates/maidenx_tensor/src/wt.rs:
--------------------------------------------------------------------------------
1 | use crate::Tensor;
2 | use maidenx_core::{
3 |     buffer::BufferManager,
4 |     device::Device,
5 |     dtype::DType,
6 |     error::{Error, Result},
7 |     layout::Layout,
8 | };
9 | use std::sync::{Arc, Mutex};
10 | 
11 | impl Tensor {
12 |     pub fn with_shape(&mut self, shape: &[usize]) -> Result<()> {
13 |         if self.size() != Layout::compute_size(shape) {
14 |             return Err(Error::InvalidShape {
15 |                 message: format!(
16 |                     "Shape mismatch: expected total size {}, but got {} for shape {:?}",
17 |                     self.size(),
18 |                     Layout::compute_size(shape),
19 |                     shape
20 |                 ),
21 |             });
22 |         }
23 | 
24 |         let offset = self.offset();
25 |         self.metadata.layout = Layout::from_shape(shape);
26 |         self.metadata.layout.set_offset(offset);
27 | 
28 |         Ok(())
29 |     }
30 | 
31 |     pub fn to_shape(&self, shape: &[usize]) -> Result<Tensor> {
32 |         let mut tensor = self.clone();
33 |         tensor.with_shape(shape)?;
34 |         Ok(tensor)
35 |     }
36 | 
37 |     pub fn with_device(&mut self, device: Device) -> Result<()> {
38 |         let cur_device = self.device();
39 |         if cur_device == device {
40 |             return Ok(());
41 |         }
42 | 
43 |         let buffer_len = self.buffer().len();
44 |         let dtype = self.dtype();
45 | 
46 |         let mut buffer = BufferManager::create(buffer_len, device, dtype)?;
47 | 
48 |         {
49 |             let buffer_mut = Arc::get_mut(&mut buffer).ok_or(Error::BufferShared)?;
50 |             buffer_mut.copy_from_with_device(self.buffer(), 0, 0, self.buffer().len())?;
51 |         }
52 | 
53 |         self.data.buffer = buffer;
54 |         self.metadata.device = device;
55 | 
56 |         Ok(())
57 |     }
58 | 
59 |     pub fn to_device(&self, device: Device) -> Result<Tensor> {
60 |         let mut tensor = self.clone();
61 |         tensor.with_device(device)?;
62 |         Ok(tensor)
63 |     }
64 | 
65 |     pub fn with_dtype(&mut self, dtype: DType) -> Result<()> {
66 |         #[cfg(feature = "mps")]
67 |         if self.device() == Device::MPS && dtype.size_in_bytes() == 8 {
68 |             return Err(Error::UnsupportedDType);
69 |         }
70 | 
71 |         let buffer_len = self.buffer().len();
72 |         let device = self.device();
73 | 
74 |         let mut buffer = BufferManager::create(buffer_len, device, dtype)?;
75 | 
76 |         {
77 |             let buffer_mut = Arc::get_mut(&mut buffer).ok_or(Error::BufferShared)?;
78 |             buffer_mut.copy_from_with_dtype_cast(self.buffer(), 0, 0, self.buffer().len())?;
79 |         }
80 | 
81 |         self.data.buffer = buffer;
82 |         self.metadata.dtype = dtype;
83 | 
84 |         Ok(())
85 |     }
86 | 
87 |     pub fn to_dtype(&self, dtype: DType) -> Result<Tensor> {
88 |         let mut tensor = self.clone();
89 |         tensor.with_dtype(dtype)?;
90 |         Ok(tensor)
91 |     }
92 | 
93 |     pub fn with_grad(&mut self) -> Result<()> {
94 |         if !self.dtype().is_float() {
95 |             return Err(Error::UnsupportedDType);
96 |         }
97 | 
98 |         self.metadata.requires_grad = true;
99 |         if self.data.grad.is_none() {
100 |             let grad_storage = Tensor::zeros_like(self)?;
101 |             self.data.grad = Some(Arc::new(Mutex::new(grad_storage)));
102 |         }
103 | 
104 |         Ok(())
105 |     }
106 | }
--------------------------------------------------------------------------------
/crates/maidenx_nn/src/layers/activation.rs:
--------------------------------------------------------------------------------
1 | pub mod softmax;
2 | 
3 | use crate::layer::{Layer, LayerState};
4 | use maidenx_core::{error::Result, scalar::Scalar};
5 | use maidenx_tensor::Tensor;
6 | #[cfg(feature = "serde")]
7 | use serde::{Deserialize, Serialize};
8 | // Re-exports
9 | pub use softmax::*;
10 | 
11 | #[derive(Layer, Clone)]
12 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
13 | pub struct ReLU {
14 |     state: LayerState,
15 | }
16 | 
17 | impl ReLU {
18 |     pub fn new() -> Result<Self> {
19 |         Ok(Self {
20 |             state: LayerState::new(),
21 |         })
22 |     }
23 | 
24 |     pub fn forward(&self, input: &Tensor) -> Result<Tensor> {
25 |         input.relu()
26 |     }
27 | 
28 |     pub fn parameters(&mut self) -> Vec<&mut Tensor> {
29 |         vec![]
30 |     }
31 | }
32 | 
33 | #[derive(Layer, Clone)]
34 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
35 | pub struct Sigmoid {
36 |     state: LayerState,
37 | }
38 | 
39 | impl Sigmoid {
40 |     pub fn new() -> Result<Self> {
41 |         Ok(Self {
42 |             state: LayerState::new(),
43 |         })
44 |     }
45 | 
46 |     pub fn forward(&self, input: &Tensor) -> Result<Tensor> {
47 |         input.sigmoid()
48 |     }
49 | 
50 |     pub fn parameters(&mut self) -> Vec<&mut Tensor> {
51 |         vec![]
52 |     }
53 | }
54 | 
55 | #[derive(Layer, Clone)]
56 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
57 | pub struct Tanh {
58 |     state: LayerState,
59 | }
60 | 
61 | impl Tanh {
62 |     pub fn new() -> Result<Self> {
63 |         Ok(Self {
64 |             state: LayerState::new(),
65 |         })
66 |     }
67 | 
68 |     pub fn forward(&self, input: &Tensor) -> Result<Tensor> {
69 |         input.tanh()
70 |     }
71 | 
72 |     pub fn parameters(&mut self) -> Vec<&mut Tensor> {
73 |         vec![]
74 |     }
75 | }
76 | 
77 | #[derive(Layer, Clone)]
78 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
79 | pub struct LeakyReLU {
80 |     exponent: Scalar,
81 | 
82 |     state: LayerState,
83 | }
84 | 
85 | impl LeakyReLU {
86 |     pub fn new(exponent: impl Into<Scalar>) -> Result<Self> {
87 |         Ok(Self {
88 |             exponent: exponent.into(),
89 |             state: LayerState::new(),
90 |         })
91 |     }
92 | 
93 |     pub fn forward(&self, input: &Tensor) -> Result<Tensor> {
94 |         input.leaky_relu(self.exponent)
95 |     }
96 | 
97 |     pub fn parameters(&mut self) -> Vec<&mut Tensor> {
98 |         vec![]
99 |     }
100 | }
101 | 
102 | #[derive(Layer, Clone)]
103 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
104 | pub struct GELU {
105 |     state: LayerState,
106 | }
107 | 
108 | impl GELU {
109 |     pub fn new() -> Result<Self> {
110 |         Ok(Self {
111 |             state: LayerState::new(),
112 |         })
113 |     }
114 | 
115 |     pub fn forward(&self, input: &Tensor) -> Result<Tensor> {
116 |         input.gelu()
117 |     }
118 | 
119 |     pub fn parameters(&mut self) -> Vec<&mut Tensor> {
120 |         vec![]
121 |     }
122 | }
123 | 
124 | #[derive(Layer, Clone)]
125 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
126 | pub struct ELU {
127 |     exponent: Scalar,
128 | 
129 |     state: LayerState,
130 | }
131 | 
132 | impl ELU {
133 |     pub fn new(exponent: impl Into<Scalar>) -> Result<Self> {
134 |         Ok(Self {
135 |             exponent: exponent.into(),
136 |             state: LayerState::new(),
137 |         })
138 |     }
139 | 
140 |     pub fn forward(&self, input: &Tensor) -> Result<Tensor> {
141 |         input.elu(self.exponent)
142 |     }
143 | 
144 |     pub fn parameters(&mut self) -> Vec<&mut Tensor> {
145 |         vec![]
146 |     }
147 | }
148 | 
--------------------------------------------------------------------------------
/book/src/tensor/operation.md:
--------------------------------------------------------------------------------
1 | # Tensor Operations
2 | 
3 | maidenx provides a comprehensive set of tensor operations for numerical computing and deep learning. This page provides an overview of the major operation categories.
4 | 
5 | ## Operation Categories
6 | 
7 | maidenx tensor operations are organized into the following categories:
8 | 
9 | | Category | Description |
10 | |----------|-------------|
11 | | [Binary Operations](./ops_binary.md) | Operations between two tensors (add, mul, div, etc.) |
12 | | [Unary Operations](./ops_unary.md) | Operations on a single tensor (neg, abs, exp, etc.) |
13 | | [Reduction Operations](./ops_reduction.md) | Operations that reduce tensor dimensions (sum, mean, max, etc.) |
14 | | [Transform Operations](./ops_transform.md) | Operations that transform tensor shape or layout |
15 | | [Padding Operations](./ops_padding.md) | Operations that add padding around tensor boundaries |
16 | | [Indexing Operations](./ops_indexing.md) | Operations for advanced indexing and selection |
17 | 
18 | ## Common Operation Features
19 | 
20 | Most operations in maidenx share these common features:
21 | 
22 | ### Automatic Differentiation (Autograd)
23 | 
24 | Many operations support automatic differentiation, which is crucial for training neural networks. Operations that support autograd will automatically track gradients when the tensor has `requires_grad` enabled.
25 | 
26 | ```rust
27 | let mut a = Tensor::new(vec![1.0, 2.0, 3.0])?; a.with_grad()?;
28 | let b = a.mul_scalar(2.0)?; // 'b' will also have autograd enabled
29 | ```
30 | 
31 | ### Type Promotion
32 | 
33 | When performing operations between tensors of different data types, maidenx automatically promotes types according to standard rules:
34 | 
35 | - If one tensor is floating point and one is integer, the integer tensor is converted to floating point
36 | - When mixing different floating point precisions, the lower precision is promoted to the higher one
37 | 
38 | ```rust
39 | let a = Tensor::new(vec![1, 2, 3])?; // i32 tensor
40 | let b = Tensor::new(vec![1.0, 2.0, 3.0])?; // f32 tensor
41 | let c = a.add(&b)?; // Result will be f32
42 | ```
43 | 
44 | ### Broadcasting
45 | 
46 | Most operations support broadcasting, which allows operations between tensors of different shapes by implicitly expanding the smaller tensor:
47 | 
48 | ```rust
49 | let a = Tensor::new(vec![1.0, 2.0, 3.0])?;
50 | let b = Tensor::new(vec![1.0])?;
51 | let c = a.add(&b)?; // [2.0, 3.0, 4.0]
52 | ```
53 | 
54 | ### Error Handling
55 | 
56 | All operations return a `Result` type, which allows for clear error handling:
57 | 
58 | ```rust
59 | match tensor.add(&other_tensor) {
60 |     Ok(result) => println!("Addition successful"),
61 |     Err(e) => println!("Error: {}", e),
62 | }
63 | ```
64 | 
65 | ## Operation Examples
66 | 
67 | Here are some common operation examples:
68 | 
69 | ```rust
70 | // Create some tensors
71 | let a = Tensor::new(vec![1.0, 2.0, 3.0, 4.0])?.reshape(&[2, 2])?;
72 | let b = Tensor::new(vec![5.0, 6.0, 7.0, 8.0])?.reshape(&[2, 2])?;
73 | 
74 | // Binary operations
75 | let sum = a.add(&b)?; // [[6.0, 8.0], [10.0, 12.0]]
76 | let product = a.mul(&b)?; // [[5.0, 12.0], [21.0, 32.0]]
77 | 
78 | // Unary operations
79 | let neg_a = a.neg()?; // [[-1.0, -2.0], [-3.0, -4.0]]
80 | let exp_a = a.exp()?; // Element-wise exponential
81 | 
82 | // Reduction operations
83 | let sum_a = a.sum(0, false)?; // [4.0, 6.0]
84 | let max_a = a.max_all()?; // 4.0
85 | 
86 | // Transform operations
87 | let reshaped = a.reshape(&[4])?; // [1.0, 2.0, 3.0, 4.0]
88 | let transposed = a.transpose(0, 1)?; // [[1.0, 3.0], [2.0, 4.0]]
89 | 
90 | // Indexing operations
91 | let indices = Tensor::new(vec![0])?.reshape(&[1])?;
92 | let first_row = a.index_select(0, &indices)?; // [[1.0, 2.0]]
93 | ```
94 | 
95 | For detailed documentation on each operation category, please refer to the specific section pages linked above.
--------------------------------------------------------------------------------
/book/src/device.md:
--------------------------------------------------------------------------------
1 | # Device
2 | 
3 | MaidenX supports multiple computing devices to run tensor operations, allowing you to choose the most suitable hardware for your specific use case. This flexibility lets you develop on one platform and deploy on another without changing your code.
4 | 
5 | ## Supported Devices
6 | 
7 | | Device | Description | Availability |
8 | |--------|-------------|--------------|
9 | | **CPU** | Standard CPU execution | Always available |
10 | | **CUDA** | NVIDIA GPU acceleration via CUDA | Available with `cuda` feature flag |
11 | | **MPS** | Apple Silicon GPU acceleration via Metal Performance Shaders | Available with `mps` feature flag |
12 | | **Vulkan** | Cross-platform GPU acceleration | Planned for future release |
13 | 
14 | ## Device Selection
15 | 
16 | You can set the default device for tensor operations using:
17 | 
18 | ```rust
19 | use maidenx::prelude::*;
20 | 
21 | // Set default device to CPU
22 | set_default_device(Device::CPU);
23 | 
24 | // Set default device to first CUDA GPU
25 | #[cfg(feature = "cuda")]
26 | set_default_device(Device::CUDA(0));
27 | 
28 | // Set default device to Apple Silicon GPU
29 | #[cfg(feature = "mps")]
30 | set_default_device(Device::MPS);
31 | ```
32 | 
33 | ## Per-Tensor Device Placement
34 | 
35 | You can also create tensors on specific devices, regardless of the default:
36 | 
37 | ```rust
38 | // Create a tensor on CPU
39 | let cpu_tensor = Tensor::new_with_spec(
40 |     vec![1.0, 2.0, 3.0],
41 |     Device::CPU,
42 |     DType::F32
43 | )?;
44 | 
45 | // Create a tensor on CUDA (if available)
46 | #[cfg(feature = "cuda")]
47 | let cuda_tensor = Tensor::new_with_spec(
48 |     vec![1.0, 2.0, 3.0],
49 |     Device::CUDA(0),
50 |     DType::F32
51 | )?;
52 | ```
53 | 
54 | ## Moving Tensors Between Devices
55 | 
56 | Tensors can be moved between devices using the `to_device` method:
57 | 
58 | ```rust
59 | // Move tensor to CPU
60 | let tensor_on_cpu = tensor.to_device(Device::CPU)?;
61 | 
62 | // Move tensor to CUDA (if available)
63 | #[cfg(feature = "cuda")]
64 | let tensor_on_cuda = tensor.to_device(Device::CUDA(0))?;
65 | ```
66 | 
67 | ## Device-Specific Considerations
68 | 
69 | ### CPU
70 | 
71 | - Available on all platforms
72 | - Good for development and debugging
73 | - Slower for large-scale computations
74 | - No special requirements
75 | 
76 | ### CUDA
77 | 
78 | - Requires NVIDIA GPU and CUDA toolkit
79 | - Best performance for large models and batch sizes
80 | - Enabled with the `cuda` feature flag
81 | - Supports multiple GPU selection via `Device::CUDA(device_id)`
82 | 
83 | ### MPS (Metal Performance Shaders)
84 | 
85 | - Available on Apple Silicon (M1/M2/M3) devices
86 | - Good performance on Apple hardware
87 | - Enabled with the `mps` feature flag
88 | - Does not support 64-bit data types (F64, I64, U64)
89 | 
90 | ### Vulkan (Planned)
91 | 
92 | - Will provide cross-platform GPU acceleration
93 | - Intended to work on various GPUs (NVIDIA, AMD, Intel)
94 | - Not yet implemented
95 | 
96 | ## Example: Multi-Device Code
97 | 
98 | Here's how to write code that can run on any available device:
99 | 
100 | ```rust
101 | use maidenx::prelude::*;
102 | 
103 | fn main() -> Result<()> {
104 |     // Choose the best available device
105 |     auto_set_device();
106 | 
107 |     println!("Using device: {}", get_default_device().name());
108 | 
109 |     // Create a tensor (will use the default device)
110 |     let a = Tensor::new(vec![1.0, 2.0, 3.0])?;
111 |     let b = Tensor::new(vec![4.0, 5.0, 6.0])?;
112 | 
113 |     // Operations run on the tensor's device
114 |     let c = a.add(&b)?;
115 | 
116 |     println!("Result: {}", c);
117 | 
118 |     Ok(())
119 | }
120 | ```
121 | 
122 | This code automatically selects the best available device based on feature flags, with CUDA preferred over MPS, and MPS preferred over CPU.
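
As a sketch only, the preference order could be written as the helper below. The name `pick_best_device` is ours for illustration — it is not part of the MaidenX API, where the real logic lives inside `auto_set_device`:

```rust
use maidenx::prelude::*;

// Illustrative only: mirrors the CUDA > MPS > CPU preference order
// described above, driven purely by compile-time feature flags.
fn pick_best_device() -> Device {
    #[cfg(feature = "cuda")]
    return Device::CUDA(0);

    #[cfg(all(feature = "mps", not(feature = "cuda")))]
    return Device::MPS;

    #[allow(unreachable_code)]
    Device::CPU
}
```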
123 | 
--------------------------------------------------------------------------------
/book/src/nn/dropout.md:
--------------------------------------------------------------------------------
1 | # Dropout Layer
2 | 
3 | The Dropout layer is a regularization technique that helps prevent neural networks from overfitting. It randomly sets a fraction of input units to zero during training, which helps prevent co-adaptation of neurons.
4 | 
5 | ## Definition
6 | 
7 | ```rust
8 | pub struct Dropout {
9 |     p: f32,
10 |     state: LayerState,
11 | }
12 | ```
13 | 
14 | ## Constructor
15 | 
16 | ```rust
17 | pub fn new(p: f32) -> Result<Self>
18 | ```
19 | 
20 | Creates a new Dropout layer with the specified dropout probability.
21 | 
22 | **Parameters**:
23 | - `p`: Probability of an element to be zeroed (between 0 and 1)
24 | 
25 | **Example**:
26 | ```rust
27 | let dropout = Dropout::new(0.5)?; // 50% dropout probability
28 | ```
29 | 
30 | ## Forward Pass
31 | 
32 | ```rust
33 | pub fn forward(&self, input: &Tensor) -> Result<Tensor>
34 | ```
35 | 
36 | Applies dropout to the input tensor.
37 | 
38 | **Parameters**:
39 | - `input`: Input tensor of any shape
40 | 
41 | **Returns**: Output tensor of the same shape as input
42 | 
43 | **Example**:
44 | ```rust
45 | // During training
46 | let mut dropout = Dropout::new(0.5)?;
47 | dropout.train(); // Activate training mode
48 | let x = Tensor::new(vec![1.0, 2.0, 3.0, 4.0])?;
49 | let y = dropout.forward(&x)?; // Some elements will be zeroed
50 | 
51 | // During evaluation
52 | dropout.eval(); // Activate evaluation mode
53 | let z = dropout.forward(&x)?; // No elements will be zeroed, same as input
54 | ```
55 | 
56 | ## Behavior Differences in Training and Evaluation
57 | 
58 | Dropout behaves differently depending on the layer's state:
59 | 
60 | 1. **Training Mode** (`is_training() == true`):
61 |    - Randomly zeroes elements of the input tensor with probability `p`
62 |    - Scales the remaining elements by a factor of `1/(1-p)` to maintain the expected sum
63 |    - For example, with `p=0.5`, approximately half the elements will be zeroed, and the remaining elements will be multiplied by 2
64 | 
65 | 2. **Evaluation Mode** (`is_training() == false`):
66 |    - Identity function - returns the input unchanged
67 |    - No elements are zeroed out
68 | 
69 | ## Implementation Details
70 | 
71 | MaidenX's Dropout implementation includes:
72 | 
73 | 1. A binary mask tensor that determines which elements to keep (1) or zero out (0)
74 | 2. A scaling factor of `1/(1-p)` applied to the kept elements to maintain the expected activation magnitude
75 | 3. Support for autograd to allow proper gradient flow during training
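
The mask-and-rescale arithmetic can be sketched on plain slices. This is illustrative only — the real layer works on `Tensor`s and samples `keep` at random:

```rust
// Conceptual sketch of inverted dropout; `keep` is a fixed illustrative
// mask here, whereas the actual layer draws it randomly per element.
fn dropout_sketch(input: &[f32], keep: &[bool], p: f32) -> Vec<f32> {
    let scale = 1.0 / (1.0 - p); // preserves the expected activation magnitude
    input
        .iter()
        .zip(keep.iter())
        .map(|(x, &k)| if k { x * scale } else { 0.0 })
        .collect()
}
```

With `p = 0.5` the scale is `2.0`, so the expected value of each output element matches its input.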
76 | 
77 | ## Tips for Using Dropout
78 | 
79 | - Dropout is typically applied after activation functions
80 | - Common dropout rates range from 0.1 to 0.5
81 | - Higher dropout rates provide stronger regularization but may require longer training
82 | - Always remember to call `layer.eval()` during inference/evaluation
83 | - Dropout is often more effective in larger networks
84 | 
85 | ## Example Usage in a Neural Network
86 | 
87 | ```rust
88 | // Define a simple neural network with dropout
89 | let mut linear1 = Linear::new(784, 512, true)?;
90 | let mut dropout1 = Dropout::new(0.2)?;
91 | let mut linear2 = Linear::new(512, 10, true)?;
92 | 
93 | // Training loop
94 | for _ in 0..num_epochs {
95 |     // Set to training mode
96 |     linear1.train();
97 |     dropout1.train();
98 |     linear2.train();
99 | 
100 |     let hidden = linear1.forward(&input)?;
101 |     let hidden_dropped = dropout1.forward(&hidden)?; // Apply dropout
102 |     let output = linear2.forward(&hidden_dropped)?;
103 | 
104 |     // Compute loss and update parameters
105 |     // ...
106 | }
107 | 
108 | // Evaluation
109 | linear1.eval();
110 | dropout1.eval(); // Important: disable dropout during evaluation
111 | linear2.eval();
112 | 
113 | let hidden = linear1.forward(&test_input)?;
114 | let hidden_dropped = dropout1.forward(&hidden)?; // No dropout is applied
115 | let predictions = linear2.forward(&hidden_dropped)?;
116 | ```
117 | 
--------------------------------------------------------------------------------
/book/src/tensor/ops_padding.md:
--------------------------------------------------------------------------------
1 | # Padding Operations
2 | 
3 | Padding operations in maidenx add values around the borders of a tensor, expanding its dimensions.
4 | 
5 | ## Basic Padding
6 | 
7 | ### pad
8 | ```rust
9 | fn pad(&self, paddings: &[(usize, usize)], pad_value: impl Into<Scalar>) -> Result<Tensor>
10 | ```
11 | Pads a tensor with a constant value (alias for pad_with_constant).
12 | 
13 | - **Parameters**:
14 |   - `paddings`: List of (before, after) padding pairs for each dimension
15 |   - `pad_value`: The value to pad with
16 | - **Returns**: A new tensor with padding applied
17 | - **Supports Autograd**: Yes
18 | - **Example**:
19 |   ```rust
20 |   let a = Tensor::new(vec![1.0, 2.0, 3.0])?.reshape(&[3])?;
21 |   let b = a.pad(&[(1, 2)], 0.0)?; // [0.0, 1.0, 2.0, 3.0, 0.0, 0.0]
22 |   ```
23 | 
24 | ## Padding Modes
25 | 
26 | ### pad_with_constant
27 | ```rust
28 | fn pad_with_constant(&self, paddings: &[(usize, usize)], pad_value: impl Into<Scalar>) -> Result<Tensor>
29 | ```
30 | Pads a tensor with a constant value.
31 | 
32 | - **Parameters**:
33 |   - `paddings`: List of (before, after) padding pairs for each dimension
34 |   - `pad_value`: The value to pad with
35 | - **Returns**: A new tensor with constant padding
36 | - **Supports Autograd**: Yes
37 | - **Example**:
38 |   ```rust
39 |   let a = Tensor::new(vec![1.0, 2.0, 3.0, 4.0])?.reshape(&[2, 2])?;
40 |   let b = a.pad_with_constant(&[(0, 1), (1, 0)], 0.0)?;
41 |   // [[0.0, 1.0, 2.0],
42 |   //  [0.0, 3.0, 4.0],
43 |   //  [0.0, 0.0, 0.0]]
44 |   ```
45 | 
46 | ### pad_with_reflection
47 | ```rust
48 | fn pad_with_reflection(&self, paddings: &[(usize, usize)]) -> Result<Tensor>
49 | ```
50 | Pads a tensor by reflecting the tensor values at the boundaries.
51 | 
52 | - **Parameters**:
53 |   - `paddings`: List of (before, after) padding pairs for each dimension
54 | - **Returns**: A new tensor with reflection padding
55 | - **Supports Autograd**: Yes
56 | - **Note**: Reflection padding requires the input dimension to be greater than 1
57 | - **Example**:
58 |   ```rust
59 |   let a = Tensor::new(vec![1.0, 2.0, 3.0, 4.0, 5.0])?.reshape(&[5])?;
60 |   let b = a.pad_with_reflection(&[(2, 2)])?;
61 |   // [3.0, 2.0, 1.0, 2.0, 3.0, 4.0, 5.0, 4.0, 3.0]
62 |   ```
63 | 
64 | ### pad_with_replication
65 | ```rust
66 | fn pad_with_replication(&self, paddings: &[(usize, usize)]) -> Result<Tensor>
67 | ```
68 | Pads a tensor by replicating the edge values.
69 | 
70 | - **Parameters**:
71 |   - `paddings`: List of (before, after) padding pairs for each dimension
72 | - **Returns**: A new tensor with replication padding
73 | - **Supports Autograd**: Yes
74 | - **Example**:
75 |   ```rust
76 |   let a = Tensor::new(vec![1.0, 2.0, 3.0, 4.0, 5.0])?.reshape(&[5])?;
77 |   let b = a.pad_with_replication(&[(2, 2)])?;
78 |   // [1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 5.0]
79 |   ```
80 | 
81 | ## Multi-dimensional Padding
82 | 
83 | For multi-dimensional tensors, padding is applied to each dimension separately based on the provided padding pairs. This allows for complex padding patterns to be created.
84 | 
85 | ### Example: 2D Padding
86 | 
87 | ```rust
88 | // Create a 2D tensor
89 | let a = Tensor::new(vec![
90 |     1.0, 2.0, 3.0,
91 |     4.0, 5.0, 6.0
92 | ])?.reshape(&[2, 3])?;
93 | 
94 | // Pad with zeros: 1 row at top, 1 row at bottom, 2 columns on left, 1 column on right
95 | let b = a.pad_with_constant(&[(1, 1), (2, 1)], 0.0)?;
96 | // Result:
97 | // [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
98 | //  [0.0, 0.0, 1.0, 2.0, 3.0, 0.0],
99 | //  [0.0, 0.0, 4.0, 5.0, 6.0, 0.0],
100 | //  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]
101 | ```
102 | 
103 | ## Padding Behavior with Autograd
104 | 
105 | All padding operations support automatic differentiation (autograd). During the backward pass, gradients from the padded regions are properly handled:
106 | - For constant padding, gradients in the padded regions are ignored
107 | - For reflection and replication padding, gradients are properly accumulated into the original tensor
108 | 
109 | This makes padding operations safe to use in training neural networks or other gradient-based optimization tasks.
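
As a minimal end-to-end sketch, following the API patterns used elsewhere in this book (the asserted gradient is what constant padding implies — padded positions simply drop out of the gradient):

```rust
// Gradients flow back through padding to the original elements.
let mut a = Tensor::new(vec![1.0f32, 2.0, 3.0])?;
a.with_grad()?;

let padded = a.pad(&[(1, 1)], 0.0)?; // [0.0, 1.0, 2.0, 3.0, 0.0]
let loss = padded.sum_all()?;
loss.backward()?;

// The gradient has the original (unpadded) shape; the constant-padded
// positions contribute nothing to it.
if let Some(grad) = a.grad()? {
    assert_eq!(grad.to_flatten_vec::<f32>()?, vec![1.0, 1.0, 1.0]);
}
```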
--------------------------------------------------------------------------------
/book/src/nn/linear.md:
--------------------------------------------------------------------------------
1 | # Linear Layer
2 | 
3 | The Linear layer (also known as a fully connected or dense layer) performs a linear transformation on the input data. It's one of the most fundamental building blocks in neural networks.
4 | 
5 | ## Definition
6 | 
7 | ```rust
8 | pub struct Linear {
9 |     weight: Tensor,
10 |     bias: Option<Tensor>,
11 |     state: LayerState,
12 | }
13 | ```
14 | 
15 | ## Constructor
16 | 
17 | ```rust
18 | pub fn new(in_features: usize, out_features: usize, with_bias: bool) -> Result<Self>
19 | ```
20 | 
21 | Creates a new Linear layer with the specified dimensions.
22 | 
23 | **Parameters**:
24 | - `in_features`: The size of each input sample
25 | - `out_features`: The size of each output sample
26 | - `with_bias`: Whether to include a bias term
27 | 
28 | **Example**:
29 | ```rust
30 | let linear = Linear::new(784, 256, true)?;
31 | ```
32 | 
33 | For more control over the initialization, you can use the extended constructor:
34 | 
35 | ```rust
36 | pub fn new_with_spec(
37 |     in_features: usize,
38 |     out_features: usize,
39 |     with_bias: bool,
40 |     device: Device,
41 |     dtype: DType
42 | ) -> Result<Self>
43 | ```
44 | 
45 | **Additional Parameters**:
46 | - `device`: The device to place the layer's parameters on (CPU, CUDA, or MPS)
47 | - `dtype`: The data type for the layer's parameters
48 | 
49 | **Example**:
50 | ```rust
51 | let linear = Linear::new_with_spec(
52 |     784,
53 |     256,
54 |     true,
55 |     Device::CUDA(0),
56 |     DType::F32
57 | )?;
58 | ```
59 | 
60 | ## Forward Pass
61 | 
62 | ```rust
63 | pub fn forward(&self, input: &Tensor) -> Result<Tensor>
64 | ```
65 | 
66 | Applies the linear transformation y = xWᵀ + b.
67 | 
68 | **Parameters**:
69 | - `input`: The input tensor with shape \[batch_size, ..., in_features\]
70 | 
71 | **Returns**: Output tensor with shape \[batch_size, ..., out_features\]
72 | 
73 | **Example**:
74 | ```rust
75 | let input = Tensor::new(vec![1.0, 2.0, 3.0, 4.0])?.reshape(&[2, 2])?;
76 | let linear = Linear::new(2, 3, true)?;
77 | let output = linear.forward(&input)?; // Shape: [2, 3]
78 | ```
79 | 
80 | ## Parameter Access
81 | 
82 | ```rust
83 | pub fn weight(&self) -> &Tensor
84 | pub fn bias(&self) -> Option<&Tensor>
85 | ```
86 | 
87 | Provides access to the layer's weight and bias parameters.
88 | 
89 | **Example**:
90 | ```rust
91 | let linear = Linear::new(2, 3, true)?;
92 | let weight = linear.weight(); // Shape: [3, 2]
93 | let bias = linear.bias().unwrap(); // Shape: [3]
94 | ```
95 | 
96 | ## Layer Implementation
97 | 
98 | The Linear layer implements the `Layer` trait, providing methods for parameter collection and training state management:
99 | 
100 | ```rust
101 | pub fn parameters(&mut self) -> Vec<&mut Tensor>
102 | ```
103 | 
104 | Returns all trainable parameters of the layer (weight and bias if present).
105 | 
106 | ## Mathematical Operation
107 | 
108 | For an input tensor x of shape \[batch_size, in_features\], the Linear layer computes:
109 | 
110 | ```
111 | output = x @ weight.T + bias
112 | ```
113 | 
114 | Where:
115 | - @ represents the matrix multiplication
116 | - weight.T is the transposed weight matrix of shape \[out_features, in_features\]
117 | - bias is the bias vector of shape \[out_features\]
118 | 
119 | The output tensor has shape \[batch_size, out_features\].
120 | 
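To make the shapes concrete, here is a short sketch (the input values are arbitrary zeros; only the shapes matter):

```rust
// x: [2, 3], weight: [4, 3], so x @ weight.T: [2, 4]; bias broadcasts over it.
let x = Tensor::new(vec![0.0f32; 6])?.reshape(&[2, 3])?;
let linear = Linear::new(3, 4, true)?;
assert_eq!(linear.weight().shape(), &[4, 3]);

let y = linear.forward(&x)?;
assert_eq!(y.shape(), &[2, 4]);
```
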
121 | ## Broadcasting Support
122 | 
123 | The Linear layer supports broadcasting for batched inputs. If the input tensor has additional leading dimensions, they are preserved in the output:
124 | 
125 | ```rust
126 | let input = Tensor::new(vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0])?.reshape(&[3, 2])?;
127 | let linear = Linear::new(2, 4, true)?;
128 | let output = linear.forward(&input)?; // Shape: [3, 4]
129 | ```
130 | 
131 | For a more complex batch structure:
132 | 
133 | ```rust
134 | // Input shape: [batch_size, sequence_length, in_features]
135 | let input = Tensor::new(vec![/* values */])?.reshape(&[32, 10, 64])?;
136 | let linear = Linear::new(64, 128, true)?;
137 | let output = linear.forward(&input)?; // Shape: [32, 10, 128]
138 | ```
139 | 
--------------------------------------------------------------------------------
/crates/maidenx_tensor/src/d.rs:
--------------------------------------------------------------------------------
1 | use crate::Tensor;
2 | use maidenx_core::dtype::DType;
3 | use std::fmt;
4 | 
5 | macro_rules! impl_display_for_type {
6 |     ($val_type:ty, $format:expr) => {
7 |         fn display_tensor_data(
8 |             f: &mut fmt::Formatter<'_>,
9 |             data: &[$val_type],
10 |             stride: usize,
11 |             shape: &[usize],
12 |             depth: usize,
13 |         ) -> fmt::Result {
14 |             match shape.len() {
15 |                 0 => write!(f, "{}", data[0]),
16 |                 1 => {
17 |                     write!(f, "[")?;
18 |                     for (i, val) in data.iter().enumerate() {
19 |                         if i > 0 {
20 |                             write!(f, ", ")?
21 |                         }
22 |                         write!(f, $format, val)?;
23 |                     }
24 |                     write!(f, "]")
25 |                 },
26 |                 _ => {
27 |                     let sub_stride = stride / shape[0];
28 |                     write!(f, "[")?;
29 |                     for i in 0..shape[0] {
30 |                         display_tensor_data(
31 |                             f,
32 |                             &data[i * sub_stride..(i + 1) * sub_stride],
33 |                             sub_stride,
34 |                             &shape[1..],
35 |                             depth + 1,
36 |                         )?;
37 |                         if i < shape[0] - 1 {
38 |                             write!(f, ", ")?;
39 |                         }
40 |                     }
41 |                     write!(f, "]")
42 |                 },
43 |             }
44 |         }
45 |     };
46 | }
47 | 
48 | impl fmt::Display for Tensor {
49 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
50 |         macro_rules! handle_type {
51 |             ($type:ty, $format:expr) => {{
52 |                 if let Ok(data) = self.to_flatten_vec::<$type>() {
53 |                     impl_display_for_type!($type, $format);
54 |                     display_tensor_data(f, &data, self.size(), self.shape(), 0)
55 |                 } else {
56 |                     write!(f, "Failed to fetch data")
57 |                 }
58 |             }};
59 |         }
60 | 
61 |         match self.dtype() {
62 |             DType::BF16 | DType::F16 => handle_type!(f32, "{:.8}"),
63 |             DType::F32 => handle_type!(f32, "{:.8}"),
64 |             DType::F64 => handle_type!(f64, "{:.8}"),
65 |             DType::BOOL => handle_type!(bool, "{}"),
66 |             DType::U8 => handle_type!(u8, "{}"),
67 |             DType::U16 => handle_type!(u16, "{}"),
68 |             DType::U32 => handle_type!(u32, "{}"),
69 |             DType::U64 => handle_type!(u64, "{}"),
70 |             DType::I8 => handle_type!(i8, "{}"),
71 |             DType::I16 => handle_type!(i16, "{}"),
72 |             DType::I32 => handle_type!(i32, "{}"),
73 |             DType::I64 => handle_type!(i64, "{}"),
74 |         }
75 |     }
76 | }
77 | 
78 | impl fmt::Debug for Tensor {
79 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
80 |         write!(f, "Tensor(shape=[")?;
81 | 
82 |         // Write shape
83 |         let shape = self.shape();
84 |         for (i, dim) in shape.iter().enumerate() {
85 |             if i > 0 {
86 |                 write!(f, ", ")?
87 |             }
88 |             write!(f, "{}", dim)?;
89 |         }
90 | 
91 |         write!(f, "], device={}, dtype={}", self.device().name(), self.dtype().as_str())?;
92 | 
93 |         write!(f, ", data=")?;
94 |         fmt::Display::fmt(self, f)?;
95 | 
96 |         write!(f, ", requires_grad={}", self.requires_grad())?;
97 | 
98 |         if self.requires_grad() {
99 |             match self.grad() {
100 |                 Ok(Some(grad)) => {
101 |                     write!(f, ", grad=")?;
102 |                     fmt::Display::fmt(&grad, f)?;
103 |                 },
104 |                 Ok(None) => write!(f, ", grad=None")?,
105 |                 Err(_) => write!(f, ", grad=")?,
106 |             }
107 |         }
108 | 
109 |         write!(f, ")")
110 |     }
111 | }
112 | 
--------------------------------------------------------------------------------
/crates/maidenx_tensor_v2/tests/cast.rs:
--------------------------------------------------------------------------------
1 | mod utils;
2 | 
3 | use maidenx_core::{
4 |     device::{auto_set_device, Device},
5 |     dtype::DType,
6 |     error::Result,
7 | };
8 | use maidenx_tensor_v2::{Tensor, TensorMode};
9 | use utils::test_both_modes;
10 | 
11 | #[test]
12 | fn with_device() -> Result<()> {
13 |     auto_set_device();
14 |     let mut tensor = Tensor::ones(&[2, 3]);
15 |     let original_id = tensor.id();
16 |     let original_data = tensor.to_flatten_vec::<f32>();
17 | 
18 |     tensor.with_device(Device::CPU);
19 | 
20 |     assert_eq!(tensor.device(), Device::CPU);
21 |     assert_eq!(tensor.mode(), TensorMode::Eager);
22 |     assert_eq!(tensor.id(), original_id); // Same tensor (in-place modification)
23 |     assert!(tensor.is_const());
24 |     assert!(tensor.is_storaged());
25 | 
26 |     let new_data = tensor.to_flatten_vec::<f32>();
27 |     assert_eq!(original_data, new_data);
28 |     assert_eq!(new_data, vec![1.0; 6]);
29 |     Ok(())
30 | }
31 | 
32 | #[test]
33 | fn with_dtype() -> Result<()> {
34 |     auto_set_device();
35 |     let mut tensor = Tensor::ones(&[2, 3]);
36 |     let original_tid = tensor.id();
37 | 
38 |     tensor.with_dtype(DType::F16);
39 | 
40 |     assert_eq!(tensor.dtype(), DType::F16);
41 |     assert_eq!(tensor.mode(), TensorMode::Eager);
42 |     assert_eq!(tensor.id(), original_tid); // Same tensor (in-place modification)
43 |     assert!(tensor.is_const());
44 |     assert!(tensor.is_storaged());
45 | 
46 |     let new_data = tensor.to_flatten_vec::<f32>();
47 |     assert_eq!(new_data, vec![1.0; 6]);
48 |     Ok(())
49 | }
50 | 
51 | #[test]
52 | fn to_device() -> Result<()> {
53 |     test_both_modes(|mode| {
54 |         auto_set_device();
55 |         let tensor = Tensor::ones(&[2, 3]);
56 |         let original_data = tensor.to_flatten_vec::<f32>();
57 |         let new_tensor = tensor.to_device(Device::CPU);
58 | 
59 |         assert_eq!(new_tensor.device(), Device::CPU);
60 |         assert_eq!(new_tensor.mode(), mode);
61 |         assert_ne!(tensor.id(), new_tensor.id());
62 | 
63 |         match mode {
64 |             TensorMode::Eager => {
65 |                 assert!(new_tensor.is_const());
66 |                 assert!(new_tensor.is_storaged());
67 | 
68 |                 let new_data = new_tensor.to_flatten_vec::<f32>();
69 |                 assert_eq!(original_data, new_data);
70 |                 assert_eq!(new_data, vec![1.0; 6]);
71 |             },
72 |             TensorMode::Lazy => {
73 |                 assert!(!new_tensor.is_const());
74 |                 assert!(!new_tensor.is_storaged());
75 | 
76 |                 new_tensor.forward();
77 |                 assert!(new_tensor.is_storaged());
78 | 
79 |                 let new_data = new_tensor.to_flatten_vec::<f32>();
80 |                 assert_eq!(original_data, new_data);
81 |                 assert_eq!(new_data, vec![1.0; 6]);
82 |             },
83 |         }
84 |         Ok(())
85 |     })
86 | }
87 | 
88 | #[test]
89 | fn to_dtype() -> Result<()> {
90 |     test_both_modes(|mode| {
91 |         auto_set_device();
92 |         let tensor = Tensor::ones(&[2, 3]);
93 |         let new_tensor = tensor.to_dtype(DType::F16);
94 | 
95 |         assert_eq!(new_tensor.dtype(), DType::F16);
96 |         assert_eq!(new_tensor.mode(), mode);
97 |         assert_ne!(tensor.id(), new_tensor.id());
98 | 
99 |         match mode {
100 |             TensorMode::Eager => {
101 |                 assert!(new_tensor.is_const());
102 |                 assert!(new_tensor.is_storaged());
103 | 
104 |                 let new_data = new_tensor.to_flatten_vec::<f32>();
105 |                 assert_eq!(new_data, vec![1.0; 6]);
106 |             },
107 |             TensorMode::Lazy => {
108 |                 assert!(!new_tensor.is_const());
109 |                 assert!(!new_tensor.is_storaged());
110 | 
111 |                 new_tensor.forward();
112 |                 assert!(new_tensor.is_storaged());
113 | 
114 |                 let new_data = new_tensor.to_flatten_vec::<f32>();
115 |                 assert_eq!(new_data, vec![1.0; 6]);
116 |             },
117 |         }
118 |         Ok(())
119 |     })
120 | }
121 | 
--------------------------------------------------------------------------------
/benches/benches/maidenx_tensor/ops/reduction.rs:
--------------------------------------------------------------------------------
1 | use criterion::{black_box, Criterion};
2 | use maidenx_core::{device::Device, dtype::DType, error::Result};
3 | use maidenx_tensor::Tensor;
4 | 
5 | // Constants for benchmark data sizes
6 | const SIZES: [(usize, &str); 3] = [(20, "small"), (200, "medium"), (1000, "large")];
7 | 
8 | // Helper function for tensor creation and benchmarking
9 | fn bench_op<F>(b: &mut criterion::Bencher, device: Device, dtype: DType, dims: &[usize], op_fn: F)
10 | where
11 |     F: Fn(&Tensor) -> Result<Tensor>,
12 | {
13 |     let data: Vec<f32> = (0..dims.iter().product::<usize>()).map(|i| i as f32).collect();
14 | 
15 |     b.iter(|| {
16 |         let mut x = Tensor::new(data.clone()).unwrap().reshape(dims).unwrap();
17 |         x.with_device(device).unwrap();
18 |         x.with_dtype(dtype).unwrap();
19 |         black_box(op_fn(&x)).unwrap()
20 |     })
21 | }
22 | 
23 | // Create tensor dimensions based on target size
24 | fn create_dims(size: usize) -> Vec<usize> {
25 |     if size <= 20 {
26 |         // For small size, create a 4D tensor
27 |         let dim = (size as f64).powf(0.25).ceil() as usize;
28 |         vec![dim, dim, dim, dim]
29 |     } else if size <= 200 {
30 |         // For medium size, create a 3D tensor
31 |         let dim = (size as f64).powf(1.0 / 3.0).ceil() as usize;
32 |         vec![dim, dim, dim]
33 |     } else {
34 |         // For large size, create a 2D tensor
35 |         let dim = (size as f64).sqrt().ceil() as usize;
36 |         vec![dim, dim]
37 |     }
38 | }
39 | 
40 | pub fn basic(criterion: &mut Criterion) {
41 |     let mut group = criterion.benchmark_group("reduction/basic");
42 |     group.warm_up_time(core::time::Duration::from_millis(500));
43 |     group.measurement_time(core::time::Duration::from_secs(3));
44 |     group.sample_size(50);
45 | 
46 |     // Define operations with their implementations
47 |     let operations: Vec<(&str, Box<dyn Fn(&Tensor) -> Result<Tensor>>)> = vec![
48 |         ("sum_all", Box::new(|x| x.sum_all())),
49 |         ("sum_dim_0", Box::new(|x| x.sum(0, false))),
50 |         ("sum_dim_last", Box::new(|x| x.sum(x.shape().len() - 1, false))),
51 |         ("sum_keepdim", Box::new(|x| x.sum(0, true))),
52 |         ("mean_all", Box::new(|x| x.mean_all())),
53 |         ("mean_dim_0", Box::new(|x| x.mean(0, false))),
54 |         ("mean_dim_last", Box::new(|x| x.mean(x.shape().len() - 1, false))),
55 |         ("max_all", Box::new(|x| x.max_all())),
56 |         ("max_dim_0", Box::new(|x| x.max(0, false))),
57 |         ("min_all", Box::new(|x| x.min_all())),
58 |         ("min_dim_0", Box::new(|x| x.min(0, false))),
59 |         ("norm_all", Box::new(|x| x.norm_all(2.0))),
60 |         ("norm_dim_0", Box::new(|x| x.norm(2.0, 0, false))),
61 |         ("var_dim_0", Box::new(|x| x.var(0, false, false))),
62 |         ("std_dim_0", Box::new(|x| x.std(0, false, false))),
63 |     ];
64 | 
65 |     // Run benchmarks for CPU
66 |     #[cfg(feature = "cpu")]
67 |     {
68 |         let device = Device::CPU;
69 |         let dtype = DType::F32;
70 | 
71 |         for (op_name, op_fn) in &operations {
72 |             for &(size, size_name) in &SIZES {
73 |                 let dims = create_dims(size);
74 |                 let dims_str = dims.iter().map(|d| d.to_string()).collect::<Vec<_>>().join("x");
75 |                 let bench_name = format!("{}/cpu/{}/{}", op_name, size_name, dims_str);
76 | 
77 |                 group.bench_function(&bench_name, |b| bench_op(b, device, dtype, &dims, op_fn));
78 |             }
79 |         }
80 |     }
81 | 
82 |     // Run benchmarks for CUDA if enabled
83 |     #[cfg(feature = "cuda")]
84 |     {
85 |         let device = Device::CUDA(0);
86 |         let dtype = DType::F32;
87 | 
88 |         for (op_name, op_fn) in &operations {
89 |             for &(size, size_name) in &SIZES {
90 |                 let dims = create_dims(size);
91 |                 let dims_str = dims.iter().map(|d| d.to_string()).collect::<Vec<_>>().join("x");
92 |                 let bench_name = format!("{}/cuda/{}/{}", op_name, size_name, dims_str);
93 | 
94 |                 group.bench_function(&bench_name, |b| bench_op(b, device, dtype, &dims, op_fn));
95 |             }
96 |         }
97 |     }
98 | 
99 |     group.finish();
100 | }
101 | 
--------------------------------------------------------------------------------
/crates/maidenx_tensor/src/vec.rs:
--------------------------------------------------------------------------------
1 | use crate::Tensor;
2 | use maidenx_core::{
3 |     dtype::DType,
4 |     error::{Error, Result},
5 | };
6 | 
7 | impl Tensor {
8 |     pub fn to_flatten_vec<T: Clone + Default + 'static>(&self) -> Result<Vec<T>> {
9 |         let target_dtype =
10 |             get_dtype_for_type::<T>().ok_or_else(|| Error::InvalidArgument("Unsupported type".into()))?;
11 |         let tensor = if self.dtype() != target_dtype {
12 |             self.to_dtype(target_dtype)?
13 |         } else {
14 |             self.clone()
15 |         };
16 | 
17 |         let size = tensor.size();
18 |         let shape = tensor.shape();
19 |         let strides = tensor.strides();
20 |         let offset = tensor.offset();
21 |         let elem_size = tensor.dtype().size_in_bytes();
22 |         let buffer_size = tensor.buffer().len() * elem_size;
23 | 
24 |         let mut raw_data = vec![0u8; buffer_size];
25 | 
26 |         unsafe {
27 |             tensor
28 |                 .buffer()
29 |                 .copy_to_host(raw_data.as_mut_ptr() as *mut std::ffi::c_void, buffer_size, 0, 0)?;
30 |         }
31 | 
32 |         let mut result = vec![T::default(); size];
33 |         let mut indices = vec![0; shape.len()];
34 |         let mut dst_idx = 0;
35 | 
36 |         let calc_src_offset = |indices: &[usize], strides: &[usize], offset: usize| -> usize {
37 |             offset
38 |                 + indices
39 |                     .iter()
40 |                     .zip(strides.iter())
41 |                     .map(|(&idx, &stride)| idx * stride)
42 |                     .sum::<usize>()
43 |         };
44 | 
45 |         loop {
46 |             let src_offset = calc_src_offset(&indices, strides, offset);
47 | 
48 |             if src_offset * elem_size < raw_data.len() {
49 |                 // Copy element from source to destination
50 |                 unsafe {
51 |                     std::ptr::copy_nonoverlapping(
52 |                         raw_data.as_ptr().add(src_offset * elem_size),
53 |                         (result.as_mut_ptr() as *mut u8).add(dst_idx * elem_size),
54 |                         elem_size,
55 |                     );
56 |                 }
57 |             }
58 | 
59 |             dst_idx += 1;
60 | 
61 |             // Update indices
62 |             if dst_idx == size {
63 |                 return Ok(result);
64 |             }
65 | 
66 |             let mut dim = shape.len();
67 |             while dim > 0 {
68 |                 dim -= 1;
69 |                 indices[dim] += 1;
70 |                 if indices[dim] < shape[dim] {
71 |                     break;
72 |                 }
73 |                 indices[dim] = 0;
74 |             }
75 |         }
76 |     }
77 | }
78 | 
79 | fn get_dtype_for_type<T: 'static>() -> Option<DType> {
80 |     if std::any::TypeId::of::<T>() == std::any::TypeId::of::<half::bf16>() {
81 |         Some(DType::BF16)
82 |     } else if std::any::TypeId::of::<T>() == std::any::TypeId::of::<half::f16>() {
83 |         Some(DType::F16)
84 |     } else if std::any::TypeId::of::<T>() == std::any::TypeId::of::<f32>() {
85 |         Some(DType::F32)
86 |     } else if std::any::TypeId::of::<T>() == std::any::TypeId::of::<f64>() {
87 |         Some(DType::F64)
88 |     } else if std::any::TypeId::of::<T>() == std::any::TypeId::of::<bool>() {
89 |         Some(DType::BOOL)
90 |     } else if std::any::TypeId::of::<T>() == std::any::TypeId::of::<u8>() {
91 |         Some(DType::U8)
92 |     } else if std::any::TypeId::of::<T>() == std::any::TypeId::of::<u16>() {
93 |         Some(DType::U16)
94 |     } else if std::any::TypeId::of::<T>() == std::any::TypeId::of::<u32>() {
95 |         Some(DType::U32)
96 |     } else if std::any::TypeId::of::<T>() == std::any::TypeId::of::<u64>() {
97 |         Some(DType::U64)
98 |     } else if std::any::TypeId::of::<T>() == std::any::TypeId::of::<i8>() {
99 |         Some(DType::I8)
100 |     } else if std::any::TypeId::of::<T>() == std::any::TypeId::of::<i16>() {
101 |         Some(DType::I16)
102 |     } else if std::any::TypeId::of::<T>() == std::any::TypeId::of::<i32>() {
103 |         Some(DType::I32)
104 |     } else if std::any::TypeId::of::<T>() == std::any::TypeId::of::<i64>() {
105 |         Some(DType::I64)
106 |     } else {
107 |         None
108 |     }
109 | }
110 | 
--------------------------------------------------------------------------------
/book/src/nn/layer.md:
--------------------------------------------------------------------------------
1 | # Layer
2 | 
3 | The `Layer` trait is the foundation of neural network components in MaidenX. It defines the interface that all neural network layers must implement.
4 | 
5 | ## Layer Trait Definition
6 | 
7 | ```rust
8 | pub trait Layer<I = &Tensor> {
9 |     fn forward(&self, input: I) -> Result<Tensor>;
10 |     fn parameters(&mut self) -> Vec<&mut Tensor>;
11 | 
12 |     fn is_training(&self) -> bool;
13 |     fn train(&mut self);
14 |     fn eval(&mut self);
15 | }
16 | ```
17 | 
18 | The `Layer` trait makes it easy to create custom layers and combine them into complex architectures. The generic parameter `I` allows layers to handle different input types, with the default being a reference to a `Tensor`.
19 | 
20 | ## Core Methods
21 | 
22 | ### forward
23 | 
24 | ```rust
25 | fn forward(&self, input: I) -> Result<Tensor>;
26 | ```
27 | 
28 | The `forward` method performs the layer's computation on the input and returns the output tensor. It's the primary function that defines the layer's behavior.
29 | 
30 | ### parameters
31 | 
32 | ```rust
33 | fn parameters(&mut self) -> Vec<&mut Tensor>;
34 | ```
35 | 
36 | Returns all trainable parameters of the layer as mutable references, which can then be updated by optimizers during training.
37 | 
38 | ## Training State Management
39 | 
40 | ### is_training
41 | 
42 | ```rust
43 | fn is_training(&self) -> bool;
44 | ```
45 | 
46 | Returns whether the layer is in training mode (true) or evaluation mode (false).
47 | 
48 | ### train
49 | 
50 | ```rust
51 | fn train(&mut self);
52 | ```
53 | 
54 | Sets the layer to training mode. This affects behaviors like dropout and batch normalization.
55 | 
56 | ### eval
57 | 
58 | ```rust
59 | fn eval(&mut self);
60 | ```
61 | 
62 | Sets the layer to evaluation mode. This typically disables regularization techniques like dropout.
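
For example, toggling a `Dropout` layer between the two modes (assuming a `Tensor` named `x` is already in scope):

```rust
// Typical train/eval toggling around a dropout layer.
let mut dropout = Dropout::new(0.5)?;

dropout.train();                   // mask and 1/(1-p) scaling are active
let y_train = dropout.forward(&x)?;

dropout.eval();                    // identity: input passes through unchanged
let y_eval = dropout.forward(&x)?;
```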
63 | 
64 | ## LayerState
65 | 
66 | Most layer implementations use the `LayerState` structure to track their training state:
67 | 
68 | ```rust
69 | pub struct LayerState {
70 |     training: bool,
71 | }
72 | ```
73 | 
74 | `LayerState` provides a simple way to implement the training state methods:
75 | 
76 | ```rust
77 | impl LayerState {
78 |     pub fn new() -> Self {
79 |         Self { training: true }
80 |     }
81 | 
82 |     pub fn is_training(&self) -> bool {
83 |         self.training
84 |     }
85 | 
86 |     pub fn train(&mut self) {
87 |         self.training = true;
88 |     }
89 | 
90 |     pub fn eval(&mut self) {
91 |         self.training = false;
92 |     }
93 | }
94 | ```
95 | 
96 | ## Custom Layer Implementation
97 | 
98 | To implement a custom layer, you need to implement the `Layer` trait:
99 | 
100 | ```rust
101 | #[derive(Layer, Clone)]
102 | struct MyCustomLayer {
103 |     weight: Tensor,
104 |     bias: Option<Tensor>,
105 |     state: LayerState,
106 | }
107 | 
108 | impl Layer for MyCustomLayer {
109 |     fn forward(&self, input: &Tensor) -> Result<Tensor> {
110 |         // Custom forward computation
111 |         let output = input.matmul(&self.weight)?;
112 |         if let Some(ref bias) = self.bias {
113 |             Ok(output.add(bias)?)
114 |         } else {
115 |             Ok(output)
116 |         }
117 |     }
118 | 
119 |     fn parameters(&mut self) -> Vec<&mut Tensor> {
120 |         let mut params = vec![&mut self.weight];
121 |         if let Some(ref mut bias) = self.bias {
122 |             params.push(bias);
123 |         }
124 |         params
125 |     }
126 | 
127 |     fn is_training(&self) -> bool {
128 |         self.state.is_training()
129 |     }
130 | 
131 |     fn train(&mut self) {
132 |         self.state.train();
133 |     }
134 | 
135 |     fn eval(&mut self) {
136 |         self.state.eval();
137 |     }
138 | }
139 | ```
140 | 
141 | ## Using the Layer Macro
142 | 
143 | MaidenX provides a derive macro to simplify layer implementation:
144 | 
145 | ```rust
146 | #[derive(Layer, Clone)]
147 | struct MySimpleLayer {
148 |     weight: Tensor,
149 |     state: LayerState,
150 | }
151 | 
152 | // The Layer trait methods for training state are automatically implemented
153 | impl MySimpleLayer {
154 |     fn forward(&self, input: &Tensor) -> Result<Tensor> {
155 |         todo!() // Your implementation here
156 |     }
157 | 
158 |     fn parameters(&mut self) -> Vec<&mut Tensor> {
159 |         vec![&mut self.weight]
160 |     }
161 | }
162 | ```
--------------------------------------------------------------------------------
/crates/maidenx_mps/src/metal_context.rs:
--------------------------------------------------------------------------------
1 | use metal::{CommandQueue, ComputePipelineState, Device, Library};
2 | use std::sync::{Mutex, OnceLock};
3 | use std::{collections::HashMap, path::Path};
4 | 
5 | pub fn initialize_ops() -> Result<(), KernelError> {
6 |     initialize_metal()?;
7 | 
8 |     let context = get_metal_context()?;
9 |     let libraries = context.libraries.lock().unwrap();
10 | 
11 |     if libraries.contains_key("ops") && libraries.contains_key("nn") {
12 |         return Ok(());
13 |     }
14 | 
15 |     drop(libraries);
16 | 
17 |     let metallib_ops_path = env!("MAIDENX_MPS_OPS_METALLIB_PATH");
18 |     load_metal_library("ops", metallib_ops_path)?;
19 | 
20 |     #[cfg(feature = "nn")]
21 |     {
22 |         let metallib_nn_path = env!("MAIDENX_MPS_NN_METALLIB_PATH");
23 |         load_metal_library("nn", metallib_nn_path)?;
24 |     }
25 | 
26 |     Ok(())
27 | }
28 | 
29 | pub struct MetalContext {
30 |     device: Device,
31 |     command_queue: CommandQueue,
32 |     pipelines: Mutex<HashMap<String, ComputePipelineState>>,
33 |     libraries: Mutex<HashMap<String, Library>>,
34 | }
35 | 
36 | static METAL_CONTEXT: OnceLock<MetalContext> = OnceLock::new();
37 | 
38 | #[derive(Debug)]
39 | pub enum KernelError {
40 |     DeviceNotFound,
41 |     LibraryNotFound(String),
42 |     FunctionNotFound(String),
43 |     PipelineCreationFailed(String),
44 |     ContextNotInitialized,
45 |     ExecutionFailed(String),
46 | }
47 | 
48 | impl MetalContext {
49 |     fn new() -> Result<Self, KernelError> {
50 |         let device = Device::system_default().ok_or(KernelError::DeviceNotFound)?;
51 |         let command_queue = device.new_command_queue();
52 |         Ok(Self {
53 |             device,
54 |             command_queue,
55 |             pipelines: Mutex::new(HashMap::new()),
56 |             libraries: Mutex::new(HashMap::new()),
57 |         })
58 |     }
59 | 
60 |     fn add_library(&self, name: &str, path: &str) -> Result<(), KernelError> {
61 |         let library = self
62 |             .device
63 |             .new_library_with_file(Path::new(path))
64 |             .map_err(|_| KernelError::LibraryNotFound(path.to_string()))?;
65 |         let mut libraries = self.libraries.lock().unwrap();
66 |         libraries.insert(name.to_string(), library);
67 |         Ok(())
68 |     }
69 | 
70 |     fn get_or_create_pipeline(&self, function_name: &str) -> Result<ComputePipelineState, KernelError> {
71 |         {
72 |             let pipelines = self.pipelines.lock().unwrap();
73 |             if let Some(pipeline) = pipelines.get(function_name) {
74 |                 return Ok(pipeline.clone());
75 |             }
76 |         }
77 |         let libraries = self.libraries.lock().unwrap();
78 | 
79 |         for library in libraries.values() {
80 |             if let Ok(function) = library.get_function(function_name, None) {
81 |                 let pipeline = self
82 |                     .device
83 |                     .new_compute_pipeline_state_with_function(&function)
84 |                     .map_err(|_| KernelError::PipelineCreationFailed(function_name.to_string()))?;
85 | 
86 |                 let mut pipelines = self.pipelines.lock().unwrap();
87 |                 pipelines.insert(function_name.to_string(), pipeline.clone());
88 |                 return Ok(pipeline);
89 |             }
90 |         }
91 |         Err(KernelError::FunctionNotFound(function_name.to_string()))
92 |     }
93 | }
94 | 
95 | pub fn initialize_metal() -> Result<(), KernelError> {
96 |     if METAL_CONTEXT.get().is_none() {
97 |         let context = MetalContext::new()?;
98 |         let _ = METAL_CONTEXT.set(context);
99 |     }
100 |     Ok(())
101 | }
102 | 
103 | pub fn get_metal_context() -> Result<&'static MetalContext, KernelError> {
104 |     METAL_CONTEXT.get().ok_or(KernelError::ContextNotInitialized)
105 | }
106 | 
107 | pub fn load_metal_library(name: &str, path: &str) -> Result<(), KernelError> {
108 |     initialize_metal()?;
109 | 
110 |     let context = get_metal_context()?;
111 |     context.add_library(name, path)
112 | }
113 | 
114 | pub fn execute_function<F>(function_name: &str, setup_fn: F) -> Result<(), KernelError>
115 | where
116 |     F: FnOnce(ComputePipelineState, &CommandQueue, &Device) -> Result<(), KernelError>,
117 | {
118 |     initialize_metal()?;
119 | 
120 |     let context = get_metal_context()?;
121 |     let pipeline = context.get_or_create_pipeline(function_name)?;
122 | 
123 |     setup_fn(pipeline, &context.command_queue, &context.device)
124 | }
125 | 
--------------------------------------------------------------------------------
/README_.md:
--------------------------------------------------------------------------------
1 | # MaidenX
2 | 
3 | A Rust-based machine learning framework developed as part of the Maiden Engine project. MaidenX is designed with an educational focus, structured to mirror PyTorch's architecture to facilitate learning and understanding of ML framework implementations.
4 | This library prioritizes code readability, ensuring that anyone can easily understand and work with the codebase.
5 | 
6 | [![License](https://img.shields.io/badge/license-BSD--3--Clause-blue.svg)](https://github.com/miniex/maidenx#license)
7 | [![Crates.io](https://img.shields.io/crates/v/maidenx.svg)](https://crates.io/crates/maidenx)
8 | 
9 | > [!WARNING]
10 | >
11 | > This is a personal learning and development project. As such:
12 | > - The framework is under active development
13 | > - Features may be experimental or incomplete
14 | > - Functionality is not guaranteed for production use
15 | >
16 | > It is recommended to use the latest version.
17 | 
18 | The project serves primarily as a testbed for AI engine development and learning purposes.
19 | 
20 | ## Goals
21 | 
22 | MaidenX is being developed with a vision to create a lightweight, fast, and human-like artificial intelligence framework.
23 | The library focuses on simplicity, performance, and user convenience, ensuring that developers can work effortlessly while enjoying robust machine learning capabilities.
24 | As the project evolves, MaidenX aims to serve as a foundation for innovative AI solutions and advanced learning resources.
25 | 
26 | ## Guide
27 | 
28 | ### Features
29 | 
30 | MaidenX organizes its functionality into separate features, allowing users to select only what they need. More features will be added as the project evolves.
31 | 
32 | #### Default Features
33 | 
34 | These are included by default and recommended for most use cases:
35 | 
36 | |feature name|description|
37 | |-|-|
38 | |nn|Core neural network functionality that provides implementations of neural network components and architectures|
39 | |serde|Integration with Rust's serde framework enabling serialization/deserialization of tensors and neural network layers for saving and loading models|
40 | |graph|Enables computational graph mode where tensor operations are executed as deferred operations within a computation graph rather than immediately, providing an alternative execution model (see the sketch below this table)|
41 | 
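The snippet below sketches what graph mode looks like in practice. It is based on the `maidenx_tensor_v2` test suite rather than the stable prelude, and the tests there select the mode through a helper, so treat the exact API (`is_storaged`, `forward`) as illustrative:

```rust
use maidenx_core::dtype::DType;
use maidenx_tensor_v2::Tensor;

// In graph (lazy) mode, `to_dtype` only records a node; no storage is
// allocated until the recorded graph is executed with `forward()`.
let t = Tensor::ones(&[2, 3]);
let lazy = t.to_dtype(DType::F16);
assert!(!lazy.is_storaged()); // deferred, nothing computed yet
lazy.forward();               // run the graph
assert!(lazy.is_storaged());  // now materialized
```
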
42 | #### Optional Features
43 | 
44 | |feature name|description|
45 | |-|-|
46 | |cuda|GPU acceleration support using NVIDIA CUDA for significantly faster tensor operations and model training|
47 | |mps|Apple Metal Performance Shaders support for hardware acceleration on macOS devices|
48 | 
49 | ### Docs
50 | 
51 | - [GUIDE](https://miniex.github.io/maidenx/) - MaidenX Guide
52 | 
53 | - [Supported Operations and Layers](docs/supported.md) - Complete list of all operations and layers supported by MaidenX
54 | - [Tensor Documentation](docs/tensor.md) - Detailed information about MaidenX tensor implementation
55 | - [Neural Networks Guide](docs/neural-networks.md) - Guide to using neural network components
56 | 
57 | ### Examples
58 | 
59 | ```rust
60 | use maidenx::nn::*;
61 | use maidenx::prelude::*;
62 | use std::time::Instant;
63 | 
64 | fn main() -> Result<(), Box<dyn std::error::Error>> {
65 |     let input_data: Vec<Vec<f32>> = (0..10000)
66 |         .map(|i| vec![(i % 100) as f32 / 100.0, ((i % 100) + 1) as f32 / 100.0, ((i % 100) + 2) as f32 / 100.0])
67 |         .collect();
68 |     let target_data: Vec<Vec<f32>> = (0..10000).map(|i| vec![((i % 100) * 10) as f32 / 1000.0]).collect();
69 | 
70 |     let mut input = Tensor::new(input_data)?;
71 |     let target = Tensor::new(target_data)?;
72 |     input.with_grad()?;
73 | 
74 |     let mut linear = Linear::new(3, 1, true)?;
75 |     let mse_loss = MSE::new();
76 |     let mut optimizer = SGD::new(0.01);
77 |     let epochs = 1000;
78 | 
79 |     let mut hundred_epochs_start = Instant::now();
80 | 
81 |     for epoch in 0..epochs {
82 |         let pred = linear.forward(&input)?;
83 |         let loss = mse_loss.forward((&pred, &target))?;
84 |         loss.backward()?;
85 | 
86 |         optimizer.step(&mut linear.parameters())?;
87 |         optimizer.zero_grad(&mut linear.parameters())?;
88 | 
89 |         if (epoch + 1) % 100 == 0 {
90 |             let hundred_elapsed = hundred_epochs_start.elapsed();
91 |             let params = linear.parameters();
92 |             println!(
93 |                 "Epoch {}: Loss = {}, 100 Epochs Time = {:?}, Weight = {}, Bias = {}",
94 |                 epoch + 1,
95 |                 loss,
96 |                 hundred_elapsed,
97 |                 params[0],
98 |                 params.get(1).unwrap()
99 |             );
100 |             hundred_epochs_start = Instant::now();
101 |         }
102 |     }
103 | 
104 |     Ok(())
105 | }
106 | ```
--------------------------------------------------------------------------------
/benches/benches/maidenx_tensor/ops/unary.rs:
--------------------------------------------------------------------------------
1 | use criterion::{black_box, Criterion};
2 | use maidenx_core::{device::Device, dtype::DType, error::Result};
3 | use maidenx_tensor::Tensor;
4 | 
5 | // Constants for benchmark data sizes
6 | const SIZES: [(usize, &str); 2] = [(1000, "small"), (10000, "medium")];
7 | 
8 | // Helper function for tensor creation and benchmarking
9 | fn bench_op<F>(
10 |     b: &mut criterion::Bencher,
11 |     device: Device,
12 |     dtype: DType,
13 |     size: usize,
14 |     data_transform: impl Fn(Vec<f32>) -> Vec<f32>,
15 |     op_fn: F,
16 | ) where
17 |     F: Fn(&Tensor) -> Result<Tensor>,
18 | {
19 |     // Generate initial data
20 |     let raw_data: Vec<f32> = (0..size).map(|i| (i % 10) as f32 / 10.0).collect();
21 |     let data = data_transform(raw_data);
22 | 
23 |     b.iter(|| {
24 |         let mut x = Tensor::new(data.clone()).unwrap();
25 |         x.with_device(device).unwrap();
26 |         x.with_dtype(dtype).unwrap();
27 |         black_box(op_fn(&x)).unwrap()
28 |     })
29 | }
30 | 
31 | pub fn basic(criterion: &mut Criterion) {
32 |     let mut group = criterion.benchmark_group("unary/basic");
33 |     group.warm_up_time(core::time::Duration::from_millis(500));
34 |     group.measurement_time(core::time::Duration::from_secs(3));
35 |     group.sample_size(50);
36 | 
37 |     // Define operations with their data transformations and implementations
38 |     let operations: Vec<(
39 |         &str,
40 |         Box<dyn Fn(Vec<f32>) -> Vec<f32>>,
41 |         Box<dyn Fn(&Tensor) -> Result<Tensor>>,
42 |     )> = vec![
43 |         // Math operations
44 |         (
45 |             "abs",
46 |             Box::new(|v| v.iter().map(|x| x - 0.5).collect()),
47 |             Box::new(|x| x.abs()),
48 |         ),
49 |         ("neg", Box::new(|v| v), Box::new(|x| x.neg())),
50 |         (
51 |             "sign",
52 |             Box::new(|v| v.iter().map(|x| x - 0.5).collect()),
53 |             Box::new(|x| x.sign()),
54 |         ),
55 |         ("sqrt", Box::new(|v| v), Box::new(|x| x.sqrt())),
56 |         ("pow", Box::new(|v| v), Box::new(|x| x.pow(2.0))),
57 |         ("exp", Box::new(|v| v), Box::new(|x| x.exp())),
58 |         (
59 |             "log",
60 |             Box::new(|v| v.iter().map(|x| x + 0.01).collect()),
61 |             Box::new(|x| x.log()),
62 |         ),
63 |         // Trigonometric operations
64 |         ("sin", Box::new(|v| v), Box::new(|x| x.sin())),
65 |         ("cos", Box::new(|v| v), Box::new(|x| x.cos())),
66 |         ("tan", Box::new(|v| v), Box::new(|x| x.tan())),
67 |         // Neural network operations
68 |         (
69 |             "sigmoid",
70 |             Box::new(|v| v.iter().map(|x| x * 10.0 - 5.0).collect()),
71 |             Box::new(|x| x.sigmoid()),
72 |         ),
73 |         (
74 |             "relu",
75 |             Box::new(|v| v.iter().map(|x| x * 2.0 - 1.0).collect()),
76 |             Box::new(|x| x.relu()),
77 |         ),
78 |         (
79 |             "tanh",
80 |             Box::new(|v| v.iter().map(|x| x * 2.0 - 1.0).collect()),
81 |             Box::new(|x| x.tanh()),
82 |         ),
83 |         (
84 |             "leaky_relu",
85 |             Box::new(|v| v.iter().map(|x| x * 2.0 - 1.0).collect()),
86 |             Box::new(|x| x.leaky_relu(0.01)),
87 |         ),
88 |         (
89 |             "gelu",
90 |             Box::new(|v| v.iter().map(|x| x * 2.0 - 1.0).collect()),
91 |             Box::new(|x| x.gelu()),
92 |         ),
93 |         (
94 |             "elu",
95 |             Box::new(|v| v.iter().map(|x| x * 2.0 - 1.0).collect()),
96 |             Box::new(|x| x.elu(1.0)),
97 |         ),
98 |     ];
99 | 
100 |     // Run benchmarks for CPU
101 |     #[cfg(feature = "cpu")]
102 |     {
103 |         let device = Device::CPU;
104 |         let dtype = DType::F32;
105 | 
106 |         for (op_name, data_transform, op_fn) in &operations {
| for &(size, size_name) in &SIZES { 108 | let bench_name = format!("{}/cpu/{}", op_name, size_name); 109 | 110 | group.bench_function(&bench_name, |b| bench_op(b, device, dtype, size, data_transform, op_fn)); 111 | } 112 | } 113 | } 114 | 115 | // Run benchmarks for CUDA if enabled 116 | #[cfg(feature = "cuda")] 117 | { 118 | let device = Device::CUDA(0); 119 | let dtype = DType::F32; 120 | 121 | for (op_name, data_transform, op_fn) in &operations { 122 | for &(size, size_name) in &SIZES { 123 | let bench_name = format!("{}/cuda/{}", op_name, size_name); 124 | 125 | group.bench_function(&bench_name, |b| bench_op(b, device, dtype, size, data_transform, op_fn)); 126 | } 127 | } 128 | } 129 | 130 | group.finish(); 131 | } 132 | -------------------------------------------------------------------------------- /crates/maidenx_nn/src/layers/linear.rs: -------------------------------------------------------------------------------- 1 | use crate::layer::{Layer, LayerState}; 2 | use maidenx_core::{ 3 | device::{get_default_device, Device}, 4 | dtype::{get_default_dtype, DType}, 5 | error::Result, 6 | }; 7 | use maidenx_tensor::Tensor; 8 | #[cfg(feature = "serde")] 9 | use serde::{Deserialize, Serialize}; 10 | 11 | #[derive(Layer, Clone)] 12 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] 13 | pub struct Linear { 14 | weight: Tensor, 15 | bias: Option<Tensor>, 16 | 17 | state: LayerState, 18 | } 19 | 20 | impl Linear { 21 | pub fn new(in_features: usize, out_features: usize, with_bias: bool) -> Result<Self> { 22 | let device = get_default_device(); 23 | let dtype = get_default_dtype(); 24 | 25 | Self::new_with_spec(in_features, out_features, with_bias, device, dtype) 26 | } 27 | 28 | pub fn new_with_spec( 29 | in_features: usize, 30 | out_features: usize, 31 | with_bias: bool, 32 | device: Device, 33 | dtype: DType, 34 | ) -> Result<Self> { 35 | let k: f32 = 1.0 / (in_features as f32).sqrt(); 36 | 37 | // weight 38 | let mut w = Tensor::randn_with_spec(&[out_features, in_features], device, dtype)?; 39 | w.with_grad()?; 40 | w.mul_scalar(k)?; 41 | 42 | // bias 43 | let b = if with_bias { 44 | let mut b = Tensor::randn_with_spec(&[], device, dtype)?; 45 | b.with_grad()?; 46 | b.mul_scalar(k)?; 47 | 48 | Some(b) 49 | } else { 50 | None 51 | }; 52 | 53 | Ok(Self { 54 | weight: w, 55 | bias: b, 56 | state: LayerState::new(), 57 | }) 58 | } 59 | 60 | pub fn forward(&self, input: &Tensor) -> Result<Tensor> { 61 | let input_shape = input.shape(); 62 | let batch_dims = if input_shape.len() >= 2 { 63 | &input_shape[..input_shape.len() - 2] 64 | } else { 65 | &[] 66 | }; 67 | 68 | let broadcasted_weight = if !batch_dims.is_empty() { 69 | self.weight.broadcast_left(batch_dims)? 70 | } else { 71 | self.weight.clone() 72 | }; 73 | 74 | let output = input.matmul(&broadcasted_weight.transpose(-1, -2)?)?; 75 | 76 | if let Some(ref bias) = self.bias { 77 | Ok(output.add(bias)?)
78 | } else { 79 | Ok(output) 80 | } 81 | } 82 | 83 | pub fn parameters(&mut self) -> Vec<&mut Tensor> { 84 | let mut params = vec![]; 85 | params.push(&mut self.weight); 86 | if let Some(ref mut b) = self.bias { 87 | params.push(b); 88 | } 89 | params 90 | } 91 | 92 | pub fn weight(&self) -> &Tensor { 93 | &self.weight 94 | } 95 | 96 | pub fn bias(&self) -> Option<&Tensor> { 97 | self.bias.as_ref() 98 | } 99 | } 100 | 101 | #[cfg(test)] 102 | mod tests { 103 | use super::*; 104 | use maidenx_core::device::set_default_device; 105 | 106 | fn setup_device() { 107 | #[cfg(feature = "cuda")] 108 | set_default_device(Device::CUDA(0)); 109 | #[cfg(not(any(feature = "cuda")))] 110 | set_default_device(Device::CPU); 111 | } 112 | 113 | #[test] 114 | fn linear_forward() -> Result<()> { 115 | setup_device(); 116 | 117 | let linear = Linear::new(2, 3, true)?; 118 | 119 | let input = Tensor::new(vec![vec![1.0f32, 2.0], vec![3.0, 4.0]])?; 120 | let output = linear.forward(&input)?; 121 | 122 | assert_eq!(output.shape(), &[2, 3]); 123 | 124 | let output_vec = output.to_flatten_vec::<f32>()?; 125 | assert_eq!(output_vec.len(), 6); 126 | 127 | Ok(()) 128 | } 129 | 130 | #[test] 131 | fn linear_backward() -> Result<()> { 132 | setup_device(); 133 | 134 | let linear = Linear::new(2, 3, true)?; 135 | 136 | let mut input = Tensor::new(vec![vec![1.0f32, 2.0], vec![3.0, 4.0]])?; 137 | input.with_grad()?; 138 | let output = linear.forward(&input)?; 139 | let loss = output.sum_all()?; 140 | loss.backward()?; 141 | 142 | let input_grad = input.grad()?.expect("Input gradient should exist"); 143 | assert_eq!(input_grad.shape(), &[2, 2]); 144 | 145 | let weight_grad = linear.weight().grad()?.expect("Weight gradient should exist"); 146 | assert_eq!(weight_grad.shape(), &[3, 2]); 147 | 148 | if let Some(bias) = linear.bias() { 149 | let bias_grad = bias.grad()?.expect("Bias gradient should exist"); 150 | assert!(bias_grad.shape().is_empty()); 151 | } 152 | 153 | Ok(()) 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /book/src/README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | ## What is MaidenX? 4 | 5 | MaidenX is a Rust-based machine learning framework developed as part of the Maiden Engine project. It is designed with an educational focus, structured to mirror PyTorch's architecture to facilitate learning and understanding of ML framework implementations. The library prioritizes code readability, ensuring that anyone can easily understand and work with the codebase. 6 | 7 | ## Key Features 8 | 9 | - **Pure Rust Implementation**: Built entirely in Rust, providing memory safety, concurrency, and performance benefits. 10 | - **PyTorch-like API**: Familiar and intuitive API design for those coming from PyTorch. 11 | - **Multiple Backends**: Support for CPU, CUDA (NVIDIA GPUs), and MPS (Apple Silicon) computation. 12 | - **Automatic Differentiation**: Built-in autograd system for gradient-based optimization. 13 | - **Computational Graph**: Optional computational graph mode for deferred execution. 14 | - **Serialization**: Integration with Rust's serde framework for model saving and loading. 15 | - **Comprehensive Operations**: Rich set of tensor operations with autograd support. 16 | - **Neural Network Layers**: Ready-to-use implementations of common neural network components. 17 | 18 | ## Architecture 19 | 20 | MaidenX is organized into several core components: 21 | 22 | ### 1. Tensor System (`maidenx_tensor`) 23 | 24 | The tensor module provides the foundation for all numerical operations. Key features include: 25 | 26 | - Support for multiple data types (float32, float64, int32, etc.) 27 | - Comprehensive tensor operations (arithmetic, transformation, reduction) 28 | - Automatic broadcasting for compatible shapes (see the sketch below) 29 | - In-place and out-of-place operations 30 | - Efficient memory management and buffer handling 31 |
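As a quick illustration of broadcasting, the sketch below adds a 1-D tensor to each row of a 2-D tensor. This is a minimal sketch; it relies only on the `Tensor::new`, `add`, `shape`, and `to_flatten_vec` APIs that appear elsewhere in this book and in the test suite:

```rust
use maidenx::prelude::*;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let a = Tensor::new(vec![vec![1.0f32, 2.0, 3.0], vec![4.0, 5.0, 6.0]])?; // shape [2, 3]
    let b = Tensor::new(vec![10.0f32, 20.0, 30.0])?; // shape [3]

    // `b` is broadcast across the rows of `a`; no explicit tiling is required
    let c = a.add(&b)?;
    assert_eq!(c.shape(), &[2, 3]);
    assert_eq!(c.to_flatten_vec::<f32>()?, vec![11.0, 22.0, 33.0, 14.0, 25.0, 36.0]);

    Ok(())
}
```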
32 | ### 2. Neural Network Components (`maidenx_nn`) 33 | 34 | The neural network module offers building blocks for constructing machine learning models: 35 | 36 | - Common layers: Linear, Conv2d, LayerNorm, Dropout, Embedding 37 | - Activation functions: ReLU, Sigmoid, Tanh, GELU, Softmax, etc. 38 | - Loss functions: MSE, MAE, Huber, CrossEntropy 39 | - Optimizers: SGD, Adam 40 | 41 | ### 3. Backend System (`maidenx_core`) 42 | 43 | The core backend system provides device-specific implementations: 44 | 45 | - CPU backend for universal compatibility 46 | - CUDA backend for NVIDIA GPU acceleration 47 | - MPS backend for Apple Silicon GPU acceleration 48 | - Abstract device interface for a consistent API across backends 49 | 50 | ## Getting Started 51 | 52 | MaidenX organizes its functionality into separate features, allowing users to select only what they need: 53 | 54 | ### Default Features 55 | 56 | These are included by default and recommended for most use cases: 57 | 58 | - **nn**: Core neural network functionality 59 | - **serde**: Serialization/deserialization support 60 | - **graph**: Computational graph mode for deferred operations 61 | 62 | ### Optional Features 63 | 64 | - **cuda**: GPU acceleration support using NVIDIA CUDA 65 | - **mps**: Apple Metal Performance Shaders support for Apple Silicon 66 | 67 | ## Example Usage 68 | 69 | Here's a simple example of training a linear model with MaidenX: 70 | 71 | ```rust 72 | use maidenx::nn::*; 73 | use maidenx::prelude::*; 74 | use std::time::Instant; 75 | 76 | fn main() -> Result<(), Box<dyn std::error::Error>> { 77 | // Create input and target data 78 | let input_data: Vec<Vec<f32>> = (0..10000) 79 | .map(|i| vec![(i % 100) as f32 / 100.0, ((i % 100) + 1) as f32 / 100.0, ((i % 100) + 2) as f32 / 100.0]) 80 | .collect(); 81 | let target_data: Vec<Vec<f32>> = (0..10000).map(|i| vec![((i % 100) * 10) as f32 / 1000.0]).collect(); 82 | 83 | let mut input = Tensor::new(input_data)?; 84 | let target = Tensor::new(target_data)?; 85 | input.with_grad()?; 86 | 87 | // Create model, loss function, and optimizer 88 | let mut linear = Linear::new(3, 1, true)?; 89 | let mse_loss = MSE::new(); 90 | let mut optimizer = SGD::new(0.01); 91 | let epochs = 1000; 92 | 93 | // Training loop 94 | for epoch in 0..epochs { 95 | let pred = linear.forward(&input)?; 96 | let loss = mse_loss.forward((&pred, &target))?; 97 | loss.backward()?; 98 | 99 | optimizer.step(&mut linear.parameters())?; 100 | optimizer.zero_grad(&mut linear.parameters())?; 101 | 102 | if (epoch + 1) % 100 == 0 { 103 | println!("Epoch {}: Loss = {}", epoch + 1, loss); 104 | } 105 | } 106 | 107 | Ok(()) 108 | } 109 | ``` 110 |
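Once trained, a model's tensors can be persisted through the `serde` feature. The snippet below is a sketch, not a prescribed API: it assumes `Tensor` implements `Serialize`/`Deserialize` when the `serde` feature is enabled (as the layer definitions suggest) and uses the third-party `serde_json` crate for the encoding:

```rust
use maidenx::prelude::*;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let x = Tensor::new(vec![1.0f32, 2.0, 3.0])?;

    // Round-trip through JSON; any serde format (e.g. a binary one) works the same way
    let json = serde_json::to_string(&x)?;
    let y: Tensor = serde_json::from_str(&json)?;

    assert_eq!(y.to_flatten_vec::<f32>()?, vec![1.0, 2.0, 3.0]);
    Ok(())
}
```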
111 | ## Supported Operations and Layers 112 | 113 | MaidenX includes a comprehensive set of tensor operations and neural network layers, which we'll explore in more detail in the following chapters. 114 | -------------------------------------------------------------------------------- /crates/maidenx_mps/build.rs: -------------------------------------------------------------------------------- 1 | use std::env; 2 | use std::fs; 3 | use std::path::{Path, PathBuf}; 4 | use std::process::Command; 5 | 6 | fn main() { 7 | let manifest_dir = env::var("CARGO_MANIFEST_DIR").unwrap(); 8 | let base_dir = Path::new(&manifest_dir); 9 | 10 | // Define directories 11 | let kernel_dir = base_dir.join("kernels"); 12 | let output_dir = base_dir.join("build"); 13 | let ops_dir = output_dir.join("ops"); 14 | 15 | // Create output directories 16 | fs::create_dir_all(&output_dir).expect("Failed to create output directory"); 17 | fs::create_dir_all(&ops_dir).expect("Failed to create ops directory"); 18 | 19 | // Define output file 20 | let ops_lib = output_dir.join("ops.metallib"); 21 | 22 | // Utility files 23 | let utils = kernel_dir.join("metal_utils.metal"); 24 | let utils_air = output_dir.join("metal_utils.air"); 25 | 26 | // Compile utilities 27 | compile_metal_file(&utils, &utils_air, &kernel_dir); 28 | 29 | // Compile ops sources 30 | let ops_sources = vec![ 31 | "ops/binary.metal", 32 | "ops/matmul.metal", 33 | "ops/padding.metal", 34 | "ops/reduction.metal", 35 | "ops/unary.metal", 36 | ]; 37 | 38 | let mut ops_air_files = vec![utils_air.clone()]; 39 | for source in &ops_sources { 40 | let source_path = kernel_dir.join(source); 41 | let filename = Path::new(source).file_stem().unwrap().to_str().unwrap(); 42 | let air_path = ops_dir.join(format!("{}.air", filename)); 43 | compile_metal_file(&source_path, &air_path, &kernel_dir); 44 | ops_air_files.push(air_path); 45 | } 46 | 47 | create_metallib(&ops_air_files, &ops_lib); 48 | 49 | // Set cargo environment variables 50 | println!("cargo:rustc-env=MAIDENX_MPS_OPS_METALLIB_PATH={}", ops_lib.display()); 51 | 52 | // Conditionally compile nn if feature is enabled 53 | if env::var("CARGO_FEATURE_NN").is_ok() { 54 | let nn_dir = output_dir.join("nn"); 55 | let nn_activation_dir = nn_dir.join("activation"); 56 | 57 | fs::create_dir_all(&nn_dir).expect("Failed to create nn directory"); 58 | fs::create_dir_all(&nn_activation_dir).expect("Failed to create nn/activation directory"); 59 | 60 | let nn_lib = output_dir.join("nn.metallib"); 61 | 62 | let nn_sources = vec!["nn/conv.metal"]; 63 | let nn_activation_sources = vec!["nn/activation/softmax.metal"]; 64 | 65 | let mut nn_air_files = vec![utils_air]; 66 | for source in &nn_sources { 67 | let source_path = kernel_dir.join(source); 68 | let filename = Path::new(source).file_stem().unwrap().to_str().unwrap(); 69 | let air_path = nn_dir.join(format!("{}.air", filename)); 70 | compile_metal_file(&source_path, &air_path, &kernel_dir); 71 | nn_air_files.push(air_path); 72 | } 73 | 74 | for source in &nn_activation_sources { 75 | let source_path = kernel_dir.join(source); 76 | let filename = Path::new(source).file_stem().unwrap().to_str().unwrap(); 77 | let air_path = nn_activation_dir.join(format!("{}.air", filename)); 78 | compile_metal_file(&source_path, &air_path, &kernel_dir); 79 | nn_air_files.push(air_path); 80 | } 81 | 82 | create_metallib(&nn_air_files, &nn_lib); 83 | 84 | println!("cargo:rustc-env=MAIDENX_MPS_NN_METALLIB_PATH={}", nn_lib.display()); 85 | 86 | println!("cargo:rerun-if-changed=build/nn.metallib"); 87 | } 88 | 89 | println!("cargo:rerun-if-changed=kernels"); 90 | println!("cargo:rerun-if-changed=build/ops.metallib"); 91 | } 92 | 93 | fn compile_metal_file(source: &Path, output: &Path,
include_dir: &Path) { 94 | println!("Compiling: {} -> {}", source.display(), output.display()); 95 | 96 | let status = Command::new("xcrun") 97 | .args([ 98 | "-sdk", 99 | "macosx", 100 | "metal", 101 | "-I", 102 | include_dir.to_str().unwrap(), 103 | "-c", 104 | source.to_str().unwrap(), 105 | "-o", 106 | output.to_str().unwrap(), 107 | ]) 108 | .status() 109 | .expect("Failed to execute metal compiler"); 110 | 111 | if !status.success() { 112 | panic!("Failed to compile {}", source.display()); 113 | } 114 | } 115 | 116 | fn create_metallib(air_files: &[PathBuf], output: &Path) { 117 | println!("Creating metallib: {}", output.display()); 118 | 119 | let mut cmd = Command::new("xcrun"); 120 | cmd.args(["-sdk", "macosx", "metallib"]); 121 | 122 | for air_file in air_files { 123 | cmd.arg(air_file); 124 | } 125 | 126 | cmd.args(["-o", output.to_str().unwrap()]); 127 | 128 | let status = cmd.status().expect("Failed to execute metallib command"); 129 | 130 | if !status.success() { 131 | panic!("Failed to create metallib: {}", output.display()); 132 | } 133 | } 134 | -------------------------------------------------------------------------------- /crates/maidenx_nn/src/layers/dropout.rs: -------------------------------------------------------------------------------- 1 | use crate::layer::{Layer, LayerState}; 2 | use maidenx_core::{dtype::DType, error::Result}; 3 | use maidenx_tensor::{Tensor, TensorNode}; 4 | #[cfg(feature = "serde")] 5 | use serde::{Deserialize, Serialize}; 6 | 7 | #[derive(Layer, Clone)] 8 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] 9 | pub struct Dropout { 10 | p: f32, 11 | 12 | state: LayerState, 13 | } 14 | 15 | impl Dropout { 16 | pub fn new(p: f32) -> Result<Self> { 17 | Ok(Self { 18 | p, 19 | state: LayerState::new(), 20 | }) 21 | } 22 | 23 | pub fn forward(&self, input: &Tensor) -> Result<Tensor> { 24 | if self.state.is_training() { 25 | let random = Tensor::randn_like(input)?; 26 | let mut mask = random.gt_scalar(self.p)?; 27 | mask.with_dtype(DType::U8)?; 28 | let scale = 1.0 / (1.0 - self.p); // inverted dropout: surviving activations are scaled by 1/(1-p), so eval mode needs no rescaling 29 | let scaled_mask = mask.mul_scalar(scale)?; 30 | 31 | if input.requires_grad() { 32 | let input_clone = input.clone(); 33 | let scaled_mask_clone = scaled_mask.clone(); 34 | 35 | let mut output = input.mul(&scaled_mask)?; 36 | 37 | let backward_fn = Box::new(move |_inputs: &[Tensor], grad_out: &Tensor| -> Result<Vec<Tensor>> { 38 | let grad_input = grad_out.mul(&scaled_mask_clone)?; 39 | 40 | Ok(vec![grad_input]) 41 | }); 42 | 43 | let node = TensorNode::new("dropout".to_string(), vec![input_clone], Some(backward_fn)); 44 | output.set_node(node); 45 | 46 | Ok(output) 47 | } else { 48 | input.mul(&scaled_mask) 49 | } 50 | } else { 51 | Ok(input.clone()) 52 | } 53 | } 54 | } 55 | 56 | #[cfg(test)] 57 | mod tests { 58 | use super::*; 59 | use maidenx_core::device::{set_default_device, Device}; 60 | 61 | fn setup_device() { 62 | #[cfg(feature = "cuda")] 63 | set_default_device(Device::CUDA(0)); 64 | #[cfg(not(any(feature = "cuda")))] 65 | set_default_device(Device::CPU); 66 | } 67 | 68 | #[test] 69 | fn forward_with_training() -> Result<()> { 70 | setup_device(); 71 | let mut dropout = Dropout::new(0.5)?; 72 | dropout.train(); 73 | 74 | let input = Tensor::new(vec![vec![1.0f32; 100]; 10])?; 75 | let output = dropout.forward(&input)?; 76 | 77 | assert_eq!(output.shape(), &[10, 100]); 78 | 79 | let output_vec = output.to_flatten_vec::<f32>()?; 80 | 81 | for &val in &output_vec { 82 | assert!(val == 0.0 || (val - 2.0).abs() < 1e-5); 83 | } 84 | 85 | let zeros_count =
output_vec.iter().filter(|&&x| x == 0.0).count(); 86 | let non_zeros_count = output_vec.iter().filter(|&&x| (x - 2.0).abs() < 1e-5).count(); 87 | 88 | assert!(zeros_count > 0, "There should be at least some zeros"); 89 | assert!(non_zeros_count > 0, "There should be at least some non-zero values"); 90 | assert_eq!( 91 | zeros_count + non_zeros_count, 92 | 1000, 93 | "All values should be either 0 or 2.0" 94 | ); 95 | 96 | Ok(()) 97 | } 98 | 99 | #[test] 100 | fn forward_with_eval() -> Result<()> { 101 | setup_device(); 102 | 103 | let mut dropout = Dropout::new(0.5)?; 104 | dropout.eval(); 105 | 106 | let input = Tensor::new(vec![vec![1.0f32, 2.0], vec![3.0, 4.0]])?; 107 | let output = dropout.forward(&input)?; 108 | 109 | assert_eq!(output.shape(), &[2, 2]); 110 | let output_vec = output.to_flatten_vec::<f32>()?; 111 | let input_vec = input.to_flatten_vec::<f32>()?; 112 | 113 | for i in 0..output_vec.len() { 114 | assert!((output_vec[i] - input_vec[i]).abs() < 1e-5); 115 | } 116 | 117 | Ok(()) 118 | } 119 | 120 | #[test] 121 | fn backward() -> Result<()> { 122 | setup_device(); 123 | 124 | let mut dropout = Dropout::new(0.5)?; 125 | dropout.train(); 126 | 127 | let mut input = Tensor::new(vec![vec![1.0f32; 10]; 10])?; 128 | input.with_grad()?; 129 | 130 | let output = dropout.forward(&input)?; 131 | let loss = output.sum_all()?; 132 | loss.backward()?; 133 | 134 | let input_grad = input.grad()?.expect("Input gradient should exist"); 135 | assert_eq!(input_grad.shape(), &[10, 10]); 136 | 137 | let grad_vec = input_grad.to_flatten_vec::<f32>()?; 138 | let output_vec = output.to_flatten_vec::<f32>()?; 139 | 140 | for i in 0..grad_vec.len() { 141 | if output_vec[i] == 0.0 { 142 | assert!((grad_vec[i]).abs() < 1e-5); 143 | } else { 144 | assert!((grad_vec[i] - 2.0).abs() < 1e-5); 145 | } 146 | } 147 | 148 | Ok(()) 149 | } 150 | } 151 | -------------------------------------------------------------------------------- /book/src/guide/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | MaidenX is a Rust machine learning framework that's available through [crates.io](https://crates.io/crates/maidenx). This guide will walk you through the installation process, including setting up optional hardware acceleration features. 4 | 5 | ## Basic Installation 6 | 7 | To add MaidenX to your Rust project, add it as a dependency in your `Cargo.toml` file: 8 | 9 | ```toml 10 | [dependencies] 11 | maidenx = "*" 12 | ``` 13 | 14 | This will include the default features (`nn`, `serde`, and `graph`), which are suitable for most use cases.
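If you want a leaner build, you can opt out of the defaults and re-enable only the features you need. This sketch uses standard Cargo syntax with the feature names documented below:

```toml
[dependencies]
# disable the default `nn`/`serde`/`graph` set, keep only the nn components
maidenx = { version = "*", default-features = false, features = ["nn"] }
```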
15 | 16 | ## Feature Configuration 17 | 18 | MaidenX provides several optional features that you can enable based on your needs: 19 | 20 | ### Default Features 21 | 22 | These are included automatically and provide core functionality: 23 | 24 | | Feature | Description | 25 | |---------|-------------| 26 | | `nn` | Neural network components (layers, optimizers, activations) | 27 | | `serde` | Serialization/deserialization for saving and loading models | 28 | | `graph` | Computational graph for deferred tensor operations | 29 | 30 | ### Hardware Acceleration 31 | 32 | For improved performance, you can enable hardware-specific backends: 33 | 34 | | Feature | Description | Requirements | 35 | |---------|-------------|--------------| 36 | | `cuda` | NVIDIA GPU acceleration | NVIDIA GPU, CUDA toolkit | 37 | | `mps` | Apple Silicon GPU acceleration | Apple Silicon Mac | 38 | 39 | To enable specific features, modify your dependency in `Cargo.toml`: 40 | 41 | ```toml 42 | [dependencies] 43 | maidenx = { version = "*", features = ["cuda"] } # For NVIDIA GPU support 44 | ``` 45 | 46 | Or: 47 | 48 | ```toml 49 | [dependencies] 50 | maidenx = { version = "*", features = ["mps"] } # For Apple Silicon GPU support 51 | ``` 52 | 53 | ## Hardware-Specific Setup 54 | 55 | ### CUDA Backend (NVIDIA GPUs) 56 | 57 | To use the CUDA backend: 58 | 59 | 1. Install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) (compatible with your NVIDIA GPU) 60 | 2. Ensure your system's PATH includes the CUDA binaries 61 | 3. Enable the `cuda` feature in your Cargo.toml 62 | 63 | ### MPS Backend (Apple Silicon) 64 | 65 | To use the Metal Performance Shaders backend: 66 | 67 | 1. Ensure you're using macOS on Apple Silicon hardware (M1/M2/M3) 68 | 2. Have Xcode and the Command Line Tools installed 69 | 3. Enable the `mps` feature in your Cargo.toml 70 | 71 | ## Setting Default Device and Data Type 72 | 73 | MaidenX allows you to configure the global default device and data type for tensor operations: 74 | 75 | ```rust 76 | use maidenx::prelude::*; 77 | 78 | fn main() -> Result<(), Box<dyn std::error::Error>> { 79 | // Check current default device and dtype 80 | println!("Default device: {:?}", get_default_device()); 81 | println!("Default dtype: {:?}", get_default_dtype()); 82 | 83 | // Set new defaults 84 | set_default_device(Device::CPU); 85 | set_default_dtype(DType::F32); 86 | 87 | // Create a tensor using the defaults 88 | let tensor = Tensor::ones(&[2, 3])?; 89 | println!("Device: {:?}, dtype: {:?}", tensor.device(), tensor.dtype()); 90 | 91 | // Automatic device selection based on available hardware 92 | auto_set_device(); 93 | println!("Auto-selected device: {:?}", get_default_device()); 94 | 95 | Ok(()) 96 | } 97 | ``` 98 | 99 | The `auto_set_device()` function will select the best available device in this order (an explicit compile-time equivalent is sketched after the list): 100 | 1. CUDA if available and the `cuda` feature is enabled 101 | 2. MPS if available and the `mps` feature is enabled 102 | 3. CPU as fallback 103 |
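If you prefer to resolve the device at compile time instead of calling `auto_set_device()` at runtime, the same priority order can be written out with `cfg` attributes, mirroring the `setup_device()` helper used in MaidenX's own tests:

```rust
use maidenx::prelude::*;

fn select_device() {
    // Same priority as auto_set_device(): CUDA first, then MPS, then CPU
    #[cfg(feature = "cuda")]
    set_default_device(Device::CUDA(0));
    #[cfg(all(feature = "mps", not(feature = "cuda")))]
    set_default_device(Device::MPS);
    #[cfg(not(any(feature = "cuda", feature = "mps")))]
    set_default_device(Device::CPU);
}
```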
104 | ## Verifying Installation 105 | 106 | To verify that MaidenX is correctly installed and configured, you can run a simple example: 107 | 108 | ```rust 109 | use maidenx::prelude::*; 110 | 111 | fn main() -> Result<(), Box<dyn std::error::Error>> { 112 | // Create a simple tensor 113 | let tensor = Tensor::ones(&[2, 3])?; 114 | println!("Tensor shape: {:?}", tensor.shape()); 115 | println!("Tensor device: {:?}", tensor.device()); 116 | 117 | Ok(()) 118 | } 119 | ``` 120 | 121 | If you've enabled hardware acceleration, you can explicitly create tensors on specific devices: 122 | 123 | ```rust 124 | use maidenx::prelude::*; 125 | 126 | fn main() -> Result<(), Box<dyn std::error::Error>> { 127 | // Create tensors on different devices 128 | let cpu_tensor = Tensor::ones(&[2, 3])?.to_device(Device::CPU)?; 129 | 130 | #[cfg(feature = "cuda")] 131 | let cuda_tensor = Tensor::ones(&[2, 3])?.to_device(Device::CUDA(0))?; 132 | 133 | #[cfg(feature = "mps")] 134 | let mps_tensor = Tensor::ones(&[2, 3])?.to_device(Device::MPS)?; 135 | 136 | println!("CPU Tensor: {:?}", cpu_tensor); 137 | 138 | Ok(()) 139 | } 140 | ``` 141 | 142 | ## Next Steps 143 | 144 | Once you've successfully installed MaidenX, you're ready to start creating and manipulating tensors. Continue to the [Creating Tensors](./create-tensors.md) guide to learn the basics of working with MaidenX's tensor system. -------------------------------------------------------------------------------- /crates/maidenx_tensor/tests/wt.rs: -------------------------------------------------------------------------------- 1 | mod utils; 2 | 3 | use maidenx_core::{ 4 | device::{auto_set_device, Device}, 5 | dtype::DType, 6 | error::Result, 7 | }; 8 | use utils::{setup_tensor, setup_tensor_without_dtype}; 9 | 10 | #[test] 11 | fn with_shape() -> Result<()> { 12 | auto_set_device(); 13 | 14 | let mut x = setup_tensor_without_dtype(vec![1, 2, 3, 4, 5, 6])?; 15 | 16 | // Test reshaping to 2x3 17 | x.with_shape(&[2, 3])?; 18 | assert_eq!(x.shape(), &[2, 3]); 19 | assert_eq!(x.to_flatten_vec::<i32>()?, vec![1, 2, 3, 4, 5, 6]); 20 | 21 | // Test reshaping to 3x2 22 | x.with_shape(&[3, 2])?; 23 | assert_eq!(x.shape(), &[3, 2]); 24 | assert_eq!(x.to_flatten_vec::<i32>()?, vec![1, 2, 3, 4, 5, 6]); 25 | 26 | // Test with invalid shape (should fail) 27 | let result = x.with_shape(&[2, 2]); 28 | assert!(result.is_err()); 29 | 30 | Ok(()) 31 | } 32 | 33 | #[test] 34 | fn to_shape() -> Result<()> { 35 | auto_set_device(); 36 | 37 | let x = setup_tensor_without_dtype(vec![1, 2, 3, 4, 5, 6])?; 38 | 39 | // Test to_shape to 2x3 40 | let y = x.to_shape(&[2, 3])?; 41 | assert_eq!(y.shape(), &[2, 3]); 42 | assert_eq!(y.to_flatten_vec::<i32>()?, vec![1, 2, 3, 4, 5, 6]); 43 | 44 | // Original tensor should remain unchanged 45 | assert_eq!(x.shape(), &[6]); 46 | 47 | // Test with invalid shape (should fail) 48 | let result = x.to_shape(&[2, 2]); 49 | assert!(result.is_err()); 50 | 51 | Ok(()) 52 | } 53 | 54 | #[test] 55 | fn with_device() -> Result<()> { 56 | auto_set_device(); 57 | 58 | let mut x = setup_tensor_without_dtype(vec![1, 2, 3, 4])?; 59 | let original_device = x.device(); 60 | 61 | // Test with same device (no-op) 62 | x.with_device(original_device)?; 63 | assert_eq!(x.device(), original_device); 64 | assert_eq!(x.to_flatten_vec::<i32>()?, vec![1, 2, 3, 4]); 65 | 66 | // Test with CPU device (always available) 67 | x.with_device(Device::CPU)?; 68 | assert_eq!(x.device(), Device::CPU); 69 | assert_eq!(x.to_flatten_vec::<i32>()?, vec![1, 2, 3, 4]); 70 | 71 | Ok(()) 72 | } 73 | 74 | #[test] 75 | fn to_device() ->
Result<()> { 76 | auto_set_device(); 77 | 78 | let x = setup_tensor_without_dtype(vec![1, 2, 3, 4])?; 79 | let original_device = x.device(); 80 | 81 | // Test to_device to CPU 82 | let y = x.to_device(Device::CPU)?; 83 | assert_eq!(y.device(), Device::CPU); 84 | assert_eq!(y.to_flatten_vec::<i32>()?, vec![1, 2, 3, 4]); 85 | 86 | // Original tensor should remain unchanged 87 | assert_eq!(x.device(), original_device); 88 | 89 | Ok(()) 90 | } 91 | 92 | #[test] 93 | fn with_dtype() -> Result<()> { 94 | auto_set_device(); 95 | 96 | let mut x = setup_tensor_without_dtype(vec![1, 2, 3, 4])?; 97 | let original_dtype = x.dtype(); 98 | 99 | // Test conversion to F32 dtype 100 | x.with_dtype(DType::F32)?; 101 | assert_eq!(x.dtype(), DType::F32); 102 | assert_eq!(x.to_flatten_vec::<f32>()?, vec![1.0, 2.0, 3.0, 4.0]); 103 | 104 | // Test conversion back to original dtype 105 | x.with_dtype(original_dtype)?; 106 | assert_eq!(x.dtype(), original_dtype); 107 | 108 | // Test invalid dtype conversion for MPS (conditionally compiled) 109 | #[cfg(feature = "mps")] 110 | if x.device() == Device::MPS { 111 | let result = x.with_dtype(DType::F64); 112 | assert!(result.is_err()); 113 | } 114 | 115 | Ok(()) 116 | } 117 | 118 | #[test] 119 | fn to_dtype() -> Result<()> { 120 | auto_set_device(); 121 | 122 | let x = setup_tensor_without_dtype(vec![1, 2, 3, 4])?; 123 | let original_dtype = x.dtype(); 124 | 125 | // Test to_dtype to F32 126 | let y = x.to_dtype(DType::F32)?; 127 | assert_eq!(y.dtype(), DType::F32); 128 | assert_eq!(y.to_flatten_vec::<f32>()?, vec![1.0, 2.0, 3.0, 4.0]); 129 | 130 | // Original tensor should remain unchanged 131 | assert_eq!(x.dtype(), original_dtype); 132 | 133 | // Test invalid dtype conversion for MPS (conditionally compiled) 134 | #[cfg(feature = "mps")] 135 | if x.device() == Device::MPS { 136 | let result = x.to_dtype(DType::F64); 137 | assert!(result.is_err()); 138 | } 139 | 140 | Ok(()) 141 | } 142 | 143 | #[test] 144 | fn with_grad() -> Result<()> { 145 | auto_set_device(); 146 | 147 | // Float tensor (should work) 148 | let mut x = setup_tensor(vec![1.0, 2.0, 3.0, 4.0], DType::F32)?; 149 | 150 | // Should not have grad initially 151 | assert!(!x.requires_grad()); 152 | assert!(x.grad()?.is_none()); 153 | 154 | // Enable grad 155 | x.with_grad()?; 156 | assert!(x.requires_grad()); 157 | assert!(x.grad()?.is_some()); 158 | 159 | // Check grad tensor has same shape and initialized to zeros 160 | let grad = x.grad()?.unwrap(); 161 | assert_eq!(grad.shape(), x.shape()); 162 | assert_eq!(grad.to_flatten_vec::<f32>()?, vec![0.0, 0.0, 0.0, 0.0]); 163 | 164 | // Non-float tensor (should fail) 165 | let mut y = setup_tensor(vec![1, 2, 3, 4], DType::I32)?; 166 | let result = y.with_grad(); 167 | assert!(result.is_err()); 168 | 169 | Ok(()) 170 | } 171 | -------------------------------------------------------------------------------- /crates/maidenx_tensor_v2/src/ops/unary.rs: -------------------------------------------------------------------------------- 1 | use crate::Tensor; 2 | use maidenx_core::scalar::Scalar; 3 | 4 | impl Tensor { 5 | pub fn neg(&self) -> Self { 6 | self.try_neg().expect("failed to negate tensor") 7 | } 8 | 9 | pub fn abs(&self) -> Self { 10 | self.try_abs().expect("failed to compute absolute value") 11 | } 12 | 13 | pub fn sign(&self) -> Self { 14 | self.try_sign().expect("failed to compute sign") 15 | } 16 | 17 | pub fn square(&self) -> Self { 18 | self.try_square().expect("failed to compute square") 19 | } 20 | 21 | pub fn sqrt(&self) -> Self { 22 | self.try_sqrt().expect("failed
to compute sqrt") 23 | } 24 | 25 | pub fn relu(&self) -> Self { 26 | self.try_relu().expect("failed to compute relu") 27 | } 28 | 29 | pub fn sigmoid(&self) -> Self { 30 | self.try_sigmoid().expect("failed to compute sigmoid") 31 | } 32 | 33 | pub fn tanh(&self) -> Self { 34 | self.try_tanh().expect("failed to compute tanh") 35 | } 36 | 37 | pub fn gelu(&self) -> Self { 38 | self.try_gelu().expect("failed to compute gelu") 39 | } 40 | 41 | pub fn sin(&self) -> Self { 42 | self.try_sin().expect("failed to compute sin") 43 | } 44 | 45 | pub fn cos(&self) -> Self { 46 | self.try_cos().expect("failed to compute cos") 47 | } 48 | 49 | pub fn tan(&self) -> Self { 50 | self.try_tan().expect("failed to compute tan") 51 | } 52 | 53 | pub fn ln(&self) -> Self { 54 | self.try_ln().expect("failed to compute ln") 55 | } 56 | 57 | pub fn log(&self) -> Self { 58 | self.try_log().expect("failed to compute log") 59 | } 60 | 61 | pub fn log10(&self) -> Self { 62 | self.try_log10().expect("failed to compute log10") 63 | } 64 | 65 | pub fn log2(&self) -> Self { 66 | self.try_log2().expect("failed to compute log2") 67 | } 68 | 69 | pub fn exp(&self) -> Self { 70 | self.try_exp().expect("failed to compute exp") 71 | } 72 | 73 | pub fn exp10(&self) -> Self { 74 | self.try_exp10().expect("failed to compute exp10") 75 | } 76 | 77 | pub fn exp2(&self) -> Self { 78 | self.try_exp2().expect("failed to compute exp2") 79 | } 80 | 81 | pub fn softplus(&self) -> Self { 82 | self.try_softplus().expect("failed to compute softplus") 83 | } 84 | 85 | pub fn recip(&self) -> Self { 86 | self.try_recip().expect("failed to compute recip") 87 | } 88 | 89 | pub fn logical_not(&self) -> Self { 90 | self.try_logical_not().expect("failed to compute logical_not") 91 | } 92 | 93 | pub fn add_scalar>(&self, scalar: T) -> Self { 94 | self.try_add_scalar(scalar).expect("failed to add scalar") 95 | } 96 | 97 | pub fn sub_scalar>(&self, scalar: T) -> Self { 98 | self.try_sub_scalar(scalar).expect("failed to subtract scalar") 99 | } 100 | 101 | pub fn mul_scalar>(&self, scalar: T) -> Self { 102 | self.try_mul_scalar(scalar).expect("failed to multiply by scalar") 103 | } 104 | 105 | pub fn div_scalar>(&self, scalar: T) -> Self { 106 | self.try_div_scalar(scalar).expect("failed to divide by scalar") 107 | } 108 | 109 | pub fn maximum_scalar>(&self, scalar: T) -> Self { 110 | self.try_maximum_scalar(scalar) 111 | .expect("failed to compute maximum_scalar") 112 | } 113 | 114 | pub fn minimum_scalar>(&self, scalar: T) -> Self { 115 | self.try_minimum_scalar(scalar) 116 | .expect("failed to compute minimum_scalar") 117 | } 118 | 119 | pub fn pow>(&self, exponent: T) -> Self { 120 | self.try_pow(exponent).expect("failed to compute pow") 121 | } 122 | 123 | pub fn leaky_relu>(&self, alpha: T) -> Self { 124 | self.try_leaky_relu(alpha).expect("failed to compute leaky_relu") 125 | } 126 | 127 | pub fn elu>(&self, alpha: T) -> Self { 128 | self.try_elu(alpha).expect("failed to compute elu") 129 | } 130 | 131 | pub fn eq_scalar>(&self, scalar: T) -> Self { 132 | self.try_eq_scalar(scalar).expect("failed to compute eq_scalar") 133 | } 134 | 135 | pub fn ne_scalar>(&self, scalar: T) -> Self { 136 | self.try_ne_scalar(scalar).expect("failed to compute ne_scalar") 137 | } 138 | 139 | pub fn lt_scalar>(&self, scalar: T) -> Self { 140 | self.try_lt_scalar(scalar).expect("failed to compute lt_scalar") 141 | } 142 | 143 | pub fn le_scalar>(&self, scalar: T) -> Self { 144 | self.try_le_scalar(scalar).expect("failed to compute le_scalar") 145 | } 146 | 147 | pub 
fn gt_scalar<T: Into<Scalar>>(&self, scalar: T) -> Self { 148 | self.try_gt_scalar(scalar).expect("failed to compute gt_scalar") 149 | } 150 | 151 | pub fn ge_scalar<T: Into<Scalar>>(&self, scalar: T) -> Self { 152 | self.try_ge_scalar(scalar).expect("failed to compute ge_scalar") 153 | } 154 | } 155 | -------------------------------------------------------------------------------- /book/src/nn/convolution.md: -------------------------------------------------------------------------------- 1 | # Convolution Layer 2 | 3 | The Convolution layer applies a convolution operation to the input data. It's particularly effective for processing grid-structured data such as images. 4 | 5 | ## Conv2d 6 | 7 | The 2D convolution layer operates on 4D tensors with the shape \[batch_size, channels, height, width\]. 8 | 9 | ### Definition 10 | 11 | ```rust 12 | pub struct Conv2d { 13 | weight: Tensor, 14 | bias: Option<Tensor>, 15 | kernel_size: (usize, usize), 16 | stride: (usize, usize), 17 | padding: (usize, usize), 18 | state: LayerState, 19 | } 20 | ``` 21 | 22 | ### Constructor 23 | 24 | ```rust 25 | pub fn new( 26 | in_channels: usize, 27 | out_channels: usize, 28 | kernel_size: (usize, usize), 29 | stride: (usize, usize), 30 | padding: (usize, usize), 31 | with_bias: bool 32 | ) -> Result<Self> 33 | ``` 34 | 35 | Creates a new 2D convolution layer with the specified parameters. 36 | 37 | **Parameters**: 38 | - `in_channels`: Number of input channels 39 | - `out_channels`: Number of output channels 40 | - `kernel_size`: Size of the convolving kernel as (height, width) 41 | - `stride`: Stride of the convolution as (height, width) 42 | - `padding`: Zero-padding added to both sides of the input as (height, width) 43 | - `with_bias`: Whether to include a bias term 44 | 45 | **Example**: 46 | ```rust 47 | let conv = Conv2d::new(3, 64, (3, 3), (1, 1), (1, 1), true)?; 48 | ``` 49 | 50 | For more control over the initialization, you can use the extended constructor: 51 | 52 | ```rust 53 | pub fn new_with_spec( 54 | in_channels: usize, 55 | out_channels: usize, 56 | kernel_size: (usize, usize), 57 | stride: (usize, usize), 58 | padding: (usize, usize), 59 | with_bias: bool, 60 | device: Device, 61 | dtype: DType 62 | ) -> Result<Self> 63 | ``` 64 | 65 | **Additional Parameters**: 66 | - `device`: The device to place the layer's parameters on (CPU, CUDA, or MPS) 67 | - `dtype`: The data type for the layer's parameters 68 | 69 | **Example**: 70 | ```rust 71 | let conv = Conv2d::new_with_spec( 72 | 3, 73 | 64, 74 | (3, 3), 75 | (1, 1), 76 | (1, 1), 77 | true, 78 | Device::CUDA(0), 79 | DType::F32 80 | )?; 81 | ``` 82 | 83 | ### Forward Pass 84 | 85 | ```rust 86 | pub fn forward(&self, input: &Tensor) -> Result<Tensor> 87 | ``` 88 | 89 | Applies the convolution operation to the input tensor. 90 | 91 | **Parameters**: 92 | - `input`: The input tensor with shape \[batch_size, in_channels, height, width\] 93 | 94 | **Returns**: Output tensor with shape \[batch_size, out_channels, output_height, output_width\] 95 | 96 | **Example**: 97 | ```rust 98 | let input = Tensor::new(vec![/* values */])?.reshape(&[1, 3, 32, 32])?; 99 | let conv = Conv2d::new(3, 64, (3, 3), (1, 1), (1, 1), true)?; 100 | let output = conv.forward(&input)?; // Shape: [1, 64, 32, 32] 101 | ``` 102 | 103 | ### Parameter Access 104 | 105 | ```rust 106 | pub fn weight(&self) -> &Tensor 107 | pub fn bias(&self) -> Option<&Tensor> 108 | ``` 109 | 110 | Provides access to the layer's weight and bias parameters. 111 | 112 | **Example**: 113 | ```rust 114 | let conv = Conv2d::new(3, 64, (3, 3), (1, 1), (1, 1), true)?; 115 | let weight = conv.weight(); // Shape: [64, 3, 3, 3] 116 | let bias = conv.bias().unwrap(); // Shape: [64] 117 | ``` 118 | 119 | ### Layer Implementation 120 | 121 | The Conv2d layer implements the `Layer` trait, providing methods for parameter collection and training state management: 122 | 123 | ```rust 124 | pub fn parameters(&mut self) -> Vec<&mut Tensor> 125 | ``` 126 | 127 | Returns all trainable parameters of the layer (weight and bias if present). 128 | 129 | ## Output Dimensions 130 | 131 | For given input dimensions, the output dimensions of the convolution are computed as: 132 | 133 | ``` 134 | output_height = (input_height + 2 * padding.0 - kernel_size.0) / stride.0 + 1 135 | output_width = (input_width + 2 * padding.1 - kernel_size.1) / stride.1 + 1 136 | ``` 137 |
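For example, a 32x32 input with a (3, 3) kernel, (2, 2) stride, and (1, 1) padding is downsampled to 16x16 (the division is integer division):

```
output_height = (32 + 2 * 1 - 3) / 2 + 1 = 31 / 2 + 1 = 16
output_width  = (32 + 2 * 1 - 3) / 2 + 1 = 31 / 2 + 1 = 16
```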
138 | ## Implementation Details 139 | 140 | The MaidenX Conv2d implementation uses the im2col algorithm for efficient computation: 141 | 142 | 1. The input tensor is transformed into a matrix where each column contains the values in a sliding window 143 | 2. Matrix multiplication is performed between this transformed matrix and the flattened kernel weights 144 | 3. The result is reshaped back to the expected output dimensions 145 | 146 | This approach allows leveraging optimized matrix multiplication operations for convolution. 147 | 148 | ## Common Configurations 149 | 150 | Here are some common Conv2d configurations: 151 | 152 | ### Basic Convolution (Same Padding) 153 | 154 | ```rust 155 | // Maintains spatial dimensions 156 | let conv = Conv2d::new(in_channels, out_channels, (3, 3), (1, 1), (1, 1), true)?; 157 | ``` 158 | 159 | ### Strided Convolution (Downsampling) 160 | 161 | ```rust 162 | // Reduces spatial dimensions by half 163 | let conv = Conv2d::new(in_channels, out_channels, (3, 3), (2, 2), (1, 1), true)?; 164 | ``` 165 | 166 | ### 1x1 Convolution (Channel Mixing) 167 | 168 | ```rust 169 | // Changes channel dimensions only 170 | let conv = Conv2d::new(in_channels, out_channels, (1, 1), (1, 1), (0, 0), true)?; 171 | ``` 172 | --------------------------------------------------------------------------------