├── docs ├── dev_guide │ ├── dev_guide.md │ ├── auto_diff │ │ ├── backward.md │ │ ├── difftensor.md │ │ └── introduction.md │ ├── pointer │ │ └── pointer.md │ ├── adding_new_op.md │ ├── test_rules.md │ ├── new_type.md │ └── iterator │ │ └── iterator.md ├── benchmarks │ └── benchmarks.md └── user_guide │ ├── user_guide.md │ ├── utils │ ├── set_seed.md │ ├── num_threads.md │ └── set_display_precision.md │ ├── custom_allocator │ └── custom_allocator.md │ ├── unary │ ├── acos.md │ ├── asin.md │ ├── atan.md │ ├── sin.md │ ├── sinh.md │ ├── cos.md │ ├── cosh.md │ ├── tan.md │ ├── tanh.md │ ├── abs.md │ ├── acosh.md │ ├── asinh.md │ ├── atanh.md │ ├── exp2.md │ ├── exp.md │ ├── sqrt.md │ ├── recip.md │ ├── log10.md │ ├── softplus.md │ ├── mish.md │ ├── cbrt.md │ ├── ln.md │ ├── log2.md │ ├── softsign.md │ ├── sigmoid.md │ ├── exp10.md │ ├── erf.md │ ├── sincos.md │ ├── gelu.md │ ├── cosh_.md │ ├── asin_.md │ ├── elu.md │ ├── hard_sigmoid.md │ ├── hard_swish.md │ ├── acos_.md │ ├── atan_.md │ ├── cos_.md │ ├── sin_.md │ ├── sinh_.md │ ├── tan_.md │ ├── ln_.md │ ├── tanh_.md │ ├── asinh_.md │ ├── abs_.md │ ├── acosh_.md │ ├── atanh_.md │ ├── celu.md │ ├── sqrt_.md │ ├── cbrt_.md │ ├── exp2_.md │ ├── exp_.md │ ├── log2_.md │ ├── log10_.md │ ├── recip_.md │ ├── mish_.md │ ├── softplus_.md │ ├── sigmoid_.md │ ├── softsign_.md │ ├── selu.md │ ├── erf_.md │ ├── exp10_.md │ ├── gelu_.md │ ├── elu_.md │ ├── hard_swish_.md │ ├── hard_sigmoid_.md │ └── celu_.md │ ├── random │ ├── randn_like.md │ ├── randn.md │ ├── rand_like.md │ ├── rand.md │ ├── beta.md │ └── exponential.md │ ├── associated_methods │ ├── cpu │ │ ├── to_cuda.md │ │ └── forget_copy.md │ └── cuda │ │ ├── to_cpu.md │ │ └── forget_copy.md │ ├── binary │ ├── add.md │ ├── div.md │ ├── mul.md │ ├── rem.md │ ├── sub.md │ ├── add_.md │ ├── div_.md │ ├── mul_.md │ ├── rem_.md │ ├── sub_.md │ └── pow_.md │ ├── custom_type │ └── custom_type.md │ ├── cmp │ ├── tensor_eq.md │ ├── tensor_gt.md │ ├── tensor_lt.md │ ├── 
tensor_neq.md │ ├── tensor_ge.md │ └── tensor_le.md │ ├── iterator │ ├── collect.md │ ├── par_iter_mut.md │ ├── par_iter.md │ └── strided_map.md │ ├── shape_manipulate │ └── flipud.md │ ├── creation │ ├── identity.md │ ├── arange.md │ ├── ones_like.md │ └── zeros_like.md │ └── windows │ ├── hamming_window.md │ └── hann_window.md ├── hpt-dyn ├── src │ ├── ops │ │ ├── tensor │ │ │ ├── cmp.rs │ │ │ └── conv2d │ │ │ │ └── type_kernels │ │ │ │ ├── i8_microkernels.rs │ │ │ │ ├── u8_microkernels.rs │ │ │ │ ├── bool_microkernels.rs │ │ │ │ ├── f32_microkernels.rs │ │ │ │ ├── f64_microkernels.rs │ │ │ │ ├── i16_microkernels.rs │ │ │ │ ├── i32_microkernels.rs │ │ │ │ ├── i64_microkernels.rs │ │ │ │ ├── u16_microkernels.rs │ │ │ │ ├── u32_microkernels.rs │ │ │ │ ├── u64_microkernels.rs │ │ │ │ ├── isize_microkernels.rs │ │ │ │ ├── usize_microkernels.rs │ │ │ │ ├── complex32_microkernels.rs │ │ │ │ └── complex64_microkernels.rs │ │ ├── common │ │ │ ├── traits.rs │ │ │ └── mod.rs │ │ ├── mod.rs │ │ └── models │ │ │ └── onnx.rs │ ├── utils │ │ ├── threadpool.rs │ │ ├── onnx │ │ │ ├── layout_sense.rs │ │ │ ├── load_model.rs │ │ │ ├── plot.rs │ │ │ └── parse_args │ │ │ │ ├── squeeze.rs │ │ │ │ ├── affine_grid.rs │ │ │ │ └── parse.rs │ │ └── mod.rs │ └── lib.rs └── build.rs ├── hpt-macros ├── src │ └── save_derive.rs ├── .gitignore └── Cargo.toml ├── hpt-bench ├── src │ └── main.rs ├── benches │ └── benchmarks │ │ └── broadcast │ │ └── broadcast.rs └── scan_benchmarks_result.py ├── hpt-tests ├── src │ ├── hpt_common │ │ ├── slice.rs │ │ ├── err_handler.rs │ │ ├── pointer.rs │ │ └── shape.rs │ ├── hpt │ │ ├── cpu │ │ │ ├── utils.rs │ │ │ ├── assert_utils.rs │ │ │ └── from_raw.rs │ │ └── cuda │ │ │ └── from_raw.rs │ ├── utils │ │ └── random_utils.rs │ ├── macro_tests │ │ ├── stmt_item │ │ │ └── item.rs │ │ └── control_flows │ │ │ ├── for_loop.expanded.rs │ │ │ └── while_loop.expanded.rs │ └── hpt_types │ │ └── test_display.rs └── .gitignore ├── hpt-types ├── src │ ├── dyn_dispatch │ 
│ └── vector.rs │ ├── vectors │ │ ├── .DS_Store │ │ └── arch_simd │ │ │ └── _128bit │ │ │ └── sse │ │ │ └── boolx16.rs │ ├── into_scalar.rs │ └── into_vec.rs ├── .gitignore └── Cargo.toml ├── hpt-cudakernels ├── src │ ├── utils │ │ ├── normalout.cuh │ │ ├── check_type.cuh │ │ ├── type_alias.cuh │ │ ├── loop_progress.cuh │ │ ├── promotion │ │ │ └── promotes.cuh │ │ └── extra_vecs.cuh │ ├── lib.rs │ ├── reginfo.rs │ ├── pooling │ │ └── pooling_template.cuh │ ├── reduce │ │ ├── sum.cu │ │ ├── prod.cu │ │ ├── all.cu │ │ ├── any.cu │ │ ├── max.cu │ │ ├── min.cu │ │ ├── nansum.cu │ │ ├── mean.cu │ │ ├── nanprod.cu │ │ ├── sum_square.cu │ │ ├── reducel1.cu │ │ ├── reducel2.cu │ │ ├── reducel3.cu │ │ ├── logsumexp.cu │ │ └── reduce_helper.cuh │ └── unary │ │ ├── ln.cu │ │ ├── exp.cu │ │ ├── sin.cu │ │ ├── tan.cu │ │ ├── cos.cu │ │ ├── erf.cu │ │ ├── acos.cu │ │ ├── asin.cu │ │ └── atan.cu ├── .gitignore └── Cargo.toml ├── hpt ├── .gitignore └── src │ └── backends │ ├── cuda │ ├── tensor_external │ │ └── advance.rs │ ├── cuda_slice.rs │ └── utils │ │ └── launch_cfg │ │ └── launch_cfg_trait.rs │ ├── common │ ├── readme.md │ └── conv.rs │ └── cpu │ ├── .DS_Store │ ├── kernels │ ├── conv2d │ │ └── type_kernels │ │ │ ├── bool_microkernels.rs │ │ │ ├── f32_microkernels.rs │ │ │ ├── f64_microkernels.rs │ │ │ ├── i16_microkernels.rs │ │ │ ├── i32_microkernels.rs │ │ │ ├── i64_microkernels.rs │ │ │ ├── i8_microkernels.rs │ │ │ ├── u16_microkernels.rs │ │ │ ├── u32_microkernels.rs │ │ │ ├── u64_microkernels.rs │ │ │ ├── u8_microkernels.rs │ │ │ ├── isize_microkernels.rs │ │ │ ├── usize_microkernels.rs │ │ │ ├── complex32_microkernels.rs │ │ │ └── complex64_microkernels.rs │ └── matmul │ │ ├── readme.md │ │ └── type_kernels │ │ ├── complex32_microkernels.rs │ │ └── complex64_microkernels.rs │ ├── tensor_external │ ├── cumulative.rs │ └── tensordot.rs │ └── tensor_internal │ └── cumulative.rs ├── matconv_simd ├── src │ └── simd │ │ ├── _512bit │ │ ├── avx512 │ │ │ ├── bf16x32.rs │ │ │ 
├── boolx64.rs │ │ │ ├── cplx32x8.rs │ │ │ ├── cplx64x4.rs │ │ │ ├── f64x8.rs │ │ │ ├── i16x32.rs │ │ │ ├── i32x16.rs │ │ │ ├── i8x64.rs │ │ │ ├── u32x16.rs │ │ │ ├── u64x8.rs │ │ │ ├── u8x64.rs │ │ │ └── u16x32.rs │ │ └── common │ │ │ ├── f32x16.rs │ │ │ ├── f64x8.rs │ │ │ ├── i32x16.rs │ │ │ ├── i8x64.rs │ │ │ ├── u8x64.rs │ │ │ ├── i16x32.rs │ │ │ ├── i64x8.rs │ │ │ ├── u64x8.rs │ │ │ └── mask.rs │ │ ├── .DS_Store │ │ ├── _128bit │ │ ├── common │ │ │ ├── f16x8.rs │ │ │ ├── f32x4.rs │ │ │ ├── f64x2.rs │ │ │ ├── i16x8.rs │ │ │ ├── i32x4.rs │ │ │ ├── u16x8.rs │ │ │ ├── i64x2.rs │ │ │ ├── u8x16.rs │ │ │ ├── u64x2.rs │ │ │ ├── i8x16.rs │ │ │ └── bf16x8.rs │ │ └── sse │ │ │ └── boolx16.rs │ │ └── _256bit │ │ ├── common │ │ ├── f32x8.rs │ │ ├── i32x8.rs │ │ ├── i8x32.rs │ │ ├── u8x32.rs │ │ ├── i16x16.rs │ │ ├── i64x4.rs │ │ ├── u64x4.rs │ │ └── f64x4.rs │ │ └── avx2 │ │ └── f64x4.rs └── Cargo.toml ├── hpt-common ├── .gitignore ├── src │ ├── utils │ │ ├── conv_algos.rs │ │ └── simd_ref.rs │ └── error │ │ ├── onnx.rs │ │ ├── mod.rs │ │ └── autograd.rs └── Cargo.toml ├── hpt-traits ├── .gitignore ├── Cargo.toml └── src │ └── ops │ └── slice.rs ├── hpt-allocator ├── .gitignore ├── src │ ├── allocators │ │ └── mod.rs │ ├── storage │ │ ├── cpu.rs │ │ └── cuda.rs │ ├── ptr.rs │ └── utils │ │ ├── cache_resize.rs │ │ └── deallocate.rs └── Cargo.toml ├── .DS_Store ├── hpt-examples ├── src │ └── main.rs ├── Cargo.toml └── examples │ └── iterator │ └── main.rs ├── hpt-matmul ├── .DS_Store ├── src │ └── .DS_Store └── Cargo.toml ├── .idea └── .gitignore ├── hpt-display ├── src │ └── lib.rs └── Cargo.toml ├── hpt-codegen ├── src │ └── fuse │ │ └── dead_node_elimination.rs └── Cargo.toml ├── .cargo └── config.toml ├── .gitignore ├── hpt-conv └── Cargo.toml ├── hpt-dataloader ├── src │ ├── from_safetensors │ │ └── from_safetensors.rs │ └── lib.rs └── Cargo.toml ├── hpt-iterator └── Cargo.toml ├── package.json └── .github └── workflows └── docs.yml /docs/dev_guide/dev_guide.md: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/benchmarks/benchmarks.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/user_guide/user_guide.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hpt-dyn/src/ops/tensor/cmp.rs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hpt-dyn/src/utils/threadpool.rs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hpt-macros/src/save_derive.rs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/dev_guide/auto_diff/backward.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/dev_guide/auto_diff/difftensor.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hpt-bench/src/main.rs: -------------------------------------------------------------------------------- 1 | fn main() {} 2 | -------------------------------------------------------------------------------- /hpt-dyn/src/ops/common/traits.rs: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- 
/hpt-dyn/src/utils/onnx/layout_sense.rs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hpt-tests/src/hpt_common/slice.rs: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /hpt-types/src/dyn_dispatch/vector.rs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/dev_guide/auto_diff/introduction.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hpt-cudakernels/src/utils/normalout.cuh: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hpt/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /true 3 | Cargo.lock -------------------------------------------------------------------------------- /hpt/src/backends/cuda/tensor_external/advance.rs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/avx512/bf16x32.rs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/avx512/boolx64.rs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/avx512/cplx32x8.rs: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/avx512/cplx64x4.rs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/avx512/f64x8.rs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/avx512/i16x32.rs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/avx512/i32x16.rs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/avx512/i8x64.rs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/avx512/u32x16.rs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/avx512/u64x8.rs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/avx512/u8x64.rs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hpt-bench/benches/benchmarks/broadcast/broadcast.rs: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hpt-common/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /true 3 | Cargo.lock -------------------------------------------------------------------------------- /hpt-dyn/src/ops/common/mod.rs: -------------------------------------------------------------------------------- 1 | pub(crate) mod traits; -------------------------------------------------------------------------------- /hpt-macros/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /true 3 | Cargo.lock -------------------------------------------------------------------------------- /hpt-tests/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /true 3 | Cargo.lock -------------------------------------------------------------------------------- /hpt-traits/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /true 3 | Cargo.lock -------------------------------------------------------------------------------- /hpt-types/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /true 3 | Cargo.lock -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/avx512/u16x32.rs: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /hpt-allocator/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /true 3 | Cargo.lock -------------------------------------------------------------------------------- /.DS_Store: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jianqoq/Hpt/HEAD/.DS_Store -------------------------------------------------------------------------------- /hpt-examples/src/main.rs: -------------------------------------------------------------------------------- 1 | fn main() { 2 | println!("Hello, world!"); 3 | } 4 | -------------------------------------------------------------------------------- /hpt-matmul/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jianqoq/Hpt/HEAD/hpt-matmul/.DS_Store -------------------------------------------------------------------------------- /hpt-matmul/src/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jianqoq/Hpt/HEAD/hpt-matmul/src/.DS_Store -------------------------------------------------------------------------------- /hpt/src/backends/common/readme.md: -------------------------------------------------------------------------------- 1 | This folder contains all the ops that supports all the backend -------------------------------------------------------------------------------- /hpt/src/backends/cpu/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jianqoq/Hpt/HEAD/hpt/src/backends/cpu/.DS_Store -------------------------------------------------------------------------------- /hpt-allocator/src/allocators/mod.rs: -------------------------------------------------------------------------------- 1 | pub(crate) mod cpu; 2 | #[cfg(feature = "cuda")] 3 | pub(crate) mod cuda; 4 | -------------------------------------------------------------------------------- /hpt-types/src/vectors/.DS_Store: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Jianqoq/Hpt/HEAD/hpt-types/src/vectors/.DS_Store -------------------------------------------------------------------------------- /matconv_simd/src/simd/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jianqoq/Hpt/HEAD/matconv_simd/src/simd/.DS_Store -------------------------------------------------------------------------------- /hpt-dyn/build.rs: -------------------------------------------------------------------------------- 1 | fn main() { 2 | // prost_build::compile_protos(&["src/onnx.proto"], &["src/"]).unwrap(); 3 | } -------------------------------------------------------------------------------- /hpt-dyn/src/ops/mod.rs: -------------------------------------------------------------------------------- 1 | pub(crate) mod common; 2 | pub(crate) mod tensor; 3 | pub(crate) mod models { 4 | pub(crate) mod onnx; 5 | } -------------------------------------------------------------------------------- /hpt-cudakernels/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod reginfo; 2 | pub use reginfo::RegisterInfo; 3 | #[cfg(feature = "cuda")] 4 | include!(concat!(env!("OUT_DIR"), "/generated_constants.rs")); 5 | -------------------------------------------------------------------------------- /hpt-dyn/src/ops/tensor/conv2d/type_kernels/i8_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::ops::tensor::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for i8 {} 4 | -------------------------------------------------------------------------------- /hpt-dyn/src/ops/tensor/conv2d/type_kernels/u8_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::ops::tensor::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for u8 {} 4 | 
-------------------------------------------------------------------------------- /hpt-dyn/src/ops/tensor/conv2d/type_kernels/bool_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::ops::tensor::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for bool {} 4 | -------------------------------------------------------------------------------- /hpt-dyn/src/ops/tensor/conv2d/type_kernels/f32_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::ops::tensor::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for f32 {} 4 | -------------------------------------------------------------------------------- /hpt-dyn/src/ops/tensor/conv2d/type_kernels/f64_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::ops::tensor::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for f64 {} 4 | -------------------------------------------------------------------------------- /hpt-dyn/src/ops/tensor/conv2d/type_kernels/i16_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::ops::tensor::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for i16 {} 4 | -------------------------------------------------------------------------------- /hpt-dyn/src/ops/tensor/conv2d/type_kernels/i32_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::ops::tensor::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for i32 {} 4 | -------------------------------------------------------------------------------- /hpt-dyn/src/ops/tensor/conv2d/type_kernels/i64_microkernels.rs: -------------------------------------------------------------------------------- 1 | use 
crate::ops::tensor::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for i64 {} 4 | -------------------------------------------------------------------------------- /hpt-dyn/src/ops/tensor/conv2d/type_kernels/u16_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::ops::tensor::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for u16 {} 4 | -------------------------------------------------------------------------------- /hpt-dyn/src/ops/tensor/conv2d/type_kernels/u32_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::ops::tensor::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for u32 {} 4 | -------------------------------------------------------------------------------- /hpt-dyn/src/ops/tensor/conv2d/type_kernels/u64_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::ops::tensor::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for u64 {} 4 | -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # 默认忽略的文件 2 | /shelf/ 3 | /workspace.xml 4 | # 基于编辑器的 HTTP 客户端请求 5 | /httpRequests/ 6 | # Datasource local storage ignored files 7 | /dataSources/ 8 | /dataSources.local.xml 9 | -------------------------------------------------------------------------------- /hpt-cudakernels/src/reginfo.rs: -------------------------------------------------------------------------------- 1 | #[derive(Debug, Clone, Copy)] 2 | pub struct RegisterInfo { 3 | pub pred: usize, 4 | pub b16: usize, 5 | pub b32: usize, 6 | pub b64: usize, 7 | } 8 | -------------------------------------------------------------------------------- 
/hpt-dyn/src/ops/tensor/conv2d/type_kernels/isize_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::backends::cpu::kernels::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for isize {} 4 | -------------------------------------------------------------------------------- /hpt-dyn/src/ops/tensor/conv2d/type_kernels/usize_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::backends::cpu::kernels::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for usize {} 4 | -------------------------------------------------------------------------------- /hpt/src/backends/cpu/kernels/conv2d/type_kernels/bool_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::backends::cpu::kernels::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for bool {} 4 | -------------------------------------------------------------------------------- /hpt/src/backends/cpu/kernels/conv2d/type_kernels/f32_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::backends::cpu::kernels::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for f32 {} 4 | -------------------------------------------------------------------------------- /hpt/src/backends/cpu/kernels/conv2d/type_kernels/f64_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::backends::cpu::kernels::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for f64 {} 4 | -------------------------------------------------------------------------------- /hpt/src/backends/cpu/kernels/conv2d/type_kernels/i16_microkernels.rs: -------------------------------------------------------------------------------- 1 | use 
crate::backends::cpu::kernels::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for i16 {} 4 | -------------------------------------------------------------------------------- /hpt/src/backends/cpu/kernels/conv2d/type_kernels/i32_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::backends::cpu::kernels::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for i32 {} 4 | -------------------------------------------------------------------------------- /hpt/src/backends/cpu/kernels/conv2d/type_kernels/i64_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::backends::cpu::kernels::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for i64 {} 4 | -------------------------------------------------------------------------------- /hpt/src/backends/cpu/kernels/conv2d/type_kernels/i8_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::backends::cpu::kernels::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for i8 {} 4 | -------------------------------------------------------------------------------- /hpt/src/backends/cpu/kernels/conv2d/type_kernels/u16_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::backends::cpu::kernels::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for u16 {} 4 | -------------------------------------------------------------------------------- /hpt/src/backends/cpu/kernels/conv2d/type_kernels/u32_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::backends::cpu::kernels::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for u32 {} 4 | 
-------------------------------------------------------------------------------- /hpt/src/backends/cpu/kernels/conv2d/type_kernels/u64_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::backends::cpu::kernels::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for u64 {} 4 | -------------------------------------------------------------------------------- /hpt/src/backends/cpu/kernels/conv2d/type_kernels/u8_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::backends::cpu::kernels::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for u8 {} 4 | -------------------------------------------------------------------------------- /hpt/src/backends/cpu/kernels/matmul/readme.md: -------------------------------------------------------------------------------- 1 | ## Acknowledgements 2 | 3 | Our matrix multiplication implementation is inspired from [gemm](https://github.com/sarah-quinones/gemm) by Sarah Quinones. 
-------------------------------------------------------------------------------- /hpt/src/backends/cpu/kernels/conv2d/type_kernels/isize_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::backends::cpu::kernels::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for isize {} 4 | -------------------------------------------------------------------------------- /hpt/src/backends/cpu/kernels/conv2d/type_kernels/usize_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::backends::cpu::kernels::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for usize {} 4 | -------------------------------------------------------------------------------- /matconv_simd/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "matconv_simd" 3 | version = "0.1.0" 4 | edition = "2024" 5 | 6 | [dependencies] 7 | half = { workspace = true } 8 | num-complex = { workspace = true } 9 | -------------------------------------------------------------------------------- /hpt/src/backends/cpu/kernels/matmul/type_kernels/complex32_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::backends::cpu::kernels::matmul::microkernel_trait::MatmulMicroKernel; 2 | use num::complex::Complex32; 3 | impl MatmulMicroKernel for Complex32 {} 4 | -------------------------------------------------------------------------------- /hpt/src/backends/cpu/kernels/matmul/type_kernels/complex64_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::backends::cpu::kernels::matmul::microkernel_trait::MatmulMicroKernel; 2 | use num::complex::Complex64; 3 | impl MatmulMicroKernel for Complex64 {} 4 | -------------------------------------------------------------------------------- 
/hpt-dyn/src/ops/tensor/conv2d/type_kernels/complex32_microkernels.rs: -------------------------------------------------------------------------------- 1 | use num::complex::Complex32; 2 | 3 | use crate::backends::cpu::kernels::conv2d::microkernel_trait::Conv2dMicroKernel; 4 | 5 | impl Conv2dMicroKernel for Complex32 {} 6 | -------------------------------------------------------------------------------- /hpt-dyn/src/ops/tensor/conv2d/type_kernels/complex64_microkernels.rs: -------------------------------------------------------------------------------- 1 | use num::complex::Complex64; 2 | 3 | use crate::backends::cpu::kernels::conv2d::microkernel_trait::Conv2dMicroKernel; 4 | 5 | impl Conv2dMicroKernel for Complex64 {} 6 | -------------------------------------------------------------------------------- /hpt/src/backends/cuda/cuda_slice.rs: -------------------------------------------------------------------------------- 1 | use cudarc::driver::DeviceRepr; 2 | 3 | #[repr(C)] 4 | pub(crate) struct CudaSlice { 5 | pub(crate) inner: cudarc::driver::sys::CUdeviceptr, 6 | } 7 | 8 | unsafe impl DeviceRepr for CudaSlice {} 9 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_128bit/common/f16x8.rs: -------------------------------------------------------------------------------- 1 | 2 | /// a vector of 8 f16 values 3 | #[allow(non_camel_case_types)] 4 | #[derive(Default, Clone, Copy, PartialEq, Debug)] 5 | #[repr(C, align(16))] 6 | pub struct f16x8(pub(crate) [half::f16; 8]); -------------------------------------------------------------------------------- /hpt/src/backends/cpu/kernels/conv2d/type_kernels/complex32_microkernels.rs: -------------------------------------------------------------------------------- 1 | use num::complex::Complex32; 2 | 3 | use crate::backends::cpu::kernels::conv2d::microkernel_trait::Conv2dMicroKernel; 4 | 5 | impl Conv2dMicroKernel for Complex32 {} 6 | 
-------------------------------------------------------------------------------- /hpt/src/backends/cpu/kernels/conv2d/type_kernels/complex64_microkernels.rs: -------------------------------------------------------------------------------- 1 | use num::complex::Complex64; 2 | 3 | use crate::backends::cpu::kernels::conv2d::microkernel_trait::Conv2dMicroKernel; 4 | 5 | impl Conv2dMicroKernel for Complex64 {} 6 | -------------------------------------------------------------------------------- /hpt-display/src/lib.rs: -------------------------------------------------------------------------------- 1 | //! This crate is used to display n-dimensional arrays 2 | 3 | #![deny(missing_docs)] 4 | 5 | /// A module contains display function 6 | mod display; 7 | /// A module contains formats 8 | mod formats; 9 | pub use display::display; 10 | -------------------------------------------------------------------------------- /hpt-cudakernels/src/utils/check_type.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define CHECK_FLOAT_TYPE(T) \ 4 | static_assert(std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v, "T must be half, __nv_bfloat16, float, or double"); 5 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/common/f32x16.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_arch = "x86_64")] 3 | use std::arch::x86_64::*; 4 | 5 | #[allow(non_camel_case_types)] 6 | #[derive(Clone, Copy, Debug)] 7 | #[repr(C, align(64))] 8 | pub struct f32x16(#[cfg(target_arch = "x86_64")] pub(crate) __m512); -------------------------------------------------------------------------------- /hpt-codegen/src/fuse/dead_node_elimination.rs: -------------------------------------------------------------------------------- 1 | use super::cfg::CFG; 2 | 3 | pub(crate) struct _NodeEliminator<'a> { 4 | pub(crate) cfg: 
&'a mut CFG, 5 | pub(crate) current_assignment: Option, 6 | } 7 | 8 | impl<'ast> syn::visit::Visit<'ast> for _NodeEliminator<'ast> {} 9 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_256bit/common/f32x8.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_arch = "x86_64")] 3 | use std::arch::x86_64::*; 4 | 5 | /// a vector of 8 f32 values 6 | #[allow(non_camel_case_types)] 7 | #[derive(Clone, Copy, Debug)] 8 | #[repr(C, align(32))] 9 | pub struct f32x8(#[cfg(target_arch = "x86_64")] pub(crate) __m256); -------------------------------------------------------------------------------- /matconv_simd/src/simd/_256bit/common/i32x8.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_arch = "x86_64")] 3 | use std::arch::x86_64::*; 4 | 5 | /// a vector of 8 i32 values 6 | #[allow(non_camel_case_types)] 7 | #[derive(Clone, Copy, Debug)] 8 | #[repr(C, align(32))] 9 | pub struct i32x8(#[cfg(target_arch = "x86_64")] pub(crate) __m256i); -------------------------------------------------------------------------------- /matconv_simd/src/simd/_256bit/common/i8x32.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_arch = "x86_64")] 3 | use std::arch::x86_64::*; 4 | 5 | /// a vector of 32 i8 values 6 | #[allow(non_camel_case_types)] 7 | #[derive(Clone, Copy, Debug)] 8 | #[repr(C, align(32))] 9 | pub struct i8x32(#[cfg(target_arch = "x86_64")] pub(crate) __m256i); -------------------------------------------------------------------------------- /matconv_simd/src/simd/_256bit/common/u8x32.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_arch = "x86_64")] 3 | use std::arch::x86_64::*; 4 | 5 | /// a vector of 32 u8 values 6 | #[allow(non_camel_case_types)] 7 | #[derive(Clone, Copy, Debug)] 8 | #[repr(C, align(32))] 
9 | pub struct u8x32(#[cfg(target_arch = "x86_64")] pub(crate) __m256i); -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/common/f64x8.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_arch = "x86_64")] 3 | use std::arch::x86_64::*; 4 | 5 | /// a vector of 8 f64 values 6 | #[allow(non_camel_case_types)] 7 | #[derive(Clone, Copy, Debug)] 8 | #[repr(C, align(64))] 9 | pub struct f64x8(#[cfg(target_arch = "x86_64")] pub(crate) __m512d); -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/common/i32x16.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_arch = "x86_64")] 3 | use std::arch::x86_64::*; 4 | 5 | /// a vector of 16 i32 values 6 | #[allow(non_camel_case_types)] 7 | #[derive(Clone, Copy, Debug)] 8 | #[repr(C, align(64))] 9 | pub struct i32x16(#[cfg(target_arch = "x86_64")] pub(crate) __m512i); -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/common/i8x64.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_arch = "x86_64")] 3 | use std::arch::x86_64::*; 4 | 5 | /// a vector of 64 i8 values 6 | #[allow(non_camel_case_types)] 7 | #[derive(Clone, Copy, Debug)] 8 | #[repr(C, align(64))] 9 | pub struct i8x64(#[cfg(target_arch = "x86_64")] pub(crate) __m512i); -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/common/u8x64.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_arch = "x86_64")] 3 | use std::arch::x86_64::*; 4 | 5 | /// a vector of 64 u8 values 6 | #[allow(non_camel_case_types)] 7 | #[derive(Clone, Copy, Debug)] 8 | #[repr(C, align(64))] 9 | pub struct u8x64(#[cfg(target_arch = "x86_64")] 
pub(crate) __m512i); -------------------------------------------------------------------------------- /.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [target.'cfg(target_os = "macos")'] 2 | rustflags = ["-C", "target-cpu=native"] 3 | 4 | [target.'cfg(target_os = "windows")'] 5 | rustflags = [ 6 | "-C", 7 | "target-feature=+avx2", 8 | "-C", 9 | "target-feature=+fma", 10 | "-C", 11 | "target-feature=+f16c", 12 | ] 13 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_256bit/common/i16x16.rs: -------------------------------------------------------------------------------- 1 | #[cfg(target_arch = "x86_64")] 2 | use std::arch::x86_64::*; 3 | 4 | /// a vector of 16 i16 values 5 | #[allow(non_camel_case_types)] 6 | #[derive(Clone, Copy, Debug)] 7 | #[repr(C, align(32))] 8 | pub struct i16x16(#[cfg(target_arch = "x86_64")] pub(crate) __m256i); 9 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/common/i16x32.rs: -------------------------------------------------------------------------------- 1 | #[cfg(target_arch = "x86_64")] 2 | use std::arch::x86_64::*; 3 | 4 | /// a vector of 32 i16 values 5 | #[allow(non_camel_case_types)] 6 | #[derive(Clone, Copy, Debug)] 7 | #[repr(C, align(64))] 8 | pub struct i16x32(#[cfg(target_arch = "x86_64")] pub(crate) __m512i); 9 | -------------------------------------------------------------------------------- /hpt-allocator/src/storage/cpu.rs: -------------------------------------------------------------------------------- 1 | use dashmap::DashMap; 2 | use once_cell::sync::Lazy; 3 | 4 | use super::CommonStorage; 5 | 6 | /// This is a global variable that stores the allocated ptrs and their reference count for CPU devices 7 | pub static CPU_STORAGE: Lazy> = Lazy::new(|| DashMap::new()); 8 | 
-------------------------------------------------------------------------------- /matconv_simd/src/simd/_256bit/common/i64x4.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_arch = "x86_64")] 3 | use std::arch::x86_64::*; 4 | 5 | /// a vector of 4 i64 values 6 | #[allow(non_camel_case_types)] 7 | #[derive(Clone, Copy, Debug)] 8 | #[repr(C, align(32))] 9 | pub struct i64x4(#[cfg(target_arch = "x86_64")] pub(crate) __m256i); 10 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_256bit/common/u64x4.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_arch = "x86_64")] 3 | use std::arch::x86_64::*; 4 | 5 | /// a vector of 4 u64 values 6 | #[allow(non_camel_case_types)] 7 | #[derive(Clone, Copy, Debug)] 8 | #[repr(C, align(32))] 9 | pub struct u64x4(#[cfg(target_arch = "x86_64")] pub(crate) __m256i); 10 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/common/i64x8.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_arch = "x86_64")] 3 | use std::arch::x86_64::*; 4 | 5 | /// a vector of 8 i64 values 6 | #[allow(non_camel_case_types)] 7 | #[derive(Clone, Copy, Debug)] 8 | #[repr(C, align(64))] 9 | pub struct i64x8(#[cfg(target_arch = "x86_64")] pub(crate) __m512i); 10 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/common/u64x8.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_arch = "x86_64")] 3 | use std::arch::x86_64::*; 4 | 5 | /// a vector of 8 u64 values 6 | #[allow(non_camel_case_types)] 7 | #[derive(Clone, Copy, Debug)] 8 | #[repr(C, align(64))] 9 | pub struct u64x8(#[cfg(target_arch = "x86_64")] pub(crate) __m512i); 10 | 
-------------------------------------------------------------------------------- /matconv_simd/src/simd/_256bit/common/f64x4.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_arch = "x86_64")] 3 | use std::arch::x86_64::*; 4 | 5 | /// a vector of 4 f64 values 6 | #[allow(non_camel_case_types)] 7 | #[derive(Clone, Copy, Debug)] 8 | #[repr(C, align(32))] 9 | pub struct f64x4(#[cfg(target_arch = "x86_64")] pub(crate) __m256d); 10 | 11 | -------------------------------------------------------------------------------- /hpt-allocator/src/ptr.rs: -------------------------------------------------------------------------------- 1 | /// just a wrapper around `*mut u8`, implementing `Send` and `Sync` trait to let the compiler know that it is safe to send and share across threads 2 | #[derive(Debug, PartialEq, Eq, Hash, Clone)] 3 | pub(crate) struct SafePtr { 4 | pub(crate) ptr: *mut u8, 5 | } 6 | unsafe impl Send for SafePtr {} 7 | unsafe impl Sync for SafePtr {} 8 | -------------------------------------------------------------------------------- /docs/user_guide/utils/set_seed.md: -------------------------------------------------------------------------------- 1 | # set_seed 2 | 3 | ```rust 4 | set_seed(seed: u64) 5 | ``` 6 | 7 | Set the seed for random number generation 8 | ## Parameters: 9 | `seed`: seed for generating random number 10 | `B`: hpt::Cuda | hpt::Cpu 11 | 12 | 13 | ## Backend Support 14 | | Backend | Supported | 15 | |---------|-----------| 16 | | CPU | ❌ | 17 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-allocator/src/storage/cuda.rs: -------------------------------------------------------------------------------- 1 | use std::{collections::HashMap, sync::Mutex}; 2 | 3 | use once_cell::sync::Lazy; 4 | 5 | use crate::storage::CommonStorage; 6 | 7 | /// This is a global variable that stores the allocated ptrs and their reference count for CUDA devices 
8 | pub static CUDA_STORAGE: Lazy>> = 9 | Lazy::new(|| Mutex::new(HashMap::new())); 10 | -------------------------------------------------------------------------------- /hpt-cudakernels/src/utils/type_alias.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define f32 float 4 | #define f64 double 5 | #define bf16 __nv_bfloat16 6 | #define f16 __half 7 | 8 | #define i8 int8_t 9 | #define i16 int16_t 10 | #define i32 int32_t 11 | #define i64 int64_t 12 | #define u8 uint8_t 13 | #define u16 uint16_t 14 | #define u32 uint32_t 15 | #define u64 uint64_t 16 | 17 | #define bf162 __nv_bfloat162 18 | #define f162 __half2 19 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_128bit/common/f32x4.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_feature = "neon")] 3 | use std::arch::aarch64::*; 4 | #[cfg(target_arch = "x86_64")] 5 | use std::arch::x86_64::*; 6 | 7 | /// a vector of 4 f32 values 8 | #[allow(non_camel_case_types)] 9 | #[derive(Clone, Copy, Debug)] 10 | #[repr(C, align(16))] 11 | pub struct f32x4( 12 | #[cfg(target_arch = "x86_64")] pub(crate) __m128, 13 | #[cfg(target_arch = "aarch64")] pub(crate) float32x4_t, 14 | ); 15 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_128bit/common/f64x2.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_feature = "neon")] 3 | use std::arch::aarch64::*; 4 | #[cfg(target_arch = "x86_64")] 5 | use std::arch::x86_64::*; 6 | 7 | /// a vector of 2 f64 values 8 | #[allow(non_camel_case_types)] 9 | #[derive(Clone, Copy, Debug)] 10 | #[repr(C, align(16))] 11 | pub struct f64x2( 12 | #[cfg(target_arch = "x86_64")] pub(crate) __m128d, 13 | #[cfg(target_arch = "aarch64")] pub(crate) float64x2_t, 14 | ); 15 | 
-------------------------------------------------------------------------------- /matconv_simd/src/simd/_128bit/common/i16x8.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_arch = "aarch64")] 3 | use std::arch::aarch64::*; 4 | #[cfg(target_arch = "x86_64")] 5 | use std::arch::x86_64::*; 6 | 7 | /// a vector of 8 i16 values 8 | #[allow(non_camel_case_types)] 9 | #[derive(Clone, Copy, Debug)] 10 | #[repr(C, align(16))] 11 | pub struct i16x8( 12 | #[cfg(target_arch = "x86_64")] pub(crate) __m128i, 13 | #[cfg(target_arch = "aarch64")] pub(crate) int16x8_t, 14 | ); 15 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_128bit/common/i32x4.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_arch = "aarch64")] 3 | use std::arch::aarch64::*; 4 | #[cfg(target_arch = "x86_64")] 5 | use std::arch::x86_64::*; 6 | 7 | /// a vector of 4 i32 values 8 | #[allow(non_camel_case_types)] 9 | #[derive(Clone, Copy, Debug)] 10 | #[repr(C, align(16))] 11 | pub struct i32x4( 12 | #[cfg(target_arch = "x86_64")] pub(crate) __m128i, 13 | #[cfg(target_arch = "aarch64")] pub(crate) int32x4_t, 14 | ); 15 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_128bit/common/u16x8.rs: -------------------------------------------------------------------------------- 1 | 2 | 3 | #[cfg(target_arch = "aarch64")] 4 | use std::arch::aarch64::*; 5 | #[cfg(target_arch = "x86_64")] 6 | use std::arch::x86_64::*; 7 | 8 | /// a vector of 8 u16 values 9 | #[allow(non_camel_case_types)] 10 | #[derive(Clone, Copy, Debug)] 11 | #[repr(C, align(16))] 12 | pub struct u16x8( 13 | #[cfg(target_arch = "x86_64")] pub(crate) __m128i, 14 | #[cfg(target_arch = "aarch64")] pub(crate) uint16x8_t, 15 | ); -------------------------------------------------------------------------------- 
/hpt-common/src/utils/conv_algos.rs: -------------------------------------------------------------------------------- 1 | /// enum for conv algorithms 2 | pub enum ConvAlgo { 3 | /// ImplicitGemm 4 | ImplicitGemm, 5 | /// ImplicitPrecompGemm 6 | ImplicitPrecompGemm, 7 | /// Gemm 8 | Gemm, 9 | /// Direct 10 | Direct, 11 | /// Fft 12 | Fft, 13 | /// FftTiling 14 | FftTiling, 15 | /// Winograd 16 | Winograd, 17 | /// WinogradNonFused 18 | WinogradNonFused, 19 | /// Count 20 | Count, 21 | } 22 | -------------------------------------------------------------------------------- /hpt-tests/src/hpt/cpu/utils.rs: -------------------------------------------------------------------------------- 1 | use hpt::Tensor; 2 | 3 | use hpt::common::cpu::TensorLike; 4 | use hpt::common::{CommonBounds, TensorInfo}; 5 | 6 | pub(crate) fn copy_from_tch( 7 | a: &mut Tensor, 8 | tch_a: &tch::Tensor, 9 | ) -> anyhow::Result<()> { 10 | let a_size = a.size(); 11 | a.as_raw_mut().copy_from_slice(unsafe { 12 | std::slice::from_raw_parts(tch_a.data_ptr() as *const T, a_size) 13 | }); 14 | Ok(()) 15 | } 16 | -------------------------------------------------------------------------------- /hpt-dyn/src/utils/onnx/load_model.rs: -------------------------------------------------------------------------------- 1 | use std::{fs::File, io::Read}; 2 | 3 | use prost::Message; 4 | 5 | use crate::{onnx::ModelProto, ops::models::onnx::OnnxModel}; 6 | 7 | pub fn load_onnx(path: &str) -> Result { 8 | let mut file = File::open(path).expect("找不到模型文件"); 9 | let mut buf = Vec::new(); 10 | file.read_to_end(&mut buf).unwrap(); 11 | 12 | let model = ModelProto::decode(&*buf).expect("模型解析失败"); 13 | 14 | Ok(OnnxModel::Model(model)) 15 | } 16 | -------------------------------------------------------------------------------- /hpt-cudakernels/src/utils/loop_progress.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | template 4 | struct ProgressUpdater 5 | { 6 | 
Func update_func; 7 | T *data; 8 | __device__ __forceinline__ ProgressUpdater(Func f, T *data) : update_func(f), data(data) {} 9 | __device__ __forceinline__ void update() { update_func(data); } 10 | __device__ __forceinline__ T get() const { return *data; } 11 | __device__ __forceinline__ void set_ptr(T *data) { this->data = data; } 12 | }; -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /true 3 | Cargo.lock 4 | *.ft 5 | *.ftz 6 | *.zip 7 | *.ll 8 | *.xlsx 9 | *.7z 10 | *.safetensors 11 | *.txt 12 | *.model 13 | *.xml 14 | *.onnx 15 | 16 | .vscode/ 17 | 18 | resnet.safetensor 19 | resnet_inp 20 | 21 | /src/**/target 22 | /src/**/true 23 | /src/**/*.ll 24 | /src/**/*.txt 25 | /src/tensor-graph/src-tauri/*.txt 26 | /.VSCodeCounter 27 | 28 | node_modules/ 29 | docs/.vuepress/.cache/ 30 | docs/.vuepress/.temp/ 31 | docs/.vuepress/dist/ 32 | 33 | cutlass_gemm.cu 34 | cutlass 35 | 36 | package-lock.json -------------------------------------------------------------------------------- /hpt-dyn/src/utils/onnx/plot.rs: -------------------------------------------------------------------------------- 1 | use petgraph::{dot::{Config, Dot}, prelude::StableGraph}; 2 | 3 | #[allow(unused)] 4 | pub(crate) fn generate_online_graphviz_link(graph: &StableGraph) -> String 5 | where N: std::fmt::Debug, E: std::fmt::Debug 6 | { 7 | let dot = Dot::with_config(&graph, &[Config::EdgeNoLabel]); 8 | let dot_string = format!("{:?}", dot); 9 | 10 | // URL-encode the DOT string so it can be embedded in the link 11 | let encoded = urlencoding::encode(&dot_string); 12 | format!("https://dreampuf.github.io/GraphvizOnline/#{encoded}") 13 | } 14 | -------------------------------------------------------------------------------- /hpt-display/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hpt-display" 3 | version = "0.1.2" 4 | edition = 
"2021" 5 | license = "MIT OR Apache-2.0" 6 | description = "An internal library for displaying tensors" 7 | authors = ["JianJian Li "] 8 | repository = "https://github.com/Jianqoq/Hpt" 9 | 10 | [dependencies] 11 | hpt-traits = { path = "../hpt-traits", version = "0.1.1" } 12 | hpt-common = { path = "../hpt-common", version = "0.1.1" } 13 | hpt-types = { path = "../hpt-types", version = "0.1.1" } 14 | num-complex = { workspace = true } 15 | -------------------------------------------------------------------------------- /hpt-conv/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hpt-conv" 3 | version = "0.1.0" 4 | edition = "2024" 5 | 6 | [dependencies] 7 | hpt-matmul = { path = "../hpt-matmul" } 8 | matconv_simd = { path = "../matconv_simd" } 9 | seq-macro = { workspace = true } 10 | half = { workspace = true } 11 | 12 | [features] 13 | default = ["f32", "f16"] 14 | bound_check = [] 15 | bool = [] 16 | f32 = [] 17 | f16 = [] 18 | bf16 = [] 19 | f64 = [] 20 | i8 = [] 21 | u8 = [] 22 | i16 = [] 23 | u16 = [] 24 | i32 = [] 25 | u32 = [] 26 | i64 = [] 27 | u64 = [] 28 | cplx32 = [] 29 | cplx64 = [] -------------------------------------------------------------------------------- /hpt-dyn/src/utils/onnx/parse_args/squeeze.rs: -------------------------------------------------------------------------------- 1 | use super::parse::{ Parse, ParseArgs }; 2 | 3 | pub(crate) struct SqueezeArgs<'a> { 4 | pub(crate) data: &'a str, 5 | pub(crate) axes: Option<&'a str>, 6 | } 7 | 8 | impl<'a> Parse<'a> for SqueezeArgs<'a> { 9 | fn parse<'b: 'a>(node: &'b crate::onnx::NodeProto) -> SqueezeArgs<'a> { 10 | let data = node.input[0].as_str(); 11 | let axes = node.input.get(1).map(|s| s.as_str()); 12 | SqueezeArgs { 13 | data, 14 | axes, 15 | } 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /hpt/src/backends/common/conv.rs: 
-------------------------------------------------------------------------------- 1 | pub(crate) fn cal_conv2d_output_shape( 2 | img_height: i64, 3 | img_width: i64, 4 | kh: i64, 5 | kw: i64, 6 | padding: &[(i64, i64); 2], 7 | stride: &[i64; 2], 8 | dilation: &[i64; 2], 9 | ) -> (i64, i64) { 10 | let out_height = 11 | (img_height + padding[0].0 + padding[0].1 - dilation[0] * (kh - 1) - 1) / stride[0] + 1; 12 | let out_width = 13 | (img_width + padding[1].0 + padding[1].1 - dilation[1] * (kw - 1) - 1) / stride[1] + 1; 14 | (out_height, out_width) 15 | } 16 | -------------------------------------------------------------------------------- /hpt/src/backends/cuda/utils/launch_cfg/launch_cfg_trait.rs: -------------------------------------------------------------------------------- 1 | use cudarc::driver::LaunchConfig; 2 | 3 | pub(crate) trait LaunchConfigUtils { 4 | #[allow(unused)] 5 | fn block_size(&self) -> u32; 6 | #[allow(unused)] 7 | fn grid_size(&self) -> u32; 8 | } 9 | 10 | impl LaunchConfigUtils for LaunchConfig { 11 | fn block_size(&self) -> u32 { 12 | self.block_dim.0 * self.block_dim.1 * self.block_dim.2 13 | } 14 | 15 | fn grid_size(&self) -> u32 { 16 | self.grid_dim.0 * self.grid_dim.1 * self.grid_dim.2 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /hpt-cudakernels/src/utils/promotion/promotes.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "normal_promote/bool.cuh" 3 | #include "normal_promote/i8.cuh" 4 | #include "normal_promote/i16.cuh" 5 | #include "normal_promote/i32.cuh" 6 | #include "normal_promote/i64.cuh" 7 | #include "normal_promote/u8.cuh" 8 | #include "normal_promote/u16.cuh" 9 | #include "normal_promote/u32.cuh" 10 | #include "normal_promote/u64.cuh" 11 | 12 | #include "normal_promote/f16.cuh" 13 | #include "normal_promote/bf16.cuh" 14 | #include "normal_promote/f32.cuh" 15 | #include "normal_promote/f64.cuh" 16 | 
-------------------------------------------------------------------------------- /docs/user_guide/custom_allocator/custom_allocator.md: -------------------------------------------------------------------------------- 1 | # Custom Allocator 2 | 3 | Since hpt is designed in purely generic types, the user can define their own memory allocator. 4 | 5 | # How 6 | 7 | You can reference the steps at [here](https://github.com/Jianqoq/Hpt/tree/main/hpt-examples/examples/custom_allocator/main.rs). 8 | 9 | # Note 10 | 11 | Custom Allocator must have life time `'static`, `Send`, `Sync`, `Clone` implemented 12 | 13 | ## Backend Support 14 | | Backend | Supported | 15 | |---------|-----------| 16 | | CPU | ✅ | 17 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-dataloader/src/from_safetensors/from_safetensors.rs: -------------------------------------------------------------------------------- 1 | use safetensors::SafeTensors; 2 | 3 | #[diagnostic::on_unimplemented( 4 | message = "Cannot perform operation on type `{Self}` because it doesn't implement required features" 5 | )] 6 | pub trait FromSafeTensors { 7 | fn from_safe_tensors(data: &SafeTensors, tensor_name: &str) -> Self; 8 | } 9 | 10 | impl FromSafeTensors for Option { 11 | fn from_safe_tensors(data: &SafeTensors, tensor_name: &str) -> Self { 12 | Some(T::from_safe_tensors(data, tensor_name)) 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /hpt-cudakernels/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /true 3 | Cargo.lock 4 | *.ft 5 | *.ftz 6 | *.zip 7 | *.ll 8 | *.xlsx 9 | *.7z 10 | *.safetensors 11 | *.txt 12 | *.model 13 | *.xml 14 | 15 | resnet.safetensor 16 | resnet_inp 17 | 18 | /src/**/target 19 | /src/**/true 20 | /src/**/*.ll 21 | /src/**/*.txt 22 | /src/tensor-graph/src-tauri/*.txt 23 | /.VSCodeCounter 24 | 25 | node_modules/ 26 | 
docs/.vuepress/.cache/ 27 | docs/.vuepress/.temp/ 28 | docs/.vuepress/dist/ 29 | 30 | ./build.rs 31 | 32 | cutlass_gemm.cu 33 | cutlass 34 | test_cute.cu 35 | gemm 36 | gemm.cu 37 | matmul.cu 38 | matmul -------------------------------------------------------------------------------- /hpt-cudakernels/src/pooling/pooling_template.cuh: -------------------------------------------------------------------------------- 1 | #include "../utils/type_alias.cuh" 2 | 3 | 4 | template 5 | __device__ __forceinline__ void pooling2d_forward( 6 | T *input, T *output, 7 | i32 batch_size, i32 channels, 8 | i32 input_height, i32 input_width, 9 | i32 output_height, i32 output_width, 10 | i32 kernel_h, i32 kernel_w, 11 | i32 stride_h, i32 stride_w, 12 | i32 padding_h, i32 padding_w, 13 | Intermediate *workspace) 14 | { 15 | i32 global_idx = blockIdx.x * blockDim.x + threadIdx.x; 16 | 17 | } 18 | -------------------------------------------------------------------------------- /hpt-cudakernels/src/reduce/sum.cu: -------------------------------------------------------------------------------- 1 | #include "declare_macros.cuh" 2 | #include "reduce_classes.cuh" 3 | 4 | DECLARE_KERNEL(bool, bool, sum, Sum) 5 | DECLARE_KERNEL(u8, u8, sum, Sum) 6 | DECLARE_KERNEL(u16, u16, sum, Sum) 7 | DECLARE_KERNEL(u32, u32, sum, Sum) 8 | DECLARE_KERNEL(u64, u64, sum, Sum) 9 | DECLARE_KERNEL(i8, i8, sum, Sum) 10 | DECLARE_KERNEL(i16, i16, sum, Sum) 11 | DECLARE_KERNEL(i32, i32, sum, Sum) 12 | DECLARE_KERNEL(i64, i64, sum, Sum) 13 | DECLARE_KERNEL(f32, f32, sum, Sum) 14 | DECLARE_KERNEL(f64, f64, sum, Sum) 15 | DECLARE_KERNEL(f16, f16, sum, Sum) 16 | DECLARE_KERNEL(bf16, bf16, sum, Sum) 17 | -------------------------------------------------------------------------------- /hpt-traits/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hpt-traits" 3 | version = "0.1.3" 4 | edition = "2021" 5 | license = "MIT OR Apache-2.0" 6 | description = "An 
internal library defines tensor operator traits for hpt" 7 | authors = ["JianJian Li "] 8 | repository = "https://github.com/Jianqoq/Hpt" 9 | 10 | [dependencies] 11 | hpt-common = { path = "../hpt-common", version = "0.1.1" } 12 | hpt-types = { path = "../hpt-types", version = "0.1.1" } 13 | num = { workspace = true } 14 | rand_distr = { workspace = true } 15 | 16 | [features] 17 | track_caller = [] 18 | default = ["track_caller"] 19 | -------------------------------------------------------------------------------- /docs/user_guide/unary/acos.md: -------------------------------------------------------------------------------- 1 | # acos 2 | ```rust 3 | acos(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Inverse cosine 6 | ## Parameters: 7 | `x`: Angle(radians) 8 | ## Returns: 9 | Tensor with type `C` 10 | ## Examples: 11 | ```rust 12 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 13 | 14 | fn main() -> Result<(), TensorError> { 15 | let a = Tensor::::new([10.0]); 16 | let b = a.acos()?; 17 | println!("{}", b); 18 | Ok(()) 19 | } 20 | ``` 21 | ## Backend Support 22 | | Backend | Supported | 23 | |---------|-----------| 24 | | CPU | ✅ | 25 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/asin.md: -------------------------------------------------------------------------------- 1 | # asin 2 | ```rust 3 | asin(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Inverse sine 6 | ## Parameters: 7 | `x`: Angle(radians) 8 | ## Returns: 9 | Tensor with type `C` 10 | ## Examples: 11 | ```rust 12 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 13 | 14 | fn main() -> Result<(), TensorError> { 15 | let a = Tensor::::new([10.0]); 16 | let b = a.asin()?; 17 | println!("{}", b); 18 | Ok(()) 19 | } 20 | ``` 21 | ## Backend Support 22 | | Backend | Supported | 23 | |---------|-----------| 24 | | CPU | ✅ | 25 | | Cuda | ✅ | 
-------------------------------------------------------------------------------- /docs/user_guide/unary/atan.md: -------------------------------------------------------------------------------- 1 | # atan 2 | ```rust 3 | atan(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Inverse tangent 6 | ## Parameters: 7 | `x`: Angle(radians) 8 | ## Returns: 9 | Tensor with type `C` 10 | ## Examples: 11 | ```rust 12 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 13 | 14 | fn main() -> Result<(), TensorError> { 15 | let a = Tensor::::new([10.0]); 16 | let b = a.atan()?; 17 | println!("{}", b); 18 | Ok(()) 19 | } 20 | ``` 21 | ## Backend Support 22 | | Backend | Supported | 23 | |---------|-----------| 24 | | CPU | ✅ | 25 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/sin.md: -------------------------------------------------------------------------------- 1 | # sin 2 | ```rust 3 | sin(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Trigonometric sine 6 | ## Parameters: 7 | `x`: Angle(radians) 8 | ## Returns: 9 | Tensor with type `C` 10 | ## Examples: 11 | ```rust 12 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 13 | 14 | fn main() -> Result<(), TensorError> { 15 | let a = Tensor::::new([10.0]); 16 | let b = a.sin()?; 17 | println!("{}", b); 18 | Ok(()) 19 | } 20 | ``` 21 | ## Backend Support 22 | | Backend | Supported | 23 | |---------|-----------| 24 | | CPU | ✅ | 25 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/sinh.md: -------------------------------------------------------------------------------- 1 | # sinh 2 | ```rust 3 | sinh(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Hyperbolic sine 6 | ## Parameters: 7 | `x`: Angle(radians) 8 | ## Returns: 9 | Tensor with type `C` 10 | ## Examples: 11 | ```rust 12 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 13 | 14 | fn main() -> Result<(), 
TensorError> { 15 | let a = Tensor::::new([10.0]); 16 | let b = a.sinh()?; 17 | println!("{}", b); 18 | Ok(()) 19 | } 20 | ``` 21 | ## Backend Support 22 | | Backend | Supported | 23 | |---------|-----------| 24 | | CPU | ✅ | 25 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-iterator/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hpt-iterator" 3 | version = "0.1.2" 4 | edition = "2021" 5 | license = "MIT OR Apache-2.0" 6 | description = "An internal library implements iterator for hpt" 7 | authors = ["JianJian Li "] 8 | repository = "https://github.com/Jianqoq/Hpt" 9 | 10 | [dependencies] 11 | hpt-common = { path = "../hpt-common", version = "0.1.1" } 12 | hpt-traits = { path = "../hpt-traits", version = "0.1.1" } 13 | hpt-types = { path = "../hpt-types", version = "0.1.1" } 14 | rayon = { workspace = true } 15 | 16 | [features] 17 | track_caller = [] 18 | bound_check = [] 19 | -------------------------------------------------------------------------------- /hpt-tests/src/utils/random_utils.rs: -------------------------------------------------------------------------------- 1 | pub(crate) fn generate_all_combinations(arr: &[usize]) -> Vec> { 2 | let n = arr.len(); 3 | let total_combinations = 1 << n; 4 | let mut result = Vec::with_capacity(total_combinations); 5 | 6 | for i in 0..total_combinations { 7 | let mut combination = Vec::new(); 8 | for j in 0..n { 9 | if (i & (1 << j)) != 0 { 10 | combination.push(arr[j] as i64); 11 | } 12 | } 13 | if combination.len() > 0 { 14 | result.push(combination); 15 | } 16 | } 17 | 18 | result 19 | } 20 | -------------------------------------------------------------------------------- /docs/user_guide/unary/cos.md: -------------------------------------------------------------------------------- 1 | # cos 2 | ```rust 3 | cos(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Trigonometric cosine 6 | ## 
Parameters: 7 | `x`: Angle(radians) 8 | ## Returns: 9 | Tensor with type `C` 10 | ## Examples: 11 | ```rust 12 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 13 | 14 | fn main() -> Result<(), TensorError> { 15 | let a = Tensor::::new([10.0]); 16 | let b = a.cos()?; 17 | println!("{}", b); 18 | Ok(()) 19 | } 20 | ``` 21 | ## Backend Support 22 | | Backend | Supported | 23 | |---------|-----------| 24 | | CPU | ✅ | 25 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/cosh.md: -------------------------------------------------------------------------------- 1 | # cosh 2 | ```rust 3 | cosh(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Hyperbolic cosine 6 | ## Parameters: 7 | `x`: Angle(radians) 8 | ## Returns: 9 | Tensor with type `C` 10 | ## Examples: 11 | ```rust 12 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 13 | 14 | fn main() -> Result<(), TensorError> { 15 | let a = Tensor::::new([10.0]); 16 | let b = a.cosh()?; 17 | println!("{}", b); 18 | Ok(()) 19 | } 20 | ``` 21 | ## Backend Support 22 | | Backend | Supported | 23 | |---------|-----------| 24 | | CPU | ✅ | 25 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/tan.md: -------------------------------------------------------------------------------- 1 | # tan 2 | ```rust 3 | tan(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Trigonometric tangent 6 | ## Parameters: 7 | `x`: Angle(radians) 8 | ## Returns: 9 | Tensor with type `C` 10 | ## Examples: 11 | ```rust 12 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 13 | 14 | fn main() -> Result<(), TensorError> { 15 | let a = Tensor::::new([10.0]); 16 | let b = a.tan()?; 17 | println!("{}", b); 18 | Ok(()) 19 | } 20 | ``` 21 | ## Backend Support 22 | | Backend | Supported | 23 | |---------|-----------| 24 | | CPU | ✅ | 25 | | Cuda | ✅ | 
-------------------------------------------------------------------------------- /docs/user_guide/unary/tanh.md: -------------------------------------------------------------------------------- 1 | # tanh 2 | ```rust 3 | tanh(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Hyperbolic tangent 6 | ## Parameters: 7 | `x`: Angle(radians) 8 | ## Returns: 9 | Tensor with type `C` 10 | ## Examples: 11 | ```rust 12 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 13 | 14 | fn main() -> Result<(), TensorError> { 15 | let a = Tensor::::new([10.0]); 16 | let b = a.tanh()?; 17 | println!("{}", b); 18 | Ok(()) 19 | } 20 | ``` 21 | ## Backend Support 22 | | Backend | Supported | 23 | |---------|-----------| 24 | | CPU | ✅ | 25 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/abs.md: -------------------------------------------------------------------------------- 1 | # abs 2 | ```rust 3 | abs(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Calculate absolute of input 6 | ## Parameters: 7 | `x`: input Tensor 8 | ## Returns: 9 | Tensor with type `T` 10 | ## Examples: 11 | ```rust 12 | use hpt::{ops::NormalUaryOps, Tensor, error::TensorError}; 13 | 14 | fn main() -> Result<(), TensorError> { 15 | let a = Tensor::::new([-10.0]); 16 | let b = a.abs()?; 17 | println!("{}", b); 18 | Ok(()) 19 | } 20 | ``` 21 | ## Backend Support 22 | | Backend | Supported | 23 | |---------|-----------| 24 | | CPU | ✅ | 25 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/acosh.md: -------------------------------------------------------------------------------- 1 | # acosh 2 | ```rust 3 | acosh(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Inverse hyperbolic cosine 6 | ## Parameters: 7 | `x`: Angle(radians) 8 | ## Returns: 9 | Tensor with type `C` 10 | ## Examples: 11 | ```rust 12 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 13 | 14 | fn 
main() -> Result<(), TensorError> { 15 | let a = Tensor::::new([10.0]); 16 | let b = a.acosh()?; 17 | println!("{}", b); 18 | Ok(()) 19 | } 20 | ``` 21 | ## Backend Support 22 | | Backend | Supported | 23 | |---------|-----------| 24 | | CPU | ✅ | 25 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/asinh.md: -------------------------------------------------------------------------------- 1 | # asinh 2 | ```rust 3 | asinh(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Inverse hyperbolic sine 6 | ## Parameters: 7 | `x`: Angle(radians) 8 | ## Returns: 9 | Tensor with type `C` 10 | ## Examples: 11 | ```rust 12 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 13 | 14 | fn main() -> Result<(), TensorError> { 15 | let a = Tensor::::new([10.0]); 16 | let b = a.asinh()?; 17 | println!("{}", b); 18 | Ok(()) 19 | } 20 | ``` 21 | ## Backend Support 22 | | Backend | Supported | 23 | |---------|-----------| 24 | | CPU | ✅ | 25 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/atanh.md: -------------------------------------------------------------------------------- 1 | # atanh 2 | ```rust 3 | atanh(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Inverse hyperbolic tangent 6 | ## Parameters: 7 | `x`: Angle(radians) 8 | ## Returns: 9 | Tensor with type `C` 10 | ## Examples: 11 | ```rust 12 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 13 | 14 | fn main() -> Result<(), TensorError> { 15 | let a = Tensor::::new([10.0]); 16 | let b = a.atanh()?; 17 | println!("{}", b); 18 | Ok(()) 19 | } 20 | ``` 21 | ## Backend Support 22 | | Backend | Supported | 23 | |---------|-----------| 24 | | CPU | ✅ | 25 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/utils/num_threads.md: -------------------------------------------------------------------------------- 1 | # 
set_num_threads 2 | 3 | ```rust 4 | set_num_threads(num_threads: usize) 5 | ``` 6 | 7 | Set the number of threads used for parallelism 8 | ## Parameters: 9 | `num_threads`: number of threads to use 10 | 11 | ## Backend Support 12 | | Backend | Supported | 13 | |---------|-----------| 14 | | CPU | ✅ | 15 | | Cuda | ❌ | 16 | 17 | # get_num_threads 18 | 19 | ```rust 20 | get_num_threads() 21 | ``` 22 | 23 | Get the number of threads currently in use 24 | 25 | ## Backend Support 26 | | Backend | Supported | 27 | |---------|-----------| 28 | | CPU | ✅ | 29 | | Cuda | ❌ | -------------------------------------------------------------------------------- /hpt-cudakernels/src/reduce/prod.cu: -------------------------------------------------------------------------------- 1 | #include "declare_macros.cuh" 2 | #include "reduce_classes.cuh" 3 | 4 | DECLARE_KERNEL(bool, bool, prod, Prod) 5 | DECLARE_KERNEL(u8, u8, prod, Prod) 6 | DECLARE_KERNEL(u16, u16, prod, Prod) 7 | DECLARE_KERNEL(u32, u32, prod, Prod) 8 | DECLARE_KERNEL(u64, u64, prod, Prod) 9 | DECLARE_KERNEL(i8, i8, prod, Prod) 10 | DECLARE_KERNEL(i16, i16, prod, Prod) 11 | DECLARE_KERNEL(i32, i32, prod, Prod) 12 | DECLARE_KERNEL(i64, i64, prod, Prod) 13 | DECLARE_KERNEL(f32, f32, prod, Prod) 14 | DECLARE_KERNEL(f64, f64, prod, Prod) 15 | DECLARE_KERNEL(f16, f16, prod, Prod) 16 | DECLARE_KERNEL(bf16, bf16, prod, Prod) 17 | -------------------------------------------------------------------------------- /docs/user_guide/unary/exp2.md: -------------------------------------------------------------------------------- 1 | # exp2 2 | ```rust 3 | exp2(x: &Tensor<T>) -> Result<Tensor<C>, TensorError> 4 | ``` 5 | Compute $\large 2^x$ for all elements 6 | ## Parameters: 7 | `x`: Input values 8 | ## Returns: 9 | Tensor with type `C` 10 | ## Examples: 11 | ```rust 12 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 13 | 14 | fn main() -> Result<(), TensorError> { 15 | let a = Tensor::<f32>::new([10.0]); 16 | let b = a.exp2()?; 17 | println!("{}", b); 18
| Ok(()) 19 | } 20 | ``` 21 | ## Backend Support 22 | | Backend | Supported | 23 | |---------|-----------| 24 | | CPU | ✅ | 25 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-common/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hpt-common" 3 | version = "0.1.3" 4 | edition = "2021" 5 | description = "An internal library for common utilities for hpt" 6 | authors = ["JianJian Li "] 7 | repository = "https://github.com/Jianqoq/Hpt" 8 | license = "MIT OR Apache-2.0" 9 | 10 | [dependencies] 11 | hpt-types = { path = "../hpt-types", version = "0.1.1" } 12 | thiserror = { workspace = true } 13 | serde = { workspace = true } 14 | cudarc = { workspace = true, optional = true } 15 | rand_distr = { workspace = true } 16 | 17 | [features] 18 | track_caller = [] 19 | bound_check = [] 20 | cuda = ["cudarc"] 21 | -------------------------------------------------------------------------------- /docs/user_guide/unary/exp.md: -------------------------------------------------------------------------------- 1 | # exp 2 | ```rust 3 | exp(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Compute exponential of `x` for all elements 6 | ## Parameters: 7 | `x`: Input values 8 | ## Returns: 9 | Tensor with type `C` 10 | ## Examples: 11 | ```rust 12 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 13 | 14 | fn main() -> Result<(), TensorError> { 15 | let a = Tensor::::new([10.0]); 16 | let b = a.exp()?; 17 | println!("{}", b); 18 | Ok(()) 19 | } 20 | ``` 21 | ## Backend Support 22 | | Backend | Supported | 23 | |---------|-----------| 24 | | CPU | ✅ | 25 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-cudakernels/src/reduce/all.cu: -------------------------------------------------------------------------------- 1 | #include "declare_macros.cuh" 2 | #include "../utils/type_cast.cuh" 3 | #include 
"reduce_classes.cuh" 4 | 5 | DECLARE_KERNEL(bool, bool, all, All) 6 | DECLARE_KERNEL(u8, u8, all, All) 7 | DECLARE_KERNEL(u16, u16, all, All) 8 | DECLARE_KERNEL(u32, u32, all, All) 9 | DECLARE_KERNEL(u64, u64, all, All) 10 | DECLARE_KERNEL(i8, i8, all, All) 11 | DECLARE_KERNEL(i16, i16, all, All) 12 | DECLARE_KERNEL(i32, i32, all, All) 13 | DECLARE_KERNEL(i64, i64, all, All) 14 | DECLARE_KERNEL(f32, f32, all, All) 15 | DECLARE_KERNEL(f64, f64, all, All) 16 | DECLARE_KERNEL(f16, f16, all, All) 17 | DECLARE_KERNEL(bf16, bf16, all, All) 18 | -------------------------------------------------------------------------------- /hpt-cudakernels/src/reduce/any.cu: -------------------------------------------------------------------------------- 1 | #include "declare_macros.cuh" 2 | #include "../utils/type_cast.cuh" 3 | #include "reduce_classes.cuh" 4 | 5 | DECLARE_KERNEL(bool, bool, any, Any) 6 | DECLARE_KERNEL(u8, u8, any, Any) 7 | DECLARE_KERNEL(u16, u16, any, Any) 8 | DECLARE_KERNEL(u32, u32, any, Any) 9 | DECLARE_KERNEL(u64, u64, any, Any) 10 | DECLARE_KERNEL(i8, i8, any, Any) 11 | DECLARE_KERNEL(i16, i16, any, Any) 12 | DECLARE_KERNEL(i32, i32, any, Any) 13 | DECLARE_KERNEL(i64, i64, any, Any) 14 | DECLARE_KERNEL(f32, f32, any, Any) 15 | DECLARE_KERNEL(f64, f64, any, Any) 16 | DECLARE_KERNEL(f16, f16, any, Any) 17 | DECLARE_KERNEL(bf16, bf16, any, Any) 18 | -------------------------------------------------------------------------------- /hpt-cudakernels/src/reduce/max.cu: -------------------------------------------------------------------------------- 1 | #include "declare_macros.cuh" 2 | #include "../utils/type_utils.cuh" 3 | #include "reduce_classes.cuh" 4 | 5 | DECLARE_KERNEL(bool, bool, max, Max) 6 | DECLARE_KERNEL(u8, u8, max, Max) 7 | DECLARE_KERNEL(u16, u16, max, Max) 8 | DECLARE_KERNEL(u32, u32, max, Max) 9 | DECLARE_KERNEL(u64, u64, max, Max) 10 | DECLARE_KERNEL(i8, i8, max, Max) 11 | DECLARE_KERNEL(i16, i16, max, Max) 12 | DECLARE_KERNEL(i32, i32, max, Max) 13 | 
DECLARE_KERNEL(i64, i64, max, Max) 14 | DECLARE_KERNEL(f32, f32, max, Max) 15 | DECLARE_KERNEL(f64, f64, max, Max) 16 | DECLARE_KERNEL(f16, f16, max, Max) 17 | DECLARE_KERNEL(bf16, bf16, max, Max) 18 | -------------------------------------------------------------------------------- /hpt-cudakernels/src/reduce/min.cu: -------------------------------------------------------------------------------- 1 | #include "declare_macros.cuh" 2 | #include "../utils/type_utils.cuh" 3 | #include "reduce_classes.cuh" 4 | 5 | DECLARE_KERNEL(bool, bool, min, Min) 6 | DECLARE_KERNEL(u8, u8, min, Min) 7 | DECLARE_KERNEL(u16, u16, min, Min) 8 | DECLARE_KERNEL(u32, u32, min, Min) 9 | DECLARE_KERNEL(u64, u64, min, Min) 10 | DECLARE_KERNEL(i8, i8, min, Min) 11 | DECLARE_KERNEL(i16, i16, min, Min) 12 | DECLARE_KERNEL(i32, i32, min, Min) 13 | DECLARE_KERNEL(i64, i64, min, Min) 14 | DECLARE_KERNEL(f32, f32, min, Min) 15 | DECLARE_KERNEL(f64, f64, min, Min) 16 | DECLARE_KERNEL(f16, f16, min, Min) 17 | DECLARE_KERNEL(bf16, bf16, min, Min) 18 | -------------------------------------------------------------------------------- /hpt-macros/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hpt-macros" 3 | version = "0.1.2" 4 | edition = "2021" 5 | description = "An internal library for generating helper functions for hpt" 6 | authors = ["JianJian Li "] 7 | repository = "https://github.com/Jianqoq/Hpt" 8 | license = "MIT OR Apache-2.0" 9 | 10 | [dependencies] 11 | syn = { workspace = true } 12 | quote = { workspace = true } 13 | regex = { workspace = true } 14 | proc-macro2 = { workspace = true } 15 | 16 | [features] 17 | cuda = [] 18 | 19 | [lib] 20 | proc-macro = true 21 | 22 | [package.metadata.rust-analyzer] 23 | rustc_private = true 24 | proc_macro_srv = true 25 | -------------------------------------------------------------------------------- /docs/user_guide/random/randn_like.md: 
-------------------------------------------------------------------------------- 1 | # randn_like 2 | ```rust 3 | randn_like(x: &Tensor<T>) -> Result<Tensor<T>, TensorError> 4 | ``` 5 | same as `randn` but the shape will be based on `x`. 6 | ## Parameters: 7 | `x`: input Tensor 8 | ## Returns: 9 | Tensor with type `T` 10 | ## Examples: 11 | ```rust 12 | use hpt::{error::TensorError, ops::Random, Tensor}; 13 | 14 | fn main() -> Result<(), TensorError> { 15 | let a = Tensor::<f32>::randn([10, 10])?; 16 | println!("{}", a.randn_like()?); 17 | Ok(()) 18 | } 19 | ``` 20 | ## Backend Support 21 | | Backend | Supported | 22 | |---------|-----------| 23 | | CPU | ✅ | 24 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/sqrt.md: -------------------------------------------------------------------------------- 1 | # sqrt 2 | ```rust 3 | sqrt(x: &Tensor<T>) -> Result<Tensor<C>, TensorError> 4 | ``` 5 | Compute $\sqrt{x}$ for all elements 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | 10 | ## Returns: 11 | Tensor with type `C` 12 | 13 | ## Examples: 14 | ```rust 15 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 16 | 17 | fn main() -> Result<(), TensorError> { 18 | let a = Tensor::<f32>::new([4.0]); 19 | let b = a.sqrt()?; 20 | println!("{}", b); 21 | Ok(()) 22 | } 23 | ``` 24 | ## Backend Support 25 | | Backend | Supported | 26 | |---------|-----------| 27 | | CPU | ✅ | 28 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-types/src/into_scalar.rs: -------------------------------------------------------------------------------- 1 | use crate::convertion::Convertor; 2 | use half::{bf16, f16}; 3 | use hpt_macros::impl_into_scalar; 4 | use num_complex::{Complex32, Complex64}; 5 | /// A trait for converting a scalar into another scalar type. 6 | pub trait Cast<T> { 7 | /// Convert the scalar into another scalar type.
8 | fn cast(self) -> T; 9 | } 10 | 11 | impl_into_scalar!(); 12 | 13 | #[cfg(feature = "cuda")] 14 | mod cud_impl { 15 | use super::*; 16 | use crate::cuda_types::convertion::CudaConvertor; 17 | use crate::cuda_types::scalar::Scalar; 18 | use hpt_macros::impl_into_cuda_scalar; 19 | impl_into_cuda_scalar!(); 20 | } 21 | -------------------------------------------------------------------------------- /docs/user_guide/unary/recip.md: -------------------------------------------------------------------------------- 1 | # recip 2 | ```rust 3 | recip(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large \frac{1}{x}$ for all elements 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | 10 | ## Returns: 11 | Tensor with type `C` 12 | 13 | ## Examples: 14 | ```rust 15 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 16 | 17 | fn main() -> Result<(), TensorError> { 18 | let a = Tensor::::new([2.0]); 19 | let b = a.recip()?; 20 | println!("{}", b); 21 | Ok(()) 22 | } 23 | ``` 24 | ## Backend Support 25 | | Backend | Supported | 26 | |---------|-----------| 27 | | CPU | ✅ | 28 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/log10.md: -------------------------------------------------------------------------------- 1 | # log10 2 | ```rust 3 | log10(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large \log_{10}(x)$ for all elements 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | 10 | ## Returns: 11 | Tensor with type `C` 12 | 13 | ## Examples: 14 | ```rust 15 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 16 | 17 | fn main() -> Result<(), TensorError> { 18 | let a = Tensor::::new([100.0]); 19 | let b = a.log10()?; 20 | println!("{}", b); 21 | Ok(()) 22 | } 23 | ``` 24 | ## Backend Support 25 | | Backend | Supported | 26 | |---------|-----------| 27 | | CPU | ✅ | 28 | | Cuda | ✅ | -------------------------------------------------------------------------------- 
/docs/user_guide/unary/softplus.md: -------------------------------------------------------------------------------- 1 | # softplus 2 | ```rust 3 | softplus(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\ln(1 + e^x)$ for all elements 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | 10 | ## Returns: 11 | Tensor with type `C` 12 | 13 | ## Examples: 14 | ```rust 15 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 16 | 17 | fn main() -> Result<(), TensorError> { 18 | let a = Tensor::::new([2.0]); 19 | let b = a.softplus()?; 20 | println!("{}", b); 21 | Ok(()) 22 | } 23 | ``` 24 | ## Backend Support 25 | | Backend | Supported | 26 | |---------|-----------| 27 | | CPU | ✅ | 28 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-common/src/error/onnx.rs: -------------------------------------------------------------------------------- 1 | use std::panic::Location; 2 | 3 | use thiserror::Error; 4 | 5 | /// Onnx-related errors 6 | #[derive(Debug, Error)] 7 | pub enum OnnxError { 8 | /// Onnx error 9 | #[error("Onnx error: {message} at {location}")] 10 | Any { 11 | /// Message describing the error 12 | message: String, 13 | /// Location where the error occurred 14 | location: &'static Location<'static>, 15 | }, 16 | } 17 | 18 | impl OnnxError { 19 | /// Create a new Onnx error 20 | #[track_caller] 21 | pub fn new(message: String) -> Self { 22 | Self::Any { message, location: Location::caller() } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /docs/user_guide/unary/mish.md: -------------------------------------------------------------------------------- 1 | # mish 2 | ```rust 3 | mish(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large x * \tanh(\ln(1 + e^x))$ for all elements 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | 10 | ## Returns: 11 | Tensor with type `C` 12 | 13 | ## Examples: 14 | ```rust 15 | use hpt::{ops::FloatUnaryOps, 
Tensor, error::TensorError}; 16 | 17 | fn main() -> Result<(), TensorError> { 18 | let a = Tensor::::new([2.0]); 19 | let b = a.mish()?; 20 | println!("{}", b); 21 | Ok(()) 22 | } 23 | ``` 24 | ## Backend Support 25 | | Backend | Supported | 26 | |---------|-----------| 27 | | CPU | ✅ | 28 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/cbrt.md: -------------------------------------------------------------------------------- 1 | # cbrt 2 | ```rust 3 | cbrt(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large \sqrt[3]{x}$ for all elements 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | 10 | ## Returns: 11 | Tensor with type `C` 12 | 13 | ## Examples: 14 | ```rust 15 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 16 | 17 | fn main() -> Result<(), TensorError> { 18 | let a = Tensor::::new([8.0]); 19 | let b = a.cbrt()?; 20 | println!("{}", b); // prints: 2.0 21 | Ok(()) 22 | } 23 | ``` 24 | ## Backend Support 25 | | Backend | Supported | 26 | |---------|-----------| 27 | | CPU | ✅ | 28 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/ln.md: -------------------------------------------------------------------------------- 1 | # ln 2 | ```rust 3 | ln(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large \ln(x)$ for all elements 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | 10 | ## Returns: 11 | Tensor with type `C` 12 | 13 | ## Examples: 14 | ```rust 15 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 16 | 17 | fn main() -> Result<(), TensorError> { 18 | let a = Tensor::::new([2.718281828459045]); 19 | let b = a.ln()?; 20 | println!("{}", b); // prints: 1.0 21 | Ok(()) 22 | } 23 | ``` 24 | ## Backend Support 25 | | Backend | Supported | 26 | |---------|-----------| 27 | | CPU | ✅ | 28 | | Cuda | ✅ | -------------------------------------------------------------------------------- 
/docs/user_guide/unary/log2.md: -------------------------------------------------------------------------------- 1 | # log2 2 | ```rust 3 | log2(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large \log_{2}(x)$ for all elements 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | 10 | ## Returns: 11 | Tensor with type `C` 12 | 13 | ## Examples: 14 | ```rust 15 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 16 | 17 | fn main() -> Result<(), TensorError> { 18 | let a = Tensor::::new([8.0]); 19 | let b = a.log2()?; 20 | println!("{}", b); // prints: 3.0 21 | Ok(()) 22 | } 23 | ``` 24 | ## Backend Support 25 | | Backend | Supported | 26 | |---------|-----------| 27 | | CPU | ✅ | 28 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-codegen/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hpt-codegen" 3 | version = "0.1.2" 4 | edition = "2021" 5 | license = "MIT OR Apache-2.0" 6 | 7 | [dependencies] 8 | syn = { version = "2.0.82", default-features = false, features = [ 9 | "derive", 10 | "full", 11 | "visit", 12 | "visit-mut", 13 | "extra-traits" 14 | ] } 15 | quote = { workspace = true } 16 | regex = { workspace = true } 17 | proc-macro2 = { workspace = true } 18 | petgraph = { workspace = true } 19 | thiserror = { workspace = true } 20 | anyhow = { workspace = true } 21 | 22 | [lib] 23 | proc-macro = true 24 | 25 | [package.metadata.rust-analyzer] 26 | rustc_private = true 27 | proc_macro_srv = true 28 | -------------------------------------------------------------------------------- /docs/user_guide/unary/softsign.md: -------------------------------------------------------------------------------- 1 | # softsign 2 | ```rust 3 | softsign(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large \frac{x}{1 + |x|}$ for all elements 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | 10 | ## Returns: 11 | Tensor with type `C` 12 
| 13 | ## Examples: 14 | ```rust 15 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 16 | 17 | fn main() -> Result<(), TensorError> { 18 | let a = Tensor::::new([2.0]); 19 | let b = a.softsign()?; 20 | println!("{}", b); 21 | Ok(()) 22 | } 23 | ``` 24 | ## Backend Support 25 | | Backend | Supported | 26 | |---------|-----------| 27 | | CPU | ✅ | 28 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-cudakernels/src/reduce/nansum.cu: -------------------------------------------------------------------------------- 1 | #include "declare_macros.cuh" 2 | #include "reduce_classes.cuh" 3 | 4 | DECLARE_KERNEL(bool, bool, nansum, NanSum) 5 | DECLARE_KERNEL(u8, u8, nansum, NanSum) 6 | DECLARE_KERNEL(u16, u16, nansum, NanSum) 7 | DECLARE_KERNEL(u32, u32, nansum, NanSum) 8 | DECLARE_KERNEL(u64, u64, nansum, NanSum) 9 | DECLARE_KERNEL(i8, i8, nansum, NanSum) 10 | DECLARE_KERNEL(i16, i16, nansum, NanSum) 11 | DECLARE_KERNEL(i32, i32, nansum, NanSum) 12 | DECLARE_KERNEL(i64, i64, nansum, NanSum) 13 | DECLARE_KERNEL(f32, f32, nansum, NanSum) 14 | DECLARE_KERNEL(f64, f64, nansum, NanSum) 15 | DECLARE_KERNEL(f16, f16, nansum, NanSum) 16 | DECLARE_KERNEL(bf16, bf16, nansum, NanSum) 17 | -------------------------------------------------------------------------------- /hpt-traits/src/ops/slice.rs: -------------------------------------------------------------------------------- 1 | use hpt_common::error::base::TensorError; 2 | 3 | /// trait for slicing tensor 4 | pub trait Slice: Sized { 5 | /// Create a new Tensor by slicing an existing Tensor. Slicing allows you to extract a portion of a tensor using index ranges for each dimension. 
6 | /// 7 | /// ## Parameters: 8 | /// `index`: `(start, end, step)`: Select from start to end with step 9 | /// 10 | /// ## Example: 11 | /// ```rust 12 | /// let a = Tensor::::arange(0, 16)?.reshape(&[4, 4])?; 13 | /// let b = a.slice(&[(1, 3, 1), (0, 4, 1)])?; 14 | /// ``` 15 | fn slice(&self, index: &[(i64, i64, i64)]) -> Result; 16 | } 17 | -------------------------------------------------------------------------------- /docs/dev_guide/pointer/pointer.md: -------------------------------------------------------------------------------- 1 | ### Pointer 2 | 3 | Hpt is using multi threading across the whole operators implementation. However, raw pointer can't be send to threads safely. 4 | 5 | So we created a [wrapper](https://github.com/Jianqoq/Hpt/blob/d9a51874b3447d562b7c9d043b50eb05259b78c4/hpt-common/src/utils/pointer.rs#L11) for pointer. 6 | 7 | In the whole project, almost all the parallel iteration are using [wrapper](https://github.com/Jianqoq/Hpt/blob/d9a51874b3447d562b7c9d043b50eb05259b78c4/hpt-common/src/utils/pointer.rs#L11) to read and write data. 8 | 9 | You may notice there is a `bound_check` feature, however, this feature is not fully tested and may not reliable. This feature need to stablize. -------------------------------------------------------------------------------- /docs/user_guide/random/randn.md: -------------------------------------------------------------------------------- 1 | # randn 2 | ```rust 3 | randn(shape: &[i64] | &Vec | &[i64; _]) -> Result, TensorError> 4 | ``` 5 | create a Tensor with data in normal distribution. `mean = 0.0`, `std_dev = 1.0`. 
6 | ## Parameters: 7 | `shape`: shape of the output 8 | ## Returns: 9 | Tensor with type `T` 10 | ## Examples: 11 | ```rust 12 | use hpt::{error::TensorError, ops::Random, Tensor}; 13 | 14 | fn main() -> Result<(), TensorError> { 15 | let a = Tensor::::randn([10, 10])?; 16 | println!("{}", a); 17 | Ok(()) 18 | } 19 | ``` 20 | ## Backend Support 21 | | Backend | Supported | 22 | |---------|-----------| 23 | | CPU | ✅ | 24 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-allocator/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hpt-allocator" 3 | version = "0.1.2" 4 | edition = "2021" 5 | license = "MIT OR Apache-2.0" 6 | description = "An internal library for memory allocator for hpt" 7 | authors = ["JianJian Li "] 8 | repository = "https://github.com/Jianqoq/Hpt" 9 | 10 | [dependencies] 11 | lru = { workspace = true } 12 | once_cell = { workspace = true } 13 | lazy_static = { workspace = true } 14 | ctor = { workspace = true } 15 | cudarc = { workspace = true, optional = true } 16 | dashmap = { workspace = true } 17 | hpt-common = { path = "../hpt-common", version = "0.1.1" } 18 | 19 | [features] 20 | cuda = ["cudarc"] 21 | track_caller = [] 22 | -------------------------------------------------------------------------------- /docs/user_guide/unary/sigmoid.md: -------------------------------------------------------------------------------- 1 | # sigmoid 2 | ```rust 3 | sigmoid(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large \frac{1}{1 + e^{-x}}$ for all elements 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | 10 | ## Returns: 11 | Tensor with type `C` 12 | 13 | ## Examples: 14 | ```rust 15 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 16 | 17 | fn main() -> Result<(), TensorError> { 18 | let a = Tensor::::new([2.0]); 19 | let b = a.sigmoid()?; 20 | println!("{}", b); // prints: 0.8807971 21 | Ok(()) 22 | } 23 | 
``` 24 | ## Backend Support 25 | | Backend | Supported | 26 | |---------|-----------| 27 | | CPU | ✅ | 28 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-cudakernels/src/reduce/mean.cu: -------------------------------------------------------------------------------- 1 | #include "declare_macros.cuh" 2 | #include "../utils/type_cast.cuh" 3 | #include "../utils/check_type.cuh" 4 | #include "reduce_classes.cuh" 5 | 6 | DECLARE_KERNEL(bool, bool, mean, Mean) 7 | DECLARE_KERNEL(u8, u8, mean, Mean) 8 | DECLARE_KERNEL(u16, u16, mean, Mean) 9 | DECLARE_KERNEL(u32, u32, mean, Mean) 10 | DECLARE_KERNEL(u64, u64, mean, Mean) 11 | DECLARE_KERNEL(i8, i8, mean, Mean) 12 | DECLARE_KERNEL(i16, i16, mean, Mean) 13 | DECLARE_KERNEL(i32, i32, mean, Mean) 14 | DECLARE_KERNEL(i64, i64, mean, Mean) 15 | DECLARE_KERNEL(f32, f32, mean, Mean) 16 | DECLARE_KERNEL(f64, f64, mean, Mean) 17 | DECLARE_KERNEL(f16, f16, mean, Mean) 18 | DECLARE_KERNEL(bf16, bf16, mean, Mean) 19 | -------------------------------------------------------------------------------- /hpt-cudakernels/src/reduce/nanprod.cu: -------------------------------------------------------------------------------- 1 | #include "declare_macros.cuh" 2 | #include "reduce_classes.cuh" 3 | 4 | DECLARE_KERNEL(bool, bool, nanprod, NanProd) 5 | DECLARE_KERNEL(u8, u8, nanprod, NanProd) 6 | DECLARE_KERNEL(u16, u16, nanprod, NanProd) 7 | DECLARE_KERNEL(u32, u32, nanprod, NanProd) 8 | DECLARE_KERNEL(u64, u64, nanprod, NanProd) 9 | DECLARE_KERNEL(i8, i8, nanprod, NanProd) 10 | DECLARE_KERNEL(i16, i16, nanprod, NanProd) 11 | DECLARE_KERNEL(i32, i32, nanprod, NanProd) 12 | DECLARE_KERNEL(i64, i64, nanprod, NanProd) 13 | DECLARE_KERNEL(f32, f32, nanprod, NanProd) 14 | DECLARE_KERNEL(f64, f64, nanprod, NanProd) 15 | DECLARE_KERNEL(f16, f16, nanprod, NanProd) 16 | DECLARE_KERNEL(bf16, bf16, nanprod, NanProd) 17 | -------------------------------------------------------------------------------- 
/hpt-dyn/src/ops/models/onnx.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | 3 | use crate::utils::onnx::operators::Operator; 4 | use crate::Tensor; 5 | use crate::onnx::ModelProto; 6 | 7 | #[derive(Debug)] 8 | pub(crate) struct Meta { 9 | pub(crate) permute: Option>, 10 | } 11 | 12 | #[derive(Debug)] 13 | pub struct Initialized { 14 | pub(crate) model: ModelProto, 15 | pub(crate) initializer_map: HashMap, 16 | pub(crate) permutes: HashMap, 17 | pub(crate) operators: Vec, 18 | pub(crate) node_degree: HashMap, 19 | } 20 | 21 | #[derive(Debug)] 22 | pub enum OnnxModel { 23 | Model(ModelProto), 24 | Initialized(Initialized), 25 | } 26 | -------------------------------------------------------------------------------- /docs/user_guide/unary/exp10.md: -------------------------------------------------------------------------------- 1 | # exp10 2 | ```rust 3 | exp10(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Compute 10 raised to the power of `x` for all elements 6 | 7 | ## Parameters: 8 | `x`: Input values (exponents) 9 | 10 | ## Returns: 11 | Tensor with type `C` 12 | 13 | ## Examples: 14 | ```rust 15 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 16 | 17 | fn main() -> Result<(), TensorError> { 18 | let a = Tensor::::new([2.0]); // 10^2 19 | let b = a.exp10()?; 20 | println!("{}", b); // prints: 100.0 21 | Ok(()) 22 | } 23 | ``` 24 | ## Backend Support 25 | | Backend | Supported | 26 | |---------|-----------| 27 | | CPU | ✅ | 28 | | Cuda | ✅ | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_128bit/common/i64x2.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_feature = "neon")] 3 | use std::arch::aarch64::*; 4 | #[cfg(target_arch = "x86_64")] 5 | use std::arch::x86_64::*; 6 | use std::ops::Index; 7 | 8 | /// a vector of 2 i64 values 9 | #[allow(non_camel_case_types)] 
10 | #[derive(Clone, Copy, Debug)] 11 | #[repr(C, align(16))] 12 | pub struct i64x2( 13 | #[cfg(target_arch = "x86_64")] pub(crate) __m128i, 14 | #[cfg(target_arch = "aarch64")] pub(crate) int64x2_t, 15 | ); 16 | 17 | impl Index for i64x2 { 18 | type Output = i64; 19 | fn index(&self, index: usize) -> &Self::Output { 20 | let ptr = self as *const _ as *const i64; 21 | unsafe { &*ptr.add(index) } 22 | } 23 | } -------------------------------------------------------------------------------- /matconv_simd/src/simd/_128bit/common/u8x16.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_arch = "aarch64")] 3 | use std::arch::aarch64::*; 4 | #[cfg(target_arch = "x86_64")] 5 | use std::arch::x86_64::*; 6 | use std::ops::Index; 7 | 8 | /// a vector of 16 u8 values 9 | #[allow(non_camel_case_types)] 10 | #[derive(Clone, Copy, Debug)] 11 | #[repr(C, align(16))] 12 | pub struct u8x16( 13 | #[cfg(target_arch = "x86_64")] pub(crate) __m128i, 14 | #[cfg(target_arch = "aarch64")] pub(crate) uint8x16_t, 15 | ); 16 | 17 | impl Index for u8x16 { 18 | type Output = u8; 19 | fn index(&self, index: usize) -> &Self::Output { 20 | let ptr = self as *const _ as *const u8; 21 | unsafe { &*ptr.add(index) } 22 | } 23 | } -------------------------------------------------------------------------------- /matconv_simd/src/simd/_128bit/common/u64x2.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_arch = "x86_64")] 3 | use std::arch::x86_64::*; 4 | 5 | #[cfg(target_arch = "aarch64")] 6 | use std::arch::aarch64::*; 7 | use std::ops::Index; 8 | 9 | /// a vector of 2 u64 values 10 | #[allow(non_camel_case_types)] 11 | #[derive(Clone, Copy, Debug)] 12 | #[repr(C, align(16))] 13 | pub struct u64x2( 14 | #[cfg(target_arch = "x86_64")] pub(crate) __m128i, 15 | #[cfg(target_arch = "aarch64")] pub(crate) uint64x2_t, 16 | ); 17 | 18 | impl Index for u64x2 { 19 | type Output = u64; 20 | fn 
index(&self, index: usize) -> &Self::Output { 21 | let ptr = self as *const _ as *const u64; 22 | unsafe { &*ptr.add(index) } 23 | } 24 | } -------------------------------------------------------------------------------- /docs/user_guide/unary/erf.md: -------------------------------------------------------------------------------- 1 | # erf 2 | ```rust 3 | erf(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large \text{erf}(x) = \frac{2}{\sqrt{\pi}} \int_0^x e^{-t^2} dt$ for all elements 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | 10 | ## Returns: 11 | Tensor with type `C` 12 | 13 | ## Examples: 14 | ```rust 15 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 16 | 17 | fn main() -> Result<(), TensorError> { 18 | let a = Tensor::::new([1.0]); 19 | let b = a.erf()?; 20 | println!("{}", b); // prints: 0.8427008 21 | Ok(()) 22 | } 23 | ``` 24 | ## Backend Support 25 | | Backend | Supported | 26 | |---------|-----------| 27 | | CPU | ✅ | 28 | | Cuda | ✅ | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_128bit/common/i8x16.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_arch = "x86_64")] 3 | use std::arch::x86_64::*; 4 | 5 | #[cfg(target_arch = "aarch64")] 6 | use std::arch::aarch64::*; 7 | use std::ops::Index; 8 | 9 | /// a vector of 16 i8 values 10 | #[allow(non_camel_case_types)] 11 | #[derive(Clone, Copy, Debug)] 12 | #[repr(C, align(16))] 13 | pub struct i8x16( 14 | #[cfg(target_arch = "x86_64")] pub(crate) __m128i, 15 | #[cfg(target_arch = "aarch64")] pub(crate) int8x16_t, 16 | ); 17 | 18 | impl Index for i8x16 { 19 | type Output = i8; 20 | fn index(&self, index: usize) -> &Self::Output { 21 | let ptr = self as *const _ as *const i8; 22 | unsafe { &*ptr.add(index) } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /hpt-cudakernels/Cargo.toml: 
-------------------------------------------------------------------------------- 1 | [package] 2 | name = "hpt-cudakernels" 3 | version = "0.1.3" 4 | edition = "2021" 5 | description = "A library implements cuda kernels for hpt" 6 | authors = ["JianJian Li "] 7 | repository = "https://github.com/Jianqoq/Hpt" 8 | license = "MIT OR Apache-2.0" 9 | 10 | [dependencies] 11 | phf = { workspace = true } 12 | 13 | [build-dependencies] 14 | phf = { workspace = true } 15 | phf_codegen = { workspace = true } 16 | regex = { workspace = true } 17 | walkdir = { workspace = true } 18 | rayon = { workspace = true } 19 | num_cpus = { workspace = true } 20 | 21 | [lib] 22 | path = "src/lib.rs" 23 | required-features = ["cuda"] 24 | 25 | [features] 26 | default = [] 27 | cuda = [] 28 | -------------------------------------------------------------------------------- /hpt-types/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hpt-types" 3 | version = "0.1.3" 4 | edition = "2021" 5 | license = "MIT OR Apache-2.0" 6 | description = "An internal library define primitive types functions for hpt" 7 | authors = ["JianJian Li "] 8 | repository = "https://github.com/Jianqoq/Hpt" 9 | 10 | [dependencies] 11 | hpt-macros = { path = "../hpt-macros", version = "0.1.1" } 12 | num-complex = { workspace = true } 13 | half = { workspace = true } 14 | paste = { workspace = true } 15 | num-traits = { workspace = true } 16 | libm = { workspace = true } 17 | serde = { workspace = true } 18 | duplicate = { workspace = true } 19 | [features] 20 | cuda = [] 21 | default = ["normal_promote"] 22 | normal_promote = [] 23 | -------------------------------------------------------------------------------- /hpt-allocator/src/utils/cache_resize.rs: -------------------------------------------------------------------------------- 1 | use std::num::NonZeroUsize; 2 | 3 | use lru::LruCache; 4 | 5 | use crate::ptr::SafePtr; 6 | 7 | pub fn 
resize_lru_cache( 8 | cache: &mut LruCache>, 9 | deallocate_fn: impl Fn(*mut u8, std::alloc::Layout), 10 | new_size: usize, 11 | ) { 12 | if cache.cap().get() <= new_size { 13 | cache.resize(NonZeroUsize::new(new_size).unwrap()); 14 | } else { 15 | let new = LruCache::new(NonZeroUsize::new(new_size).unwrap()); 16 | for (layout, ptrs) in cache.iter() { 17 | for safe_ptr in ptrs { 18 | deallocate_fn(safe_ptr.ptr, *layout); 19 | } 20 | } 21 | *cache = new; 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /docs/dev_guide/adding_new_op.md: -------------------------------------------------------------------------------- 1 | # How to add new Tensor operator 2 | 3 | ### Things to know 4 | Tensor operators are define in `tensor-traits` crate. If you want to implement new Tensor operator, you may need to create a new trait if there is no suitable trait for the new operator. 5 | 6 | ### How 7 | 2. Implement the trait method at [here](https://github.com/Jianqoq/Hpt/tree/main/hpt/src/backends) based on what backend you want to implement, mostly, you should implement for all backends. 8 | 9 | 3. Ensure performance is ideal by comparing with other frameworks 10 | 11 | 4. Write test cases at [here](https://github.com/Jianqoq/Hpt/tree/main/hpt-tests/src/hpt). Make sure to follow the Dev Guide test cases rules. 12 | 13 | 5. 
commit and make a pull request -------------------------------------------------------------------------------- /docs/user_guide/unary/sincos.md: -------------------------------------------------------------------------------- 1 | # sincos 2 | ```rust 3 | sincos(x: &Tensor) -> Result<(Tensor, Tensor), TensorError> 4 | ``` 5 | Simultaneously computes sine and cosine of the input tensor 6 | 7 | ## Parameters: 8 | `x`: Angle(radians) 9 | 10 | ## Returns: 11 | Tuple of two tensors (sine, cosine) with type `C` 12 | 13 | ## Examples: 14 | ```rust 15 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 16 | 17 | fn main() -> Result<(), TensorError> { 18 | let a = Tensor::::new([10.0]); 19 | let (sin, cos) = a.sincos()?; 20 | println!("sin: {}, cos: {}", sin, cos); 21 | Ok(()) 22 | } 23 | ``` 24 | ## Backend Support 25 | | Backend | Supported | 26 | |---------|-----------| 27 | | CPU | ✅ | 28 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-tests/src/hpt_common/err_handler.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused_imports)] 2 | use hpt_common::error::shape::ShapeError; 3 | 4 | #[test] 5 | fn test_check_ndim_match() { 6 | ShapeError::check_dim(2, 2).unwrap(); 7 | } 8 | 9 | #[test] 10 | fn test_check_ndim_match_err() { 11 | assert!(ShapeError::check_dim(2, 3) 12 | .unwrap_err() 13 | .to_string() 14 | .contains("Dimension mismatch: expected 2, got 3")); 15 | } 16 | 17 | #[test] 18 | fn test_size_match() { 19 | ShapeError::check_size_match(2, 2).unwrap(); 20 | } 21 | 22 | #[test] 23 | fn test_size_match_err() { 24 | assert!(ShapeError::check_size_match(2, 3) 25 | .unwrap_err() 26 | .to_string() 27 | .contains("Size mismatch: expected 2, got 3")); 28 | } 29 | -------------------------------------------------------------------------------- /hpt-cudakernels/src/reduce/sum_square.cu: 
-------------------------------------------------------------------------------- 1 | #include "declare_macros.cuh" 2 | #include "reduce_classes.cuh" 3 | 4 | DECLARE_KERNEL(bool, bool, sumsquare, SumSquare) 5 | DECLARE_KERNEL(u8, u8, sumsquare, SumSquare) 6 | DECLARE_KERNEL(u16, u16, sumsquare, SumSquare) 7 | DECLARE_KERNEL(u32, u32, sumsquare, SumSquare) 8 | DECLARE_KERNEL(u64, u64, sumsquare, SumSquare) 9 | DECLARE_KERNEL(i8, i8, sumsquare, SumSquare) 10 | DECLARE_KERNEL(i16, i16, sumsquare, SumSquare) 11 | DECLARE_KERNEL(i32, i32, sumsquare, SumSquare) 12 | DECLARE_KERNEL(i64, i64, sumsquare, SumSquare) 13 | DECLARE_KERNEL(f32, f32, sumsquare, SumSquare) 14 | DECLARE_KERNEL(f64, f64, sumsquare, SumSquare) 15 | DECLARE_KERNEL(f16, f16, sumsquare, SumSquare) 16 | DECLARE_KERNEL(bf16, bf16, sumsquare, SumSquare) 17 | -------------------------------------------------------------------------------- /docs/user_guide/associated_methods/cpu/to_cuda.md: -------------------------------------------------------------------------------- 1 | # to_cuda 2 | ```rust 3 | fn to_cuda(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Transfers a tensor from CPU memory to CUDA GPU memory, creating a new tensor on the specified CUDA device. 6 | 7 | ## Parameters: 8 | `DEVICE_ID`: A compile-time constant specifying the target CUDA device ID (default is 0) 9 | 10 | ## Returns: 11 | A new `Tensor` located on the specified CUDA device, or a TensorError if the transfer fails. 
12 | 13 | ## Examples: 14 | ```rust 15 | use hpt::{error::TensorError, Tensor}; 16 | 17 | fn main() -> Result<(), TensorError> { 18 | let a = Tensor::::new([1.5, 2.7, 3.2]).to_cuda::<0>()?; 19 | println!("{}", a); 20 | Ok(()) 21 | } 22 | ``` -------------------------------------------------------------------------------- /hpt/src/backends/cpu/tensor_external/cumulative.rs: -------------------------------------------------------------------------------- 1 | use crate::Tensor; 2 | use hpt_allocator::{ 3 | traits::{Allocator, AllocatorOutputRetrive}, 4 | Cpu, 5 | }; 6 | use hpt_common::error::base::TensorError; 7 | use hpt_traits::{ops::cumulative::CumulativeOps, tensor::CommonBounds}; 8 | 9 | impl CumulativeOps for Tensor 10 | where 11 | Al: Allocator, 12 | Al::Output: AllocatorOutputRetrive, 13 | { 14 | fn cumsum>>(&self, axis: A) -> Result { 15 | Ok(self.inner.cumsum(axis)?.into()) 16 | } 17 | fn cumprod>>(&self, axis: A) -> Result { 18 | Ok(self.inner.cumprod(axis)?.into()) 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "hpt-docs", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "index.js", 6 | "scripts": { 7 | "docs:dev": "vuepress dev docs", 8 | "docs:build": "vuepress build docs" 9 | }, 10 | "keywords": [], 11 | "author": "", 12 | "license": "ISC", 13 | "devDependencies": { 14 | "@vuepress/bundler-vite": "^2.0.0-rc.19", 15 | "@vuepress/plugin-markdown-math": "^2.0.0-rc.73", 16 | "@vuepress/theme-default": "^2.0.0-rc.73", 17 | "chart.js": "^4.4.9", 18 | "katex": "^0.16.21", 19 | "mermaid": "^11.4.1", 20 | "sass-embedded": "^1.83.4", 21 | "vuepress": "^2.0.0-rc.19", 22 | "vuepress-plugin-md-enhance": "^2.0.0-rc.71", 23 | "vuepress-plugin-mermaidjs": "^2.0.0-beta.2" 24 | } 25 | } 26 | -------------------------------------------------------------------------------- 
/docs/user_guide/binary/add.md: -------------------------------------------------------------------------------- 1 | # add 2 | ```rust 3 | std::ops::Add::add( 4 | x: &Tensor | Tensor | scalar, 5 | y: &Tensor | Tensor | scalar 6 | ) -> Result, TensorError> 7 | ``` 8 | Compute $\large x + y$ for all elements 9 | 10 | ## Parameters: 11 | `x`: First input tensor 12 | 13 | `y`: Second input tensor 14 | 15 | ## Returns: 16 | Tensor with type `C` 17 | 18 | ## Examples: 19 | ```rust 20 | use hpt::{Tensor, error::TensorError}; 21 | 22 | fn main() -> Result<(), TensorError> { 23 | let a = Tensor::::new([10.0]); 24 | let b = &a + &a; 25 | println!("{}", b); 26 | Ok(()) 27 | } 28 | ``` 29 | ## Backend Support 30 | | Backend | Supported | 31 | |---------|-----------| 32 | | CPU | ✅ | 33 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/binary/div.md: -------------------------------------------------------------------------------- 1 | # div 2 | ```rust 3 | std::ops::Div::div( 4 | x: &Tensor | Tensor | scalar, 5 | y: &Tensor | Tensor | scalar 6 | ) -> Result, TensorError> 7 | ``` 8 | Compute $\large x / y$ for all elements 9 | 10 | ## Parameters: 11 | `x`: First input tensor 12 | 13 | `y`: Second input tensor 14 | 15 | ## Returns: 16 | Tensor with type `C` 17 | 18 | ## Examples: 19 | ```rust 20 | use hpt::{Tensor, error::TensorError}; 21 | 22 | fn main() -> Result<(), TensorError> { 23 | let a = Tensor::::new([10.0]); 24 | let b = &a / &a; 25 | println!("{}", b); 26 | Ok(()) 27 | } 28 | ``` 29 | ## Backend Support 30 | | Backend | Supported | 31 | |---------|-----------| 32 | | CPU | ✅ | 33 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/binary/mul.md: -------------------------------------------------------------------------------- 1 | # mul 2 | ```rust 3 | std::ops::Mul::mul( 4 | x: &Tensor | Tensor | scalar, 5 | y: &Tensor | Tensor | scalar 6 | ) -> 
Result, TensorError> 7 | ``` 8 | Compute $\large x * y$ for all elements 9 | 10 | ## Parameters: 11 | `x`: First input tensor 12 | 13 | `y`: Second input tensor 14 | 15 | ## Returns: 16 | Tensor with type `C` 17 | 18 | ## Examples: 19 | ```rust 20 | use hpt::{Tensor, error::TensorError}; 21 | 22 | fn main() -> Result<(), TensorError> { 23 | let a = Tensor::::new([10.0]); 24 | let b = &a * &a; 25 | println!("{}", b); 26 | Ok(()) 27 | } 28 | ``` 29 | ## Backend Support 30 | | Backend | Supported | 31 | |---------|-----------| 32 | | CPU | ✅ | 33 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/binary/rem.md: -------------------------------------------------------------------------------- 1 | # rem 2 | ```rust 3 | std::ops::Rem::rem( 4 | x: &Tensor | Tensor | scalar, 5 | y: &Tensor | Tensor | scalar 6 | ) -> Result, TensorError> 7 | ``` 8 | Compute $\large x mod y$ for all elements 9 | 10 | ## Parameters: 11 | `x`: First input tensor 12 | 13 | `y`: Second input tensor 14 | 15 | ## Returns: 16 | Tensor with type `C` 17 | 18 | ## Examples: 19 | ```rust 20 | use hpt::{Tensor, error::TensorError}; 21 | 22 | fn main() -> Result<(), TensorError> { 23 | let a = Tensor::::new([10.0]); 24 | let b = &a % &a; 25 | println!("{}", b); 26 | Ok(()) 27 | } 28 | ``` 29 | ## Backend Support 30 | | Backend | Supported | 31 | |---------|-----------| 32 | | CPU | ✅ | 33 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/binary/sub.md: -------------------------------------------------------------------------------- 1 | # sub 2 | ```rust 3 | std::ops::Sub::sub( 4 | x: &Tensor | Tensor | scalar, 5 | y: &Tensor | Tensor | scalar 6 | ) -> Result, TensorError> 7 | ``` 8 | Compute $\large x - y$ for all elements 9 | 10 | ## Parameters: 11 | `x`: First input tensor 12 | 13 | `y`: Second input tensor 14 | 15 | ## Returns: 16 | Tensor with type `C` 17 | 18 | ## Examples: 19 
| ```rust 20 | use hpt::{Tensor, error::TensorError}; 21 | 22 | fn main() -> Result<(), TensorError> { 23 | let a = Tensor::::new([10.0]); 24 | let b = &a - &a; 25 | println!("{}", b); 26 | Ok(()) 27 | } 28 | ``` 29 | ## Backend Support 30 | | Backend | Supported | 31 | |---------|-----------| 32 | | CPU | ✅ | 33 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/gelu.md: -------------------------------------------------------------------------------- 1 | # gelu 2 | ```rust 3 | gelu(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large x \cdot \Phi(x)$ where $\Phi(x)$ is the cumulative distribution function of the standard normal distribution for all elements 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | 10 | ## Returns: 11 | Tensor with type `C` 12 | 13 | ## Examples: 14 | ```rust 15 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 16 | 17 | fn main() -> Result<(), TensorError> { 18 | let a = Tensor::::new([2.0]); 19 | let b = a.gelu()?; 20 | println!("{}", b); // prints: 1.9545977 21 | Ok(()) 22 | } 23 | ``` 24 | ## Backend Support 25 | | Backend | Supported | 26 | |---------|-----------| 27 | | CPU | ✅ | 28 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-cudakernels/src/reduce/reducel1.cu: -------------------------------------------------------------------------------- 1 | #include "declare_macros.cuh" 2 | #include "../utils/type_cast.cuh" 3 | #include "reduce_classes.cuh" 4 | 5 | DECLARE_KERNEL(bool, bool, reducel1, ReduceL1) 6 | DECLARE_KERNEL(u8, u8, reducel1, ReduceL1) 7 | DECLARE_KERNEL(u16, u16, reducel1, ReduceL1) 8 | DECLARE_KERNEL(u32, u32, reducel1, ReduceL1) 9 | DECLARE_KERNEL(u64, u64, reducel1, ReduceL1) 10 | DECLARE_KERNEL(i8, i8, reducel1, ReduceL1) 11 | DECLARE_KERNEL(i16, i16, reducel1, ReduceL1) 12 | DECLARE_KERNEL(i32, i32, reducel1, ReduceL1) 13 | DECLARE_KERNEL(i64, i64, reducel1, ReduceL1) 14 | 
DECLARE_KERNEL(f32, f32, reducel1, ReduceL1) 15 | DECLARE_KERNEL(f64, f64, reducel1, ReduceL1) 16 | DECLARE_KERNEL(f16, f16, reducel1, ReduceL1) 17 | DECLARE_KERNEL(bf16, bf16, reducel1, ReduceL1) 18 | -------------------------------------------------------------------------------- /docs/dev_guide/test_rules.md: -------------------------------------------------------------------------------- 1 | ### Test cases 2 | 3 | All the operators implemented must be tested in 4 cases. 4 | 1. contiguous Tensor [see](https://github.com/Jianqoq/Hpt/blob/d9a51874b3447d562b7c9d043b50eb05259b78c4/hpt-tests/src/hpt/cpu/reduce.rs#L143) 5 | 2. uncontiguous Tensor [see](https://github.com/Jianqoq/Hpt/blob/d9a51874b3447d562b7c9d043b50eb05259b78c4/hpt-tests/src/hpt/cpu/reduce.rs#L198) 6 | 3. sliced contiguous Tensor [see](https://github.com/Jianqoq/Hpt/blob/d9a51874b3447d562b7c9d043b50eb05259b78c4/hpt-tests/src/hpt/cpu/reduce.rs#L241) 7 | 4. sliced uncontiguous Tensor [see](https://github.com/Jianqoq/Hpt/blob/d9a51874b3447d562b7c9d043b50eb05259b78c4/hpt-tests/src/hpt/cpu/reduce.rs#L333) 8 | 9 | Besides that, all the algorithm must be tested by using random input and random arguments. 
-------------------------------------------------------------------------------- /docs/user_guide/unary/cosh_.md: -------------------------------------------------------------------------------- 1 | # cosh_ 2 | ```rust 3 | cosh_( 4 | x: &Tensor, 5 | out: &mut Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | Hyperbolic cosine with out 9 | ## Parameters: 10 | `x`: Angle(radians) 11 | ## Returns: 12 | Tensor with type `C` 13 | ## Examples: 14 | ```rust 15 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 16 | 17 | fn main() -> Result<(), TensorError> { 18 | let a = Tensor::::new([10.0]); 19 | let b = a.cosh_(&mut a.clone())?; 20 | println!("{}", b); 21 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 22 | Ok(()) 23 | } 24 | ``` 25 | ## Backend Support 26 | | Backend | Supported | 27 | |---------|-----------| 28 | | CPU | ✅ | 29 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/custom_type/custom_type.md: -------------------------------------------------------------------------------- 1 | # Custom Type 2 | 3 | Since hpt is designed in purely generic types, the user can define their own type and can use custom type to do all the computation hpt supports. 4 | 5 | # How 6 | 7 | You can reference the steps at [here](https://github.com/Jianqoq/Hpt/tree/main/hpt-examples/examples/custom_type/main.rs). 8 | 9 | # Note 10 | 11 | For now, your custom type must implemented `Copy` trait. The reason why hpt doesn't support type with only `Clone` is because of the conv2d implementation issue. The conv2d used fixed size array to preallocate registers `[T; N]`, and this requires `T` implemented `Copy` trait. 
12 | 13 | ## Backend Support 14 | | Backend | Supported | 15 | |---------|-----------| 16 | | CPU | ✅ | 17 | | Cuda | ❌ | -------------------------------------------------------------------------------- /docs/user_guide/random/rand_like.md: -------------------------------------------------------------------------------- 1 | # rand_like 2 | ```rust 3 | rand_like( 4 | x: &Tensor, 5 | low: T, 6 | high: T 7 | ) -> Result, TensorError> 8 | ``` 9 | same as `rand` but the shape will be based on `x`. 10 | ## Parameters: 11 | `x`: input Tensor 12 | 13 | `low`: the lowest value 14 | 15 | `high`: the highest value 16 | ## Returns: 17 | Tensor with type `T` 18 | ## Examples: 19 | ```rust 20 | use hpt::{error::TensorError, ops::Random, Tensor}; 21 | 22 | fn main() -> Result<(), TensorError> { 23 | let a = Tensor::::rand([10, 10], 0.0, 10.0)?; 24 | println!("{}", a.rand_like(0.0, 10.0)?); 25 | Ok(()) 26 | } 27 | ``` 28 | ## Backend Support 29 | | Backend | Supported | 30 | |---------|-----------| 31 | | CPU | ✅ | 32 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/asin_.md: -------------------------------------------------------------------------------- 1 | # asin 2 | ```rust 3 | asin_( 4 | x: &Tensor, 5 | out: &mut Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | Inverse sine with out 9 | ## Parameters: 10 | `x`: Angle(radians) 11 | `out`: Tensor to write to 12 | ## Returns: 13 | Tensor with type `C` 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.asin_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | 
-------------------------------------------------------------------------------- /docs/user_guide/unary/elu.md: -------------------------------------------------------------------------------- 1 | # elu 2 | ```rust 3 | elu(x: &Tensor, alpha: C) -> Result, TensorError> 4 | ``` 5 | Compute $\large x$ for $x > 0$, $\large \alpha(e^x - 1)$ for $x \leq 0$ for all elements 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | `alpha`: Parameter controlling the saturation of negative values 10 | 11 | ## Returns: 12 | Tensor with type `C` 13 | 14 | ## Examples: 15 | ```rust 16 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([-1.0]); 20 | let b = a.elu(1.0)?; 21 | println!("{}", b); // prints: -0.6321206 22 | Ok(()) 23 | } 24 | ``` 25 | ## Backend Support 26 | | Backend | Supported | 27 | |---------|-----------| 28 | | CPU | ✅ | 29 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/hard_sigmoid.md: -------------------------------------------------------------------------------- 1 | # hard_sigmoid 2 | ```rust 3 | hard_sigmoid(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large \text{max}(0, \text{min}(1, \frac{x}{6} + 0.5))$ for all elements. A piece-wise linear approximation of the sigmoid function. 
6 | 7 | ## Parameters: 8 | `x`: Input values 9 | 10 | ## Returns: 11 | Tensor with type `C` 12 | 13 | ## Examples: 14 | ```rust 15 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 16 | 17 | fn main() -> Result<(), TensorError> { 18 | let a = Tensor::::new([2.0]); 19 | let b = a.hard_sigmoid()?; 20 | println!("{}", b); // prints: 0.8333333 21 | Ok(()) 22 | } 23 | ``` 24 | ## Backend Support 25 | | Backend | Supported | 26 | |---------|-----------| 27 | | CPU | ✅ | 28 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/hard_swish.md: -------------------------------------------------------------------------------- 1 | # hard_swish 2 | ```rust 3 | hard_swish(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large x \cdot \text{min}(\text{max}(0, \frac{x}{6} + 0.5), 1)$ for all elements. A piece-wise linear approximation of the swish function. 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | 10 | ## Returns: 11 | Tensor with type `C` 12 | 13 | ## Examples: 14 | ```rust 15 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 16 | 17 | fn main() -> Result<(), TensorError> { 18 | let a = Tensor::::new([2.0]); 19 | let b = a.hard_swish()?; 20 | println!("{}", b); // prints: 1.6666666 21 | Ok(()) 22 | } 23 | ``` 24 | ## Backend Support 25 | | Backend | Supported | 26 | |---------|-----------| 27 | | CPU | ✅ | 28 | | Cuda | ✅ | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: Deploy Docs 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | deploy-gh-pages: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout 13 | uses: actions/checkout@v3 14 | 15 | - name: Setup Node.js 16 | uses: actions/setup-node@v3 17 | with: 18 | node-version: '23' 19 | 20 | - name: Clean npm cache 21 | run: npm cache clean --force 22 | 23 | 
- name: Install dependencies 24 | run: npm install 25 | 26 | - name: Build VuePress site 27 | run: npm run docs:build 28 | 29 | - name: Deploy to GitHub Pages 30 | uses: JamesIves/github-pages-deploy-action@v4 31 | with: 32 | folder: docs/.vuepress/dist 33 | branch: gh-pages -------------------------------------------------------------------------------- /docs/user_guide/random/rand.md: -------------------------------------------------------------------------------- 1 | # rand 2 | ```rust 3 | rand( 4 | shape: &[i64] | &Vec | &[i64; _], 5 | low: T, 6 | high: T 7 | ) -> Result, TensorError> 8 | ``` 9 | create a Tensor with data uniformly distributed between `low` and `high`. 10 | ## Parameters: 11 | `shape`: shape of the output 12 | 13 | `low`: the lowest value 14 | 15 | `high`: the highest value 16 | ## Returns: 17 | Tensor with type `T` 18 | ## Examples: 19 | ```rust 20 | use hpt::{error::TensorError, ops::Random, Tensor}; 21 | 22 | fn main() -> Result<(), TensorError> { 23 | let a = Tensor::::rand([10, 10], 0.0, 10.0)?; 24 | println!("{}", a); 25 | Ok(()) 26 | } 27 | ``` 28 | ## Backend Support 29 | | Backend | Supported | 30 | |---------|-----------| 31 | | CPU | ✅ | 32 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/acos_.md: -------------------------------------------------------------------------------- 1 | # acos 2 | ```rust 3 | acos_( 4 | x: &Tensor, 5 | out: &mut Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | Inverse cosine with out 9 | ## Parameters: 10 | `x`: Angle(radians) 11 | `out`: Tensor to write to 12 | ## Returns: 13 | Tensor with type `C` 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([-10.0]); 20 | let b = a.acos_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 
24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/atan_.md: -------------------------------------------------------------------------------- 1 | # atan 2 | ```rust 3 | atan_( 4 | x: &Tensor, 5 | out: &mut Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | Inverse tangent with out 9 | ## Parameters: 10 | `x`: Angle(radians) 11 | `out`: Tensor to write to 12 | ## Returns: 13 | Tensor with type `C` 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.atan_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/cos_.md: -------------------------------------------------------------------------------- 1 | # cos_ 2 | ```rust 3 | cos_( 4 | x: &Tensor, 5 | out: &mut Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | Trigonometric cosine with out 9 | ## Parameters: 10 | `x`: Angle(radians) 11 | `out`: Tensor to write to 12 | ## Returns: 13 | Tensor with type `C` 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.cos_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | 
-------------------------------------------------------------------------------- /docs/user_guide/unary/sin_.md: -------------------------------------------------------------------------------- 1 | # sin 2 | ```rust 3 | sin_( 4 | x: &Tensor, 5 | out: &mut Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | Trigonometric sine with out 9 | ## Parameters: 10 | `x`: Angle(radians) 11 | `out`: Tensor to write to 12 | ## Returns: 13 | Tensor with type `C` 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.sin_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/sinh_.md: -------------------------------------------------------------------------------- 1 | # sinh_ 2 | ```rust 3 | sinh_( 4 | x: &Tensor, 5 | out: &mut Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | Hyperbolic sine with out 9 | ## Parameters: 10 | `x`: Angle(radians) 11 | `out`: Tensor to write to 12 | ## Returns: 13 | Tensor with type `C` 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.sinh_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/tan_.md: 
-------------------------------------------------------------------------------- 1 | # tan_ 2 | ```rust 3 | tan_( 4 | x: &Tensor, 5 | out: &mut Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | Trigonometric tangent with out 9 | ## Parameters: 10 | `x`: Angle(radians) 11 | `out`: Tensor to write to 12 | ## Returns: 13 | Tensor with type `C` 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.tan_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/ln_.md: -------------------------------------------------------------------------------- 1 | # ln_ 2 | ```rust 3 | ln_(x: &Tensor, out: &mut Tensor | Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large \ln(x)$ for all elements with out 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | `out`: Tensor to write to 10 | 11 | ## Returns: 12 | Tensor with type `C` 13 | 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.ln_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/tanh_.md: -------------------------------------------------------------------------------- 1 | # tanh_ 2 | ```rust 3 | tanh_( 4 | x: &Tensor, 
5 | out: &mut Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | Hyperbolic tangent with out 9 | ## Parameters: 10 | `x`: Angle(radians) 11 | `out`: Tensor to write to 12 | ## Returns: 13 | Tensor with type `C` 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.tanh_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/cmp/tensor_eq.md: -------------------------------------------------------------------------------- 1 | # tensor_eq 2 | ```rust 3 | tensor_eq( 4 | x: &Tensor | Tensor, 5 | y: &Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | check if element from x is equal to element from y 9 | 10 | ## Parameters: 11 | `x`: First input tensor 12 | 13 | `y`: Second input tensor 14 | 15 | ## Returns: 16 | Tensor with type `bool` 17 | 18 | ## Examples: 19 | ```rust 20 | use hpt::{ops::TensorCmp, Tensor, error::TensorError}; 21 | 22 | fn main() -> Result<(), TensorError> { 23 | let a = Tensor::::new([2.0, 2.0, 2.0]); 24 | let b = a.tensor_eq(&a)?; 25 | println!("{}", b); // [true true true] 26 | Ok(()) 27 | } 28 | ``` 29 | ## Backend Support 30 | | Backend | Supported | 31 | |---------|-----------| 32 | | CPU | ✅ | 33 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/asinh_.md: -------------------------------------------------------------------------------- 1 | # asinh_ 2 | ```rust 3 | asinh_( 4 | x: &Tensor, 5 | out: &mut Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | Inverse hyperbolic sine with out 9 | ## Parameters: 10 | `x`: 
Angle(radians) 11 | `out`: Tensor to write to 12 | ## Returns: 13 | Tensor with type `C` 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.asinh_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-dyn/src/utils/onnx/parse_args/affine_grid.rs: -------------------------------------------------------------------------------- 1 | use super::parse::{ Parse, ParseArgs }; 2 | 3 | pub(crate) struct AffineGridArgs<'a> { 4 | pub(crate) theta: &'a str, 5 | pub(crate) size: &'a str, 6 | pub(crate) align_corners: bool, 7 | } 8 | 9 | // impl<'a> Parse> for ParseArgs { 10 | // type Output<'b> = AffineGridArgs<'b> where Self: 'b; 11 | // fn parse<'b>(&'b mut self, node: &'b crate::onnx::NodeProto) -> AffineGridArgs<'b> { 12 | // let theta = node.input[0].as_str(); 13 | // let size = node.input[1].as_str(); 14 | // let align_corners = self.parse_int_attribute(node, "align_corners", 0) == 1; 15 | // AffineGridArgs { 16 | // theta, 17 | // size, 18 | // align_corners, 19 | // } 20 | // } 21 | // } 22 | -------------------------------------------------------------------------------- /docs/user_guide/unary/abs_.md: -------------------------------------------------------------------------------- 1 | # abs_ 2 | ```rust 3 | abs_( 4 | x: &Tensor, 5 | out: &mut Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | Calculate absolute of input with out 9 | ## Parameters: 10 | `x`: input Tensor 11 | `out`: Tensor to write to 12 | ## Returns: 13 | Tensor with type `T` 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, 
ops::NormalUaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([-10.0]); 20 | let b = a.abs_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | 26 | ``` 27 | ## Backend Support 28 | | Backend | Supported | 29 | |---------|-----------| 30 | | CPU | ✅ | 31 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/acosh_.md: -------------------------------------------------------------------------------- 1 | # acosh_ 2 | ```rust 3 | acosh_( 4 | x: &Tensor, 5 | out: &mut Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | Inverse hyperbolic cosine with out 9 | ## Parameters: 10 | `x`: Angle(radians) 11 | `out`: Tensor to write to 12 | ## Returns: 13 | Tensor with type `C` 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([-10.0]); 20 | let b = a.acosh_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/atanh_.md: -------------------------------------------------------------------------------- 1 | # atanh_ 2 | ```rust 3 | atanh_( 4 | x: &Tensor, 5 | out: &mut Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | Inverse hyperbolic tangent with out 9 | ## Parameters: 10 | `x`: Angle(radians) 11 | `out`: Tensor to write to 12 | ## Returns: 13 | Tensor with type `C` 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 
20 | let b = a.atanh_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/celu.md: -------------------------------------------------------------------------------- 1 | # celu 2 | ```rust 3 | celu(x: &Tensor, alpha: C) -> Result, TensorError> 4 | ``` 5 | Compute $\large \text{max}(0, x) + \text{min}(0, \alpha \cdot (e^{x/\alpha} - 1))$ for all elements 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | 10 | `alpha`: Parameter controlling the saturation of negative values 11 | 12 | ## Returns: 13 | Tensor with type `C` 14 | 15 | ## Examples: 16 | ```rust 17 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 18 | 19 | fn main() -> Result<(), TensorError> { 20 | let a = Tensor::::new([-1.0]); 21 | let b = a.celu(1.0)?; 22 | println!("{}", b); // prints: -0.6321206 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/sqrt_.md: -------------------------------------------------------------------------------- 1 | # sqrt_ 2 | ```rust 3 | sqrt_(x: &Tensor, out: &mut Tensor | Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large \sqrt{x}$ for all elements with out 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | `out`: Tensor to write to 10 | 11 | ## Returns: 12 | Tensor with type `C` 13 | 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.sqrt_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as 
u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/cmp/tensor_gt.md: -------------------------------------------------------------------------------- 1 | # tensor_gt 2 | ```rust 3 | tensor_gt( 4 | x: &Tensor | Tensor, 5 | y: &Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | check if element from x is greater than the element from y 9 | 10 | ## Parameters: 11 | `x`: First input tensor 12 | 13 | `y`: Second input tensor 14 | 15 | ## Returns: 16 | Tensor with type `bool` 17 | 18 | ## Examples: 19 | ```rust 20 | use hpt::{ops::TensorCmp, Tensor, error::TensorError}; 21 | 22 | fn main() -> Result<(), TensorError> { 23 | let a = Tensor::::new([2.0, 2.0, 2.0]); 24 | let b = a.tensor_gt(&a)?; 25 | println!("{}", b); // [false false false] 26 | Ok(()) 27 | } 28 | ``` 29 | ## Backend Support 30 | | Backend | Supported | 31 | |---------|-----------| 32 | | CPU | ✅ | 33 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/cmp/tensor_lt.md: -------------------------------------------------------------------------------- 1 | # tensor_lt 2 | ```rust 3 | tensor_lt( 4 | x: &Tensor | Tensor, 5 | y: &Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | check if element from x is less than the element from y 9 | 10 | ## Parameters: 11 | `x`: First input tensor 12 | 13 | `y`: Second input tensor 14 | 15 | ## Returns: 16 | Tensor with type `bool` 17 | 18 | ## Examples: 19 | ```rust 20 | use hpt::{ops::TensorCmp, Tensor, error::TensorError}; 21 | 22 | fn main() -> Result<(), TensorError> { 23 | let a = Tensor::::new([2.0, 2.0, 2.0]); 24 | let b = a.tensor_lt(&a)?; 25 | println!("{}", b); // [false false false] 26 | Ok(()) 27 | } 28 | ``` 29 | ## Backend Support 30 | | Backend | Supported | 31 | 
|---------|-----------| 32 | | CPU | ✅ | 33 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/cmp/tensor_neq.md: -------------------------------------------------------------------------------- 1 | # tensor_neq 2 | ```rust 3 | tensor_neq( 4 | x: &Tensor | Tensor, 5 | y: &Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | check if element from x is not equal to element from y 9 | 10 | ## Parameters: 11 | `x`: First input tensor 12 | 13 | `y`: Second input tensor 14 | 15 | ## Returns: 16 | Tensor with type `bool` 17 | 18 | ## Examples: 19 | ```rust 20 | use hpt::{ops::TensorCmp, Tensor, error::TensorError}; 21 | 22 | fn main() -> Result<(), TensorError> { 23 | let a = Tensor::::new([2.0, 2.0, 2.0]); 24 | let b = a.tensor_neq(&a)?; 25 | println!("{}", b); // [false false false] 26 | Ok(()) 27 | } 28 | ``` 29 | ## Backend Support 30 | | Backend | Supported | 31 | |---------|-----------| 32 | | CPU | ✅ | 33 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/cbrt_.md: -------------------------------------------------------------------------------- 1 | # cbrt_ 2 | ```rust 3 | cbrt_(x: &Tensor, out: &mut Tensor | Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large \sqrt[3]{x}$ for all elements with out 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | `out`: Tensor to write to 10 | 11 | ## Returns: 12 | Tensor with type `C` 13 | 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.cbrt_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | 
-------------------------------------------------------------------------------- /docs/user_guide/unary/exp2_.md: -------------------------------------------------------------------------------- 1 | # exp2_ 2 | ```rust 3 | exp2_( 4 | x: &Tensor, 5 | out: &mut Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | Compute $\large 2^x$ for all elements with out 9 | ## Parameters: 10 | `x`: Input values 11 | `out`: Tensor to write to 12 | ## Returns: 13 | Tensor with type `C` 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.exp2_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/exp_.md: -------------------------------------------------------------------------------- 1 | # exp_ 2 | ```rust 3 | exp_( 4 | x: &Tensor, 5 | out: &mut Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | Compute exponential of `x` for all elements with out 9 | ## Parameters: 10 | `x`: Input values 11 | `out`: Tensor to write to 12 | ## Returns: 13 | Tensor with type `T` 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.exp_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- 
/docs/user_guide/unary/log2_.md: -------------------------------------------------------------------------------- 1 | # log2_ 2 | ```rust 3 | log2_(x: &Tensor, out: &mut Tensor | Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large \log_{2}(x)$ for all elements with out 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | `out`: Tensor to write to 10 | 11 | ## Returns: 12 | Tensor with type `C` 13 | 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.log2_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-dataloader/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hpt-dataloader" 3 | version = "0.1.3" 4 | edition = "2021" 5 | license = "MIT OR Apache-2.0" 6 | description = "An internal library for data loading for hpt" 7 | authors = ["JianJian Li "] 8 | repository = "https://github.com/Jianqoq/Hpt" 9 | 10 | [dependencies] 11 | zip = { workspace = true } 12 | serde = { workspace = true } 13 | serde_json = { workspace = true } 14 | hpt-traits = { path = "../hpt-traits", version = "0.1.1" } 15 | hpt-types = { path = "../hpt-types", version = "0.1.1" } 16 | hpt-common = { path = "../hpt-common", version = "0.1.1" } 17 | indicatif = { workspace = true } 18 | flate2 = { workspace = true } 19 | num = { workspace = true } 20 | safetensors = { workspace = true } 21 | bytemuck = { workspace = true } 22 | half = { workspace = true } 23 | -------------------------------------------------------------------------------- /docs/user_guide/cmp/tensor_ge.md: 
-------------------------------------------------------------------------------- 1 | # tensor_ge 2 | ```rust 3 | tensor_ge( 4 | x: &Tensor | Tensor, 5 | y: &Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | check if element from x is greater or equal to the element from y 9 | 10 | ## Parameters: 11 | `x`: First input tensor 12 | 13 | `y`: Second input tensor 14 | 15 | ## Returns: 16 | Tensor with type `bool` 17 | 18 | ## Examples: 19 | ```rust 20 | use hpt::{ops::TensorCmp, Tensor, error::TensorError}; 21 | 22 | fn main() -> Result<(), TensorError> { 23 | let a = Tensor::::new([2.0, 2.0, 2.0]); 24 | let b = a.tensor_ge(&a)?; 25 | println!("{}", b); // [true true true] 26 | Ok(()) 27 | } 28 | ``` 29 | ## Backend Support 30 | | Backend | Supported | 31 | |---------|-----------| 32 | | CPU | ✅ | 33 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/cmp/tensor_le.md: -------------------------------------------------------------------------------- 1 | # tensor_le 2 | ```rust 3 | tensor_le( 4 | x: &Tensor | Tensor, 5 | y: &Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | check if element from x is less or equal to the element from y 9 | 10 | ## Parameters: 11 | `x`: First input tensor 12 | 13 | `y`: Second input tensor 14 | 15 | ## Returns: 16 | Tensor with type `bool` 17 | 18 | ## Examples: 19 | ```rust 20 | use hpt::{ops::TensorCmp, Tensor, error::TensorError}; 21 | 22 | fn main() -> Result<(), TensorError> { 23 | let a = Tensor::::new([2.0, 2.0, 2.0]); 24 | let b = a.tensor_le(&a)?; 25 | println!("{}", b); // [true true true] 26 | Ok(()) 27 | } 28 | ``` 29 | ## Backend Support 30 | | Backend | Supported | 31 | |---------|-----------| 32 | | CPU | ✅ | 33 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/log10_.md: -------------------------------------------------------------------------------- 1 | # log10_ 2 | ```rust 3 | 
log10_(x: &Tensor, out: &mut Tensor | Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large \log_{10}(x)$ for all elements with out 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | `out`: Tensor to write to 10 | 11 | ## Returns: 12 | Tensor with type `C` 13 | 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.log10_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/recip_.md: -------------------------------------------------------------------------------- 1 | # recip_ 2 | ```rust 3 | recip_(x: &Tensor, out: &mut Tensor | Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large \frac{1}{x}$ for all elements with out 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | `out`: Tensor to write to 10 | 11 | ## Returns: 12 | Tensor with type `C` 13 | 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.recip_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-cudakernels/src/reduce/reducel2.cu: -------------------------------------------------------------------------------- 1 | #include "declare_macros.cuh" 2 | #include "../utils/type_cast.cuh" 3 | #include 
"../utils/check_type.cuh" 4 | #include "reduce_classes.cuh" 5 | 6 | DECLARE_KERNEL(half, bool, reducel2, ReduceL2) 7 | DECLARE_KERNEL(half, u8, reducel2, ReduceL2) 8 | DECLARE_KERNEL(half, u16, reducel2, ReduceL2) 9 | DECLARE_KERNEL(float, u32, reducel2, ReduceL2) 10 | DECLARE_KERNEL(double, u64, reducel2, ReduceL2) 11 | DECLARE_KERNEL(half, i8, reducel2, ReduceL2) 12 | DECLARE_KERNEL(half, i16, reducel2, ReduceL2) 13 | DECLARE_KERNEL(float, i32, reducel2, ReduceL2) 14 | DECLARE_KERNEL(double, i64, reducel2, ReduceL2) 15 | DECLARE_KERNEL(float, f32, reducel2, ReduceL2) 16 | DECLARE_KERNEL(double, f64, reducel2, ReduceL2) 17 | DECLARE_KERNEL(half, f16, reducel2, ReduceL2) 18 | DECLARE_KERNEL(bf16, bf16, reducel2, ReduceL2) 19 | -------------------------------------------------------------------------------- /hpt-cudakernels/src/reduce/reducel3.cu: -------------------------------------------------------------------------------- 1 | #include "declare_macros.cuh" 2 | #include "../utils/type_cast.cuh" 3 | #include "../utils/check_type.cuh" 4 | #include "reduce_classes.cuh" 5 | 6 | DECLARE_KERNEL(half, bool, reducel3, ReduceL3) 7 | DECLARE_KERNEL(half, u8, reducel3, ReduceL3) 8 | DECLARE_KERNEL(half, u16, reducel3, ReduceL3) 9 | DECLARE_KERNEL(float, u32, reducel3, ReduceL3) 10 | DECLARE_KERNEL(double, u64, reducel3, ReduceL3) 11 | DECLARE_KERNEL(half, i8, reducel3, ReduceL3) 12 | DECLARE_KERNEL(half, i16, reducel3, ReduceL3) 13 | DECLARE_KERNEL(float, i32, reducel3, ReduceL3) 14 | DECLARE_KERNEL(double, i64, reducel3, ReduceL3) 15 | DECLARE_KERNEL(float, f32, reducel3, ReduceL3) 16 | DECLARE_KERNEL(double, f64, reducel3, ReduceL3) 17 | DECLARE_KERNEL(half, f16, reducel3, ReduceL3) 18 | DECLARE_KERNEL(bf16, bf16, reducel3, ReduceL3) 19 | -------------------------------------------------------------------------------- /docs/user_guide/associated_methods/cuda/to_cpu.md: -------------------------------------------------------------------------------- 1 | # to_cpu 2 | 
```rust 3 | fn to_cpu(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Transfers a tensor from CUDA GPU memory to CPU memory, creating a new tensor in host memory. 6 | 7 | currently only `DEVICE_ID` = 0 is supported 8 | 9 | ## Parameters: 10 | `DEVICE_ID`: A compile-time constant specifying the target CPU device ID (default is 0) 11 | 12 | ## Returns: 13 | A new `Tensor` located on the specified CPU device, or a TensorError if the transfer fails. 14 | 15 | ## Examples: 16 | ```rust 17 | use hpt::{backend::Cuda, error::TensorError, ops::TensorCreator, Tensor}; 18 | 19 | fn main() -> Result<(), TensorError> { 20 | let a = Tensor::::empty([1, 2, 3])?; 21 | println!("{}", a.to_cpu::<0>()?); 22 | Ok(()) 23 | } 24 | ``` -------------------------------------------------------------------------------- /docs/user_guide/unary/mish_.md: -------------------------------------------------------------------------------- 1 | # mish_ 2 | ```rust 3 | mish_(x: &Tensor, out: &mut Tensor | Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large x * \tanh(\ln(1 + e^x))$ for all elements with out 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | `out`: Tensor to write to 10 | 11 | ## Returns: 12 | Tensor with type `C` 13 | 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.mish_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/softplus_.md: -------------------------------------------------------------------------------- 1 | # softplus_ 2 | ```rust 3 | softplus_(x: &Tensor, out: &mut Tensor | Tensor) -> Result, TensorError> 4 | ``` 
5 | Compute $\ln(1 + e^x)$ for all elements with out 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | `out`: Tensor to write to 10 | 11 | ## Returns: 12 | Tensor with type `C` 13 | 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.softplus_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-cudakernels/src/reduce/logsumexp.cu: -------------------------------------------------------------------------------- 1 | #include "declare_macros.cuh" 2 | #include "../utils/type_cast.cuh" 3 | #include "../utils/check_type.cuh" 4 | #include "reduce_classes.cuh" 5 | 6 | DECLARE_KERNEL(bool, bool, logsumexp, LogSumExp) 7 | DECLARE_KERNEL(u8, u8, logsumexp, LogSumExp) 8 | DECLARE_KERNEL(u16, u16, logsumexp, LogSumExp) 9 | DECLARE_KERNEL(u32, u32, logsumexp, LogSumExp) 10 | DECLARE_KERNEL(u64, u64, logsumexp, LogSumExp) 11 | DECLARE_KERNEL(i8, i8, logsumexp, LogSumExp) 12 | DECLARE_KERNEL(i16, i16, logsumexp, LogSumExp) 13 | DECLARE_KERNEL(i32, i32, logsumexp, LogSumExp) 14 | DECLARE_KERNEL(i64, i64, logsumexp, LogSumExp) 15 | DECLARE_KERNEL(f32, f32, logsumexp, LogSumExp) 16 | DECLARE_KERNEL(f64, f64, logsumexp, LogSumExp) 17 | DECLARE_KERNEL(f16, f16, logsumexp, LogSumExp) 18 | DECLARE_KERNEL(bf16, bf16, logsumexp, LogSumExp) 19 | -------------------------------------------------------------------------------- /docs/dev_guide/new_type.md: -------------------------------------------------------------------------------- 1 | # How to add new type support 2 | 3 | ### Things to know 4 | 1. Hpt has vector and scalar implementation. 
To support new type, you will need to go to `hpt-types` crate. 5 | 6 | ### How 7 | 1. Go to `hpt-types/src/dtype.rs`. 8 | 2. Implement `TypeCommon` for new type, implement `CudaType` for new type if the new type is supported in cuda 9 | 3. Add Dtype variant for the `Dtype` enum. 10 | 4. Go to `hpt-types/src/scalars/`, based on the existing implementation for primitive type, implement the traits for new type. 11 | 5. Go to `hpt-types/src/vectors/`, based on the existing implementation for primitive type, implement the traits for new type. 12 | 6. Go to `hpt-types/src/convertion.rs`, add type conversion for the new type, add `to_new_type` method in traits. 13 | 7. Go to `hpt-types/src/promotion/`, add type promotion for new type. -------------------------------------------------------------------------------- /docs/user_guide/unary/sigmoid_.md: -------------------------------------------------------------------------------- 1 | # sigmoid_ 2 | ```rust 3 | sigmoid_(x: &Tensor, out: &mut Tensor | Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large \frac{1}{1 + e^{-x}}$ for all elements with out 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | `out`: Tensor to write to 10 | 11 | ## Returns: 12 | Tensor with type `T` 13 | 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.sigmoid_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/softsign_.md: -------------------------------------------------------------------------------- 1 | # softsign_ 2 | ```rust 3 | softsign_(x: &Tensor, out: &mut Tensor | Tensor) -> 
Result, TensorError> 4 | ``` 5 | Compute $\large \frac{x}{1 + |x|}$ for all elements with out 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | `out`: Tensor to write to 10 | 11 | ## Returns: 12 | Tensor with type `C` 13 | 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.softsign_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-cudakernels/src/utils/extra_vecs.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "cuda_fp16.h" 3 | #include "cuda_bf16.h" 4 | 5 | struct half3 6 | { 7 | __half x; 8 | __half y; 9 | __half z; 10 | }; 11 | 12 | struct __align__(8) half4 13 | { 14 | __half x; 15 | __half y; 16 | __half z; 17 | __half w; 18 | }; 19 | 20 | struct bf163 21 | { 22 | __nv_bfloat16 x; 23 | __nv_bfloat16 y; 24 | __nv_bfloat16 z; 25 | }; 26 | 27 | struct __align__(8) bf164 28 | { 29 | __nv_bfloat16 x; 30 | __nv_bfloat16 y; 31 | __nv_bfloat16 z; 32 | __nv_bfloat16 w; 33 | }; 34 | 35 | struct bool2 36 | { 37 | bool x; 38 | bool y; 39 | }; 40 | 41 | struct bool3 42 | { 43 | bool x; 44 | bool y; 45 | bool z; 46 | }; 47 | 48 | struct bool4 49 | { 50 | bool x; 51 | bool y; 52 | bool z; 53 | bool w; 54 | }; 55 | -------------------------------------------------------------------------------- /hpt-tests/src/hpt/cpu/assert_utils.rs: -------------------------------------------------------------------------------- 1 | pub(crate) fn assert_f64(a: f64, b: f64, diff: f64) -> anyhow::Result<()> { 2 | let rel_diff = ((a - b) / (a.abs() + b.abs() + f64::EPSILON)).abs(); 3 | if 
rel_diff > diff { 4 | return Err(anyhow::anyhow!( 5 | "{} != {} (relative_diff: {}).\n", 6 | a, 7 | b, 8 | rel_diff 9 | )); 10 | } 11 | Ok(()) 12 | } 13 | 14 | #[allow(unused)] 15 | #[must_use] 16 | pub(crate) fn assert_f32(a: f32, b: f32, diff: f32) -> anyhow::Result<()> { 17 | let rel_diff = ((a - b) / (a.abs() + b.abs() + f32::EPSILON)).abs(); 18 | if rel_diff > diff { 19 | return Err(anyhow::anyhow!( 20 | "{} != {} (relative_diff: {}).\n", 21 | a, 22 | b, 23 | rel_diff 24 | )); 25 | } 26 | Ok(()) 27 | } 28 | -------------------------------------------------------------------------------- /docs/user_guide/unary/selu.md: -------------------------------------------------------------------------------- 1 | # selu 2 | ```rust 3 | selu( 4 | x: &Tensor 5 | ) -> Result, TensorError> 6 | ``` 7 | Compute $\large \lambda * (\alpha * (e^x - 1))$ for $x < 0$, $\large \lambda * x$ for $x \geq 0$ for all elements 8 | 9 | where `alpha` is `1.6732632423543772848170429916717`, `gamma` is `1.0507009873554804934193349852946` 10 | 11 | ## Parameters: 12 | `x`: Input values 13 | 14 | ## Returns: 15 | Tensor with type `C` 16 | 17 | ## Examples: 18 | ```rust 19 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 20 | 21 | fn main() -> Result<(), TensorError> { 22 | let a = Tensor::::new([2.0]); 23 | let b = a.selu()?; 24 | println!("{}", b); 25 | Ok(()) 26 | } 27 | ``` 28 | ## Backend Support 29 | | Backend | Supported | 30 | |---------|-----------| 31 | | CPU | ✅ | 32 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/iterator/collect.md: -------------------------------------------------------------------------------- 1 | # collect 2 | ```rust 3 | fn collect(strided_map) -> U 4 | ``` 5 | 6 | Convert map struct into `U` type, `U` must be `Tensor` type. 
7 | 8 | ## Parameters: 9 | 10 | strided_map: Map struct, in hpt, there are couple of map struct 11 | 12 | ## Returns: 13 | 14 | Tensor 15 | 16 | ## Examples: 17 | ```rust 18 | use hpt::iter::TensorIterator; 19 | use hpt::Tensor; 20 | fn main() -> anyhow::Result<()> { 21 | let x = Tensor::::new(&[1f64, 2., 3.]); 22 | 23 | let res = x 24 | .par_iter() 25 | .strided_map(|(res, x)| { 26 | *res = x.sin(); 27 | }) 28 | .collect::>(); 29 | 30 | println!("{}", res); 31 | Ok(()) 32 | } 33 | ``` 34 | ## Backend Support 35 | | Backend | Supported | 36 | |---------|-----------| 37 | | CPU | ✅ | 38 | | Cuda | ❌ | -------------------------------------------------------------------------------- /hpt-types/src/vectors/arch_simd/_128bit/sse/boolx16.rs: -------------------------------------------------------------------------------- 1 | use crate::traits::VecTrait; 2 | use crate::vectors::arch_simd::_128bit::common::boolx16::boolx16; 3 | 4 | impl VecTrait for boolx16 { 5 | const SIZE: usize = 16; 6 | type Base = bool; 7 | #[inline(always)] 8 | fn mul_add(self, _: Self, _: Self) -> Self { 9 | todo!() 10 | } 11 | #[inline(always)] 12 | fn sum(&self) -> bool { 13 | self.0.iter().map(|&x| x as u8).sum::() > 0 14 | } 15 | #[inline(always)] 16 | fn splat(val: bool) -> boolx16 { 17 | boolx16([val; 16]) 18 | } 19 | #[inline(always)] 20 | unsafe fn from_ptr(ptr: *const bool) -> Self { 21 | let mut result = [false; 16]; 22 | for i in 0..16 { 23 | result[i] = unsafe { *ptr.add(i) }; 24 | } 25 | boolx16(result) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/common/mask.rs: -------------------------------------------------------------------------------- 1 | pub(crate) const U16MASK: [u16; 17] = [ 2 | 0b0000_0000_0000_0000, 3 | 0b0000_0000_0000_0001, 4 | 0b0000_0000_0000_0011, 5 | 0b0000_0000_0000_0111, 6 | 0b0000_0000_0000_1111, 7 | 0b0000_0000_0001_1111, 8 | 0b0000_0000_0011_1111, 9 | 0b0000_0000_0111_1111, 10 | 
0b0000_0000_1111_1111, 11 | 0b0000_0001_1111_1111, 12 | 0b0000_0011_1111_1111, 13 | 0b0000_0111_1111_1111, 14 | 0b0000_1111_1111_1111, 15 | 0b0001_1111_1111_1111, 16 | 0b0011_1111_1111_1111, 17 | 0b0111_1111_1111_1111, 18 | 0b1111_1111_1111_1111, 19 | ]; 20 | 21 | pub(crate) const U8MASK: [u8; 9] = [ 22 | 0b0000_0000, 23 | 0b0000_0001, 24 | 0b0000_0011, 25 | 0b0000_0111, 26 | 0b0000_1111, 27 | 0b0001_1111, 28 | 0b0011_1111, 29 | 0b0111_1111, 30 | 0b1111_1111, 31 | ]; -------------------------------------------------------------------------------- /docs/user_guide/associated_methods/cpu/forget_copy.md: -------------------------------------------------------------------------------- 1 | # forget_copy 2 | ```rust 3 | unsafe fn forget_copy(x: &Tensor) -> Result<(*mut u8, std::alloc::Layout), TensorError> 4 | ``` 5 | clone the current Tensor data and return raw data. 6 | 7 | ## Note 8 | Similar as `forget`, but `forget_copy` doesn't need to check reference count 9 | 10 | ## Parameters: 11 | `x`: The input tensor 12 | 13 | ## Returns: 14 | `*mut u8`: A pointer pointing to the cloned tensor's data 15 | `std::alloc::Layout`: Can be used to check the byte size 16 | 17 | ## Examples: 18 | ```rust 19 | use hpt::{error::TensorError, ops::TensorCreator, Tensor}; 20 | 21 | fn main() -> Result<(), TensorError> { 22 | let a = Tensor::::empty([1, 2, 3])?; 23 | let (ptr, layout) = unsafe { a.forget_copy() }?; 24 | unsafe { std::alloc::dealloc(ptr, layout) }; 25 | Ok(()) 26 | } 27 | ``` -------------------------------------------------------------------------------- /docs/user_guide/associated_methods/cuda/forget_copy.md: -------------------------------------------------------------------------------- 1 | # forget_copy 2 | ```rust 3 | unsafe fn forget_copy(x: &Tensor) -> Result<(cudarc::driver::CudaSlice, std::alloc::Layout), TensorError> 4 | ``` 5 | clone the current Tensor data and return raw data. 
6 | 7 | ## Note 8 | Similar as `forget`, but `forget_copy` doesn't need to check reference count 9 | 10 | ## Parameters: 11 | `x`: The input tensor 12 | 13 | ## Returns: 14 | `cudarc::driver::CudaSlice`: A slice pointing to the cloned tensor's data 15 | `std::alloc::Layout`: Can be used to check the byte size 16 | 17 | ## Examples: 18 | ```rust 19 | use hpt::{backend::Cuda, error::TensorError, ops::TensorCreator, Tensor}; 20 | 21 | fn main() -> Result<(), TensorError> { 22 | let a = Tensor::::empty([1, 2, 3])?; 23 | let ret = unsafe { a.forget_copy() }?; 24 | Ok(()) 25 | } 26 | ``` -------------------------------------------------------------------------------- /hpt-bench/scan_benchmarks_result.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | with open('bench_results.txt', 'r', encoding="UTF-16LE") as f: 4 | content = f.read() 5 | 6 | pattern = r'matmul f16 Benchmarks/(hpt|torch|hpt\(builtin\))/(\d+)\s+time:\s+\[\d+\.\d+ ms (\d+\.\d+) ms \d+\.\d+ ms\]' 7 | 8 | results = { 9 | 'hpt': [], 10 | 'torch': [], 11 | 'hpt(builtin)': [] 12 | } 13 | 14 | sizes = [] 15 | last_size = None 16 | 17 | matches = re.findall(pattern, content) 18 | for test_type, size, median in matches: 19 | if test_type == 'hpt' and (last_size is None or int(size) != last_size): 20 | sizes.append(int(size)) 21 | last_size = int(size) 22 | results[test_type].append(float(median)) 23 | 24 | print("input size:", sizes) 25 | print("HPT:", results['hpt']) 26 | print("PyTorch:", results['torch']) 27 | print("HPT(builtin):", results['hpt(builtin)']) 28 | -------------------------------------------------------------------------------- /docs/user_guide/unary/erf_.md: -------------------------------------------------------------------------------- 1 | # erf_ 2 | ```rust 3 | erf_(x: &Tensor, out: &mut Tensor | Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large \text{erf}(x) = \frac{2}{\sqrt{\pi}} \int_0^x e^{-t^2} dt$ for all elements with out 
6 | 7 | ## Parameters: 8 | `x`: Input values 9 | `out`: Tensor to write to 10 | 11 | ## Returns: 12 | Tensor with type `C` 13 | 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.erf_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-dyn/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub(crate) mod ops; 2 | pub(crate) mod tensor; 3 | pub(crate) mod utils; 4 | use std::sync::atomic::AtomicUsize; 5 | 6 | pub use hpt_types::dtype::DType; 7 | pub use tensor::Tensor; 8 | pub use utils::device::Device; 9 | 10 | static DISPLAY_PRECISION: AtomicUsize = AtomicUsize::new(4); 11 | static DISPLAY_LR_ELEMENTS: AtomicUsize = AtomicUsize::new(4); 12 | static ALIGN: usize = 128; 13 | 14 | pub fn current_num_threads() -> usize { 15 | num_cpus::get_physical() 16 | } 17 | 18 | pub fn set_num_threads(num_threads: usize) { 19 | rayon::ThreadPoolBuilder::new().num_threads(num_threads).build_global().unwrap(); 20 | } 21 | 22 | pub fn physical_cores() -> usize { 23 | num_cpus::get_physical() 24 | } 25 | 26 | pub mod onnx { 27 | pub use crate::utils::onnx::load_model::load_onnx; 28 | pub(crate) use crate::utils::onnx::proto::*; 29 | } 30 | -------------------------------------------------------------------------------- /docs/user_guide/iterator/par_iter_mut.md: -------------------------------------------------------------------------------- 1 | # par_iter_mut 2 | ```rust 3 | fn par_iter_mut(x: &mut Tensor) -> ParStridedMut 4 | ``` 5 | 6 | similar as `par_iter`, input pass to the closure will be mutable. 
You can use it to do inplace computation. 7 | 8 | ## Parameters: 9 | 10 | x: Tensor to iterate 11 | 12 | ## Returns: 13 | 14 | `ParStridedMut` 15 | 16 | ## Examples: 17 | ```rust 18 | use hpt::Tensor; 19 | use hpt::iter::TensorIterator; 20 | use hpt::iter::rayon::iter::ParallelIterator; 21 | 22 | fn main() -> anyhow::Result<()> { 23 | let mut x = Tensor::::new(&[1f64, 2., 3.]); 24 | 25 | x.par_iter_mut().for_each(|x|{ 26 | *x = x.sin(); 27 | }); 28 | 29 | println!("{}", x); 30 | Ok(()) 31 | } 32 | ``` 33 | ## Backend Support 34 | | Backend | Supported | 35 | |---------|-----------| 36 | | CPU | ✅ | 37 | | Cuda | ❌ | -------------------------------------------------------------------------------- /hpt-tests/src/macro_tests/stmt_item/item.rs: -------------------------------------------------------------------------------- 1 | #[macro_use] 2 | extern crate hpt_tests; 3 | use hpt_codegen::*; 4 | pub fn main() { 5 | fuse_proc_macro!( 6 | fn struct_item(a: f32, b: f32) -> anyhow::Result{ 7 | struct A { 8 | a: f32, 9 | b: f32, 10 | } 11 | Ok(A { a, b }) 12 | } 13 | ); 14 | fuse_proc_macro!( 15 | fn macro_item(a: f32, b: f32) -> anyhow::Result{ 16 | macro_rules! 
a { 17 | ($a:expr) => { 18 | $a 19 | }; 20 | } 21 | Ok(a!(a)) 22 | } 23 | ); 24 | fuse_proc_macro!( 25 | fn trait_item(a: f32, b: f32) -> anyhow::Result{ 26 | trait A { 27 | fn a(&self) -> f32; 28 | } 29 | Ok(a) 30 | } 31 | ); 32 | } 33 | -------------------------------------------------------------------------------- /docs/user_guide/unary/exp10_.md: -------------------------------------------------------------------------------- 1 | # exp10_ 2 | ```rust 3 | exp10_( 4 | x: &Tensor, 5 | out: &mut Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | Compute 10 raised to the power of `x` for all elements with output tensor 9 | 10 | ## Parameters: 11 | `x`: Input values (exponents) 12 | `out`: Tensor to write to 13 | 14 | ## Returns: 15 | Tensor with type `C` 16 | 17 | ## Examples: 18 | ```rust 19 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 20 | 21 | fn main() -> Result<(), TensorError> { 22 | let a = Tensor::::new([10.0]); 23 | let b = a.exp10_(&mut a.clone())?; 24 | println!("{}", b); 25 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 26 | Ok(()) 27 | } 28 | ``` 29 | ## Backend Support 30 | | Backend | Supported | 31 | |---------|-----------| 32 | | CPU | ✅ | 33 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/iterator/par_iter.md: -------------------------------------------------------------------------------- 1 | # par_iter 2 | ```rust 3 | fn par_iter(x: &Tensor) -> ParStrided 4 | ``` 5 | 6 | Convert Tensor to `ParStrided` iterator, `ParStrided` will split the tasks and execute the method the user provides 7 | 8 | ## Parameters: 9 | 10 | x: Tensor to iterate 11 | 12 | ## Returns: 13 | 14 | `ParStrided` 15 | 16 | ## Examples: 17 | ```rust 18 | use hpt::iter::TensorIterator; 19 | use hpt::Tensor; 20 | 21 | fn main() -> anyhow::Result<()> { 22 | let x = Tensor::::new(&[1f64, 2., 3.]); 23 | 24 | let res = x 25 | .par_iter() 26 | .strided_map(|(res, x)| { 27 
| *res = x.sin(); 28 | }) 29 | .collect::>(); 30 | 31 | println!("{}", res); 32 | Ok(()) 33 | } 34 | ``` 35 | ## Backend Support 36 | | Backend | Supported | 37 | |---------|-----------| 38 | | CPU | ✅ | 39 | | Cuda | ❌ | -------------------------------------------------------------------------------- /hpt-dyn/src/utils/onnx/parse_args/parse.rs: -------------------------------------------------------------------------------- 1 | use crate::onnx::NodeProto; 2 | 3 | pub(crate) struct ParseArgs { 4 | arg_idx: usize, 5 | } 6 | 7 | impl ParseArgs { 8 | pub(crate) fn new() -> Self { 9 | Self { arg_idx: 0 } 10 | } 11 | 12 | pub(crate) fn parse_int_attribute( 13 | &mut self, 14 | node: &NodeProto, 15 | target: &str, 16 | default: i64 17 | ) -> i64 { 18 | if let Some(attr) = node.attribute.get(self.arg_idx) { 19 | if attr.name() == target { 20 | let res = attr.i.unwrap_or(default); 21 | self.arg_idx += 1; 22 | res 23 | } else { 24 | default 25 | } 26 | } else { 27 | default 28 | } 29 | } 30 | } 31 | 32 | pub(crate) trait Parse<'p> { 33 | fn parse<'a: 'p>(node: &'a NodeProto) -> Self; 34 | } 35 | -------------------------------------------------------------------------------- /docs/user_guide/binary/add_.md: -------------------------------------------------------------------------------- 1 | # add_ 2 | ```rust 3 | add_( 4 | x: Tensor, 5 | y: &Tensor | Tensor | scalar, 6 | out: &mut Tensor | Tensor 7 | ) -> Result, TensorError> 8 | ``` 9 | Compute $\large x + y$ for all elements with out 10 | 11 | ## Parameters: 12 | `x`: First input tensor 13 | 14 | `y`: Second input tensor 15 | 16 | `out`: Tensor to write to 17 | 18 | ## Returns: 19 | Tensor with type `C` 20 | 21 | ## Examples: 22 | ```rust 23 | use hpt::{ops::NormalBinOps, Tensor, error::TensorError}; 24 | 25 | fn main() -> Result<(), TensorError> { 26 | let a = Tensor::::new([2.0]); 27 | let b = Tensor::::new([3.0]); 28 | let c = a.add_(&b, &mut a.clone())?; 29 | println!("{}", c); 30 | Ok(()) 31 | } 32 | ``` 33 | ## 
Backend Support 34 | | Backend | Supported | 35 | |---------|-----------| 36 | | CPU | ✅ | 37 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/binary/div_.md: -------------------------------------------------------------------------------- 1 | # div_ 2 | ```rust 3 | div_( 4 | x: Tensor, 5 | y: &Tensor | Tensor | scalar, 6 | out: &mut Tensor | Tensor 7 | ) -> Result, TensorError> 8 | ``` 9 | Compute $\large x / y$ for all elements with out 10 | 11 | ## Parameters: 12 | `x`: First input tensor 13 | 14 | `y`: Second input tensor 15 | 16 | `out`: Tensor to write to 17 | 18 | ## Returns: 19 | Tensor with type `C` 20 | 21 | ## Examples: 22 | ```rust 23 | use hpt::{ops::FloatBinOps, Tensor, error::TensorError}; 24 | 25 | fn main() -> Result<(), TensorError> { 26 | let a = Tensor::::new([2.0]); 27 | let b = Tensor::::new([3.0]); 28 | let c = a.div_(&b, &mut a.clone())?; 29 | println!("{}", c); 30 | Ok(()) 31 | } 32 | ``` 33 | ## Backend Support 34 | | Backend | Supported | 35 | |---------|-----------| 36 | | CPU | ✅ | 37 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/binary/mul_.md: -------------------------------------------------------------------------------- 1 | # mul_ 2 | ```rust 3 | mul_( 4 | x: Tensor, 5 | y: &Tensor | Tensor | scalar, 6 | out: &mut Tensor | Tensor 7 | ) -> Result, TensorError> 8 | ``` 9 | Compute $\large x * y$ for all elements with out 10 | 11 | ## Parameters: 12 | `x`: First input tensor 13 | 14 | `y`: Second input tensor 15 | 16 | `out`: Tensor to write to 17 | 18 | ## Returns: 19 | Tensor with type `C` 20 | 21 | ## Examples: 22 | ```rust 23 | use hpt::{error::TensorError, ops::NormalBinOps, Tensor}; 24 | 25 | fn main() -> Result<(), TensorError> { 26 | let a = Tensor::::new([2.0]); 27 | let b = Tensor::::new([3.0]); 28 | let c = a.mul_(&b, &mut a.clone())?; 29 | println!("{}", c); 30 | Ok(()) 31 | } 32 | ``` 33 | ## 
Backend Support 34 | | Backend | Supported | 35 | |---------|-----------| 36 | | CPU | ✅ | 37 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/binary/rem_.md: -------------------------------------------------------------------------------- 1 | # rem_ 2 | ```rust 3 | rem_( 4 | x: Tensor, 5 | y: &Tensor | Tensor | scalar, 6 | out: &mut Tensor | Tensor 7 | ) -> Result, TensorError> 8 | ``` 9 | Compute $\large x mod y$ for all elements with out 10 | 11 | ## Parameters: 12 | `x`: First input tensor 13 | 14 | `y`: Second input tensor 15 | 16 | `out`: Tensor to write to 17 | 18 | ## Returns: 19 | Tensor with type `C` 20 | 21 | ## Examples: 22 | ```rust 23 | use hpt::{ops::NormalBinOps, Tensor, error::TensorError}; 24 | 25 | fn main() -> Result<(), TensorError> { 26 | let a = Tensor::::new([2.0]); 27 | let b = Tensor::::new([3.0]); 28 | let c = a.rem_(&b, &mut a.clone())?; 29 | println!("{}", c); 30 | Ok(()) 31 | } 32 | ``` 33 | ## Backend Support 34 | | Backend | Supported | 35 | |---------|-----------| 36 | | CPU | ✅ | 37 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/binary/sub_.md: -------------------------------------------------------------------------------- 1 | # sub_ 2 | ```rust 3 | sub_( 4 | x: Tensor, 5 | y: &Tensor | Tensor | scalar, 6 | out: &mut Tensor | Tensor 7 | ) -> Result, TensorError> 8 | ``` 9 | Compute $\large x - y$ for all elements with out 10 | 11 | ## Parameters: 12 | `x`: First input tensor 13 | 14 | `y`: Second input tensor 15 | 16 | `out`: Tensor to write to 17 | 18 | ## Returns: 19 | Tensor with type `C` 20 | 21 | ## Examples: 22 | ```rust 23 | use hpt::{ops::NormalBinOps, Tensor, error::TensorError}; 24 | 25 | fn main() -> Result<(), TensorError> { 26 | let a = Tensor::::new([2.0]); 27 | let b = Tensor::::new([3.0]); 28 | let c = a.sub_(&b, &mut a.clone())?; 29 | println!("{}", c); 30 | Ok(()) 31 | } 32 | ``` 33 | 
## Backend Support 34 | | Backend | Supported | 35 | |---------|-----------| 36 | | CPU | ✅ | 37 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-common/src/error/mod.rs: -------------------------------------------------------------------------------- 1 | //! Error handling for tensor operations 2 | //! 3 | //! This module contains various error types used throughout the tensor library, 4 | //! organized by their domains (shape, device, memory, kernel, etc.) 5 | 6 | /// Autograd-related errors (gradient computation, etc.) 7 | pub mod autograd; 8 | /// Base error types and common functionality 9 | pub mod base; 10 | /// Common errors 11 | pub mod common; 12 | /// Device-related errors (GPU, CPU, etc.) 13 | pub mod device; 14 | /// Kernel-related errors (CUDA, etc.) 15 | pub mod kernel; 16 | /// Memory allocation and management errors 17 | pub mod memory; 18 | /// Parameter-related errors (function arguments, etc.) 19 | pub mod param; 20 | /// Random distribution-related errors 21 | pub mod random; 22 | /// Shape-related errors (dimension mismatch, broadcasting, etc.) 
23 | pub mod shape; 24 | /// Onnx-related errors 25 | pub mod onnx; 26 | 27 | -------------------------------------------------------------------------------- /hpt/src/backends/cpu/tensor_external/tensordot.rs: -------------------------------------------------------------------------------- 1 | use hpt_common::error::base::TensorError; 2 | use hpt_traits::ops::binary::TensorDot; 3 | use hpt_types::type_promote::NormalOut; 4 | 5 | use crate::{tensor_base::_Tensor, Tensor}; 6 | use hpt_traits::tensor::CommonBounds; 7 | impl TensorDot> for Tensor 8 | where 9 | _Tensor: TensorDot<_Tensor, Output = _Tensor<>::Output>>, 10 | A: CommonBounds + NormalOut, 11 | B: CommonBounds, 12 | >::Output: CommonBounds, 13 | { 14 | type Output = Tensor<>::Output>; 15 | 16 | fn tensordot( 17 | &self, 18 | rhs: &Tensor, 19 | axes: ([i64; N], [i64; N]), 20 | ) -> std::result::Result { 21 | let res = self.inner.tensordot(rhs.inner.as_ref(), axes)?; 22 | Ok(res.into()) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /hpt-common/src/error/autograd.rs: -------------------------------------------------------------------------------- 1 | use std::panic::Location; 2 | 3 | use thiserror::Error; 4 | 5 | /// Errors related to autograd 6 | #[derive(Debug, Error)] 7 | pub enum AutogradError { 8 | /// Error that occurs when inplace computation is not allowed in autograd 9 | #[error("Inplace computation {op} is not allowed in autograd, at {location}")] 10 | InplaceCompError { 11 | /// Operation name 12 | op: &'static str, 13 | /// Location where the error occurred 14 | location: &'static Location<'static>, 15 | }, 16 | /// Error that occurs when the operation is not supported in autograd 17 | #[error("Operation {op} is not supported in autograd, at {location}")] 18 | UnsupportOpError { 19 | /// Operation name 20 | op: &'static str, 21 | /// Location where the error occurred 22 | location: &'static Location<'static>, 23 | }, 24 | } 25 | 
-------------------------------------------------------------------------------- /hpt-dyn/src/utils/mod.rs: -------------------------------------------------------------------------------- 1 | pub(crate) mod allocator; 2 | pub(crate) mod backend; 3 | pub(crate) mod device; 4 | pub(crate) mod display; 5 | pub(crate) mod index_cal; 6 | pub(crate) mod prefetch; 7 | pub(crate) mod onnx { 8 | pub(crate) mod load_model; 9 | pub(crate) mod proto; 10 | pub(crate) mod execute; 11 | pub(crate) mod map_dtype; 12 | pub(crate) mod init; 13 | pub(crate) mod operators; 14 | pub(crate) mod fwd; 15 | pub(crate) mod layout_sense; 16 | pub(crate) mod build_graph; 17 | pub(crate) mod plot; 18 | pub(crate) mod run_fwd; 19 | pub(crate) mod run_init; 20 | pub(crate) mod optimize { 21 | pub(crate) mod constant_fold; 22 | pub(crate) mod fuse; 23 | } 24 | pub(crate) mod parse_args { 25 | pub(crate) mod parse; 26 | pub(crate) mod affine_grid; 27 | pub(crate) mod squeeze; 28 | } 29 | } 30 | pub(crate) mod threadpool; -------------------------------------------------------------------------------- /docs/user_guide/unary/gelu_.md: -------------------------------------------------------------------------------- 1 | # gelu_ 2 | ```rust 3 | gelu_(x: &Tensor, out: &mut Tensor | Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large x \cdot \Phi(x)$ where $\Phi(x)$ is the cumulative distribution function of the standard normal distribution for all elements with out 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | `out`: Tensor to write to 10 | 11 | ## Returns: 12 | Tensor with type `C` 13 | 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.gelu_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | 
|---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-dataloader/src/lib.rs: -------------------------------------------------------------------------------- 1 | mod compression_trait; 2 | pub mod data_loader; 3 | pub use compression_trait::CompressionAlgo; 4 | pub use compression_trait::DataLoaderTrait; 5 | pub use compression_trait::Meta; 6 | pub use compression_trait::{DataLoader, TensorLoader, TensorSaver}; 7 | pub use data_loader::Endian; 8 | pub use flate2::write::{DeflateEncoder, GzEncoder, ZlibEncoder}; 9 | pub use from_safetensors::from_safetensors::FromSafeTensors; 10 | pub use struct_save::gen_header; 11 | pub use struct_save::load::{Load, MetaLoad}; 12 | pub use struct_save::save::save; 13 | pub use struct_save::save::Save; 14 | pub use utils::CPUTensorCreator; 15 | mod struct_save { 16 | pub mod gen_header; 17 | pub mod load; 18 | pub mod save; 19 | } 20 | 21 | pub mod from_safetensors { 22 | pub mod from_safetensors; 23 | } 24 | 25 | pub mod load; 26 | pub mod save; 27 | pub mod utils; 28 | 29 | pub(crate) const CHUNK_BUFF: usize = 1024 * 1024; 30 | -------------------------------------------------------------------------------- /hpt-tests/src/macro_tests/control_flows/for_loop.expanded.rs: -------------------------------------------------------------------------------- 1 | #[macro_use] 2 | extern crate hpt_tests; 3 | use hpt_codegen::*; 4 | pub fn main() { 5 | fn case1(a: f32, b: f32) -> anyhow::Result { 6 | let __for_out_0 = for i in 0..1000 { 7 | a += 10; 8 | }; 9 | Ok(a) 10 | } 11 | fn case1(a: f32, b: f32) -> anyhow::Result { 12 | let __for_out_0 = for _ in (0..1000).iter() { 13 | a += 10; 14 | }; 15 | Ok(a) 16 | } 17 | fn case1(a: f32, b: f32) -> anyhow::Result { 18 | let __for_out_0 = for _ in b.iter().enumerate() { 19 | a += 10; 20 | }; 21 | Ok(a) 22 | } 23 | fn case1(a: f32, b: f32) -> anyhow::Result { 24 | let __for_out_0 = for _ in 
b.iter().enumerate() { 25 | a += 10; 26 | continue; 27 | break; 28 | }; 29 | Ok(a) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /hpt/src/backends/cpu/tensor_internal/cumulative.rs: -------------------------------------------------------------------------------- 1 | use crate::backends::cpu::utils::unary::unary::cumulate; 2 | use crate::tensor_base::_Tensor; 3 | use hpt_allocator::traits::Allocator; 4 | use hpt_allocator::traits::AllocatorOutputRetrive; 5 | use hpt_allocator::Cpu; 6 | use hpt_common::error::base::TensorError; 7 | use hpt_traits::ops::cumulative::CumulativeOps; 8 | use hpt_traits::tensor::CommonBounds; 9 | 10 | impl CumulativeOps for _Tensor 11 | where 12 | A2: Allocator, 13 | A2::Output: AllocatorOutputRetrive, 14 | { 15 | fn cumsum>>(&self, axis: A) -> std::result::Result { 16 | cumulate(self, axis, T::ZERO, |a, b| a._add(b)) 17 | } 18 | 19 | fn cumprod>>(&self, axis: A) -> std::result::Result { 20 | cumulate(self, axis, T::ONE, |a, b| a._mul(b)) 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /docs/user_guide/unary/elu_.md: -------------------------------------------------------------------------------- 1 | # elu_ 2 | ```rust 3 | elu_(x: &Tensor, alpha: C, out: &mut Tensor | Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large x$ for $x > 0$, $\large \alpha(e^x - 1)$ for $x \leq 0$ for all elements with out 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | `alpha`: Parameter controlling the saturation of negative values 10 | `out`: Tensor to write to 11 | 12 | ## Returns: 13 | Tensor with type `C` 14 | 15 | ## Examples: 16 | ```rust 17 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 18 | 19 | fn main() -> Result<(), TensorError> { 20 | let a = Tensor::::new([-1.0]); 21 | let b = a.elu_(1.0, &mut a.clone())?; 22 | println!("{}", b); 23 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 24 | Ok(()) 25 | 
} 26 | ``` 27 | ## Backend Support 28 | | Backend | Supported | 29 | |---------|-----------| 30 | | CPU | ✅ | 31 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/hard_swish_.md: -------------------------------------------------------------------------------- 1 | # hard_swish_ 2 | ```rust 3 | hard_swish_(x: &Tensor, out: &mut Tensor | Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large x \cdot \text{min}(\text{max}(0, \frac{x}{6} + 0.5), 1)$ for all elements with out. A piece-wise linear approximation of the swish function. 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | `out`: Tensor to write to 10 | 11 | ## Returns: 12 | Tensor with type `C` 13 | 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.hard_swish_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-tests/src/hpt_types/test_display.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused)] 2 | 3 | use half; 4 | use half::bf16; 5 | use half::f16; 6 | use hpt::types::TypeCommon; 7 | use num_complex::Complex32 as c32; 8 | use num_complex::Complex64 as c64; 9 | 10 | macro_rules! test_display { 11 | ($type:ty) => { 12 | paste::paste! 
{ 13 | #[test] 14 | fn []() { 15 | assert_eq!(format!("{}", <$type as TypeCommon>::STR), stringify!($type)); 16 | } 17 | } 18 | }; 19 | } 20 | 21 | test_display!(bool); 22 | test_display!(f32); 23 | test_display!(f64); 24 | test_display!(i8); 25 | test_display!(i16); 26 | test_display!(i32); 27 | test_display!(i64); 28 | test_display!(u8); 29 | test_display!(u16); 30 | test_display!(u32); 31 | test_display!(u64); 32 | test_display!(isize); 33 | test_display!(usize); 34 | test_display!(f16); 35 | test_display!(bf16); 36 | test_display!(c32); 37 | test_display!(c64); 38 | -------------------------------------------------------------------------------- /docs/user_guide/unary/hard_sigmoid_.md: -------------------------------------------------------------------------------- 1 | # hard_sigmoid_ 2 | ```rust 3 | hard_sigmoid_(x: &Tensor, out: &mut Tensor | Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large \text{max}(0, \text{min}(1, \frac{x}{6} + 0.5))$ for all elements with out. A piece-wise linear approximation of the sigmoid function. 
6 | 7 | ## Parameters: 8 | `x`: Input values 9 | `out`: Tensor to write to 10 | 11 | ## Returns: 12 | Tensor with type `C` 13 | 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.hard_sigmoid_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/celu_.md: -------------------------------------------------------------------------------- 1 | # celu_ 2 | ```rust 3 | celu_(x: &Tensor, alpha: C, out: &mut Tensor | Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large \text{max}(0, x) + \text{min}(0, \alpha \cdot (e^{x/\alpha} - 1))$ for all elements with out 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | `alpha`: Parameter controlling the saturation of negative values 10 | `out`: Tensor to write to 11 | 12 | ## Returns: 13 | Tensor with type `C` 14 | 15 | ## Examples: 16 | ```rust 17 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 18 | 19 | fn main() -> Result<(), TensorError> { 20 | let a = Tensor::::new([-1.0]); 21 | let b = a.celu_(1.0, &mut a.clone())?; 22 | println!("{}", b); 23 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 24 | Ok(()) 25 | } 26 | ``` 27 | ## Backend Support 28 | | Backend | Supported | 29 | |---------|-----------| 30 | | CPU | ✅ | 31 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-matmul/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hpt-matmul" 3 | version = "0.1.1" 4 | edition = "2024" 5 | 6 | [dependencies] 7 | half = { 
workspace = true } 8 | num-complex = { workspace = true } 9 | seq-macro = { workspace = true } 10 | num-traits = { workspace = true } 11 | dyn-stack = { workspace = true } 12 | gemm-common = { workspace = true } 13 | spindle = { workspace = true } 14 | duplicate = { workspace = true } 15 | num-integer = { workspace = true } 16 | rayon = {workspace = true} 17 | num_cpus = { workspace = true } 18 | raw-cpuid = { workspace = true } 19 | matconv_simd = { path = "../matconv_simd" } 20 | 21 | [target.'cfg(target_os = "macos")'.dependencies] 22 | libc = { workspace = true } 23 | 24 | [features] 25 | default = ["f32", "f16"] 26 | bound_check = [] 27 | bool = [] 28 | f32 = [] 29 | f16 = [] 30 | bf16 = [] 31 | f64 = [] 32 | i8 = [] 33 | u8 = [] 34 | i16 = [] 35 | u16 = [] 36 | i32 = [] 37 | u32 = [] 38 | i64 = [] 39 | u64 = [] 40 | cplx32 = [] 41 | cplx64 = [] 42 | -------------------------------------------------------------------------------- /hpt-tests/src/hpt_common/pointer.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused_imports)] 2 | 3 | use hpt_common::{layout::layout::Layout, shape::shape::Shape, utils::pointer::Pointer}; 4 | 5 | #[test] 6 | fn test_index() { 7 | let mut a = [10, 11, 12, 13]; 8 | let mut ptr = Pointer::new(&mut a as *mut i32, 4); 9 | assert_eq!(ptr[0usize], 10); 10 | ptr += 1i64; 11 | assert_eq!(ptr[0usize], 11); 12 | ptr += 1isize; 13 | assert_eq!(ptr[0usize], 12); 14 | ptr += 1usize; 15 | assert_eq!(ptr[0usize], 13); 16 | ptr -= 1i64; 17 | assert_eq!(ptr[0usize], 12); 18 | ptr -= 1isize; 19 | assert_eq!(ptr[0usize], 11); 20 | ptr -= 1usize; 21 | assert_eq!(ptr[0usize], 10); 22 | 23 | ptr += 1i64; 24 | assert_eq!(*ptr, 11); 25 | *ptr = 20; 26 | assert_eq!(*ptr, 20); 27 | 28 | let string = format!("{}", ptr); 29 | assert_eq!( 30 | string, 31 | format!("Pointer( ptr: {}, val: {} )", ptr.ptr as usize, ptr[0usize]) 32 | ); 33 | } 34 | 
-------------------------------------------------------------------------------- /hpt-cudakernels/src/reduce/reduce_helper.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | template 7 | class ReduceOp 8 | { 9 | public: 10 | __device__ __forceinline__ static R combine(T a, R b) { return R(); }; 11 | __device__ __forceinline__ static T identity() { return T(); }; 12 | __device__ __forceinline__ static R warp_reduce(R a) { return R(); } 13 | __device__ __forceinline__ static R pre_op(T a) { return a; } 14 | __device__ __forceinline__ static R post_op(R a, size_t reduce_size) { return a; } 15 | }; 16 | 17 | __device__ __forceinline__ bool is_last_block(int32_t *finished, size_t size) 18 | { 19 | __shared__ bool is_last; 20 | 21 | __syncthreads(); 22 | if (threadIdx.x == 0 && threadIdx.y == 0) 23 | { 24 | int32_t tmp = atomicAdd(finished, 1); 25 | is_last = tmp == (size - 1); 26 | } 27 | __syncthreads(); 28 | return is_last; 29 | } 30 | -------------------------------------------------------------------------------- /hpt-examples/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hpt-examples" 3 | version = "0.1.3" 4 | edition = "2021" 5 | license = "MIT OR Apache-2.0" 6 | 7 | [dependencies] 8 | serde = { workspace = true } 9 | rayon = { workspace = true } 10 | hpt = { path = "../hpt", features = ["track_caller"] } 11 | hpt-dyn = { path = "../hpt-dyn", features = ["f32", "i64"] } 12 | anyhow = "1.0.40" 13 | mimalloc = "0.1.43" 14 | # candle-core = { version = "0.8.2", features = ["mkl"] } 15 | # candle-nn = "0.8.2" 16 | serde_json = "1.0" 17 | safetensors = "0.5.0" 18 | 19 | [profile.release] 20 | opt-level = 3 21 | incremental = true 22 | debug = true 23 | lto = "fat" 24 | codegen-units = 1 25 | 26 | [profile.dev] 27 | opt-level = 0 28 | incremental = false 29 | debug = true 30 | # lto = "fat" 31 | # 
codegen-units = 1 32 | 33 | [profile.test] 34 | opt-level = 0 35 | incremental = false 36 | debug = true 37 | # lto = "fat" 38 | # codegen-units = 1 39 | 40 | [features] 41 | # default = ["cuda"] 42 | cuda = ["hpt/cuda"] 43 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_128bit/sse/boolx16.rs: -------------------------------------------------------------------------------- 1 | use crate::traits::VecTrait; 2 | use crate::vectors::arch_simd::_128bit::common::boolx16::boolx16; 3 | 4 | impl VecTrait for boolx16 { 5 | const SIZE: usize = 16; 6 | type Base = bool; 7 | #[inline(always)] 8 | fn copy_from_slice(&mut self, slice: &[bool]) { 9 | self.0.copy_from_slice(slice); 10 | } 11 | #[inline(always)] 12 | fn mul_add(self, _: Self, _: Self) -> Self { 13 | todo!() 14 | } 15 | #[inline(always)] 16 | fn sum(&self) -> bool { 17 | self.0.iter().map(|&x| x as u8).sum::() > 0 18 | } 19 | #[inline(always)] 20 | fn splat(val: bool) -> boolx16 { 21 | boolx16([val; 16]) 22 | } 23 | #[inline(always)] 24 | unsafe fn from_ptr(ptr: *const bool) -> Self { 25 | let mut result = [false; 16]; 26 | for i in 0..16 { 27 | result[i] = unsafe { *ptr.add(i) }; 28 | } 29 | boolx16(result) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /hpt-tests/src/hpt/cuda/from_raw.rs: -------------------------------------------------------------------------------- 1 | #[test] 2 | fn test_from_raw() { 3 | use hpt::backend::Cuda; 4 | use hpt::ops::ShapeManipulate; 5 | use hpt::slice; 6 | use hpt::{ops::Random, Tensor}; 7 | let m = 10; 8 | let n = 10; 9 | let a = Tensor::::randn(&[m, n]).expect("failed to create tensor"); 10 | 11 | let raw = unsafe { 12 | a.device() 13 | .alloc(m * n * 4) 14 | .expect("failed to alloc raw pointer") 15 | }; 16 | 17 | let c = unsafe { Tensor::::from_raw(raw, &[m, n]) } 18 | .expect("failed to create tensor from raw pointer"); 19 | 20 | let _ = a + c.clone(); 21 | 22 | let 
_sliced_c = slice!(c[0, ..]).expect("failed to slice tensor"); 23 | 24 | let reshaped = _sliced_c 25 | .reshape(&[10, 10]) 26 | .expect("failed to reshape tensor"); 27 | 28 | drop(reshaped); 29 | drop(_sliced_c); 30 | let (_, _) = unsafe { c.forget().expect("failed to forget tensor") }; 31 | } 32 | -------------------------------------------------------------------------------- /hpt-tests/src/hpt/cpu/from_raw.rs: -------------------------------------------------------------------------------- 1 | #[test] 2 | fn test_from_raw() { 3 | use hpt::ops::ShapeManipulate; 4 | use hpt::slice; 5 | use hpt::{ops::Random, Tensor}; 6 | let m = 10; 7 | let n = 10; 8 | let a = Tensor::::randn(&[m, n]).expect("failed to create tensor"); 9 | let layout = std::alloc::Layout::from_size_align(m * n * 4, 64).unwrap(); 10 | 11 | let raw = unsafe { std::alloc::alloc(layout) }; 12 | let c = unsafe { Tensor::::from_raw(raw as *mut f32, &[m, n]) } 13 | .expect("failed to create tensor from raw pointer"); 14 | 15 | let _ = a + c.clone(); 16 | 17 | let _sliced_c = slice!(c[0, ..]).expect("failed to slice tensor"); 18 | 19 | let reshaped = _sliced_c 20 | .reshape(&[10, 10]) 21 | .expect("failed to reshape tensor"); 22 | 23 | drop(reshaped); 24 | drop(_sliced_c); 25 | let (raw, _) = unsafe { c.forget().expect("failed to forget tensor") }; 26 | 27 | unsafe { std::alloc::dealloc(raw, layout) }; 28 | } 29 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_128bit/common/bf16x8.rs: -------------------------------------------------------------------------------- 1 | 2 | /// a vector of 8 bf16 values 3 | #[allow(non_camel_case_types)] 4 | #[derive(Default, Clone, Copy, PartialEq, Debug)] 5 | #[repr(C, align(16))] 6 | pub struct bf16x8(pub(crate) [half::bf16; 8]); 7 | 8 | impl std::ops::Add for bf16x8 { 9 | type Output = Self; 10 | 11 | #[inline(always)] 12 | fn add(self, rhs: Self) -> Self::Output { 13 | let [x0, x1] = self.to_2_f32vec(); 14 | let 
[y0, y1] = rhs.to_2_f32vec(); 15 | let low_add = x0 + y0; 16 | let high_add = x1 + y1; 17 | bf16x8::from_2_f32vec([low_add, high_add]) 18 | } 19 | } 20 | impl std::ops::Mul for bf16x8 { 21 | type Output = Self; 22 | 23 | #[inline(always)] 24 | fn mul(self, rhs: Self) -> Self::Output { 25 | let [x0, x1] = self.to_2_f32vec(); 26 | let [y0, y1] = rhs.to_2_f32vec(); 27 | let low_mul = x0 * y0; 28 | let high_mul = x1 * y1; 29 | bf16x8::from_2_f32vec([low_mul, high_mul]) 30 | } 31 | } -------------------------------------------------------------------------------- /hpt-tests/src/macro_tests/control_flows/while_loop.expanded.rs: -------------------------------------------------------------------------------- 1 | #[macro_use] 2 | extern crate hpt_tests; 3 | use hpt_codegen::*; 4 | pub fn main() { 5 | fn case1(a: f32, b: f32) -> anyhow::Result { 6 | let __while_out_0 = while a < 1000 && a > 0 { 7 | a += 10; 8 | }; 9 | Ok(a) 10 | } 11 | fn case1(a: f32, b: f32) -> anyhow::Result { 12 | let __while_out_0 = while let Some(i) = (0..1000).iter().next() { 13 | a += 10; 14 | }; 15 | Ok(a) 16 | } 17 | fn case1(a: f32, b: f32) -> anyhow::Result { 18 | let __while_out_0 = while let syn::Expr::Path(path) = b.iter().next() { 19 | a += 10; 20 | }; 21 | Ok(a) 22 | } 23 | fn case1(a: f32, b: f32) -> anyhow::Result { 24 | let __while_out_0 = while let syn::Expr::Path(path) = b.iter().next() { 25 | a += 10; 26 | continue; 27 | break; 28 | }; 29 | Ok(a) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /docs/user_guide/shape_manipulate/flipud.md: -------------------------------------------------------------------------------- 1 | # flipud 2 | ```rust 3 | flipud( 4 | x: &Tensor 5 | ) -> Result, TensorError> 6 | ``` 7 | Reverses the order of elements along axis 0 (rows) of the tensor. The tensor must be at least 1-dimensional. 
8 | 9 | ## Parameters: 10 | `x`: Input tensor with ndim >= 1 11 | 12 | ## Returns: 13 | A new tensor with elements reversed along axis 0 (up/down flip). 14 | 15 | ## Examples: 16 | ```rust 17 | use hpt::{ops::ShapeManipulate, Tensor, error::TensorError}; 18 | fn main() -> Result<(), TensorError> { 19 | // Create a 2D tensor 20 | let a = Tensor::::new(&[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).reshape(&[2, 3])?; 21 | // [[1, 2, 3], 22 | // [4, 5, 6]] 23 | 24 | // Flip up/down 25 | let b = a.flipud()?; 26 | // [[4, 5, 6], 27 | // [1, 2, 3]] 28 | println!("{}", b); 29 | 30 | Ok(()) 31 | } 32 | ``` 33 | ## Backend Support 34 | | Backend | Supported | 35 | |---------|-----------| 36 | | CPU | ✅ | 37 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/creation/identity.md: -------------------------------------------------------------------------------- 1 | # identity 2 | ```rust 3 | identity( 4 | n: usize 5 | ) -> Result, TensorError> 6 | ``` 7 | Creates a 2-D identity tensor (1's on the main diagonal and 0's elsewhere). 8 | 9 | ## Parameters: 10 | `n`: Number of rows and columns 11 | 12 | ## Returns: 13 | A square 2-D tensor of shape [n, n] with ones on the main diagonal. 
14 | 15 | ## Examples: 16 | ```rust 17 | use hpt::{error::TensorError, ops::TensorCreator, Tensor}; 18 | fn main() -> Result<(), TensorError> { 19 | // Create a 3x3 identity matrix 20 | let a = Tensor::::identity(3)?; 21 | println!("{}", a); 22 | // [[1, 0, 0], 23 | // [0, 1, 0], 24 | // [0, 0, 1]] 25 | 26 | // Create a 2x2 identity matrix 27 | let b = Tensor::::identity(2)?; 28 | println!("{}", b); 29 | // [[1, 0], 30 | // [0, 1]] 31 | 32 | Ok(()) 33 | } 34 | ``` 35 | ## Backend Support 36 | | Backend | Supported | 37 | |---------|-----------| 38 | | CPU | ✅ | 39 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/random/beta.md: -------------------------------------------------------------------------------- 1 | # beta 2 | ```rust 3 | beta( 4 | alpha: T, 5 | beta: T, 6 | shape: &[i64] | &Vec | &[i64; _] 7 | ) -> Result, TensorError> 8 | ``` 9 | Create a Tensor with values drawn from a beta distribution with parameters `alpha` and `beta`. The beta distribution is a continuous probability distribution defined on the interval [0, 1]. 10 | ## Parameters: 11 | `alpha`: Shape parameter alpha (α) of the beta distribution. Must be positive. 12 | 13 | `beta`: Shape parameter beta (β) of the beta distribution. Must be positive. 
14 | 15 | `shape`: shape of the output 16 | ## Returns: 17 | Tensor with type `T` 18 | ## Examples: 19 | ```rust 20 | use hpt::{error::TensorError, ops::Random, Tensor}; 21 | 22 | fn main() -> Result<(), TensorError> { 23 | let a = Tensor::::beta(2.0, 5.0, &[10, 10])?; 24 | println!("{}", a); 25 | Ok(()) 26 | } 27 | ``` 28 | ## Backend Support 29 | | Backend | Supported | 30 | |---------|-----------| 31 | | CPU | ✅ | 32 | | Cuda | ❌ | -------------------------------------------------------------------------------- /docs/user_guide/creation/arange.md: -------------------------------------------------------------------------------- 1 | # arange 2 | ```rust 3 | arange( 4 | start: T, 5 | end: T 6 | ) -> Result, TensorError> 7 | ``` 8 | Creates a 1-D tensor with evenly spaced values within a given interval `[start, end)`. 9 | 10 | ## Parameters: 11 | `start`: Start of interval (inclusive) 12 | 13 | `end`: End of interval (exclusive) 14 | 15 | ## Returns: 16 | A 1-D tensor with values from `start` to `end-1`. 17 | 18 | ## Examples: 19 | ```rust 20 | use hpt::{Tensor, error::TensorError, ops::TensorCreator}; 21 | fn main() -> Result<(), TensorError> { 22 | // Create sequence from 0 to 5 23 | let a = Tensor::::arange(0, 5)?; 24 | println!("{}", a); 25 | // [0, 1, 2, 3, 4] 26 | 27 | // Using floating point numbers 28 | let b = Tensor::::arange(1.5, 5.5)?; 29 | println!("{}", b); 30 | // [1.5, 2.5, 3.5, 4.5] 31 | 32 | Ok(()) 33 | } 34 | ``` 35 | ## Backend Support 36 | | Backend | Supported | 37 | |---------|-----------| 38 | | CPU | ✅ | 39 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/utils/set_display_precision.md: -------------------------------------------------------------------------------- 1 | # set_display_precision 2 | 3 | ```rust 4 | set_display_precision(precision: usize) 5 | ``` 6 | 7 | Controls how many decimal places are shown when displaying tensor values. 
8 | ## Parameters: 9 | - `precision`: `usize` 10 | - Number of decimal places to display 11 | - Must be a non-negative integer 12 | - Default is 4 13 | 14 | ## Examples 15 | ```rust 16 | use hpt::{backend::Cpu, error::TensorError, utils::set_display_precision, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | // Set precision to 3 decimal places 20 | set_display_precision(3); 21 | 22 | let tensor = Tensor::::new([1.23456789]); 23 | println!("{}", tensor); // Output: 1.235 24 | 25 | // Set precision to 6 decimal places 26 | set_display_precision(6); 27 | println!("{}", tensor); // Output: 1.234568 28 | Ok(()) 29 | } 30 | ``` 31 | 32 | ## Backend Support 33 | | Backend | Supported | 34 | |---------|-----------| 35 | | CPU | ✅ | 36 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/binary/pow_.md: -------------------------------------------------------------------------------- 1 | # pow_ 2 | ```rust 3 | pow_( 4 | x: Tensor, 5 | y: &Tensor | Tensor | scalar, 6 | out: &mut Tensor | Tensor 7 | ) -> Result, TensorError> 8 | ``` 9 | Compute $\large x ^ y$ for all elements with out 10 | 11 | ## Parameters: 12 | `x`: First input tensor 13 | 14 | `y`: Second input tensor or scalar exponent 15 | 16 | `out`: Tensor to write to 17 | 18 | ## Examples: 19 | ```rust 20 | use hpt::{error::TensorError, ops::FloatBinOps, Tensor}; 21 | 22 | fn main() -> Result<(), TensorError> { 23 | // Create input tensors 24 | let a = Tensor::::new(&[2.0, 3.0, 4.0]); 25 | let b = Tensor::::new(&[2.0, 3.0, 2.0]); 26 | 27 | // Compute power and store result in a 28 | let c = a.pow_(&b, &mut a.clone())?; 29 | println!("{}", c); 30 | // Output: 31 | // [4.0, 27.0, 16.0] 32 | 33 | Ok(()) 34 | } 35 | ``` 36 | 37 | ## Backend Support 38 | | Backend | Supported | 39 | |---------|-----------| 40 | | CPU | ✅ | 41 | | Cuda | ✅ | -------------------------------------------------------------------------------- 
/hpt-common/src/utils/simd_ref.rs: -------------------------------------------------------------------------------- 1 | use std::marker::PhantomData; 2 | 3 | use crate::utils::pointer::Pointer; 4 | 5 | /// A struct contains a mutable simd vector, this struct force the user to use write unaligned and read unaligned when they use simd iterator 6 | #[derive(Debug)] 7 | pub struct MutVec<'a, T> { 8 | ptr: Pointer, 9 | _phantom: PhantomData<&'a mut T>, 10 | } 11 | 12 | impl<'a, T> MutVec<'a, T> { 13 | /// create a new MutVec 14 | #[inline(always)] 15 | pub fn new(ptr: Pointer) -> Self { 16 | Self { 17 | ptr, 18 | _phantom: PhantomData, 19 | } 20 | } 21 | 22 | /// perform write unaligned operation 23 | #[inline(always)] 24 | pub fn write_unaligned(&self, value: T) { 25 | unsafe { 26 | self.ptr.ptr.write_unaligned(value); 27 | } 28 | } 29 | 30 | #[inline(always)] 31 | /// perform read unaligned operation 32 | pub fn read_unaligned(&self) -> T { 33 | unsafe { self.ptr.ptr.read_unaligned() } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /hpt-cudakernels/src/unary/ln.cu: -------------------------------------------------------------------------------- 1 | #include "unary_template.cuh" 2 | #include "../utils/type_alias.cuh" 3 | #include "../utils/promotion/promotes.cuh" 4 | #include "../utils/check_type.cuh" 5 | #include "unary_classes.cuh" 6 | 7 | DEFINE_UNARY_KERNEL(ln_f16, f16, FloatOutUnaryPromote, Ln); 8 | DEFINE_UNARY_KERNEL(ln_bf16, bf16, FloatOutUnaryPromote, Ln); 9 | DEFINE_UNARY_KERNEL(ln_f32, f32, FloatOutUnaryPromote, Ln); 10 | DEFINE_UNARY_KERNEL(ln_f64, f64, FloatOutUnaryPromote, Ln); 11 | DEFINE_UNARY_KERNEL(ln_bool, bool, FloatOutUnaryPromote, Ln); 12 | DEFINE_UNARY_KERNEL(ln_i8, i8, FloatOutUnaryPromote, Ln); 13 | DEFINE_UNARY_KERNEL(ln_i16, i16, FloatOutUnaryPromote, Ln); 14 | DEFINE_UNARY_KERNEL(ln_i32, i32, FloatOutUnaryPromote, Ln); 15 | DEFINE_UNARY_KERNEL(ln_i64, i64, FloatOutUnaryPromote, Ln); 16 | 
DEFINE_UNARY_KERNEL(ln_u8, u8, FloatOutUnaryPromote, Ln); 17 | DEFINE_UNARY_KERNEL(ln_u16, u16, FloatOutUnaryPromote, Ln); 18 | DEFINE_UNARY_KERNEL(ln_u32, u32, FloatOutUnaryPromote, Ln); 19 | DEFINE_UNARY_KERNEL(ln_u64, u64, FloatOutUnaryPromote, Ln); 20 | -------------------------------------------------------------------------------- /hpt-tests/src/hpt_common/shape.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused_imports)] 2 | use std::sync::Arc; 3 | 4 | use hpt_common::shape::shape::Shape; 5 | 6 | #[test] 7 | fn test_new() { 8 | let shape = Shape::new(&[1, 2, 3]); 9 | assert_eq!(shape.inner(), &[1, 2, 3]); 10 | } 11 | 12 | #[test] 13 | fn test_to_strides() { 14 | let shape = Shape::new(&[1, 2, 3]); 15 | let strides = shape.to_strides(); 16 | assert_eq!(strides.inner(), &[6, 3, 1]); 17 | } 18 | 19 | #[test] 20 | fn test_to_string() { 21 | let shape = Shape::new(&[1, 2, 3]); 22 | let string = format!("{:?}", shape); 23 | assert_eq!(string, "shape([1, 2, 3])"); 24 | } 25 | 26 | #[test] 27 | fn test_default() { 28 | let shape = Shape::default(); 29 | let arr: [i64; 0] = []; 30 | assert_eq!(shape.inner(), &arr); 31 | } 32 | 33 | #[test] 34 | fn test_from() { 35 | let shape = Shape::from(&Arc::new(vec![1, 2, 3])); 36 | assert_eq!(shape.inner(), &[1, 2, 3]); 37 | let shape = Shape::from(Arc::new([1, 2, 3])); 38 | assert_eq!(shape.inner(), &[1, 2, 3]); 39 | } 40 | -------------------------------------------------------------------------------- /hpt-allocator/src/utils/deallocate.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashSet; 2 | 3 | use dashmap::DashMap; 4 | use lru::LruCache; 5 | 6 | use crate::{ 7 | ptr::SafePtr, 8 | storage::{CommonStorage, Storage}, 9 | }; 10 | 11 | pub(crate) fn deallocate_helper( 12 | cache: &mut LruCache>, 13 | allocated: &mut HashSet, 14 | storage: &DashMap, 15 | layout: &std::alloc::Layout, 16 | ptr: *mut u8, 17 | 
should_drop: bool, 18 | device_id: usize, 19 | ) { 20 | if let Some(mut storage) = storage.get_mut(&device_id) { 21 | if storage.decrement_ref(SafePtr { ptr }) && should_drop { 22 | allocated.remove(&SafePtr { ptr }); 23 | if let Some(ptrs) = cache.get_mut(layout) { 24 | ptrs.push(SafePtr { ptr }); 25 | } else { 26 | cache.put(layout.clone(), vec![SafePtr { ptr }]); 27 | } 28 | } 29 | } else { 30 | panic!("device {} not found in storage", device_id); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /docs/user_guide/iterator/strided_map.md: -------------------------------------------------------------------------------- 1 | # strided_map 2 | ```rust 3 | fn strided_map(self, f: F) -> ParStridedMap 4 | ``` 5 | 6 | Applies a function to each element of the iterator with strided access pattern. Useful for parallel data transformation operations. 7 | 8 | ## Parameters 9 | 10 | - `self`: The parallel iterator 11 | - Type: `ParStrided` or `ParStridedZip` 12 | - `f`: The mapping function 13 | - Type: `FnMut((&mut T, &T))` 14 | - Requirements: Must be thread-safe (`Send + Sync`) 15 | 16 | ## Returns 17 | 18 | A `ParStridedMap` iterator 19 | 20 | ## Examples: 21 | ```rust 22 | use hpt::Tensor; 23 | use hpt::iter::TensorIterator; 24 | 25 | fn main() -> anyhow::Result<()> { 26 | let x = Tensor::::new(&[1f64, 2., 3.]); 27 | 28 | let res = x.par_iter().strided_map(|(res, x)|{ 29 | *res = x.sin(); 30 | }).collect::>(); 31 | 32 | println!("{}", res); 33 | Ok(()) 34 | } 35 | ``` 36 | ## Backend Support 37 | | Backend | Supported | 38 | |---------|-----------| 39 | | CPU | ✅ | 40 | | Cuda | ❌ | -------------------------------------------------------------------------------- /hpt-cudakernels/src/unary/exp.cu: -------------------------------------------------------------------------------- 1 | #include "unary_template.cuh" 2 | #include "../utils/type_alias.cuh" 3 | #include "../utils/promotion/promotes.cuh" 4 | #include 
"../utils/check_type.cuh" 5 | #include "unary_classes.cuh" 6 | 7 | DEFINE_UNARY_KERNEL(exp_f16, f16, FloatOutUnaryPromote, Exp); 8 | DEFINE_UNARY_KERNEL(exp_bf16, bf16, FloatOutUnaryPromote, Exp); 9 | DEFINE_UNARY_KERNEL(exp_f32, f32, FloatOutUnaryPromote, Exp); 10 | DEFINE_UNARY_KERNEL(exp_f64, f64, FloatOutUnaryPromote, Exp); 11 | DEFINE_UNARY_KERNEL(exp_bool, bool, FloatOutUnaryPromote, Exp); 12 | DEFINE_UNARY_KERNEL(exp_i8, i8, FloatOutUnaryPromote, Exp); 13 | DEFINE_UNARY_KERNEL(exp_i16, i16, FloatOutUnaryPromote, Exp); 14 | DEFINE_UNARY_KERNEL(exp_i32, i32, FloatOutUnaryPromote, Exp); 15 | DEFINE_UNARY_KERNEL(exp_i64, i64, FloatOutUnaryPromote, Exp); 16 | DEFINE_UNARY_KERNEL(exp_u8, u8, FloatOutUnaryPromote, Exp); 17 | DEFINE_UNARY_KERNEL(exp_u16, u16, FloatOutUnaryPromote, Exp); 18 | DEFINE_UNARY_KERNEL(exp_u32, u32, FloatOutUnaryPromote, Exp); 19 | DEFINE_UNARY_KERNEL(exp_u64, u64, FloatOutUnaryPromote, Exp); 20 | -------------------------------------------------------------------------------- /hpt-cudakernels/src/unary/sin.cu: -------------------------------------------------------------------------------- 1 | #include "unary_template.cuh" 2 | #include "../utils/type_alias.cuh" 3 | #include "../utils/promotion/promotes.cuh" 4 | #include "../utils/check_type.cuh" 5 | #include "unary_classes.cuh" 6 | 7 | DEFINE_UNARY_KERNEL(sin_f16, f16, FloatOutUnaryPromote, Sin); 8 | DEFINE_UNARY_KERNEL(sin_bf16, bf16, FloatOutUnaryPromote, Sin); 9 | DEFINE_UNARY_KERNEL(sin_f32, f32, FloatOutUnaryPromote, Sin); 10 | DEFINE_UNARY_KERNEL(sin_f64, f64, FloatOutUnaryPromote, Sin); 11 | DEFINE_UNARY_KERNEL(sin_bool, bool, FloatOutUnaryPromote, Sin); 12 | DEFINE_UNARY_KERNEL(sin_i8, i8, FloatOutUnaryPromote, Sin); 13 | DEFINE_UNARY_KERNEL(sin_i16, i16, FloatOutUnaryPromote, Sin); 14 | DEFINE_UNARY_KERNEL(sin_i32, i32, FloatOutUnaryPromote, Sin); 15 | DEFINE_UNARY_KERNEL(sin_i64, i64, FloatOutUnaryPromote, Sin); 16 | DEFINE_UNARY_KERNEL(sin_u8, u8, FloatOutUnaryPromote, Sin); 17 
| DEFINE_UNARY_KERNEL(sin_u16, u16, FloatOutUnaryPromote, Sin); 18 | DEFINE_UNARY_KERNEL(sin_u32, u32, FloatOutUnaryPromote, Sin); 19 | DEFINE_UNARY_KERNEL(sin_u64, u64, FloatOutUnaryPromote, Sin); 20 | -------------------------------------------------------------------------------- /hpt-cudakernels/src/unary/tan.cu: -------------------------------------------------------------------------------- 1 | #include "unary_template.cuh" 2 | #include "../utils/type_alias.cuh" 3 | #include "../utils/promotion/promotes.cuh" 4 | #include "../utils/check_type.cuh" 5 | #include "unary_classes.cuh" 6 | 7 | DEFINE_UNARY_KERNEL(tan_f16, f16, FloatOutUnaryPromote, Tan); 8 | DEFINE_UNARY_KERNEL(tan_bf16, bf16, FloatOutUnaryPromote, Tan); 9 | DEFINE_UNARY_KERNEL(tan_f32, f32, FloatOutUnaryPromote, Tan); 10 | DEFINE_UNARY_KERNEL(tan_f64, f64, FloatOutUnaryPromote, Tan); 11 | DEFINE_UNARY_KERNEL(tan_bool, bool, FloatOutUnaryPromote, Tan); 12 | DEFINE_UNARY_KERNEL(tan_i8, i8, FloatOutUnaryPromote, Tan); 13 | DEFINE_UNARY_KERNEL(tan_i16, i16, FloatOutUnaryPromote, Tan); 14 | DEFINE_UNARY_KERNEL(tan_i32, i32, FloatOutUnaryPromote, Tan); 15 | DEFINE_UNARY_KERNEL(tan_i64, i64, FloatOutUnaryPromote, Tan); 16 | DEFINE_UNARY_KERNEL(tan_u8, u8, FloatOutUnaryPromote, Tan); 17 | DEFINE_UNARY_KERNEL(tan_u16, u16, FloatOutUnaryPromote, Tan); 18 | DEFINE_UNARY_KERNEL(tan_u32, u32, FloatOutUnaryPromote, Tan); 19 | DEFINE_UNARY_KERNEL(tan_u64, u64, FloatOutUnaryPromote, Tan); 20 | -------------------------------------------------------------------------------- /hpt-cudakernels/src/unary/cos.cu: -------------------------------------------------------------------------------- 1 | #include "unary_template.cuh" 2 | #include "../utils/type_alias.cuh" 3 | #include "../utils/promotion/promotes.cuh" 4 | #include "../utils/check_type.cuh" 5 | #include "unary_classes.cuh" 6 | 7 | DEFINE_UNARY_KERNEL(cos_f16, f16, FloatOutUnaryPromote, Cos); 8 | DEFINE_UNARY_KERNEL(cos_bf16, bf16, FloatOutUnaryPromote, Cos); 9 | 
DEFINE_UNARY_KERNEL(cos_f32, f32, FloatOutUnaryPromote, Cos); 10 | DEFINE_UNARY_KERNEL(cos_f64, f64, FloatOutUnaryPromote, Cos); 11 | DEFINE_UNARY_KERNEL(cos_bool, bool, FloatOutUnaryPromote, Cos); 12 | DEFINE_UNARY_KERNEL(cos_i8, i8, FloatOutUnaryPromote, Cos); 13 | DEFINE_UNARY_KERNEL(cos_i16, i16, FloatOutUnaryPromote, Cos); 14 | DEFINE_UNARY_KERNEL(cos_i32, i32, FloatOutUnaryPromote, Cos); 15 | DEFINE_UNARY_KERNEL(cos_i64, i64, FloatOutUnaryPromote, Cos); 16 | DEFINE_UNARY_KERNEL(cos_u8, u8, FloatOutUnaryPromote, Cos); 17 | DEFINE_UNARY_KERNEL(cos_u16, u16, FloatOutUnaryPromote, Cos); 18 | DEFINE_UNARY_KERNEL(cos_u32, u32, FloatOutUnaryPromote, Cos); 19 | DEFINE_UNARY_KERNEL(cos_u64, u64, FloatOutUnaryPromote, Cos); 20 | 21 | -------------------------------------------------------------------------------- /docs/user_guide/windows/hamming_window.md: -------------------------------------------------------------------------------- 1 | # hamming_window 2 | ```rust 3 | hamming_window( 4 | window_length: i64, 5 | periodic: bool 6 | ) -> Result, TensorError> 7 | ``` 8 | Creates a Hamming window tensor. The Hamming window is a taper formed by using a weighted cosine. 9 | 10 | ## Parameters: 11 | `window_length`: The length of the window 12 | 13 | `periodic`: If true, returns a window to be used as periodic function. If false, returns a symmetric window 14 | 15 | ## Returns: 16 | A 1-D tensor containing the window. 
17 | 18 | ## Examples: 19 | ```rust 20 | use hpt::{error::TensorError, ops::WindowOps, Tensor}; 21 | fn main() -> Result<(), TensorError> { 22 | // Create a periodic Hamming window of length 5 23 | let a = Tensor::::hamming_window(5, true)?; 24 | println!("{}", a); 25 | // [0.0800, 0.3979, 0.9121, 0.9121, 0.0800] 26 | 27 | // Create a symmetric Hamming window of length 5 28 | let b = Tensor::::hamming_window(5, false)?; 29 | println!("{}", b); 30 | // [0.08, 0.54, 1.00, 0.54] 31 | 32 | Ok(()) 33 | } 34 | ``` -------------------------------------------------------------------------------- /docs/user_guide/creation/ones_like.md: -------------------------------------------------------------------------------- 1 | # ones_like 2 | ```rust 3 | ones_like( 4 | x: &Tensor 5 | ) -> Result, TensorError> 6 | ``` 7 | Creates a new tensor filled with ones with the same shape as the input tensor. 8 | 9 | ## Parameters: 10 | `x`: The input tensor whose shape will be used. 11 | 12 | ## Returns: 13 | A new tensor of ones with the same shape as the input tensor. 
14 | 15 | ## Examples: 16 | ```rust 17 | use hpt::{ 18 | error::TensorError, 19 | ops::{ShapeManipulate, TensorCreator}, 20 | Tensor, 21 | }; 22 | fn main() -> Result<(), TensorError> { 23 | // Create a tensor 24 | let a = Tensor::::new(&[1.0, 2.0, 3.0, 4.0]).reshape(&[2, 2])?; 25 | println!("a: {}", a); 26 | // [[1, 2], 27 | // [3, 4]] 28 | 29 | // Create a tensor of ones with same shape 30 | let b = a.ones_like()?; 31 | println!("b: {}", b); 32 | // [[1, 1], 33 | // [1, 1]] 34 | 35 | Ok(()) 36 | } 37 | ``` 38 | ## Backend Support 39 | | Backend | Supported | 40 | |---------|-----------| 41 | | CPU | ✅ | 42 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/windows/hann_window.md: -------------------------------------------------------------------------------- 1 | # hann_window 2 | ```rust 3 | hann_window( 4 | window_length: i64, 5 | periodic: bool 6 | ) -> Result, TensorError> 7 | ``` 8 | Creates a Hann window tensor. The Hann window is a taper formed by using a raised cosine with α = β = 0.5. 9 | 10 | ## Parameters: 11 | `window_length`: The length of the window 12 | 13 | `periodic`: If true, returns a window to be used as periodic function. If false, returns a symmetric window 14 | 15 | ## Returns: 16 | A 1-D tensor containing the window. 
17 | 18 | ## Examples: 19 | ```rust 20 | use hpt::{error::TensorError, ops::WindowOps, Tensor}; 21 | fn main() -> Result<(), TensorError> { 22 | // Create a periodic Hann window of length 5 23 | let a = Tensor::::hann_window(5, true)?; 24 | println!("{}", a); 25 | // [0.0000, 0.3455, 0.9045, 0.9045, 0.0000] 26 | 27 | // Create a symmetric Hann window of length 5 28 | let b = Tensor::::hann_window(5, false)?; 29 | println!("{}", b); 30 | // [0.0000, 0.5000, 1.0000, 0.5000, 0.0000] 31 | 32 | Ok(()) 33 | } 34 | ``` -------------------------------------------------------------------------------- /hpt-cudakernels/src/unary/erf.cu: -------------------------------------------------------------------------------- 1 | #include "unary_template.cuh" 2 | #include "../utils/type_alias.cuh" 3 | #include "../utils/promotion/promotes.cuh" 4 | #include "../utils/check_type.cuh" 5 | #include "unary_classes.cuh" 6 | 7 | 8 | DEFINE_UNARY_KERNEL(erf_f16, f16, FloatOutUnaryPromote, Erf); 9 | DEFINE_UNARY_KERNEL(erf_bf16, bf16, FloatOutUnaryPromote, Erf); 10 | DEFINE_UNARY_KERNEL(erf_f32, f32, FloatOutUnaryPromote, Erf); 11 | DEFINE_UNARY_KERNEL(erf_f64, f64, FloatOutUnaryPromote, Erf); 12 | DEFINE_UNARY_KERNEL(erf_bool, bool, FloatOutUnaryPromote, Erf); 13 | DEFINE_UNARY_KERNEL(erf_i8, i8, FloatOutUnaryPromote, Erf); 14 | DEFINE_UNARY_KERNEL(erf_i16, i16, FloatOutUnaryPromote, Erf); 15 | DEFINE_UNARY_KERNEL(erf_i32, i32, FloatOutUnaryPromote, Erf); 16 | DEFINE_UNARY_KERNEL(erf_i64, i64, FloatOutUnaryPromote, Erf); 17 | DEFINE_UNARY_KERNEL(erf_u8, u8, FloatOutUnaryPromote, Erf); 18 | DEFINE_UNARY_KERNEL(erf_u16, u16, FloatOutUnaryPromote, Erf); 19 | DEFINE_UNARY_KERNEL(erf_u32, u32, FloatOutUnaryPromote, Erf); 20 | DEFINE_UNARY_KERNEL(erf_u64, u64, FloatOutUnaryPromote, Erf); 21 | 22 | -------------------------------------------------------------------------------- /hpt-examples/examples/iterator/main.rs: -------------------------------------------------------------------------------- 1 
| use hpt::{ 2 | error::TensorError, 3 | iter::{ParStridedIteratorSimdZip, ParStridedIteratorZip, TensorIterator}, 4 | ops::{Random, TensorCreator}, 5 | Tensor, 6 | }; 7 | use rayon::iter::ParallelIterator; 8 | 9 | fn main() -> Result<(), TensorError> { 10 | let a = Tensor::::randn([2, 4, 6, 8])?; 11 | let mut b = Tensor::::empty([2, 4, 6, 8])?; 12 | 13 | b.par_iter_mut().zip(a.par_iter()).for_each(|(b, a)| { 14 | *b = a; 15 | }); 16 | println!("{}", b); 17 | 18 | let res = b 19 | .par_iter() 20 | .zip(a.par_iter()) 21 | .strided_map(|(res, (b, a))| *res = b + a) 22 | .collect::>(); 23 | println!("{}", res); 24 | 25 | let res = b 26 | .par_iter_simd() 27 | .zip(a.par_iter_simd()) 28 | .strided_map_simd( 29 | |(res, (b, a))| *res = b + a, 30 | |(res, (b, a))| res.write_unaligned(b + a), 31 | ) 32 | .collect::>(); 33 | println!("{}", res); 34 | 35 | Ok(()) 36 | } 37 | -------------------------------------------------------------------------------- /docs/dev_guide/iterator/iterator.md: -------------------------------------------------------------------------------- 1 | ### Iterator 2 | 3 | Iterator are implemented using Rayon trait `UnindexedProducer`, the tasks are splitted in [split](https://github.com/Jianqoq/Hpt/blob/d9a51874b3447d562b7c9d043b50eb05259b78c4/hpt-iterator/src/par_strided.rs#L541) method. The main loop is happened in [fold_with](https://github.com/Jianqoq/Hpt/blob/d9a51874b3447d562b7c9d043b50eb05259b78c4/hpt-iterator/src/par_strided_zip.rs#L471). Iterator can be used to implement elementwise or broadcast elementwise calculations. Usage can be found at [here](https://github.com/Jianqoq/Hpt/blob/d9a51874b3447d562b7c9d043b50eb05259b78c4/hpt/src/ops/cpu/utils/binary/binary_normal.rs#L46) and [here](https://github.com/Jianqoq/Hpt/blob/d9a51874b3447d562b7c9d043b50eb05259b78c4/hpt/src/ops/cpu/utils/unary/unary.rs#L19) 4 | 5 | ### Known issue 6 | 7 | 1. Current iterator like `ParStrided`, the `fold_with` method doesn't have any looping logic. 
Maybe we can write the same logic as `ParStridedZip` in `fold_with`. 8 | 9 | 2. When the outer loop size is 1, there will be no parallelism because the tasks split based on the outer loop size. -------------------------------------------------------------------------------- /hpt-types/src/into_vec.rs: -------------------------------------------------------------------------------- 1 | /// a trait to convert a vector to another vector 2 | pub trait IntoVec { 3 | /// convert a vector to another vector T 4 | fn into_vec(self) -> T; 5 | } 6 | 7 | #[cfg(all(target_feature = "avx2", not(target_feature = "avx512f")))] 8 | mod into_vec { 9 | use super::IntoVec; 10 | use crate::convertion::VecConvertor; 11 | use crate::simd::_256bit::common::*; 12 | use hpt_macros::impl_into_vec; 13 | impl_into_vec!(); 14 | } 15 | 16 | #[cfg(target_feature = "avx512f")] 17 | mod into_vec { 18 | use super::IntoVec; 19 | use crate::convertion::VecConvertor; 20 | use crate::simd::_512bit::common::*; 21 | use hpt_macros::impl_into_vec; 22 | impl_into_vec!(); 23 | } 24 | 25 | #[cfg(all( 26 | any(target_feature = "sse", target_arch = "arm", target_arch = "aarch64"), 27 | not(target_feature = "avx2") 28 | ))] 29 | mod into_vec { 30 | use super::IntoVec; 31 | use crate::convertion::VecConvertor; 32 | use crate::simd::_128bit::common::*; 33 | use hpt_macros::impl_into_vec; 34 | impl_into_vec!(); 35 | } 36 | -------------------------------------------------------------------------------- /docs/user_guide/creation/zeros_like.md: -------------------------------------------------------------------------------- 1 | # zeros_like 2 | ```rust 3 | zeros_like( 4 | x: &Tensor 5 | ) -> Result, TensorError> 6 | ``` 7 | Creates a new tensor filled with zeros with the same shape as the input tensor. 8 | 9 | ## Parameters: 10 | `x`: The input tensor whose shape will be used. 11 | 12 | ## Returns: 13 | A new tensor of zeros with the same shape as the input tensor. 
14 | 15 | ## Examples: 16 | ```rust 17 | use hpt::{ 18 | error::TensorError, 19 | ops::{ShapeManipulate, TensorCreator}, 20 | Tensor, 21 | }; 22 | fn main() -> Result<(), TensorError> { 23 | // Create a tensor 24 | let a = Tensor::::new(&[1.0, 2.0, 3.0, 4.0]).reshape(&[2, 2])?; 25 | println!("a: {}", a); 26 | // [[1, 2], 27 | // [3, 4]] 28 | 29 | // Create a tensor of zeros with same shape 30 | let b = a.zeros_like()?; 31 | println!("b: {}", b); 32 | // [[0, 0], 33 | // [0, 0]] 34 | 35 | Ok(()) 36 | } 37 | ``` 38 | ## Backend Support 39 | | Backend | Supported | 40 | |---------|-----------| 41 | | CPU | ✅ | 42 | | Cuda | ✅ | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_256bit/avx2/f64x4.rs: -------------------------------------------------------------------------------- 1 | 2 | use std::arch::x86_64::*; 3 | use crate::simd::_256bit::common::f64x4::f64x4; 4 | 5 | impl f64x4 { 6 | #[inline(always)] 7 | pub(crate) fn mul_add(self, a: Self, b: Self) -> Self { /* lane-wise self * a + b */ 8 | #[cfg(not(target_feature = "fma"))] /* no FMA: emulate with separate mul then add (two roundings) */ 9 | unsafe { 10 | f64x4(_mm256_add_pd(_mm256_mul_pd(self.0, a.0), b.0)) 11 | } 12 | #[cfg(target_feature = "fma")] /* fused multiply-add instruction (single rounding) */ 13 | unsafe { 14 | f64x4(_mm256_fmadd_pd(self.0, a.0, b.0)) 15 | } 16 | } 17 | #[inline(always)] 18 | pub(crate) fn splat(val: f64) -> f64x4 { /* broadcast `val` into all four lanes */ 19 | unsafe { f64x4(_mm256_set1_pd(val)) } 20 | } 21 | } 22 | 23 | impl std::ops::Add for f64x4 { 24 | type Output = Self; 25 | #[inline(always)] 26 | fn add(self, rhs: Self) -> Self { /* lane-wise addition */ 27 | unsafe { f64x4(_mm256_add_pd(self.0, rhs.0)) } 28 | } 29 | } 30 | impl std::ops::Mul for f64x4 { 31 | type Output = Self; 32 | #[inline(always)] 33 | fn mul(self, rhs: Self) -> Self { /* lane-wise multiplication */ 34 | unsafe { f64x4(_mm256_mul_pd(self.0, rhs.0)) } 35 | } 36 | } -------------------------------------------------------------------------------- /docs/user_guide/random/exponential.md: -------------------------------------------------------------------------------- 1 | # exponential 2 | ```rust 
3 | exponential( 4 | lambda: T, 5 | shape: &[i64] | &Vec | &[i64; _] 6 | ) -> Result, TensorError> 7 | ``` 8 | Create a Tensor with values drawn from an exponential distribution with rate parameter `lambda`. The exponential distribution describes the time between events in a Poisson point process. 9 | 10 | ## Parameters: 11 | `lambda`: Rate parameter (λ) of the exponential distribution. Must be positive. 12 | 13 | `shape`: Shape of the output tensor. 14 | 15 | ## Returns: 16 | Tensor with type `T` containing random values from the exponential distribution. 17 | 18 | ## Examples: 19 | ```rust 20 | use hpt::{error::TensorError, ops::Random, Tensor}; 21 | 22 | fn main() -> Result<(), TensorError> { 23 | // Create a 10x10 tensor with exponential distribution (λ=2.0) 24 | let a = Tensor::::exponential(2.0, &[10, 10])?; 25 | println!("{}", a); 26 | Ok(()) 27 | } 28 | ``` 29 | ## Backend Support 30 | | Backend | Supported | 31 | |---------|-----------| 32 | | CPU | ✅ | 33 | | Cuda | ❌ | -------------------------------------------------------------------------------- /hpt-cudakernels/src/unary/acos.cu: -------------------------------------------------------------------------------- 1 | #include "unary_template.cuh" 2 | #include "../utils/type_alias.cuh" 3 | #include "../utils/promotion/promotes.cuh" 4 | #include "../utils/check_type.cuh" 5 | #include "unary_classes.cuh" 6 | /* acos: one DEFINE_UNARY_KERNEL instantiation per supported input dtype */ 7 | DEFINE_UNARY_KERNEL(acos_f16, f16, FloatOutUnaryPromote, ACos); 8 | DEFINE_UNARY_KERNEL(acos_bf16, bf16, FloatOutUnaryPromote, ACos); 9 | DEFINE_UNARY_KERNEL(acos_f32, f32, FloatOutUnaryPromote, ACos); 10 | DEFINE_UNARY_KERNEL(acos_f64, f64, FloatOutUnaryPromote, ACos); 11 | DEFINE_UNARY_KERNEL(acos_bool, bool, FloatOutUnaryPromote, ACos); 12 | DEFINE_UNARY_KERNEL(acos_i8, i8, FloatOutUnaryPromote, ACos); 13 | DEFINE_UNARY_KERNEL(acos_i16, i16, FloatOutUnaryPromote, ACos); 14 | DEFINE_UNARY_KERNEL(acos_i32, i32, FloatOutUnaryPromote, ACos); 15 | DEFINE_UNARY_KERNEL(acos_i64, i64, 
FloatOutUnaryPromote, ACos); 16 | DEFINE_UNARY_KERNEL(acos_u8, u8, FloatOutUnaryPromote, ACos); 17 | DEFINE_UNARY_KERNEL(acos_u16, u16, FloatOutUnaryPromote, ACos); 18 | DEFINE_UNARY_KERNEL(acos_u32, u32, FloatOutUnaryPromote, ACos); 19 | DEFINE_UNARY_KERNEL(acos_u64, u64, FloatOutUnaryPromote, ACos); 20 | -------------------------------------------------------------------------------- /hpt-cudakernels/src/unary/asin.cu: -------------------------------------------------------------------------------- 1 | #include "unary_template.cuh" 2 | #include "../utils/type_alias.cuh" 3 | #include "../utils/promotion/promotes.cuh" 4 | #include "../utils/check_type.cuh" 5 | #include "unary_classes.cuh" 6 | /* asin: one DEFINE_UNARY_KERNEL instantiation per supported input dtype */ 7 | DEFINE_UNARY_KERNEL(asin_f16, f16, FloatOutUnaryPromote, ASin); 8 | DEFINE_UNARY_KERNEL(asin_bf16, bf16, FloatOutUnaryPromote, ASin); 9 | DEFINE_UNARY_KERNEL(asin_f32, f32, FloatOutUnaryPromote, ASin); 10 | DEFINE_UNARY_KERNEL(asin_f64, f64, FloatOutUnaryPromote, ASin); 11 | DEFINE_UNARY_KERNEL(asin_bool, bool, FloatOutUnaryPromote, ASin); 12 | DEFINE_UNARY_KERNEL(asin_i8, i8, FloatOutUnaryPromote, ASin); 13 | DEFINE_UNARY_KERNEL(asin_i16, i16, FloatOutUnaryPromote, ASin); 14 | DEFINE_UNARY_KERNEL(asin_i32, i32, FloatOutUnaryPromote, ASin); 15 | DEFINE_UNARY_KERNEL(asin_i64, i64, FloatOutUnaryPromote, ASin); 16 | DEFINE_UNARY_KERNEL(asin_u8, u8, FloatOutUnaryPromote, ASin); 17 | DEFINE_UNARY_KERNEL(asin_u16, u16, FloatOutUnaryPromote, ASin); 18 | DEFINE_UNARY_KERNEL(asin_u32, u32, FloatOutUnaryPromote, ASin); 19 | DEFINE_UNARY_KERNEL(asin_u64, u64, FloatOutUnaryPromote, ASin); 20 | -------------------------------------------------------------------------------- /hpt-cudakernels/src/unary/atan.cu: -------------------------------------------------------------------------------- 1 | #include "unary_template.cuh" 2 | #include "../utils/type_alias.cuh" 3 | #include "../utils/promotion/promotes.cuh" 4 | #include "../utils/check_type.cuh" 5 | #include "unary_classes.cuh" 6 | /* atan: one DEFINE_UNARY_KERNEL instantiation per supported input dtype */ 7 | 
DEFINE_UNARY_KERNEL(atan_f16, f16, FloatOutUnaryPromote, ATan); 8 | DEFINE_UNARY_KERNEL(atan_bf16, bf16, FloatOutUnaryPromote, ATan); 9 | DEFINE_UNARY_KERNEL(atan_f32, f32, FloatOutUnaryPromote, ATan); 10 | DEFINE_UNARY_KERNEL(atan_f64, f64, FloatOutUnaryPromote, ATan); 11 | DEFINE_UNARY_KERNEL(atan_bool, bool, FloatOutUnaryPromote, ATan); 12 | DEFINE_UNARY_KERNEL(atan_i8, i8, FloatOutUnaryPromote, ATan); 13 | DEFINE_UNARY_KERNEL(atan_i16, i16, FloatOutUnaryPromote, ATan); 14 | DEFINE_UNARY_KERNEL(atan_i32, i32, FloatOutUnaryPromote, ATan); 15 | DEFINE_UNARY_KERNEL(atan_i64, i64, FloatOutUnaryPromote, ATan); 16 | DEFINE_UNARY_KERNEL(atan_u8, u8, FloatOutUnaryPromote, ATan); 17 | DEFINE_UNARY_KERNEL(atan_u16, u16, FloatOutUnaryPromote, ATan); 18 | DEFINE_UNARY_KERNEL(atan_u32, u32, FloatOutUnaryPromote, ATan); 19 | DEFINE_UNARY_KERNEL(atan_u64, u64, FloatOutUnaryPromote, ATan); 20 | --------------------------------------------------------------------------------