├── docs ├── dev_guide │ ├── dev_guide.md │ ├── auto_diff │ │ ├── backward.md │ │ ├── difftensor.md │ │ └── introduction.md │ ├── pointer │ │ └── pointer.md │ ├── adding_new_op.md │ ├── test_rules.md │ ├── new_type.md │ └── iterator │ │ └── iterator.md ├── benchmarks │ └── benchmarks.md └── user_guide │ ├── user_guide.md │ ├── utils │ ├── set_seed.md │ ├── num_threads.md │ └── set_display_precision.md │ ├── custom_allocator │ └── custom_allocator.md │ ├── unary │ ├── acos.md │ ├── asin.md │ ├── atan.md │ ├── sin.md │ ├── sinh.md │ ├── cos.md │ ├── cosh.md │ ├── tan.md │ ├── tanh.md │ ├── abs.md │ ├── acosh.md │ ├── asinh.md │ ├── atanh.md │ ├── exp2.md │ ├── exp.md │ ├── sqrt.md │ ├── recip.md │ ├── log10.md │ ├── softplus.md │ ├── mish.md │ ├── cbrt.md │ ├── ln.md │ ├── log2.md │ ├── softsign.md │ ├── sigmoid.md │ ├── exp10.md │ ├── erf.md │ ├── sincos.md │ ├── gelu.md │ ├── cosh_.md │ ├── asin_.md │ ├── elu.md │ ├── hard_sigmoid.md │ ├── hard_swish.md │ ├── acos_.md │ ├── atan_.md │ ├── cos_.md │ ├── sin_.md │ ├── sinh_.md │ ├── tan_.md │ ├── ln_.md │ ├── tanh_.md │ ├── asinh_.md │ ├── abs_.md │ ├── acosh_.md │ ├── atanh_.md │ ├── celu.md │ ├── sqrt_.md │ ├── cbrt_.md │ ├── exp2_.md │ ├── exp_.md │ ├── log2_.md │ ├── log10_.md │ ├── recip_.md │ ├── mish_.md │ ├── softplus_.md │ ├── sigmoid_.md │ ├── softsign_.md │ ├── selu.md │ ├── erf_.md │ ├── exp10_.md │ ├── gelu_.md │ ├── elu_.md │ ├── hard_swish_.md │ ├── hard_sigmoid_.md │ └── celu_.md │ ├── random │ ├── randn_like.md │ ├── randn.md │ ├── rand_like.md │ ├── rand.md │ ├── beta.md │ └── exponential.md │ ├── associated_methods │ ├── cpu │ │ ├── to_cuda.md │ │ └── forget_copy.md │ └── cuda │ │ ├── to_cpu.md │ │ └── forget_copy.md │ ├── binary │ ├── add.md │ ├── div.md │ ├── mul.md │ ├── rem.md │ ├── sub.md │ ├── add_.md │ ├── div_.md │ ├── mul_.md │ ├── rem_.md │ ├── sub_.md │ └── pow_.md │ ├── custom_type │ └── custom_type.md │ ├── cmp │ ├── tensor_eq.md │ ├── tensor_gt.md │ ├── tensor_lt.md │ ├── 
tensor_neq.md │ ├── tensor_ge.md │ └── tensor_le.md │ ├── iterator │ ├── collect.md │ ├── par_iter_mut.md │ ├── par_iter.md │ └── strided_map.md │ ├── shape_manipulate │ └── flipud.md │ ├── creation │ ├── identity.md │ ├── arange.md │ ├── ones_like.md │ └── zeros_like.md │ └── windows │ ├── hamming_window.md │ └── hann_window.md ├── hpt-dyn ├── src │ ├── ops │ │ ├── tensor │ │ │ ├── cmp.rs │ │ │ └── conv2d │ │ │ │ └── type_kernels │ │ │ │ ├── i8_microkernels.rs │ │ │ │ ├── u8_microkernels.rs │ │ │ │ ├── bool_microkernels.rs │ │ │ │ ├── f32_microkernels.rs │ │ │ │ ├── f64_microkernels.rs │ │ │ │ ├── i16_microkernels.rs │ │ │ │ ├── i32_microkernels.rs │ │ │ │ ├── i64_microkernels.rs │ │ │ │ ├── u16_microkernels.rs │ │ │ │ ├── u32_microkernels.rs │ │ │ │ ├── u64_microkernels.rs │ │ │ │ ├── isize_microkernels.rs │ │ │ │ ├── usize_microkernels.rs │ │ │ │ ├── complex32_microkernels.rs │ │ │ │ └── complex64_microkernels.rs │ │ ├── common │ │ │ ├── traits.rs │ │ │ └── mod.rs │ │ ├── mod.rs │ │ └── models │ │ │ └── onnx.rs │ ├── utils │ │ ├── threadpool.rs │ │ ├── onnx │ │ │ ├── layout_sense.rs │ │ │ ├── load_model.rs │ │ │ ├── plot.rs │ │ │ └── parse_args │ │ │ │ ├── squeeze.rs │ │ │ │ ├── affine_grid.rs │ │ │ │ └── parse.rs │ │ └── mod.rs │ └── lib.rs └── build.rs ├── hpt-macros ├── src │ └── save_derive.rs ├── .gitignore └── Cargo.toml ├── hpt-bench ├── src │ └── main.rs ├── benches │ └── benchmarks │ │ └── broadcast │ │ └── broadcast.rs └── scan_benchmarks_result.py ├── hpt-tests ├── src │ ├── hpt_common │ │ ├── slice.rs │ │ ├── err_handler.rs │ │ ├── pointer.rs │ │ └── shape.rs │ ├── hpt │ │ ├── cpu │ │ │ ├── utils.rs │ │ │ ├── assert_utils.rs │ │ │ └── from_raw.rs │ │ └── cuda │ │ │ └── from_raw.rs │ ├── utils │ │ └── random_utils.rs │ ├── macro_tests │ │ ├── stmt_item │ │ │ └── item.rs │ │ └── control_flows │ │ │ ├── for_loop.expanded.rs │ │ │ └── while_loop.expanded.rs │ └── hpt_types │ │ └── test_display.rs └── .gitignore ├── hpt-types ├── src │ ├── dyn_dispatch │ 
│ └── vector.rs │ ├── vectors │ │ ├── .DS_Store │ │ └── arch_simd │ │ │ └── _128bit │ │ │ └── sse │ │ │ └── boolx16.rs │ ├── into_scalar.rs │ └── into_vec.rs ├── .gitignore └── Cargo.toml ├── hpt-cudakernels ├── src │ ├── utils │ │ ├── normalout.cuh │ │ ├── check_type.cuh │ │ ├── type_alias.cuh │ │ ├── loop_progress.cuh │ │ ├── promotion │ │ │ └── promotes.cuh │ │ └── extra_vecs.cuh │ ├── lib.rs │ ├── reginfo.rs │ ├── pooling │ │ └── pooling_template.cuh │ ├── reduce │ │ ├── sum.cu │ │ ├── prod.cu │ │ ├── all.cu │ │ ├── any.cu │ │ ├── max.cu │ │ ├── min.cu │ │ ├── nansum.cu │ │ ├── mean.cu │ │ ├── nanprod.cu │ │ ├── sum_square.cu │ │ ├── reducel1.cu │ │ ├── reducel2.cu │ │ ├── reducel3.cu │ │ ├── logsumexp.cu │ │ └── reduce_helper.cuh │ └── unary │ │ ├── ln.cu │ │ ├── exp.cu │ │ ├── sin.cu │ │ ├── tan.cu │ │ ├── cos.cu │ │ ├── erf.cu │ │ ├── acos.cu │ │ ├── asin.cu │ │ └── atan.cu ├── .gitignore └── Cargo.toml ├── hpt ├── .gitignore └── src │ └── backends │ ├── cuda │ ├── tensor_external │ │ └── advance.rs │ ├── cuda_slice.rs │ └── utils │ │ └── launch_cfg │ │ └── launch_cfg_trait.rs │ ├── common │ ├── readme.md │ └── conv.rs │ └── cpu │ ├── .DS_Store │ ├── kernels │ ├── conv2d │ │ └── type_kernels │ │ │ ├── bool_microkernels.rs │ │ │ ├── f32_microkernels.rs │ │ │ ├── f64_microkernels.rs │ │ │ ├── i16_microkernels.rs │ │ │ ├── i32_microkernels.rs │ │ │ ├── i64_microkernels.rs │ │ │ ├── i8_microkernels.rs │ │ │ ├── u16_microkernels.rs │ │ │ ├── u32_microkernels.rs │ │ │ ├── u64_microkernels.rs │ │ │ ├── u8_microkernels.rs │ │ │ ├── isize_microkernels.rs │ │ │ ├── usize_microkernels.rs │ │ │ ├── complex32_microkernels.rs │ │ │ └── complex64_microkernels.rs │ └── matmul │ │ ├── readme.md │ │ └── type_kernels │ │ ├── complex32_microkernels.rs │ │ └── complex64_microkernels.rs │ ├── tensor_external │ ├── cumulative.rs │ └── tensordot.rs │ └── tensor_internal │ └── cumulative.rs ├── matconv_simd ├── src │ └── simd │ │ ├── _512bit │ │ ├── avx512 │ │ │ ├── bf16x32.rs │ │ │ 
├── boolx64.rs │ │ │ ├── cplx32x8.rs │ │ │ ├── cplx64x4.rs │ │ │ ├── f64x8.rs │ │ │ ├── i16x32.rs │ │ │ ├── i32x16.rs │ │ │ ├── i8x64.rs │ │ │ ├── u32x16.rs │ │ │ ├── u64x8.rs │ │ │ ├── u8x64.rs │ │ │ └── u16x32.rs │ │ └── common │ │ │ ├── f32x16.rs │ │ │ ├── f64x8.rs │ │ │ ├── i32x16.rs │ │ │ ├── i8x64.rs │ │ │ ├── u8x64.rs │ │ │ ├── i16x32.rs │ │ │ ├── i64x8.rs │ │ │ ├── u64x8.rs │ │ │ └── mask.rs │ │ ├── .DS_Store │ │ ├── _128bit │ │ ├── common │ │ │ ├── f16x8.rs │ │ │ ├── f32x4.rs │ │ │ ├── f64x2.rs │ │ │ ├── i16x8.rs │ │ │ ├── i32x4.rs │ │ │ ├── u16x8.rs │ │ │ ├── i64x2.rs │ │ │ ├── u8x16.rs │ │ │ ├── u64x2.rs │ │ │ ├── i8x16.rs │ │ │ └── bf16x8.rs │ │ └── sse │ │ │ └── boolx16.rs │ │ └── _256bit │ │ ├── common │ │ ├── f32x8.rs │ │ ├── i32x8.rs │ │ ├── i8x32.rs │ │ ├── u8x32.rs │ │ ├── i16x16.rs │ │ ├── i64x4.rs │ │ ├── u64x4.rs │ │ └── f64x4.rs │ │ └── avx2 │ │ └── f64x4.rs └── Cargo.toml ├── hpt-common ├── .gitignore ├── src │ ├── utils │ │ ├── conv_algos.rs │ │ └── simd_ref.rs │ └── error │ │ ├── onnx.rs │ │ ├── mod.rs │ │ └── autograd.rs └── Cargo.toml ├── hpt-traits ├── .gitignore ├── Cargo.toml └── src │ └── ops │ └── slice.rs ├── hpt-allocator ├── .gitignore ├── src │ ├── allocators │ │ └── mod.rs │ ├── storage │ │ ├── cpu.rs │ │ └── cuda.rs │ ├── ptr.rs │ └── utils │ │ ├── cache_resize.rs │ │ └── deallocate.rs └── Cargo.toml ├── .DS_Store ├── hpt-examples ├── src │ └── main.rs ├── Cargo.toml └── examples │ └── iterator │ └── main.rs ├── hpt-matmul ├── .DS_Store ├── src │ └── .DS_Store └── Cargo.toml ├── .idea └── .gitignore ├── hpt-display ├── src │ └── lib.rs └── Cargo.toml ├── hpt-codegen ├── src │ └── fuse │ │ └── dead_node_elimination.rs └── Cargo.toml ├── .cargo └── config.toml ├── .gitignore ├── hpt-conv └── Cargo.toml ├── hpt-dataloader ├── src │ ├── from_safetensors │ │ └── from_safetensors.rs │ └── lib.rs └── Cargo.toml ├── hpt-iterator └── Cargo.toml ├── package.json └── .github └── workflows └── docs.yml /docs/dev_guide/dev_guide.md: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/benchmarks/benchmarks.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/user_guide/user_guide.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hpt-dyn/src/ops/tensor/cmp.rs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hpt-dyn/src/utils/threadpool.rs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hpt-macros/src/save_derive.rs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/dev_guide/auto_diff/backward.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/dev_guide/auto_diff/difftensor.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hpt-bench/src/main.rs: -------------------------------------------------------------------------------- 1 | fn main() {} 2 | -------------------------------------------------------------------------------- /hpt-dyn/src/ops/common/traits.rs: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- 
/hpt-dyn/src/utils/onnx/layout_sense.rs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hpt-tests/src/hpt_common/slice.rs: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /hpt-types/src/dyn_dispatch/vector.rs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/dev_guide/auto_diff/introduction.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hpt-cudakernels/src/utils/normalout.cuh: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hpt/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /true 3 | Cargo.lock -------------------------------------------------------------------------------- /hpt/src/backends/cuda/tensor_external/advance.rs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/avx512/bf16x32.rs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/avx512/boolx64.rs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/avx512/cplx32x8.rs: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/avx512/cplx64x4.rs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/avx512/f64x8.rs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/avx512/i16x32.rs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/avx512/i32x16.rs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/avx512/i8x64.rs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/avx512/u32x16.rs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/avx512/u64x8.rs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/avx512/u8x64.rs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hpt-bench/benches/benchmarks/broadcast/broadcast.rs: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hpt-common/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /true 3 | Cargo.lock -------------------------------------------------------------------------------- /hpt-dyn/src/ops/common/mod.rs: -------------------------------------------------------------------------------- 1 | pub(crate) mod traits; -------------------------------------------------------------------------------- /hpt-macros/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /true 3 | Cargo.lock -------------------------------------------------------------------------------- /hpt-tests/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /true 3 | Cargo.lock -------------------------------------------------------------------------------- /hpt-traits/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /true 3 | Cargo.lock -------------------------------------------------------------------------------- /hpt-types/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /true 3 | Cargo.lock -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/avx512/u16x32.rs: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /hpt-allocator/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /true 3 | Cargo.lock -------------------------------------------------------------------------------- /.DS_Store: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jianqoq/Hpt/HEAD/.DS_Store -------------------------------------------------------------------------------- /hpt-examples/src/main.rs: -------------------------------------------------------------------------------- 1 | fn main() { 2 | println!("Hello, world!"); 3 | } 4 | -------------------------------------------------------------------------------- /hpt-matmul/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jianqoq/Hpt/HEAD/hpt-matmul/.DS_Store -------------------------------------------------------------------------------- /hpt-matmul/src/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jianqoq/Hpt/HEAD/hpt-matmul/src/.DS_Store -------------------------------------------------------------------------------- /hpt/src/backends/common/readme.md: -------------------------------------------------------------------------------- 1 | This folder contains all the ops that supports all the backend -------------------------------------------------------------------------------- /hpt/src/backends/cpu/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jianqoq/Hpt/HEAD/hpt/src/backends/cpu/.DS_Store -------------------------------------------------------------------------------- /hpt-allocator/src/allocators/mod.rs: -------------------------------------------------------------------------------- 1 | pub(crate) mod cpu; 2 | #[cfg(feature = "cuda")] 3 | pub(crate) mod cuda; 4 | -------------------------------------------------------------------------------- /hpt-types/src/vectors/.DS_Store: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Jianqoq/Hpt/HEAD/hpt-types/src/vectors/.DS_Store -------------------------------------------------------------------------------- /matconv_simd/src/simd/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jianqoq/Hpt/HEAD/matconv_simd/src/simd/.DS_Store -------------------------------------------------------------------------------- /hpt-dyn/build.rs: -------------------------------------------------------------------------------- 1 | fn main() { 2 | // prost_build::compile_protos(&["src/onnx.proto"], &["src/"]).unwrap(); 3 | } -------------------------------------------------------------------------------- /hpt-dyn/src/ops/mod.rs: -------------------------------------------------------------------------------- 1 | pub(crate) mod common; 2 | pub(crate) mod tensor; 3 | pub(crate) mod models { 4 | pub(crate) mod onnx; 5 | } -------------------------------------------------------------------------------- /hpt-cudakernels/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod reginfo; 2 | pub use reginfo::RegisterInfo; 3 | #[cfg(feature = "cuda")] 4 | include!(concat!(env!("OUT_DIR"), "/generated_constants.rs")); 5 | -------------------------------------------------------------------------------- /hpt-dyn/src/ops/tensor/conv2d/type_kernels/i8_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::ops::tensor::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for i8 {} 4 | -------------------------------------------------------------------------------- /hpt-dyn/src/ops/tensor/conv2d/type_kernels/u8_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::ops::tensor::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for u8 {} 4 | 
-------------------------------------------------------------------------------- /hpt-dyn/src/ops/tensor/conv2d/type_kernels/bool_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::ops::tensor::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for bool {} 4 | -------------------------------------------------------------------------------- /hpt-dyn/src/ops/tensor/conv2d/type_kernels/f32_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::ops::tensor::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for f32 {} 4 | -------------------------------------------------------------------------------- /hpt-dyn/src/ops/tensor/conv2d/type_kernels/f64_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::ops::tensor::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for f64 {} 4 | -------------------------------------------------------------------------------- /hpt-dyn/src/ops/tensor/conv2d/type_kernels/i16_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::ops::tensor::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for i16 {} 4 | -------------------------------------------------------------------------------- /hpt-dyn/src/ops/tensor/conv2d/type_kernels/i32_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::ops::tensor::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for i32 {} 4 | -------------------------------------------------------------------------------- /hpt-dyn/src/ops/tensor/conv2d/type_kernels/i64_microkernels.rs: -------------------------------------------------------------------------------- 1 | use 
crate::ops::tensor::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for i64 {} 4 | -------------------------------------------------------------------------------- /hpt-dyn/src/ops/tensor/conv2d/type_kernels/u16_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::ops::tensor::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for u16 {} 4 | -------------------------------------------------------------------------------- /hpt-dyn/src/ops/tensor/conv2d/type_kernels/u32_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::ops::tensor::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for u32 {} 4 | -------------------------------------------------------------------------------- /hpt-dyn/src/ops/tensor/conv2d/type_kernels/u64_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::ops::tensor::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for u64 {} 4 | -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # 默认忽略的文件 2 | /shelf/ 3 | /workspace.xml 4 | # 基于编辑器的 HTTP 客户端请求 5 | /httpRequests/ 6 | # Datasource local storage ignored files 7 | /dataSources/ 8 | /dataSources.local.xml 9 | -------------------------------------------------------------------------------- /hpt-cudakernels/src/reginfo.rs: -------------------------------------------------------------------------------- 1 | #[derive(Debug, Clone, Copy)] 2 | pub struct RegisterInfo { 3 | pub pred: usize, 4 | pub b16: usize, 5 | pub b32: usize, 6 | pub b64: usize, 7 | } 8 | -------------------------------------------------------------------------------- 
/hpt-dyn/src/ops/tensor/conv2d/type_kernels/isize_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::backends::cpu::kernels::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for isize {} 4 | -------------------------------------------------------------------------------- /hpt-dyn/src/ops/tensor/conv2d/type_kernels/usize_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::backends::cpu::kernels::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for usize {} 4 | -------------------------------------------------------------------------------- /hpt/src/backends/cpu/kernels/conv2d/type_kernels/bool_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::backends::cpu::kernels::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for bool {} 4 | -------------------------------------------------------------------------------- /hpt/src/backends/cpu/kernels/conv2d/type_kernels/f32_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::backends::cpu::kernels::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for f32 {} 4 | -------------------------------------------------------------------------------- /hpt/src/backends/cpu/kernels/conv2d/type_kernels/f64_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::backends::cpu::kernels::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for f64 {} 4 | -------------------------------------------------------------------------------- /hpt/src/backends/cpu/kernels/conv2d/type_kernels/i16_microkernels.rs: -------------------------------------------------------------------------------- 1 | use 
crate::backends::cpu::kernels::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for i16 {} 4 | -------------------------------------------------------------------------------- /hpt/src/backends/cpu/kernels/conv2d/type_kernels/i32_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::backends::cpu::kernels::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for i32 {} 4 | -------------------------------------------------------------------------------- /hpt/src/backends/cpu/kernels/conv2d/type_kernels/i64_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::backends::cpu::kernels::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for i64 {} 4 | -------------------------------------------------------------------------------- /hpt/src/backends/cpu/kernels/conv2d/type_kernels/i8_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::backends::cpu::kernels::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for i8 {} 4 | -------------------------------------------------------------------------------- /hpt/src/backends/cpu/kernels/conv2d/type_kernels/u16_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::backends::cpu::kernels::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for u16 {} 4 | -------------------------------------------------------------------------------- /hpt/src/backends/cpu/kernels/conv2d/type_kernels/u32_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::backends::cpu::kernels::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for u32 {} 4 | 
-------------------------------------------------------------------------------- /hpt/src/backends/cpu/kernels/conv2d/type_kernels/u64_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::backends::cpu::kernels::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for u64 {} 4 | -------------------------------------------------------------------------------- /hpt/src/backends/cpu/kernels/conv2d/type_kernels/u8_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::backends::cpu::kernels::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for u8 {} 4 | -------------------------------------------------------------------------------- /hpt/src/backends/cpu/kernels/matmul/readme.md: -------------------------------------------------------------------------------- 1 | ## Acknowledgements 2 | 3 | Our matrix multiplication implementation is inspired from [gemm](https://github.com/sarah-quinones/gemm) by Sarah Quinones. 
-------------------------------------------------------------------------------- /hpt/src/backends/cpu/kernels/conv2d/type_kernels/isize_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::backends::cpu::kernels::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for isize {} 4 | -------------------------------------------------------------------------------- /hpt/src/backends/cpu/kernels/conv2d/type_kernels/usize_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::backends::cpu::kernels::conv2d::microkernel_trait::Conv2dMicroKernel; 2 | 3 | impl Conv2dMicroKernel for usize {} 4 | -------------------------------------------------------------------------------- /matconv_simd/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "matconv_simd" 3 | version = "0.1.0" 4 | edition = "2024" 5 | 6 | [dependencies] 7 | half = { workspace = true } 8 | num-complex = { workspace = true } 9 | -------------------------------------------------------------------------------- /hpt/src/backends/cpu/kernels/matmul/type_kernels/complex32_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::backends::cpu::kernels::matmul::microkernel_trait::MatmulMicroKernel; 2 | use num::complex::Complex32; 3 | impl MatmulMicroKernel for Complex32 {} 4 | -------------------------------------------------------------------------------- /hpt/src/backends/cpu/kernels/matmul/type_kernels/complex64_microkernels.rs: -------------------------------------------------------------------------------- 1 | use crate::backends::cpu::kernels::matmul::microkernel_trait::MatmulMicroKernel; 2 | use num::complex::Complex64; 3 | impl MatmulMicroKernel for Complex64 {} 4 | -------------------------------------------------------------------------------- 
/hpt-dyn/src/ops/tensor/conv2d/type_kernels/complex32_microkernels.rs: -------------------------------------------------------------------------------- 1 | use num::complex::Complex32; 2 | 3 | use crate::backends::cpu::kernels::conv2d::microkernel_trait::Conv2dMicroKernel; 4 | 5 | impl Conv2dMicroKernel for Complex32 {} 6 | -------------------------------------------------------------------------------- /hpt-dyn/src/ops/tensor/conv2d/type_kernels/complex64_microkernels.rs: -------------------------------------------------------------------------------- 1 | use num::complex::Complex64; 2 | 3 | use crate::backends::cpu::kernels::conv2d::microkernel_trait::Conv2dMicroKernel; 4 | 5 | impl Conv2dMicroKernel for Complex64 {} 6 | -------------------------------------------------------------------------------- /hpt/src/backends/cuda/cuda_slice.rs: -------------------------------------------------------------------------------- 1 | use cudarc::driver::DeviceRepr; 2 | 3 | #[repr(C)] 4 | pub(crate) struct CudaSlice { 5 | pub(crate) inner: cudarc::driver::sys::CUdeviceptr, 6 | } 7 | 8 | unsafe impl DeviceRepr for CudaSlice {} 9 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_128bit/common/f16x8.rs: -------------------------------------------------------------------------------- 1 | 2 | /// a vector of 8 f16 values 3 | #[allow(non_camel_case_types)] 4 | #[derive(Default, Clone, Copy, PartialEq, Debug)] 5 | #[repr(C, align(16))] 6 | pub struct f16x8(pub(crate) [half::f16; 8]); -------------------------------------------------------------------------------- /hpt/src/backends/cpu/kernels/conv2d/type_kernels/complex32_microkernels.rs: -------------------------------------------------------------------------------- 1 | use num::complex::Complex32; 2 | 3 | use crate::backends::cpu::kernels::conv2d::microkernel_trait::Conv2dMicroKernel; 4 | 5 | impl Conv2dMicroKernel for Complex32 {} 6 | 
-------------------------------------------------------------------------------- /hpt/src/backends/cpu/kernels/conv2d/type_kernels/complex64_microkernels.rs: -------------------------------------------------------------------------------- 1 | use num::complex::Complex64; 2 | 3 | use crate::backends::cpu::kernels::conv2d::microkernel_trait::Conv2dMicroKernel; 4 | 5 | impl Conv2dMicroKernel for Complex64 {} 6 | -------------------------------------------------------------------------------- /hpt-display/src/lib.rs: -------------------------------------------------------------------------------- 1 | //! This crate is used to display n-dimensional arrays 2 | 3 | #![deny(missing_docs)] 4 | 5 | /// A module contains display function 6 | mod display; 7 | /// A module contains formats 8 | mod formats; 9 | pub use display::display; 10 | -------------------------------------------------------------------------------- /hpt-cudakernels/src/utils/check_type.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define CHECK_FLOAT_TYPE(T) \ 4 | static_assert(std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v, "T must be half, __nv_bfloat16, float, or double"); 5 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/common/f32x16.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_arch = "x86_64")] 3 | use std::arch::x86_64::*; 4 | 5 | #[allow(non_camel_case_types)] 6 | #[derive(Clone, Copy, Debug)] 7 | #[repr(C, align(64))] 8 | pub struct f32x16(#[cfg(target_arch = "x86_64")] pub(crate) __m512); -------------------------------------------------------------------------------- /hpt-codegen/src/fuse/dead_node_elimination.rs: -------------------------------------------------------------------------------- 1 | use super::cfg::CFG; 2 | 3 | pub(crate) struct _NodeEliminator<'a> { 4 | pub(crate) cfg: 
&'a mut CFG, 5 | pub(crate) current_assignment: Option, 6 | } 7 | 8 | impl<'ast> syn::visit::Visit<'ast> for _NodeEliminator<'ast> {} 9 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_256bit/common/f32x8.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_arch = "x86_64")] 3 | use std::arch::x86_64::*; 4 | 5 | /// a vector of 8 f32 values 6 | #[allow(non_camel_case_types)] 7 | #[derive(Clone, Copy, Debug)] 8 | #[repr(C, align(32))] 9 | pub struct f32x8(#[cfg(target_arch = "x86_64")] pub(crate) __m256); -------------------------------------------------------------------------------- /matconv_simd/src/simd/_256bit/common/i32x8.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_arch = "x86_64")] 3 | use std::arch::x86_64::*; 4 | 5 | /// a vector of 8 i32 values 6 | #[allow(non_camel_case_types)] 7 | #[derive(Clone, Copy, Debug)] 8 | #[repr(C, align(32))] 9 | pub struct i32x8(#[cfg(target_arch = "x86_64")] pub(crate) __m256i); -------------------------------------------------------------------------------- /matconv_simd/src/simd/_256bit/common/i8x32.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_arch = "x86_64")] 3 | use std::arch::x86_64::*; 4 | 5 | /// a vector of 32 i8 values 6 | #[allow(non_camel_case_types)] 7 | #[derive(Clone, Copy, Debug)] 8 | #[repr(C, align(32))] 9 | pub struct i8x32(#[cfg(target_arch = "x86_64")] pub(crate) __m256i); -------------------------------------------------------------------------------- /matconv_simd/src/simd/_256bit/common/u8x32.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_arch = "x86_64")] 3 | use std::arch::x86_64::*; 4 | 5 | /// a vector of 32 u8 values 6 | #[allow(non_camel_case_types)] 7 | #[derive(Clone, Copy, Debug)] 8 | #[repr(C, align(32))] 
9 | pub struct u8x32(#[cfg(target_arch = "x86_64")] pub(crate) __m256i); -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/common/f64x8.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_arch = "x86_64")] 3 | use std::arch::x86_64::*; 4 | 5 | /// a vector of 8 f64 values 6 | #[allow(non_camel_case_types)] 7 | #[derive(Clone, Copy, Debug)] 8 | #[repr(C, align(64))] 9 | pub struct f64x8(#[cfg(target_arch = "x86_64")] pub(crate) __m512d); -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/common/i32x16.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_arch = "x86_64")] 3 | use std::arch::x86_64::*; 4 | 5 | /// a vector of 16 i32 values 6 | #[allow(non_camel_case_types)] 7 | #[derive(Clone, Copy, Debug)] 8 | #[repr(C, align(64))] 9 | pub struct i32x16(#[cfg(target_arch = "x86_64")] pub(crate) __m512i); -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/common/i8x64.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_arch = "x86_64")] 3 | use std::arch::x86_64::*; 4 | 5 | /// a vector of 64 i8 values 6 | #[allow(non_camel_case_types)] 7 | #[derive(Clone, Copy, Debug)] 8 | #[repr(C, align(64))] 9 | pub struct i8x64(#[cfg(target_arch = "x86_64")] pub(crate) __m512i); -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/common/u8x64.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_arch = "x86_64")] 3 | use std::arch::x86_64::*; 4 | 5 | /// a vector of 64 u8 values 6 | #[allow(non_camel_case_types)] 7 | #[derive(Clone, Copy, Debug)] 8 | #[repr(C, align(64))] 9 | pub struct u8x64(#[cfg(target_arch = "x86_64")] 
pub(crate) __m512i); -------------------------------------------------------------------------------- /.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [target.'cfg(target_os = "macos")'] 2 | rustflags = ["-C", "target-cpu=native"] 3 | 4 | [target.'cfg(target_os = "windows")'] 5 | rustflags = [ 6 | "-C", 7 | "target-feature=+avx2", 8 | "-C", 9 | "target-feature=+fma", 10 | "-C", 11 | "target-feature=+f16c", 12 | ] 13 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_256bit/common/i16x16.rs: -------------------------------------------------------------------------------- 1 | #[cfg(target_arch = "x86_64")] 2 | use std::arch::x86_64::*; 3 | 4 | /// a vector of 16 i16 values 5 | #[allow(non_camel_case_types)] 6 | #[derive(Clone, Copy, Debug)] 7 | #[repr(C, align(32))] 8 | pub struct i16x16(#[cfg(target_arch = "x86_64")] pub(crate) __m256i); 9 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/common/i16x32.rs: -------------------------------------------------------------------------------- 1 | #[cfg(target_arch = "x86_64")] 2 | use std::arch::x86_64::*; 3 | 4 | /// a vector of 32 i16 values 5 | #[allow(non_camel_case_types)] 6 | #[derive(Clone, Copy, Debug)] 7 | #[repr(C, align(64))] 8 | pub struct i16x32(#[cfg(target_arch = "x86_64")] pub(crate) __m512i); 9 | -------------------------------------------------------------------------------- /hpt-allocator/src/storage/cpu.rs: -------------------------------------------------------------------------------- 1 | use dashmap::DashMap; 2 | use once_cell::sync::Lazy; 3 | 4 | use super::CommonStorage; 5 | 6 | /// This is a global variable that stores the allocated ptrs and their reference count for CPU devices 7 | pub static CPU_STORAGE: Lazy> = Lazy::new(|| DashMap::new()); 8 | 
-------------------------------------------------------------------------------- /matconv_simd/src/simd/_256bit/common/i64x4.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_arch = "x86_64")] 3 | use std::arch::x86_64::*; 4 | 5 | /// a vector of 4 i64 values 6 | #[allow(non_camel_case_types)] 7 | #[derive(Clone, Copy, Debug)] 8 | #[repr(C, align(32))] 9 | pub struct i64x4(#[cfg(target_arch = "x86_64")] pub(crate) __m256i); 10 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_256bit/common/u64x4.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_arch = "x86_64")] 3 | use std::arch::x86_64::*; 4 | 5 | /// a vector of 4 u64 values 6 | #[allow(non_camel_case_types)] 7 | #[derive(Clone, Copy, Debug)] 8 | #[repr(C, align(32))] 9 | pub struct u64x4(#[cfg(target_arch = "x86_64")] pub(crate) __m256i); 10 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/common/i64x8.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_arch = "x86_64")] 3 | use std::arch::x86_64::*; 4 | 5 | /// a vector of 8 i64 values 6 | #[allow(non_camel_case_types)] 7 | #[derive(Clone, Copy, Debug)] 8 | #[repr(C, align(64))] 9 | pub struct i64x8(#[cfg(target_arch = "x86_64")] pub(crate) __m512i); 10 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/common/u64x8.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_arch = "x86_64")] 3 | use std::arch::x86_64::*; 4 | 5 | /// a vector of 8 u64 values 6 | #[allow(non_camel_case_types)] 7 | #[derive(Clone, Copy, Debug)] 8 | #[repr(C, align(64))] 9 | pub struct u64x8(#[cfg(target_arch = "x86_64")] pub(crate) __m512i); 10 | 
-------------------------------------------------------------------------------- /matconv_simd/src/simd/_256bit/common/f64x4.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_arch = "x86_64")] 3 | use std::arch::x86_64::*; 4 | 5 | /// a vector of 4 f64 values 6 | #[allow(non_camel_case_types)] 7 | #[derive(Clone, Copy, Debug)] 8 | #[repr(C, align(32))] 9 | pub struct f64x4(#[cfg(target_arch = "x86_64")] pub(crate) __m256d); 10 | 11 | -------------------------------------------------------------------------------- /hpt-allocator/src/ptr.rs: -------------------------------------------------------------------------------- 1 | /// just a wrapper around `*mut u8`, implementing `Send` and `Sync` trait to let the compiler know that it is safe to send and share across threads 2 | #[derive(Debug, PartialEq, Eq, Hash, Clone)] 3 | pub(crate) struct SafePtr { 4 | pub(crate) ptr: *mut u8, 5 | } 6 | unsafe impl Send for SafePtr {} 7 | unsafe impl Sync for SafePtr {} 8 | -------------------------------------------------------------------------------- /docs/user_guide/utils/set_seed.md: -------------------------------------------------------------------------------- 1 | # set_seed 2 | 3 | ```rust 4 | set_seed(seed: u64) 5 | ``` 6 | 7 | Set the seed for random number generation 8 | ## Parameters: 9 | `seed`: seed for generating random number 10 | `B`: hpt::Cuda | hpt::Cpu 11 | 12 | 13 | ## Backend Support 14 | | Backend | Supported | 15 | |---------|-----------| 16 | | CPU | ❌ | 17 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-allocator/src/storage/cuda.rs: -------------------------------------------------------------------------------- 1 | use std::{collections::HashMap, sync::Mutex}; 2 | 3 | use once_cell::sync::Lazy; 4 | 5 | use crate::storage::CommonStorage; 6 | 7 | /// This is a global variable that stores the allocated ptrs and their reference count for CUDA devices 
8 | pub static CUDA_STORAGE: Lazy>> = 9 | Lazy::new(|| Mutex::new(HashMap::new())); 10 | -------------------------------------------------------------------------------- /hpt-cudakernels/src/utils/type_alias.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define f32 float 4 | #define f64 double 5 | #define bf16 __nv_bfloat16 6 | #define f16 __half 7 | 8 | #define i8 int8_t 9 | #define i16 int16_t 10 | #define i32 int32_t 11 | #define i64 int64_t 12 | #define u8 uint8_t 13 | #define u16 uint16_t 14 | #define u32 uint32_t 15 | #define u64 uint64_t 16 | 17 | #define bf162 __nv_bfloat162 18 | #define f162 __half2 19 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_128bit/common/f32x4.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_feature = "neon")] 3 | use std::arch::aarch64::*; 4 | #[cfg(target_arch = "x86_64")] 5 | use std::arch::x86_64::*; 6 | 7 | /// a vector of 4 f32 values 8 | #[allow(non_camel_case_types)] 9 | #[derive(Clone, Copy, Debug)] 10 | #[repr(C, align(16))] 11 | pub struct f32x4( 12 | #[cfg(target_arch = "x86_64")] pub(crate) __m128, 13 | #[cfg(target_arch = "aarch64")] pub(crate) float32x4_t, 14 | ); 15 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_128bit/common/f64x2.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_feature = "neon")] 3 | use std::arch::aarch64::*; 4 | #[cfg(target_arch = "x86_64")] 5 | use std::arch::x86_64::*; 6 | 7 | /// a vector of 2 f64 values 8 | #[allow(non_camel_case_types)] 9 | #[derive(Clone, Copy, Debug)] 10 | #[repr(C, align(16))] 11 | pub struct f64x2( 12 | #[cfg(target_arch = "x86_64")] pub(crate) __m128d, 13 | #[cfg(target_arch = "aarch64")] pub(crate) float64x2_t, 14 | ); 15 | 
-------------------------------------------------------------------------------- /matconv_simd/src/simd/_128bit/common/i16x8.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_arch = "aarch64")] 3 | use std::arch::aarch64::*; 4 | #[cfg(target_arch = "x86_64")] 5 | use std::arch::x86_64::*; 6 | 7 | /// a vector of 8 i16 values 8 | #[allow(non_camel_case_types)] 9 | #[derive(Clone, Copy, Debug)] 10 | #[repr(C, align(16))] 11 | pub struct i16x8( 12 | #[cfg(target_arch = "x86_64")] pub(crate) __m128i, 13 | #[cfg(target_arch = "aarch64")] pub(crate) int16x8_t, 14 | ); 15 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_128bit/common/i32x4.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_arch = "aarch64")] 3 | use std::arch::aarch64::*; 4 | #[cfg(target_arch = "x86_64")] 5 | use std::arch::x86_64::*; 6 | 7 | /// a vector of 4 i32 values 8 | #[allow(non_camel_case_types)] 9 | #[derive(Clone, Copy, Debug)] 10 | #[repr(C, align(16))] 11 | pub struct i32x4( 12 | #[cfg(target_arch = "x86_64")] pub(crate) __m128i, 13 | #[cfg(target_arch = "aarch64")] pub(crate) int32x4_t, 14 | ); 15 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_128bit/common/u16x8.rs: -------------------------------------------------------------------------------- 1 | 2 | 3 | #[cfg(target_arch = "aarch64")] 4 | use std::arch::aarch64::*; 5 | #[cfg(target_arch = "x86_64")] 6 | use std::arch::x86_64::*; 7 | 8 | /// a vector of 8 u16 values 9 | #[allow(non_camel_case_types)] 10 | #[derive(Clone, Copy, Debug)] 11 | #[repr(C, align(16))] 12 | pub struct u16x8( 13 | #[cfg(target_arch = "x86_64")] pub(crate) __m128i, 14 | #[cfg(target_arch = "aarch64")] pub(crate) uint16x8_t, 15 | ); -------------------------------------------------------------------------------- 
/hpt-common/src/utils/conv_algos.rs: -------------------------------------------------------------------------------- 1 | /// enum for conv algorithms 2 | pub enum ConvAlgo { 3 | /// ImplicitGemm 4 | ImplicitGemm, 5 | /// ImplicitPrecompGemm 6 | ImplicitPrecompGemm, 7 | /// Gemm 8 | Gemm, 9 | /// Direct 10 | Direct, 11 | /// Fft 12 | Fft, 13 | /// FftTiling 14 | FftTiling, 15 | /// Winograd 16 | Winograd, 17 | /// WinogradNonFused 18 | WinogradNonFused, 19 | /// Count 20 | Count, 21 | } 22 | -------------------------------------------------------------------------------- /hpt-tests/src/hpt/cpu/utils.rs: -------------------------------------------------------------------------------- 1 | use hpt::Tensor; 2 | 3 | use hpt::common::cpu::TensorLike; 4 | use hpt::common::{CommonBounds, TensorInfo}; 5 | 6 | pub(crate) fn copy_from_tch( 7 | a: &mut Tensor, 8 | tch_a: &tch::Tensor, 9 | ) -> anyhow::Result<()> { 10 | let a_size = a.size(); 11 | a.as_raw_mut().copy_from_slice(unsafe { 12 | std::slice::from_raw_parts(tch_a.data_ptr() as *const T, a_size) 13 | }); 14 | Ok(()) 15 | } 16 | -------------------------------------------------------------------------------- /hpt-dyn/src/utils/onnx/load_model.rs: -------------------------------------------------------------------------------- 1 | use std::{fs::File, io::Read}; 2 | 3 | use prost::Message; 4 | 5 | use crate::{onnx::ModelProto, ops::models::onnx::OnnxModel}; 6 | 7 | pub fn load_onnx(path: &str) -> Result { 8 | let mut file = File::open(path).expect("找不到模型文件"); 9 | let mut buf = Vec::new(); 10 | file.read_to_end(&mut buf).unwrap(); 11 | 12 | let model = ModelProto::decode(&*buf).expect("模型解析失败"); 13 | 14 | Ok(OnnxModel::Model(model)) 15 | } 16 | -------------------------------------------------------------------------------- /hpt-cudakernels/src/utils/loop_progress.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | template 4 | struct ProgressUpdater 5 | { 6 | 
Func update_func; 7 | T *data; 8 | __device__ __forceinline__ ProgressUpdater(Func f, T *data) : update_func(f), data(data) {} 9 | __device__ __forceinline__ void update() { update_func(data); } 10 | __device__ __forceinline__ T get() const { return *data; } 11 | __device__ __forceinline__ void set_ptr(T *data) { this->data = data; } 12 | }; -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /true 3 | Cargo.lock 4 | *.ft 5 | *.ftz 6 | *.zip 7 | *.ll 8 | *.xlsx 9 | *.7z 10 | *.safetensors 11 | *.txt 12 | *.model 13 | *.xml 14 | *.onnx 15 | 16 | .vscode/ 17 | 18 | resnet.safetensor 19 | resnet_inp 20 | 21 | /src/**/target 22 | /src/**/true 23 | /src/**/*.ll 24 | /src/**/*.txt 25 | /src/tensor-graph/src-tauri/*.txt 26 | /.VSCodeCounter 27 | 28 | node_modules/ 29 | docs/.vuepress/.cache/ 30 | docs/.vuepress/.temp/ 31 | docs/.vuepress/dist/ 32 | 33 | cutlass_gemm.cu 34 | cutlass 35 | 36 | package-lock.json -------------------------------------------------------------------------------- /hpt-dyn/src/utils/onnx/plot.rs: -------------------------------------------------------------------------------- 1 | use petgraph::{dot::{Config, Dot}, prelude::StableGraph}; 2 | 3 | #[allow(unused)] 4 | pub(crate) fn generate_online_graphviz_link(graph: &StableGraph) -> String 5 | where N: std::fmt::Debug, E: std::fmt::Debug 6 | { 7 | let dot = Dot::with_config(&graph, &[Config::EdgeNoLabel]); 8 | let dot_string = format!("{:?}", dot); 9 | 10 | // URL-encode the DOT string so it can be embedded in the link 11 | let encoded = urlencoding::encode(&dot_string); 12 | format!("https://dreampuf.github.io/GraphvizOnline/#{encoded}") 13 | } 14 | -------------------------------------------------------------------------------- /hpt-display/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hpt-display" 3 | version = "0.1.2" 4 | edition = 
"2021" 5 | license = "MIT OR Apache-2.0" 6 | description = "An internal library for displaying tensors" 7 | authors = ["JianJian Li "] 8 | repository = "https://github.com/Jianqoq/Hpt" 9 | 10 | [dependencies] 11 | hpt-traits = { path = "../hpt-traits", version = "0.1.1" } 12 | hpt-common = { path = "../hpt-common", version = "0.1.1" } 13 | hpt-types = { path = "../hpt-types", version = "0.1.1" } 14 | num-complex = { workspace = true } 15 | -------------------------------------------------------------------------------- /hpt-conv/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hpt-conv" 3 | version = "0.1.0" 4 | edition = "2024" 5 | 6 | [dependencies] 7 | hpt-matmul = { path = "../hpt-matmul" } 8 | matconv_simd = { path = "../matconv_simd" } 9 | seq-macro = { workspace = true } 10 | half = { workspace = true } 11 | 12 | [features] 13 | default = ["f32", "f16"] 14 | bound_check = [] 15 | bool = [] 16 | f32 = [] 17 | f16 = [] 18 | bf16 = [] 19 | f64 = [] 20 | i8 = [] 21 | u8 = [] 22 | i16 = [] 23 | u16 = [] 24 | i32 = [] 25 | u32 = [] 26 | i64 = [] 27 | u64 = [] 28 | cplx32 = [] 29 | cplx64 = [] -------------------------------------------------------------------------------- /hpt-dyn/src/utils/onnx/parse_args/squeeze.rs: -------------------------------------------------------------------------------- 1 | use super::parse::{ Parse, ParseArgs }; 2 | 3 | pub(crate) struct SqueezeArgs<'a> { 4 | pub(crate) data: &'a str, 5 | pub(crate) axes: Option<&'a str>, 6 | } 7 | 8 | impl<'a> Parse<'a> for SqueezeArgs<'a> { 9 | fn parse<'b: 'a>(node: &'b crate::onnx::NodeProto) -> SqueezeArgs<'a> { 10 | let data = node.input[0].as_str(); 11 | let axes = node.input.get(1).map(|s| s.as_str()); 12 | SqueezeArgs { 13 | data, 14 | axes, 15 | } 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /hpt/src/backends/common/conv.rs: 
-------------------------------------------------------------------------------- 1 | pub(crate) fn cal_conv2d_output_shape( 2 | img_height: i64, 3 | img_width: i64, 4 | kh: i64, 5 | kw: i64, 6 | padding: &[(i64, i64); 2], 7 | stride: &[i64; 2], 8 | dilation: &[i64; 2], 9 | ) -> (i64, i64) { 10 | let out_height = 11 | (img_height + padding[0].0 + padding[0].1 - dilation[0] * (kh - 1) - 1) / stride[0] + 1; 12 | let out_width = 13 | (img_width + padding[1].0 + padding[1].1 - dilation[1] * (kw - 1) - 1) / stride[1] + 1; 14 | (out_height, out_width) 15 | } 16 | -------------------------------------------------------------------------------- /hpt/src/backends/cuda/utils/launch_cfg/launch_cfg_trait.rs: -------------------------------------------------------------------------------- 1 | use cudarc::driver::LaunchConfig; 2 | 3 | pub(crate) trait LaunchConfigUtils { 4 | #[allow(unused)] 5 | fn block_size(&self) -> u32; 6 | #[allow(unused)] 7 | fn grid_size(&self) -> u32; 8 | } 9 | 10 | impl LaunchConfigUtils for LaunchConfig { 11 | fn block_size(&self) -> u32 { 12 | self.block_dim.0 * self.block_dim.1 * self.block_dim.2 13 | } 14 | 15 | fn grid_size(&self) -> u32 { 16 | self.grid_dim.0 * self.grid_dim.1 * self.grid_dim.2 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /hpt-cudakernels/src/utils/promotion/promotes.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "normal_promote/bool.cuh" 3 | #include "normal_promote/i8.cuh" 4 | #include "normal_promote/i16.cuh" 5 | #include "normal_promote/i32.cuh" 6 | #include "normal_promote/i64.cuh" 7 | #include "normal_promote/u8.cuh" 8 | #include "normal_promote/u16.cuh" 9 | #include "normal_promote/u32.cuh" 10 | #include "normal_promote/u64.cuh" 11 | 12 | #include "normal_promote/f16.cuh" 13 | #include "normal_promote/bf16.cuh" 14 | #include "normal_promote/f32.cuh" 15 | #include "normal_promote/f64.cuh" 16 | 
-------------------------------------------------------------------------------- /docs/user_guide/custom_allocator/custom_allocator.md: -------------------------------------------------------------------------------- 1 | # Custom Allocator 2 | 3 | Since hpt is designed in purely generic types, the user can define their own memory allocator. 4 | 5 | # How 6 | 7 | You can reference the steps at [here](https://github.com/Jianqoq/Hpt/tree/main/hpt-examples/examples/custom_allocator/main.rs). 8 | 9 | # Note 10 | 11 | Custom Allocator must have life time `'static`, `Send`, `Sync`, `Clone` implemented 12 | 13 | ## Backend Support 14 | | Backend | Supported | 15 | |---------|-----------| 16 | | CPU | ✅ | 17 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-dataloader/src/from_safetensors/from_safetensors.rs: -------------------------------------------------------------------------------- 1 | use safetensors::SafeTensors; 2 | 3 | #[diagnostic::on_unimplemented( 4 | message = "Cannot perform operation on type `{Self}` because it doesn't implement required features" 5 | )] 6 | pub trait FromSafeTensors { 7 | fn from_safe_tensors(data: &SafeTensors, tensor_name: &str) -> Self; 8 | } 9 | 10 | impl FromSafeTensors for Option { 11 | fn from_safe_tensors(data: &SafeTensors, tensor_name: &str) -> Self { 12 | Some(T::from_safe_tensors(data, tensor_name)) 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /hpt-cudakernels/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /true 3 | Cargo.lock 4 | *.ft 5 | *.ftz 6 | *.zip 7 | *.ll 8 | *.xlsx 9 | *.7z 10 | *.safetensors 11 | *.txt 12 | *.model 13 | *.xml 14 | 15 | resnet.safetensor 16 | resnet_inp 17 | 18 | /src/**/target 19 | /src/**/true 20 | /src/**/*.ll 21 | /src/**/*.txt 22 | /src/tensor-graph/src-tauri/*.txt 23 | /.VSCodeCounter 24 | 25 | node_modules/ 26 | 
docs/.vuepress/.cache/ 27 | docs/.vuepress/.temp/ 28 | docs/.vuepress/dist/ 29 | 30 | ./build.rs 31 | 32 | cutlass_gemm.cu 33 | cutlass 34 | test_cute.cu 35 | gemm 36 | gemm.cu 37 | matmul.cu 38 | matmul -------------------------------------------------------------------------------- /hpt-cudakernels/src/pooling/pooling_template.cuh: -------------------------------------------------------------------------------- 1 | #include "../utils/type_alias.cuh" 2 | 3 | 4 | template 5 | __device__ __forceinline__ void pooling2d_forward( 6 | T *input, T *output, 7 | i32 batch_size, i32 channels, 8 | i32 input_height, i32 input_width, 9 | i32 output_height, i32 output_width, 10 | i32 kernel_h, i32 kernel_w, 11 | i32 stride_h, i32 stride_w, 12 | i32 padding_h, i32 padding_w, 13 | Intermediate *workspace) 14 | { 15 | i32 global_idx = blockIdx.x * blockDim.x + threadIdx.x; 16 | 17 | } 18 | -------------------------------------------------------------------------------- /hpt-cudakernels/src/reduce/sum.cu: -------------------------------------------------------------------------------- 1 | #include "declare_macros.cuh" 2 | #include "reduce_classes.cuh" 3 | 4 | DECLARE_KERNEL(bool, bool, sum, Sum) 5 | DECLARE_KERNEL(u8, u8, sum, Sum) 6 | DECLARE_KERNEL(u16, u16, sum, Sum) 7 | DECLARE_KERNEL(u32, u32, sum, Sum) 8 | DECLARE_KERNEL(u64, u64, sum, Sum) 9 | DECLARE_KERNEL(i8, i8, sum, Sum) 10 | DECLARE_KERNEL(i16, i16, sum, Sum) 11 | DECLARE_KERNEL(i32, i32, sum, Sum) 12 | DECLARE_KERNEL(i64, i64, sum, Sum) 13 | DECLARE_KERNEL(f32, f32, sum, Sum) 14 | DECLARE_KERNEL(f64, f64, sum, Sum) 15 | DECLARE_KERNEL(f16, f16, sum, Sum) 16 | DECLARE_KERNEL(bf16, bf16, sum, Sum) 17 | -------------------------------------------------------------------------------- /hpt-traits/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hpt-traits" 3 | version = "0.1.3" 4 | edition = "2021" 5 | license = "MIT OR Apache-2.0" 6 | description = "An 
internal library defines tensor operator traits for hpt" 7 | authors = ["JianJian Li "] 8 | repository = "https://github.com/Jianqoq/Hpt" 9 | 10 | [dependencies] 11 | hpt-common = { path = "../hpt-common", version = "0.1.1" } 12 | hpt-types = { path = "../hpt-types", version = "0.1.1" } 13 | num = { workspace = true } 14 | rand_distr = { workspace = true } 15 | 16 | [features] 17 | track_caller = [] 18 | default = ["track_caller"] 19 | -------------------------------------------------------------------------------- /docs/user_guide/unary/acos.md: -------------------------------------------------------------------------------- 1 | # acos 2 | ```rust 3 | acos(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Inverse cosine 6 | ## Parameters: 7 | `x`: Angle(radians) 8 | ## Returns: 9 | Tensor with type `C` 10 | ## Examples: 11 | ```rust 12 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 13 | 14 | fn main() -> Result<(), TensorError> { 15 | let a = Tensor::::new([10.0]); 16 | let b = a.acos()?; 17 | println!("{}", b); 18 | Ok(()) 19 | } 20 | ``` 21 | ## Backend Support 22 | | Backend | Supported | 23 | |---------|-----------| 24 | | CPU | ✅ | 25 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/asin.md: -------------------------------------------------------------------------------- 1 | # asin 2 | ```rust 3 | asin(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Inverse sine 6 | ## Parameters: 7 | `x`: Angle(radians) 8 | ## Returns: 9 | Tensor with type `C` 10 | ## Examples: 11 | ```rust 12 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 13 | 14 | fn main() -> Result<(), TensorError> { 15 | let a = Tensor::::new([10.0]); 16 | let b = a.asin()?; 17 | println!("{}", b); 18 | Ok(()) 19 | } 20 | ``` 21 | ## Backend Support 22 | | Backend | Supported | 23 | |---------|-----------| 24 | | CPU | ✅ | 25 | | Cuda | ✅ | 
-------------------------------------------------------------------------------- /docs/user_guide/unary/atan.md: -------------------------------------------------------------------------------- 1 | # atan 2 | ```rust 3 | atan(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Inverse tangent 6 | ## Parameters: 7 | `x`: Angle(radians) 8 | ## Returns: 9 | Tensor with type `C` 10 | ## Examples: 11 | ```rust 12 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 13 | 14 | fn main() -> Result<(), TensorError> { 15 | let a = Tensor::::new([10.0]); 16 | let b = a.atan()?; 17 | println!("{}", b); 18 | Ok(()) 19 | } 20 | ``` 21 | ## Backend Support 22 | | Backend | Supported | 23 | |---------|-----------| 24 | | CPU | ✅ | 25 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/sin.md: -------------------------------------------------------------------------------- 1 | # sin 2 | ```rust 3 | sin(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Trigonometric sine 6 | ## Parameters: 7 | `x`: Angle(radians) 8 | ## Returns: 9 | Tensor with type `C` 10 | ## Examples: 11 | ```rust 12 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 13 | 14 | fn main() -> Result<(), TensorError> { 15 | let a = Tensor::::new([10.0]); 16 | let b = a.sin()?; 17 | println!("{}", b); 18 | Ok(()) 19 | } 20 | ``` 21 | ## Backend Support 22 | | Backend | Supported | 23 | |---------|-----------| 24 | | CPU | ✅ | 25 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/sinh.md: -------------------------------------------------------------------------------- 1 | # sinh 2 | ```rust 3 | sinh(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Hyperbolic sine 6 | ## Parameters: 7 | `x`: Angle(radians) 8 | ## Returns: 9 | Tensor with type `C` 10 | ## Examples: 11 | ```rust 12 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 13 | 14 | fn main() -> Result<(), 
TensorError> { 15 | let a = Tensor::::new([10.0]); 16 | let b = a.sinh()?; 17 | println!("{}", b); 18 | Ok(()) 19 | } 20 | ``` 21 | ## Backend Support 22 | | Backend | Supported | 23 | |---------|-----------| 24 | | CPU | ✅ | 25 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-iterator/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hpt-iterator" 3 | version = "0.1.2" 4 | edition = "2021" 5 | license = "MIT OR Apache-2.0" 6 | description = "An internal library implements iterator for hpt" 7 | authors = ["JianJian Li "] 8 | repository = "https://github.com/Jianqoq/Hpt" 9 | 10 | [dependencies] 11 | hpt-common = { path = "../hpt-common", version = "0.1.1" } 12 | hpt-traits = { path = "../hpt-traits", version = "0.1.1" } 13 | hpt-types = { path = "../hpt-types", version = "0.1.1" } 14 | rayon = { workspace = true } 15 | 16 | [features] 17 | track_caller = [] 18 | bound_check = [] 19 | -------------------------------------------------------------------------------- /hpt-tests/src/utils/random_utils.rs: -------------------------------------------------------------------------------- 1 | pub(crate) fn generate_all_combinations(arr: &[usize]) -> Vec> { 2 | let n = arr.len(); 3 | let total_combinations = 1 << n; 4 | let mut result = Vec::with_capacity(total_combinations); 5 | 6 | for i in 0..total_combinations { 7 | let mut combination = Vec::new(); 8 | for j in 0..n { 9 | if (i & (1 << j)) != 0 { 10 | combination.push(arr[j] as i64); 11 | } 12 | } 13 | if combination.len() > 0 { 14 | result.push(combination); 15 | } 16 | } 17 | 18 | result 19 | } 20 | -------------------------------------------------------------------------------- /docs/user_guide/unary/cos.md: -------------------------------------------------------------------------------- 1 | # cos 2 | ```rust 3 | cos(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Trigonometric cosine 6 | ## 
Parameters: 7 | `x`: Angle(radians) 8 | ## Returns: 9 | Tensor with type `C` 10 | ## Examples: 11 | ```rust 12 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 13 | 14 | fn main() -> Result<(), TensorError> { 15 | let a = Tensor::::new([10.0]); 16 | let b = a.cos()?; 17 | println!("{}", b); 18 | Ok(()) 19 | } 20 | ``` 21 | ## Backend Support 22 | | Backend | Supported | 23 | |---------|-----------| 24 | | CPU | ✅ | 25 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/cosh.md: -------------------------------------------------------------------------------- 1 | # cosh 2 | ```rust 3 | cosh(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Hyperbolic cosine 6 | ## Parameters: 7 | `x`: Angle(radians) 8 | ## Returns: 9 | Tensor with type `C` 10 | ## Examples: 11 | ```rust 12 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 13 | 14 | fn main() -> Result<(), TensorError> { 15 | let a = Tensor::::new([10.0]); 16 | let b = a.cosh()?; 17 | println!("{}", b); 18 | Ok(()) 19 | } 20 | ``` 21 | ## Backend Support 22 | | Backend | Supported | 23 | |---------|-----------| 24 | | CPU | ✅ | 25 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/tan.md: -------------------------------------------------------------------------------- 1 | # tan 2 | ```rust 3 | tan(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Trigonometric tangent 6 | ## Parameters: 7 | `x`: Angle(radians) 8 | ## Returns: 9 | Tensor with type `C` 10 | ## Examples: 11 | ```rust 12 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 13 | 14 | fn main() -> Result<(), TensorError> { 15 | let a = Tensor::::new([10.0]); 16 | let b = a.tan()?; 17 | println!("{}", b); 18 | Ok(()) 19 | } 20 | ``` 21 | ## Backend Support 22 | | Backend | Supported | 23 | |---------|-----------| 24 | | CPU | ✅ | 25 | | Cuda | ✅ | 
-------------------------------------------------------------------------------- /docs/user_guide/unary/tanh.md: -------------------------------------------------------------------------------- 1 | # tanh 2 | ```rust 3 | tanh(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Hyperbolic tangent 6 | ## Parameters: 7 | `x`: Angle(radians) 8 | ## Returns: 9 | Tensor with type `C` 10 | ## Examples: 11 | ```rust 12 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 13 | 14 | fn main() -> Result<(), TensorError> { 15 | let a = Tensor::::new([10.0]); 16 | let b = a.tanh()?; 17 | println!("{}", b); 18 | Ok(()) 19 | } 20 | ``` 21 | ## Backend Support 22 | | Backend | Supported | 23 | |---------|-----------| 24 | | CPU | ✅ | 25 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/abs.md: -------------------------------------------------------------------------------- 1 | # abs 2 | ```rust 3 | abs(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Calculate absolute of input 6 | ## Parameters: 7 | `x`: input Tensor 8 | ## Returns: 9 | Tensor with type `T` 10 | ## Examples: 11 | ```rust 12 | use hpt::{ops::NormalUaryOps, Tensor, error::TensorError}; 13 | 14 | fn main() -> Result<(), TensorError> { 15 | let a = Tensor::::new([-10.0]); 16 | let b = a.abs()?; 17 | println!("{}", b); 18 | Ok(()) 19 | } 20 | ``` 21 | ## Backend Support 22 | | Backend | Supported | 23 | |---------|-----------| 24 | | CPU | ✅ | 25 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/acosh.md: -------------------------------------------------------------------------------- 1 | # acosh 2 | ```rust 3 | acosh(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Inverse hyperbolic cosine 6 | ## Parameters: 7 | `x`: Angle(radians) 8 | ## Returns: 9 | Tensor with type `C` 10 | ## Examples: 11 | ```rust 12 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 13 | 14 | fn 
main() -> Result<(), TensorError> { 15 | let a = Tensor::::new([10.0]); 16 | let b = a.acosh()?; 17 | println!("{}", b); 18 | Ok(()) 19 | } 20 | ``` 21 | ## Backend Support 22 | | Backend | Supported | 23 | |---------|-----------| 24 | | CPU | ✅ | 25 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/asinh.md: -------------------------------------------------------------------------------- 1 | # asinh 2 | ```rust 3 | asinh(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Inverse hyperbolic sine 6 | ## Parameters: 7 | `x`: Angle(radians) 8 | ## Returns: 9 | Tensor with type `C` 10 | ## Examples: 11 | ```rust 12 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 13 | 14 | fn main() -> Result<(), TensorError> { 15 | let a = Tensor::::new([10.0]); 16 | let b = a.asinh()?; 17 | println!("{}", b); 18 | Ok(()) 19 | } 20 | ``` 21 | ## Backend Support 22 | | Backend | Supported | 23 | |---------|-----------| 24 | | CPU | ✅ | 25 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/atanh.md: -------------------------------------------------------------------------------- 1 | # atanh 2 | ```rust 3 | atanh(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Inverse hyperbolic tangent 6 | ## Parameters: 7 | `x`: Angle(radians) 8 | ## Returns: 9 | Tensor with type `C` 10 | ## Examples: 11 | ```rust 12 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 13 | 14 | fn main() -> Result<(), TensorError> { 15 | let a = Tensor::::new([10.0]); 16 | let b = a.atanh()?; 17 | println!("{}", b); 18 | Ok(()) 19 | } 20 | ``` 21 | ## Backend Support 22 | | Backend | Supported | 23 | |---------|-----------| 24 | | CPU | ✅ | 25 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/utils/num_threads.md: -------------------------------------------------------------------------------- 1 | # 
set_num_threads 2 | 3 | ```rust 4 | set_num_threads(num_threads: usize) 5 | ``` 6 | 7 | Set the number of threads used for parallelism 8 | ## Parameters: 9 | `num_threads`: number of threads to use 10 | 11 | ## Backend Support 12 | | Backend | Supported | 13 | |---------|-----------| 14 | | CPU | ✅ | 15 | | Cuda | ❌ | 16 | 17 | # get_num_threads 18 | 19 | ```rust 20 | get_num_threads() 21 | ``` 22 | 23 | Get the number of threads currently in use 24 | 25 | ## Backend Support 26 | | Backend | Supported | 27 | |---------|-----------| 28 | | CPU | ✅ | 29 | | Cuda | ❌ | -------------------------------------------------------------------------------- /hpt-cudakernels/src/reduce/prod.cu: -------------------------------------------------------------------------------- 1 | #include "declare_macros.cuh" 2 | #include "reduce_classes.cuh" 3 | 4 | DECLARE_KERNEL(bool, bool, prod, Prod) 5 | DECLARE_KERNEL(u8, u8, prod, Prod) 6 | DECLARE_KERNEL(u16, u16, prod, Prod) 7 | DECLARE_KERNEL(u32, u32, prod, Prod) 8 | DECLARE_KERNEL(u64, u64, prod, Prod) 9 | DECLARE_KERNEL(i8, i8, prod, Prod) 10 | DECLARE_KERNEL(i16, i16, prod, Prod) 11 | DECLARE_KERNEL(i32, i32, prod, Prod) 12 | DECLARE_KERNEL(i64, i64, prod, Prod) 13 | DECLARE_KERNEL(f32, f32, prod, Prod) 14 | DECLARE_KERNEL(f64, f64, prod, Prod) 15 | DECLARE_KERNEL(f16, f16, prod, Prod) 16 | DECLARE_KERNEL(bf16, bf16, prod, Prod) 17 | -------------------------------------------------------------------------------- /docs/user_guide/unary/exp2.md: -------------------------------------------------------------------------------- 1 | # exp2 2 | ```rust 3 | exp2(x: &Tensor<T>) -> Result<Tensor<C>, TensorError> 4 | ``` 5 | Compute $\large 2^x$ for all elements 6 | ## Parameters: 7 | `x`: Input values 8 | ## Returns: 9 | Tensor with type `C` 10 | ## Examples: 11 | ```rust 12 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 13 | 14 | fn main() -> Result<(), TensorError> { 15 | let a = Tensor::<f32>::new([10.0]); 16 | let b = a.exp2()?; 17 | println!("{}", b); 18
| Ok(()) 19 | } 20 | ``` 21 | ## Backend Support 22 | | Backend | Supported | 23 | |---------|-----------| 24 | | CPU | ✅ | 25 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-common/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hpt-common" 3 | version = "0.1.3" 4 | edition = "2021" 5 | description = "An internal library for common utilities for hpt" 6 | authors = ["JianJian Li "] 7 | repository = "https://github.com/Jianqoq/Hpt" 8 | license = "MIT OR Apache-2.0" 9 | 10 | [dependencies] 11 | hpt-types = { path = "../hpt-types", version = "0.1.1" } 12 | thiserror = { workspace = true } 13 | serde = { workspace = true } 14 | cudarc = { workspace = true, optional = true } 15 | rand_distr = { workspace = true } 16 | 17 | [features] 18 | track_caller = [] 19 | bound_check = [] 20 | cuda = ["cudarc"] 21 | -------------------------------------------------------------------------------- /docs/user_guide/unary/exp.md: -------------------------------------------------------------------------------- 1 | # exp 2 | ```rust 3 | exp(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Compute exponential of `x` for all elements 6 | ## Parameters: 7 | `x`: Input values 8 | ## Returns: 9 | Tensor with type `C` 10 | ## Examples: 11 | ```rust 12 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 13 | 14 | fn main() -> Result<(), TensorError> { 15 | let a = Tensor::::new([10.0]); 16 | let b = a.exp()?; 17 | println!("{}", b); 18 | Ok(()) 19 | } 20 | ``` 21 | ## Backend Support 22 | | Backend | Supported | 23 | |---------|-----------| 24 | | CPU | ✅ | 25 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-cudakernels/src/reduce/all.cu: -------------------------------------------------------------------------------- 1 | #include "declare_macros.cuh" 2 | #include "../utils/type_cast.cuh" 3 | #include 
"reduce_classes.cuh" 4 | 5 | DECLARE_KERNEL(bool, bool, all, All) 6 | DECLARE_KERNEL(u8, u8, all, All) 7 | DECLARE_KERNEL(u16, u16, all, All) 8 | DECLARE_KERNEL(u32, u32, all, All) 9 | DECLARE_KERNEL(u64, u64, all, All) 10 | DECLARE_KERNEL(i8, i8, all, All) 11 | DECLARE_KERNEL(i16, i16, all, All) 12 | DECLARE_KERNEL(i32, i32, all, All) 13 | DECLARE_KERNEL(i64, i64, all, All) 14 | DECLARE_KERNEL(f32, f32, all, All) 15 | DECLARE_KERNEL(f64, f64, all, All) 16 | DECLARE_KERNEL(f16, f16, all, All) 17 | DECLARE_KERNEL(bf16, bf16, all, All) 18 | -------------------------------------------------------------------------------- /hpt-cudakernels/src/reduce/any.cu: -------------------------------------------------------------------------------- 1 | #include "declare_macros.cuh" 2 | #include "../utils/type_cast.cuh" 3 | #include "reduce_classes.cuh" 4 | 5 | DECLARE_KERNEL(bool, bool, any, Any) 6 | DECLARE_KERNEL(u8, u8, any, Any) 7 | DECLARE_KERNEL(u16, u16, any, Any) 8 | DECLARE_KERNEL(u32, u32, any, Any) 9 | DECLARE_KERNEL(u64, u64, any, Any) 10 | DECLARE_KERNEL(i8, i8, any, Any) 11 | DECLARE_KERNEL(i16, i16, any, Any) 12 | DECLARE_KERNEL(i32, i32, any, Any) 13 | DECLARE_KERNEL(i64, i64, any, Any) 14 | DECLARE_KERNEL(f32, f32, any, Any) 15 | DECLARE_KERNEL(f64, f64, any, Any) 16 | DECLARE_KERNEL(f16, f16, any, Any) 17 | DECLARE_KERNEL(bf16, bf16, any, Any) 18 | -------------------------------------------------------------------------------- /hpt-cudakernels/src/reduce/max.cu: -------------------------------------------------------------------------------- 1 | #include "declare_macros.cuh" 2 | #include "../utils/type_utils.cuh" 3 | #include "reduce_classes.cuh" 4 | 5 | DECLARE_KERNEL(bool, bool, max, Max) 6 | DECLARE_KERNEL(u8, u8, max, Max) 7 | DECLARE_KERNEL(u16, u16, max, Max) 8 | DECLARE_KERNEL(u32, u32, max, Max) 9 | DECLARE_KERNEL(u64, u64, max, Max) 10 | DECLARE_KERNEL(i8, i8, max, Max) 11 | DECLARE_KERNEL(i16, i16, max, Max) 12 | DECLARE_KERNEL(i32, i32, max, Max) 13 | 
DECLARE_KERNEL(i64, i64, max, Max) 14 | DECLARE_KERNEL(f32, f32, max, Max) 15 | DECLARE_KERNEL(f64, f64, max, Max) 16 | DECLARE_KERNEL(f16, f16, max, Max) 17 | DECLARE_KERNEL(bf16, bf16, max, Max) 18 | -------------------------------------------------------------------------------- /hpt-cudakernels/src/reduce/min.cu: -------------------------------------------------------------------------------- 1 | #include "declare_macros.cuh" 2 | #include "../utils/type_utils.cuh" 3 | #include "reduce_classes.cuh" 4 | 5 | DECLARE_KERNEL(bool, bool, min, Min) 6 | DECLARE_KERNEL(u8, u8, min, Min) 7 | DECLARE_KERNEL(u16, u16, min, Min) 8 | DECLARE_KERNEL(u32, u32, min, Min) 9 | DECLARE_KERNEL(u64, u64, min, Min) 10 | DECLARE_KERNEL(i8, i8, min, Min) 11 | DECLARE_KERNEL(i16, i16, min, Min) 12 | DECLARE_KERNEL(i32, i32, min, Min) 13 | DECLARE_KERNEL(i64, i64, min, Min) 14 | DECLARE_KERNEL(f32, f32, min, Min) 15 | DECLARE_KERNEL(f64, f64, min, Min) 16 | DECLARE_KERNEL(f16, f16, min, Min) 17 | DECLARE_KERNEL(bf16, bf16, min, Min) 18 | -------------------------------------------------------------------------------- /hpt-macros/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hpt-macros" 3 | version = "0.1.2" 4 | edition = "2021" 5 | description = "An internal library for generating helper functions for hpt" 6 | authors = ["JianJian Li "] 7 | repository = "https://github.com/Jianqoq/Hpt" 8 | license = "MIT OR Apache-2.0" 9 | 10 | [dependencies] 11 | syn = { workspace = true } 12 | quote = { workspace = true } 13 | regex = { workspace = true } 14 | proc-macro2 = { workspace = true } 15 | 16 | [features] 17 | cuda = [] 18 | 19 | [lib] 20 | proc-macro = true 21 | 22 | [package.metadata.rust-analyzer] 23 | rustc_private = true 24 | proc_macro_srv = true 25 | -------------------------------------------------------------------------------- /docs/user_guide/random/randn_like.md: 
-------------------------------------------------------------------------------- 1 | # randn_like 2 | ```rust 3 | randn_like(x: &Tensor<T>) -> Result<Tensor<T>, TensorError> 4 | ``` 5 | same as `randn` but the shape will be based on `x`. 6 | ## Parameters: 7 | `x`: input Tensor 8 | ## Returns: 9 | Tensor with type `T` 10 | ## Examples: 11 | ```rust 12 | use hpt::{error::TensorError, ops::Random, Tensor}; 13 | 14 | fn main() -> Result<(), TensorError> { 15 | let a = Tensor::<f32>::randn([10, 10])?; 16 | println!("{}", a.randn_like()?); 17 | Ok(()) 18 | } 19 | ``` 20 | ## Backend Support 21 | | Backend | Supported | 22 | |---------|-----------| 23 | | CPU | ✅ | 24 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/sqrt.md: -------------------------------------------------------------------------------- 1 | # sqrt 2 | ```rust 3 | sqrt(x: &Tensor<T>) -> Result<Tensor<C>, TensorError> 4 | ``` 5 | Compute $\sqrt{x}$ for all elements 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | 10 | ## Returns: 11 | Tensor with type `C` 12 | 13 | ## Examples: 14 | ```rust 15 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 16 | 17 | fn main() -> Result<(), TensorError> { 18 | let a = Tensor::<f32>::new([4.0]); 19 | let b = a.sqrt()?; 20 | println!("{}", b); 21 | Ok(()) 22 | } 23 | ``` 24 | ## Backend Support 25 | | Backend | Supported | 26 | |---------|-----------| 27 | | CPU | ✅ | 28 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-types/src/into_scalar.rs: -------------------------------------------------------------------------------- 1 | use crate::convertion::Convertor; 2 | use half::{bf16, f16}; 3 | use hpt_macros::impl_into_scalar; 4 | use num_complex::{Complex32, Complex64}; 5 | /// A trait for converting a scalar into another scalar type. 6 | pub trait Cast<T> { 7 | /// Convert the scalar into another scalar type.
8 | fn cast(self) -> T; 9 | } 10 | 11 | impl_into_scalar!(); 12 | 13 | #[cfg(feature = "cuda")] 14 | mod cud_impl { 15 | use super::*; 16 | use crate::cuda_types::convertion::CudaConvertor; 17 | use crate::cuda_types::scalar::Scalar; 18 | use hpt_macros::impl_into_cuda_scalar; 19 | impl_into_cuda_scalar!(); 20 | } 21 | -------------------------------------------------------------------------------- /docs/user_guide/unary/recip.md: -------------------------------------------------------------------------------- 1 | # recip 2 | ```rust 3 | recip(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large \frac{1}{x}$ for all elements 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | 10 | ## Returns: 11 | Tensor with type `C` 12 | 13 | ## Examples: 14 | ```rust 15 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 16 | 17 | fn main() -> Result<(), TensorError> { 18 | let a = Tensor::::new([2.0]); 19 | let b = a.recip()?; 20 | println!("{}", b); 21 | Ok(()) 22 | } 23 | ``` 24 | ## Backend Support 25 | | Backend | Supported | 26 | |---------|-----------| 27 | | CPU | ✅ | 28 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/log10.md: -------------------------------------------------------------------------------- 1 | # log10 2 | ```rust 3 | log10(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large \log_{10}(x)$ for all elements 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | 10 | ## Returns: 11 | Tensor with type `C` 12 | 13 | ## Examples: 14 | ```rust 15 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 16 | 17 | fn main() -> Result<(), TensorError> { 18 | let a = Tensor::::new([100.0]); 19 | let b = a.log10()?; 20 | println!("{}", b); 21 | Ok(()) 22 | } 23 | ``` 24 | ## Backend Support 25 | | Backend | Supported | 26 | |---------|-----------| 27 | | CPU | ✅ | 28 | | Cuda | ✅ | -------------------------------------------------------------------------------- 
/docs/user_guide/unary/softplus.md: -------------------------------------------------------------------------------- 1 | # softplus 2 | ```rust 3 | softplus(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\ln(1 + e^x)$ for all elements 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | 10 | ## Returns: 11 | Tensor with type `C` 12 | 13 | ## Examples: 14 | ```rust 15 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 16 | 17 | fn main() -> Result<(), TensorError> { 18 | let a = Tensor::::new([2.0]); 19 | let b = a.softplus()?; 20 | println!("{}", b); 21 | Ok(()) 22 | } 23 | ``` 24 | ## Backend Support 25 | | Backend | Supported | 26 | |---------|-----------| 27 | | CPU | ✅ | 28 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-common/src/error/onnx.rs: -------------------------------------------------------------------------------- 1 | use std::panic::Location; 2 | 3 | use thiserror::Error; 4 | 5 | /// Onnx-related errors 6 | #[derive(Debug, Error)] 7 | pub enum OnnxError { 8 | /// Onnx error 9 | #[error("Onnx error: {message} at {location}")] 10 | Any { 11 | /// Message describing the error 12 | message: String, 13 | /// Location where the error occurred 14 | location: &'static Location<'static>, 15 | }, 16 | } 17 | 18 | impl OnnxError { 19 | /// Create a new Onnx error 20 | #[track_caller] 21 | pub fn new(message: String) -> Self { 22 | Self::Any { message, location: Location::caller() } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /docs/user_guide/unary/mish.md: -------------------------------------------------------------------------------- 1 | # mish 2 | ```rust 3 | mish(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large x * \tanh(\ln(1 + e^x))$ for all elements 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | 10 | ## Returns: 11 | Tensor with type `C` 12 | 13 | ## Examples: 14 | ```rust 15 | use hpt::{ops::FloatUnaryOps, 
Tensor, error::TensorError}; 16 | 17 | fn main() -> Result<(), TensorError> { 18 | let a = Tensor::::new([2.0]); 19 | let b = a.mish()?; 20 | println!("{}", b); 21 | Ok(()) 22 | } 23 | ``` 24 | ## Backend Support 25 | | Backend | Supported | 26 | |---------|-----------| 27 | | CPU | ✅ | 28 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/cbrt.md: -------------------------------------------------------------------------------- 1 | # cbrt 2 | ```rust 3 | cbrt(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large \sqrt[3]{x}$ for all elements 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | 10 | ## Returns: 11 | Tensor with type `C` 12 | 13 | ## Examples: 14 | ```rust 15 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 16 | 17 | fn main() -> Result<(), TensorError> { 18 | let a = Tensor::::new([8.0]); 19 | let b = a.cbrt()?; 20 | println!("{}", b); // prints: 2.0 21 | Ok(()) 22 | } 23 | ``` 24 | ## Backend Support 25 | | Backend | Supported | 26 | |---------|-----------| 27 | | CPU | ✅ | 28 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/ln.md: -------------------------------------------------------------------------------- 1 | # ln 2 | ```rust 3 | ln(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large \ln(x)$ for all elements 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | 10 | ## Returns: 11 | Tensor with type `C` 12 | 13 | ## Examples: 14 | ```rust 15 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 16 | 17 | fn main() -> Result<(), TensorError> { 18 | let a = Tensor::::new([2.718281828459045]); 19 | let b = a.ln()?; 20 | println!("{}", b); // prints: 1.0 21 | Ok(()) 22 | } 23 | ``` 24 | ## Backend Support 25 | | Backend | Supported | 26 | |---------|-----------| 27 | | CPU | ✅ | 28 | | Cuda | ✅ | -------------------------------------------------------------------------------- 
/docs/user_guide/unary/log2.md: -------------------------------------------------------------------------------- 1 | # log2 2 | ```rust 3 | log2(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large \log_{2}(x)$ for all elements 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | 10 | ## Returns: 11 | Tensor with type `C` 12 | 13 | ## Examples: 14 | ```rust 15 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 16 | 17 | fn main() -> Result<(), TensorError> { 18 | let a = Tensor::::new([8.0]); 19 | let b = a.log2()?; 20 | println!("{}", b); // prints: 3.0 21 | Ok(()) 22 | } 23 | ``` 24 | ## Backend Support 25 | | Backend | Supported | 26 | |---------|-----------| 27 | | CPU | ✅ | 28 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-codegen/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hpt-codegen" 3 | version = "0.1.2" 4 | edition = "2021" 5 | license = "MIT OR Apache-2.0" 6 | 7 | [dependencies] 8 | syn = { version = "2.0.82", default-features = false, features = [ 9 | "derive", 10 | "full", 11 | "visit", 12 | "visit-mut", 13 | "extra-traits" 14 | ] } 15 | quote = { workspace = true } 16 | regex = { workspace = true } 17 | proc-macro2 = { workspace = true } 18 | petgraph = { workspace = true } 19 | thiserror = { workspace = true } 20 | anyhow = { workspace = true } 21 | 22 | [lib] 23 | proc-macro = true 24 | 25 | [package.metadata.rust-analyzer] 26 | rustc_private = true 27 | proc_macro_srv = true 28 | -------------------------------------------------------------------------------- /docs/user_guide/unary/softsign.md: -------------------------------------------------------------------------------- 1 | # softsign 2 | ```rust 3 | softsign(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large \frac{x}{1 + |x|}$ for all elements 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | 10 | ## Returns: 11 | Tensor with type `C` 12 
| 13 | ## Examples: 14 | ```rust 15 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 16 | 17 | fn main() -> Result<(), TensorError> { 18 | let a = Tensor::::new([2.0]); 19 | let b = a.softsign()?; 20 | println!("{}", b); 21 | Ok(()) 22 | } 23 | ``` 24 | ## Backend Support 25 | | Backend | Supported | 26 | |---------|-----------| 27 | | CPU | ✅ | 28 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-cudakernels/src/reduce/nansum.cu: -------------------------------------------------------------------------------- 1 | #include "declare_macros.cuh" 2 | #include "reduce_classes.cuh" 3 | 4 | DECLARE_KERNEL(bool, bool, nansum, NanSum) 5 | DECLARE_KERNEL(u8, u8, nansum, NanSum) 6 | DECLARE_KERNEL(u16, u16, nansum, NanSum) 7 | DECLARE_KERNEL(u32, u32, nansum, NanSum) 8 | DECLARE_KERNEL(u64, u64, nansum, NanSum) 9 | DECLARE_KERNEL(i8, i8, nansum, NanSum) 10 | DECLARE_KERNEL(i16, i16, nansum, NanSum) 11 | DECLARE_KERNEL(i32, i32, nansum, NanSum) 12 | DECLARE_KERNEL(i64, i64, nansum, NanSum) 13 | DECLARE_KERNEL(f32, f32, nansum, NanSum) 14 | DECLARE_KERNEL(f64, f64, nansum, NanSum) 15 | DECLARE_KERNEL(f16, f16, nansum, NanSum) 16 | DECLARE_KERNEL(bf16, bf16, nansum, NanSum) 17 | -------------------------------------------------------------------------------- /hpt-traits/src/ops/slice.rs: -------------------------------------------------------------------------------- 1 | use hpt_common::error::base::TensorError; 2 | 3 | /// trait for slicing tensor 4 | pub trait Slice: Sized { 5 | /// Create a new Tensor by slicing an existing Tensor. Slicing allows you to extract a portion of a tensor using index ranges for each dimension. 
6 | /// 7 | /// ## Parameters: 8 | /// `index`: `(start, end, step)`: Select from start to end with step 9 | /// 10 | /// ## Example: 11 | /// ```rust 12 | /// let a = Tensor::::arange(0, 16)?.reshape(&[4, 4])?; 13 | /// let b = a.slice(&[(1, 3, 1), (0, 4, 1)])?; 14 | /// ``` 15 | fn slice(&self, index: &[(i64, i64, i64)]) -> Result; 16 | } 17 | -------------------------------------------------------------------------------- /docs/dev_guide/pointer/pointer.md: -------------------------------------------------------------------------------- 1 | ### Pointer 2 | 3 | Hpt is using multi threading across the whole operators implementation. However, raw pointer can't be send to threads safely. 4 | 5 | So we created a [wrapper](https://github.com/Jianqoq/Hpt/blob/d9a51874b3447d562b7c9d043b50eb05259b78c4/hpt-common/src/utils/pointer.rs#L11) for pointer. 6 | 7 | In the whole project, almost all the parallel iteration are using [wrapper](https://github.com/Jianqoq/Hpt/blob/d9a51874b3447d562b7c9d043b50eb05259b78c4/hpt-common/src/utils/pointer.rs#L11) to read and write data. 8 | 9 | You may notice there is a `bound_check` feature, however, this feature is not fully tested and may not reliable. This feature need to stablize. -------------------------------------------------------------------------------- /docs/user_guide/random/randn.md: -------------------------------------------------------------------------------- 1 | # randn 2 | ```rust 3 | randn(shape: &[i64] | &Vec | &[i64; _]) -> Result, TensorError> 4 | ``` 5 | create a Tensor with data in normal distribution. `mean = 0.0`, `std_dev = 1.0`. 
6 | ## Parameters: 7 | `shape`: shape of the output 8 | ## Returns: 9 | Tensor with type `T` 10 | ## Examples: 11 | ```rust 12 | use hpt::{error::TensorError, ops::Random, Tensor}; 13 | 14 | fn main() -> Result<(), TensorError> { 15 | let a = Tensor::::randn([10, 10])?; 16 | println!("{}", a); 17 | Ok(()) 18 | } 19 | ``` 20 | ## Backend Support 21 | | Backend | Supported | 22 | |---------|-----------| 23 | | CPU | ✅ | 24 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-allocator/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hpt-allocator" 3 | version = "0.1.2" 4 | edition = "2021" 5 | license = "MIT OR Apache-2.0" 6 | description = "An internal library for memory allocator for hpt" 7 | authors = ["JianJian Li "] 8 | repository = "https://github.com/Jianqoq/Hpt" 9 | 10 | [dependencies] 11 | lru = { workspace = true } 12 | once_cell = { workspace = true } 13 | lazy_static = { workspace = true } 14 | ctor = { workspace = true } 15 | cudarc = { workspace = true, optional = true } 16 | dashmap = { workspace = true } 17 | hpt-common = { path = "../hpt-common", version = "0.1.1" } 18 | 19 | [features] 20 | cuda = ["cudarc"] 21 | track_caller = [] 22 | -------------------------------------------------------------------------------- /docs/user_guide/unary/sigmoid.md: -------------------------------------------------------------------------------- 1 | # sigmoid 2 | ```rust 3 | sigmoid(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large \frac{1}{1 + e^{-x}}$ for all elements 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | 10 | ## Returns: 11 | Tensor with type `C` 12 | 13 | ## Examples: 14 | ```rust 15 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 16 | 17 | fn main() -> Result<(), TensorError> { 18 | let a = Tensor::::new([2.0]); 19 | let b = a.sigmoid()?; 20 | println!("{}", b); // prints: 0.8807971 21 | Ok(()) 22 | } 23 | 
``` 24 | ## Backend Support 25 | | Backend | Supported | 26 | |---------|-----------| 27 | | CPU | ✅ | 28 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-cudakernels/src/reduce/mean.cu: -------------------------------------------------------------------------------- 1 | #include "declare_macros.cuh" 2 | #include "../utils/type_cast.cuh" 3 | #include "../utils/check_type.cuh" 4 | #include "reduce_classes.cuh" 5 | 6 | DECLARE_KERNEL(bool, bool, mean, Mean) 7 | DECLARE_KERNEL(u8, u8, mean, Mean) 8 | DECLARE_KERNEL(u16, u16, mean, Mean) 9 | DECLARE_KERNEL(u32, u32, mean, Mean) 10 | DECLARE_KERNEL(u64, u64, mean, Mean) 11 | DECLARE_KERNEL(i8, i8, mean, Mean) 12 | DECLARE_KERNEL(i16, i16, mean, Mean) 13 | DECLARE_KERNEL(i32, i32, mean, Mean) 14 | DECLARE_KERNEL(i64, i64, mean, Mean) 15 | DECLARE_KERNEL(f32, f32, mean, Mean) 16 | DECLARE_KERNEL(f64, f64, mean, Mean) 17 | DECLARE_KERNEL(f16, f16, mean, Mean) 18 | DECLARE_KERNEL(bf16, bf16, mean, Mean) 19 | -------------------------------------------------------------------------------- /hpt-cudakernels/src/reduce/nanprod.cu: -------------------------------------------------------------------------------- 1 | #include "declare_macros.cuh" 2 | #include "reduce_classes.cuh" 3 | 4 | DECLARE_KERNEL(bool, bool, nanprod, NanProd) 5 | DECLARE_KERNEL(u8, u8, nanprod, NanProd) 6 | DECLARE_KERNEL(u16, u16, nanprod, NanProd) 7 | DECLARE_KERNEL(u32, u32, nanprod, NanProd) 8 | DECLARE_KERNEL(u64, u64, nanprod, NanProd) 9 | DECLARE_KERNEL(i8, i8, nanprod, NanProd) 10 | DECLARE_KERNEL(i16, i16, nanprod, NanProd) 11 | DECLARE_KERNEL(i32, i32, nanprod, NanProd) 12 | DECLARE_KERNEL(i64, i64, nanprod, NanProd) 13 | DECLARE_KERNEL(f32, f32, nanprod, NanProd) 14 | DECLARE_KERNEL(f64, f64, nanprod, NanProd) 15 | DECLARE_KERNEL(f16, f16, nanprod, NanProd) 16 | DECLARE_KERNEL(bf16, bf16, nanprod, NanProd) 17 | -------------------------------------------------------------------------------- 
/hpt-dyn/src/ops/models/onnx.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | 3 | use crate::utils::onnx::operators::Operator; 4 | use crate::Tensor; 5 | use crate::onnx::ModelProto; 6 | 7 | #[derive(Debug)] 8 | pub(crate) struct Meta { 9 | pub(crate) permute: Option>, 10 | } 11 | 12 | #[derive(Debug)] 13 | pub struct Initialized { 14 | pub(crate) model: ModelProto, 15 | pub(crate) initializer_map: HashMap, 16 | pub(crate) permutes: HashMap, 17 | pub(crate) operators: Vec, 18 | pub(crate) node_degree: HashMap, 19 | } 20 | 21 | #[derive(Debug)] 22 | pub enum OnnxModel { 23 | Model(ModelProto), 24 | Initialized(Initialized), 25 | } 26 | -------------------------------------------------------------------------------- /docs/user_guide/unary/exp10.md: -------------------------------------------------------------------------------- 1 | # exp10 2 | ```rust 3 | exp10(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Compute 10 raised to the power of `x` for all elements 6 | 7 | ## Parameters: 8 | `x`: Input values (exponents) 9 | 10 | ## Returns: 11 | Tensor with type `C` 12 | 13 | ## Examples: 14 | ```rust 15 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 16 | 17 | fn main() -> Result<(), TensorError> { 18 | let a = Tensor::::new([2.0]); // 10^2 19 | let b = a.exp10()?; 20 | println!("{}", b); // prints: 100.0 21 | Ok(()) 22 | } 23 | ``` 24 | ## Backend Support 25 | | Backend | Supported | 26 | |---------|-----------| 27 | | CPU | ✅ | 28 | | Cuda | ✅ | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_128bit/common/i64x2.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_feature = "neon")] 3 | use std::arch::aarch64::*; 4 | #[cfg(target_arch = "x86_64")] 5 | use std::arch::x86_64::*; 6 | use std::ops::Index; 7 | 8 | /// a vector of 2 i64 values 9 | #[allow(non_camel_case_types)] 
10 | #[derive(Clone, Copy, Debug)] 11 | #[repr(C, align(16))] 12 | pub struct i64x2( 13 | #[cfg(target_arch = "x86_64")] pub(crate) __m128i, 14 | #[cfg(target_arch = "aarch64")] pub(crate) int64x2_t, 15 | ); 16 | 17 | impl Index for i64x2 { 18 | type Output = i64; 19 | fn index(&self, index: usize) -> &Self::Output { 20 | let ptr = self as *const _ as *const i64; 21 | unsafe { &*ptr.add(index) } 22 | } 23 | } -------------------------------------------------------------------------------- /matconv_simd/src/simd/_128bit/common/u8x16.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_arch = "aarch64")] 3 | use std::arch::aarch64::*; 4 | #[cfg(target_arch = "x86_64")] 5 | use std::arch::x86_64::*; 6 | use std::ops::Index; 7 | 8 | /// a vector of 16 u8 values 9 | #[allow(non_camel_case_types)] 10 | #[derive(Clone, Copy, Debug)] 11 | #[repr(C, align(16))] 12 | pub struct u8x16( 13 | #[cfg(target_arch = "x86_64")] pub(crate) __m128i, 14 | #[cfg(target_arch = "aarch64")] pub(crate) uint8x16_t, 15 | ); 16 | 17 | impl Index for u8x16 { 18 | type Output = u8; 19 | fn index(&self, index: usize) -> &Self::Output { 20 | let ptr = self as *const _ as *const u8; 21 | unsafe { &*ptr.add(index) } 22 | } 23 | } -------------------------------------------------------------------------------- /matconv_simd/src/simd/_128bit/common/u64x2.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_arch = "x86_64")] 3 | use std::arch::x86_64::*; 4 | 5 | #[cfg(target_arch = "aarch64")] 6 | use std::arch::aarch64::*; 7 | use std::ops::Index; 8 | 9 | /// a vector of 2 u64 values 10 | #[allow(non_camel_case_types)] 11 | #[derive(Clone, Copy, Debug)] 12 | #[repr(C, align(16))] 13 | pub struct u64x2( 14 | #[cfg(target_arch = "x86_64")] pub(crate) __m128i, 15 | #[cfg(target_arch = "aarch64")] pub(crate) uint64x2_t, 16 | ); 17 | 18 | impl Index for u64x2 { 19 | type Output = u64; 20 | fn 
index(&self, index: usize) -> &Self::Output { 21 | let ptr = self as *const _ as *const u64; 22 | unsafe { &*ptr.add(index) } 23 | } 24 | } -------------------------------------------------------------------------------- /docs/user_guide/unary/erf.md: -------------------------------------------------------------------------------- 1 | # erf 2 | ```rust 3 | erf(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large \text{erf}(x) = \frac{2}{\sqrt{\pi}} \int_0^x e^{-t^2} dt$ for all elements 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | 10 | ## Returns: 11 | Tensor with type `C` 12 | 13 | ## Examples: 14 | ```rust 15 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 16 | 17 | fn main() -> Result<(), TensorError> { 18 | let a = Tensor::::new([1.0]); 19 | let b = a.erf()?; 20 | println!("{}", b); // prints: 0.8427008 21 | Ok(()) 22 | } 23 | ``` 24 | ## Backend Support 25 | | Backend | Supported | 26 | |---------|-----------| 27 | | CPU | ✅ | 28 | | Cuda | ✅ | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_128bit/common/i8x16.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(target_arch = "x86_64")] 3 | use std::arch::x86_64::*; 4 | 5 | #[cfg(target_arch = "aarch64")] 6 | use std::arch::aarch64::*; 7 | use std::ops::Index; 8 | 9 | /// a vector of 16 i8 values 10 | #[allow(non_camel_case_types)] 11 | #[derive(Clone, Copy, Debug)] 12 | #[repr(C, align(16))] 13 | pub struct i8x16( 14 | #[cfg(target_arch = "x86_64")] pub(crate) __m128i, 15 | #[cfg(target_arch = "aarch64")] pub(crate) int8x16_t, 16 | ); 17 | 18 | impl Index for i8x16 { 19 | type Output = i8; 20 | fn index(&self, index: usize) -> &Self::Output { 21 | let ptr = self as *const _ as *const i8; 22 | unsafe { &*ptr.add(index) } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /hpt-cudakernels/Cargo.toml: 
-------------------------------------------------------------------------------- 1 | [package] 2 | name = "hpt-cudakernels" 3 | version = "0.1.3" 4 | edition = "2021" 5 | description = "A library implements cuda kernels for hpt" 6 | authors = ["JianJian Li "] 7 | repository = "https://github.com/Jianqoq/Hpt" 8 | license = "MIT OR Apache-2.0" 9 | 10 | [dependencies] 11 | phf = { workspace = true } 12 | 13 | [build-dependencies] 14 | phf = { workspace = true } 15 | phf_codegen = { workspace = true } 16 | regex = { workspace = true } 17 | walkdir = { workspace = true } 18 | rayon = { workspace = true } 19 | num_cpus = { workspace = true } 20 | 21 | [lib] 22 | path = "src/lib.rs" 23 | required-features = ["cuda"] 24 | 25 | [features] 26 | default = [] 27 | cuda = [] 28 | -------------------------------------------------------------------------------- /hpt-types/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hpt-types" 3 | version = "0.1.3" 4 | edition = "2021" 5 | license = "MIT OR Apache-2.0" 6 | description = "An internal library define primitive types functions for hpt" 7 | authors = ["JianJian Li "] 8 | repository = "https://github.com/Jianqoq/Hpt" 9 | 10 | [dependencies] 11 | hpt-macros = { path = "../hpt-macros", version = "0.1.1" } 12 | num-complex = { workspace = true } 13 | half = { workspace = true } 14 | paste = { workspace = true } 15 | num-traits = { workspace = true } 16 | libm = { workspace = true } 17 | serde = { workspace = true } 18 | duplicate = { workspace = true } 19 | [features] 20 | cuda = [] 21 | default = ["normal_promote"] 22 | normal_promote = [] 23 | -------------------------------------------------------------------------------- /hpt-allocator/src/utils/cache_resize.rs: -------------------------------------------------------------------------------- 1 | use std::num::NonZeroUsize; 2 | 3 | use lru::LruCache; 4 | 5 | use crate::ptr::SafePtr; 6 | 7 | pub fn 
resize_lru_cache( 8 | cache: &mut LruCache>, 9 | deallocate_fn: impl Fn(*mut u8, std::alloc::Layout), 10 | new_size: usize, 11 | ) { 12 | if cache.cap().get() <= new_size { 13 | cache.resize(NonZeroUsize::new(new_size).unwrap()); 14 | } else { 15 | let new = LruCache::new(NonZeroUsize::new(new_size).unwrap()); 16 | for (layout, ptrs) in cache.iter() { 17 | for safe_ptr in ptrs { 18 | deallocate_fn(safe_ptr.ptr, *layout); 19 | } 20 | } 21 | *cache = new; 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /docs/dev_guide/adding_new_op.md: -------------------------------------------------------------------------------- 1 | # How to add new Tensor operator 2 | 3 | ### Things to know 4 | Tensor operators are define in `tensor-traits` crate. If you want to implement new Tensor operator, you may need to create a new trait if there is no suitable trait for the new operator. 5 | 6 | ### How 7 | 2. Implement the trait method at [here](https://github.com/Jianqoq/Hpt/tree/main/hpt/src/backends) based on what backend you want to implement, mostly, you should implement for all backends. 8 | 9 | 3. Ensure performance is ideal by comparing with other frameworks 10 | 11 | 4. Write test cases at [here](https://github.com/Jianqoq/Hpt/tree/main/hpt-tests/src/hpt). Make sure to follow the Dev Guide test cases rules. 12 | 13 | 5. 
commit and make a pull request -------------------------------------------------------------------------------- /docs/user_guide/unary/sincos.md: -------------------------------------------------------------------------------- 1 | # sincos 2 | ```rust 3 | sincos(x: &Tensor) -> Result<(Tensor, Tensor), TensorError> 4 | ``` 5 | Simultaneously computes sine and cosine of the input tensor 6 | 7 | ## Parameters: 8 | `x`: Angle(radians) 9 | 10 | ## Returns: 11 | Tuple of two tensors (sine, cosine) with type `C` 12 | 13 | ## Examples: 14 | ```rust 15 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 16 | 17 | fn main() -> Result<(), TensorError> { 18 | let a = Tensor::::new([10.0]); 19 | let (sin, cos) = a.sincos()?; 20 | println!("sin: {}, cos: {}", sin, cos); 21 | Ok(()) 22 | } 23 | ``` 24 | ## Backend Support 25 | | Backend | Supported | 26 | |---------|-----------| 27 | | CPU | ✅ | 28 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-tests/src/hpt_common/err_handler.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused_imports)] 2 | use hpt_common::error::shape::ShapeError; 3 | 4 | #[test] 5 | fn test_check_ndim_match() { 6 | ShapeError::check_dim(2, 2).unwrap(); 7 | } 8 | 9 | #[test] 10 | fn test_check_ndim_match_err() { 11 | assert!(ShapeError::check_dim(2, 3) 12 | .unwrap_err() 13 | .to_string() 14 | .contains("Dimension mismatch: expected 2, got 3")); 15 | } 16 | 17 | #[test] 18 | fn test_size_match() { 19 | ShapeError::check_size_match(2, 2).unwrap(); 20 | } 21 | 22 | #[test] 23 | fn test_size_match_err() { 24 | assert!(ShapeError::check_size_match(2, 3) 25 | .unwrap_err() 26 | .to_string() 27 | .contains("Size mismatch: expected 2, got 3")); 28 | } 29 | -------------------------------------------------------------------------------- /hpt-cudakernels/src/reduce/sum_square.cu: 
-------------------------------------------------------------------------------- 1 | #include "declare_macros.cuh" 2 | #include "reduce_classes.cuh" 3 | 4 | DECLARE_KERNEL(bool, bool, sumsquare, SumSquare) 5 | DECLARE_KERNEL(u8, u8, sumsquare, SumSquare) 6 | DECLARE_KERNEL(u16, u16, sumsquare, SumSquare) 7 | DECLARE_KERNEL(u32, u32, sumsquare, SumSquare) 8 | DECLARE_KERNEL(u64, u64, sumsquare, SumSquare) 9 | DECLARE_KERNEL(i8, i8, sumsquare, SumSquare) 10 | DECLARE_KERNEL(i16, i16, sumsquare, SumSquare) 11 | DECLARE_KERNEL(i32, i32, sumsquare, SumSquare) 12 | DECLARE_KERNEL(i64, i64, sumsquare, SumSquare) 13 | DECLARE_KERNEL(f32, f32, sumsquare, SumSquare) 14 | DECLARE_KERNEL(f64, f64, sumsquare, SumSquare) 15 | DECLARE_KERNEL(f16, f16, sumsquare, SumSquare) 16 | DECLARE_KERNEL(bf16, bf16, sumsquare, SumSquare) 17 | -------------------------------------------------------------------------------- /docs/user_guide/associated_methods/cpu/to_cuda.md: -------------------------------------------------------------------------------- 1 | # to_cuda 2 | ```rust 3 | fn to_cuda(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Transfers a tensor from CPU memory to CUDA GPU memory, creating a new tensor on the specified CUDA device. 6 | 7 | ## Parameters: 8 | `DEVICE_ID`: A compile-time constant specifying the target CUDA device ID (default is 0) 9 | 10 | ## Returns: 11 | A new `Tensor` located on the specified CUDA device, or a TensorError if the transfer fails. 
12 | 13 | ## Examples: 14 | ```rust 15 | use hpt::{error::TensorError, Tensor}; 16 | 17 | fn main() -> Result<(), TensorError> { 18 | let a = Tensor::::new([1.5, 2.7, 3.2]).to_cuda::<0>()?; 19 | println!("{}", a); 20 | Ok(()) 21 | } 22 | ``` -------------------------------------------------------------------------------- /hpt/src/backends/cpu/tensor_external/cumulative.rs: -------------------------------------------------------------------------------- 1 | use crate::Tensor; 2 | use hpt_allocator::{ 3 | traits::{Allocator, AllocatorOutputRetrive}, 4 | Cpu, 5 | }; 6 | use hpt_common::error::base::TensorError; 7 | use hpt_traits::{ops::cumulative::CumulativeOps, tensor::CommonBounds}; 8 | 9 | impl CumulativeOps for Tensor 10 | where 11 | Al: Allocator, 12 | Al::Output: AllocatorOutputRetrive, 13 | { 14 | fn cumsum>>(&self, axis: A) -> Result { 15 | Ok(self.inner.cumsum(axis)?.into()) 16 | } 17 | fn cumprod>>(&self, axis: A) -> Result { 18 | Ok(self.inner.cumprod(axis)?.into()) 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "hpt-docs", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "index.js", 6 | "scripts": { 7 | "docs:dev": "vuepress dev docs", 8 | "docs:build": "vuepress build docs" 9 | }, 10 | "keywords": [], 11 | "author": "", 12 | "license": "ISC", 13 | "devDependencies": { 14 | "@vuepress/bundler-vite": "^2.0.0-rc.19", 15 | "@vuepress/plugin-markdown-math": "^2.0.0-rc.73", 16 | "@vuepress/theme-default": "^2.0.0-rc.73", 17 | "chart.js": "^4.4.9", 18 | "katex": "^0.16.21", 19 | "mermaid": "^11.4.1", 20 | "sass-embedded": "^1.83.4", 21 | "vuepress": "^2.0.0-rc.19", 22 | "vuepress-plugin-md-enhance": "^2.0.0-rc.71", 23 | "vuepress-plugin-mermaidjs": "^2.0.0-beta.2" 24 | } 25 | } 26 | -------------------------------------------------------------------------------- 
/docs/user_guide/binary/add.md: -------------------------------------------------------------------------------- 1 | # add 2 | ```rust 3 | std::ops::Add::add( 4 | x: &Tensor | Tensor | scalar, 5 | y: &Tensor | Tensor | scalar 6 | ) -> Result, TensorError> 7 | ``` 8 | Compute $\large x + y$ for all elements 9 | 10 | ## Parameters: 11 | `x`: First input tensor 12 | 13 | `y`: Second input tensor 14 | 15 | ## Returns: 16 | Tensor with type `C` 17 | 18 | ## Examples: 19 | ```rust 20 | use hpt::{Tensor, error::TensorError}; 21 | 22 | fn main() -> Result<(), TensorError> { 23 | let a = Tensor::::new([10.0]); 24 | let b = &a + &a; 25 | println!("{}", b); 26 | Ok(()) 27 | } 28 | ``` 29 | ## Backend Support 30 | | Backend | Supported | 31 | |---------|-----------| 32 | | CPU | ✅ | 33 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/binary/div.md: -------------------------------------------------------------------------------- 1 | # div 2 | ```rust 3 | std::ops::Div::div( 4 | x: &Tensor | Tensor | scalar, 5 | y: &Tensor | Tensor | scalar 6 | ) -> Result, TensorError> 7 | ``` 8 | Compute $\large x / y$ for all elements 9 | 10 | ## Parameters: 11 | `x`: First input tensor 12 | 13 | `y`: Second input tensor 14 | 15 | ## Returns: 16 | Tensor with type `C` 17 | 18 | ## Examples: 19 | ```rust 20 | use hpt::{Tensor, error::TensorError}; 21 | 22 | fn main() -> Result<(), TensorError> { 23 | let a = Tensor::::new([10.0]); 24 | let b = &a / &a; 25 | println!("{}", b); 26 | Ok(()) 27 | } 28 | ``` 29 | ## Backend Support 30 | | Backend | Supported | 31 | |---------|-----------| 32 | | CPU | ✅ | 33 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/binary/mul.md: -------------------------------------------------------------------------------- 1 | # mul 2 | ```rust 3 | std::ops::Mul::mul( 4 | x: &Tensor | Tensor | scalar, 5 | y: &Tensor | Tensor | scalar 6 | ) -> 
Result, TensorError> 7 | ``` 8 | Compute $\large x * y$ for all elements 9 | 10 | ## Parameters: 11 | `x`: First input tensor 12 | 13 | `y`: Second input tensor 14 | 15 | ## Returns: 16 | Tensor with type `C` 17 | 18 | ## Examples: 19 | ```rust 20 | use hpt::{Tensor, error::TensorError}; 21 | 22 | fn main() -> Result<(), TensorError> { 23 | let a = Tensor::::new([10.0]); 24 | let b = &a * &a; 25 | println!("{}", b); 26 | Ok(()) 27 | } 28 | ``` 29 | ## Backend Support 30 | | Backend | Supported | 31 | |---------|-----------| 32 | | CPU | ✅ | 33 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/binary/rem.md: -------------------------------------------------------------------------------- 1 | # rem 2 | ```rust 3 | std::ops::Rem::rem( 4 | x: &Tensor | Tensor | scalar, 5 | y: &Tensor | Tensor | scalar 6 | ) -> Result, TensorError> 7 | ``` 8 | Compute $\large x mod y$ for all elements 9 | 10 | ## Parameters: 11 | `x`: First input tensor 12 | 13 | `y`: Second input tensor 14 | 15 | ## Returns: 16 | Tensor with type `C` 17 | 18 | ## Examples: 19 | ```rust 20 | use hpt::{Tensor, error::TensorError}; 21 | 22 | fn main() -> Result<(), TensorError> { 23 | let a = Tensor::::new([10.0]); 24 | let b = &a % &a; 25 | println!("{}", b); 26 | Ok(()) 27 | } 28 | ``` 29 | ## Backend Support 30 | | Backend | Supported | 31 | |---------|-----------| 32 | | CPU | ✅ | 33 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/binary/sub.md: -------------------------------------------------------------------------------- 1 | # sub 2 | ```rust 3 | std::ops::Sub::sub( 4 | x: &Tensor | Tensor | scalar, 5 | y: &Tensor | Tensor | scalar 6 | ) -> Result, TensorError> 7 | ``` 8 | Compute $\large x - y$ for all elements 9 | 10 | ## Parameters: 11 | `x`: First input tensor 12 | 13 | `y`: Second input tensor 14 | 15 | ## Returns: 16 | Tensor with type `C` 17 | 18 | ## Examples: 19 
| ```rust 20 | use hpt::{Tensor, error::TensorError}; 21 | 22 | fn main() -> Result<(), TensorError> { 23 | let a = Tensor::::new([10.0]); 24 | let b = &a - &a; 25 | println!("{}", b); 26 | Ok(()) 27 | } 28 | ``` 29 | ## Backend Support 30 | | Backend | Supported | 31 | |---------|-----------| 32 | | CPU | ✅ | 33 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/gelu.md: -------------------------------------------------------------------------------- 1 | # gelu 2 | ```rust 3 | gelu(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large x \cdot \Phi(x)$ where $\Phi(x)$ is the cumulative distribution function of the standard normal distribution for all elements 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | 10 | ## Returns: 11 | Tensor with type `C` 12 | 13 | ## Examples: 14 | ```rust 15 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 16 | 17 | fn main() -> Result<(), TensorError> { 18 | let a = Tensor::::new([2.0]); 19 | let b = a.gelu()?; 20 | println!("{}", b); // prints: 1.9545977 21 | Ok(()) 22 | } 23 | ``` 24 | ## Backend Support 25 | | Backend | Supported | 26 | |---------|-----------| 27 | | CPU | ✅ | 28 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-cudakernels/src/reduce/reducel1.cu: -------------------------------------------------------------------------------- 1 | #include "declare_macros.cuh" 2 | #include "../utils/type_cast.cuh" 3 | #include "reduce_classes.cuh" 4 | 5 | DECLARE_KERNEL(bool, bool, reducel1, ReduceL1) 6 | DECLARE_KERNEL(u8, u8, reducel1, ReduceL1) 7 | DECLARE_KERNEL(u16, u16, reducel1, ReduceL1) 8 | DECLARE_KERNEL(u32, u32, reducel1, ReduceL1) 9 | DECLARE_KERNEL(u64, u64, reducel1, ReduceL1) 10 | DECLARE_KERNEL(i8, i8, reducel1, ReduceL1) 11 | DECLARE_KERNEL(i16, i16, reducel1, ReduceL1) 12 | DECLARE_KERNEL(i32, i32, reducel1, ReduceL1) 13 | DECLARE_KERNEL(i64, i64, reducel1, ReduceL1) 14 | 
DECLARE_KERNEL(f32, f32, reducel1, ReduceL1) 15 | DECLARE_KERNEL(f64, f64, reducel1, ReduceL1) 16 | DECLARE_KERNEL(f16, f16, reducel1, ReduceL1) 17 | DECLARE_KERNEL(bf16, bf16, reducel1, ReduceL1) 18 | -------------------------------------------------------------------------------- /docs/dev_guide/test_rules.md: -------------------------------------------------------------------------------- 1 | ### Test cases 2 | 3 | All the operators implemented must be tested in 4 cases. 4 | 1. contiguous Tensor [see](https://github.com/Jianqoq/Hpt/blob/d9a51874b3447d562b7c9d043b50eb05259b78c4/hpt-tests/src/hpt/cpu/reduce.rs#L143) 5 | 2. uncontiguous Tensor [see](https://github.com/Jianqoq/Hpt/blob/d9a51874b3447d562b7c9d043b50eb05259b78c4/hpt-tests/src/hpt/cpu/reduce.rs#L198) 6 | 3. sliced contiguous Tensor [see](https://github.com/Jianqoq/Hpt/blob/d9a51874b3447d562b7c9d043b50eb05259b78c4/hpt-tests/src/hpt/cpu/reduce.rs#L241) 7 | 4. sliced uncontiguous Tensor [see](https://github.com/Jianqoq/Hpt/blob/d9a51874b3447d562b7c9d043b50eb05259b78c4/hpt-tests/src/hpt/cpu/reduce.rs#L333) 8 | 9 | Besides that, all the algorithm must be tested by using random input and random arguments. 
-------------------------------------------------------------------------------- /docs/user_guide/unary/cosh_.md: -------------------------------------------------------------------------------- 1 | # cosh_ 2 | ```rust 3 | cosh_( 4 | x: &Tensor, 5 | out: &mut Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | Hyperbolic cosine with out 9 | ## Parameters: 10 | `x`: Angle(radians) 11 | ## Returns: 12 | Tensor with type `C` 13 | ## Examples: 14 | ```rust 15 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 16 | 17 | fn main() -> Result<(), TensorError> { 18 | let a = Tensor::::new([10.0]); 19 | let b = a.cosh_(&mut a.clone())?; 20 | println!("{}", b); 21 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 22 | Ok(()) 23 | } 24 | ``` 25 | ## Backend Support 26 | | Backend | Supported | 27 | |---------|-----------| 28 | | CPU | ✅ | 29 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/custom_type/custom_type.md: -------------------------------------------------------------------------------- 1 | # Custom Type 2 | 3 | Since hpt is designed in purely generic types, the user can define their own type and can use custom type to do all the computation hpt supports. 4 | 5 | # How 6 | 7 | You can reference the steps at [here](https://github.com/Jianqoq/Hpt/tree/main/hpt-examples/examples/custom_type/main.rs). 8 | 9 | # Note 10 | 11 | For now, your custom type must implemented `Copy` trait. The reason why hpt doesn't support type with only `Clone` is because of the conv2d implementation issue. The conv2d used fixed size array to preallocate registers `[T; N]`, and this requires `T` implemented `Copy` trait. 
12 | 13 | ## Backend Support 14 | | Backend | Supported | 15 | |---------|-----------| 16 | | CPU | ✅ | 17 | | Cuda | ❌ | -------------------------------------------------------------------------------- /docs/user_guide/random/rand_like.md: -------------------------------------------------------------------------------- 1 | # rand_like 2 | ```rust 3 | rand_like( 4 | x: &Tensor, 5 | low: T, 6 | high: T 7 | ) -> Result, TensorError> 8 | ``` 9 | same as `rand` but the shape will be based on `x`. 10 | ## Parameters: 11 | `x`: input Tensor 12 | 13 | `low`: the lowest value 14 | 15 | `high`: the highest value 16 | ## Returns: 17 | Tensor with type `T` 18 | ## Examples: 19 | ```rust 20 | use hpt::{error::TensorError, ops::Random, Tensor}; 21 | 22 | fn main() -> Result<(), TensorError> { 23 | let a = Tensor::::rand([10, 10], 0.0, 10.0)?; 24 | println!("{}", a.rand_like(0.0, 10.0)?); 25 | Ok(()) 26 | } 27 | ``` 28 | ## Backend Support 29 | | Backend | Supported | 30 | |---------|-----------| 31 | | CPU | ✅ | 32 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/asin_.md: -------------------------------------------------------------------------------- 1 | # asin 2 | ```rust 3 | asin_( 4 | x: &Tensor, 5 | out: &mut Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | Inverse sine with out 9 | ## Parameters: 10 | `x`: Angle(radians) 11 | `out`: Tensor to write to 12 | ## Returns: 13 | Tensor with type `C` 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.asin_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | 
-------------------------------------------------------------------------------- /docs/user_guide/unary/elu.md: -------------------------------------------------------------------------------- 1 | # elu 2 | ```rust 3 | elu(x: &Tensor, alpha: C) -> Result, TensorError> 4 | ``` 5 | Compute $\large x$ for $x > 0$, $\large \alpha(e^x - 1)$ for $x \leq 0$ for all elements 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | `alpha`: Parameter controlling the saturation of negative values 10 | 11 | ## Returns: 12 | Tensor with type `C` 13 | 14 | ## Examples: 15 | ```rust 16 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([-1.0]); 20 | let b = a.elu(1.0)?; 21 | println!("{}", b); // prints: -0.6321206 22 | Ok(()) 23 | } 24 | ``` 25 | ## Backend Support 26 | | Backend | Supported | 27 | |---------|-----------| 28 | | CPU | ✅ | 29 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/hard_sigmoid.md: -------------------------------------------------------------------------------- 1 | # hard_sigmoid 2 | ```rust 3 | hard_sigmoid(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large \text{max}(0, \text{min}(1, \frac{x}{6} + 0.5))$ for all elements. A piece-wise linear approximation of the sigmoid function. 
6 | 7 | ## Parameters: 8 | `x`: Input values 9 | 10 | ## Returns: 11 | Tensor with type `C` 12 | 13 | ## Examples: 14 | ```rust 15 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 16 | 17 | fn main() -> Result<(), TensorError> { 18 | let a = Tensor::::new([2.0]); 19 | let b = a.hard_sigmoid()?; 20 | println!("{}", b); // prints: 0.8333333 21 | Ok(()) 22 | } 23 | ``` 24 | ## Backend Support 25 | | Backend | Supported | 26 | |---------|-----------| 27 | | CPU | ✅ | 28 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/hard_swish.md: -------------------------------------------------------------------------------- 1 | # hard_swish 2 | ```rust 3 | hard_swish(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large x \cdot \text{min}(\text{max}(0, \frac{x}{6} + 0.5), 1)$ for all elements. A piece-wise linear approximation of the swish function. 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | 10 | ## Returns: 11 | Tensor with type `C` 12 | 13 | ## Examples: 14 | ```rust 15 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 16 | 17 | fn main() -> Result<(), TensorError> { 18 | let a = Tensor::::new([2.0]); 19 | let b = a.hard_swish()?; 20 | println!("{}", b); // prints: 1.6666666 21 | Ok(()) 22 | } 23 | ``` 24 | ## Backend Support 25 | | Backend | Supported | 26 | |---------|-----------| 27 | | CPU | ✅ | 28 | | Cuda | ✅ | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: Deploy Docs 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | deploy-gh-pages: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout 13 | uses: actions/checkout@v3 14 | 15 | - name: Setup Node.js 16 | uses: actions/setup-node@v3 17 | with: 18 | node-version: '23' 19 | 20 | - name: Clean npm cache 21 | run: npm cache clean --force 22 | 23 | 
- name: Install dependencies 24 | run: npm install 25 | 26 | - name: Build VuePress site 27 | run: npm run docs:build 28 | 29 | - name: Deploy to GitHub Pages 30 | uses: JamesIves/github-pages-deploy-action@v4 31 | with: 32 | folder: docs/.vuepress/dist 33 | branch: gh-pages -------------------------------------------------------------------------------- /docs/user_guide/random/rand.md: -------------------------------------------------------------------------------- 1 | # rand 2 | ```rust 3 | rand( 4 | shape: &[i64] | &Vec | &[i64; _], 5 | low: T, 6 | high: T 7 | ) -> Result, TensorError> 8 | ``` 9 | create a Tensor with data uniformly distributed between `low` and `high`. 10 | ## Parameters: 11 | `shape`: shape of the output 12 | 13 | `low`: the lowest value 14 | 15 | `high`: the highest value 16 | ## Returns: 17 | Tensor with type `T` 18 | ## Examples: 19 | ```rust 20 | use hpt::{error::TensorError, ops::Random, Tensor}; 21 | 22 | fn main() -> Result<(), TensorError> { 23 | let a = Tensor::::rand([10, 10], 0.0, 10.0)?; 24 | println!("{}", a); 25 | Ok(()) 26 | } 27 | ``` 28 | ## Backend Support 29 | | Backend | Supported | 30 | |---------|-----------| 31 | | CPU | ✅ | 32 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/acos_.md: -------------------------------------------------------------------------------- 1 | # acos 2 | ```rust 3 | acos_( 4 | x: &Tensor, 5 | out: &mut Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | Inverse cosine with out 9 | ## Parameters: 10 | `x`: Angle(radians) 11 | `out`: Tensor to write to 12 | ## Returns: 13 | Tensor with type `C` 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([-10.0]); 20 | let b = a.acos_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 
24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/atan_.md: -------------------------------------------------------------------------------- 1 | # atan 2 | ```rust 3 | atan_( 4 | x: &Tensor, 5 | out: &mut Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | Inverse tangent with out 9 | ## Parameters: 10 | `x`: Angle(radians) 11 | `out`: Tensor to write to 12 | ## Returns: 13 | Tensor with type `C` 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.atan_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/cos_.md: -------------------------------------------------------------------------------- 1 | # cos_ 2 | ```rust 3 | cos_( 4 | x: &Tensor, 5 | out: &mut Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | Trigonometric cosine with out 9 | ## Parameters: 10 | `x`: Angle(radians) 11 | `out`: Tensor to write to 12 | ## Returns: 13 | Tensor with type `C` 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.cos_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | 
-------------------------------------------------------------------------------- /docs/user_guide/unary/sin_.md: -------------------------------------------------------------------------------- 1 | # sin 2 | ```rust 3 | sin_( 4 | x: &Tensor, 5 | out: &mut Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | Trigonometric sine with out 9 | ## Parameters: 10 | `x`: Angle(radians) 11 | `out`: Tensor to write to 12 | ## Returns: 13 | Tensor with type `C` 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.sin_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/sinh_.md: -------------------------------------------------------------------------------- 1 | # sinh_ 2 | ```rust 3 | sinh_( 4 | x: &Tensor, 5 | out: &mut Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | Hyperbolic sine with out 9 | ## Parameters: 10 | `x`: Angle(radians) 11 | `out`: Tensor to write to 12 | ## Returns: 13 | Tensor with type `C` 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.sinh_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/tan_.md: 
-------------------------------------------------------------------------------- 1 | # tan_ 2 | ```rust 3 | tan_( 4 | x: &Tensor, 5 | out: &mut Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | Trigonometric tangent with out 9 | ## Parameters: 10 | `x`: Angle(radians) 11 | `out`: Tensor to write to 12 | ## Returns: 13 | Tensor with type `C` 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.tan_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/ln_.md: -------------------------------------------------------------------------------- 1 | # ln_ 2 | ```rust 3 | ln_(x: &Tensor, out: &mut Tensor | Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large \ln(x)$ for all elements with out 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | `out`: Tensor to write to 10 | 11 | ## Returns: 12 | Tensor with type `C` 13 | 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.ln_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/tanh_.md: -------------------------------------------------------------------------------- 1 | # tanh_ 2 | ```rust 3 | tanh_( 4 | x: &Tensor, 
5 | out: &mut Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | Hyperbolic tangent with out 9 | ## Parameters: 10 | `x`: Angle(radians) 11 | `out`: Tensor to write to 12 | ## Returns: 13 | Tensor with type `C` 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.tanh_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/cmp/tensor_eq.md: -------------------------------------------------------------------------------- 1 | # tensor_eq 2 | ```rust 3 | tensor_eq( 4 | x: &Tensor | Tensor, 5 | y: &Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | check if element from x is equal to element from y 9 | 10 | ## Parameters: 11 | `x`: First input tensor 12 | 13 | `y`: Second input tensor 14 | 15 | ## Returns: 16 | Tensor with type `bool` 17 | 18 | ## Examples: 19 | ```rust 20 | use hpt::{ops::TensorCmp, Tensor, error::TensorError}; 21 | 22 | fn main() -> Result<(), TensorError> { 23 | let a = Tensor::::new([2.0, 2.0, 2.0]); 24 | let b = a.tensor_eq(&a)?; 25 | println!("{}", b); // [true true true] 26 | Ok(()) 27 | } 28 | ``` 29 | ## Backend Support 30 | | Backend | Supported | 31 | |---------|-----------| 32 | | CPU | ✅ | 33 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/asinh_.md: -------------------------------------------------------------------------------- 1 | # asinh_ 2 | ```rust 3 | asinh_( 4 | x: &Tensor, 5 | out: &mut Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | Inverse hyperbolic sine with out 9 | ## Parameters: 10 | `x`: 
Angle(radians) 11 | `out`: Tensor to write to 12 | ## Returns: 13 | Tensor with type `C` 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.asinh_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-dyn/src/utils/onnx/parse_args/affine_grid.rs: -------------------------------------------------------------------------------- 1 | use super::parse::{ Parse, ParseArgs }; 2 | 3 | pub(crate) struct AffineGridArgs<'a> { 4 | pub(crate) theta: &'a str, 5 | pub(crate) size: &'a str, 6 | pub(crate) align_corners: bool, 7 | } 8 | 9 | // impl<'a> Parse> for ParseArgs { 10 | // type Output<'b> = AffineGridArgs<'b> where Self: 'b; 11 | // fn parse<'b>(&'b mut self, node: &'b crate::onnx::NodeProto) -> AffineGridArgs<'b> { 12 | // let theta = node.input[0].as_str(); 13 | // let size = node.input[1].as_str(); 14 | // let align_corners = self.parse_int_attribute(node, "align_corners", 0) == 1; 15 | // AffineGridArgs { 16 | // theta, 17 | // size, 18 | // align_corners, 19 | // } 20 | // } 21 | // } 22 | -------------------------------------------------------------------------------- /docs/user_guide/unary/abs_.md: -------------------------------------------------------------------------------- 1 | # abs_ 2 | ```rust 3 | abs_( 4 | x: &Tensor, 5 | out: &mut Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | Calculate absolute of input with out 9 | ## Parameters: 10 | `x`: input Tensor 11 | `out`: Tensor to write to 12 | ## Returns: 13 | Tensor with type `T` 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, 
ops::NormalUaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([-10.0]); 20 | let b = a.abs_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | 26 | ``` 27 | ## Backend Support 28 | | Backend | Supported | 29 | |---------|-----------| 30 | | CPU | ✅ | 31 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/acosh_.md: -------------------------------------------------------------------------------- 1 | # acosh_ 2 | ```rust 3 | acosh_( 4 | x: &Tensor, 5 | out: &mut Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | Inverse hyperbolic cosine with out 9 | ## Parameters: 10 | `x`: Angle(radians) 11 | `out`: Tensor to write to 12 | ## Returns: 13 | Tensor with type `C` 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([-10.0]); 20 | let b = a.acosh_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/atanh_.md: -------------------------------------------------------------------------------- 1 | # atanh_ 2 | ```rust 3 | atanh_( 4 | x: &Tensor, 5 | out: &mut Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | Inverse hyperbolic tangent with out 9 | ## Parameters: 10 | `x`: Angle(radians) 11 | `out`: Tensor to write to 12 | ## Returns: 13 | Tensor with type `C` 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 
20 | let b = a.atanh_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/celu.md: -------------------------------------------------------------------------------- 1 | # celu 2 | ```rust 3 | celu(x: &Tensor, alpha: C) -> Result, TensorError> 4 | ``` 5 | Compute $\large \text{max}(0, x) + \text{min}(0, \alpha \cdot (e^{x/\alpha} - 1))$ for all elements 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | 10 | `alpha`: Parameter controlling the saturation of negative values 11 | 12 | ## Returns: 13 | Tensor with type `C` 14 | 15 | ## Examples: 16 | ```rust 17 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 18 | 19 | fn main() -> Result<(), TensorError> { 20 | let a = Tensor::::new([-1.0]); 21 | let b = a.celu(1.0)?; 22 | println!("{}", b); // prints: -0.6321206 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/sqrt_.md: -------------------------------------------------------------------------------- 1 | # sqrt_ 2 | ```rust 3 | sqrt_(x: &Tensor, out: &mut Tensor | Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large \sqrt{x}$ for all elements with out 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | `out`: Tensor to write to 10 | 11 | ## Returns: 12 | Tensor with type `C` 13 | 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.sqrt_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as 
u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/cmp/tensor_gt.md: -------------------------------------------------------------------------------- 1 | # tensor_gt 2 | ```rust 3 | tensor_gt( 4 | x: &Tensor | Tensor, 5 | y: &Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | check if element from x is greater than the element from y 9 | 10 | ## Parameters: 11 | `x`: First input tensor 12 | 13 | `y`: Second input tensor 14 | 15 | ## Returns: 16 | Tensor with type `bool` 17 | 18 | ## Examples: 19 | ```rust 20 | use hpt::{ops::TensorCmp, Tensor, error::TensorError}; 21 | 22 | fn main() -> Result<(), TensorError> { 23 | let a = Tensor::::new([2.0, 2.0, 2.0]); 24 | let b = a.tensor_gt(&a)?; 25 | println!("{}", b); // [false false false] 26 | Ok(()) 27 | } 28 | ``` 29 | ## Backend Support 30 | | Backend | Supported | 31 | |---------|-----------| 32 | | CPU | ✅ | 33 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/cmp/tensor_lt.md: -------------------------------------------------------------------------------- 1 | # tensor_lt 2 | ```rust 3 | tensor_lt( 4 | x: &Tensor | Tensor, 5 | y: &Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | check if element from x is less than the element from y 9 | 10 | ## Parameters: 11 | `x`: First input tensor 12 | 13 | `y`: Second input tensor 14 | 15 | ## Returns: 16 | Tensor with type `bool` 17 | 18 | ## Examples: 19 | ```rust 20 | use hpt::{ops::TensorCmp, Tensor, error::TensorError}; 21 | 22 | fn main() -> Result<(), TensorError> { 23 | let a = Tensor::::new([2.0, 2.0, 2.0]); 24 | let b = a.tensor_lt(&a)?; 25 | println!("{}", b); // [false false false] 26 | Ok(()) 27 | } 28 | ``` 29 | ## Backend Support 30 | | Backend | Supported | 31 | 
|---------|-----------| 32 | | CPU | ✅ | 33 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/cmp/tensor_neq.md: -------------------------------------------------------------------------------- 1 | # tensor_neq 2 | ```rust 3 | tensor_neq( 4 | x: &Tensor | Tensor, 5 | y: &Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | check if element from x is not equal to element from y 9 | 10 | ## Parameters: 11 | `x`: First input tensor 12 | 13 | `y`: Second input tensor 14 | 15 | ## Returns: 16 | Tensor with type `bool` 17 | 18 | ## Examples: 19 | ```rust 20 | use hpt::{ops::TensorCmp, Tensor, error::TensorError}; 21 | 22 | fn main() -> Result<(), TensorError> { 23 | let a = Tensor::::new([2.0, 2.0, 2.0]); 24 | let b = a.tensor_neq(&a)?; 25 | println!("{}", b); // [false false false] 26 | Ok(()) 27 | } 28 | ``` 29 | ## Backend Support 30 | | Backend | Supported | 31 | |---------|-----------| 32 | | CPU | ✅ | 33 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/cbrt_.md: -------------------------------------------------------------------------------- 1 | # cbrt_ 2 | ```rust 3 | cbrt_(x: &Tensor, out: &mut Tensor | Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large \sqrt[3]{x}$ for all elements with out 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | `out`: Tensor to write to 10 | 11 | ## Returns: 12 | Tensor with type `C` 13 | 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.cbrt_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | 
-------------------------------------------------------------------------------- /docs/user_guide/unary/exp2_.md: -------------------------------------------------------------------------------- 1 | # exp2_ 2 | ```rust 3 | exp2_( 4 | x: &Tensor, 5 | out: &mut Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | Compute $\large 2^x$ for all elements with out 9 | ## Parameters: 10 | `x`: Input values 11 | `out`: Tensor to write to 12 | ## Returns: 13 | Tensor with type `C` 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.exp2_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/exp_.md: -------------------------------------------------------------------------------- 1 | # exp_ 2 | ```rust 3 | exp_( 4 | x: &Tensor, 5 | out: &mut Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | Compute exponential of `x` for all elements with out 9 | ## Parameters: 10 | `x`: Input values 11 | `out`: Tensor to write to 12 | ## Returns: 13 | Tensor with type `T` 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.exp_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- 
/docs/user_guide/unary/log2_.md: -------------------------------------------------------------------------------- 1 | # log2_ 2 | ```rust 3 | log2_(x: &Tensor, out: &mut Tensor | Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large \log_{2}(x)$ for all elements with out 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | `out`: Tensor to write to 10 | 11 | ## Returns: 12 | Tensor with type `C` 13 | 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.log2_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-dataloader/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hpt-dataloader" 3 | version = "0.1.3" 4 | edition = "2021" 5 | license = "MIT OR Apache-2.0" 6 | description = "An internal library for data loading for hpt" 7 | authors = ["JianJian Li "] 8 | repository = "https://github.com/Jianqoq/Hpt" 9 | 10 | [dependencies] 11 | zip = { workspace = true } 12 | serde = { workspace = true } 13 | serde_json = { workspace = true } 14 | hpt-traits = { path = "../hpt-traits", version = "0.1.1" } 15 | hpt-types = { path = "../hpt-types", version = "0.1.1" } 16 | hpt-common = { path = "../hpt-common", version = "0.1.1" } 17 | indicatif = { workspace = true } 18 | flate2 = { workspace = true } 19 | num = { workspace = true } 20 | safetensors = { workspace = true } 21 | bytemuck = { workspace = true } 22 | half = { workspace = true } 23 | -------------------------------------------------------------------------------- /docs/user_guide/cmp/tensor_ge.md: 
-------------------------------------------------------------------------------- 1 | # tensor_ge 2 | ```rust 3 | tensor_ge( 4 | x: &Tensor | Tensor, 5 | y: &Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | check if element from x is greater or equal to the element from y 9 | 10 | ## Parameters: 11 | `x`: First input tensor 12 | 13 | `y`: Second input tensor 14 | 15 | ## Returns: 16 | Tensor with type `bool` 17 | 18 | ## Examples: 19 | ```rust 20 | use hpt::{ops::TensorCmp, Tensor, error::TensorError}; 21 | 22 | fn main() -> Result<(), TensorError> { 23 | let a = Tensor::::new([2.0, 2.0, 2.0]); 24 | let b = a.tensor_ge(&a)?; 25 | println!("{}", b); // [true true true] 26 | Ok(()) 27 | } 28 | ``` 29 | ## Backend Support 30 | | Backend | Supported | 31 | |---------|-----------| 32 | | CPU | ✅ | 33 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/cmp/tensor_le.md: -------------------------------------------------------------------------------- 1 | # tensor_le 2 | ```rust 3 | tensor_le( 4 | x: &Tensor | Tensor, 5 | y: &Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | check if element from x is less or equal to the element from y 9 | 10 | ## Parameters: 11 | `x`: First input tensor 12 | 13 | `y`: Second input tensor 14 | 15 | ## Returns: 16 | Tensor with type `bool` 17 | 18 | ## Examples: 19 | ```rust 20 | use hpt::{ops::TensorCmp, Tensor, error::TensorError}; 21 | 22 | fn main() -> Result<(), TensorError> { 23 | let a = Tensor::::new([2.0, 2.0, 2.0]); 24 | let b = a.tensor_le(&a)?; 25 | println!("{}", b); // [true true true] 26 | Ok(()) 27 | } 28 | ``` 29 | ## Backend Support 30 | | Backend | Supported | 31 | |---------|-----------| 32 | | CPU | ✅ | 33 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/log10_.md: -------------------------------------------------------------------------------- 1 | # log10_ 2 | ```rust 3 | 
log10_(x: &Tensor, out: &mut Tensor | Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large \log_{10}(x)$ for all elements with out 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | `out`: Tensor to write to 10 | 11 | ## Returns: 12 | Tensor with type `C` 13 | 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.log10_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/recip_.md: -------------------------------------------------------------------------------- 1 | # recip_ 2 | ```rust 3 | recip_(x: &Tensor, out: &mut Tensor | Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large \frac{1}{x}$ for all elements with out 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | `out`: Tensor to write to 10 | 11 | ## Returns: 12 | Tensor with type `C` 13 | 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.recip_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-cudakernels/src/reduce/reducel2.cu: -------------------------------------------------------------------------------- 1 | #include "declare_macros.cuh" 2 | #include "../utils/type_cast.cuh" 3 | #include 
"../utils/check_type.cuh" 4 | #include "reduce_classes.cuh" 5 | 6 | DECLARE_KERNEL(half, bool, reducel2, ReduceL2) 7 | DECLARE_KERNEL(half, u8, reducel2, ReduceL2) 8 | DECLARE_KERNEL(half, u16, reducel2, ReduceL2) 9 | DECLARE_KERNEL(float, u32, reducel2, ReduceL2) 10 | DECLARE_KERNEL(double, u64, reducel2, ReduceL2) 11 | DECLARE_KERNEL(half, i8, reducel2, ReduceL2) 12 | DECLARE_KERNEL(half, i16, reducel2, ReduceL2) 13 | DECLARE_KERNEL(float, i32, reducel2, ReduceL2) 14 | DECLARE_KERNEL(double, i64, reducel2, ReduceL2) 15 | DECLARE_KERNEL(float, f32, reducel2, ReduceL2) 16 | DECLARE_KERNEL(double, f64, reducel2, ReduceL2) 17 | DECLARE_KERNEL(half, f16, reducel2, ReduceL2) 18 | DECLARE_KERNEL(bf16, bf16, reducel2, ReduceL2) 19 | -------------------------------------------------------------------------------- /hpt-cudakernels/src/reduce/reducel3.cu: -------------------------------------------------------------------------------- 1 | #include "declare_macros.cuh" 2 | #include "../utils/type_cast.cuh" 3 | #include "../utils/check_type.cuh" 4 | #include "reduce_classes.cuh" 5 | 6 | DECLARE_KERNEL(half, bool, reducel3, ReduceL3) 7 | DECLARE_KERNEL(half, u8, reducel3, ReduceL3) 8 | DECLARE_KERNEL(half, u16, reducel3, ReduceL3) 9 | DECLARE_KERNEL(float, u32, reducel3, ReduceL3) 10 | DECLARE_KERNEL(double, u64, reducel3, ReduceL3) 11 | DECLARE_KERNEL(half, i8, reducel3, ReduceL3) 12 | DECLARE_KERNEL(half, i16, reducel3, ReduceL3) 13 | DECLARE_KERNEL(float, i32, reducel3, ReduceL3) 14 | DECLARE_KERNEL(double, i64, reducel3, ReduceL3) 15 | DECLARE_KERNEL(float, f32, reducel3, ReduceL3) 16 | DECLARE_KERNEL(double, f64, reducel3, ReduceL3) 17 | DECLARE_KERNEL(half, f16, reducel3, ReduceL3) 18 | DECLARE_KERNEL(bf16, bf16, reducel3, ReduceL3) 19 | -------------------------------------------------------------------------------- /docs/user_guide/associated_methods/cuda/to_cpu.md: -------------------------------------------------------------------------------- 1 | # to_cpu 2 | 
```rust 3 | fn to_cpu(x: &Tensor) -> Result, TensorError> 4 | ``` 5 | Transfers a tensor from CUDA GPU memory to CPU memory, creating a new tensor in host memory. 6 | 7 | currently only `DEVICE_ID` = 0 is supported 8 | 9 | ## Parameters: 10 | `DEVICE_ID`: A compile-time constant specifying the target CPU device ID (default is 0) 11 | 12 | ## Returns: 13 | A new `Tensor` located on the specified CPU device, or a TensorError if the transfer fails. 14 | 15 | ## Examples: 16 | ```rust 17 | use hpt::{backend::Cuda, error::TensorError, ops::TensorCreator, Tensor}; 18 | 19 | fn main() -> Result<(), TensorError> { 20 | let a = Tensor::::empty([1, 2, 3])?; 21 | println!("{}", a.to_cpu::<0>()?); 22 | Ok(()) 23 | } 24 | ``` -------------------------------------------------------------------------------- /docs/user_guide/unary/mish_.md: -------------------------------------------------------------------------------- 1 | # mish_ 2 | ```rust 3 | mish_(x: &Tensor, out: &mut Tensor | Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large x * \tanh(\ln(1 + e^x))$ for all elements with out 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | `out`: Tensor to write to 10 | 11 | ## Returns: 12 | Tensor with type `C` 13 | 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.mish_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/softplus_.md: -------------------------------------------------------------------------------- 1 | # softplus_ 2 | ```rust 3 | softplus_(x: &Tensor, out: &mut Tensor | Tensor) -> Result, TensorError> 4 | ``` 
5 | Compute $\ln(1 + e^x)$ for all elements with out 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | `out`: Tensor to write to 10 | 11 | ## Returns: 12 | Tensor with type `C` 13 | 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.softplus_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-cudakernels/src/reduce/logsumexp.cu: -------------------------------------------------------------------------------- 1 | #include "declare_macros.cuh" 2 | #include "../utils/type_cast.cuh" 3 | #include "../utils/check_type.cuh" 4 | #include "reduce_classes.cuh" 5 | 6 | DECLARE_KERNEL(bool, bool, logsumexp, LogSumExp) 7 | DECLARE_KERNEL(u8, u8, logsumexp, LogSumExp) 8 | DECLARE_KERNEL(u16, u16, logsumexp, LogSumExp) 9 | DECLARE_KERNEL(u32, u32, logsumexp, LogSumExp) 10 | DECLARE_KERNEL(u64, u64, logsumexp, LogSumExp) 11 | DECLARE_KERNEL(i8, i8, logsumexp, LogSumExp) 12 | DECLARE_KERNEL(i16, i16, logsumexp, LogSumExp) 13 | DECLARE_KERNEL(i32, i32, logsumexp, LogSumExp) 14 | DECLARE_KERNEL(i64, i64, logsumexp, LogSumExp) 15 | DECLARE_KERNEL(f32, f32, logsumexp, LogSumExp) 16 | DECLARE_KERNEL(f64, f64, logsumexp, LogSumExp) 17 | DECLARE_KERNEL(f16, f16, logsumexp, LogSumExp) 18 | DECLARE_KERNEL(bf16, bf16, logsumexp, LogSumExp) 19 | -------------------------------------------------------------------------------- /docs/dev_guide/new_type.md: -------------------------------------------------------------------------------- 1 | # How to add new type support 2 | 3 | ### Things to know 4 | 1. Hpt has vector and scalar implementation. 
To support new type, you will need to go to `hpt-types` crate. 5 | 6 | ### How 7 | 1. Go to `hpt-types/src/dtype.rs`. 8 | 2. Implement `TypeCommon` for new type, implement `CudaType` for new type if the new type is supported in cuda 9 | 3. Add Dtype variant for the `Dtype` enum. 10 | 4. Go to `hpt-types/src/scalars/`, based on the existing implementation for primitive type, implement the traits for new type. 11 | 5. Go to `hpt-types/src/vectors/`, based on the existing implementation for primitive type, implement the traits for new type. 12 | 6. Go to `hpt-types/src/convertion.rs`, add type conversion for the new type, add `to_new_type` method in traits. 13 | 7. Go to `hpt-types/src/promotion/`, add type promotion for new type. -------------------------------------------------------------------------------- /docs/user_guide/unary/sigmoid_.md: -------------------------------------------------------------------------------- 1 | # sigmoid_ 2 | ```rust 3 | sigmoid_(x: &Tensor, out: &mut Tensor | Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large \frac{1}{1 + e^{-x}}$ for all elements with out 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | `out`: Tensor to write to 10 | 11 | ## Returns: 12 | Tensor with type `T` 13 | 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.sigmoid_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/softsign_.md: -------------------------------------------------------------------------------- 1 | # softsign_ 2 | ```rust 3 | softsign_(x: &Tensor, out: &mut Tensor | Tensor) -> 
Result, TensorError> 4 | ``` 5 | Compute $\large \frac{x}{1 + |x|}$ for all elements with out 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | `out`: Tensor to write to 10 | 11 | ## Returns: 12 | Tensor with type `C` 13 | 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.softsign_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-cudakernels/src/utils/extra_vecs.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "cuda_fp16.h" 3 | #include "cuda_bf16.h" 4 | 5 | struct half3 6 | { 7 | __half x; 8 | __half y; 9 | __half z; 10 | }; 11 | 12 | struct __align__(8) half4 13 | { 14 | __half x; 15 | __half y; 16 | __half z; 17 | __half w; 18 | }; 19 | 20 | struct bf163 21 | { 22 | __nv_bfloat16 x; 23 | __nv_bfloat16 y; 24 | __nv_bfloat16 z; 25 | }; 26 | 27 | struct __align__(8) bf164 28 | { 29 | __nv_bfloat16 x; 30 | __nv_bfloat16 y; 31 | __nv_bfloat16 z; 32 | __nv_bfloat16 w; 33 | }; 34 | 35 | struct bool2 36 | { 37 | bool x; 38 | bool y; 39 | }; 40 | 41 | struct bool3 42 | { 43 | bool x; 44 | bool y; 45 | bool z; 46 | }; 47 | 48 | struct bool4 49 | { 50 | bool x; 51 | bool y; 52 | bool z; 53 | bool w; 54 | }; 55 | -------------------------------------------------------------------------------- /hpt-tests/src/hpt/cpu/assert_utils.rs: -------------------------------------------------------------------------------- 1 | pub(crate) fn assert_f64(a: f64, b: f64, diff: f64) -> anyhow::Result<()> { 2 | let rel_diff = ((a - b) / (a.abs() + b.abs() + f64::EPSILON)).abs(); 3 | if 
rel_diff > diff { 4 | return Err(anyhow::anyhow!( 5 | "{} != {} (relative_diff: {}).\n", 6 | a, 7 | b, 8 | rel_diff 9 | )); 10 | } 11 | Ok(()) 12 | } 13 | 14 | #[allow(unused)] 15 | #[must_use] 16 | pub(crate) fn assert_f32(a: f32, b: f32, diff: f32) -> anyhow::Result<()> { 17 | let rel_diff = ((a - b) / (a.abs() + b.abs() + f32::EPSILON)).abs(); 18 | if rel_diff > diff { 19 | return Err(anyhow::anyhow!( 20 | "{} != {} (relative_diff: {}).\n", 21 | a, 22 | b, 23 | rel_diff 24 | )); 25 | } 26 | Ok(()) 27 | } 28 | -------------------------------------------------------------------------------- /docs/user_guide/unary/selu.md: -------------------------------------------------------------------------------- 1 | # selu 2 | ```rust 3 | selu( 4 | x: &Tensor 5 | ) -> Result, TensorError> 6 | ``` 7 | Compute $\large \lambda * (\alpha * (e^x - 1))$ for $x < 0$, $\large \lambda * x$ for $x \geq 0$ for all elements 8 | 9 | where `alpha` is `1.6732632423543772848170429916717`, `gamma` is `1.0507009873554804934193349852946` 10 | 11 | ## Parameters: 12 | `x`: Input values 13 | 14 | ## Returns: 15 | Tensor with type `C` 16 | 17 | ## Examples: 18 | ```rust 19 | use hpt::{ops::FloatUnaryOps, Tensor, error::TensorError}; 20 | 21 | fn main() -> Result<(), TensorError> { 22 | let a = Tensor::::new([2.0]); 23 | let b = a.selu()?; 24 | println!("{}", b); 25 | Ok(()) 26 | } 27 | ``` 28 | ## Backend Support 29 | | Backend | Supported | 30 | |---------|-----------| 31 | | CPU | ✅ | 32 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/iterator/collect.md: -------------------------------------------------------------------------------- 1 | # collect 2 | ```rust 3 | fn collect(strided_map) -> U 4 | ``` 5 | 6 | Convert map struct into `U` type, `U` must be `Tensor` type. 
7 | 8 | ## Parameters: 9 | 10 | strided_map: Map struct, in hpt, there are couple of map struct 11 | 12 | ## Returns: 13 | 14 | Tensor 15 | 16 | ## Examples: 17 | ```rust 18 | use hpt::iter::TensorIterator; 19 | use hpt::Tensor; 20 | fn main() -> anyhow::Result<()> { 21 | let x = Tensor::::new(&[1f64, 2., 3.]); 22 | 23 | let res = x 24 | .par_iter() 25 | .strided_map(|(res, x)| { 26 | *res = x.sin(); 27 | }) 28 | .collect::>(); 29 | 30 | println!("{}", res); 31 | Ok(()) 32 | } 33 | ``` 34 | ## Backend Support 35 | | Backend | Supported | 36 | |---------|-----------| 37 | | CPU | ✅ | 38 | | Cuda | ❌ | -------------------------------------------------------------------------------- /hpt-types/src/vectors/arch_simd/_128bit/sse/boolx16.rs: -------------------------------------------------------------------------------- 1 | use crate::traits::VecTrait; 2 | use crate::vectors::arch_simd::_128bit::common::boolx16::boolx16; 3 | 4 | impl VecTrait for boolx16 { 5 | const SIZE: usize = 16; 6 | type Base = bool; 7 | #[inline(always)] 8 | fn mul_add(self, _: Self, _: Self) -> Self { 9 | todo!() 10 | } 11 | #[inline(always)] 12 | fn sum(&self) -> bool { 13 | self.0.iter().map(|&x| x as u8).sum::() > 0 14 | } 15 | #[inline(always)] 16 | fn splat(val: bool) -> boolx16 { 17 | boolx16([val; 16]) 18 | } 19 | #[inline(always)] 20 | unsafe fn from_ptr(ptr: *const bool) -> Self { 21 | let mut result = [false; 16]; 22 | for i in 0..16 { 23 | result[i] = unsafe { *ptr.add(i) }; 24 | } 25 | boolx16(result) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_512bit/common/mask.rs: -------------------------------------------------------------------------------- 1 | pub(crate) const U16MASK: [u16; 17] = [ 2 | 0b0000_0000_0000_0000, 3 | 0b0000_0000_0000_0001, 4 | 0b0000_0000_0000_0011, 5 | 0b0000_0000_0000_0111, 6 | 0b0000_0000_0000_1111, 7 | 0b0000_0000_0001_1111, 8 | 0b0000_0000_0011_1111, 9 | 0b0000_0000_0111_1111, 10 | 
0b0000_0000_1111_1111, 11 | 0b0000_0001_1111_1111, 12 | 0b0000_0011_1111_1111, 13 | 0b0000_0111_1111_1111, 14 | 0b0000_1111_1111_1111, 15 | 0b0001_1111_1111_1111, 16 | 0b0011_1111_1111_1111, 17 | 0b0111_1111_1111_1111, 18 | 0b1111_1111_1111_1111, 19 | ]; 20 | 21 | pub(crate) const U8MASK: [u8; 9] = [ 22 | 0b0000_0000, 23 | 0b0000_0001, 24 | 0b0000_0011, 25 | 0b0000_0111, 26 | 0b0000_1111, 27 | 0b0001_1111, 28 | 0b0011_1111, 29 | 0b0111_1111, 30 | 0b1111_1111, 31 | ]; -------------------------------------------------------------------------------- /docs/user_guide/associated_methods/cpu/forget_copy.md: -------------------------------------------------------------------------------- 1 | # forget_copy 2 | ```rust 3 | unsafe fn forget_copy(x: &Tensor) -> Result<(*mut u8, std::alloc::Layout), TensorError> 4 | ``` 5 | clone the current Tensor data and return raw data. 6 | 7 | ## Note 8 | Similar as `forget`, but `forget_copy` doesn't need to check reference count 9 | 10 | ## Parameters: 11 | `x`: The input tensor 12 | 13 | ## Returns: 14 | `*mut u8`: A pointer pointing to the cloned tensor's data 15 | `std::alloc::Layout`: Can be used to check the byte size 16 | 17 | ## Examples: 18 | ```rust 19 | use hpt::{error::TensorError, ops::TensorCreator, Tensor}; 20 | 21 | fn main() -> Result<(), TensorError> { 22 | let a = Tensor::::empty([1, 2, 3])?; 23 | let (ptr, layout) = unsafe { a.forget_copy() }?; 24 | unsafe { std::alloc::dealloc(ptr, layout) }; 25 | Ok(()) 26 | } 27 | ``` -------------------------------------------------------------------------------- /docs/user_guide/associated_methods/cuda/forget_copy.md: -------------------------------------------------------------------------------- 1 | # forget_copy 2 | ```rust 3 | unsafe fn forget_copy(x: &Tensor) -> Result<(cudarc::driver::CudaSlice, std::alloc::Layout), TensorError> 4 | ``` 5 | clone the current Tensor data and return raw data. 
6 | 7 | ## Note 8 | Similar as `forget`, but `forget_copy` doesn't need to check reference count 9 | 10 | ## Parameters: 11 | `x`: The input tensor 12 | 13 | ## Returns: 14 | `cudarc::driver::CudaSlice`: A slice pointing to the cloned tensor's data 15 | `std::alloc::Layout`: Can be used to check the byte size 16 | 17 | ## Examples: 18 | ```rust 19 | use hpt::{backend::Cuda, error::TensorError, ops::TensorCreator, Tensor}; 20 | 21 | fn main() -> Result<(), TensorError> { 22 | let a = Tensor::::empty([1, 2, 3])?; 23 | let ret = unsafe { a.forget_copy() }?; 24 | Ok(()) 25 | } 26 | ``` -------------------------------------------------------------------------------- /hpt-bench/scan_benchmarks_result.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | with open('bench_results.txt', 'r', encoding="UTF-16LE") as f: 4 | content = f.read() 5 | 6 | pattern = r'matmul f16 Benchmarks/(hpt|torch|hpt\(builtin\))/(\d+)\s+time:\s+\[\d+\.\d+ ms (\d+\.\d+) ms \d+\.\d+ ms\]' 7 | 8 | results = { 9 | 'hpt': [], 10 | 'torch': [], 11 | 'hpt(builtin)': [] 12 | } 13 | 14 | sizes = [] 15 | last_size = None 16 | 17 | matches = re.findall(pattern, content) 18 | for test_type, size, median in matches: 19 | if test_type == 'hpt' and (last_size is None or int(size) != last_size): 20 | sizes.append(int(size)) 21 | last_size = int(size) 22 | results[test_type].append(float(median)) 23 | 24 | print("input size:", sizes) 25 | print("HPT:", results['hpt']) 26 | print("PyTorch:", results['torch']) 27 | print("HPT(builtin):", results['hpt(builtin)']) 28 | -------------------------------------------------------------------------------- /docs/user_guide/unary/erf_.md: -------------------------------------------------------------------------------- 1 | # erf_ 2 | ```rust 3 | erf_(x: &Tensor, out: &mut Tensor | Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large \text{erf}(x) = \frac{2}{\sqrt{\pi}} \int_0^x e^{-t^2} dt$ for all elements with out 
6 | 7 | ## Parameters: 8 | `x`: Input values 9 | `out`: Tensor to write to 10 | 11 | ## Returns: 12 | Tensor with type `C` 13 | 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.erf_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-dyn/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub(crate) mod ops; 2 | pub(crate) mod tensor; 3 | pub(crate) mod utils; 4 | use std::sync::atomic::AtomicUsize; 5 | 6 | pub use hpt_types::dtype::DType; 7 | pub use tensor::Tensor; 8 | pub use utils::device::Device; 9 | 10 | static DISPLAY_PRECISION: AtomicUsize = AtomicUsize::new(4); 11 | static DISPLAY_LR_ELEMENTS: AtomicUsize = AtomicUsize::new(4); 12 | static ALIGN: usize = 128; 13 | 14 | pub fn current_num_threads() -> usize { 15 | num_cpus::get_physical() 16 | } 17 | 18 | pub fn set_num_threads(num_threads: usize) { 19 | rayon::ThreadPoolBuilder::new().num_threads(num_threads).build_global().unwrap(); 20 | } 21 | 22 | pub fn physical_cores() -> usize { 23 | num_cpus::get_physical() 24 | } 25 | 26 | pub mod onnx { 27 | pub use crate::utils::onnx::load_model::load_onnx; 28 | pub(crate) use crate::utils::onnx::proto::*; 29 | } 30 | -------------------------------------------------------------------------------- /docs/user_guide/iterator/par_iter_mut.md: -------------------------------------------------------------------------------- 1 | # par_iter_mut 2 | ```rust 3 | fn par_iter_mut(x: &mut Tensor) -> ParStridedMut 4 | ``` 5 | 6 | similar as `par_iter`, input pass to the closure will be mutable. 
You can use it to do inplace computation. 7 | 8 | ## Parameters: 9 | 10 | x: Tensor to iterate 11 | 12 | ## Returns: 13 | 14 | `ParStridedMut` 15 | 16 | ## Examples: 17 | ```rust 18 | use hpt::Tensor; 19 | use hpt::iter::TensorIterator; 20 | use hpt::iter::rayon::iter::ParallelIterator; 21 | 22 | fn main() -> anyhow::Result<()> { 23 | let mut x = Tensor::::new(&[1f64, 2., 3.]); 24 | 25 | x.par_iter_mut().for_each(|x|{ 26 | *x = x.sin(); 27 | }); 28 | 29 | println!("{}", x); 30 | Ok(()) 31 | } 32 | ``` 33 | ## Backend Support 34 | | Backend | Supported | 35 | |---------|-----------| 36 | | CPU | ✅ | 37 | | Cuda | ❌ | -------------------------------------------------------------------------------- /hpt-tests/src/macro_tests/stmt_item/item.rs: -------------------------------------------------------------------------------- 1 | #[macro_use] 2 | extern crate hpt_tests; 3 | use hpt_codegen::*; 4 | pub fn main() { 5 | fuse_proc_macro!( 6 | fn struct_item(a: f32, b: f32) -> anyhow::Result{ 7 | struct A { 8 | a: f32, 9 | b: f32, 10 | } 11 | Ok(A { a, b }) 12 | } 13 | ); 14 | fuse_proc_macro!( 15 | fn macro_item(a: f32, b: f32) -> anyhow::Result{ 16 | macro_rules! 
a { 17 | ($a:expr) => { 18 | $a 19 | }; 20 | } 21 | Ok(a!(a)) 22 | } 23 | ); 24 | fuse_proc_macro!( 25 | fn trait_item(a: f32, b: f32) -> anyhow::Result{ 26 | trait A { 27 | fn a(&self) -> f32; 28 | } 29 | Ok(a) 30 | } 31 | ); 32 | } 33 | -------------------------------------------------------------------------------- /docs/user_guide/unary/exp10_.md: -------------------------------------------------------------------------------- 1 | # exp10_ 2 | ```rust 3 | exp10_( 4 | x: &Tensor, 5 | out: &mut Tensor | Tensor 6 | ) -> Result, TensorError> 7 | ``` 8 | Compute 10 raised to the power of `x` for all elements with output tensor 9 | 10 | ## Parameters: 11 | `x`: Input values (exponents) 12 | `out`: Tensor to write to 13 | 14 | ## Returns: 15 | Tensor with type `C` 16 | 17 | ## Examples: 18 | ```rust 19 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 20 | 21 | fn main() -> Result<(), TensorError> { 22 | let a = Tensor::::new([10.0]); 23 | let b = a.exp10_(&mut a.clone())?; 24 | println!("{}", b); 25 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 26 | Ok(()) 27 | } 28 | ``` 29 | ## Backend Support 30 | | Backend | Supported | 31 | |---------|-----------| 32 | | CPU | ✅ | 33 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/iterator/par_iter.md: -------------------------------------------------------------------------------- 1 | # par_iter 2 | ```rust 3 | fn par_iter(x: &Tensor) -> ParStrided 4 | ``` 5 | 6 | Convert Tensor to `ParStrided` iterator, `ParStrided` will split the tasks and execute the method the user provides 7 | 8 | ## Parameters: 9 | 10 | x: Tensor to iterate 11 | 12 | ## Returns: 13 | 14 | `ParStrided` 15 | 16 | ## Examples: 17 | ```rust 18 | use hpt::iter::TensorIterator; 19 | use hpt::Tensor; 20 | 21 | fn main() -> anyhow::Result<()> { 22 | let x = Tensor::::new(&[1f64, 2., 3.]); 23 | 24 | let res = x 25 | .par_iter() 26 | .strided_map(|(res, x)| { 27 
| *res = x.sin(); 28 | }) 29 | .collect::>(); 30 | 31 | println!("{}", res); 32 | Ok(()) 33 | } 34 | ``` 35 | ## Backend Support 36 | | Backend | Supported | 37 | |---------|-----------| 38 | | CPU | ✅ | 39 | | Cuda | ❌ | -------------------------------------------------------------------------------- /hpt-dyn/src/utils/onnx/parse_args/parse.rs: -------------------------------------------------------------------------------- 1 | use crate::onnx::NodeProto; 2 | 3 | pub(crate) struct ParseArgs { 4 | arg_idx: usize, 5 | } 6 | 7 | impl ParseArgs { 8 | pub(crate) fn new() -> Self { 9 | Self { arg_idx: 0 } 10 | } 11 | 12 | pub(crate) fn parse_int_attribute( 13 | &mut self, 14 | node: &NodeProto, 15 | target: &str, 16 | default: i64 17 | ) -> i64 { 18 | if let Some(attr) = node.attribute.get(self.arg_idx) { 19 | if attr.name() == target { 20 | let res = attr.i.unwrap_or(default); 21 | self.arg_idx += 1; 22 | res 23 | } else { 24 | default 25 | } 26 | } else { 27 | default 28 | } 29 | } 30 | } 31 | 32 | pub(crate) trait Parse<'p> { 33 | fn parse<'a: 'p>(node: &'a NodeProto) -> Self; 34 | } 35 | -------------------------------------------------------------------------------- /docs/user_guide/binary/add_.md: -------------------------------------------------------------------------------- 1 | # add_ 2 | ```rust 3 | add_( 4 | x: Tensor, 5 | y: &Tensor | Tensor | scalar, 6 | out: &mut Tensor | Tensor 7 | ) -> Result, TensorError> 8 | ``` 9 | Compute $\large x + y$ for all elements with out 10 | 11 | ## Parameters: 12 | `x`: First input tensor 13 | 14 | `y`: Second input tensor 15 | 16 | `out`: Tensor to write to 17 | 18 | ## Returns: 19 | Tensor with type `C` 20 | 21 | ## Examples: 22 | ```rust 23 | use hpt::{ops::NormalBinOps, Tensor, error::TensorError}; 24 | 25 | fn main() -> Result<(), TensorError> { 26 | let a = Tensor::::new([2.0]); 27 | let b = Tensor::::new([3.0]); 28 | let c = a.add_(&b, &mut a.clone())?; 29 | println!("{}", c); 30 | Ok(()) 31 | } 32 | ``` 33 | ## 
Backend Support 34 | | Backend | Supported | 35 | |---------|-----------| 36 | | CPU | ✅ | 37 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/binary/div_.md: -------------------------------------------------------------------------------- 1 | # div_ 2 | ```rust 3 | div_( 4 | x: Tensor, 5 | y: &Tensor | Tensor | scalar, 6 | out: &mut Tensor | Tensor 7 | ) -> Result, TensorError> 8 | ``` 9 | Compute $\large x / y$ for all elements with out 10 | 11 | ## Parameters: 12 | `x`: First input tensor 13 | 14 | `y`: Second input tensor 15 | 16 | `out`: Tensor to write to 17 | 18 | ## Returns: 19 | Tensor with type `C` 20 | 21 | ## Examples: 22 | ```rust 23 | use hpt::{ops::FloatBinOps, Tensor, error::TensorError}; 24 | 25 | fn main() -> Result<(), TensorError> { 26 | let a = Tensor::::new([2.0]); 27 | let b = Tensor::::new([3.0]); 28 | let c = a.div_(&b, &mut a.clone())?; 29 | println!("{}", c); 30 | Ok(()) 31 | } 32 | ``` 33 | ## Backend Support 34 | | Backend | Supported | 35 | |---------|-----------| 36 | | CPU | ✅ | 37 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/binary/mul_.md: -------------------------------------------------------------------------------- 1 | # mul_ 2 | ```rust 3 | mul_( 4 | x: Tensor, 5 | y: &Tensor | Tensor | scalar, 6 | out: &mut Tensor | Tensor 7 | ) -> Result, TensorError> 8 | ``` 9 | Compute $\large x * y$ for all elements with out 10 | 11 | ## Parameters: 12 | `x`: First input tensor 13 | 14 | `y`: Second input tensor 15 | 16 | `out`: Tensor to write to 17 | 18 | ## Returns: 19 | Tensor with type `C` 20 | 21 | ## Examples: 22 | ```rust 23 | use hpt::{error::TensorError, ops::NormalBinOps, Tensor}; 24 | 25 | fn main() -> Result<(), TensorError> { 26 | let a = Tensor::::new([2.0]); 27 | let b = Tensor::::new([3.0]); 28 | let c = a.mul_(&b, &mut a.clone())?; 29 | println!("{}", c); 30 | Ok(()) 31 | } 32 | ``` 33 | ## 
Backend Support 34 | | Backend | Supported | 35 | |---------|-----------| 36 | | CPU | ✅ | 37 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/binary/rem_.md: -------------------------------------------------------------------------------- 1 | # rem_ 2 | ```rust 3 | rem_( 4 | x: Tensor, 5 | y: &Tensor | Tensor | scalar, 6 | out: &mut Tensor | Tensor 7 | ) -> Result, TensorError> 8 | ``` 9 | Compute $\large x mod y$ for all elements with out 10 | 11 | ## Parameters: 12 | `x`: First input tensor 13 | 14 | `y`: Second input tensor 15 | 16 | `out`: Tensor to write to 17 | 18 | ## Returns: 19 | Tensor with type `C` 20 | 21 | ## Examples: 22 | ```rust 23 | use hpt::{ops::NormalBinOps, Tensor, error::TensorError}; 24 | 25 | fn main() -> Result<(), TensorError> { 26 | let a = Tensor::::new([2.0]); 27 | let b = Tensor::::new([3.0]); 28 | let c = a.rem_(&b, &mut a.clone())?; 29 | println!("{}", c); 30 | Ok(()) 31 | } 32 | ``` 33 | ## Backend Support 34 | | Backend | Supported | 35 | |---------|-----------| 36 | | CPU | ✅ | 37 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/binary/sub_.md: -------------------------------------------------------------------------------- 1 | # sub_ 2 | ```rust 3 | sub_( 4 | x: Tensor, 5 | y: &Tensor | Tensor | scalar, 6 | out: &mut Tensor | Tensor 7 | ) -> Result, TensorError> 8 | ``` 9 | Compute $\large x - y$ for all elements with out 10 | 11 | ## Parameters: 12 | `x`: First input tensor 13 | 14 | `y`: Second input tensor 15 | 16 | `out`: Tensor to write to 17 | 18 | ## Returns: 19 | Tensor with type `C` 20 | 21 | ## Examples: 22 | ```rust 23 | use hpt::{ops::NormalBinOps, Tensor, error::TensorError}; 24 | 25 | fn main() -> Result<(), TensorError> { 26 | let a = Tensor::::new([2.0]); 27 | let b = Tensor::::new([3.0]); 28 | let c = a.sub_(&b, &mut a.clone())?; 29 | println!("{}", c); 30 | Ok(()) 31 | } 32 | ``` 33 | 
## Backend Support 34 | | Backend | Supported | 35 | |---------|-----------| 36 | | CPU | ✅ | 37 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-common/src/error/mod.rs: -------------------------------------------------------------------------------- 1 | //! Error handling for tensor operations 2 | //! 3 | //! This module contains various error types used throughout the tensor library, 4 | //! organized by their domains (shape, device, memory, kernel, etc.) 5 | 6 | /// Autograd-related errors (gradient computation, etc.) 7 | pub mod autograd; 8 | /// Base error types and common functionality 9 | pub mod base; 10 | /// Common errors 11 | pub mod common; 12 | /// Device-related errors (GPU, CPU, etc.) 13 | pub mod device; 14 | /// Kernel-related errors (CUDA, etc.) 15 | pub mod kernel; 16 | /// Memory allocation and management errors 17 | pub mod memory; 18 | /// Parameter-related errors (function arguments, etc.) 19 | pub mod param; 20 | /// Random distribution-related errors 21 | pub mod random; 22 | /// Shape-related errors (dimension mismatch, broadcasting, etc.) 
23 | pub mod shape; 24 | /// Onnx-related errors 25 | pub mod onnx; 26 | 27 | -------------------------------------------------------------------------------- /hpt/src/backends/cpu/tensor_external/tensordot.rs: -------------------------------------------------------------------------------- 1 | use hpt_common::error::base::TensorError; 2 | use hpt_traits::ops::binary::TensorDot; 3 | use hpt_types::type_promote::NormalOut; 4 | 5 | use crate::{tensor_base::_Tensor, Tensor}; 6 | use hpt_traits::tensor::CommonBounds; 7 | impl TensorDot> for Tensor 8 | where 9 | _Tensor: TensorDot<_Tensor, Output = _Tensor<>::Output>>, 10 | A: CommonBounds + NormalOut, 11 | B: CommonBounds, 12 | >::Output: CommonBounds, 13 | { 14 | type Output = Tensor<>::Output>; 15 | 16 | fn tensordot( 17 | &self, 18 | rhs: &Tensor, 19 | axes: ([i64; N], [i64; N]), 20 | ) -> std::result::Result { 21 | let res = self.inner.tensordot(rhs.inner.as_ref(), axes)?; 22 | Ok(res.into()) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /hpt-common/src/error/autograd.rs: -------------------------------------------------------------------------------- 1 | use std::panic::Location; 2 | 3 | use thiserror::Error; 4 | 5 | /// Errors related to autograd 6 | #[derive(Debug, Error)] 7 | pub enum AutogradError { 8 | /// Error that occurs when inplace computation is not allowed in autograd 9 | #[error("Inplace computation {op} is not allowed in autograd, at {location}")] 10 | InplaceCompError { 11 | /// Operation name 12 | op: &'static str, 13 | /// Location where the error occurred 14 | location: &'static Location<'static>, 15 | }, 16 | /// Error that occurs when the operation is not supported in autograd 17 | #[error("Operation {op} is not supported in autograd, at {location}")] 18 | UnsupportOpError { 19 | /// Operation name 20 | op: &'static str, 21 | /// Location where the error occurred 22 | location: &'static Location<'static>, 23 | }, 24 | } 25 | 
-------------------------------------------------------------------------------- /hpt-dyn/src/utils/mod.rs: -------------------------------------------------------------------------------- 1 | pub(crate) mod allocator; 2 | pub(crate) mod backend; 3 | pub(crate) mod device; 4 | pub(crate) mod display; 5 | pub(crate) mod index_cal; 6 | pub(crate) mod prefetch; 7 | pub(crate) mod onnx { 8 | pub(crate) mod load_model; 9 | pub(crate) mod proto; 10 | pub(crate) mod execute; 11 | pub(crate) mod map_dtype; 12 | pub(crate) mod init; 13 | pub(crate) mod operators; 14 | pub(crate) mod fwd; 15 | pub(crate) mod layout_sense; 16 | pub(crate) mod build_graph; 17 | pub(crate) mod plot; 18 | pub(crate) mod run_fwd; 19 | pub(crate) mod run_init; 20 | pub(crate) mod optimize { 21 | pub(crate) mod constant_fold; 22 | pub(crate) mod fuse; 23 | } 24 | pub(crate) mod parse_args { 25 | pub(crate) mod parse; 26 | pub(crate) mod affine_grid; 27 | pub(crate) mod squeeze; 28 | } 29 | } 30 | pub(crate) mod threadpool; -------------------------------------------------------------------------------- /docs/user_guide/unary/gelu_.md: -------------------------------------------------------------------------------- 1 | # gelu_ 2 | ```rust 3 | gelu_(x: &Tensor, out: &mut Tensor | Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large x \cdot \Phi(x)$ where $\Phi(x)$ is the cumulative distribution function of the standard normal distribution for all elements with out 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | `out`: Tensor to write to 10 | 11 | ## Returns: 12 | Tensor with type `C` 13 | 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.gelu_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | 
|---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-dataloader/src/lib.rs: -------------------------------------------------------------------------------- 1 | mod compression_trait; 2 | pub mod data_loader; 3 | pub use compression_trait::CompressionAlgo; 4 | pub use compression_trait::DataLoaderTrait; 5 | pub use compression_trait::Meta; 6 | pub use compression_trait::{DataLoader, TensorLoader, TensorSaver}; 7 | pub use data_loader::Endian; 8 | pub use flate2::write::{DeflateEncoder, GzEncoder, ZlibEncoder}; 9 | pub use from_safetensors::from_safetensors::FromSafeTensors; 10 | pub use struct_save::gen_header; 11 | pub use struct_save::load::{Load, MetaLoad}; 12 | pub use struct_save::save::save; 13 | pub use struct_save::save::Save; 14 | pub use utils::CPUTensorCreator; 15 | mod struct_save { 16 | pub mod gen_header; 17 | pub mod load; 18 | pub mod save; 19 | } 20 | 21 | pub mod from_safetensors { 22 | pub mod from_safetensors; 23 | } 24 | 25 | pub mod load; 26 | pub mod save; 27 | pub mod utils; 28 | 29 | pub(crate) const CHUNK_BUFF: usize = 1024 * 1024; 30 | -------------------------------------------------------------------------------- /hpt-tests/src/macro_tests/control_flows/for_loop.expanded.rs: -------------------------------------------------------------------------------- 1 | #[macro_use] 2 | extern crate hpt_tests; 3 | use hpt_codegen::*; 4 | pub fn main() { 5 | fn case1(a: f32, b: f32) -> anyhow::Result { 6 | let __for_out_0 = for i in 0..1000 { 7 | a += 10; 8 | }; 9 | Ok(a) 10 | } 11 | fn case1(a: f32, b: f32) -> anyhow::Result { 12 | let __for_out_0 = for _ in (0..1000).iter() { 13 | a += 10; 14 | }; 15 | Ok(a) 16 | } 17 | fn case1(a: f32, b: f32) -> anyhow::Result { 18 | let __for_out_0 = for _ in b.iter().enumerate() { 19 | a += 10; 20 | }; 21 | Ok(a) 22 | } 23 | fn case1(a: f32, b: f32) -> anyhow::Result { 24 | let __for_out_0 = for _ in 
b.iter().enumerate() { 25 | a += 10; 26 | continue; 27 | break; 28 | }; 29 | Ok(a) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /hpt/src/backends/cpu/tensor_internal/cumulative.rs: -------------------------------------------------------------------------------- 1 | use crate::backends::cpu::utils::unary::unary::cumulate; 2 | use crate::tensor_base::_Tensor; 3 | use hpt_allocator::traits::Allocator; 4 | use hpt_allocator::traits::AllocatorOutputRetrive; 5 | use hpt_allocator::Cpu; 6 | use hpt_common::error::base::TensorError; 7 | use hpt_traits::ops::cumulative::CumulativeOps; 8 | use hpt_traits::tensor::CommonBounds; 9 | 10 | impl CumulativeOps for _Tensor 11 | where 12 | A2: Allocator, 13 | A2::Output: AllocatorOutputRetrive, 14 | { 15 | fn cumsum>>(&self, axis: A) -> std::result::Result { 16 | cumulate(self, axis, T::ZERO, |a, b| a._add(b)) 17 | } 18 | 19 | fn cumprod>>(&self, axis: A) -> std::result::Result { 20 | cumulate(self, axis, T::ONE, |a, b| a._mul(b)) 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /docs/user_guide/unary/elu_.md: -------------------------------------------------------------------------------- 1 | # elu_ 2 | ```rust 3 | elu_(x: &Tensor, alpha: C, out: &mut Tensor | Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large x$ for $x > 0$, $\large \alpha(e^x - 1)$ for $x \leq 0$ for all elements with out 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | `alpha`: Parameter controlling the saturation of negative values 10 | `out`: Tensor to write to 11 | 12 | ## Returns: 13 | Tensor with type `C` 14 | 15 | ## Examples: 16 | ```rust 17 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 18 | 19 | fn main() -> Result<(), TensorError> { 20 | let a = Tensor::::new([-1.0]); 21 | let b = a.elu_(1.0, &mut a.clone())?; 22 | println!("{}", b); 23 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 24 | Ok(()) 25 | 
} 26 | ``` 27 | ## Backend Support 28 | | Backend | Supported | 29 | |---------|-----------| 30 | | CPU | ✅ | 31 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/hard_swish_.md: -------------------------------------------------------------------------------- 1 | # hard_swish_ 2 | ```rust 3 | hard_swish_(x: &Tensor, out: &mut Tensor | Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large x \cdot \text{min}(\text{max}(0, \frac{x}{6} + 0.5), 1)$ for all elements with out. A piece-wise linear approximation of the swish function. 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | `out`: Tensor to write to 10 | 11 | ## Returns: 12 | Tensor with type `C` 13 | 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.hard_swish_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-tests/src/hpt_types/test_display.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused)] 2 | 3 | use half; 4 | use half::bf16; 5 | use half::f16; 6 | use hpt::types::TypeCommon; 7 | use num_complex::Complex32 as c32; 8 | use num_complex::Complex64 as c64; 9 | 10 | macro_rules! test_display { 11 | ($type:ty) => { 12 | paste::paste! 
{ 13 | #[test] 14 | fn []() { 15 | assert_eq!(format!("{}", <$type as TypeCommon>::STR), stringify!($type)); 16 | } 17 | } 18 | }; 19 | } 20 | 21 | test_display!(bool); 22 | test_display!(f32); 23 | test_display!(f64); 24 | test_display!(i8); 25 | test_display!(i16); 26 | test_display!(i32); 27 | test_display!(i64); 28 | test_display!(u8); 29 | test_display!(u16); 30 | test_display!(u32); 31 | test_display!(u64); 32 | test_display!(isize); 33 | test_display!(usize); 34 | test_display!(f16); 35 | test_display!(bf16); 36 | test_display!(c32); 37 | test_display!(c64); 38 | -------------------------------------------------------------------------------- /docs/user_guide/unary/hard_sigmoid_.md: -------------------------------------------------------------------------------- 1 | # hard_sigmoid_ 2 | ```rust 3 | hard_sigmoid_(x: &Tensor, out: &mut Tensor | Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large \text{max}(0, \text{min}(1, \frac{x}{6} + 0.5))$ for all elements with out. A piece-wise linear approximation of the sigmoid function. 
6 | 7 | ## Parameters: 8 | `x`: Input values 9 | `out`: Tensor to write to 10 | 11 | ## Returns: 12 | Tensor with type `C` 13 | 14 | ## Examples: 15 | ```rust 16 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | let a = Tensor::::new([10.0]); 20 | let b = a.hard_sigmoid_(&mut a.clone())?; 21 | println!("{}", b); 22 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 23 | Ok(()) 24 | } 25 | ``` 26 | ## Backend Support 27 | | Backend | Supported | 28 | |---------|-----------| 29 | | CPU | ✅ | 30 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/unary/celu_.md: -------------------------------------------------------------------------------- 1 | # celu_ 2 | ```rust 3 | celu_(x: &Tensor, alpha: C, out: &mut Tensor | Tensor) -> Result, TensorError> 4 | ``` 5 | Compute $\large \text{max}(0, x) + \text{min}(0, \alpha \cdot (e^{x/\alpha} - 1))$ for all elements with out 6 | 7 | ## Parameters: 8 | `x`: Input values 9 | `alpha`: Parameter controlling the saturation of negative values 10 | `out`: Tensor to write to 11 | 12 | ## Returns: 13 | Tensor with type `C` 14 | 15 | ## Examples: 16 | ```rust 17 | use hpt::{common::TensorInfo, error::TensorError, ops::FloatUnaryOps, Tensor}; 18 | 19 | fn main() -> Result<(), TensorError> { 20 | let a = Tensor::::new([-1.0]); 21 | let b = a.celu_(1.0, &mut a.clone())?; 22 | println!("{}", b); 23 | assert_eq!(a.ptr().ptr as u64, b.ptr().ptr as u64); 24 | Ok(()) 25 | } 26 | ``` 27 | ## Backend Support 28 | | Backend | Supported | 29 | |---------|-----------| 30 | | CPU | ✅ | 31 | | Cuda | ✅ | -------------------------------------------------------------------------------- /hpt-matmul/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hpt-matmul" 3 | version = "0.1.1" 4 | edition = "2024" 5 | 6 | [dependencies] 7 | half = { 
workspace = true } 8 | num-complex = { workspace = true } 9 | seq-macro = { workspace = true } 10 | num-traits = { workspace = true } 11 | dyn-stack = { workspace = true } 12 | gemm-common = { workspace = true } 13 | spindle = { workspace = true } 14 | duplicate = { workspace = true } 15 | num-integer = { workspace = true } 16 | rayon = {workspace = true} 17 | num_cpus = { workspace = true } 18 | raw-cpuid = { workspace = true } 19 | matconv_simd = { path = "../matconv_simd" } 20 | 21 | [target.'cfg(target_os = "macos")'.dependencies] 22 | libc = { workspace = true } 23 | 24 | [features] 25 | default = ["f32", "f16"] 26 | bound_check = [] 27 | bool = [] 28 | f32 = [] 29 | f16 = [] 30 | bf16 = [] 31 | f64 = [] 32 | i8 = [] 33 | u8 = [] 34 | i16 = [] 35 | u16 = [] 36 | i32 = [] 37 | u32 = [] 38 | i64 = [] 39 | u64 = [] 40 | cplx32 = [] 41 | cplx64 = [] 42 | -------------------------------------------------------------------------------- /hpt-tests/src/hpt_common/pointer.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused_imports)] 2 | 3 | use hpt_common::{layout::layout::Layout, shape::shape::Shape, utils::pointer::Pointer}; 4 | 5 | #[test] 6 | fn test_index() { 7 | let mut a = [10, 11, 12, 13]; 8 | let mut ptr = Pointer::new(&mut a as *mut i32, 4); 9 | assert_eq!(ptr[0usize], 10); 10 | ptr += 1i64; 11 | assert_eq!(ptr[0usize], 11); 12 | ptr += 1isize; 13 | assert_eq!(ptr[0usize], 12); 14 | ptr += 1usize; 15 | assert_eq!(ptr[0usize], 13); 16 | ptr -= 1i64; 17 | assert_eq!(ptr[0usize], 12); 18 | ptr -= 1isize; 19 | assert_eq!(ptr[0usize], 11); 20 | ptr -= 1usize; 21 | assert_eq!(ptr[0usize], 10); 22 | 23 | ptr += 1i64; 24 | assert_eq!(*ptr, 11); 25 | *ptr = 20; 26 | assert_eq!(*ptr, 20); 27 | 28 | let string = format!("{}", ptr); 29 | assert_eq!( 30 | string, 31 | format!("Pointer( ptr: {}, val: {} )", ptr.ptr as usize, ptr[0usize]) 32 | ); 33 | } 34 | 
-------------------------------------------------------------------------------- /hpt-cudakernels/src/reduce/reduce_helper.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | template 7 | class ReduceOp 8 | { 9 | public: 10 | __device__ __forceinline__ static R combine(T a, R b) { return R(); }; 11 | __device__ __forceinline__ static T identity() { return T(); }; 12 | __device__ __forceinline__ static R warp_reduce(R a) { return R(); } 13 | __device__ __forceinline__ static R pre_op(T a) { return a; } 14 | __device__ __forceinline__ static R post_op(R a, size_t reduce_size) { return a; } 15 | }; 16 | 17 | __device__ __forceinline__ bool is_last_block(int32_t *finished, size_t size) 18 | { 19 | __shared__ bool is_last; 20 | 21 | __syncthreads(); 22 | if (threadIdx.x == 0 && threadIdx.y == 0) 23 | { 24 | int32_t tmp = atomicAdd(finished, 1); 25 | is_last = tmp == (size - 1); 26 | } 27 | __syncthreads(); 28 | return is_last; 29 | } 30 | -------------------------------------------------------------------------------- /hpt-examples/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hpt-examples" 3 | version = "0.1.3" 4 | edition = "2021" 5 | license = "MIT OR Apache-2.0" 6 | 7 | [dependencies] 8 | serde = { workspace = true } 9 | rayon = { workspace = true } 10 | hpt = { path = "../hpt", features = ["track_caller"] } 11 | hpt-dyn = { path = "../hpt-dyn", features = ["f32", "i64"] } 12 | anyhow = "1.0.40" 13 | mimalloc = "0.1.43" 14 | # candle-core = { version = "0.8.2", features = ["mkl"] } 15 | # candle-nn = "0.8.2" 16 | serde_json = "1.0" 17 | safetensors = "0.5.0" 18 | 19 | [profile.release] 20 | opt-level = 3 21 | incremental = true 22 | debug = true 23 | lto = "fat" 24 | codegen-units = 1 25 | 26 | [profile.dev] 27 | opt-level = 0 28 | incremental = false 29 | debug = true 30 | # lto = "fat" 31 | # 
codegen-units = 1 32 | 33 | [profile.test] 34 | opt-level = 0 35 | incremental = false 36 | debug = true 37 | # lto = "fat" 38 | # codegen-units = 1 39 | 40 | [features] 41 | # default = ["cuda"] 42 | cuda = ["hpt/cuda"] 43 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_128bit/sse/boolx16.rs: -------------------------------------------------------------------------------- 1 | use crate::traits::VecTrait; 2 | use crate::vectors::arch_simd::_128bit::common::boolx16::boolx16; 3 | 4 | impl VecTrait for boolx16 { 5 | const SIZE: usize = 16; 6 | type Base = bool; 7 | #[inline(always)] 8 | fn copy_from_slice(&mut self, slice: &[bool]) { 9 | self.0.copy_from_slice(slice); 10 | } 11 | #[inline(always)] 12 | fn mul_add(self, _: Self, _: Self) -> Self { 13 | todo!() 14 | } 15 | #[inline(always)] 16 | fn sum(&self) -> bool { 17 | self.0.iter().map(|&x| x as u8).sum::() > 0 18 | } 19 | #[inline(always)] 20 | fn splat(val: bool) -> boolx16 { 21 | boolx16([val; 16]) 22 | } 23 | #[inline(always)] 24 | unsafe fn from_ptr(ptr: *const bool) -> Self { 25 | let mut result = [false; 16]; 26 | for i in 0..16 { 27 | result[i] = unsafe { *ptr.add(i) }; 28 | } 29 | boolx16(result) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /hpt-tests/src/hpt/cuda/from_raw.rs: -------------------------------------------------------------------------------- 1 | #[test] 2 | fn test_from_raw() { 3 | use hpt::backend::Cuda; 4 | use hpt::ops::ShapeManipulate; 5 | use hpt::slice; 6 | use hpt::{ops::Random, Tensor}; 7 | let m = 10; 8 | let n = 10; 9 | let a = Tensor::::randn(&[m, n]).expect("failed to create tensor"); 10 | 11 | let raw = unsafe { 12 | a.device() 13 | .alloc(m * n * 4) 14 | .expect("failed to alloc raw pointer") 15 | }; 16 | 17 | let c = unsafe { Tensor::::from_raw(raw, &[m, n]) } 18 | .expect("failed to create tensor from raw pointer"); 19 | 20 | let _ = a + c.clone(); 21 | 22 | let 
_sliced_c = slice!(c[0, ..]).expect("failed to slice tensor"); 23 | 24 | let reshaped = _sliced_c 25 | .reshape(&[10, 10]) 26 | .expect("failed to reshape tensor"); 27 | 28 | drop(reshaped); 29 | drop(_sliced_c); 30 | let (_, _) = unsafe { c.forget().expect("failed to forget tensor") }; 31 | } 32 | -------------------------------------------------------------------------------- /hpt-tests/src/hpt/cpu/from_raw.rs: -------------------------------------------------------------------------------- 1 | #[test] 2 | fn test_from_raw() { 3 | use hpt::ops::ShapeManipulate; 4 | use hpt::slice; 5 | use hpt::{ops::Random, Tensor}; 6 | let m = 10; 7 | let n = 10; 8 | let a = Tensor::::randn(&[m, n]).expect("failed to create tensor"); 9 | let layout = std::alloc::Layout::from_size_align(m * n * 4, 64).unwrap(); 10 | 11 | let raw = unsafe { std::alloc::alloc(layout) }; 12 | let c = unsafe { Tensor::::from_raw(raw as *mut f32, &[m, n]) } 13 | .expect("failed to create tensor from raw pointer"); 14 | 15 | let _ = a + c.clone(); 16 | 17 | let _sliced_c = slice!(c[0, ..]).expect("failed to slice tensor"); 18 | 19 | let reshaped = _sliced_c 20 | .reshape(&[10, 10]) 21 | .expect("failed to reshape tensor"); 22 | 23 | drop(reshaped); 24 | drop(_sliced_c); 25 | let (raw, _) = unsafe { c.forget().expect("failed to forget tensor") }; 26 | 27 | unsafe { std::alloc::dealloc(raw, layout) }; 28 | } 29 | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_128bit/common/bf16x8.rs: -------------------------------------------------------------------------------- 1 | 2 | /// a vector of 8 bf16 values 3 | #[allow(non_camel_case_types)] 4 | #[derive(Default, Clone, Copy, PartialEq, Debug)] 5 | #[repr(C, align(16))] 6 | pub struct bf16x8(pub(crate) [half::bf16; 8]); 7 | 8 | impl std::ops::Add for bf16x8 { 9 | type Output = Self; 10 | 11 | #[inline(always)] 12 | fn add(self, rhs: Self) -> Self::Output { 13 | let [x0, x1] = self.to_2_f32vec(); 14 | let 
[y0, y1] = rhs.to_2_f32vec(); 15 | let low_add = x0 + y0; 16 | let high_add = x1 + y1; 17 | bf16x8::from_2_f32vec([low_add, high_add]) 18 | } 19 | } 20 | impl std::ops::Mul for bf16x8 { 21 | type Output = Self; 22 | 23 | #[inline(always)] 24 | fn mul(self, rhs: Self) -> Self::Output { 25 | let [x0, x1] = self.to_2_f32vec(); 26 | let [y0, y1] = rhs.to_2_f32vec(); 27 | let low_mul = x0 * y0; 28 | let high_mul = x1 * y1; 29 | bf16x8::from_2_f32vec([low_mul, high_mul]) 30 | } 31 | } -------------------------------------------------------------------------------- /hpt-tests/src/macro_tests/control_flows/while_loop.expanded.rs: -------------------------------------------------------------------------------- 1 | #[macro_use] 2 | extern crate hpt_tests; 3 | use hpt_codegen::*; 4 | pub fn main() { 5 | fn case1(a: f32, b: f32) -> anyhow::Result { 6 | let __while_out_0 = while a < 1000 && a > 0 { 7 | a += 10; 8 | }; 9 | Ok(a) 10 | } 11 | fn case1(a: f32, b: f32) -> anyhow::Result { 12 | let __while_out_0 = while let Some(i) = (0..1000).iter().next() { 13 | a += 10; 14 | }; 15 | Ok(a) 16 | } 17 | fn case1(a: f32, b: f32) -> anyhow::Result { 18 | let __while_out_0 = while let syn::Expr::Path(path) = b.iter().next() { 19 | a += 10; 20 | }; 21 | Ok(a) 22 | } 23 | fn case1(a: f32, b: f32) -> anyhow::Result { 24 | let __while_out_0 = while let syn::Expr::Path(path) = b.iter().next() { 25 | a += 10; 26 | continue; 27 | break; 28 | }; 29 | Ok(a) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /docs/user_guide/shape_manipulate/flipud.md: -------------------------------------------------------------------------------- 1 | # flipud 2 | ```rust 3 | flipud( 4 | x: &Tensor 5 | ) -> Result, TensorError> 6 | ``` 7 | Reverses the order of elements along axis 0 (rows) of the tensor. The tensor must be at least 1-dimensional. 
8 | 9 | ## Parameters: 10 | `x`: Input tensor with ndim >= 1 11 | 12 | ## Returns: 13 | A new tensor with elements reversed along axis 0 (up/down flip). 14 | 15 | ## Examples: 16 | ```rust 17 | use hpt::{ops::ShapeManipulate, Tensor, error::TensorError}; 18 | fn main() -> Result<(), TensorError> { 19 | // Create a 2D tensor 20 | let a = Tensor::::new(&[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).reshape(&[2, 3])?; 21 | // [[1, 2, 3], 22 | // [4, 5, 6]] 23 | 24 | // Flip up/down 25 | let b = a.flipud()?; 26 | // [[4, 5, 6], 27 | // [1, 2, 3]] 28 | println!("{}", b); 29 | 30 | Ok(()) 31 | } 32 | ``` 33 | ## Backend Support 34 | | Backend | Supported | 35 | |---------|-----------| 36 | | CPU | ✅ | 37 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/creation/identity.md: -------------------------------------------------------------------------------- 1 | # identity 2 | ```rust 3 | identity( 4 | n: usize 5 | ) -> Result, TensorError> 6 | ``` 7 | Creates a 2-D identity tensor (1's on the main diagonal and 0's elsewhere). 8 | 9 | ## Parameters: 10 | `n`: Number of rows and columns 11 | 12 | ## Returns: 13 | A square 2-D tensor of shape [n, n] with ones on the main diagonal. 
14 | 15 | ## Examples: 16 | ```rust 17 | use hpt::{error::TensorError, ops::TensorCreator, Tensor}; 18 | fn main() -> Result<(), TensorError> { 19 | // Create a 3x3 identity matrix 20 | let a = Tensor::::identity(3)?; 21 | println!("{}", a); 22 | // [[1, 0, 0], 23 | // [0, 1, 0], 24 | // [0, 0, 1]] 25 | 26 | // Create a 2x2 identity matrix 27 | let b = Tensor::::identity(2)?; 28 | println!("{}", b); 29 | // [[1, 0], 30 | // [0, 1]] 31 | 32 | Ok(()) 33 | } 34 | ``` 35 | ## Backend Support 36 | | Backend | Supported | 37 | |---------|-----------| 38 | | CPU | ✅ | 39 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/random/beta.md: -------------------------------------------------------------------------------- 1 | # beta 2 | ```rust 3 | beta( 4 | alpha: T, 5 | beta: T, 6 | shape: &[i64] | &Vec | &[i64; _] 7 | ) -> Result, TensorError> 8 | ``` 9 | Create a Tensor with values drawn from a beta distribution with parameters `alpha` and `beta`. The beta distribution is a continuous probability distribution defined on the interval [0, 1]. 10 | ## Parameters: 11 | `alpha`: Shape parameter alpha (α) of the beta distribution. Must be positive. 12 | 13 | `beta`: Shape parameter beta (β) of the beta distribution. Must be positive. 
14 | 15 | `shape`: shape of the output 16 | ## Returns: 17 | Tensor with type `T` 18 | ## Examples: 19 | ```rust 20 | use hpt::{error::TensorError, ops::Random, Tensor}; 21 | 22 | fn main() -> Result<(), TensorError> { 23 | let a = Tensor::::beta(2.0, 5.0, &[10, 10])?; 24 | println!("{}", a); 25 | Ok(()) 26 | } 27 | ``` 28 | ## Backend Support 29 | | Backend | Supported | 30 | |---------|-----------| 31 | | CPU | ✅ | 32 | | Cuda | ❌ | -------------------------------------------------------------------------------- /docs/user_guide/creation/arange.md: -------------------------------------------------------------------------------- 1 | # arange 2 | ```rust 3 | arange( 4 | start: T, 5 | end: T 6 | ) -> Result, TensorError> 7 | ``` 8 | Creates a 1-D tensor with evenly spaced values within a given interval `[start, end)`. 9 | 10 | ## Parameters: 11 | `start`: Start of interval (inclusive) 12 | 13 | `end`: End of interval (exclusive) 14 | 15 | ## Returns: 16 | A 1-D tensor with values from `start` to `end-1`. 17 | 18 | ## Examples: 19 | ```rust 20 | use hpt::{Tensor, error::TensorError, ops::TensorCreator}; 21 | fn main() -> Result<(), TensorError> { 22 | // Create sequence from 0 to 5 23 | let a = Tensor::::arange(0, 5)?; 24 | println!("{}", a); 25 | // [0, 1, 2, 3, 4] 26 | 27 | // Using floating point numbers 28 | let b = Tensor::::arange(1.5, 5.5)?; 29 | println!("{}", b); 30 | // [1.5, 2.5, 3.5, 4.5] 31 | 32 | Ok(()) 33 | } 34 | ``` 35 | ## Backend Support 36 | | Backend | Supported | 37 | |---------|-----------| 38 | | CPU | ✅ | 39 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/utils/set_display_precision.md: -------------------------------------------------------------------------------- 1 | # set_display_precision 2 | 3 | ```rust 4 | set_display_precision(precision: usize) 5 | ``` 6 | 7 | Controls how many decimal places are shown when displaying tensor values. 
8 | ## Parameters: 9 | - `precision`: `usize` 10 | - Number of decimal places to display 11 | - Must be a non-negative integer 12 | - Default is 4 13 | 14 | ## Examples 15 | ```rust 16 | use hpt::{backend::Cpu, error::TensorError, utils::set_display_precision, Tensor}; 17 | 18 | fn main() -> Result<(), TensorError> { 19 | // Set precision to 3 decimal places 20 | set_display_precision(3); 21 | 22 | let tensor = Tensor::::new([1.23456789]); 23 | println!("{}", tensor); // Output: 1.235 24 | 25 | // Set precision to 6 decimal places 26 | set_display_precision(6); 27 | println!("{}", tensor); // Output: 1.234568 28 | Ok(()) 29 | } 30 | ``` 31 | 32 | ## Backend Support 33 | | Backend | Supported | 34 | |---------|-----------| 35 | | CPU | ✅ | 36 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/binary/pow_.md: -------------------------------------------------------------------------------- 1 | # pow_ 2 | ```rust 3 | pow_( 4 | x: Tensor, 5 | y: &Tensor | Tensor | scalar, 6 | out: &mut Tensor | Tensor 7 | ) -> Result, TensorError> 8 | ``` 9 | Compute $\large x ^ y$ for all elements with out 10 | 11 | ## Parameters: 12 | `x`: First input tensor 13 | 14 | `y`: Second input tensor or scalar exponent 15 | 16 | `out`: Tensor to write to 17 | 18 | ## Examples: 19 | ```rust 20 | use hpt::{error::TensorError, ops::FloatBinOps, Tensor}; 21 | 22 | fn main() -> Result<(), TensorError> { 23 | // Create input tensors 24 | let a = Tensor::::new(&[2.0, 3.0, 4.0]); 25 | let b = Tensor::::new(&[2.0, 3.0, 2.0]); 26 | 27 | // Compute power and store result in a 28 | let c = a.pow_(&b, &mut a.clone())?; 29 | println!("{}", c); 30 | // Output: 31 | // [4.0, 27.0, 16.0] 32 | 33 | Ok(()) 34 | } 35 | ``` 36 | 37 | ## Backend Support 38 | | Backend | Supported | 39 | |---------|-----------| 40 | | CPU | ✅ | 41 | | Cuda | ✅ | -------------------------------------------------------------------------------- 
/hpt-common/src/utils/simd_ref.rs: -------------------------------------------------------------------------------- 1 | use std::marker::PhantomData; 2 | 3 | use crate::utils::pointer::Pointer; 4 | 5 | /// A struct contains a mutable simd vector, this struct force the user to use write unaligned and read unaligned when they use simd iterator 6 | #[derive(Debug)] 7 | pub struct MutVec<'a, T> { 8 | ptr: Pointer, 9 | _phantom: PhantomData<&'a mut T>, 10 | } 11 | 12 | impl<'a, T> MutVec<'a, T> { 13 | /// create a new MutVec 14 | #[inline(always)] 15 | pub fn new(ptr: Pointer) -> Self { 16 | Self { 17 | ptr, 18 | _phantom: PhantomData, 19 | } 20 | } 21 | 22 | /// perform write unaligned operation 23 | #[inline(always)] 24 | pub fn write_unaligned(&self, value: T) { 25 | unsafe { 26 | self.ptr.ptr.write_unaligned(value); 27 | } 28 | } 29 | 30 | #[inline(always)] 31 | /// perform read unaligned operation 32 | pub fn read_unaligned(&self) -> T { 33 | unsafe { self.ptr.ptr.read_unaligned() } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /hpt-cudakernels/src/unary/ln.cu: -------------------------------------------------------------------------------- 1 | #include "unary_template.cuh" 2 | #include "../utils/type_alias.cuh" 3 | #include "../utils/promotion/promotes.cuh" 4 | #include "../utils/check_type.cuh" 5 | #include "unary_classes.cuh" 6 | 7 | DEFINE_UNARY_KERNEL(ln_f16, f16, FloatOutUnaryPromote, Ln); 8 | DEFINE_UNARY_KERNEL(ln_bf16, bf16, FloatOutUnaryPromote, Ln); 9 | DEFINE_UNARY_KERNEL(ln_f32, f32, FloatOutUnaryPromote, Ln); 10 | DEFINE_UNARY_KERNEL(ln_f64, f64, FloatOutUnaryPromote, Ln); 11 | DEFINE_UNARY_KERNEL(ln_bool, bool, FloatOutUnaryPromote, Ln); 12 | DEFINE_UNARY_KERNEL(ln_i8, i8, FloatOutUnaryPromote, Ln); 13 | DEFINE_UNARY_KERNEL(ln_i16, i16, FloatOutUnaryPromote, Ln); 14 | DEFINE_UNARY_KERNEL(ln_i32, i32, FloatOutUnaryPromote, Ln); 15 | DEFINE_UNARY_KERNEL(ln_i64, i64, FloatOutUnaryPromote, Ln); 16 | 
DEFINE_UNARY_KERNEL(ln_u8, u8, FloatOutUnaryPromote, Ln); 17 | DEFINE_UNARY_KERNEL(ln_u16, u16, FloatOutUnaryPromote, Ln); 18 | DEFINE_UNARY_KERNEL(ln_u32, u32, FloatOutUnaryPromote, Ln); 19 | DEFINE_UNARY_KERNEL(ln_u64, u64, FloatOutUnaryPromote, Ln); 20 | -------------------------------------------------------------------------------- /hpt-tests/src/hpt_common/shape.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused_imports)] 2 | use std::sync::Arc; 3 | 4 | use hpt_common::shape::shape::Shape; 5 | 6 | #[test] 7 | fn test_new() { 8 | let shape = Shape::new(&[1, 2, 3]); 9 | assert_eq!(shape.inner(), &[1, 2, 3]); 10 | } 11 | 12 | #[test] 13 | fn test_to_strides() { 14 | let shape = Shape::new(&[1, 2, 3]); 15 | let strides = shape.to_strides(); 16 | assert_eq!(strides.inner(), &[6, 3, 1]); 17 | } 18 | 19 | #[test] 20 | fn test_to_string() { 21 | let shape = Shape::new(&[1, 2, 3]); 22 | let string = format!("{:?}", shape); 23 | assert_eq!(string, "shape([1, 2, 3])"); 24 | } 25 | 26 | #[test] 27 | fn test_default() { 28 | let shape = Shape::default(); 29 | let arr: [i64; 0] = []; 30 | assert_eq!(shape.inner(), &arr); 31 | } 32 | 33 | #[test] 34 | fn test_from() { 35 | let shape = Shape::from(&Arc::new(vec![1, 2, 3])); 36 | assert_eq!(shape.inner(), &[1, 2, 3]); 37 | let shape = Shape::from(Arc::new([1, 2, 3])); 38 | assert_eq!(shape.inner(), &[1, 2, 3]); 39 | } 40 | -------------------------------------------------------------------------------- /hpt-allocator/src/utils/deallocate.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashSet; 2 | 3 | use dashmap::DashMap; 4 | use lru::LruCache; 5 | 6 | use crate::{ 7 | ptr::SafePtr, 8 | storage::{CommonStorage, Storage}, 9 | }; 10 | 11 | pub(crate) fn deallocate_helper( 12 | cache: &mut LruCache>, 13 | allocated: &mut HashSet, 14 | storage: &DashMap, 15 | layout: &std::alloc::Layout, 16 | ptr: *mut u8, 17 | 
should_drop: bool, 18 | device_id: usize, 19 | ) { 20 | if let Some(mut storage) = storage.get_mut(&device_id) { 21 | if storage.decrement_ref(SafePtr { ptr }) && should_drop { 22 | allocated.remove(&SafePtr { ptr }); 23 | if let Some(ptrs) = cache.get_mut(layout) { 24 | ptrs.push(SafePtr { ptr }); 25 | } else { 26 | cache.put(layout.clone(), vec![SafePtr { ptr }]); 27 | } 28 | } 29 | } else { 30 | panic!("device {} not found in storage", device_id); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /docs/user_guide/iterator/strided_map.md: -------------------------------------------------------------------------------- 1 | # strided_map 2 | ```rust 3 | fn strided_map(self, f: F) -> ParStridedMap 4 | ``` 5 | 6 | Applies a function to each element of the iterator with strided access pattern. Useful for parallel data transformation operations. 7 | 8 | ## Parameters 9 | 10 | - `self`: The parallel iterator 11 | - Type: `ParStrided` or `ParStridedZip` 12 | - `f`: The mapping function 13 | - Type: `FnMut((&mut T, &T))` 14 | - Requirements: Must be thread-safe (`Send + Sync`) 15 | 16 | ## Returns 17 | 18 | A `ParStridedMap` iterator 19 | 20 | ## Examples: 21 | ```rust 22 | use hpt::Tensor; 23 | use hpt::iter::TensorIterator; 24 | 25 | fn main() -> anyhow::Result<()> { 26 | let x = Tensor::::new(&[1f64, 2., 3.]); 27 | 28 | let res = x.par_iter().strided_map(|(res, x)|{ 29 | *res = x.sin(); 30 | }).collect::>(); 31 | 32 | println!("{}", res); 33 | Ok(()) 34 | } 35 | ``` 36 | ## Backend Support 37 | | Backend | Supported | 38 | |---------|-----------| 39 | | CPU | ✅ | 40 | | Cuda | ❌ | -------------------------------------------------------------------------------- /hpt-cudakernels/src/unary/exp.cu: -------------------------------------------------------------------------------- 1 | #include "unary_template.cuh" 2 | #include "../utils/type_alias.cuh" 3 | #include "../utils/promotion/promotes.cuh" 4 | #include 
"../utils/check_type.cuh" 5 | #include "unary_classes.cuh" 6 | 7 | DEFINE_UNARY_KERNEL(exp_f16, f16, FloatOutUnaryPromote, Exp); 8 | DEFINE_UNARY_KERNEL(exp_bf16, bf16, FloatOutUnaryPromote, Exp); 9 | DEFINE_UNARY_KERNEL(exp_f32, f32, FloatOutUnaryPromote, Exp); 10 | DEFINE_UNARY_KERNEL(exp_f64, f64, FloatOutUnaryPromote, Exp); 11 | DEFINE_UNARY_KERNEL(exp_bool, bool, FloatOutUnaryPromote, Exp); 12 | DEFINE_UNARY_KERNEL(exp_i8, i8, FloatOutUnaryPromote, Exp); 13 | DEFINE_UNARY_KERNEL(exp_i16, i16, FloatOutUnaryPromote, Exp); 14 | DEFINE_UNARY_KERNEL(exp_i32, i32, FloatOutUnaryPromote, Exp); 15 | DEFINE_UNARY_KERNEL(exp_i64, i64, FloatOutUnaryPromote, Exp); 16 | DEFINE_UNARY_KERNEL(exp_u8, u8, FloatOutUnaryPromote, Exp); 17 | DEFINE_UNARY_KERNEL(exp_u16, u16, FloatOutUnaryPromote, Exp); 18 | DEFINE_UNARY_KERNEL(exp_u32, u32, FloatOutUnaryPromote, Exp); 19 | DEFINE_UNARY_KERNEL(exp_u64, u64, FloatOutUnaryPromote, Exp); 20 | -------------------------------------------------------------------------------- /hpt-cudakernels/src/unary/sin.cu: -------------------------------------------------------------------------------- 1 | #include "unary_template.cuh" 2 | #include "../utils/type_alias.cuh" 3 | #include "../utils/promotion/promotes.cuh" 4 | #include "../utils/check_type.cuh" 5 | #include "unary_classes.cuh" 6 | 7 | DEFINE_UNARY_KERNEL(sin_f16, f16, FloatOutUnaryPromote, Sin); 8 | DEFINE_UNARY_KERNEL(sin_bf16, bf16, FloatOutUnaryPromote, Sin); 9 | DEFINE_UNARY_KERNEL(sin_f32, f32, FloatOutUnaryPromote, Sin); 10 | DEFINE_UNARY_KERNEL(sin_f64, f64, FloatOutUnaryPromote, Sin); 11 | DEFINE_UNARY_KERNEL(sin_bool, bool, FloatOutUnaryPromote, Sin); 12 | DEFINE_UNARY_KERNEL(sin_i8, i8, FloatOutUnaryPromote, Sin); 13 | DEFINE_UNARY_KERNEL(sin_i16, i16, FloatOutUnaryPromote, Sin); 14 | DEFINE_UNARY_KERNEL(sin_i32, i32, FloatOutUnaryPromote, Sin); 15 | DEFINE_UNARY_KERNEL(sin_i64, i64, FloatOutUnaryPromote, Sin); 16 | DEFINE_UNARY_KERNEL(sin_u8, u8, FloatOutUnaryPromote, Sin); 17 
| DEFINE_UNARY_KERNEL(sin_u16, u16, FloatOutUnaryPromote, Sin); 18 | DEFINE_UNARY_KERNEL(sin_u32, u32, FloatOutUnaryPromote, Sin); 19 | DEFINE_UNARY_KERNEL(sin_u64, u64, FloatOutUnaryPromote, Sin); 20 | -------------------------------------------------------------------------------- /hpt-cudakernels/src/unary/tan.cu: -------------------------------------------------------------------------------- 1 | #include "unary_template.cuh" 2 | #include "../utils/type_alias.cuh" 3 | #include "../utils/promotion/promotes.cuh" 4 | #include "../utils/check_type.cuh" 5 | #include "unary_classes.cuh" 6 | 7 | DEFINE_UNARY_KERNEL(tan_f16, f16, FloatOutUnaryPromote, Tan); 8 | DEFINE_UNARY_KERNEL(tan_bf16, bf16, FloatOutUnaryPromote, Tan); 9 | DEFINE_UNARY_KERNEL(tan_f32, f32, FloatOutUnaryPromote, Tan); 10 | DEFINE_UNARY_KERNEL(tan_f64, f64, FloatOutUnaryPromote, Tan); 11 | DEFINE_UNARY_KERNEL(tan_bool, bool, FloatOutUnaryPromote, Tan); 12 | DEFINE_UNARY_KERNEL(tan_i8, i8, FloatOutUnaryPromote, Tan); 13 | DEFINE_UNARY_KERNEL(tan_i16, i16, FloatOutUnaryPromote, Tan); 14 | DEFINE_UNARY_KERNEL(tan_i32, i32, FloatOutUnaryPromote, Tan); 15 | DEFINE_UNARY_KERNEL(tan_i64, i64, FloatOutUnaryPromote, Tan); 16 | DEFINE_UNARY_KERNEL(tan_u8, u8, FloatOutUnaryPromote, Tan); 17 | DEFINE_UNARY_KERNEL(tan_u16, u16, FloatOutUnaryPromote, Tan); 18 | DEFINE_UNARY_KERNEL(tan_u32, u32, FloatOutUnaryPromote, Tan); 19 | DEFINE_UNARY_KERNEL(tan_u64, u64, FloatOutUnaryPromote, Tan); 20 | -------------------------------------------------------------------------------- /hpt-cudakernels/src/unary/cos.cu: -------------------------------------------------------------------------------- 1 | #include "unary_template.cuh" 2 | #include "../utils/type_alias.cuh" 3 | #include "../utils/promotion/promotes.cuh" 4 | #include "../utils/check_type.cuh" 5 | #include "unary_classes.cuh" 6 | 7 | DEFINE_UNARY_KERNEL(cos_f16, f16, FloatOutUnaryPromote, Cos); 8 | DEFINE_UNARY_KERNEL(cos_bf16, bf16, FloatOutUnaryPromote, Cos); 9 | 
DEFINE_UNARY_KERNEL(cos_f32, f32, FloatOutUnaryPromote, Cos); 10 | DEFINE_UNARY_KERNEL(cos_f64, f64, FloatOutUnaryPromote, Cos); 11 | DEFINE_UNARY_KERNEL(cos_bool, bool, FloatOutUnaryPromote, Cos); 12 | DEFINE_UNARY_KERNEL(cos_i8, i8, FloatOutUnaryPromote, Cos); 13 | DEFINE_UNARY_KERNEL(cos_i16, i16, FloatOutUnaryPromote, Cos); 14 | DEFINE_UNARY_KERNEL(cos_i32, i32, FloatOutUnaryPromote, Cos); 15 | DEFINE_UNARY_KERNEL(cos_i64, i64, FloatOutUnaryPromote, Cos); 16 | DEFINE_UNARY_KERNEL(cos_u8, u8, FloatOutUnaryPromote, Cos); 17 | DEFINE_UNARY_KERNEL(cos_u16, u16, FloatOutUnaryPromote, Cos); 18 | DEFINE_UNARY_KERNEL(cos_u32, u32, FloatOutUnaryPromote, Cos); 19 | DEFINE_UNARY_KERNEL(cos_u64, u64, FloatOutUnaryPromote, Cos); 20 | 21 | -------------------------------------------------------------------------------- /docs/user_guide/windows/hamming_window.md: -------------------------------------------------------------------------------- 1 | # hamming_window 2 | ```rust 3 | hamming_window( 4 | window_length: i64, 5 | periodic: bool 6 | ) -> Result, TensorError> 7 | ``` 8 | Creates a Hamming window tensor. The Hamming window is a taper formed by using a weighted cosine. 9 | 10 | ## Parameters: 11 | `window_length`: The length of the window 12 | 13 | `periodic`: If true, returns a window to be used as periodic function. If false, returns a symmetric window 14 | 15 | ## Returns: 16 | A 1-D tensor containing the window. 
17 | 18 | ## Examples: 19 | ```rust 20 | use hpt::{error::TensorError, ops::WindowOps, Tensor}; 21 | fn main() -> Result<(), TensorError> { 22 | // Create a periodic Hamming window of length 5 23 | let a = Tensor::::hamming_window(5, true)?; 24 | println!("{}", a); 25 | // [0.0800, 0.3979, 0.9121, 0.9121, 0.0800] 26 | 27 | // Create a symmetric Hamming window of length 5 28 | let b = Tensor::::hamming_window(5, false)?; 29 | println!("{}", b); 30 | // [0.08, 0.54, 1.00, 0.54] 31 | 32 | Ok(()) 33 | } 34 | ``` -------------------------------------------------------------------------------- /docs/user_guide/creation/ones_like.md: -------------------------------------------------------------------------------- 1 | # ones_like 2 | ```rust 3 | ones_like( 4 | x: &Tensor 5 | ) -> Result, TensorError> 6 | ``` 7 | Creates a new tensor filled with ones with the same shape as the input tensor. 8 | 9 | ## Parameters: 10 | `x`: The input tensor whose shape will be used. 11 | 12 | ## Returns: 13 | A new tensor of ones with the same shape as the input tensor. 
14 | 15 | ## Examples: 16 | ```rust 17 | use hpt::{ 18 | error::TensorError, 19 | ops::{ShapeManipulate, TensorCreator}, 20 | Tensor, 21 | }; 22 | fn main() -> Result<(), TensorError> { 23 | // Create a tensor 24 | let a = Tensor::::new(&[1.0, 2.0, 3.0, 4.0]).reshape(&[2, 2])?; 25 | println!("a: {}", a); 26 | // [[1, 2], 27 | // [3, 4]] 28 | 29 | // Create a tensor of ones with same shape 30 | let b = a.ones_like()?; 31 | println!("b: {}", b); 32 | // [[1, 1], 33 | // [1, 1]] 34 | 35 | Ok(()) 36 | } 37 | ``` 38 | ## Backend Support 39 | | Backend | Supported | 40 | |---------|-----------| 41 | | CPU | ✅ | 42 | | Cuda | ✅ | -------------------------------------------------------------------------------- /docs/user_guide/windows/hann_window.md: -------------------------------------------------------------------------------- 1 | # hann_window 2 | ```rust 3 | hann_window( 4 | window_length: i64, 5 | periodic: bool 6 | ) -> Result, TensorError> 7 | ``` 8 | Creates a Hann window tensor. The Hann window is a taper formed by using a raised cosine with α = β = 0.5. 9 | 10 | ## Parameters: 11 | `window_length`: The length of the window 12 | 13 | `periodic`: If true, returns a window to be used as periodic function. If false, returns a symmetric window 14 | 15 | ## Returns: 16 | A 1-D tensor containing the window. 
17 | 18 | ## Examples: 19 | ```rust 20 | use hpt::{error::TensorError, ops::WindowOps, Tensor}; 21 | fn main() -> Result<(), TensorError> { 22 | // Create a periodic Hann window of length 5 23 | let a = Tensor::::hann_window(5, true)?; 24 | println!("{}", a); 25 | // [0.0000, 0.3455, 0.9045, 0.9045, 0.0000] 26 | 27 | // Create a symmetric Hann window of length 5 28 | let b = Tensor::::hann_window(5, false)?; 29 | println!("{}", b); 30 | // [0.0000, 0.5000, 1.0000, 0.5000, 0.0000] 31 | 32 | Ok(()) 33 | } 34 | ``` -------------------------------------------------------------------------------- /hpt-cudakernels/src/unary/erf.cu: -------------------------------------------------------------------------------- 1 | #include "unary_template.cuh" 2 | #include "../utils/type_alias.cuh" 3 | #include "../utils/promotion/promotes.cuh" 4 | #include "../utils/check_type.cuh" 5 | #include "unary_classes.cuh" 6 | 7 | 8 | DEFINE_UNARY_KERNEL(erf_f16, f16, FloatOutUnaryPromote, Erf); 9 | DEFINE_UNARY_KERNEL(erf_bf16, bf16, FloatOutUnaryPromote, Erf); 10 | DEFINE_UNARY_KERNEL(erf_f32, f32, FloatOutUnaryPromote, Erf); 11 | DEFINE_UNARY_KERNEL(erf_f64, f64, FloatOutUnaryPromote, Erf); 12 | DEFINE_UNARY_KERNEL(erf_bool, bool, FloatOutUnaryPromote, Erf); 13 | DEFINE_UNARY_KERNEL(erf_i8, i8, FloatOutUnaryPromote, Erf); 14 | DEFINE_UNARY_KERNEL(erf_i16, i16, FloatOutUnaryPromote, Erf); 15 | DEFINE_UNARY_KERNEL(erf_i32, i32, FloatOutUnaryPromote, Erf); 16 | DEFINE_UNARY_KERNEL(erf_i64, i64, FloatOutUnaryPromote, Erf); 17 | DEFINE_UNARY_KERNEL(erf_u8, u8, FloatOutUnaryPromote, Erf); 18 | DEFINE_UNARY_KERNEL(erf_u16, u16, FloatOutUnaryPromote, Erf); 19 | DEFINE_UNARY_KERNEL(erf_u32, u32, FloatOutUnaryPromote, Erf); 20 | DEFINE_UNARY_KERNEL(erf_u64, u64, FloatOutUnaryPromote, Erf); 21 | 22 | -------------------------------------------------------------------------------- /hpt-examples/examples/iterator/main.rs: -------------------------------------------------------------------------------- 1 
| use hpt::{ 2 | error::TensorError, 3 | iter::{ParStridedIteratorSimdZip, ParStridedIteratorZip, TensorIterator}, 4 | ops::{Random, TensorCreator}, 5 | Tensor, 6 | }; 7 | use rayon::iter::ParallelIterator; 8 | 9 | fn main() -> Result<(), TensorError> { 10 | let a = Tensor::::randn([2, 4, 6, 8])?; 11 | let mut b = Tensor::::empty([2, 4, 6, 8])?; 12 | 13 | b.par_iter_mut().zip(a.par_iter()).for_each(|(b, a)| { 14 | *b = a; 15 | }); 16 | println!("{}", b); 17 | 18 | let res = b 19 | .par_iter() 20 | .zip(a.par_iter()) 21 | .strided_map(|(res, (b, a))| *res = b + a) 22 | .collect::>(); 23 | println!("{}", res); 24 | 25 | let res = b 26 | .par_iter_simd() 27 | .zip(a.par_iter_simd()) 28 | .strided_map_simd( 29 | |(res, (b, a))| *res = b + a, 30 | |(res, (b, a))| res.write_unaligned(b + a), 31 | ) 32 | .collect::>(); 33 | println!("{}", res); 34 | 35 | Ok(()) 36 | } 37 | -------------------------------------------------------------------------------- /docs/dev_guide/iterator/iterator.md: -------------------------------------------------------------------------------- 1 | ### Iterator 2 | 3 | Iterator are implemented using Rayon trait `UnindexedProducer`, the tasks are splitted in [split](https://github.com/Jianqoq/Hpt/blob/d9a51874b3447d562b7c9d043b50eb05259b78c4/hpt-iterator/src/par_strided.rs#L541) method. The main loop is happened in [fold_with](https://github.com/Jianqoq/Hpt/blob/d9a51874b3447d562b7c9d043b50eb05259b78c4/hpt-iterator/src/par_strided_zip.rs#L471). Iterator can be used to implement elementwise or broadcast elementwise calculations. Usage can be found at [here](https://github.com/Jianqoq/Hpt/blob/d9a51874b3447d562b7c9d043b50eb05259b78c4/hpt/src/ops/cpu/utils/binary/binary_normal.rs#L46) and [here](https://github.com/Jianqoq/Hpt/blob/d9a51874b3447d562b7c9d043b50eb05259b78c4/hpt/src/ops/cpu/utils/unary/unary.rs#L19) 4 | 5 | ### Known issue 6 | 7 | 1. Current iterator like `ParStrided`, the `fold_with` method doesn't have any looping logic. 
Maybe we can write the same logic as `ParStridedZip` in `fold_with`. 8 | 9 | 2. When the outer loop size is 1, there will be no parallelism because the tasks split based on the outer loop size. -------------------------------------------------------------------------------- /hpt-types/src/into_vec.rs: -------------------------------------------------------------------------------- 1 | /// a trait to convert a vector to another vector 2 | pub trait IntoVec { 3 | /// convert a vector to another vector T 4 | fn into_vec(self) -> T; 5 | } 6 | 7 | #[cfg(all(target_feature = "avx2", not(target_feature = "avx512f")))] 8 | mod into_vec { 9 | use super::IntoVec; 10 | use crate::convertion::VecConvertor; 11 | use crate::simd::_256bit::common::*; 12 | use hpt_macros::impl_into_vec; 13 | impl_into_vec!(); 14 | } 15 | 16 | #[cfg(target_feature = "avx512f")] 17 | mod into_vec { 18 | use super::IntoVec; 19 | use crate::convertion::VecConvertor; 20 | use crate::simd::_512bit::common::*; 21 | use hpt_macros::impl_into_vec; 22 | impl_into_vec!(); 23 | } 24 | 25 | #[cfg(all( 26 | any(target_feature = "sse", target_arch = "arm", target_arch = "aarch64"), 27 | not(target_feature = "avx2") 28 | ))] 29 | mod into_vec { 30 | use super::IntoVec; 31 | use crate::convertion::VecConvertor; 32 | use crate::simd::_128bit::common::*; 33 | use hpt_macros::impl_into_vec; 34 | impl_into_vec!(); 35 | } 36 | -------------------------------------------------------------------------------- /docs/user_guide/creation/zeros_like.md: -------------------------------------------------------------------------------- 1 | # zeros_like 2 | ```rust 3 | zeros_like( 4 | x: &Tensor 5 | ) -> Result, TensorError> 6 | ``` 7 | Creates a new tensor filled with zeros with the same shape as the input tensor. 8 | 9 | ## Parameters: 10 | `x`: The input tensor whose shape will be used. 11 | 12 | ## Returns: 13 | A new tensor of zeros with the same shape as the input tensor. 
14 | 15 | ## Examples: 16 | ```rust 17 | use hpt::{ 18 | error::TensorError, 19 | ops::{ShapeManipulate, TensorCreator}, 20 | Tensor, 21 | }; 22 | fn main() -> Result<(), TensorError> { 23 | // Create a tensor 24 | let a = Tensor::::new(&[1.0, 2.0, 3.0, 4.0]).reshape(&[2, 2])?; 25 | println!("a: {}", a); 26 | // [[1, 2], 27 | // [3, 4]] 28 | 29 | // Create a tensor of zeros with same shape 30 | let b = a.zeros_like()?; 31 | println!("b: {}", b); 32 | // [[0, 0], 33 | // [0, 0]] 34 | 35 | Ok(()) 36 | } 37 | ``` 38 | ## Backend Support 39 | | Backend | Supported | 40 | |---------|-----------| 41 | | CPU | ✅ | 42 | | Cuda | ✅ | -------------------------------------------------------------------------------- /matconv_simd/src/simd/_256bit/avx2/f64x4.rs: -------------------------------------------------------------------------------- 1 | 2 | use std::arch::x86_64::*; 3 | use crate::simd::_256bit::common::f64x4::f64x4; 4 | 5 | impl f64x4 { 6 | #[inline(always)] 7 | pub(crate) fn mul_add(self, a: Self, b: Self) -> Self { /* lane-wise self * a + b */ 8 | #[cfg(not(target_feature = "fma"))] /* no FMA: emulate with separate mul then add (two roundings) */ 9 | unsafe { 10 | f64x4(_mm256_add_pd(_mm256_mul_pd(self.0, a.0), b.0)) 11 | } 12 | #[cfg(target_feature = "fma")] /* fused multiply-add instruction (single rounding) */ 13 | unsafe { 14 | f64x4(_mm256_fmadd_pd(self.0, a.0, b.0)) 15 | } 16 | } 17 | #[inline(always)] 18 | pub(crate) fn splat(val: f64) -> f64x4 { /* broadcast `val` into all four lanes */ 19 | unsafe { f64x4(_mm256_set1_pd(val)) } 20 | } 21 | } 22 | 23 | impl std::ops::Add for f64x4 { 24 | type Output = Self; 25 | #[inline(always)] 26 | fn add(self, rhs: Self) -> Self { /* lane-wise addition */ 27 | unsafe { f64x4(_mm256_add_pd(self.0, rhs.0)) } 28 | } 29 | } 30 | impl std::ops::Mul for f64x4 { 31 | type Output = Self; 32 | #[inline(always)] 33 | fn mul(self, rhs: Self) -> Self { /* lane-wise multiplication */ 34 | unsafe { f64x4(_mm256_mul_pd(self.0, rhs.0)) } 35 | } 36 | } -------------------------------------------------------------------------------- /docs/user_guide/random/exponential.md: -------------------------------------------------------------------------------- 1 | # exponential 2 | ```rust 
3 | exponential( 4 | lambda: T, 5 | shape: &[i64] | &Vec | &[i64; _] 6 | ) -> Result, TensorError> 7 | ``` 8 | Create a Tensor with values drawn from an exponential distribution with rate parameter `lambda`. The exponential distribution describes the time between events in a Poisson point process. 9 | 10 | ## Parameters: 11 | `lambda`: Rate parameter (λ) of the exponential distribution. Must be positive. 12 | 13 | `shape`: Shape of the output tensor. 14 | 15 | ## Returns: 16 | Tensor with type `T` containing random values from the exponential distribution. 17 | 18 | ## Examples: 19 | ```rust 20 | use hpt::{error::TensorError, ops::Random, Tensor}; 21 | 22 | fn main() -> Result<(), TensorError> { 23 | // Create a 10x10 tensor with exponential distribution (λ=2.0) 24 | let a = Tensor::::exponential(2.0, &[10, 10])?; 25 | println!("{}", a); 26 | Ok(()) 27 | } 28 | ``` 29 | ## Backend Support 30 | | Backend | Supported | 31 | |---------|-----------| 32 | | CPU | ✅ | 33 | | Cuda | ❌ | -------------------------------------------------------------------------------- /hpt-cudakernels/src/unary/acos.cu: -------------------------------------------------------------------------------- 1 | #include "unary_template.cuh" 2 | #include "../utils/type_alias.cuh" 3 | #include "../utils/promotion/promotes.cuh" 4 | #include "../utils/check_type.cuh" 5 | #include "unary_classes.cuh" 6 | /* acos: one DEFINE_UNARY_KERNEL instantiation per supported input dtype */ 7 | DEFINE_UNARY_KERNEL(acos_f16, f16, FloatOutUnaryPromote, ACos); 8 | DEFINE_UNARY_KERNEL(acos_bf16, bf16, FloatOutUnaryPromote, ACos); 9 | DEFINE_UNARY_KERNEL(acos_f32, f32, FloatOutUnaryPromote, ACos); 10 | DEFINE_UNARY_KERNEL(acos_f64, f64, FloatOutUnaryPromote, ACos); 11 | DEFINE_UNARY_KERNEL(acos_bool, bool, FloatOutUnaryPromote, ACos); 12 | DEFINE_UNARY_KERNEL(acos_i8, i8, FloatOutUnaryPromote, ACos); 13 | DEFINE_UNARY_KERNEL(acos_i16, i16, FloatOutUnaryPromote, ACos); 14 | DEFINE_UNARY_KERNEL(acos_i32, i32, FloatOutUnaryPromote, ACos); 15 | DEFINE_UNARY_KERNEL(acos_i64, i64, 
FloatOutUnaryPromote, ACos); 16 | DEFINE_UNARY_KERNEL(acos_u8, u8, FloatOutUnaryPromote, ACos); 17 | DEFINE_UNARY_KERNEL(acos_u16, u16, FloatOutUnaryPromote, ACos); 18 | DEFINE_UNARY_KERNEL(acos_u32, u32, FloatOutUnaryPromote, ACos); 19 | DEFINE_UNARY_KERNEL(acos_u64, u64, FloatOutUnaryPromote, ACos); 20 | -------------------------------------------------------------------------------- /hpt-cudakernels/src/unary/asin.cu: -------------------------------------------------------------------------------- 1 | #include "unary_template.cuh" 2 | #include "../utils/type_alias.cuh" 3 | #include "../utils/promotion/promotes.cuh" 4 | #include "../utils/check_type.cuh" 5 | #include "unary_classes.cuh" 6 | /* asin: one DEFINE_UNARY_KERNEL instantiation per supported input dtype */ 7 | DEFINE_UNARY_KERNEL(asin_f16, f16, FloatOutUnaryPromote, ASin); 8 | DEFINE_UNARY_KERNEL(asin_bf16, bf16, FloatOutUnaryPromote, ASin); 9 | DEFINE_UNARY_KERNEL(asin_f32, f32, FloatOutUnaryPromote, ASin); 10 | DEFINE_UNARY_KERNEL(asin_f64, f64, FloatOutUnaryPromote, ASin); 11 | DEFINE_UNARY_KERNEL(asin_bool, bool, FloatOutUnaryPromote, ASin); 12 | DEFINE_UNARY_KERNEL(asin_i8, i8, FloatOutUnaryPromote, ASin); 13 | DEFINE_UNARY_KERNEL(asin_i16, i16, FloatOutUnaryPromote, ASin); 14 | DEFINE_UNARY_KERNEL(asin_i32, i32, FloatOutUnaryPromote, ASin); 15 | DEFINE_UNARY_KERNEL(asin_i64, i64, FloatOutUnaryPromote, ASin); 16 | DEFINE_UNARY_KERNEL(asin_u8, u8, FloatOutUnaryPromote, ASin); 17 | DEFINE_UNARY_KERNEL(asin_u16, u16, FloatOutUnaryPromote, ASin); 18 | DEFINE_UNARY_KERNEL(asin_u32, u32, FloatOutUnaryPromote, ASin); 19 | DEFINE_UNARY_KERNEL(asin_u64, u64, FloatOutUnaryPromote, ASin); 20 | -------------------------------------------------------------------------------- /hpt-cudakernels/src/unary/atan.cu: -------------------------------------------------------------------------------- 1 | #include "unary_template.cuh" 2 | #include "../utils/type_alias.cuh" 3 | #include "../utils/promotion/promotes.cuh" 4 | #include "../utils/check_type.cuh" 5 | #include "unary_classes.cuh" 6 | /* atan: one DEFINE_UNARY_KERNEL instantiation per supported input dtype */ 7 | 
DEFINE_UNARY_KERNEL(atan_f16, f16, FloatOutUnaryPromote, ATan); 8 | DEFINE_UNARY_KERNEL(atan_bf16, bf16, FloatOutUnaryPromote, ATan); 9 | DEFINE_UNARY_KERNEL(atan_f32, f32, FloatOutUnaryPromote, ATan); 10 | DEFINE_UNARY_KERNEL(atan_f64, f64, FloatOutUnaryPromote, ATan); 11 | DEFINE_UNARY_KERNEL(atan_bool, bool, FloatOutUnaryPromote, ATan); 12 | DEFINE_UNARY_KERNEL(atan_i8, i8, FloatOutUnaryPromote, ATan); 13 | DEFINE_UNARY_KERNEL(atan_i16, i16, FloatOutUnaryPromote, ATan); 14 | DEFINE_UNARY_KERNEL(atan_i32, i32, FloatOutUnaryPromote, ATan); 15 | DEFINE_UNARY_KERNEL(atan_i64, i64, FloatOutUnaryPromote, ATan); 16 | DEFINE_UNARY_KERNEL(atan_u8, u8, FloatOutUnaryPromote, ATan); 17 | DEFINE_UNARY_KERNEL(atan_u16, u16, FloatOutUnaryPromote, ATan); 18 | DEFINE_UNARY_KERNEL(atan_u32, u32, FloatOutUnaryPromote, ATan); 19 | DEFINE_UNARY_KERNEL(atan_u64, u64, FloatOutUnaryPromote, ATan); 20 | --------------------------------------------------------------------------------