├── .cursorrules ├── .github ├── ISSUE_TEMPLATE │ ├── compile.md │ ├── excuter.md │ ├── operator.md │ └── py_deepx.md └── workflows │ ├── auto-merge.yml │ ├── excuter-cppcommon.yml │ ├── excuter-cuda-linux.yml │ ├── excuter-ompsimd-linux.yml │ └── tool-deepxctl.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── doc ├── .gitignore ├── README.md ├── benchmark │ ├── broadcast.md │ ├── matmul.md │ └── reduce.md ├── conf.py ├── deepxIR │ ├── ir.md │ └── readme.md ├── design.md ├── excuter │ ├── deepx.op.drawio │ ├── deepx.op.drawio.svg │ ├── deepx.op.jpg │ ├── excuter.md │ ├── mix_precision.md │ ├── op-mem-cuda │ │ ├── cublas │ │ │ └── api.md │ │ ├── cublaslt │ │ │ └── api.md │ │ └── list.md │ ├── op-mem-ompsimd │ │ ├── contribute.md │ │ ├── list.md │ │ └── range.md │ └── welcome.md ├── front │ ├── aboutop.md │ ├── deepx.jpg │ ├── deepx.op.drawio.svg │ ├── deepxpy.drawio.svg │ ├── front.md │ ├── graph.md │ ├── node.md │ ├── op.md │ └── py │ │ ├── about.md │ │ ├── contribute.md │ │ └── deepx.rst ├── highway.md ├── index.rst ├── language.md └── scheduler │ └── scheduler.md ├── excuter ├── cpp-common │ ├── CMakeLists.txt │ ├── src │ │ ├── client │ │ │ ├── udpserver.cpp │ │ │ ├── udpserver.hpp │ │ │ ├── unixsocketserver.cpp │ │ │ ├── unixsocketserver.hpp │ │ │ └── worker.hpp │ │ ├── deepx │ │ │ ├── dtype.hpp │ │ │ ├── mem │ │ │ │ └── mem.hpp │ │ │ ├── shape.cpp │ │ │ ├── shape.hpp │ │ │ ├── shape_changeshape.cpp │ │ │ ├── shape_changeshape.hpp │ │ │ ├── shape_matmul.cpp │ │ │ ├── shape_matmul.hpp │ │ │ ├── shape_range.cpp │ │ │ ├── shape_reduce.cpp │ │ │ ├── shape_reduce.hpp │ │ │ ├── shape_tensorinit.cpp │ │ │ ├── shape_tensorinit.hpp │ │ │ ├── tensor.hpp │ │ │ ├── tensorbase.hpp │ │ │ ├── tensorfunc │ │ │ │ ├── authors.hpp │ │ │ │ ├── changeshape.hpp │ │ │ │ ├── elementwise.hpp │ │ │ │ ├── init.hpp │ │ │ │ ├── io.hpp │ │ │ │ ├── matmul.hpp │ │ │ │ ├── reduce.hpp │ │ │ │ └── tensorlife.hpp │ │ │ ├── tf │ │ │ │ ├── tf.cpp │ │ │ │ ├── 
tf.hpp │ │ │ │ ├── tffactory.cpp │ │ │ │ └── tffactory.hpp │ │ │ ├── vector_combination.cpp │ │ │ └── vector_combination.hpp │ │ └── stdutil │ │ │ ├── error.hpp │ │ │ ├── fs.cpp │ │ │ ├── fs.hpp │ │ │ ├── num.cpp │ │ │ ├── num.hpp │ │ │ ├── print.hpp │ │ │ ├── string.cpp │ │ │ ├── string.hpp │ │ │ ├── time.hpp │ │ │ └── vector.hpp │ └── test │ │ ├── 0_dtypes.cpp │ │ ├── 1_tf.cpp │ │ ├── 1_tfcheck.cpp │ │ ├── 2_saveload.cpp │ │ └── CMakeLists.txt ├── op-mem-cuda │ ├── .gitignore │ ├── CMakeLists.txt │ ├── Dockerfile │ ├── README.md │ ├── build.sh │ ├── doc │ │ ├── 00_quickstart.md │ │ ├── 01_layout.md │ │ ├── 02_layout_algebra.md │ │ ├── 03_tensor.md │ │ ├── 04_algorithms.md │ │ ├── 0t_mma_atom.md │ │ ├── 0x_gemm_tutorial.md │ │ ├── 0y_predication.md │ │ └── 0z_tma_tensors.md │ ├── dockerbuild.sh │ ├── log.md │ ├── src │ │ ├── client │ │ │ ├── main.cpp │ │ │ ├── tfs.cpp │ │ │ └── tfs.hpp │ │ └── deepx │ │ │ ├── dtype_cuda.hpp │ │ │ ├── mem │ │ │ └── mem_cuda.hpp │ │ │ ├── tensorfunc │ │ │ ├── changeshape_miaobyte.cu │ │ │ ├── changeshape_miaobyte.cuh │ │ │ ├── changeshape_miaobyte.hpp │ │ │ ├── cuda.hpp │ │ │ ├── cuda_atomic.cuh │ │ │ ├── cuda_math.cuh │ │ │ ├── elementwise_cublas_basic.hpp │ │ │ ├── elementwise_miaobyte_basic.cu │ │ │ ├── elementwise_miaobyte_basic.cuh │ │ │ ├── elementwise_miaobyte_basic.hpp │ │ │ ├── elementwise_miaobyte_compare.cu │ │ │ ├── elementwise_miaobyte_compare.cuh │ │ │ ├── elementwise_miaobyte_compare.hpp │ │ │ ├── elementwise_miaobyte_sin.cu │ │ │ ├── elementwise_miaobyte_sin.cuh │ │ │ ├── elementwise_miaobyte_sin.hpp │ │ │ ├── elementwise_miaobyte_sqrt.cu │ │ │ ├── elementwise_miaobyte_sqrt.cuh │ │ │ ├── elementwise_miaobyte_sqrt.hpp │ │ │ ├── init_miaobyte.cu │ │ │ ├── init_miaobyte.cuh │ │ │ ├── init_miaobyte.hpp │ │ │ ├── io_miaobyte.hpp │ │ │ ├── matmul_cublas.hpp │ │ │ ├── new_mempool.hpp │ │ │ ├── reduce_miaobyte.cu │ │ │ ├── reduce_miaobyte.cuh │ │ │ ├── reduce_miaobyte.hpp │ │ │ ├── tensor_cuda.cuh │ │ │ ├── 
tensorlife_miaobyte.hpp │ │ │ └── vector_cuda.cuh │ │ │ └── tf │ │ │ ├── arg.hpp │ │ │ ├── changeshape.hpp │ │ │ ├── elementwise_basic.hpp │ │ │ ├── elementwise_compare.hpp │ │ │ ├── elementwise_sin.hpp │ │ │ ├── elementwise_sqrt.hpp │ │ │ ├── init.hpp │ │ │ ├── io.hpp │ │ │ ├── matmul.hpp │ │ │ ├── reduce.hpp │ │ │ └── tensorlife.hpp │ └── test │ │ ├── op │ │ └── CMakeLists.txt │ │ └── tensorfunc │ │ ├── 0_new.cpp │ │ ├── 1_cublas_add.cpp │ │ ├── 1_cublas_matmul.cpp │ │ ├── 2_changeshape.cpp │ │ └── CMakeLists.txt └── op-mem-ompsimd │ ├── .cursorignore │ ├── .cursorrules │ ├── .gitignore │ ├── CMakeLists.txt │ ├── Dockerfile │ ├── dockerbuild.sh │ ├── log.md │ ├── src │ ├── client │ │ ├── main.cpp │ │ ├── tfs.cpp │ │ └── tfs.hpp │ └── deepx │ │ ├── dtype_ompsimd.hpp │ │ ├── mem │ │ └── mem_ompsimd.hpp │ │ ├── tensorfunc │ │ ├── changeshape_miaobyte.hpp │ │ ├── elementwise_cblas.hpp │ │ ├── elementwise_miaobyte.hpp │ │ ├── equal.hpp │ │ ├── highway.hpp │ │ ├── init_miaobyte.hpp │ │ ├── io_miaobyte.hpp │ │ ├── matmul_cblas.hpp │ │ ├── matmul_miaobyte.hpp │ │ ├── new_mempool.hpp │ │ ├── reduce_miaobyte.hpp │ │ └── tensorlife_miaobyte.hpp │ │ └── tf │ │ ├── arg.hpp │ │ ├── changeshape.hpp │ │ ├── elementwise.hpp │ │ ├── init.hpp │ │ ├── io.hpp │ │ ├── matmul.hpp │ │ ├── reduce.hpp │ │ └── tensorlife.hpp │ └── test │ ├── op │ ├── 1_mem.cpp │ └── CMakeLists.txt │ └── tensorfunc │ ├── 1_shape.cpp │ ├── 2_shape_combintion.cpp │ ├── 2_tensor_equal.cpp │ ├── 2_tensor_new.cpp │ ├── 2_tensor_range.cpp │ ├── 2_tensor_range.py │ ├── 3_tensor_print.cpp │ ├── 4_tensor_add.cpp │ ├── 4_tensor_matmul.cpp │ ├── 4_tensor_max.cpp │ ├── 4_tensor_mul.cpp │ ├── 4_tensor_sub.cpp │ ├── 5_tensor_sum.cpp │ ├── 6_tensor_broadcast.cpp │ ├── 7_tensor_transpose.cpp │ ├── 8_tensor_concat.cpp │ ├── CMakeLists.txt │ └── tensorutil.hpp ├── front ├── go │ ├── README.md │ ├── deepx │ │ ├── attention.go │ │ ├── graph_constarg.go │ │ ├── graph_opnode.go │ │ ├── graph_tensornode.go │ │ ├── graph_viz.go │ 
│ ├── linear.go │ │ ├── mlp.go │ │ ├── module.go │ │ ├── norm.go │ │ ├── tensor_activite.go │ │ ├── tensor_elementwise.go │ │ ├── tensor_matmul.go │ │ ├── tensor_musk.go │ │ ├── tensor_norm.go │ │ ├── tensor_normalization.go │ │ ├── tensor_reduce.go │ │ ├── tensor_shape.go │ │ ├── transformer.go │ │ ├── transformer │ │ │ ├── attention.go │ │ │ ├── config.go │ │ │ ├── model.go │ │ │ ├── qwen2.md │ │ │ ├── qwen2_causal_lm.go │ │ │ └── qwen2_model.go │ │ └── transformer_model.go │ ├── example │ │ ├── 1 │ │ │ ├── 1_app.dot │ │ │ ├── 1_app.go │ │ │ └── 1_app.svg │ │ └── 3 │ │ │ ├── 3_transformer.svg │ │ │ ├── 3_transformer_app.go │ │ │ └── transformer.dot │ └── go.mod └── py │ ├── .cursorrules │ ├── deepx │ ├── .cursorrules │ ├── .gitignore │ ├── README.md │ ├── __init__.py │ ├── nn │ │ ├── __init__.py │ │ ├── deepxir.py │ │ ├── functional │ │ │ ├── __init__.py │ │ │ ├── activite.py │ │ │ ├── authormap.py │ │ │ ├── changeshape.py │ │ │ ├── elementwise.py │ │ │ ├── leaffunc.py │ │ │ ├── leaffunc_changeshape.py │ │ │ ├── leaffunc_elementwise.py │ │ │ ├── leaffunc_init.py │ │ │ ├── leaffunc_io.py │ │ │ ├── leaffunc_life.py │ │ │ ├── leaffunc_matmul.py │ │ │ ├── leaffunc_reduce.py │ │ │ ├── normalization.py │ │ │ ├── reduce.py │ │ │ ├── rtf.py │ │ │ ├── rtf_changeshape.py │ │ │ ├── rtf_elementwise.py │ │ │ ├── rtf_init.py │ │ │ ├── rtf_io.py │ │ │ ├── rtf_life.py │ │ │ ├── rtf_matmul.py │ │ │ └── rtf_reduce.py │ │ ├── modules │ │ │ ├── __init__.py │ │ │ ├── activation.py │ │ │ ├── conv.py │ │ │ ├── dropout.py │ │ │ ├── linear.py │ │ │ ├── loss.py │ │ │ ├── module.py │ │ │ ├── normalization.py │ │ │ ├── padding.py │ │ │ ├── pooling.py │ │ │ ├── rmsnorm.py │ │ │ ├── rnn.py │ │ │ └── sparse.py │ │ └── parameter.py │ ├── optim │ │ ├── __init__.py │ │ ├── adam.py │ │ ├── optimizer.py │ │ └── sgd.py │ ├── requirements.txt │ ├── scheduler │ │ ├── __init__.py │ │ └── client │ │ │ ├── allclient.py │ │ │ ├── udpconn.py │ │ │ └── unixsocket.py │ ├── setup.py │ ├── tensor │ │ ├── 
__init__.py │ │ ├── changeshape.py │ │ ├── elementwise.py │ │ ├── init.py │ │ ├── io.py │ │ ├── matmul.py │ │ ├── reduce.py │ │ ├── shape.py │ │ └── tensor.py │ ├── transformer │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── decoder.py │ │ ├── modeling_rope_utils.py │ │ └── models │ │ │ ├── __init__.py │ │ │ └── llama │ │ │ ├── __init__.py │ │ │ ├── attention.py │ │ │ ├── embedding.py │ │ │ ├── groupedquery_attention.py │ │ │ ├── mlp.py │ │ │ ├── modeling_llama.py │ │ │ └── normalization.py │ └── utils │ │ ├── __init__.py │ │ ├── benchmark │ │ └── bench.py │ │ ├── checkpoint.py │ │ └── data │ │ ├── __init__.py │ │ ├── dataloader.py │ │ ├── dataset.py │ │ └── sampler.py │ ├── deepxutil │ ├── numpy │ │ ├── __init__.py │ │ └── io.py │ └── torch │ │ ├── __init__.py │ │ └── io.py │ ├── docs │ ├── api.rst │ ├── conf.py │ └── index.rst │ └── examples │ ├── 0_pyenv │ └── binsearch.py │ ├── 1_tensor │ ├── 1_clone.py │ ├── 1_copy.py │ ├── 1_new.py │ ├── 1_print.py │ ├── 2_newbig.py │ ├── 2_saveload.py │ └── getitem.py │ ├── 2_ir │ ├── 1_init_zeroones.py │ ├── 2_elementwise_add.py │ ├── 2_elementwise_bit.py │ ├── 2_elementwise_dropout.py │ ├── 2_elementwise_lessgreater.py │ ├── 2_elementwise_minmax.py │ ├── 2_elementwise_operator.py │ ├── 2_elementwise_sqrtlog.py │ ├── 2_elementwise_switchwhere.py │ ├── 3_matmul.py │ ├── 4_changeshape_broadcast.py │ ├── 4_changeshape_broadcast_add.py │ ├── 4_changeshape_concat.py │ ├── 4_changeshape_gather.py │ ├── 4_changeshape_repeat.py │ ├── 4_changeshape_reshape.py │ ├── 4_changeshape_transpose.py │ ├── 5_reduce_prod.py │ ├── 5_reduce_sum.py │ ├── 5_reduce_sum_keepdim.py │ ├── 6_tensorlife_to.py │ └── changeshape_repeat.py │ ├── 3_functional │ ├── activite_relu.py │ ├── activite_sigmoid.py │ ├── activite_swish.py │ ├── changeshape_broadcast.py │ ├── elementwise_dropout.py │ ├── elementwise_rsqrt.py │ ├── normalization_softmax.py │ └── reduce_mean.py │ ├── 3_module │ ├── 0_hg_tokenizer.py │ ├── 1_embedding.py │ └── 1_linear.py │ └── 
4_transformer │ └── llama │ ├── 1_llama_rmsnorm.py │ ├── llama_ │ ├── llama_rope.py │ └── llama_rope_torch.py ├── log.md ├── model ├── h5_deepx │ ├── h5_deepx │ │ ├── __init__.py │ │ ├── todeepx.py │ │ └── toh5.py │ ├── requirements.txt │ └── setup.py ├── onnx_deepx │ ├── README.md │ ├── onnx_deepx │ │ ├── __init__.py │ │ └── todeepx.py │ ├── requirements.txt │ └── setup.py └── safetensor_deepx │ ├── README.md │ ├── examples │ └── load_model.py │ ├── requirements.txt │ ├── safetensor_deepx.egg-info │ ├── PKG-INFO │ ├── SOURCES.txt │ ├── dependency_links.txt │ ├── requires.txt │ └── top_level.txt │ ├── safetensor_deepx │ ├── __init__.py │ └── loader.py │ └── setup.py ├── scheduler ├── README.md ├── autograd │ ├── __init__.py │ ├── function.py │ └── graph │ │ ├── _controlflownode.py │ │ ├── _datanode.py │ │ ├── _opnode.py │ │ ├── graph.py │ │ ├── graph_viz.py │ │ ├── node.py │ │ └── nodetype.py └── common │ ├── pass_register.cpp │ └── pass_register.hpp ├── todo ├── infer.py └── qwen2_infer.py └── tool └── deepxctl ├── .gitignore ├── cmd └── tensor │ ├── print.go │ └── tensor.go ├── go.mod ├── go.sum ├── main.go └── tensor ├── fp16.go ├── io.go ├── print.go └── tensor.go /.cursorrules: -------------------------------------------------------------------------------- 1 | Always respond in 中文 2 | 不要回答重复的内容(如我提问中的代码) 3 | 4 | 此项目名为deepx 5 | 项目路径为/home/lipeng/code/ai/deepx 6 | 项目分为3部分 7 | 1. 前端。python库的接口风格参考pytorch,其他语言如go,java,c,rust等,后续设计完善。 8 | 2. 调度器,待设计 9 | 3. 
执行器,使用c++,cuda,metal,omp simd等,实现不同excuter的算子的前向和反向 10 | 11 | 关于概念 12 | deepx.Tensor仅仅就是一个tensor,不像pytorch的tensor,一个tensor其实包含了自身和梯度2个tensor的数据 13 | 14 | 关于任何编程语言 15 | 注重设计函数时,通过多级的子函数,实现层级模块化分解 16 | 17 | 关于c++ 18 | 我的环境为ubuntu22,项目是c++17,使用cmake编译, 19 | 返回c++代码区分header和source文件 20 | 由于作者是c++新手,请仔细检查指针和引用,对deepx这种密集计算任务,不要使用智能指针,但注意内存泄漏,函数返回对象等 21 | 22 | 关于python 23 | 贴近pytorch的接口风格,不要增加任何注释,我会手动添加注释 24 | 25 | 关于doc目录 26 | 采用Sphinx构建,使用reStructuredText格式 -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/compile.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: 算子等价替换 3 | about: 以等价替换的方式,优化执行效率,新增其他能力支持 4 | title: '[deepx(compile)] ' 5 | labels: compile, 6 | assignees: '' 7 | --- 8 | 9 | ## 你的思路 10 | 11 | ## 影响组件 12 | 13 | ## 其他叙述 14 | 15 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/excuter.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: 执行引擎 3 | about: 按照给定计算图,负责存储、计算、网络传输的执行 4 | title: '[excuter] ' 5 | labels: excuter, 6 | assignees: '' 7 | --- 8 | 9 | ## 支持的硬件、操作系统 10 | 11 | ## 你的思路 12 | 13 | ## 其他叙述 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/operator.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: 算子新增、修改、删除 3 | about: 用于提交新的算子实现请求 4 | title: '[算子] ' 5 | labels: enhancement, operator 6 | assignees: '' 7 | --- 8 | 9 | ## 算子新增 10 | 该算子数学表达为 11 | 12 | ## 影响组件 13 | 14 | ### front 15 | 1. 16 | 2. 17 | 18 | ### 引擎 19 | 1. 20 | 2. 
21 | 22 | ## 其他叙述 23 | 24 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/py_deepx.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: deepx的python主库 3 | about: 优化deepx(python)的前端api 4 | title: '[deepx(python)] ' 5 | labels: python, 6 | assignees: '' 7 | --- 8 | 9 | ## 你的思路 10 | 11 | ## 影响组件 12 | 13 | ## 其他叙述 14 | 15 | -------------------------------------------------------------------------------- /.github/workflows/auto-merge.yml: -------------------------------------------------------------------------------- 1 | name: 自动合并PR 2 | 3 | on: 4 | workflow_run: 5 | workflows: ["Excuter/ompsimd-linux Build", "Excuter/cuda-linux Build"] # 列出您所有需要等待完成的CI工作流 6 | types: 7 | - completed 8 | branches: 9 | - main # 仅在针对main分支的PR上运行 10 | 11 | permissions: 12 | contents: write 13 | pull-requests: write 14 | 15 | jobs: 16 | auto-merge: 17 | runs-on: ubuntu-latest 18 | if: ${{ github.event.workflow_run.conclusion == 'success' }} 19 | steps: 20 | - name: 自动合并PR 21 | uses: pascalgn/automerge-action@v0.15.6 22 | env: 23 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 24 | MERGE_LABELS: "auto-merge,!work-in-progress,!do-not-merge" 25 | MERGE_METHOD: "squash" 26 | MERGE_COMMIT_MESSAGE: "自动合并: PR #{pullRequest.number} {pullRequest.title}" 27 | MERGE_FORKS: "true" 28 | MERGE_RETRIES: "6" 29 | MERGE_RETRY_SLEEP: "10000" 30 | UPDATE_LABELS: "auto-merge" 31 | UPDATE_METHOD: "rebase" -------------------------------------------------------------------------------- /.github/workflows/excuter-cppcommon.yml: -------------------------------------------------------------------------------- 1 | name: Excuter/cppcommon Build 2 | on: [push, pull_request] 3 | 4 | env: 5 | HIGHWAY_VERSION: 1.2.0 6 | 7 | jobs: 8 | build: 9 | strategy: 10 | matrix: 11 | os: [ubuntu-22.04] # 只保留 Ubuntu 12 | backend: [ompsimd] 13 | runs-on: ${{ matrix.os }} 14 | 15 | steps: 16 | - uses: actions/checkout@v4 17 | with: 
18 | fetch-depth: 0 19 | 20 | # 系统依赖安装 21 | - name: Install Dependencies (Ubuntu) 22 | if: matrix.os == 'ubuntu-22.04' 23 | env: 24 | DEBIAN_FRONTEND: noninteractive 25 | run: | 26 | sudo apt-get update 27 | sudo apt-get install -y \ 28 | build-essential \ 29 | cmake \ 30 | libopenblas-dev \ 31 | libyaml-cpp-dev \ 32 | libjemalloc-dev \ 33 | libgtest-dev \ 34 | clang \ 35 | git 36 | 37 | # 设置 ccache 38 | - name: Setup ccache 39 | uses: hendrikmuhs/ccache-action@v1.2 40 | 41 | # 构建缓存 42 | - name: Cache Build 43 | uses: actions/cache@v3 44 | with: 45 | path: | 46 | excuter/cpp-common/build 47 | ~/.ccache 48 | key: ${{ runner.os }}-build-${{ hashFiles('**/CMakeLists.txt') }} 49 | restore-keys: | 50 | ${{ runner.os }}-build- 51 | 52 | # 构建 cpp-common 库 53 | - name: Build Common Library 54 | run: | 55 | cd excuter/cpp-common 56 | mkdir -p build && cd build 57 | cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER_LAUNCHER=ccache .. 58 | cmake --build . --config Release -j$(nproc) 59 | -------------------------------------------------------------------------------- /.github/workflows/tool-deepxctl.yml: -------------------------------------------------------------------------------- 1 | name: Tool/deepxctl Build 2 | on: [push, pull_request] 3 | 4 | jobs: 5 | build: 6 | strategy: 7 | matrix: 8 | os: [ubuntu-22.04] # 只保留 Ubuntu 9 | go-version: [1.23.2] 10 | runs-on: ${{ matrix.os }} 11 | 12 | steps: 13 | - uses: actions/checkout@v4 14 | with: 15 | fetch-depth: 0 16 | 17 | # 系统依赖安装 18 | - name: 安装Go 19 | uses: actions/setup-go@v4 20 | with: 21 | go-version: ${{ matrix.go-version }} 22 | cache: true 23 | 24 | # 系统依赖安装 25 | - name: 安装依赖 (Ubuntu) 26 | env: 27 | DEBIAN_FRONTEND: noninteractive 28 | run: | 29 | sudo apt-get update 30 | sudo apt-get install -y git 31 | 32 | # 构建deepxctl工具 33 | - name: 构建deepxctl 34 | run: | 35 | cd tool/deepxctl 36 | go build -v -o deepxctl 37 | 38 | # 运行测试 39 | - name: 运行测试 40 | run: | 41 | cd tool/deepxctl 42 | ./deepxctl 43 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | **/build/ 3 | .idea 4 | **/.idea 5 | **/__pycache__/ 6 | **/dist/ 7 | **/egg.info/ 8 | front/py/deepx/deepx.egg-info/* 9 | *.pdf -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # DeepX 行为准则 2 | 3 | ## 我们的承诺 4 | 5 | 作为贡献者和维护者,我们承诺为每个人提供一个开放和欢迎的环境。 6 | 7 | ## 我们的标准 8 | 9 | 有助于创造积极环境的行为包括但不限于: 10 | 11 | - 使用友好和包容的语言 12 | - 尊重不同的观点和经验 13 | - 耐心地接受建设性的批评 14 | - 关注对社区最有利的事情 15 | - 友善对待其他社区成员 16 | 17 | 不可接受的行为包括但不限于: 18 | 19 | - 使用性化的语言或图像以及不受欢迎的性关注或挑逗 20 | - 捣乱/煽动/侮辱性/贬损的评论,人身攻击或政治攻击 21 | - 公开或私下的骚扰 22 | - 未经明确许可,发布他人的私人信息,如物理或电子地址 23 | - 其他可以合理地被认为不符合专业行为的行为 24 | 25 | ## 我们的责任 26 | 27 | 项目维护者有责任澄清可接受行为的标准,并应对任何不可接受的行为采取适当和公平的纠正措施。 28 | 29 | 项目维护者有权利和责任删除、编辑或拒绝与本行为准则不符的评论、提交、代码、wiki编辑、问题和其他贡献,并可暂时或永久禁止任何他们认为不适合、威胁、冒犯或有害的贡献者。 30 | 31 | ## 适用范围 32 | 33 | 当个人代表项目或其社区时,本行为准则适用于项目空间和公共空间。 34 | 35 | ## 执行 36 | 37 | 如有滥用、骚扰或其他不可接受的行为,请通过以下方式联系项目团队。所有投诉都将被审查和调查,并将导致认为必要和适当的回应。 38 | 39 | ## 联系信息 40 | 41 | 请通过 [您的联系信息] 联系我们。 42 | 43 | ## 归属 44 | 45 | 本行为准则改编自[贡献者公约](https://www.contributor-covenant.org),版本1.4。 -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # deepx 贡献指南 2 | 3 | deepx框架的发展,主要包括五大类方向 4 | 5 | + front: 新增模型、module、python类函数等 6 | + 中间层:包括计算图优化器,插件系统(自动KVcache系统),自动分布式化,栈tensor自动释放,自动Inplace化等操作 7 | + 新增或修改excuter 8 | + 增加或修改算子,进一步可以分为leaftensorfunc(不可分割的基础算子),fusedtensorfunc(融合算子) 9 | + 文档丰富: 10 | + 运维自动化方向 11 | 12 | 大家可以选择一个方向 13 | 14 | ## 步骤 15 | 16 | 第一次提交 17 | 1. Fork本仓库(github.com/array2d/deepx)的main分支,到你的github/yourname/deepx 18 | 2. 本地clone github/yourname/deepx 19 | 3. 
提交并推送您的更改到你的github:`git commit -m 'Add some feature'` 20 | 4. 创建一个Pull Request。 21 | 22 | 第N次提交 23 | 24 | 1. 保障你的本地和github/yourname/deepx中均已提pull request并得到merge 25 | 2. 在github/yourname/deepx中sync fork【危险操作,会删除你新增的代码】,拉取(github.com/array2d/deepx) main分支的最新代码 26 | 3. 本地clone github/yourname/deepx 27 | 4. 提交并推送您的更改到你的github:`git commit -m 'Add some feature'` 28 | 5. 创建一个Pull Request。 -------------------------------------------------------------------------------- /doc/.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .venv 3 | _build 4 | make.bat 5 | Makefile 6 | _static 7 | _templates -------------------------------------------------------------------------------- /doc/benchmark/broadcast.md: -------------------------------------------------------------------------------- 1 | 2 | [i] broadcastto [4i,2i,i] 3 | 4 | + 不开启omp 5 | 6 | | Size | Time (seconds) | mem | 7 | |------|------------------|--------| 8 | | 64 | 0.304582 | 8 MB | 9 | | 128 | 2.06795 | 64 MB | 10 | | 256 | 16.4505 | 512 MB | 11 | | 512 | 131.381 | 4 GB | 12 | 13 | + 开启omp 14 | 15 | 以下是整理后的表格,展示了不同大小的时间消耗: 16 | 17 | | Size | Time (seconds) | mem | 18 | |------|------------------|--------| 19 | | 64 | 0.062084 | 8 MB | 20 | | 128 | 0.132792 | 64 MB | 21 | | 256 | 1.21183 | 512 MB | 22 | | 512 | 8.89442 | 4 GB | 23 | 24 | -------------------------------------------------------------------------------- /doc/benchmark/matmul.md: -------------------------------------------------------------------------------- 1 | ## 矩阵乘法 2 | 3 | 平台 ubuntu 22.04 4 | cpu Intel(R) Core(TM) i9-14900K 5 | 内存 64GB 6 | 7 | | 矩阵大小 | 耗时(秒) | 内存占用 | 8 | |--------------|-----------|------------| 9 | | 64x64 | 0.000073 | 16 KB | 10 | | 128x128 | 0.007146 | 64 KB | 11 | | 256x256 | 0.002196 | 256 KB | 12 | | 512x512 | 0.007013 | 1 MB | 13 | | 1024x1024 | 0.027820 | 4 MB | 14 | | 2048x2048 | 0.058486 | 16 MB | 15 | | 4096x4096 | 0.249994 | 64 MB | 16 | | 8192x8192 | 1.973990 
| 256 MB | 17 | | 16384x16384 | 14.712000 | 1 GB | 18 | | 32768x32768 | 111.222000| 4 GB | 19 | 20 | 再大会段错误,待优化 21 | -------------------------------------------------------------------------------- /doc/benchmark/reduce.md: -------------------------------------------------------------------------------- 1 | ## reduce操作性能比较 2 | 3 | ### sum 4 | -------------------------------------------------------------------------------- /doc/deepxIR/readme.md: -------------------------------------------------------------------------------- 1 | ## 测试工具 2 | 3 | ### udp 4 | 5 | nc命令可以发送udp包 6 | 7 | ```bash 8 | nc -u 127.0.0.1 8080 9 | 10 | //然后,输入内容,回车,即可发送 11 | ``` 12 | 13 | ### unixsocket 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /doc/design.md: -------------------------------------------------------------------------------- 1 | # deepx默认原则 2 | 3 | ## 一.DeepxIR 4 | 5 | ### 1.deepIR结构 6 | ``` 7 | deepIR{ 8 | Meta{ 9 | int id 10 | string author 11 | } meta 12 | string name 13 | []Param args 14 | []Param returns 15 | } 16 | ``` 17 | 18 | excuter执行deepxIR的规则 19 | 20 | + excuter执行deepxIR时,不得修改args中的tensor 21 | + 但deepIR不限制args和returns中的Param同名,这样可以实现类似inplace的操作 22 | 23 | 24 | ## front/python规则 25 | 26 | ### 1.命名规则 27 | + inplace操作的函数,其名为_后缀, 返回值为空 28 | + 非inplace操作的函数,其名无_后缀 29 | -------------------------------------------------------------------------------- /doc/excuter/deepx.op.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/doc/excuter/deepx.op.jpg -------------------------------------------------------------------------------- /doc/excuter/excuter.md: -------------------------------------------------------------------------------- 1 | ## 如何给excuter添加一个新算子 2 | 3 | ### 层次结构图 4 | 5 | ![层次结构图](./deepx.op.drawio.svg) 6 | 7 | 8 | #### TensorFunction 9 | 10 | 
顾名思义,TensorFunction是操作Tensor的函数,可以是c++函数,也可以是python函数,cuda函数等。 11 | 12 | #### TensorFunction 特定精度特化,或混合精度实现 13 | 14 | 15 | #### Op 16 | 17 | Op是excuter的算子,是excuter的执行单元 18 | 19 | 在程序中,Op是基类,不同的Op有不同的实现,比如Add, Mul, MatMul等。 20 | 每个Op都需要override forward和backward函数 21 | 22 | 对同一个功能的Op如Matmul,可以有多种作者的实现 23 | 24 | Matmul会选择选择一个默认的实现 25 | 26 | 或者由MatmulOp的name属性来指定具体author的实现 27 | 28 | 29 | ### 具体步骤 30 | 31 | git clone https://github.com/deepx-org/deepx.git 32 | 33 | #### 1.cpu执行器 34 | cd deepx/excuter/op-mem-ompsimd 35 | 36 | 需要提前安装好依赖 37 | + highway需要源码安装 38 | + omp,openmp库 39 | + yaml-cpp 40 | make build && cd build && cmake .. && make 41 | 42 | 你可以在test目录下,验证或添加测试用例 43 | 44 | 45 | #### 2.cuda执行器 46 | cd deepx/excuter/op-mem-cuda 47 | 48 | 需要提前安装好依赖 49 | + cuda 50 | + cublas 51 | + yaml-cpp 52 | 53 | make build && cd build && cmake .. && make 54 | 55 | 56 | #### 3.jax执行器 57 | 58 | todo 59 | 60 | 61 | #### 4.front对接测试 62 | 63 | 1.先启动excuter可执行文件, 位于excuter/op-mem-{cuda/ompsimd}/build,可执行文件名同excuter名 64 | 2.然后测试front中py的对应算子脚本(front/py/examples 目录) 65 | 66 | 可以按照顺序,以此测试 67 | 68 | 1_tensor 69 | 70 | 2_ir 71 | 72 | 3_functional 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /doc/excuter/mix_precision.md: -------------------------------------------------------------------------------- 1 | # mix precision 2 | 3 | ## 1. 什么是 mix precision 4 | 5 | mix precision 是一种混合精度训练方法,它使用 16 位浮点数和 8 位整数来训练模型,从而在保持模型精度的同时,减少显存占用和计算时间。 6 | 7 | ## 2. 为什么需要 mix precision 8 | 9 | 在深度学习中,模型通常使用 32 位浮点数进行训练,这样可以确保模型的精度。但是,32 位浮点数占用的显存较大,计算时间较长。因此,为了减少显存占用和计算时间,可以使用 mix precision 训练方法。 10 | 11 | ## 3. 
关于excuter的mix precision的实现 12 | 13 | 如: 14 | 15 | matmul(A[float16],B[float16])->C[float32] //author=miaobyte id=1 create_time=1714512000 send_time=1714512000 16 | 17 | 我们在opfactory中,把实际参数用占位符替换,注册为 18 | 19 | matmul[authora] Tensor@float16 Tensor@float16 -> Tensor@float32 20 | 21 | 如: 22 | 23 | matmul[authora] A@float16 b@float16 -> C@float32 24 | 25 | 同样,在opfactory中,把实际参数用占位符替换,注册为 26 | 27 | muladd[authora] Tensor@float16 Scalar@float32-> Tensor@float16 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /doc/excuter/op-mem-cuda/cublaslt/api.md: -------------------------------------------------------------------------------- 1 | + cublasLtMatmul() 2 | 支持部分低精度 3 | 4 | + cublasLtMatmulEx() 5 | 6 | + cublasLtMatmulBatched() 7 | 8 | -------------------------------------------------------------------------------- /doc/excuter/op-mem-ompsimd/contribute.md: -------------------------------------------------------------------------------- 1 | ## excuter 2 | 3 | ### op-mem-ompsimd 4 | 5 | ompsimd是DeepX框架的cpu执行器进程 6 | 7 | + 采用OMP+SIMD加速tensor计算 8 | + double和float采用openblas加速 9 | + 使用了jemalloc内存池管理内存。 10 | 11 | 12 | #### 1. 安装依赖 13 | 14 | 安装apt依赖 15 | 16 | ``` 17 | sudo apt-get update 18 | sudo apt-get install -y \ 19 | build-essential \ 20 | cmake \ 21 | libopenblas-dev \ 22 | libyaml-cpp-dev \ 23 | libjemalloc-dev \ 24 | libgtest-dev \ 25 | clang \ 26 | git 27 | ``` 28 | 29 | 源码依赖安装 30 | 31 | ``` 32 | sudo apt-get install -y libgtest-dev 33 | 34 | # 克隆 Highway 35 | git clone --depth 1 --branch ${HIGHWAY_VERSION} https://github.com/google/highway.git 36 | cd highway 37 | mkdir -p build && cd build 38 | 39 | # 使用标准的 CMake 构建流程 40 | cmake .. 
\ 41 | -DCMAKE_BUILD_TYPE=Release \ 42 | -DBUILD_SHARED_LIBS=ON \ 43 | -DHWY_SYSTEM_GTEST=ON \ 44 | -DHWY_ENABLE_TESTS=OFF 45 | 46 | # 构建和安装 47 | make -j$(nproc) 48 | sudo make install 49 | sudo ldconfig # 更新动态链接库缓存 50 | 51 | # 确保头文件正确安装 52 | sudo cp -r ../hwy /usr/local/include/ 53 | ``` 54 | 55 | #### 2. 开发环境 56 | 57 | c++ 17 58 | 59 | -------------------------------------------------------------------------------- /doc/excuter/op-mem-ompsimd/range.md: -------------------------------------------------------------------------------- 1 | ## excuter 2 | 3 | ### op-mem-ompsimd 4 | 5 | #### cpu的range算子辅助函数 6 | 7 | range函数是shape类中的一个函数,用于根据shape对tensor进行omp线程并行遍历的方式 8 | 9 | 定义和实现分别在: 10 | 11 | excuter/common/src/deepx/shape.hpp 12 | 13 | excuter/common/src/deepx/shape_range.cpp 14 | 15 | | func | omp并行 | omp线程local局部对象 | 调用场景 | 16 | | ---- | ---- | ------ | ---------- | 17 | | | N | | print | 18 | | 函数 | 否 | 0 | 不需要并行 | 19 | | 函数 | 是 | 0 | 需要并行 | 20 | | 函数 | 否 | 0 | 不需要并行 | 21 | -------------------------------------------------------------------------------- /doc/excuter/welcome.md: -------------------------------------------------------------------------------- 1 | **DeepX高性能算子开发英雄帖** 2 | ——**挑战算力极限,定义下一代AI基础设施** 3 | 4 | --- 5 | 6 | ### **🔥 我们是谁?** 7 | **DeepX**——致力于打造**原生分布式并行**的深度学习训练推理一体化框架,以**极致性能**和**全场景覆盖**为目标! 8 | - **性能追求者**:深耕算子优化,目标达到业界一流水平 9 | - **异构计算先锋**:CUDA/Metal/沐熙/昇腾... 打造全平台支持 10 | - **开源共建者**:开放、透明的开发模式,欢迎全球开发者参与 11 | 12 | --- 13 | 14 | ### **⚡ 招募令:算力世界的角斗士** 15 | **如果你**: 16 | - 手握CUDA/Metal优化绝技,却苦于没有**工业级战场** 17 | - 精通硬件指令集,渴望打造**教科书级算子实现** 18 | - 梦想代码在**千万级GPU集群**上奔腾 19 | 20 | **加入我们,你将**: 21 | ✅ 挑战**纳米级指令优化**,与硬件共舞 22 | ✅ 设计**分布式算子原语**,定义行业标准 23 | ✅ 打造**训练-推理一体化**的终极架构 24 | 25 | --- 26 | 27 | ### **🏆 巅峰对决:算子性能挑战赛** 28 | **期待以下领域高手**: 29 | 30 | #### **1. CUDA核弹专家(NVIDIA全系)** 31 | - 战场:Ampere/Hopper架构深度调优 32 | - 必杀技: 33 | - Tensor Core极限压榨 34 | - Warp级同步黑魔法 35 | - 显存带宽利用率≥95% 36 | 37 | #### **2. 
Metal刀锋战士(Apple Silicon)** 38 | - 战场:M1/M2/M3系列芯片 39 | - 必杀技: 40 | - Metal Performance Shaders魔改 41 | - 苹果神经引擎(NE)指令直通 42 | - Unified Memory架构颠覆性优化 43 | 44 | #### **3. 异架构开荒者(沐熙/寒武纪/昇腾等)** 45 | - 战场:国产算力芯片深水区 46 | - 必杀技: 47 | - 自定义指令集破解 48 | - 存算一体架构适配 49 | - 自主IP核驱动开发 50 | 51 | --- 52 | 53 | ### **💎 你将获得** 54 | - **技术突破**:参与前沿AI基础设施开发 55 | - **开源贡献**:代码将服务广大开发者社区 56 | - **收益共享**:优秀算子作者将获得项目未来商业化的分成机会 57 | - **荣誉认可**:优秀贡献者将被记入项目贡献者名单 58 | 59 | --- 60 | 61 | ### **⚔️ 申请方式** 62 | **加入我们的两个步骤**: 63 | 64 | #### **步骤一:提交初始方案** 65 | 1. 访问 github.com/array2d.com 的 todo 目录 66 | 2. 提交 PR,详细描述: 67 | - 你的优化想法 68 | - 具体实现方案 69 | - 预期性能提升 70 | 3. PR通过后,我们会邀请你加入技术讨论群 71 | 72 | #### **步骤二:提交算子实现** 73 | 1. 提交算子代码 PR 74 | 2. 进行全面性能测试和验证 75 | 3. 通过验证后,你的算子将被合并进主分支 76 | 77 | --- 78 | 79 | **让我们一起,用代码构建AI基础设施的未来!** 🔥🚀 80 | 81 | > "在DeepX,我们追求的是每一个算子的极致优化" 82 | > —— DeepX开源项目发起人 李鹏 83 | 84 | --- 85 | **即刻加入,成为这个激动人心项目的重要一员!** 🚀 86 | -------------------------------------------------------------------------------- /doc/front/deepx.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/doc/front/deepx.jpg -------------------------------------------------------------------------------- /doc/front/front.md: -------------------------------------------------------------------------------- 1 | 2 | # deepx前端 3 | 4 | ## 对应关系 5 | 6 | | 前端 | pytorch | tensorflow | deepx| 7 | | --- | --- | --- | --- | 8 | | tensor库 | ATen | TensorFlow | deepx/tensorfunc | 9 | | 算子(支持forward和backward) | torch.nn.functional | ? 
| deepx/op | 10 | | 计算图子图| torch.nn.Module | tensorflow.nn.Module | deepx.nn.Module | 11 | | 抽象计算图 | torch.fx.graph.Graph | ?| deepx.nn.Graph | 12 | | 执行计算图| torch._inductor.graph.GraphLowering | tensorflow.Graph | deepx.nn.Graph | -------------------------------------------------------------------------------- /doc/front/graph.md: -------------------------------------------------------------------------------- 1 | # 计算图 2 | 3 | 4 | ## 抽象计算图 5 | 6 | 抽象计算图是计算图的抽象表示,它描述了计算的整体逻辑结构。 7 | 8 | ## 执行计算图 9 | 10 | 执行计算图是计算图的实际执行过程,它描述了计算的详细具体执行过程。 11 | 12 | 13 | 自动tensor并行 14 | 15 | + 根据tensor的shape和dtype,对tensor进行split,分解为n个小tensor 16 | + 对每个小tensor,调度到不同的存算执行器上进行计算 17 | + 根据tensor的shape和dtype,对tensor进行concat 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /doc/front/node.md: -------------------------------------------------------------------------------- 1 | # Node,计算图的设计思考 2 | 3 | ## 概念 4 | 5 | pytorch的计算图是动态的,tensorflow早期的计算图是静态的。 6 | 7 | pytorch在前向传播时,会构建一个计算图,在反向传播时,会根据计算图进行反向传播。 8 | 9 | 10 | ## Graph结构 11 | 12 | Node{ 13 | froms []*Node 14 | tos []*Node 15 | } 16 | 17 | Graph结构可以支持Residual的跳跃Node连接 18 | 19 | ## Tree结构 20 | 21 | Tree{ 22 | parent *Node 23 | children []*Node 24 | } 25 | 26 | Tree结构需要特别的实现Residual的跳跃Node连接 27 | 28 | Residual可以把跳跃连接的Node打平,都作为Residual的childs 29 | 30 | ## Deepx的设计实现 31 | 优先考虑Tree这种静态图结构,如果需要支持Residual的跳跃Node连接,可以在forward和backward中特别的实现。 -------------------------------------------------------------------------------- /doc/front/op.md: -------------------------------------------------------------------------------- 1 | # 基础算子 2 | 算术运算 3 | 一元运算:Abs, Acos, Acosh, Asin, Asinh, Atan, Atanh, Ceil, Cos, Cosh, Erf, Exp, Floor, Log, Neg, Reciprocal, Sign, Sin, Sinh, Sqrt, Tan, Tanh 4 | 二元运算:Add, Div, Mul, Pow, Sub 5 | 比较运算:Equal, Greater, GreaterOrEqual, Less, LessOrEqual, Not 6 | 逻辑运算:And, Or, Xor, BitwiseAnd, BitwiseNot, BitwiseOr, BitwiseXor, BitShift 7 | 激活函数:Elu, Gelu, HardSigmoid, 
HardSwish, Hardmax, LeakyRelu, Mish, PRelu, Relu, Selu, Sigmoid, Softmax, Softplus, Softsign, ThresholdedRelu 8 | 数据变换 9 | 形状变换:Cast, CastLike, Flatten, Reshape, Squeeze, Transpose, Unsqueeze 10 | 元素选择与索引:ArgMax, ArgMin, Gather, GatherElements, GatherND, Scatter, ScatterElements, ScatterND, Slice, TopK 11 | 数据生成:Constant, ConstantOfShape, EyeLike, Range, RandomNormal, RandomNormalLike, RandomUniform, RandomUniformLike 12 | 池化操作 13 | 普通池化:AveragePool, GlobalAveragePool, GlobalLpPool, GlobalMaxPool, LpPool, MaxPool, Mean, Min 14 | 特殊池化:MaxRoiPool, MaxUnpool, SpaceToDepth, DepthToSpace 15 | 归一化操作:BatchNormalization, GroupNormalization, InstanceNormalization, LayerNormalization, LpNormalization, MeanVarianceNormalization 16 | 统计运算:CumSum, ReduceL1, ReduceL2, ReduceLogSum, ReduceLogSumExp, ReduceMax, ReduceMean, ReduceMin, ReduceProd, ReduceSum, ReduceSumSquare 17 | 张量操作:Concat, ConcatFromSequence, Split, SplitToSequence, Expand, Pad, Resize, ReverseSequence, Shrink, Tile, Where 18 | 类型判断:IsInf, IsNaN 19 | 其他:Identity, OneHot, SequenceAt, SequenceConstruct, SequenceEmpty, SequenceErase, SequenceInsert, SequenceLength, SequenceMap, Shape, Size, StringConcat, StringNormalizer, StringSplit 20 | 融合算子 21 | 神经网络层:Conv, ConvInteger, ConvTranspose, DeformConv, GRU, LSTM, RNN, QLinearConv, QLinearMatMul 22 | 损失函数:NegativeLogLikelihoodLoss, SoftmaxCrossEntropyLoss 23 | 其他融合操作:AffineGrid, CenterCropPad, Col2Im, Compress, DFT, ImageDecoder, Loop, NonMaxSuppression, Optional, OptionalGetElement, OptionalHasElement, RegexFullMatch, Scan, TfIdfVectorizer, Upsample 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /doc/front/py/about.md: -------------------------------------------------------------------------------- 1 | ### deepx/front/py 2 | 3 | deepx-py库是DeepX框架的Python库,方便用户搭建深度学习模型,输出计算图,主要用于深度学习模型的开发和训练。 4 | 5 | #### 设计理念 6 | 7 | + deepx并不像pytorch那样,追求python first,而是为了原生分布式和并行,约束python的灵活性。 8 | + 
deepx的使用风格,基本贴近pytorch。尽量能做到 import deepx as torch,依然能正确的run起来 9 | + deepx的py进程,不参与tensor计算,但会参与一些简单的shape计算 10 | 11 | #### 待定 12 | 13 | -------------------------------------------------------------------------------- /doc/front/py/contribute.md: -------------------------------------------------------------------------------- 1 | ## front 2 | 3 | ### py 4 | 5 | deepx-py库是DeepX框架的Python库,方便用户搭建深度学习模型,输出计算图,主要用于深度学习模型的开发和训练。 6 | 7 | #### 1. 安装依赖 8 | 9 | deepx-py库依赖: 10 | 11 | ``` 12 | pip install graphviz 13 | ``` 14 | 15 | #### 2. 开发环境 16 | 17 | deepx-py库的开发环境是: 18 | 19 | python 3.8+ 20 | 21 | -------------------------------------------------------------------------------- /doc/front/py/deepx.rst: -------------------------------------------------------------------------------- 1 | DeepX Python 前端 2 | =============== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | :caption: Python API 7 | 8 | about 9 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. DeepX documentation master file, created by 2 | sphinx-quickstart on Tue Mar 4 12:21:01 2025. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | DeepX 原生分布式并行的深度学习训练推理一体框架 7 | ======================================= 8 | 9 | .. toctree:: 10 | :maxdepth: 4 11 | :caption: tutorials 教程 12 | 13 | benchmark/matmul 14 | benchmark/broadcast 15 | 16 | .. toctree:: 17 | :maxdepth: 6 18 | :caption: doc 文档 19 | 20 | front/py/deepx/about 21 | excuter/op-mem-ompsimd/list 22 | deepxIR/ir 23 | 24 | .. 
toctree:: 25 | :maxdepth: 6 26 | :caption: contrib 贡献指南 27 | 28 | front/py/contribute 29 | scheduler/scheduler 30 | excuter/excuter 31 | excuter/op-mem-ompsimd/contribute 32 | excuter/op-mem-ompsimd/range 33 | 34 | 索引和搜索 35 | ========== 36 | 37 | * :ref:`genindex` 38 | * :ref:`search` 39 | 40 | Add your content using ``reStructuredText`` syntax. See the 41 | `reStructuredText `_ 42 | documentation for details. 43 | 44 | 45 | .. toctree:: 46 | :maxdepth: 2 47 | :caption: Contents: 48 | -------------------------------------------------------------------------------- /doc/language.md: -------------------------------------------------------------------------------- 1 | ## c++:计算执行器(excuter) 2 | 3 | 负责实现tensor的具体计算过程,对接硬件如GPU、CPU的simd指令 4 | 5 | 除了c++,也就只有编译器能干这样的脏活累活了 6 | 7 | deepx用到了以下库,都是c++是实现 8 | 9 | cblas 10 | openmp 11 | c++可以和汇编结合,从而最大程度发挥cpu、gpu寄存器的性能 12 | 13 | cuda是c++的语言子集,也可以看作是c++ 14 | 15 | 16 | ## python:模型前端构建 17 | python提供了类似pytorch的库,便于调试和验证模型算法 18 | 19 | deepx/tensor/ 20 | deepx/nn/deepxIR 21 | deepx/nn.module/ 22 | deepx/nn.functional 23 | 通过这些库,我们可以快速的搭建一个模型结构 24 | 25 | ## golang:运维、监控、分布式,深度学习训推自动化的维护者 26 | 27 | 与pytorch、tensorflow不同,deepx追求分布式过程自动化,因此python侧不参与分布式 28 | 29 | deepxctl:提供对deepx体系的所有工具、库、模型、镜像的统一纳管 30 | 31 | 32 | 33 | ## deepxIR 34 | 虽然deepxIR不是独立的编程语言,但是deepx体系的程序格式标准 35 | 36 | excuter所执行的内容,就是deepxir的序列或deepxir计算图 37 | 38 | https://github.com/array2d/deepx/blob/main/doc/excuter/op-mem-cuda/list.md 39 | 40 | deepxir分为3类 41 | 42 | 计算:tensor这些系列elementwise、changeshape、tensorlife、io、reduce、init 43 | 指令结构: 44 | queue[deepxIR],串行指令,有前后执行顺序 45 | parallel[deepxIR],可并行的指令,无顺序依赖,可并行 46 | 以上指令为静态图所需的指令,运行过程是确定的。 47 | 48 | 分支:goto、ifelse 49 | 分支指令会让计算图行为不可预测,也就是动态部分 50 | 51 | 控制:parse、run等特殊自定义指令 52 | 控制指令是deepx分布式系统内置的各个组件控制指令 -------------------------------------------------------------------------------- /doc/scheduler/scheduler.md: -------------------------------------------------------------------------------- 1 | ### scheduler 2 | 3 | 
DeepX框架的scheduler,是front和excuter之间的桥梁。 4 | 5 | front只负责搭建抽象计算图,excuter负责执行算子,而scheduler负责将抽象计算图转换为执行计算图,并发送给excuter。 6 | 7 | #### 算子注册器 8 | 9 | 算子注册器,接收excuter的算子及精度列表。 10 | 11 | 12 | #### 调度器 13 | 14 | scheduler将实现以下能力: 15 | 16 | + 根据计算图的依赖关系,确定算子的执行顺序。 17 | + 算子融合。抽象计算图都是由最基础的算子组成,而执行计算图可以由多个基础算子融合而成。 18 | + 算子消除。根据数学链式法则,有些算子可以相互抵消,如log和exp,mul和div,add和sub。 19 | + TP:tensor 并行,tensor自动拆分计算 20 | + PP:pipeline 并行,包括 dual-mode:前向和后向 21 | + MP:model 并行,模型自动拆分计算 22 | + DP:data 并行,多路batch并行训练 23 | 24 | 25 | -------------------------------------------------------------------------------- /excuter/cpp-common/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.15...3.29) 2 | project(deepx-common LANGUAGES CXX ) 3 | 4 | # 设置 C++ 标准 5 | set(CMAKE_CXX_STANDARD 17) 6 | set(CMAKE_CXX_STANDARD_REQUIRED True) 7 | # 设置编译优化 8 | 9 | set(CMAKE_BUILD_TYPE Debug) 10 | # 设置 SIMD 编译选项 11 | 12 | # 包含头文件目录 13 | include_directories(src) 14 | 15 | # 源文件 16 | 17 | file(GLOB_RECURSE DEEPX_COMMON_SOURCES "src/*.cpp") 18 | 19 | add_library(deepx_common SHARED 20 | ${DEEPX_COMMON_SOURCES} 21 | ) 22 | 23 | 24 | find_package(yaml-cpp REQUIRED) 25 | 26 | target_link_libraries(deepx_common 27 | PUBLIC 28 | yaml-cpp 29 | ) 30 | 31 | target_include_directories(deepx_common PUBLIC 32 | $ 33 | $ 34 | ) 35 | 36 | 37 | add_subdirectory(test) 38 | -------------------------------------------------------------------------------- /excuter/cpp-common/src/client/udpserver.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __CLIENT_UDPSERVER_HPP__ 2 | #define __CLIENT_UDPSERVER_HPP__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "deepx/tf/tf.hpp" 13 | namespace client{ 14 | using namespace std; 15 | class udpserver 16 | { 17 | private: 18 | int port; 19 | int sockfd; 20 | struct sockaddr_in servaddr,cliaddr; 21 | 
char buffer[1024]; 22 | socklen_t len; 23 | ssize_t n; 24 | public: 25 | udpserver(int port); 26 | ~udpserver(); 27 | void start(queue &tasks); 28 | using handlefunc = std::function; 29 | handlefunc func; 30 | void resp(string str); 31 | }; 32 | } 33 | 34 | #endif -------------------------------------------------------------------------------- /excuter/cpp-common/src/client/unixsocketserver.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __CLIENT_UNIXSOCKETSERVER_HPP__ 2 | #define __CLIENT_UNIXSOCKETSERVER_HPP__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | namespace client 12 | { 13 | class unixsocketserver 14 | { 15 | private: 16 | std::string socket_path; 17 | int sockfd; 18 | struct sockaddr_un servaddr, cliaddr; // 修改为使用完整类型 19 | char* buffer; // 改为指针类型 20 | const int buffer_size; // 新增缓冲区大小成员 21 | socklen_t len; 22 | ssize_t n; 23 | 24 | public: 25 | unixsocketserver(const std::string &path, const int buffersize); 26 | ~unixsocketserver(); 27 | void start(); 28 | using handlefunc = std::function; 29 | handlefunc func; 30 | }; 31 | } 32 | 33 | #endif -------------------------------------------------------------------------------- /excuter/cpp-common/src/client/worker.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __WORKER_HPP__ 2 | #define __WORKER_HPP__ 3 | 4 | 5 | namespace client{ 6 | 7 | } 8 | #endif 9 | -------------------------------------------------------------------------------- /excuter/cpp-common/src/deepx/shape_matmul.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "deepx/shape_matmul.hpp" 4 | 5 | namespace deepx 6 | { 7 | Shape matmul_shape(const Shape &A, const Shape &B) 8 | { 9 | if (A.dim() < 2 || B.dim() < 2) 10 | { 11 | throw std::invalid_argument("A and B must >= 2D tensors"); 12 | } 13 | if (A[-1] != B[-2]) 14 | { 15 | throw 
std::invalid_argument("A[-1] must be equal to B[-2]"); 16 | } 17 | std::vector resultshape(A.dim()); 18 | std::copy(A.shape.begin(), A.shape.begin() + A.dim(), resultshape.begin()); 19 | Shape result(resultshape); 20 | result[-1] = B[-1]; 21 | return result; 22 | } 23 | } -------------------------------------------------------------------------------- /excuter/cpp-common/src/deepx/shape_matmul.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DEEPX_SHAPE_MATMUL_HPP 2 | #define DEEPX_SHAPE_MATMUL_HPP 3 | 4 | #include "deepx/shape.hpp" 5 | 6 | namespace deepx 7 | { 8 | Shape matmul_shape(const Shape &A, const Shape &B); 9 | } 10 | 11 | #endif // DEEPX_SHAPE_MATMUL_HPP -------------------------------------------------------------------------------- /excuter/cpp-common/src/deepx/shape_reduce.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DEEPX_SHAPE_SUM_HPP 2 | #define DEEPX_SHAPE_SUM_HPP 3 | 4 | #include "deepx/shape.hpp" 5 | 6 | namespace deepx 7 | { 8 | 9 | // 检查dims参数是否合法,返回整理后的dims 10 | std::vector checkedDims(const std::vector &inputshape, const std::vector &dims); 11 | 12 | // 返回求和后的形状 13 | std::vector reducedShape(const std::vector &inputshape, const std::vector &dims, const bool keepdim = false); 14 | 15 | // 返回需要求和的维度 16 | std::vector reducedDim(const std::vector &inputshape, const std::vector &dims ); 17 | } 18 | 19 | #endif // DEEPX_SHAPE_SUM_HPP -------------------------------------------------------------------------------- /excuter/cpp-common/src/deepx/shape_tensorinit.cpp: -------------------------------------------------------------------------------- 1 | #include "deepx/shape_tensorinit.hpp" 2 | 3 | namespace deepx 4 | { 5 | std::pair calculateFanInAndFanOut(const Shape &shape) 6 | { 7 | int fanIn, fanOut; 8 | if (shape.dim() < 2) 9 | { 10 | fanIn = 1; 11 | fanOut = 1; 12 | return std::make_pair(fanIn, fanOut); 13 | } 14 | 15 | int numInputFmaps = 
shape[1]; // 输入特征图数量 16 | int numOutputFmaps = shape[0]; // 输出特征图数量 17 | int receptiveFieldSize = 1; 18 | if (shape.dim() > 2) 19 | { 20 | for (int i = 2; i < shape.dim(); ++i) 21 | { 22 | receptiveFieldSize *= shape[i]; // 计算感受野大小 23 | } 24 | } 25 | 26 | fanIn = numInputFmaps * receptiveFieldSize; 27 | fanOut = numOutputFmaps * receptiveFieldSize; 28 | return std::make_pair(fanIn, fanOut); 29 | } 30 | 31 | } -------------------------------------------------------------------------------- /excuter/cpp-common/src/deepx/shape_tensorinit.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DEEPX_SHAPE_TENSORINIT_HPP 2 | #define DEEPX_SHAPE_TENSORINIT_HPP 3 | 4 | #include "deepx/shape.hpp" 5 | 6 | namespace deepx 7 | { 8 | std::pair calculateFanInAndFanOut(const Shape &shape); 9 | } 10 | 11 | #endif // DEEPX_SHAPE_TENSORINIT_HPP -------------------------------------------------------------------------------- /excuter/cpp-common/src/deepx/tensorbase.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DEEPX_TENSORBASE_HPP 2 | #define DEEPX_TENSORBASE_HPP 3 | 4 | #include "deepx/shape.hpp" 5 | 6 | namespace deepx 7 | { 8 | 9 | struct TensorBase 10 | { 11 | Shape shape; 12 | TensorBase() = default; 13 | // 拷贝构造函数 14 | TensorBase(const TensorBase &other) 15 | { 16 | shape = other.shape; 17 | } 18 | 19 | // 移动构造函数 20 | TensorBase(TensorBase &&other) noexcept 21 | { 22 | shape = std::move(other.shape); 23 | } 24 | 25 | // 拷贝赋值运算符 26 | TensorBase &operator=(const TensorBase &other) 27 | { 28 | if (this != &other) 29 | { 30 | shape = other.shape; 31 | } 32 | return *this; 33 | } 34 | 35 | // 移动赋值运算符 36 | TensorBase &operator=(TensorBase &&other) noexcept 37 | { 38 | if (this != &other) 39 | { 40 | shape = std::move(other.shape); 41 | } 42 | return *this; 43 | } 44 | }; 45 | 46 | } 47 | #endif 48 | -------------------------------------------------------------------------------- 
/excuter/cpp-common/src/deepx/tensorfunc/authors.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DEEPX_TENSORFUNC_AUTHORS_HPP 2 | #define DEEPX_TENSORFUNC_AUTHORS_HPP 3 | 4 | #include "string" 5 | 6 | namespace deepx::tensorfunc{ 7 | using namespace std; 8 | class default_{ 9 | public: 10 | static std::string name() { return "default"; } 11 | }; 12 | 13 | class miaobyte{ 14 | public: 15 | static std::string name() { return "miaobyte"; } 16 | }; 17 | 18 | class cblas{ 19 | public: 20 | static std::string name() { return "cblas"; } 21 | }; 22 | 23 | class cublas{ 24 | public: 25 | static std::string name() { return "cublas"; } 26 | }; 27 | } 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /excuter/cpp-common/src/deepx/tensorfunc/io.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DEEPX_TENSORFUNC_IO_HPP 2 | #define DEEPX_TENSORFUNC_IO_HPP 3 | 4 | #include "deepx/tensor.hpp" 5 | #include "stdutil/fs.hpp" 6 | 7 | namespace deepx::tensorfunc{ 8 | 9 | template 10 | struct printDispatcher{ 11 | static void print(const Tensor &t, const std::string &f="")=delete; 12 | }; 13 | 14 | template 15 | void print(const Tensor &t, const std::string &f=""){ 16 | printDispatcher::print(t, f); 17 | } 18 | 19 | 20 | 21 | 22 | } 23 | 24 | #endif // DEEPX_TENSORFUNC_IO_HPP 25 | -------------------------------------------------------------------------------- /excuter/cpp-common/src/deepx/tensorfunc/matmul.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DEEPX_TENSORFUNC_MATMUL_HPP 2 | #define DEEPX_TENSORFUNC_MATMUL_HPP 3 | 4 | #include "deepx/tensor.hpp" 5 | #include "deepx/tensorfunc/authors.hpp" 6 | #include "stdutil/error.hpp" 7 | namespace deepx::tensorfunc 8 | { 9 | bool check_matmul_shape(const Shape &a, const Shape &b) 10 | { 11 | if (a[-1] != b[-2]) 12 | { 13 | return false; 14 | } 
15 | if (a.dim() != b.dim()) 16 | { 17 | return false; 18 | } 19 | for (int i = 0; i < a.dim() - 2; ++i) 20 | { 21 | if (a[i] != b[i]) 22 | { 23 | return false; 24 | } 25 | } 26 | return true; 27 | } 28 | 29 | template 30 | struct matmulDispatcher 31 | { 32 | static void matmul(const Tensor &A, const Tensor &B, Tensor &C) 33 | { 34 | throw NotImplementError("matmul"); 35 | } 36 | }; 37 | 38 | template 39 | void matmul(const Tensor &A, const Tensor &B, Tensor &C) 40 | { 41 | matmulDispatcher::matmul(A, B, C); 42 | } 43 | } 44 | 45 | #endif 46 | -------------------------------------------------------------------------------- /excuter/cpp-common/src/deepx/tensorfunc/tensorlife.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DEEPX_TENSORFUNC_TENSORLIFE_HPP 2 | #define DEEPX_TENSORFUNC_TENSORLIFE_HPP 3 | 4 | #include "deepx/tensor.hpp" 5 | 6 | namespace deepx::tensorfunc 7 | { 8 | //New 9 | template < typename T> 10 | Tensor New(const std::vector &shape); 11 | 12 | template 13 | Tensor New(const std::initializer_list &shape){ 14 | std::vector shape_vec(shape); 15 | return New(shape_vec); 16 | } 17 | 18 | //copy 19 | template 20 | void copy(const Tensor &src,Tensor &dst); 21 | 22 | //rename 23 | //通过tf直接实现 24 | } 25 | #endif -------------------------------------------------------------------------------- /excuter/cpp-common/src/deepx/vector_combination.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "deepx/vector_combination.hpp" 4 | 5 | namespace deepx 6 | { 7 | using namespace std; 8 | vector> combination(int n, int k) 9 | { 10 | if (k > n || k < 0) 11 | { 12 | return {}; 13 | } 14 | if (k == 0) 15 | { 16 | return {{}}; 17 | } 18 | 19 | vector> result; 20 | vector path; 21 | 22 | // 递归函数 23 | function backtrack = [&](int start) 24 | { 25 | if (path.size() == k) 26 | { 27 | result.push_back(path); 28 | return; 29 | } 30 | for (int i = start; i < n; i++) 
namespace deepx
{
    using namespace std;
    // Returns the identity permutation [0, 1, ..., n-1].
    // A non-positive n yields an empty vector.
    vector<int> arrange(int n)
    {
        vector<int> indices;
        if (n > 0)
        {
            indices.reserve(static_cast<size_t>(n));
            for (int value = 0; value < n; ++value)
            {
                indices.push_back(value);
            }
        }
        return indices;
    }
}
// Returns true when str parses (via std::stoi) to a value >= 0.
// NOTE(review): despite the name, "0" yields true, and inputs with a
// numeric prefix such as "12abc" are accepted because std::stoi stops
// at the first non-digit character — confirm callers rely on this
// leniency before tightening it.
bool is_positive_integer(const std::string& str) {
    try {
        return std::stoi(str) >= 0;
    } catch (...) {
        // std::invalid_argument (no digits) or std::out_of_range (overflow).
        return false;
    }
}
namespace stdutil
{
    // Escapes backslashes, quotes and control characters (\n, \t, \r,
    // \b, \f) into their two-character backslash sequences so the text
    // can be embedded in a single-line markdown/log field.
    std::string escape_markdown(const std::string &str)
    {
        std::string out;
        out.reserve(str.size());
        for (char ch : str)
        {
            const char *replacement = nullptr;
            switch (ch)
            {
            case '\\': replacement = "\\\\"; break;
            case '\"': replacement = "\\\""; break;
            case '\'': replacement = "\\\'"; break;
            case '\n': replacement = "\\n";  break;
            case '\t': replacement = "\\t";  break;
            case '\r': replacement = "\\r";  break;
            case '\b': replacement = "\\b";  break;
            case '\f': replacement = "\\f";  break;
            }
            if (replacement)
            {
                out += replacement;
            }
            else
            {
                out += ch; // ordinary character: copy through unchanged
            }
        }
        return out;
    }
} // namespace stdutil
STDUTIL_STRING_HPP 3 | 4 | #include 5 | 6 | namespace stdutil 7 | { 8 | using std::string; 9 | 10 | void trimspace(string &str); 11 | void trim(string &str,const string &chars=" \t\n\r\f\v"); 12 | 13 | string escape_markdown(const string &str); 14 | } 15 | 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /excuter/cpp-common/src/stdutil/time.hpp: -------------------------------------------------------------------------------- 1 | #ifndef STDUTIL_TIME_HPP 2 | #define STDUTIL_TIME_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | namespace stdutil{ 11 | using namespace std::chrono; 12 | static std::string format_time(const system_clock::time_point &tp) 13 | { 14 | auto ms = duration_cast(tp.time_since_epoch()); 15 | auto sec = duration_cast(ms); 16 | ms -= sec; 17 | 18 | std::time_t t = sec.count(); 19 | std::tm tm; 20 | localtime_r(&t, &tm); // 线程安全版本 21 | 22 | std::ostringstream oss; 23 | oss << std::put_time(&tm, "%Y-%m-%d %H:%M:%S") 24 | << '.' 
namespace stdutil{
using namespace std::chrono;
// Formats a time_point as local time "YYYY-MM-DD HH:MM:SS.uuuuuu",
// where the fractional field is microseconds zero-padded to 6 digits.
//
// BUG FIX: the previous version cast to milliseconds (0-999) but printed
// the count into a setw(6) zero-padded field, so e.g. 123 ms rendered as
// ".000123" and read as 123 microseconds. We now format true microseconds.
static std::string format_time(const system_clock::time_point &tp)
{
    auto us = duration_cast<microseconds>(tp.time_since_epoch());
    auto sec = duration_cast<seconds>(us);
    us -= sec; // remaining sub-second part, in microseconds

    std::time_t t = sec.count();
    std::tm tm;
    localtime_r(&t, &tm); // thread-safe variant of localtime

    std::ostringstream oss;
    oss << std::put_time(&tm, "%Y-%m-%d %H:%M:%S")
        << '.' << std::setfill('0') << std::setw(6) << us.count();
    return oss.str();
}
}
c)"); 9 | cout<(float32 c)"); 12 | 13 | cout<<"checkdtype:"< 3 | using namespace stdutil; 4 | void test_save(int total_size){ 5 | stdutil::byte *data = new stdutil::byte[total_size]; 6 | for(int i=0;i 5 | #include 6 | 7 | #include "deepx/dtype.hpp" 8 | 9 | namespace deepx 10 | { 11 | using namespace std; 12 | // 获取类型对应的Precision 13 | template 14 | constexpr Precision precision() 15 | { 16 | if constexpr (std::is_same_v) 17 | return Precision::Float64; 18 | else if constexpr (std::is_same_v) 19 | return Precision::Float32; 20 | else if constexpr (std::is_same_v) return Precision::Float16; 21 | else if constexpr (std::is_same_v) return Precision::BFloat16; 22 | else if constexpr (std::is_same_v) 23 | return Precision::Int64; 24 | else if constexpr (std::is_same_v) 25 | return Precision::Int32; 26 | else if constexpr (std::is_same_v) 27 | return Precision::Int16; 28 | else if constexpr (std::is_same_v) 29 | return Precision::Int8; 30 | else if constexpr (std::is_same_v) 31 | return Precision::Bool; 32 | else if constexpr (std::is_same_v) 33 | return Precision::String; 34 | else 35 | return Precision::Any; 36 | } 37 | } 38 | 39 | #endif // DEEPX_DTYPE_CUDA_HPP 40 | -------------------------------------------------------------------------------- /excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cuh: -------------------------------------------------------------------------------- 1 | #ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_SIN_CUH 2 | #define DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_SIN_CUH 3 | 4 | #include 5 | #include 6 | 7 | 8 | #include "deepx/tensorfunc/elementwise.hpp" 9 | #include "deepx/tensorfunc/cuda.hpp" 10 | #include "deepx/tensorfunc/authors.hpp" 11 | 12 | namespace deepx::tensorfunc 13 | { 14 | // sin 15 | template 16 | __global__ void sin_kernel(const T* A, T* C, const int size); 17 | 18 | template 19 | void launch_sin(const T* a, T* c, const int size); 20 | 21 | 22 | template 23 | __global__ void cos_kernel(const T* A, T* C, const int 
size); 24 | 25 | template 26 | void launch_cos( const T* a, T* c, const int size); 27 | 28 | // tan 29 | template 30 | __global__ void tan_kernel(const T* A, T* C, const int size); 31 | 32 | template 33 | void launch_tan( const T* a, T* c, const int size); 34 | } 35 | 36 | #endif -------------------------------------------------------------------------------- /excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SIN_HPP 2 | #define DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SIN_HPP 3 | 4 | #include "deepx/tensorfunc/elementwise.hpp" 5 | #include "deepx/tensorfunc/cuda.hpp" 6 | #include "deepx/tensorfunc/authors.hpp" 7 | #include "deepx/tensorfunc/elementwise_miaobyte_sin.cuh" 8 | 9 | #include "stdutil/error.hpp" 10 | 11 | namespace deepx::tensorfunc 12 | { 13 | 14 | template 15 | struct sinDispatcher 16 | { 17 | static void sin(const Tensor &A, Tensor &C) 18 | { 19 | if (A.shape.size != C.shape.size) { 20 | throw TensorShapeError("sin"); 21 | } 22 | launch_sin(A.data, C.data, A.shape.size); 23 | } 24 | }; 25 | 26 | template 27 | struct cosDispatcher 28 | { 29 | static void cos(const Tensor &A, Tensor &C) 30 | { 31 | if (A.shape.size != C.shape.size) { 32 | throw TensorShapeError("cos"); 33 | } 34 | launch_cos(A.data, C.data, A.shape.size); 35 | } 36 | }; 37 | 38 | template 39 | struct tanDispatcher 40 | { 41 | static void tan(const Tensor &A, Tensor &C) 42 | { 43 | if (A.shape.size != C.shape.size) { 44 | throw TensorShapeError("tan"); 45 | } 46 | launch_tan(A.data, C.data, A.shape.size); 47 | } 48 | }; 49 | 50 | 51 | } 52 | 53 | #endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_HPP 54 | -------------------------------------------------------------------------------- /excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cuh: 
-------------------------------------------------------------------------------- 1 | #ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_CUH 2 | #define DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_CUH 3 | 4 | #include "deepx/tensorfunc/cuda.hpp" 5 | #include "deepx/tensorfunc/authors.hpp" 6 | 7 | namespace deepx::tensorfunc 8 | { 9 | // sqrt 10 | template 11 | __global__ void sqrt_kernel(const T* A, T* C,const int size); 12 | 13 | template 14 | void launch_sqrt(const T* a, T* c,const int size); 15 | 16 | 17 | // pow 18 | template 19 | __global__ void pow_kernel(const T* A, const T* B, T* C,const int size); 20 | 21 | template 22 | void launch_pow(const T* a, const T* b, T* c,const int size); 23 | 24 | 25 | // powscalar 26 | template 27 | __global__ void powscalar_kernel(const T* A, const T scalar, T* C,const int size); 28 | 29 | template 30 | void launch_powscalar(const T* a, const T scalar, T* c,const int size); 31 | 32 | // rpowscalar 33 | template 34 | __global__ void rpowscalar_kernel(const T scalar, const T* A, T* C, const int size); 35 | 36 | template 37 | void launch_rpowscalar(const T scalar, const T* a, T* c, const int size); 38 | 39 | // log 40 | template 41 | __global__ void log_kernel(const T* A, T* C,const int size); 42 | 43 | template 44 | void launch_log(const T* a, T* c,const int size); 45 | 46 | // exp 47 | template 48 | __global__ void exp_kernel(const T* A, T* C,const int size); 49 | 50 | template 51 | void launch_exp(const T* a, T* c,const int size); 52 | 53 | 54 | 55 | } 56 | 57 | #endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_CUH 58 | -------------------------------------------------------------------------------- /excuter/op-mem-cuda/src/deepx/tensorfunc/init_miaobyte.cuh: -------------------------------------------------------------------------------- 1 | #ifndef DEEPX_TENSORFUNC_INIT_MIAO_BYTE_CUH 2 | #define DEEPX_TENSORFUNC_INIT_MIAO_BYTE_CUH 3 | 4 | #include 5 | #include 6 | #include 7 | #include "deepx/tensor.hpp" 8 | #include 
"deepx/tensorfunc/authors.hpp" 9 | #include "deepx/tensorfunc/init.hpp" 10 | 11 | namespace deepx::tensorfunc 12 | { 13 | //填充 14 | template 15 | __global__ void kernel_constant(T *data, const T value, const int size); 16 | 17 | template 18 | void launch_constant(T *a, const T value, const int size); 19 | 20 | //dropout 21 | template 22 | __global__ void dropout_kernel(T* A, const float p,const unsigned int seed,const int size); 23 | 24 | template 25 | void launch_dropout(T* a, const float p,const unsigned int seed,const int size); 26 | 27 | //初始化 28 | //arange 29 | template 30 | __global__ void kernel_arange(T *data, const float start, const float step, const int size); 31 | 32 | template 33 | void launch_arange(T *a, const T start, const T step, const int size); 34 | 35 | //uniform 36 | template 37 | __global__ void kernel_uniform(T *data, const float low, const float high, const unsigned int seed, const int size); 38 | 39 | template 40 | void launch_uniform(T *a, const T low, const T high, const unsigned int seed, const int size); 41 | 42 | //normal 43 | template 44 | __global__ void kernel_normal(T *data, const float mean, const float stddev, const unsigned int seed, const int size); 45 | 46 | template 47 | void launch_normal(T *a, const T mean, const T stddev, const unsigned int seed, const int size); 48 | 49 | } 50 | 51 | #endif -------------------------------------------------------------------------------- /excuter/op-mem-cuda/src/deepx/tensorfunc/init_miaobyte.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DEEPX_TENSORFUNC_INIT_MIAO_BYTE_HPP 2 | #define DEEPX_TENSORFUNC_INIT_MIAO_BYTE_HPP 3 | 4 | #include 5 | 6 | #include "deepx/tensorfunc/authors.hpp" 7 | #include "deepx/tensorfunc/init.hpp" 8 | #include "deepx/tensor.hpp" 9 | #include "deepx/tensorfunc/init_miaobyte.cuh" 10 | namespace deepx::tensorfunc 11 | { 12 | // constant 13 | template 14 | struct constantDispatcher 15 | { 16 | static void 
constant(Tensor &tensor, const T value) 17 | { 18 | launch_constant(tensor.data, value, tensor.shape.size); 19 | } 20 | }; 21 | 22 | 23 | 24 | template 25 | struct dropoutDispatcher 26 | { 27 | static void dropout(Tensor &A, const float p,const unsigned int seed) 28 | { 29 | launch_dropout(A.data, p, seed, A.shape.size); 30 | } 31 | }; 32 | 33 | // arange 34 | template 35 | struct arangeDispatcher 36 | { 37 | static void arange(Tensor &tensor, const T start, const T step) 38 | { 39 | launch_arange(tensor.data, start, step, tensor.shape.size); 40 | } 41 | }; 42 | 43 | // uniform 44 | template 45 | struct uniformDispatcher 46 | { 47 | static void uniform(Tensor &tensor, const T low, const T high, const unsigned int seed) 48 | { 49 | launch_uniform(tensor.data, low, high, seed, tensor.shape.size); 50 | } 51 | }; 52 | 53 | // normal 54 | template 55 | struct normalDispatcher 56 | { 57 | static void normal(Tensor &tensor, const T mean, const T stddev, const unsigned int seed) 58 | { 59 | launch_normal(tensor.data, mean, stddev, seed, tensor.shape.size); 60 | } 61 | }; 62 | } 63 | 64 | #endif 65 | -------------------------------------------------------------------------------- /excuter/op-mem-cuda/src/deepx/tensorfunc/new_mempool.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DEEPX_TENSORFUNC_NEW_MEMPOOL_HPP 2 | #define DEEPX_TENSORFUNC_NEW_MEMPOOL_HPP 3 | 4 | namespace deepx::tensorfunc 5 | { 6 | class MemoryPool 7 | { 8 | public: 9 | static void* Malloc(size_t size) { 10 | 11 | } 12 | 13 | static void Free(void* ptr) { 14 | 15 | } 16 | 17 | // Realloc: 重新分配内存并保留原数据,主要用于tensor形状改变时的内存重分配 18 | // 如果新的size小于原size,数据会被截断 19 | // 如果新的size大于原size,新分配的内存部分不会初始化 20 | // 如果ptr为nullptr,等同于Malloc 21 | // 如果size为0,等同于Free 22 | // 返回新分配的内存指针,如果分配失败返回nullptr 23 | 24 | static void* Realloc(void* ptr, size_t size) { 25 | 26 | } 27 | 28 | 29 | // GetAllocatedSize: 获取已分配内存的实际大小 30 | // 由于内存对齐,实际分配的内存可能大于请求的size 31 | // 主要用于内存使用统计和调试 32 
| // 如果ptr为nullptr,返回0 33 | // 重新分配内存,保留原数据 34 | static size_t GetAllocatedSize(void* ptr) { 35 | 36 | } 37 | }; 38 | 39 | } // namespace deepx::tensorfunc 40 | #endif // DEEPX_TENSORFUNC_NEW_MEMPOOL_HPP 41 | -------------------------------------------------------------------------------- /excuter/op-mem-cuda/src/deepx/tensorfunc/tensor_cuda.cuh: -------------------------------------------------------------------------------- 1 | #ifndef DEEPX_TENSORFUNC_TENSOR_CUDA_CUH 2 | #define DEEPX_TENSORFUNC_TENSOR_CUDA_CUH 3 | 4 | #include 5 | #include "deepx/tensor.hpp" 6 | 7 | namespace deepx::tensorfunc 8 | { 9 | inline __host__ __device__ void linearTo(const int *strides, const int dim, int *indices, const int id) 10 | { 11 | int linearIndex = id; 12 | for (int i = 0; i < dim; i++) 13 | { 14 | indices[i] = linearIndex / strides[i]; 15 | linearIndex %= strides[i]; 16 | } 17 | } 18 | 19 | inline __host__ __device__ int linearAt(const int *strides, const int dim,const int *indices) 20 | { 21 | int idx = 0; 22 | for (int i = 0; i < dim; i++) 23 | { 24 | idx += indices[i] * strides[i]; 25 | } 26 | return idx; 27 | } 28 | 29 | template 30 | __device__ __host__ void reorder(const T *order, const int *dimOrder, int dim, T *neworder) 31 | { 32 | for (int i = 0; i < dim; i++) 33 | { 34 | neworder[i] = order[dimOrder[i]]; 35 | } 36 | } 37 | 38 | const int MAX_DIM = 12; 39 | } 40 | 41 | #endif // DEEPX_TENSORFUNC_TENSOR_CUDA_CUH 42 | -------------------------------------------------------------------------------- /excuter/op-mem-cuda/test/op/CMakeLists.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/excuter/op-mem-cuda/test/op/CMakeLists.txt -------------------------------------------------------------------------------- /excuter/op-mem-cuda/test/tensorfunc/0_new.cpp: -------------------------------------------------------------------------------- 1 | 
#include "deepx/tensorfunc/init.hpp" 2 | #include "deepx/tensor.hpp" 3 | #include "deepx/tensorfunc/tensorlife_miaobyte.hpp" 4 | #include "deepx/tensorfunc/io_miaobyte.hpp" 5 | #include "deepx/tensorfunc/init_miaobyte.hpp" 6 | #include "deepx/tensorfunc/authors.hpp" 7 | 8 | using namespace deepx::tensorfunc; 9 | using namespace deepx; 10 | void test_new() 11 | { 12 | Tensor a=New({10, 10}); 13 | arange(a, 1.0f, 0.1f); 14 | print(a,"%.2f"); 15 | } 16 | 17 | int main() 18 | { 19 | test_new(); 20 | } -------------------------------------------------------------------------------- /excuter/op-mem-cuda/test/tensorfunc/1_cublas_add.cpp: -------------------------------------------------------------------------------- 1 | #include "deepx/tensorfunc/init_miaobyte.hpp" 2 | #include "deepx/tensor.hpp" 3 | #include "deepx/tensorfunc/tensorlife_miaobyte.hpp" 4 | #include "deepx/tensorfunc/io_miaobyte.hpp" 5 | #include "deepx/tensorfunc/elementwise.hpp" 6 | #include "deepx/tensorfunc/elementwise_cublas_basic.hpp" 7 | using namespace deepx::tensorfunc; 8 | using namespace deepx; 9 | void test_add() 10 | { 11 | Tensor a=New({10, 10}); 12 | arange(a, 1.0f, 0.1f); 13 | Tensor b=New({10, 10}); 14 | arange(b, 2.0f, 0.2f); 15 | Tensor c=New({10, 10}); 16 | constant(c, 0.0f); 17 | 18 | add(a, b, c); 19 | print(c,"%.2f"); 20 | } 21 | 22 | int main() 23 | { 24 | test_add(); 25 | } -------------------------------------------------------------------------------- /excuter/op-mem-cuda/test/tensorfunc/2_changeshape.cpp: -------------------------------------------------------------------------------- 1 | #include "deepx/tensorfunc/init_miaobyte.hpp" 2 | #include "deepx/tensor.hpp" 3 | #include "deepx/tensorfunc/tensorlife_miaobyte.hpp" 4 | #include "deepx/tensorfunc/io_miaobyte.hpp" 5 | #include "deepx/tensorfunc/changeshape_miaobyte.hpp" 6 | using namespace deepx::tensorfunc; 7 | using namespace deepx; 8 | void test_transpose() 9 | { 10 | Tensor a=New({3,4,6}); 11 | arange(a, 1.0f, 1.0f); 12 | 
print(a,"%.0f"); 13 | Tensor b=New({3,6,4}); 14 | transpose(a, {0,2,1}, b); 15 | print(b,"%.0f"); 16 | } 17 | 18 | void test_concat() 19 | { 20 | Tensor a=New({3,2,6}); 21 | arange(a, 1.0f, 1.0f); 22 | print(a,"%.0f"); 23 | Tensor b=New({3,4,6}); 24 | constant(b, 2.0f); 25 | print(b,"%.0f"); 26 | Tensor c=New({3,6,6}); 27 | constant(c, 3.0f); 28 | print(c,"%.0f"); 29 | Tensor d=New({3,12,6}); 30 | concat({&a,&b,&c},1,d); 31 | print(d,"%.0f"); 32 | } 33 | 34 | void test_broadcastTo() 35 | { 36 | Tensor a=New({3,2}); 37 | arange(a, 1.0f, 1.0f); 38 | Tensor b=New({4,3,2}); 39 | broadcastTo(a, b.shape.shape, b); 40 | print(b,"%.0f"); 41 | } 42 | int main(int argc, char **argv) 43 | { 44 | int casearg=atoi(argv[1]); 45 | switch (casearg) 46 | { 47 | case 0: 48 | test_transpose(); 49 | break; 50 | case 1: 51 | test_concat(); 52 | break; 53 | case 2: 54 | test_broadcastTo(); 55 | break; 56 | } 57 | return 0; 58 | } -------------------------------------------------------------------------------- /excuter/op-mem-cuda/test/tensorfunc/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(0_new 0_new.cpp) 2 | target_link_libraries(0_new deepx CUDA::cudart) 3 | 4 | add_executable(1_cublas_add 1_cublas_add.cpp) 5 | target_link_libraries(1_cublas_add deepx CUDA::cudart) 6 | 7 | add_executable(1_cublas_matmul 1_cublas_matmul.cpp) 8 | target_link_libraries(1_cublas_matmul deepx CUDA::cudart) 9 | 10 | add_executable(2_changeshape 2_changeshape.cpp) 11 | target_link_libraries(2_changeshape deepx CUDA::cudart) -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/.cursorignore: -------------------------------------------------------------------------------- 1 | # Add directories or file patterns to ignore during indexing (e.g. 
foo/ or *.csv) 2 | -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/.cursorrules: -------------------------------------------------------------------------------- 1 | excuter只实现基础的op,不要实现可以由基础op组合的op,如relu可以由max组合。 -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/.gitignore: -------------------------------------------------------------------------------- 1 | thirdlib -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.15...3.29) 2 | project(deepx-excuter-ompsimd LANGUAGES CXX ) 3 | 4 | # 设置 C++ 标准 5 | set(CMAKE_CXX_STANDARD 17) 6 | set(CMAKE_CXX_STANDARD_REQUIRED True) 7 | # 设置编译优化 8 | # set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address") 9 | set(CMAKE_BUILD_TYPE Debug) 10 | # 设置 SIMD 编译选项 11 | 12 | # 包含头文件目录 13 | include_directories(src) 14 | 15 | add_subdirectory(../cpp-common common) 16 | 17 | # 源文件 18 | 19 | file(GLOB_RECURSE DEEPX_SOURCES "src/*.cpp") 20 | file(GLOB_RECURSE CLIENT_SOURCES "src/client/*.cpp") 21 | 22 | # cpu 线性代数库 23 | list(APPEND CMAKE_PREFIX_PATH "/usr/lib/x86_64-linux-gnu/openblas-pthread/cmake") 24 | find_package(OpenBLAS REQUIRED) 25 | 26 | # include(FetchContent) 27 | 28 | # # Fetch OpenBLAS from GitHub 29 | # FetchContent_Declare( 30 | # OpenBLAS 31 | # GIT_REPOSITORY https://github.com/OpenMathLib/OpenBLAS 32 | # GIT_TAG v0.3.29 # 使用最新的稳定版本或您需要的特定版本 33 | # ) 34 | 35 | # # 下载并构建 OpenBLAS 36 | # FetchContent_MakeAvailable(OpenBLAS) 37 | 38 | # 线程并行 39 | find_package(OpenMP REQUIRED) 40 | # simd 41 | find_package(hwy REQUIRED) 42 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2 -msse4.2") 43 | # 内存池 44 | find_package(PkgConfig REQUIRED) 45 | pkg_check_modules(JEMALLOC REQUIRED jemalloc) 46 | 47 | 48 | find_package(yaml-cpp REQUIRED) 49 | 
50 | add_library(deepx_ompsimd SHARED 51 | ${DEEPX_SOURCES} 52 | ) 53 | 54 | target_link_libraries( deepx_ompsimd 55 | PUBLIC 56 | deepx_common 57 | yaml-cpp 58 | ${JEMALLOC_LIBRARIES} 59 | openblas 60 | OpenMP::OpenMP_CXX 61 | hwy 62 | ) 63 | add_executable(${PROJECT_NAME} ${CLIENT_SOURCES}) 64 | target_link_libraries(${PROJECT_NAME} 65 | PRIVATE 66 | deepx_ompsimd 67 | ) 68 | # 测试 69 | add_subdirectory(test/tensorfunc) 70 | add_subdirectory(test/op) 71 | # # 数据集测试 72 | # add_subdirectory(test/dataset) 73 | -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker.array2d.com/library/ubuntu:18.04 2 | 3 | # 基础构建环境 4 | RUN apt-get update && apt-get install -y \ 5 | build-essential \ 6 | cmake \ 7 | libopenblas-dev \ 8 | libyaml-cpp-dev \ 9 | libjemalloc-dev \ 10 | clang \ 11 | git \ 12 | wget \ 13 | && rm -rf /var/lib/apt/lists/* 14 | 15 | # 安装 Highway SIMD 库 16 | 17 | RUN mkdir -p thirdlib && \ 18 | cd thirdlib && \ 19 | git clone https://github.com/google/highway.git && \ 20 | cd highway && \ 21 | rm -rf build && mkdir build && cd build && \ 22 | cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local && \ 23 | make -j$(nproc) && make install && \ 24 | cd ../../ && rm -rf thirdlib/highway 25 | 26 | ADD cpp-common cpp-common 27 | ADD op-mem-ompsimd op-mem-ompsimd 28 | WORKDIR /home/op-mem-ompsimd 29 | 30 | RUN rm -rf build && mkdir build && cd build && \ 31 | cmake ..&& \ 32 | make -j$(nproc) 33 | 34 | CMD ["./build/bin/deepx-excuter-ompsimd"] -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/dockerbuild.sh: -------------------------------------------------------------------------------- 1 | cd ../ 2 | pwd 3 | ls -al 4 | docker build -t docker.array2d.com/deepx/ompsimd:latest . 
-f op-mem-ompsimd/Dockerfile 5 | -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/log.md: -------------------------------------------------------------------------------- 1 | ### 2025-01-9 2 | deepx第三次重构 3 | 目标:性能与特性并重 4 | 5 | 6 | ### 2025-01-17 7 | 尝试omp+highway的simd融合 8 | 9 | ### 2025-01-20 10 | 11 | layer.Node需要仔细设计forward和backward的接口 12 | 13 | + 输入输出用string作为key,从tensormanager中获取tensor 14 | + parallel结构 15 | 16 | ### 2025-01-21 17 | h5模型文件,转deepx格式 18 | 19 | ### 2025-02-06 20 | 21 | op完全重构 22 | 23 | + 输入输出用string作为key,从tensormanager中获取tensor 24 | 25 | + 对算子的精度进行了特化 26 | 27 | 28 | ### 2025-02-07 29 | 30 | + 关于simd对齐的3段式对齐 31 | ``` 32 | 头部未对齐:通过标量运算处理直到对齐边界 33 | 34 | const size_t adjust = (alignment - misalign) / sizeof(T); 35 | for (; j < adjust...) 36 | 37 | 38 | 主体对齐部分:使用对齐加载/存储指令 39 | 40 | 41 | Load(tag, a_start + j); // 对齐加载 42 | Store(...); // 对齐存储 43 | 44 | 45 | 尾部剩余元素:处理最后不足一个向量宽度的元素 46 | 47 | 48 | for (; j < len; ++j) 49 | ``` 50 | 51 | ### 2025-02-19 52 | 53 | + 增加、优化了部分tensorfunc的性能 54 | + 验证了一些列op的正确性 55 | 56 | -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/src/client/tfs.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CLIENT_TFS_HPP 2 | #define CLIENT_TFS_HPP 3 | 4 | #include "deepx/tf/tffactory.hpp" 5 | namespace deepx::tf{ 6 | 7 | int register_all(TfFactory &tfactory); 8 | } 9 | 10 | #endif 11 | -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/src/deepx/dtype_ompsimd.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DEEPX_DTYPE_OMPSIMD_HPP 2 | #define DEEPX_DTYPE_OMPSIMD_HPP 3 | 4 | #include "deepx/dtype.hpp" 5 | 6 | namespace deepx 7 | { 8 | using namespace std; 9 | // 获取类型对应的Precision 10 | template 11 | constexpr Precision precision() 12 | { 13 | if constexpr (std::is_same_v) 14 | 
return Precision::Float64; 15 | else if constexpr (std::is_same_v) 16 | return Precision::Float32; 17 | else if constexpr (std::is_same_v) 18 | return Precision::Int64; 19 | else if constexpr (std::is_same_v) 20 | return Precision::Int32; 21 | else if constexpr (std::is_same_v) 22 | return Precision::Int16; 23 | else if constexpr (std::is_same_v) 24 | return Precision::Int8; 25 | else if constexpr (std::is_same_v) 26 | return Precision::Bool; 27 | else if constexpr (std::is_same_v) 28 | return Precision::String; 29 | else 30 | return Precision::Any; 31 | } 32 | } 33 | 34 | #endif // DEEPX_DTYPE_OMPSIMD_HPP 35 | -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/src/deepx/tensorfunc/equal.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DEEPX_TENSORFUNC_EQUAL_HPP 2 | #define DEEPX_TENSORFUNC_EQUAL_HPP 3 | #include 4 | #include 5 | 6 | #include "deepx/tensor.hpp" 7 | #include "deepx/shape.hpp" 8 | namespace deepx::tensorfunc 9 | { 10 | template 11 | bool equal(Tensor &tensor, Tensor &other,float epsilon=1e-6) 12 | { 13 | bool result=true; 14 | if (tensor.shape.shape != other.shape.shape) 15 | return false; 16 | 17 | if constexpr (std::is_floating_point_v) 18 | { 19 | #pragma omp parallel for 20 | for (int i = 0; i < tensor.shape.size; ++i) 21 | { 22 | if (std::fabs(tensor.data[i] - other.data[i]) > epsilon) 23 | { 24 | #pragma omp atomic write 25 | result=false; 26 | } 27 | } 28 | 29 | return result; 30 | } 31 | else 32 | { 33 | return std::equal(tensor.data, tensor.data + tensor.shape.size, other.data); 34 | } 35 | }; 36 | } 37 | #endif // DEEPX_OP_CPU_EQUAL_HPP 38 | -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/src/deepx/tensorfunc/highway.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DEEPX_TENSORFUNC_HIGHWAY_HPP 2 | #define DEEPX_TENSORFUNC_HIGHWAY_HPP 3 
| 4 | #include 5 | 6 | namespace deepx::tensorfunc 7 | { 8 | using namespace hwy::HWY_NAMESPACE; 9 | 10 | template 11 | T ReduceMul(D d, Vec v) 12 | { 13 | T result = GetLane(v); 14 | for (size_t i = 1; i < Lanes(d); ++i) 15 | { 16 | result *= ExtractLane(v, i); 17 | } 18 | return result; 19 | } 20 | 21 | } 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/src/deepx/tensorfunc/matmul_miaobyte.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DEEPX_TENSORFUNC_MATMUL_MIAOBYTE_HPP 2 | #define DEEPX_TENSORFUNC_MATMUL_MIAOBYTE_HPP 3 | 4 | #include "deepx/tensorfunc/matmul.hpp" 5 | 6 | namespace deepx::tensorfunc 7 | { 8 | template 9 | struct matmulDispatcher 10 | { 11 | static void matmul(const Tensor &A, const Tensor &B, Tensor &C) 12 | { 13 | if (!check_matmul_shape(A.shape, B.shape)) 14 | { 15 | throw std::invalid_argument("A.shape could matmul with B.shape"); 16 | } 17 | //TODO 18 | //这里需要进一步优化 19 | C.shape.rangeParallel(C.shape.dim(), [&A,&B,&C](const int idx,const std::vector &indices,ThreadLocalVectors &tlv) { 20 | 21 | // int m=A.shape[-2]; 22 | int k=A.shape[-1]; 23 | // int n=B.shape[-1]; 24 | 25 | std::copy(indices.begin(),indices.end()-2,tlv.get(0).begin()); 26 | tlv.get(0)[indices.size()-2]=A.shape[-2]; 27 | tlv.get(0)[indices.size()-1]=indices[-1]; 28 | int aIdx=A.shape.linearat(tlv.get(0)); 29 | std::copy(indices.begin(),indices.end()-2,tlv.get(1).begin()); 30 | tlv.get(1)[indices.size()-2]=0; 31 | tlv.get(1)[indices.size()-1]=indices[-2]; 32 | int bIdx=B.shape.linearat(tlv.get(1)); 33 | int bstride=k; 34 | 35 | T sum=0; 36 | for(int l=0;l 5 | 6 | namespace deepx::tensorfunc 7 | { 8 | class MemoryPool 9 | { 10 | public: 11 | static void* Malloc(size_t size) { 12 | return mallocx(size, MALLOCX_ALIGN(64)); // 64字节对齐,适合SIMD 13 | } 14 | 15 | static void Free(void* ptr) { 16 | dallocx(ptr, 0); 17 | } 18 | 19 | // Realloc: 
重新分配内存并保留原数据,主要用于tensor形状改变时的内存重分配 20 | // 如果新的size小于原size,数据会被截断 21 | // 如果新的size大于原size,新分配的内存部分不会初始化 22 | // 如果ptr为nullptr,等同于Malloc 23 | // 如果size为0,等同于Free 24 | // 返回新分配的内存指针,如果分配失败返回nullptr 25 | 26 | static void* Realloc(void* ptr, size_t size) { 27 | return rallocx(ptr, size, MALLOCX_ALIGN(64)); 28 | } 29 | 30 | 31 | // GetAllocatedSize: 获取已分配内存的实际大小 32 | // 由于内存对齐,实际分配的内存可能大于请求的size 33 | // 主要用于内存使用统计和调试 34 | // 如果ptr为nullptr,返回0 35 | // 重新分配内存,保留原数据 36 | static size_t GetAllocatedSize(void* ptr) { 37 | return sallocx(ptr, 0); 38 | } 39 | }; 40 | 41 | } // namespace deepx::tensorfunc 42 | #endif // DEEPX_TENSORFUNC_NEW_MEMPOOL_HPP 43 | -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/test/op/1_mem.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "deepx/mem/mem_ompsimd.hpp" 3 | #include "deepx/tensor.hpp" 4 | #include "deepx/tensorfunc/tensorlife_miaobyte.hpp" 5 | #include "deepx/tensorfunc/init_miaobyte.hpp" 6 | #include "deepx/tensorfunc/io_miaobyte.hpp" 7 | #include "deepx/tensorfunc/authors.hpp" 8 | 9 | using namespace deepx::mem; 10 | using namespace deepx; 11 | using namespace deepx::tensorfunc; 12 | using namespace std; 13 | int main() 14 | { 15 | shared_ptr mem=make_shared(); 16 | for (int i = 0; i < 10; i++) 17 | { 18 | Tensor tensor = New({1, 2, 3}); 19 | uniform(tensor,0.0f,1.0f); 20 | mem->addtensor("tensor" + std::to_string(i), tensor ); 21 | } 22 | 23 | cout << mem->existstensor(string("tensor0")) << endl; 24 | print(*(mem->gettensor(string("tensor0")).get())); 25 | mem->clear(); 26 | 27 | return 0; 28 | } -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/test/op/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | add_executable(1_mem 1_mem.cpp) 3 | target_link_libraries(1_mem deepx_ompsimd) 4 | 5 | # 
add_executable(1_relu 1_relu.cpp) 6 | # target_link_libraries(1_relu deepx_ompsimd) 7 | 8 | 9 | # add_executable(2_add 2_add.cpp) 10 | # target_link_libraries(2_add deepx_ompsimd) 11 | 12 | # add_executable(3_matmul 3_matmul.cpp) 13 | # target_link_libraries(3_matmul deepx_ompsimd) 14 | 15 | # add_executable(4_sgd 4_sgd.cpp) 16 | # target_link_libraries(4_sgd deepx_ompsimd) -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/test/tensorfunc/1_shape.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | 5 | #include 6 | using namespace deepx; 7 | 8 | void test_tensor_shape() { 9 | Shape shape({2, 3, 4}); 10 | std::cout << "print shape: " << shape.size << std::endl; 11 | std::string yaml=shape.toYaml(); 12 | std::cout<<"yaml:"< 2 | #include 3 | #include "deepx/vector_combination.hpp" 4 | #include "stdutil/vector.hpp" 5 | using namespace deepx; 6 | 7 | void test_combination() 8 | { 9 | std::vector> result = combination(3); 10 | for (const auto &comb : result) 11 | { 12 | std::cout << "Combination:"< 2 | #include 3 | 4 | #include "deepx/tensorfunc/equal.hpp" 5 | #include "deepx/tensor.hpp" 6 | #include "deepx/tensorfunc/tensorlife_miaobyte.hpp" 7 | 8 | using namespace deepx; 9 | using namespace deepx::tensorfunc; 10 | void test_equal(){ 11 | Tensor tensor1=New({4096,4096}); 12 | std::iota(tensor1.data,tensor1.data+tensor1.shape.size,0); 13 | Tensor tensor2=New({4096,4096}); 14 | std::iota(tensor2.data,tensor2.data+tensor2.shape.size,0); 15 | std::cout< 3 | 4 | #include "deepx/tensor.hpp" 5 | 6 | #include "deepx/tensorfunc/tensorlife_miaobyte.hpp" 7 | #include "deepx/tensorfunc/init_miaobyte.hpp" 8 | #include "deepx/tensorfunc/authors.hpp" 9 | #include "deepx/tensorfunc/io_miaobyte.hpp" 10 | 11 | using namespace deepx; 12 | using namespace deepx::tensorfunc; 13 | void test_tensor_new(){ 14 | Tensor tensor=New({2, 3}); 15 | constant(tensor,1); 16 | 
print(tensor); 17 | tensor.save("tensor"); 18 | Tensor tensor2=New({2, 3}); 19 | constant(tensor2,2); 20 | print(tensor2); 21 | tensor2.save("tensor2"); 22 | } 23 | 24 | void test_arange() { 25 | Tensor tensor=New({2, 3}); 26 | arange(tensor,float(0),float(1)); 27 | print(tensor); 28 | } 29 | 30 | int main(int argc,char **argv){ 31 | int i=0; 32 | if (argc>1){ 33 | 34 | i=std::atoi(argv[1]); 35 | } 36 | switch (i) { 37 | case 1: 38 | test_tensor_new(); 39 | case 0: 40 | test_arange(); 41 | } 42 | return 0; 43 | } -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/test/tensorfunc/2_tensor_range.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | 4 | #include "deepx/tensor.hpp" 5 | #include "deepx/tensorfunc/tensorlife_miaobyte.hpp" 6 | #include "deepx/tensorfunc/init_miaobyte.hpp" 7 | #include "deepx/tensorfunc/io_miaobyte.hpp" 8 | #include "deepx/tensorfunc/authors.hpp" 9 | 10 | 11 | using namespace deepx; 12 | using namespace deepx::tensorfunc; 13 | void test_tensor_range(){ 14 | Tensor tensor=New({2, 3}); 15 | constant(tensor,1); 16 | print(tensor); 17 | 18 | Tensor tensor2=New({2, 3}); 19 | constant(tensor2,2); 20 | print(tensor2); 21 | 22 | } 23 | 24 | int main(){ 25 | test_tensor_range(); 26 | 27 | return 0; 28 | } -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/test/tensorfunc/2_tensor_range.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import yaml 4 | import os 5 | import sys 6 | def load_shape(path): 7 | with open(path + '.shape', 'r') as f: 8 | shape_data = f.read() 9 | shape = yaml.safe_load(shape_data) 10 | return shape['shape'], shape['dim'], shape['strides'], shape['size'] 11 | 12 | def load_tensor_data(path, shape): 13 | data = np.fromfile(path + '.data', dtype=np.float32) 14 | return data.reshape(shape) 15 
| 16 | def load_deepx_tensor(path): 17 | shape, dim, strides, size = load_shape(path) 18 | tensor_data = load_tensor_data(path, shape) 19 | return torch.tensor(tensor_data) 20 | 21 | # 使用示例 22 | if __name__ == "__main__": 23 | name=sys.argv[1] 24 | tensor = load_deepx_tensor(name) 25 | print("Tensor:", tensor) 26 | print("Shape:", tensor.size()) 27 | print("Strides:", tensor.stride()) -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/test/tensorfunc/3_tensor_print.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "deepx/tensor.hpp" 4 | #include "deepx/tensorfunc/io_miaobyte.hpp" 5 | #include "deepx/tensorfunc/tensorlife_miaobyte.hpp" 6 | #include "deepx/tensorfunc/authors.hpp" 7 | 8 | using namespace deepx::tensorfunc; 9 | int main(){ 10 | deepx::Tensor t=New({2, 3,4}); 11 | std::iota(t.data, t.data+t.shape.size, 0); 12 | print(t); 13 | return 0; 14 | } -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/test/tensorfunc/4_tensor_max.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "deepx/tensorfunc/elementwise.hpp" 3 | #include "deepx/tensorfunc/elementwise_miaobyte.hpp" 4 | #include "deepx/tensor.hpp" 5 | #include "deepx/tensorfunc/init_miaobyte.hpp" 6 | #include "deepx/tensorfunc/io_miaobyte.hpp" 7 | #include "deepx/tensorfunc/tensorlife_miaobyte.hpp" 8 | #include "deepx/tensorfunc/authors.hpp" 9 | #include "tensorutil.hpp" 10 | 11 | using namespace deepx; 12 | using namespace deepx::tensorfunc; 13 | 14 | void test_max(){ 15 | std::vector shape=randomshape(1,3,1,19); 16 | Tensor A=New(shape); 17 | std::iota(A.data,A.data+A.shape.size,0); 18 | print(A) ; 19 | Tensor B=New(shape); 20 | constant(B,float(55)); 21 | print(B); 22 | Tensor C=New(shape); 23 | Tensor D=New(shape); 24 | max(A,B,C); 25 | print(C); 26 | min(A,B,D); 27 | 
print(D); 28 | } 29 | int main(){ 30 | test_max(); 31 | } -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/test/tensorfunc/7_tensor_transpose.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "deepx/tensor.hpp" 6 | #include "deepx/tensorfunc/changeshape_miaobyte.hpp" 7 | #include "deepx/tensorfunc/tensorlife_miaobyte.hpp" 8 | #include "deepx/tensorfunc/authors.hpp" 9 | #include "deepx/tensorfunc/io_miaobyte.hpp" 10 | #include "stdutil/vector.hpp" 11 | #include "tensorutil.hpp" 12 | #include "deepx/shape_changeshape.hpp" 13 | 14 | using namespace deepx::tensorfunc; 15 | using namespace deepx; 16 | using namespace std; 17 | void test_transpose() 18 | { 19 | std::vector shape = randomshape(2, 4, 1, 6); 20 | Tensor tensor = New(shape); 21 | std::iota(tensor.data, tensor.data + tensor.shape.size, 1); 22 | print(tensor); 23 | 24 | vector dimOrder = swaplastTwoDimOrder(shape); 25 | 26 | std::vector resultshape = transposeShape(tensor.shape.shape, dimOrder); 27 | Tensor result = New(resultshape); 28 | transpose(tensor, dimOrder, result); 29 | print(result); 30 | } 31 | 32 | int main() 33 | { 34 | test_transpose(); 35 | } -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/test/tensorfunc/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(1_shape 1_shape.cpp) 2 | target_link_libraries(1_shape deepx_ompsimd) 3 | 4 | add_executable(2_shape_combintion 2_shape_combintion.cpp) 5 | target_link_libraries(2_shape_combintion deepx_ompsimd) 6 | 7 | add_executable(2_tensor_new 2_tensor_new.cpp) 8 | target_link_libraries(2_tensor_new deepx_ompsimd ) 9 | 10 | add_executable(2_tensor_range 2_tensor_range.cpp) 11 | target_link_libraries(2_tensor_range deepx_ompsimd ) 12 | 13 | add_executable(2_tensor_equal 2_tensor_equal.cpp) 
14 | target_link_libraries(2_tensor_equal deepx_ompsimd ) 15 | 16 | add_executable(3_tensor_print 3_tensor_print.cpp) 17 | target_link_libraries(3_tensor_print deepx_ompsimd ) 18 | 19 | 20 | add_executable(4_tensor_matmul 4_tensor_matmul.cpp) 21 | target_link_libraries(4_tensor_matmul deepx_ompsimd ) 22 | 23 | add_executable(4_tensor_add 4_tensor_add.cpp) 24 | target_link_libraries(4_tensor_add deepx_ompsimd ) 25 | 26 | add_executable(4_tensor_sub 4_tensor_sub.cpp) 27 | target_link_libraries(4_tensor_sub deepx_ompsimd ) 28 | 29 | add_executable(4_tensor_mul 4_tensor_mul.cpp) 30 | target_link_libraries(4_tensor_mul deepx_ompsimd ) 31 | 32 | add_executable(4_tensor_max 4_tensor_max.cpp) 33 | target_link_libraries(4_tensor_max deepx_ompsimd ) 34 | 35 | 36 | add_executable(5_tensor_sum 5_tensor_sum.cpp) 37 | target_link_libraries(5_tensor_sum deepx_ompsimd ) 38 | 39 | add_executable(6_tensor_broadcast 6_tensor_broadcast.cpp) 40 | target_link_libraries(6_tensor_broadcast deepx_ompsimd ) 41 | 42 | add_executable(7_tensor_transpose 7_tensor_transpose.cpp) 43 | target_link_libraries(7_tensor_transpose deepx_ompsimd ) 44 | 45 | add_executable(8_tensor_concat 8_tensor_concat.cpp) 46 | target_link_libraries(8_tensor_concat deepx_ompsimd ) -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/test/tensorfunc/tensorutil.hpp: -------------------------------------------------------------------------------- 1 | #ifndef TENSORUTIL_HPP 2 | #define TENSORUTIL_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "deepx/tensor.hpp" 11 | 12 | using namespace deepx; 13 | 14 | /* 15 | dimlen_min:shape.size()的最小维度长度 16 | dimlen_max:shape.size()的最大维度长度 17 | shape_min:shape[i]的最小维度数量 18 | shape_max:shape[i]的最大维度数量 19 | */ 20 | 21 | std::vector randomshape(size_t dimlen_min, size_t dimlen_max, size_t shape_min, size_t shape_max) { 22 | // 初始化随机数种子 23 | std::srand(static_cast(std::time(nullptr))); 24 | 25 
| // 随机生成维度长度 26 | size_t dimlen = dimlen_min + std::rand() % (dimlen_max - dimlen_min + 1); 27 | 28 | // 创建存储形状的向量 29 | std::vector shape(dimlen); 30 | 31 | // 为每个维度随机生成形状值 32 | for (size_t i = 0; i < dimlen; ++i) { 33 | shape[i] = static_cast(shape_min + std::rand() % (shape_max - shape_min + 1)); 34 | } 35 | 36 | return shape; 37 | } 38 | 39 | std::vector randomshape2(size_t dimlen_min, size_t dimlen_max, size_t dim_min, size_t dim_max) { 40 | std::random_device rd; 41 | std::mt19937 gen(rd()); 42 | 43 | // 生成维度数量 44 | std::uniform_int_distribution<> dim_dist(dimlen_min, dimlen_max); 45 | int dims = dim_dist(gen); 46 | 47 | // 生成每个维度的长度 48 | std::uniform_int_distribution<> len_dist(dim_min, dim_max); 49 | std::vector shape; 50 | shape.reserve(dims); 51 | 52 | for (int i = 0; i < dims; ++i) { 53 | shape.push_back(len_dist(gen)); 54 | } 55 | 56 | return shape; 57 | } 58 | 59 | 60 | #endif // TENSORUTIL_HPP 61 | -------------------------------------------------------------------------------- /front/go/README.md: -------------------------------------------------------------------------------- 1 | # deepx-go 2 | 3 | deepx-go是为了构建抽象计算图的golang接口库 4 | 5 | 供算法人员搭建模型使用 6 | 7 | ## 存算一体,控制分离 8 | 9 | deepx-py是控制侧,负责生成计算图,通过通信调用其他进程 10 | 11 | 在excuter中,实现了具体的存算引擎 12 | 13 | 14 | 15 | ## 接口设计 16 | deepx-py的接口设计尽量贴合pytorch,方便用户进行迁移 17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /front/go/deepx/graph_constarg.go: -------------------------------------------------------------------------------- 1 | package deepx 2 | 3 | type ArgType int 4 | 5 | const ( 6 | ArgTypeInt ArgType = iota 7 | ArgTypeFloat 8 | ArgTypeString 9 | ArgTypeIntVector 10 | ) 11 | 12 | type ConstArgNode struct { 13 | name string 14 | ntype NodeType 15 | inputs map[string]Node 16 | value any 17 | argType ArgType 18 | } 19 | 20 | func NewConstArgNode(name string) *ConstArgNode { 21 | return &ConstArgNode{ 22 | name: name, 23 | ntype: NodeConstArg, 24 | 
inputs: make(map[string]Node), 25 | } 26 | } 27 | func (n *ConstArgNode) Ntype() NodeType { 28 | return n.ntype 29 | } 30 | func (n *ConstArgNode) Name() string { 31 | return n.name 32 | } 33 | func (n *ConstArgNode) Input(name string) Node { 34 | return n.inputs[name] 35 | } 36 | func (n *ConstArgNode) Inputs() map[string]Node { 37 | return n.inputs 38 | } 39 | func (n *ConstArgNode) AddInput(name string, input Node) { 40 | n.inputs[name] = input 41 | } 42 | func (n *ConstArgNode) RemoveInput(name string) { 43 | delete(n.inputs, name) 44 | } 45 | func (n *ConstArgNode) Int() int { 46 | if n.argType != ArgTypeInt { 47 | panic("ConstArgNode is not an integer") 48 | } 49 | return n.value.(int) 50 | } 51 | func (n *ConstArgNode) Float() float64 { 52 | if n.argType != ArgTypeFloat { 53 | panic("ConstArgNode is not a float") 54 | } 55 | return n.value.(float64) 56 | } 57 | func (n *ConstArgNode) String() string { 58 | if n.argType != ArgTypeString { 59 | panic("ConstArgNode is not a string") 60 | } 61 | return n.value.(string) 62 | } 63 | func (n *ConstArgNode) SetInt(value int) { 64 | n.value = value 65 | n.argType = ArgTypeInt 66 | } 67 | func (n *ConstArgNode) SetInts(value []int) { 68 | n.value = value 69 | n.argType = ArgTypeIntVector 70 | } 71 | func (n *ConstArgNode) SetFloat(value float64) { 72 | n.value = value 73 | n.argType = ArgTypeFloat 74 | } 75 | func (n *ConstArgNode) SetString(value string) { 76 | n.value = value 77 | n.argType = ArgTypeString 78 | } 79 | -------------------------------------------------------------------------------- /front/go/deepx/graph_opnode.go: -------------------------------------------------------------------------------- 1 | package deepx 2 | 3 | type OpType struct { 4 | name string 5 | shortchar string 6 | } 7 | 8 | var ( 9 | opmaps = make(map[string]OpType) 10 | ) 11 | 12 | func RegistOpType(name string, shortchar string) { 13 | opmaps[name] = OpType{name, shortchar} 14 | } 15 | 16 | type OpNode struct { 17 | OpType 18 | 
ntype NodeType 19 | 20 | inputs map[string]Node 21 | } 22 | 23 | func NewOpNode(name string) *OpNode { 24 | return &OpNode{ 25 | OpType: opmaps[name], 26 | ntype: NodeOp, 27 | inputs: make(map[string]Node), 28 | } 29 | } 30 | func (n *OpNode) Ntype() NodeType { 31 | return n.ntype 32 | } 33 | func (n *OpNode) Name() string { 34 | return n.name 35 | } 36 | func (n *OpNode) Input(name string) Node { 37 | return n.inputs[name] 38 | } 39 | func (n *OpNode) Inputs() map[string]Node { 40 | return n.inputs 41 | } 42 | func (n *OpNode) AddInput(name string, input Node) { 43 | n.inputs[name] = input 44 | } 45 | func (n *OpNode) RemoveInput(name string) { 46 | delete(n.inputs, name) 47 | } 48 | func (n *OpNode) Shortchar() string { 49 | return n.shortchar 50 | } 51 | -------------------------------------------------------------------------------- /front/go/deepx/graph_tensornode.go: -------------------------------------------------------------------------------- 1 | package deepx 2 | 3 | type TensorNode struct { 4 | name string 5 | ntype NodeType 6 | inputs map[string]Node 7 | tensor *Tensor // 对于 NodeTensor 类型,存储实际的张量数据 8 | } 9 | 10 | func NewTensorNode(name string) *TensorNode { 11 | return &TensorNode{ 12 | name: name, 13 | ntype: NodeTensor, 14 | inputs: make(map[string]Node), 15 | } 16 | } 17 | func (n *TensorNode) Ntype() NodeType { 18 | return n.ntype 19 | } 20 | func (n *TensorNode) Name() string { 21 | return n.name 22 | } 23 | func (n *TensorNode) Input(name string) Node { 24 | return n.inputs[name] 25 | } 26 | func (n *TensorNode) Inputs() map[string]Node { 27 | return n.inputs 28 | } 29 | func (n *TensorNode) AddInput(name string, input Node) { 30 | n.inputs[name] = input 31 | } 32 | func (n *TensorNode) RemoveInput(name string) { 33 | delete(n.inputs, name) 34 | } 35 | func (n *TensorNode) Tensor() *Tensor { 36 | return n.tensor 37 | } 38 | func (n *TensorNode) SetTensor(tensor *Tensor) { 39 | n.tensor = tensor 40 | } 41 | 
package deepx

// Linear is a fully-connected layer: z = x·W + b.
type Linear struct {
	ModuleBase
	W *Tensor // weight, shape [in_features, out_features]
	b *Tensor // bias, shape [out_features]
}

// NewLinear registers a linear layer's parameters (W, bias) and their
// const-arg dimension nodes into graph g. A nil g creates a fresh graph;
// an empty name defaults to "linear". Both parameter tensors are created
// with requires-grad set to true.
func NewLinear(name string, in_features, out_features int, dtype Dtype, g *Graph) (m *Linear) {
	if g == nil {
		g = NewGraph()
	}
	if name == "" {
		name = "linear"
	}
	m = &Linear{
		ModuleBase: ModuleBase{
			g:    g,
			name: name,
		},
	}

	in_features_node := g.AddConstArg(name + ".in_features")
	in_features_node.SetInt(in_features)
	out_features_node := g.AddConstArg(name + ".out_features")
	out_features_node.SetInt(out_features)

	// If exploiting the shortcut computation for the weight gradient, W's
	// shape would instead be set to [out_features, in_features] to achieve
	// a pre-transposed layout.
	m.W = g.AddTensor(name+".W", dtype, []int{in_features, out_features}, true, in_features_node, out_features_node).Tensor()
	m.b = g.AddTensor(name+".bias", dtype, []int{out_features}, true, out_features_node).Tensor()
	return m
}

// Linear applies the layer to input: matmul with W, then add bias.
func (m *Linear) Linear(input *Tensor) *Tensor {
	y := input.Matmul(m.W)
	z := y.Add(m.b)
	return z
}
return x 25 | } 26 | -------------------------------------------------------------------------------- /front/go/deepx/module.go: -------------------------------------------------------------------------------- 1 | package deepx 2 | 3 | type Module interface { 4 | Graph() *Graph 5 | Name() string 6 | } 7 | type ModuleBase struct { 8 | name string 9 | g *Graph 10 | } 11 | -------------------------------------------------------------------------------- /front/go/deepx/tensor_activite.go: -------------------------------------------------------------------------------- 1 | package deepx 2 | 3 | func init() { 4 | RegistOpType("relu", "ReLU") 5 | } 6 | 7 | func (t *Tensor) Relu() *Tensor { 8 | result := t.graph.AddTensor("", t.Dtype, t.Shape.shape, t.requiresGrad) 9 | op := t.graph.AddOp("relu", t.node) 10 | result.AddInput(op.name, op) 11 | return result.tensor 12 | } 13 | -------------------------------------------------------------------------------- /front/go/deepx/tensor_matmul.go: -------------------------------------------------------------------------------- 1 | package deepx 2 | 3 | func init() { 4 | RegistOpType("matmul", "⊗") 5 | } 6 | 7 | func (t *Tensor) Matmul(other *Tensor) *Tensor { 8 | result := t.graph.AddTensor("", t.Dtype, t.Shape.shape, t.requiresGrad) 9 | op := t.graph.AddOp("matmul", t.node, other.node) 10 | result.AddInput(op.name, op) 11 | return result.tensor 12 | } 13 | -------------------------------------------------------------------------------- /front/go/deepx/tensor_musk.go: -------------------------------------------------------------------------------- 1 | package deepx 2 | 3 | func init() { 4 | RegistOpType("mask", "mask") 5 | } 6 | 7 | func (t *Tensor) ApplyMask(mask *Tensor) *Tensor { 8 | result := t.graph.AddTensor("", t.Dtype, t.Shape.shape, t.requiresGrad) 9 | op := t.graph.AddOp("mask", t.node, mask.node) 10 | result.AddInput(op.name, op) 11 | return result.tensor 12 | } 13 | 
-------------------------------------------------------------------------------- /front/go/deepx/tensor_norm.go: -------------------------------------------------------------------------------- 1 | package deepx 2 | 3 | func init() { 4 | RegistOpType("L2Norm", "l2norm") 5 | RegistOpType("L1Norm", "l1norm") 6 | } 7 | 8 | // L2Norm 计算L2范数 9 | // ||x||₂ = sqrt(Σx²) 10 | func (t *Tensor) L2Norm() *Tensor { 11 | result := t.graph.AddTensor("", t.Dtype, t.Shape.shape, t.requiresGrad) 12 | op := t.graph.AddOp("l2norm", t.node) 13 | result.AddInput(op.name, op) 14 | return result.tensor 15 | } 16 | 17 | // L1Norm 计算L1范数 18 | // ||x||₁ = Σ|x| 19 | func (t *Tensor) L1Norm() *Tensor { 20 | result := t.graph.AddTensor("", t.Dtype, t.Shape.shape, t.requiresGrad) 21 | op := t.graph.AddOp("l1norm", t.node) 22 | result.AddInput(op.name, op) 23 | return result.tensor 24 | } 25 | -------------------------------------------------------------------------------- /front/go/deepx/tensor_normalization.go: -------------------------------------------------------------------------------- 1 | package deepx 2 | 3 | func init() { 4 | RegistOpType("softmax", "softmax") 5 | RegistOpType("layernorm", "LN") 6 | RegistOpType("batchnorm", "BN") 7 | RegistOpType("instancenorm", "IN") 8 | RegistOpType("groupnorm", "GN") 9 | RegistOpType("rmsnorm", "RMS") 10 | } 11 | func (t *Tensor) Softmax(axis int) *Tensor { 12 | // 1. 计算最大值 13 | x_max := t.Max([]int{axis}) 14 | // 2. 减去最大值 15 | shifted := t.Sub(x_max) 16 | // 3. 计算指数 17 | exp_x := shifted.Exp() 18 | // 4. 计算和 19 | sum_exp := exp_x.Sum([]int{axis}) 20 | // 5. 归一化 21 | result := exp_x.Div(sum_exp) 22 | return result 23 | } 24 | 25 | func (t *Tensor) MinMax(axis int) *Tensor { 26 | // 1. 计算最大值 27 | x_max := t.Max([]int{axis}) 28 | // 2. 计算最小值 29 | x_min := t.Min([]int{axis}) 30 | // 3. 计算范围 31 | ranged := x_max.Sub(x_min) 32 | // 4. 
归一化 33 | result := t.Sub(x_min).Div(ranged) 34 | return result 35 | } 36 | -------------------------------------------------------------------------------- /front/go/deepx/tensor_shape.go: -------------------------------------------------------------------------------- 1 | package deepx 2 | 3 | func init() { 4 | RegistOpType("reshape", "reshape") 5 | RegistOpType("transpose", "T") 6 | } 7 | 8 | func (t *Tensor) Reshape(shape []int) *Tensor { 9 | result := t.graph.AddTensor("", t.Dtype, shape, t.requiresGrad) 10 | op := t.graph.AddOp("reshape", t.node) 11 | result.AddInput(op.name, op) 12 | return result.tensor 13 | } 14 | 15 | func (t *Tensor) Transpose(axes []int) *Tensor { 16 | result := t.graph.AddTensor("", t.Dtype, t.Shape.shape, t.requiresGrad) 17 | op := t.graph.AddOp("transpose", t.node) 18 | result.AddInput(op.name, op) 19 | return result.tensor 20 | } 21 | -------------------------------------------------------------------------------- /front/go/deepx/transformer.go: -------------------------------------------------------------------------------- 1 | package deepx 2 | 3 | type TransformerLayer struct { 4 | ModuleBase 5 | attention *MultiHeadAttention 6 | mlp *MLP 7 | ln1 *LayerNorm 8 | ln2 *LayerNorm 9 | } 10 | 11 | func NewTransformerLayer(name string, hidden_size, num_heads, mlp_ratio int, dtype Dtype, g *Graph) *TransformerLayer { 12 | if name == "" { 13 | name = "transformer_layer" 14 | } 15 | 16 | return &TransformerLayer{ 17 | ModuleBase: ModuleBase{ 18 | g: g, 19 | name: name, 20 | }, 21 | attention: NewMultiHeadAttention(name+".attn", hidden_size, num_heads, dtype, g), 22 | mlp: NewMLP(name+".mlp", hidden_size, mlp_ratio*hidden_size, dtype, g), 23 | ln1: NewLayerNorm(name+".ln1", hidden_size, dtype, g), 24 | ln2: NewLayerNorm(name+".ln2", hidden_size, dtype, g), 25 | } 26 | } 27 | 28 | func (m *TransformerLayer) Forward(x *Tensor) *Tensor { 29 | // 1. 
Self Attention 30 | h := m.ln1.LayerNorm(x) 31 | h = m.attention.Forward(h, h, h) 32 | x = x.Add(h) // residual 33 | 34 | // 2. MLP 35 | h = m.ln2.LayerNorm(x) 36 | h = m.mlp.Forward(h) 37 | x = x.Add(h) // residual 38 | 39 | return x 40 | } 41 | -------------------------------------------------------------------------------- /front/go/deepx/transformer/attention.go: -------------------------------------------------------------------------------- 1 | package transformer 2 | 3 | // Qwen2Attention 模拟注意力层 4 | type Qwen2Attention struct { 5 | // 注意力层相关权重等参数 6 | } 7 | 8 | func (a *Qwen2Attention) Forward(hiddenStates interface{}, pastKV interface{}) (interface{}, interface{}, error) { 9 | // 计算查询、键、值以及 RoPE 位置编码 10 | // 如果存在 pastKV,则进行拼接 11 | // 计算注意力分数并返回注意力输出及新的 KV 缓存 12 | return nil, nil, nil 13 | } 14 | -------------------------------------------------------------------------------- /front/go/deepx/transformer/config.go: -------------------------------------------------------------------------------- 1 | package transformer 2 | 3 | // Config 定义模型配置 4 | type Config struct { 5 | // 模型基本配置 6 | HiddenSize int 7 | NumLayers int 8 | NumHeads int 9 | MLPRatio int 10 | VocabSize int 11 | MaxSeqLength int 12 | InitializerRange float32 13 | 14 | // 注意力相关配置 15 | AttentionImpl string 16 | SlidingWindow int 17 | UseFlashAttn bool 18 | 19 | // 生成相关配置 20 | UseCache bool 21 | BeamSize int 22 | TopK int 23 | TopP float32 24 | Temperature float32 25 | } 26 | -------------------------------------------------------------------------------- /front/go/deepx/transformer/model.go: -------------------------------------------------------------------------------- 1 | package transformer 2 | 3 | // PreTrainedModel 定义了基础模型接口 4 | type PreTrainedModel interface { 5 | Forward(inputs ...interface{}) (interface{}, error) 6 | Generate(inputs ...interface{}) (interface{}, error) 7 | SavePretrained(path string) error 8 | FromPretrained(path string) (PreTrainedModel, error) 9 | } 10 | 11 | // 
Qwen2PreTrainedModel 实现了基类的一部分功能 12 | type Qwen2PreTrainedModel struct { 13 | Config *Config 14 | } 15 | 16 | func (m *Qwen2PreTrainedModel) Forward(args ...interface{}) (interface{}, error) { 17 | // 实现前向传播逻辑,可留空或返回默认值 18 | return nil, nil 19 | } 20 | 21 | func (m *Qwen2PreTrainedModel) Generate(inputs ...interface{}) (interface{}, error) { 22 | // 实现生成逻辑,例如自回归生成 23 | return nil, nil 24 | } 25 | -------------------------------------------------------------------------------- /front/go/deepx/transformer/qwen2_causal_lm.go: -------------------------------------------------------------------------------- 1 | package transformer 2 | 3 | // Qwen2ForCausalLM 为生成模型入口 4 | type Qwen2ForCausalLM struct { 5 | *Qwen2PreTrainedModel 6 | Model *Qwen2Model 7 | LMHead interface{} 8 | } 9 | 10 | func (m *Qwen2ForCausalLM) Forward(inputIDs []int, pastKV [][]interface{}) (interface{}, error) { 11 | outputs, err := m.Model.Forward(inputIDs, nil, nil, pastKV) 12 | if err != nil { 13 | return nil, err 14 | } 15 | // 根据主干网络输出生成 logits 16 | hiddenStates := outputs.(struct { 17 | LastHiddenState interface{} 18 | PastKeyValues [][]interface{} 19 | }).LastHiddenState 20 | logits := m.lmHeadForward(hiddenStates) 21 | return struct { 22 | Logits interface{} 23 | PastKeyValues [][]interface{} 24 | }{Logits: logits, PastKeyValues: outputs.(struct { 25 | LastHiddenState interface{} 26 | PastKeyValues [][]interface{} 27 | }).PastKeyValues}, nil 28 | } 29 | 30 | func (m *Qwen2ForCausalLM) PrepareInputsForGeneration(inputIDs []int, pastKV [][]interface{}) map[string]interface{} { 31 | if pastKV != nil && len(inputIDs) > 0 { 32 | // 仅保留最后一个 token 33 | inputIDs = inputIDs[len(inputIDs)-1:] 34 | } 35 | return map[string]interface{}{ 36 | "input_ids": inputIDs, 37 | "past_key_values": pastKV, 38 | "use_cache": true, 39 | } 40 | } 41 | 42 | // lmHeadForward 模拟 lm_head 的前向传播 43 | func (m *Qwen2ForCausalLM) lmHeadForward(hiddenStates interface{}) interface{} { 44 | // 实现将 hiddenStates 投影到词表维度的逻辑 45 | 
return nil 46 | } 47 | -------------------------------------------------------------------------------- /front/go/deepx/transformer/qwen2_model.go: -------------------------------------------------------------------------------- 1 | package transformer 2 | 3 | // Qwen2DecoderLayer 定义单层 Decoder 的接口 4 | type Qwen2DecoderLayer interface { 5 | Forward(hiddenStates interface{}, attentionMask interface{}, 6 | positionIds interface{}, pastKV interface{}) (output interface{}, newKV interface{}, err error) 7 | } 8 | 9 | // Qwen2Model 为主网络 10 | type Qwen2Model struct { 11 | *Qwen2PreTrainedModel // 组合方式复用基类功能 12 | EmbedTokens interface{} // Token 嵌入层 13 | Layers []Qwen2DecoderLayer 14 | } 15 | 16 | func (m *Qwen2Model) Forward(inputIDs []int, attentionMask []int, positionIDs []int, pastKV [][]interface{}) (interface{}, error) { 17 | // 模拟 token 嵌入 18 | hiddenStates := m.embedTokensForward(inputIDs) 19 | var updatedKV [][]interface{} 20 | // 遍历每一层 Decoder 21 | for i, layer := range m.Layers { 22 | var pastKVLayer interface{} 23 | if pastKV != nil && i < len(pastKV) { 24 | pastKVLayer = pastKV[i] 25 | } 26 | output, newKV, err := layer.Forward(hiddenStates, attentionMask, positionIDs, pastKVLayer) 27 | if err != nil { 28 | return nil, err 29 | } 30 | hiddenStates = output 31 | updatedKV = append(updatedKV, newKV) 32 | } 33 | return struct { 34 | LastHiddenState interface{} 35 | PastKeyValues [][]interface{} 36 | }{LastHiddenState: hiddenStates, PastKeyValues: updatedKV}, nil 37 | } 38 | 39 | // embedTokensForward 为嵌入层的模拟实现 40 | func (m *Qwen2Model) embedTokensForward(inputIDs []int) interface{} { 41 | // 根据 inputIDs 返回对应的嵌入向量 42 | return nil 43 | } 44 | -------------------------------------------------------------------------------- /front/go/deepx/transformer_model.go: -------------------------------------------------------------------------------- 1 | package deepx 2 | 3 | import "fmt" 4 | 5 | type Transformer struct { 6 | ModuleBase 7 | embedding *Linear 8 | layers 
[]*TransformerLayer 9 | ln_final *LayerNorm 10 | } 11 | 12 | func NewTransformer(name string, num_layers, hidden_size, num_heads, mlp_ratio int, dtype Dtype, g *Graph) *Transformer { 13 | if name == "" { 14 | name = "transformer" 15 | } 16 | 17 | m := &Transformer{ 18 | ModuleBase: ModuleBase{ 19 | g: g, 20 | name: name, 21 | }, 22 | embedding: NewLinear(name+".embedding", hidden_size, hidden_size, dtype, g), 23 | layers: make([]*TransformerLayer, num_layers), 24 | ln_final: NewLayerNorm(name+".ln_final", hidden_size, dtype, g), 25 | } 26 | 27 | for i := 0; i < num_layers; i++ { 28 | m.layers[i] = NewTransformerLayer( 29 | fmt.Sprintf("%s.layer_%d", name, i), 30 | hidden_size, 31 | num_heads, 32 | mlp_ratio, 33 | dtype, 34 | g, 35 | ) 36 | } 37 | 38 | return m 39 | } 40 | 41 | func (m *Transformer) Forward(x *Tensor) *Tensor { 42 | x = m.embedding.Linear(x) 43 | 44 | for _, layer := range m.layers { 45 | x = layer.Forward(x) 46 | } 47 | 48 | x = m.ln_final.LayerNorm(x) 49 | return x 50 | } 51 | -------------------------------------------------------------------------------- /front/go/example/1/1_app.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "os" 5 | 6 | "github.com/array2d/deepx/front/go/deepx" 7 | ) 8 | 9 | type Module1 struct { 10 | g *deepx.Graph 11 | } 12 | 13 | func (m *Module1) Linear(input *deepx.Tensor) *deepx.Tensor { 14 | // 创建输入节点 15 | w_node := m.g.AddTensor("W", deepx.DtypeFloat32, []int{3, 4, 5}, true) 16 | 17 | // 自动构建计算图 18 | y := input.Matmul(w_node.Tensor()) 19 | 20 | b_node := m.g.AddTensor("b", deepx.DtypeFloat32, []int{1, 4, 5}, true) 21 | z := y.Add(b_node.Tensor()) 22 | return z 23 | } 24 | func (m *Module1) Forward() (z *deepx.Tensor) { 25 | x_node := m.g.AddTensor("Input", deepx.DtypeFloat32, []int{1, 2, 3}, true) 26 | z = x_node.Tensor() 27 | for i := 0; i < 2; i++ { 28 | z = m.Linear(z) 29 | } 30 | 31 | return z 32 | } 33 | 34 | func main() { 35 | module := 
&Module1{ 36 | g: deepx.NewGraph(), 37 | } 38 | module.Forward() 39 | 40 | dot := module.g.ToDOT() 41 | os.WriteFile("1_app.dot", []byte(dot), 0644) 42 | } 43 | -------------------------------------------------------------------------------- /front/go/example/3/3_transformer_app.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "os" 5 | 6 | "github.com/array2d/deepx/front/go/deepx" 7 | ) 8 | 9 | func main() { 10 | // 创建计算图 11 | g := deepx.NewGraph() 12 | 13 | // 创建 Transformer 配置 14 | config := struct { 15 | hidden_size int 16 | num_heads int 17 | num_layers int 18 | mlp_ratio int 19 | dtype deepx.Dtype 20 | }{ 21 | hidden_size: 256, 22 | num_heads: 4, 23 | num_layers: 2, 24 | mlp_ratio: 4, 25 | dtype: deepx.DtypeFloat32, 26 | } 27 | 28 | // 创建 Transformer 模型 29 | transformer := deepx.NewTransformer( 30 | "transformer", 31 | config.num_layers, 32 | config.hidden_size, 33 | config.num_heads, 34 | config.mlp_ratio, 35 | config.dtype, 36 | g, 37 | ) 38 | 39 | // 创建输入张量 40 | batch_size := 1 41 | seq_len := 32 42 | input := g.AddTensor( 43 | "input", 44 | config.dtype, 45 | []int{batch_size, seq_len, config.hidden_size}, 46 | true, 47 | ) 48 | 49 | // 前向计算,构建计算图 50 | transformer.Forward(input.Tensor()) 51 | 52 | // 将计算图导出为 DOT 格式 53 | dot := g.ToDOT() 54 | os.WriteFile("transformer.dot", []byte(dot), 0644) 55 | } 56 | -------------------------------------------------------------------------------- /front/go/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/array2d/deepx/front/go 2 | 3 | go 1.23.2 4 | -------------------------------------------------------------------------------- /front/py/.cursorrules: -------------------------------------------------------------------------------- 1 | 回答要求: 2 | Always respond in 中文 3 | 不要回答重复的内容(如我提问中的代码) 4 | 由于作者是py新手,请多注释python语法和库的用法 5 | 不要增加"假设Tensor类在这个路径下"这种一眼就被看出是AI写的注释,不要让我的代码看起来像AI生成 6 | 7 | 
项目介绍: 8 | 此项目名为deepx的py部分 9 | 是deepx的python部分,项目路径为/home/lipeng/code/ai/deepx/front/py 10 | deepx的对外暴露的设计,尽可能接近pytorch的API 11 | deepx_py是前端,负责搭建深度学习模型,但并不参与实际数据存储和计算 12 | deepx_py虽然有类似pytorch的API,但只是调用excuter内的对应实现 13 | deepx_py依赖网络api调用后端excuter的计算 14 | deepx_py的每个OP,都会在excuter内找到对应的实现 -------------------------------------------------------------------------------- /front/py/deepx/.cursorrules: -------------------------------------------------------------------------------- 1 | deepx-py计划像素级复刻pytorch,照顾开发者的使用习惯 2 | -------------------------------------------------------------------------------- /front/py/deepx/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .vscode 3 | -------------------------------------------------------------------------------- /front/py/deepx/README.md: -------------------------------------------------------------------------------- 1 | # deepx-py 2 | 3 | deepx-py是为了构建抽象计算图的py接口库 4 | 5 | 供算法人员搭建模型使用 6 | 7 | ## 存算一体,控制分离 8 | 9 | deepx-py是控制侧,负责生成计算图,通过通信调用其他进程 10 | 11 | 在excuter中,实现了具体的存算引擎 12 | 13 | 14 | 15 | ## 接口设计 16 | deepx-py计划像素级复刻pytorch,照顾开发者的使用习惯 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /front/py/deepx/__init__.py: -------------------------------------------------------------------------------- 1 | from .tensor import Tensor,Shape,Number 2 | from deepx.nn.functional import * # 导入所有functional函数 3 | from deepx.nn.functional import __all__ as _func_all # 获取functional的导出列表 4 | 5 | __all__ = [ 6 | #tensor 7 | 'Tensor','Shape','Number', 8 | *_func_all 9 | ] 10 | 11 | # 为了支持 import deepx as dx 的用法 12 | tensor = Tensor -------------------------------------------------------------------------------- /front/py/deepx/nn/__init__.py: -------------------------------------------------------------------------------- 1 | from .deepxir import * 2 | 3 | __all__ = [ 4 | "DeepxIR","DeepxIRResp" 5 | ] 

from .leaffunc_life import *
from .leaffunc_io import *
from .leaffunc_init import *
from .leaffunc_changeshape import *
from .leaffunc_elementwise import *
from .leaffunc_matmul import matmul
from .leaffunc_reduce import reducemax, reducemin, sum, prod

from .authormap import defaultauthor

from .reduce import mean
from .activite import *
from .elementwise import *
from .normalization import *
from .changeshape import *

# Public API of deepx.nn.functional, re-exported by the package root.
# Fix: "printtensor" was previously listed twice (under both the life and io
# groups); __all__ entries should be unique, so it is kept only under io.
__all__ = [

    # leaffunc
    "newtensor", "rnewtensor", "load",  # life
    "printtensor", "save",  # io
    "constant", "constant_", "dropout", "full", "zeros", "ones", "uniform", "uniform_", "arange", "arange_",
    "kaiming_uniform", "kaiming_uniform_",
    "add", "sub", "mul", "div",
    "sqrt", "pow", "exp", "log",
    "min", "max",
    "less", "greater", "equal", "notequal",
    "switch", "where",
    "todtype",
    "invert",
    "matmul",
    "reducemax", "reducemin", "sum", "prod",
    "reshape", "permute", "transpose", "concat", "broadcastTo", "broadcast_to", "indexselect", "repeat",

    # functional
    "relu", "sigmoid", "swish", "silu",
    "mean",
    "rsqrt",
    "softmax",
    "squeeze", "unsqueeze",

    # other
    "calculate_fan_in_and_fan_out",
]
from deepx import Tensor
from .leaffunc_changeshape import reshape


def squeeze(t: Tensor, dim: int) -> Tensor:
    """Remove dimension ``dim`` from ``t``'s shape if it has size 1.

    Follows torch.squeeze semantics: when ``t.shape[dim] != 1`` the tensor is
    returned unchanged. (The original popped the dim unconditionally, which
    silently produced a wrong shape for non-size-1 dims.)
    Negative ``dim`` is normalized with ``dim % t.ndim``.
    """
    assert isinstance(dim, int)
    assert isinstance(t, Tensor)
    dim = dim % t.ndim
    if t.shape[dim] != 1:
        # torch.squeeze is a no-op for dims whose size is not 1
        return t
    newshape = list(t.shape)
    newshape.pop(dim)
    return reshape(t, tuple(newshape))


def unsqueeze(t: Tensor, dim: int) -> Tensor:
    """Insert a size-1 dimension at position ``dim``.

    Fix: normalize with ``t.ndim + 1`` so ``dim == t.ndim`` (and ``dim == -1``)
    appends a trailing dimension, matching torch.unsqueeze's accepted range of
    [-ndim-1, ndim]; the original's ``dim % t.ndim`` wrapped ``ndim`` to 0.
    """
    assert isinstance(dim, int)
    assert isinstance(t, Tensor)
    dim = dim % (t.ndim + 1)
    newshape = list(t.shape)
    newshape.insert(dim, 1)
    return reshape(t, tuple(newshape))
newtensor(shape:tuple[int,...],dtype:str='float32',name:str=None): 5 | assert isinstance(shape,tuple) 6 | for i in shape: 7 | assert isinstance(i,int) 8 | assert isinstance(dtype,str) 9 | assert isinstance(name,str) or name is None 10 | 11 | t=Tensor(shape=shape,dtype=dtype,name=name) 12 | from .rtf_life import rtf_newtensor 13 | rtf_newtensor(t) 14 | return t 15 | 16 | def rnewtensor(t:Tensor): 17 | from .rtf_life import rtf_newtensor 18 | rtf_newtensor(t) 19 | return t 20 | 21 | def copytensor(t:Tensor,out:Tensor): 22 | from .rtf_life import rtf_copytensor 23 | rtf_copytensor(t,out) 24 | 25 | 26 | def deltensor(t:Tensor): 27 | from .rtf_life import rtf_deltensor 28 | rtf_deltensor(t) 29 | def renametensor(t:Tensor,new_name:str): 30 | assert isinstance(t,Tensor) 31 | assert isinstance(new_name,str) and new_name != '' 32 | assert t.name is not None and t.name != '' 33 | 34 | from .rtf_life import rtf_renametensor 35 | rtf_renametensor(t,new_name) 36 | 37 | def load(path:str)->Tensor: 38 | from .rtf_io import rtf_load 39 | return rtf_load(path) 40 | -------------------------------------------------------------------------------- /front/py/deepx/nn/functional/leaffunc_matmul.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | from deepx import Tensor,Shape 4 | from .leaffunc_life import newtensor 5 | from .authormap import defaultauthor 6 | 7 | def matmul(a:Tensor,b:Tensor,out:Union[Tensor,str]='',bench:int=None)->Tensor: 8 | outtensor=out 9 | if isinstance(out,str) or out is None: 10 | outshape=Shape.matmul(a.shape,b.shape) 11 | outtensor=newtensor(outshape,dtype=a.dtype,name=out) 12 | from .rtf_matmul import rtf_matmul 13 | rtf_matmul(a,b,outtensor,defaultauthor['matmul'],bench) 14 | return outtensor 15 | -------------------------------------------------------------------------------- /front/py/deepx/nn/functional/leaffunc_reduce.py: 
from deepx.tensor import Tensor, Shape
from .leaffunc_reduce import sum
from .leaffunc_life import newtensor


def mean(a: Tensor, dim: tuple[int, ...] = None, keepdim: bool = False) -> Tensor:
    """Mean of ``a`` over dimensions ``dim`` (all dimensions when None).

    Implemented as sum over ``dim`` followed by in-place division by the
    number of reduced elements. ``keepdim`` keeps reduced dims as size 1.
    """
    assert isinstance(a, Tensor)
    if dim is None:
        dim = list(range(a.ndim))
    else:
        # Fix: normalize negative axes by VALUE. The original looped
        # `for i in dim: dim[i] = i + a.dim()`, using the axis value as a
        # list index, which scrambles entries for inputs like dim=(0, -2).
        dim = [d % a.ndim for d in dim]
    # Number of elements being averaged over.
    total = 1
    for d in dim:
        total *= a.shape[d]
    reduceshape = Shape.reduceshape(a.shape, dim, keepdim)
    out = newtensor(reduceshape, dtype=a.dtype)
    sum(a, tuple(dim), keepdim, out)
    out.div_(total)
    return out
-------------------------------------------------------------------------------- /front/py/deepx/nn/functional/rtf.py: -------------------------------------------------------------------------------- 1 | from deepx.tensor import Tensor 2 | from deepx.nn.deepxir import DeepxIR,Param 3 | from deepx.scheduler import send 4 | from typing import Union 5 | def A_B_op_C(op:str,a:Tensor,b:Tensor,out:Tensor,author='miaobyte'): 6 | args=[Param.tensor(a),Param.tensor(b)] 7 | returns=[Param.tensor(out)] 8 | ir=DeepxIR(op, args, returns,author) 9 | send(ir) 10 | 11 | def A_B_c_op_D(op:str,a:Tensor,b:Tensor,c:Union[float,int],out:Tensor,author='miaobyte'): 12 | args=[Param.tensor(a),Param.tensor(b),Param.varnum(c)] 13 | returns=[Param.tensor(out)] 14 | ir=DeepxIR(op, args, returns,author) 15 | send(ir) 16 | def A_scalar_c_op_D(op:str,a:Tensor,scalar:Union[float,int],c:Union[float,int],out:Tensor,author='miaobyte'): 17 | args=[Param.tensor(a),Param.varnum(scalar),Param.varnum(c)] 18 | returns=[Param.tensor(out)] 19 | ir=DeepxIR(op, args, returns,author) 20 | send(ir) 21 | 22 | def A_scalar_op(op:str,a:Tensor,b:Union[float,int],author='miaobyte'): 23 | args=[Param.tensor(a),Param.varnum(b)] 24 | returns=[] 25 | ir=DeepxIR(op, args, returns,author) 26 | send(ir) 27 | 28 | def A_scalar_op_C(op:str,a:Tensor,b:Union[float,int],out:Tensor,author='miaobyte'): 29 | args=[Param.tensor(a),Param.varnum(b)] 30 | returns=[Param.tensor(out)] 31 | ir=DeepxIR(op, args, returns,author) 32 | send(ir) 33 | 34 | def A_op_C(op:str,a:Tensor,out:Tensor,author='miaobyte'): 35 | args=[Param.tensor(a)] 36 | returns=[Param.tensor(out)] 37 | ir=DeepxIR(op, args, returns,author) 38 | send(ir) 39 | 40 | def A_b1_b2_op_C(op:str,a:Tensor,b1:tuple[int],b2:bool,out:Tensor,author='miaobyte'): 41 | args=[Param.tensor(a),Param.vector(b1,'int32'),Param.varbool(b2)] 42 | returns=[Param.tensor(out)] 43 | ir=DeepxIR(op, args, returns,author) 44 | send(ir) 45 | 46 | 
-------------------------------------------------------------------------------- /front/py/deepx/nn/functional/rtf_changeshape.py: -------------------------------------------------------------------------------- 1 | from deepx.tensor import Tensor 2 | from deepx.nn.deepxir import DeepxIR,Param 3 | from deepx.scheduler import send 4 | 5 | def rtf_reshape(t:Tensor,shape:tuple[int],out:Tensor,author='miaobyte'): 6 | args=[Param.tensor(t),Param.vector(shape,'int32')] 7 | returns=[Param.tensor(out)] 8 | ir=DeepxIR("reshape", args, returns,author) 9 | send(ir) 10 | 11 | 12 | def rtf_transpose(t:Tensor,dimorder:tuple[int],out:Tensor,author='miaobyte'): 13 | args=[Param.tensor(t),Param.vector(dimorder,'int32')] 14 | returns=[Param.tensor(out)] 15 | ir=DeepxIR("transpose", args, returns,author) 16 | send(ir) 17 | 18 | def rtf_concat(tensors:tuple[Tensor],dim:int,out:Tensor,author='miaobyte'): 19 | args=[Param.listtensor(tensors),Param.varnum(dim)] 20 | returns=[Param.tensor(out)] 21 | ir=DeepxIR("concat", args, returns,author) 22 | send(ir) 23 | 24 | 25 | def rtf_broadcastTo(t:Tensor,new_shape:tuple[int],out:Tensor,author='miaobyte'): 26 | args=[Param.tensor(t),Param.vector(new_shape,'int32')] 27 | returns=[Param.tensor(out)] 28 | ir=DeepxIR("broadcastTo", args, returns,author) 29 | send(ir) 30 | 31 | def rtf_indexselect(input:Tensor,indices:Tensor,axis:int,out:Tensor,author='miaobyte'): 32 | assert axis>=0 and axis0 42 | args=[Param.tensor(input),Param.vector(repeats,'int32')] 43 | returns=[Param.tensor(out)] 44 | ir=DeepxIR("repeat", args, returns,author) 45 | send(ir) -------------------------------------------------------------------------------- /front/py/deepx/nn/functional/rtf_init.py: -------------------------------------------------------------------------------- 1 | from deepx.tensor import Tensor 2 | from deepx.nn.deepxir import DeepxIR,Param 3 | from deepx.scheduler import send 4 | from typing import Union,Optional 5 | from .rtf import A_scalar_op 6 | 7 | # 初始化 
8 | def rtf_arange(t:Tensor,start:Optional[Union[float,int]]=0,step:Optional[Union[float,int]]=1,author='miaobyte')->Tensor: 9 | args=[Param.varnum(start),Param.varnum(step)] 10 | returns=[Param.tensor(t)] 11 | ir=DeepxIR("arange", args, returns,author) 12 | send(ir) 13 | return t 14 | 15 | def rtf_uniform(t:Tensor,low=0, high=1,seed:int=0,author='miaobyte')->Tensor: 16 | args=[Param.varnum(low),Param.varnum(high),Param.varnum(seed)] 17 | returns=[Param.tensor(t)] 18 | ir=DeepxIR("uniform", args, returns,author) 19 | send(ir) 20 | return t 21 | 22 | def rtf_normal(t:Tensor,mean:float=0, stddev:float=1,seed:int=0,author='miaobyte')->Tensor: 23 | args=[Param.varnum(mean),Param.varnum(stddev),Param.varnum(seed)] 24 | returns=[Param.tensor(t)] 25 | ir=DeepxIR("normal", args, returns,author) 26 | send(ir) 27 | return t 28 | 29 | # 填充 30 | def rtf_constant(t:Tensor,value:Union[float,int]=0,author='miaobyte')->Tensor: 31 | args=[Param.varnum(value)] 32 | returns=[Param.tensor(t)] 33 | ir=DeepxIR("constant", args, returns,author) 34 | send(ir) 35 | return t 36 | 37 | def rtf_dropout(a:Tensor, p:float, seed:int, author='miaobyte')->Tensor: 38 | assert isinstance(p,float) and 0<=p<=1 39 | assert isinstance(seed,int) 40 | 41 | args=[Param.varnum(p),Param.varnum(seed)] 42 | returns=[Param.tensor(a)] 43 | ir=DeepxIR("dropout",args,returns,author) 44 | send(ir) 45 | return a -------------------------------------------------------------------------------- /front/py/deepx/nn/functional/rtf_io.py: -------------------------------------------------------------------------------- 1 | from deepx.tensor import Tensor,loadShape 2 | from deepx.nn import DeepxIR,Param 3 | from deepx.scheduler import send 4 | 5 | def rtf_printtensor(t:Tensor,format='',author='miaobyte'): 6 | args=[Param.tensor(t),Param.varstr(format)] 7 | returns=[] 8 | ir=DeepxIR("print", args, returns,author) 9 | send(ir) 10 | return '' 11 | 12 | def rtf_save(t:Tensor,path:str): 13 | 
args=[Param.tensor(t),Param.varstr(path)] 14 | returns=[] 15 | ir=DeepxIR("save", args, returns) 16 | send(ir) 17 | return t 18 | 19 | def rtf_load(path:str)->Tensor: 20 | args=[Param.varstr(path)] 21 | returns=[] 22 | ir=DeepxIR("load", args, returns) 23 | send(ir) 24 | shapefile=path+'.shape' 25 | tensor_name,shape,dtype=loadShape(shapefile) 26 | return Tensor(shape.shape,dtype,tensor_name) 27 | 28 | def rtf_loadtensordata(t:Tensor,path:str)->Tensor: 29 | args=[Param.varstr(path)] 30 | returns=[Param.tensor(t)] 31 | ir=DeepxIR("loadtensordata", args, returns) 32 | send(ir) 33 | return t -------------------------------------------------------------------------------- /front/py/deepx/nn/functional/rtf_life.py: -------------------------------------------------------------------------------- 1 | from deepx.tensor import Tensor 2 | from deepx.nn.deepxir import DeepxIR,Param 3 | from deepx.scheduler import send 4 | 5 | def rtf_newtensor(t:Tensor): 6 | assert isinstance(t,Tensor) 7 | args=[Param.vector(t.shape,'int32')] 8 | returns=[Param.tensor(t)] 9 | ir=DeepxIR("newtensor", args, returns,'') 10 | send(ir) 11 | 12 | 13 | def rtf_copytensor(t:Tensor,out:Tensor): 14 | assert isinstance(t,Tensor) 15 | assert isinstance(out,Tensor) 16 | assert t.shape==out.shape 17 | assert t.dtype==out.dtype 18 | 19 | args=[Param.tensor(t)] 20 | returns=[Param.tensor(out)] 21 | ir=DeepxIR("copytensor", args, returns,'') 22 | send(ir) 23 | 24 | 25 | 26 | def rtf_deltensor(t:Tensor): 27 | assert isinstance(t,Tensor) 28 | args=[] 29 | returns=[Param.tensor(t)] 30 | ir=DeepxIR("deltensor", args, returns,'') 31 | send(ir) 32 | 33 | def rtf_renametensor(t:Tensor,new_name:str): 34 | args=[Param.varstr(new_name)] 35 | returns=[Param.tensor(t)] 36 | ir=DeepxIR("renametensor", args, returns,'') 37 | send(ir) 38 | -------------------------------------------------------------------------------- /front/py/deepx/nn/functional/rtf_matmul.py: 
-------------------------------------------------------------------------------- 1 | from deepx.tensor import Tensor 2 | from deepx.nn import DeepxIR,Param 3 | from deepx.scheduler import send 4 | 5 | def rtf_matmul(a:Tensor,b:Tensor,out: Tensor ,author='cublas',bench:int=None): 6 | args=[Param.tensor(a),Param.tensor(b)] 7 | returns=[Param.tensor(out)] 8 | ir=DeepxIR("matmul", args, returns, author) 9 | if bench is not None: 10 | ir._metadata.openbench(bench) 11 | send(ir) 12 | return out -------------------------------------------------------------------------------- /front/py/deepx/nn/functional/rtf_reduce.py: -------------------------------------------------------------------------------- 1 | from deepx.tensor import Tensor 2 | from .rtf import A_b1_b2_op_C 3 | 4 | def rtf_sum(a:Tensor,dim:tuple[int],keepdim:bool,out: Tensor, author:str='miaobyte')->Tensor: 5 | A_b1_b2_op_C("sum",a,dim,keepdim,out,author) 6 | 7 | 8 | def rtf_prod(a:Tensor,dim:tuple[int],keepdim:bool,out:Tensor, author:str='miaobyte')->Tensor: 9 | A_b1_b2_op_C("prod",a,dim,keepdim,out,author) 10 | 11 | 12 | def rtf_reducemax(a:Tensor,dim:tuple[int],keepdim:bool,out:Tensor, author:str='miaobyte')->Tensor: 13 | A_b1_b2_op_C("reducemax",a,dim,keepdim,out,author) 14 | 15 | 16 | def rtf_reducemin(a:Tensor,dim:tuple[int],keepdim:bool,out:Tensor, author:str='miaobyte')->Tensor: 17 | A_b1_b2_op_C("reducemin",a,dim,keepdim,out,author) 18 | -------------------------------------------------------------------------------- /front/py/deepx/nn/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .module import Module, Sequential 2 | from .linear import Linear 3 | from .sparse import Embedding 4 | __all__ = [ 5 | "Module", 6 | "Linear", 7 | "Sequential", 8 | "Embedding", 9 | ] 10 | -------------------------------------------------------------------------------- /front/py/deepx/nn/modules/activation.py: 
class Swiglu(Module):
    """SwiGLU activation module: swish(x @ W, beta) * (x @ V)."""

    def __init__(self):
        super().__init__()
        # Learnable projection matrices (placeholder (1,1) ones-init; sized by caller).
        self.W = ones(shape=(1,1),name=self.full_name+"_W")
        self.V = ones(shape=(1,1),name=self.full_name+"_V")

    @staticmethod
    def swiglu(
            x: Tensor,
            W: Tensor,  # first projection matrix
            V: Tensor,  # second projection matrix
            beta: float = 1.0,  # scaling factor inside the swish gate
            out: Union[Tensor,str] = '') -> Tensor:
        """Compute swish(x @ W, beta) elementwise-multiplied by (x @ V)."""
        from deepx.nn.functional import swish
        result = swish(x@W, beta=beta).mul(x@V, out=out)
        return result

    def forward(self, input: Tensor) -> Tensor:
        # @staticmethod above is the bug fix: as a plain instance method,
        # `self.swiglu(input, self.W, self.V)` bound `self` to the `x`
        # parameter, shifting every argument by one (input became W, etc.).
        return self.swiglu(input, self.W, self.V)
-------------------------------------------------------------------------------- /front/py/deepx/nn/modules/normalization.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/front/py/deepx/nn/modules/normalization.py -------------------------------------------------------------------------------- /front/py/deepx/nn/modules/padding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/front/py/deepx/nn/modules/padding.py -------------------------------------------------------------------------------- /front/py/deepx/nn/modules/pooling.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/front/py/deepx/nn/modules/pooling.py -------------------------------------------------------------------------------- /front/py/deepx/nn/modules/rmsnorm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/front/py/deepx/nn/modules/rmsnorm.py -------------------------------------------------------------------------------- /front/py/deepx/nn/modules/rnn.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/front/py/deepx/nn/modules/rnn.py -------------------------------------------------------------------------------- /front/py/deepx/nn/parameter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/front/py/deepx/nn/parameter.py 
-------------------------------------------------------------------------------- /front/py/deepx/optim/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/front/py/deepx/optim/__init__.py -------------------------------------------------------------------------------- /front/py/deepx/optim/adam.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/front/py/deepx/optim/adam.py -------------------------------------------------------------------------------- /front/py/deepx/optim/optimizer.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | from deepx.tensor import Tensor 3 | class Optimizer: 4 | def __init__(self, 5 | params:list[Tensor], 6 | defaults: dict[str, Any]) -> None: 7 | self.params = params 8 | self.defaults = defaults 9 | 10 | def step(self): 11 | pass -------------------------------------------------------------------------------- /front/py/deepx/optim/sgd.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | from .optimizer import Optimizer 3 | from deepx.tensor import Tensor 4 | 5 | class SGD(Optimizer): 6 | def __init__(self, 7 | params:list[Tensor], 8 | defaults: dict[str, Any]) -> None: 9 | super().__init__(params, defaults) 10 | 11 | def step(self): 12 | for param in self.params: 13 | param.data -= self.defaults['lr'] * param.grad 14 | -------------------------------------------------------------------------------- /front/py/deepx/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/front/py/deepx/requirements.txt 
-------------------------------------------------------------------------------- /front/py/deepx/scheduler/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .client.allclient import send 3 | 4 | __all__ = [ 5 | "send", 6 | ] 7 | 8 | -------------------------------------------------------------------------------- /front/py/deepx/scheduler/client/allclient.py: -------------------------------------------------------------------------------- 1 | from .udpconn import _default_udpconn 2 | from typing import Optional 3 | from deepx.nn import DeepxIR,DeepxIRResp 4 | import time 5 | default_client = _default_udpconn 6 | 7 | 8 | _id_counter=0 9 | def send(ir:DeepxIR) -> DeepxIRResp: 10 | ir._sent_at=time.time() 11 | global _id_counter 12 | _id_counter=_id_counter+1 13 | ir._id=_id_counter 14 | s=str(ir) 15 | respstr=default_client.send(s) 16 | respir=DeepxIRResp(respstr) 17 | return respir 18 | -------------------------------------------------------------------------------- /front/py/deepx/scheduler/client/udpconn.py: -------------------------------------------------------------------------------- 1 | import socket 2 | from typing import Optional, Tuple 3 | import select 4 | 5 | class UDPConn: 6 | def __init__(self, endpoint: str = "localhost:9090"): 7 | # 解析endpoint 8 | self._host, port_str = endpoint.split(':') 9 | self._port = int(port_str) 10 | # 创建UDP socket 11 | self._sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) 12 | # 设置非阻塞模式 13 | self._sock.setblocking(False) 14 | # 设置接收缓冲区 15 | self._sock.setsockopt(socket.SOL_SOCKET, socket.SO_RCVBUF, 1024*1024) 16 | 17 | def send(self, ir: str) -> Optional[dict]: 18 | 19 | # 发送IR字符串 20 | try: 21 | # 将IR字符串编码为bytes并发送 22 | data = ir.encode('utf-8') 23 | self._sock.sendto(data, (self._host, self._port)) 24 | # 等待响应 25 | return self._wait_response() 26 | 27 | except Exception as e: 28 | print(f"发送IR失败: {e}") 29 | return None 30 | 31 | def _wait_response(self, 
timeout: float =10000) -> any: 32 | """等待并接收响应 33 | 34 | Args: 35 | timeout: 超时时间(秒) 36 | """ 37 | try: 38 | # 使用select实现超时等待 39 | ready = select.select([self._sock], [], [], timeout) 40 | if ready[0]: 41 | data, addr = self._sock.recvfrom(65536) # 64KB缓冲区 42 | response = data.decode('utf-8') 43 | return response 44 | return None 45 | 46 | except Exception as e: 47 | print(f"接收响应失败: {e}") 48 | return None 49 | 50 | def __del__(self): 51 | """确保socket正确关闭""" 52 | if hasattr(self, '_sock'): 53 | self._sock.close() 54 | 55 | # 全局单例实例 56 | _default_udpconn = UDPConn() 57 | -------------------------------------------------------------------------------- /front/py/deepx/scheduler/client/unixsocket.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/front/py/deepx/scheduler/client/unixsocket.py -------------------------------------------------------------------------------- /front/py/deepx/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='deepx', 5 | version='0.1.0', 6 | description='DeepX - 高性能深度学习框架的Python接口', 7 | author='igor.li', 8 | author_email='lipeng@mirrorsoft.cn', 9 | packages=find_packages(), 10 | install_requires=[ 11 | 'graphviz>=0.20.1', # 用于计算图可视化 12 | ], 13 | long_description=open("README.md").read(), 14 | long_description_content_type="text/markdown", 15 | url="https://github.com/array2d/deepx", 16 | classifiers=[ 17 | "Programming Language :: Python :: 3", 18 | "License :: OSI Approved :: MIT License", 19 | "Operating System :: OS Independent", 20 | ], 21 | python_requires='>=3.7', # 确保支持数据类型注解 22 | ) -------------------------------------------------------------------------------- /front/py/deepx/tensor/__init__.py: -------------------------------------------------------------------------------- 1 | from .tensor import * 2 
| from .shape import Shape 3 | from .elementwise import * # 导入所有包含@tensor_method装饰的方法 4 | from .matmul import * # 导入矩阵乘法相关方法 5 | from .changeshape import * # 导入转置方法 6 | from .init import * 7 | from .reduce import * 8 | from .io import * 9 | __all__ = [ 10 | 'Shape', 11 | 'Tensor', 12 | 'tensor_method', 13 | 'Number', 14 | 'loadShape', 15 | # 'lt', 'gt', 'eq', 16 | # 'sin', 'cos', 'tan', 17 | # 'DType', 18 | # '_dtype_to_typestr' 19 | ] -------------------------------------------------------------------------------- /front/py/deepx/tensor/init.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | from deepx.tensor import tensor_method 3 | 4 | # 填充 5 | @tensor_method 6 | def full_(self,value:Union[float,int]): 7 | from deepx.nn.functional import constant_ as constant_func 8 | constant_func(self,value=value) 9 | 10 | @tensor_method 11 | def dropout_(self,p:float=0.5,seed:int=None): 12 | from deepx.nn.functional import dropout as dropout_func 13 | dropout_func(self,p,seed) 14 | return self 15 | 16 | 17 | @tensor_method 18 | def zeros_(self): 19 | from deepx.nn.functional import constant_ as constant_func 20 | constant_func(self,value=0) 21 | 22 | @tensor_method 23 | def ones_(self): 24 | from deepx.nn.functional import constant_ as constant_func 25 | constant_func(self,value=1) 26 | 27 | @tensor_method 28 | def uniform_(self,low=0, high=1,seed:int=None): 29 | from deepx.nn.functional import uniform_ as uniform_func 30 | uniform_func(self,low=low, high=high,seed=seed) 31 | 32 | @tensor_method 33 | def arange_(self,start=0,step=1): 34 | from deepx.nn.functional import arange_ as arange_func 35 | arange_func(self,start,step) 36 | 37 | @tensor_method 38 | def normal_(self,mean=0, stddev=1,seed:int=None): 39 | from deepx.nn.functional import normal_ as normal_func 40 | normal_func(self,mean,stddev,seed) 41 | 42 | @tensor_method 43 | def rand_(self): 44 | #todo 45 | pass 46 | 47 | @tensor_method 48 | def 
randn_(self): 49 | #todo 50 | pass 51 | @tensor_method 52 | def eye_(self,n,m=None): 53 | #todo 54 | pass 55 | -------------------------------------------------------------------------------- /front/py/deepx/tensor/io.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import os 3 | from deepx.tensor import Shape,Tensor,tensor_method 4 | 5 | def loadShape(path:str)->tuple[str,Shape,str]: 6 | filename = os.path.basename(path) 7 | if filename.endswith('.shape'): 8 | with open(path, 'r') as f: 9 | shape = yaml.safe_load(f) 10 | else: 11 | raise ValueError("文件名必须以.shape结尾") 12 | 13 | tensor_name = filename[:-6] # 移除'.shape'后缀 14 | return (tensor_name,Shape(tuple(shape['shape'])),shape['dtype']) 15 | @tensor_method 16 | def loadData(self,path:str): 17 | from deepx.nn.functional import loadData as loadData_func 18 | loadData_func(self,path) 19 | 20 | @tensor_method 21 | def save(self,path:str): 22 | from deepx.nn.functional import save as save_func 23 | save_func(self,path) 24 | 25 | 26 | -------------------------------------------------------------------------------- /front/py/deepx/tensor/matmul.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | from .tensor import Tensor,tensor_method 4 | 5 | @tensor_method 6 | def matmul(self:Tensor,other:Tensor,out:Union[Tensor,str]=''): 7 | from deepx.nn.functional import matmul as matmul_func 8 | return matmul_func(self,other,out) -------------------------------------------------------------------------------- /front/py/deepx/tensor/reduce.py: -------------------------------------------------------------------------------- 1 | 2 | from typing import Union 3 | 4 | from deepx.tensor import Tensor,tensor_method 5 | 6 | @tensor_method 7 | def reducemax(self, dim:tuple[int,...],keepdim:bool=False,out:Union[Tensor,str]='')->Tensor: 8 | assert isinstance(dim,tuple) 9 | for i in dim: 10 | assert isinstance(i,int) 11 
| from deepx.nn.functional import reducemax as reduce_max_func 12 | return reduce_max_func(self,dim,keepdim,out) 13 | 14 | @tensor_method 15 | def reducemin(self, dim:tuple[int,...],keepdim:bool=False,out:Union[Tensor,str]='')->Tensor: 16 | assert isinstance(dim,tuple) 17 | for i in dim: 18 | assert isinstance(i,int) 19 | from deepx.nn.functional import reducemin as reduce_min_func 20 | return reduce_min_func(self,dim,keepdim,out) 21 | 22 | 23 | @tensor_method 24 | def sum(self, dim:tuple[int,...],keepdim:bool=False,out:Union[Tensor,str]='')->Tensor: 25 | assert isinstance(dim,tuple) 26 | for i in dim: 27 | assert isinstance(i,int) 28 | from deepx.nn.functional import sum as sum_func 29 | return sum_func(self,dim,keepdim,out) 30 | 31 | @tensor_method 32 | def prod(self, dim:tuple[int,...],keepdim:bool=False,out:Union[Tensor,str]='')->Tensor: 33 | assert isinstance(dim,tuple) 34 | for i in dim: 35 | assert isinstance(i,int) 36 | from deepx.nn.functional import prod as prod_func 37 | return prod_func(self,dim,keepdim,out) 38 | 39 | @tensor_method 40 | def mean(self,dim:tuple[int,...],keepdim:bool=False)->Tensor: 41 | assert isinstance(dim,tuple) 42 | for i in dim: 43 | assert isinstance(i,int) 44 | from deepx.nn.functional import mean as mean_func 45 | return mean_func(self,dim,keepdim) 46 | -------------------------------------------------------------------------------- /front/py/deepx/transformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/front/py/deepx/transformer/__init__.py -------------------------------------------------------------------------------- /front/py/deepx/transformer/attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/front/py/deepx/transformer/attention.py 
-------------------------------------------------------------------------------- /front/py/deepx/transformer/decoder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/front/py/deepx/transformer/decoder.py -------------------------------------------------------------------------------- /front/py/deepx/transformer/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/front/py/deepx/transformer/models/__init__.py -------------------------------------------------------------------------------- /front/py/deepx/transformer/models/llama/__init__.py: -------------------------------------------------------------------------------- 1 | from .embedding import * 2 | __all__ = [ 3 | "LlamaRotaryEmbedding" 4 | ] -------------------------------------------------------------------------------- /front/py/deepx/transformer/models/llama/groupedquery_attention.py: -------------------------------------------------------------------------------- 1 | from typing import Optional,Tuple 2 | from deepx.nn.modules import Module,Linear 3 | from deepx import Tensor,matmul,softmax,concat,arange,dropout as dropout_func 4 | 5 | 6 | def repeat_kv(hidden_states: Tensor, n_rep: int) -> Tensor: 7 | batch, num_key_value_heads, slen, head_dim = hidden_states.shape 8 | if n_rep == 1: 9 | return hidden_states 10 | hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) 11 | return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) 12 | 13 | -------------------------------------------------------------------------------- /front/py/deepx/transformer/models/llama/mlp.py: -------------------------------------------------------------------------------- 1 | from deepx.nn.functional 
class LlamaMLP(Module):
    """Llama feed-forward block: down_proj( act(gate_proj(x)) * up_proj(x) )."""

    def __init__(self, config:dict):
        super().__init__()
        # NOTE(review): the original mixed attribute access (config.hidden_size,
        # config.mlp_bias, config.hidden_act) with mapping access
        # (config["intermediate_size"]). The parameter is annotated `dict`, and a
        # plain dict has no such attributes, so attribute access raised
        # AttributeError on the first line. Mapping access is used consistently.
        # 输入层大小 (input/hidden width)
        self.hidden_size = config["hidden_size"]
        # 中间层大小 (intermediate width)
        self.intermediate_size = config["intermediate_size"]
        # 门控投影层 (gate projection)
        self.gate_proj = Linear(self.hidden_size, self.intermediate_size, bias=config["mlp_bias"])
        # 上投影层 (up projection)
        self.up_proj = Linear(self.hidden_size, self.intermediate_size, bias=config["mlp_bias"])
        # 下投影层 (down projection)
        self.down_proj = Linear(self.intermediate_size, self.hidden_size, bias=config["mlp_bias"])
        # 激活函数 (activation, looked up by name — e.g. "silu" -> swish)
        self.act_fn = ACT2FN[config["hidden_act"]]

    def forward(self, x):
        # SwiGLU-style gating: elementwise product of the activated gate path
        # and the up projection, then projected back down.
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        return down_proj
class Bench:
    """Decorator that prints how long each call to the wrapped function takes.

    Usage:
        @Bench("matmul")
        def f(...): ...
    """

    def __init__(self, name: str):
        # Label included in every timing report.
        self.name = name

    def __call__(self, func):
        def wrapper(*args, **kwargs):
            # Bug fixes vs original: `start_time` was never assigned (NameError
            # on the first call), `result` was never returned by `wrapper`, and
            # `wrapper` was never returned by `__call__`, so any decorated
            # function silently became None.
            start_time = time.perf_counter()
            result = func(*args, **kwargs)
            print(f"{self.name} took {time.perf_counter() - start_time} seconds to run")
            return result
        return wrapper
| from .trigonometric import sin, cos, tan 9 | 10 | __all__ = [ 11 | 'Tensor', 12 | 'zeros', 'ones', 'arange', 13 | 'add', 'sub', 'mul', 'div', 14 | 'matmul', 'dot', 15 | 'sum', 'mean', 'max', 'min', 16 | 'reshape', 'transpose', 17 | 'lt', 'gt', 'eq', 18 | 'sin', 'cos', 'tan' 19 | ] -------------------------------------------------------------------------------- /front/py/deepx/utils/data/dataloader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/front/py/deepx/utils/data/dataloader.py -------------------------------------------------------------------------------- /front/py/deepx/utils/data/dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/front/py/deepx/utils/data/dataset.py -------------------------------------------------------------------------------- /front/py/deepx/utils/data/sampler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/front/py/deepx/utils/data/sampler.py -------------------------------------------------------------------------------- /front/py/deepxutil/numpy/__init__.py: -------------------------------------------------------------------------------- 1 | from .io import * 2 | __all__ = [ 3 | 'save_numpy', 4 | ] 5 | -------------------------------------------------------------------------------- /front/py/deepxutil/numpy/io.py: -------------------------------------------------------------------------------- 1 | from deepx.tensor import Shape 2 | 3 | def save_numpy(t,tensorpath:str): 4 | r''' 5 | 保存numpy.ndarray为deepx.tensor格式 6 | t:numpy.ndarray 7 | tensorpath:str, 8 | ''' 9 | from numpy import ascontiguousarray,ndarray 10 | assert isinstance(t,ndarray) 11 | 
shape=Shape(t.shape) 12 | shape._dtype=str(t.dtype) 13 | shape.save(tensorpath+".shape") 14 | 15 | array = ascontiguousarray(t) 16 | array.tofile(tensorpath+'.data') 17 | return t 18 | -------------------------------------------------------------------------------- /front/py/deepxutil/torch/__init__.py: -------------------------------------------------------------------------------- 1 | from .io import * 2 | __all__ = [ 3 | 'save_torch', 4 | ] 5 | -------------------------------------------------------------------------------- /front/py/deepxutil/torch/io.py: -------------------------------------------------------------------------------- 1 | def save_torch(t,path:str): 2 | r''' 3 | 保存torch.Tensor为deepx.tensor格式 4 | ''' 5 | from torch import Tensor as torch_Tensor 6 | assert isinstance(t,torch_Tensor) 7 | t=t.detach().cpu().numpy() 8 | from deepxutil.numpy.io import save_numpy 9 | save_numpy(t,path) 10 | -------------------------------------------------------------------------------- /front/py/docs/api.rst: -------------------------------------------------------------------------------- 1 | API 文档 2 | ======== 3 | 4 | 激活函数 5 | -------- 6 | 7 | .. 
# Sphinx build configuration for the deepx Python documentation.
import os
import sys
sys.path.insert(0, os.path.abspath('..'))  # add the project root so autodoc can import deepx

# Extension configuration
extensions = [
    'sphinx.ext.autodoc',    # pull API docs from docstrings automatically
    'sphinx.ext.napoleon',   # support Google- and NumPy-style docstrings
    'sphinx.ext.mathjax',    # render math formulas
    'sphinx.ext.viewcode',   # link rendered docs to highlighted source
]

# Theme settings
html_theme = 'sphinx_rtd_theme'  # use the Read the Docs theme

# Project information
project = 'deepx'
copyright = '2024, Your Name'  # NOTE(review): placeholder metadata — fill in the real author
author = 'Your Name'
############-------PyTorch-------################
# Reference run: elementwise min/max of two int8 tensors in PyTorch.

print()
import torch
torch_t1 = torch.full((2,3,4, ), 10, dtype=torch.int8)
torch_t2 = torch.arange(24,dtype=torch.int8).reshape(2,3,4)
torch_t3= torch.min(torch_t2,torch_t1)
print(torch_t3)
torch_t4= torch.max(torch_t2,torch_t1)
print(torch_t4)


############-------DEEPX-------################
# Same computation expressed with deepx; printed output should match
# the PyTorch run above.

from deepx import Tensor,full,arange,min,max

print()

t1 = full((2,3,4), value=10,dtype="int8")
t2 = arange(0,24,dtype="int8").reshape_((2,3,4))
t3 = min(t2,t1)
t3.print()
t4 = max(t2,t1)
t4.print()
############-------PyTorch-------################
# Reference run: sqrt / log / exp / pow on float32 tensors in PyTorch.
print()

import torch
torch_t1 = torch.arange(3*4*5, dtype=torch.float32)
torch_t2 = torch.full((3*4*5,),2, dtype=torch.float32)

torch_t3 = torch.sqrt(torch_t1)
print(torch_t3)
torch_t4 = torch.log(torch_t2)
print(torch_t4)
torch_t5 = torch.exp(torch_t4)
print(torch_t5)
torch_t6 = torch.pow(torch_t5,torch_t3)
print(torch_t6)
torch_t7 = 2**torch_t1
print(torch_t7)
############-------DEEPX-------################
# Same pipeline in deepx; results should match the PyTorch run above.

import deepx
print()

t1 = deepx.arange(start=0,end=3*4*5,dtype='float32',name="t1")
t2 = deepx.full((3*4*5,),value=2,dtype='float32',name="t2")
t3 = deepx.sqrt(t1,out='t3')
t3.print()
t4 = deepx.log(t2,out='t4')
t4.print()
t5 = deepx.exp(t4,out='t5')
t5.print()
t6 = deepx.pow(t5,t3,out='t6')
t6.print()
t7 = 2**t1
t7.print()
def init_tokenizer(model_path):
    """Load a HuggingFace tokenizer from a local model directory.

    Assigns ``eos_token`` as the padding token, since the tokenizer used
    here may not define a dedicated pad token.

    Args:
        model_path: path of the pretrained model/tokenizer directory.

    Returns:
        The configured AutoTokenizer instance.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    tokenizer.pad_token = tokenizer.eos_token
    return tokenizer
############-------PyTorch-------################
import torch
import torch.nn as nn

# Reference: a 64 -> 4 linear layer applied to a batch of ones.
net = nn.Linear(64, 4)
torch_input = torch.ones(1, 64)
torch_output = net(torch_input)
print()
print(torch_output)


############-------DEEPX-------################
# Same network built with deepx modules. Weights are initialized
# independently, so printed values will differ from the PyTorch run.
from deepx.nn.modules import Linear
from deepx import ones

net = Linear(64, 4)
input=ones(1,64,name='input')
out=net.forward(input)
out.print()
dir='/home/lipeng/model/deepxmodel/llama/' 4 | 5 | 6 | 7 | ############### PyTorch 实现部分 ############### 8 | import torch 9 | # 使用小规模数据以便打印完整结果 10 | pt_input = torch.arange(48, dtype=torch.float32).reshape(2, 3, hidden_size) / 10.0 - 2.0 11 | print("PyTorch 输入:") 12 | print(pt_input) 13 | 14 | from transformers.models.llama.modeling_llama import LlamaRMSNorm as TransformersLlamaRMSNorm 15 | from deepxutil.torch import save_torch 16 | save_torch(pt_input,dir+'rmsnorm_input') 17 | # 使用transformers库中的官方LlamaRMSNorm实现 18 | pt_norm = TransformersLlamaRMSNorm(hidden_size, eps=eps) 19 | # 设置权重为固定值0.5 20 | with torch.no_grad(): 21 | pt_norm.weight.fill_(0.5) 22 | # 前向传播 23 | pt_output = pt_norm(pt_input) 24 | 25 | 26 | print("\nPyTorch RMSNorm 结果:") 27 | print(pt_output.shape) 28 | print(pt_output) 29 | 30 | 31 | ############### DeepX 实现部分 ############### 32 | from deepx import constant_,load 33 | from deepx.transformer.models.llama.normalization import LlamaRMSNorm 34 | 35 | input=load(dir+'rmsnorm_input') 36 | 37 | # DeepX计算流程 38 | norm = LlamaRMSNorm(hidden_size=hidden_size, eps=eps) 39 | # 设置相同的权重 40 | constant_(norm.weight, 0.5) 41 | # 前向计算 42 | output = norm(input) 43 | output.print() 44 | -------------------------------------------------------------------------------- /front/py/examples/4_transformer/llama/llama_: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/front/py/examples/4_transformer/llama/llama_ -------------------------------------------------------------------------------- /front/py/examples/4_transformer/llama/llama_rope.py: -------------------------------------------------------------------------------- 1 | from llama_rope_torch import dir,config 2 | 3 | ############-------DEEPX-------################ 4 | from deepx.nn.modules import Embedding,Module 5 | from deepx import load,arange 6 | from deepx.transformer.models.llama import 
class NetDeepx(Module):
    """Minimal deepx module: token embedding followed by rotary embedding.

    Mirrors the embedding + RoPE front end of a Llama-style model so its
    output can be compared against the PyTorch reference run.
    """

    def __init__(self, configdict: dict):
        super().__init__()
        # Embedding table is preloaded from the reference weights on disk.
        self.embed_tokens = Embedding(
            configdict["vocab_size"],
            configdict["hidden_size"],
            weight=embed_tokens_weight,
        )
        self.rotary_emb = LlamaRotaryEmbedding(config=configdict)
        # Dump the inverse-frequency buffer for manual comparison.
        print("rotary_emb.inv_freq")
        self.rotary_emb.inv_freq.print()

    def forward(self, x):
        # Embed token ids, then build 0..seq_len-1 position ids with a
        # leading batch axis and apply the rotary embedding.
        hidden_states = self.embed_tokens(x)
        seq_len = hidden_states.shape[1]
        position_ids = arange(start=0, end=seq_len).unsqueeze(0)
        return self.rotary_emb(hidden_states, position_ids)
h5py
numpy
pyyaml
# NOTE: "os" and "sys" were removed — they are Python standard-library
# modules, not pip-installable packages, and break `pip install -r`.
from setuptools import setup, find_packages

# Packaging metadata for the onnx_deepx converter tool.
setup(
    name='onnx_deepx',
    version='0.1.0',
    description='A simple ONNX model extractor',
    author='Lipeng',
    author_email='lipeng@mirrorsoft.cn',
    packages=find_packages(),
    install_requires=[
        'onnx',  # ONNX model loading dependency
    ],
    entry_points={
        'console_scripts': [
            # NOTE(review): both console scripts target extract_onnx_info, and
            # no toonnx module is visible in the repository tree — confirm that
            # these entry-point targets are intended.
            'todeepx=onnx_deepx.todeepx:extract_onnx_info',
            'toonnx=onnx_deepx.toonnx:extract_onnx_info',
        ],
    },
)
def main():
    """Load a safetensors model, report tensor statistics, and render its graph.

    Side effects: prints the model config and per-tensor parameter counts,
    then writes model_analysis/model_graph.png via graphviz.
    """
    # NOTE(review): hardcoded local model path — only runs on the author's machine.
    model_dir = "/home/lipeng/model/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

    # Load every tensor plus model-level metadata.
    loader = SafeTensorLoader(model_dir)
    tensors, metadata = loader.load()

    print("\nModel Configuration:")
    for key, value in metadata.get("model_config", {}).items():
        print(f"{key}: {value}")

    print("\nTensor Statistics:")
    total_params = 0
    for name, tensor in tensors.items():
        shape = tensor.shape
        num_params = tensor.data.size
        total_params += num_params
        print(f"{name}: shape={shape}, params={num_params:,}")

    print(f"\nTotal Parameters: {total_params:,}")

    # Build the computation graph from the same model directory.
    builder = SafeTensorGraphBuilder(model_dir)
    graph, _, _ = builder.build_graph()

    # Export a PNG visualization of the graph.
    output_dir = "model_analysis"
    os.makedirs(output_dir, exist_ok=True)

    dot = graph.to_dot()
    dot.render(os.path.join(output_dir, "model_graph"), format="png", cleanup=True)

    print(f"\n计算图已保存到 {output_dir}/model_graph.png")

if __name__ == "__main__":
    main()
11 | UNKNOWN 12 | 13 | -------------------------------------------------------------------------------- /model/safetensor_deepx/safetensor_deepx.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | README.md 2 | setup.py 3 | safetensor_deepx/__init__.py 4 | safetensor_deepx/graph.py 5 | safetensor_deepx/loader.py 6 | safetensor_deepx.egg-info/PKG-INFO 7 | safetensor_deepx.egg-info/SOURCES.txt 8 | safetensor_deepx.egg-info/dependency_links.txt 9 | safetensor_deepx.egg-info/requires.txt 10 | safetensor_deepx.egg-info/top_level.txt -------------------------------------------------------------------------------- /model/safetensor_deepx/safetensor_deepx.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /model/safetensor_deepx/safetensor_deepx.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.19.0 2 | safetensors>=0.3.0 3 | -------------------------------------------------------------------------------- /model/safetensor_deepx/safetensor_deepx.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | safetensor_deepx 2 | -------------------------------------------------------------------------------- /model/safetensor_deepx/safetensor_deepx/__init__.py: -------------------------------------------------------------------------------- 1 | from .loader import SafeTensorLoader, SafeTensorSaver 2 | from .graph import SafeTensorGraphBuilder 3 | 4 | __all__ = [ 5 | 'SafeTensorLoader', 6 | 'SafeTensorSaver', 7 | 'SafeTensorGraphBuilder' 8 | ] -------------------------------------------------------------------------------- /model/safetensor_deepx/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import 
class Context:
    """State container passed between a Function's forward and backward.

    Stores tensors and arbitrary keyed data saved during forward so the
    backward pass can retrieve them, plus an author map and the
    requires_grad flag.
    """

    def __init__(self, requires_grad=False):
        self._requires_grad = requires_grad
        self._saved_tensors = []
        self._non_tensor_data = {}
        self._authormap = {}

    def save_tensors(self, *tensors):
        # Preserve call order; get_tensor hands them back as a tuple.
        for tensor in tensors:
            self._saved_tensors.append(tensor)

    @property
    def get_tensor(self):
        """All tensors saved so far, as an immutable tuple."""
        return tuple(self._saved_tensors)

    def save_data(self, key, value):
        """Stash a non-tensor value under ``key`` for the backward pass."""
        self._non_tensor_data[key] = value

    def get_data(self, key):
        """Return the value stored under ``key``, or None when absent."""
        return self._non_tensor_data.get(key)

    def set_authormap(self, authormap: dict):
        self._authormap = authormap

    @property
    def authormap(self):
        return self._authormap

    @property
    def requires_grad(self):
        return self._requires_grad
class ControlFlowNode(Node):
    """Graph node representing a control-flow construct.

    Carries NodeType.CONTROL_FLOW so graph passes can distinguish it from
    data and op nodes.
    """

    def __init__(self, name=None):
        # Bug fix: the name argument was previously ignored and every
        # instance was hard-named "control_flow". Honor a caller-supplied
        # name, keeping "control_flow" as the default for compatibility.
        super().__init__(
            name=name if name is not None else "control_flow",
            ntype=NodeType.CONTROL_FLOW,
        )
class Node:
    """Base node of the autograd graph.

    A node belongs to a Graph (the default graph when none is given), has a
    NodeType, an optional owning Module, and a list of input nodes.
    """

    def __init__(self,
                 ntype:NodeType=None,
                 name:str=None,
                 graph=None,
                 ):
        from .graph import Graph
        # Bug fix: compare with `is None` (identity), not `== None`, which
        # would invoke Graph.__eq__ and can misbehave for custom classes.
        if graph is None:
            self._graph = Graph.get_default()
        else:
            self._graph = graph
        self._module = None
        self._ntype = ntype
        self._name = name
        self._inputs = []

    @property
    def ntype(self):
        return self._ntype

    @property
    def graph(self):
        return self._graph

    @property
    def name(self):
        return self._name

    def rename(self,name:str):
        """Replace this node's name."""
        self._name = name

    @property
    def fullname(self):
        """Dotted name including the owning module's path, when set."""
        if self._module is None:
            return self._name
        else:
            # NOTE(review): relies on Module exposing `full_name` — confirm
            # against the Module class definition.
            return f"{self._module.full_name}.{self._name}"

    def set_module(self,module):
        """Attach the owning Module; raises ValueError for other types."""
        from deepx.nn.modules import Module
        if isinstance(module,Module):
            self._module = module
        else:
            raise ValueError("module must be a Module")

    @property
    def module(self):
        return self._module

    @property
    def inputs(self):
        return self._inputs

    def add_input(self, input_node):
        """Append a predecessor node to this node's input list."""
        self._inputs.append(input_node)
// Registry of graph-optimization passes; use the REGISTER_PASS macro to
// register a pass at static-initialization time.
#pragma once

#include <functional>
#include <string>
#include <unordered_map>

namespace deepx
{
    // NOTE(review): the template arguments of this alias were lost when the
    // source was exported (angle-bracket content stripped). Reconstructed as
    // a nullary callable — confirm the real pass signature against callers.
    using pass_func = std::function<void()>;

    // Process-wide singleton mapping pass names to their functions.
    class PassRegistry
    {
    public:
        // Returns the single shared registry instance (defined in the .cpp).
        static PassRegistry &instance();

        // Associates `name` with `func`, overwriting any previous entry.
        void register_pass(const std::string &name, pass_func func);

        // Non-copyable: there must be exactly one registry.
        PassRegistry(const PassRegistry &) = delete;
        PassRegistry &operator=(const PassRegistry &) = delete;

    private:
        PassRegistry() = default;

    private:
        std::unordered_map<std::string, pass_func> registry_;
    };
}

// Registers `func` under `name` via a static object whose constructor runs
// before main().
#define REGISTER_PASS(name, func) \
    struct Register##name { \
        Register##name() { \
            PassRegistry::instance().register_pass(#name, func); \
        } \
    }; \
    static Register##name register_##name;
// printUsage writes the top-level deepxctl help text to stdout.
func printUsage() {
	execName := filepath.Base(os.Args[0])
	fmt.Printf("用法: %s [命令] [参数]\n\n", execName)
	fmt.Println("可用命令:")
	fmt.Println("  tensor    张量操作相关命令")
	fmt.Println("  version   显示版本信息")
	fmt.Println("  help      显示帮助信息")
	// Bug fix: fmt.Println does not interpolate format verbs, so the original
	// printed a literal "%s". Use Printf with an explicit trailing newline.
	fmt.Printf("\n使用 '%s help [命令]' 获取命令的详细信息\n", execName)
}
fmt.Printf("deepxctl 版本 %s\n", version) 44 | 45 | case "help": 46 | if len(os.Args) > 2 { 47 | helpCmd := os.Args[2] 48 | switch helpCmd { 49 | case "tensor": 50 | tensor.PrintUsage() 51 | default: 52 | fmt.Printf("未知命令: %s\n", helpCmd) 53 | printUsage() 54 | } 55 | } else { 56 | printUsage() 57 | } 58 | 59 | default: 60 | fmt.Printf("未知命令: %s\n", cmd) 61 | printUsage() 62 | os.Exit(1) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /tool/deepxctl/tensor/fp16.go: -------------------------------------------------------------------------------- 1 | package tensor 2 | 3 | import ( 4 | "encoding/binary" 5 | "math" 6 | ) 7 | 8 | func Byte2ToFloat16(value []byte) float32 { 9 | bits := binary.BigEndian.Uint16(value) 10 | // 这里需要实现float16到float32的转换 11 | // 简化实现,实际项目中需要更完整的实现 12 | sign := float32(1) 13 | if bits&0x8000 != 0 { 14 | sign = -1 15 | } 16 | exp := int((bits & 0x7C00) >> 10) 17 | frac := float32(bits&0x03FF) / 1024.0 18 | 19 | if exp == 0 { 20 | return sign * frac * float32(1.0/16384.0) // 非规格化数 21 | } else if exp == 31 { 22 | if frac == 0 { 23 | return sign * float32(math.Inf(1)) // 无穷大 24 | } 25 | return float32(math.NaN()) // NaN 26 | } 27 | return sign * float32(math.Pow(2, float64(exp-15))) * (1.0 + frac) // 规格化数 28 | } 29 | -------------------------------------------------------------------------------- /tool/deepxctl/tensor/io.go: -------------------------------------------------------------------------------- 1 | package tensor 2 | 3 | import ( 4 | "encoding/binary" 5 | "os" 6 | 7 | "gopkg.in/yaml.v2" 8 | ) 9 | 10 | func LoadShape(filePath string) (shape Shape, err error) { 11 | var shapeData []byte 12 | shapeData, err = os.ReadFile(filePath + ".shape") 13 | if err != nil { 14 | return 15 | } 16 | 17 | err = yaml.Unmarshal(shapeData, &shape) 18 | if err != nil { 19 | return 20 | } 21 | return 22 | } 23 | func LoadTensor[T Number](filePath string) (tensor Tensor[T], err error) { 24 | 25 | _, err = 
os.ReadFile(filePath + ".shape") 26 | if err != nil { 27 | return 28 | } 29 | var shape Shape 30 | shape, err = LoadShape(filePath) 31 | if err != nil { 32 | return 33 | } 34 | file, err := os.Open(filePath + ".data") 35 | if err != nil { 36 | return 37 | } 38 | defer file.Close() 39 | data := make([]T, shape.Size) 40 | 41 | err = binary.Read(file, binary.LittleEndian, data) 42 | if err != nil { 43 | return 44 | } 45 | tensor = Tensor[T]{Data: data, Shape: shape} 46 | return 47 | } 48 | -------------------------------------------------------------------------------- /tool/deepxctl/tensor/tensor.go: -------------------------------------------------------------------------------- 1 | package tensor 2 | 3 | import ( 4 | "fmt" 5 | ) 6 | 7 | type Shape struct { 8 | Shape []int `json:"shape"` 9 | Stride []int `json:"stride"` 10 | Dim int `json:"ndim"` 11 | Size int `json:"size"` 12 | Dtype string `json:"dtype"` 13 | } 14 | 15 | func NewTensorShape(shape []int) (s Shape) { 16 | s.Dim = len(shape) 17 | s.Shape = make([]int, len(shape)) 18 | copy(s.Shape, shape) 19 | s.Stride = make([]int, len(shape)) 20 | s.Stride[len(shape)-1] = 1 21 | for i := len(shape) - 2; i >= 0; i-- { 22 | s.Stride[i] = s.Stride[i+1] * shape[i+1] 23 | } 24 | s.Size = s.Stride[0] * shape[0] 25 | return s 26 | } 27 | func (s Shape) String() string { 28 | return fmt.Sprintf("%v", s.Shape) 29 | } 30 | 31 | func (s Shape) At(i int) int { 32 | return s.Shape[i] 33 | } 34 | 35 | func (s Shape) LinearAt(indices []int) int { 36 | idx := 0 37 | for i := 0; i < len(indices); i++ { 38 | idx += indices[i] * s.Stride[i] 39 | } 40 | return idx 41 | } 42 | func (s Shape) LinearTo(idx int) (indices []int) { 43 | linearIndex := idx 44 | indices = make([]int, s.Dim) 45 | for i := 0; i < s.Dim; i++ { 46 | indices[i] = linearIndex / s.Stride[i] 47 | linearIndex %= s.Stride[i] 48 | } 49 | return indices 50 | } 51 | 52 | func BitSize(Dtype string) int { 53 | switch Dtype { 54 | case "bool": 55 | return 8 56 | case "int8": 
57 | return 8 58 | case "int16": 59 | return 16 60 | case "int32": 61 | return 32 62 | case "int64": 63 | return 64 64 | case "float16": 65 | return 16 66 | case "float32": 67 | return 32 68 | case "float64": 69 | return 64 70 | default: 71 | return 0 72 | } 73 | } 74 | 75 | type Number interface { 76 | comparable 77 | float64 | float32 | int64 | int32 | int16 | int8 | bool 78 | } 79 | 80 | type Tensor[T Number] struct { 81 | Data []T 82 | Shape 83 | } 84 | 85 | // Get 获取Tensor的值 86 | func (t *Tensor[T]) Get(indices ...int) T { 87 | idx := t.Shape.LinearAt(indices) 88 | return t.Data[idx] 89 | 90 | } 91 | --------------------------------------------------------------------------------