├── .cursorrules ├── .github ├── ISSUE_TEMPLATE │ ├── compile.md │ ├── excuter.md │ ├── operator.md │ └── py_deepx.md └── workflows │ ├── auto-merge.yml │ ├── excuter-cppcommon.yml │ ├── excuter-cuda-linux.yml │ ├── excuter-ompsimd-linux.yml │ └── tool-deepxctl.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── doc ├── .gitignore ├── README.md ├── benchmark │ ├── broadcast.md │ ├── matmul.md │ └── reduce.md ├── conf.py ├── deepxIR │ ├── ir.md │ └── readme.md ├── design.md ├── excuter │ ├── deepx.op.drawio │ ├── deepx.op.drawio.svg │ ├── deepx.op.jpg │ ├── excuter.md │ ├── mix_precision.md │ ├── op-mem-cuda │ │ ├── cublas │ │ │ └── api.md │ │ ├── cublaslt │ │ │ └── api.md │ │ └── list.md │ ├── op-mem-ompsimd │ │ ├── contribute.md │ │ ├── list.md │ │ └── range.md │ └── welcome.md ├── front │ ├── aboutop.md │ ├── deepx.jpg │ ├── deepx.op.drawio.svg │ ├── deepxpy.drawio.svg │ ├── front.md │ ├── graph.md │ ├── node.md │ ├── op.md │ └── py │ │ ├── about.md │ │ ├── contribute.md │ │ └── deepx.rst ├── highway.md ├── index.rst ├── language.md └── scheduler │ └── scheduler.md ├── excuter ├── cpp-common │ ├── CMakeLists.txt │ ├── src │ │ ├── client │ │ │ ├── udpserver.cpp │ │ │ ├── udpserver.hpp │ │ │ ├── unixsocketserver.cpp │ │ │ ├── unixsocketserver.hpp │ │ │ └── worker.hpp │ │ ├── deepx │ │ │ ├── dtype.hpp │ │ │ ├── mem │ │ │ │ └── mem.hpp │ │ │ ├── shape.cpp │ │ │ ├── shape.hpp │ │ │ ├── shape_changeshape.cpp │ │ │ ├── shape_changeshape.hpp │ │ │ ├── shape_matmul.cpp │ │ │ ├── shape_matmul.hpp │ │ │ ├── shape_range.cpp │ │ │ ├── shape_reduce.cpp │ │ │ ├── shape_reduce.hpp │ │ │ ├── shape_tensorinit.cpp │ │ │ ├── shape_tensorinit.hpp │ │ │ ├── tensor.hpp │ │ │ ├── tensorbase.hpp │ │ │ ├── tensorfunc │ │ │ │ ├── authors.hpp │ │ │ │ ├── changeshape.hpp │ │ │ │ ├── elementwise.hpp │ │ │ │ ├── init.hpp │ │ │ │ ├── io.hpp │ │ │ │ ├── matmul.hpp │ │ │ │ ├── reduce.hpp │ │ │ │ └── tensorlife.hpp │ │ │ ├── tf │ │ │ │ ├── tf.cpp │ │ │ │ ├── 
tf.hpp │ │ │ │ ├── tffactory.cpp │ │ │ │ └── tffactory.hpp │ │ │ ├── vector_combination.cpp │ │ │ └── vector_combination.hpp │ │ └── stdutil │ │ │ ├── error.hpp │ │ │ ├── fs.cpp │ │ │ ├── fs.hpp │ │ │ ├── num.cpp │ │ │ ├── num.hpp │ │ │ ├── print.hpp │ │ │ ├── string.cpp │ │ │ ├── string.hpp │ │ │ ├── time.hpp │ │ │ └── vector.hpp │ └── test │ │ ├── 0_dtypes.cpp │ │ ├── 1_tf.cpp │ │ ├── 1_tfcheck.cpp │ │ ├── 2_saveload.cpp │ │ └── CMakeLists.txt ├── op-mem-cuda │ ├── .gitignore │ ├── CMakeLists.txt │ ├── Dockerfile │ ├── README.md │ ├── build.sh │ ├── doc │ │ ├── 00_quickstart.md │ │ ├── 01_layout.md │ │ ├── 02_layout_algebra.md │ │ ├── 03_tensor.md │ │ ├── 04_algorithms.md │ │ ├── 0t_mma_atom.md │ │ ├── 0x_gemm_tutorial.md │ │ ├── 0y_predication.md │ │ └── 0z_tma_tensors.md │ ├── dockerbuild.sh │ ├── log.md │ ├── src │ │ ├── client │ │ │ ├── main.cpp │ │ │ ├── tfs.cpp │ │ │ └── tfs.hpp │ │ └── deepx │ │ │ ├── dtype_cuda.hpp │ │ │ ├── mem │ │ │ └── mem_cuda.hpp │ │ │ ├── tensorfunc │ │ │ ├── changeshape_miaobyte.cu │ │ │ ├── changeshape_miaobyte.cuh │ │ │ ├── changeshape_miaobyte.hpp │ │ │ ├── cuda.hpp │ │ │ ├── cuda_atomic.cuh │ │ │ ├── cuda_math.cuh │ │ │ ├── elementwise_cublas_basic.hpp │ │ │ ├── elementwise_miaobyte_basic.cu │ │ │ ├── elementwise_miaobyte_basic.cuh │ │ │ ├── elementwise_miaobyte_basic.hpp │ │ │ ├── elementwise_miaobyte_compare.cu │ │ │ ├── elementwise_miaobyte_compare.cuh │ │ │ ├── elementwise_miaobyte_compare.hpp │ │ │ ├── elementwise_miaobyte_sin.cu │ │ │ ├── elementwise_miaobyte_sin.cuh │ │ │ ├── elementwise_miaobyte_sin.hpp │ │ │ ├── elementwise_miaobyte_sqrt.cu │ │ │ ├── elementwise_miaobyte_sqrt.cuh │ │ │ ├── elementwise_miaobyte_sqrt.hpp │ │ │ ├── init_miaobyte.cu │ │ │ ├── init_miaobyte.cuh │ │ │ ├── init_miaobyte.hpp │ │ │ ├── io_miaobyte.hpp │ │ │ ├── matmul_cublas.hpp │ │ │ ├── new_mempool.hpp │ │ │ ├── reduce_miaobyte.cu │ │ │ ├── reduce_miaobyte.cuh │ │ │ ├── reduce_miaobyte.hpp │ │ │ ├── tensor_cuda.cuh │ │ │ ├── 
tensorlife_miaobyte.hpp │ │ │ └── vector_cuda.cuh │ │ │ └── tf │ │ │ ├── arg.hpp │ │ │ ├── changeshape.hpp │ │ │ ├── elementwise_basic.hpp │ │ │ ├── elementwise_compare.hpp │ │ │ ├── elementwise_sin.hpp │ │ │ ├── elementwise_sqrt.hpp │ │ │ ├── init.hpp │ │ │ ├── io.hpp │ │ │ ├── matmul.hpp │ │ │ ├── reduce.hpp │ │ │ └── tensorlife.hpp │ └── test │ │ ├── op │ │ └── CMakeLists.txt │ │ └── tensorfunc │ │ ├── 0_new.cpp │ │ ├── 1_cublas_add.cpp │ │ ├── 1_cublas_matmul.cpp │ │ ├── 2_changeshape.cpp │ │ └── CMakeLists.txt └── op-mem-ompsimd │ ├── .cursorignore │ ├── .cursorrules │ ├── .gitignore │ ├── CMakeLists.txt │ ├── Dockerfile │ ├── dockerbuild.sh │ ├── log.md │ ├── src │ ├── client │ │ ├── main.cpp │ │ ├── tfs.cpp │ │ └── tfs.hpp │ └── deepx │ │ ├── dtype_ompsimd.hpp │ │ ├── mem │ │ └── mem_ompsimd.hpp │ │ ├── tensorfunc │ │ ├── changeshape_miaobyte.hpp │ │ ├── elementwise_cblas.hpp │ │ ├── elementwise_miaobyte.hpp │ │ ├── equal.hpp │ │ ├── highway.hpp │ │ ├── init_miaobyte.hpp │ │ ├── io_miaobyte.hpp │ │ ├── matmul_cblas.hpp │ │ ├── matmul_miaobyte.hpp │ │ ├── new_mempool.hpp │ │ ├── reduce_miaobyte.hpp │ │ └── tensorlife_miaobyte.hpp │ │ └── tf │ │ ├── arg.hpp │ │ ├── changeshape.hpp │ │ ├── elementwise.hpp │ │ ├── init.hpp │ │ ├── io.hpp │ │ ├── matmul.hpp │ │ ├── reduce.hpp │ │ └── tensorlife.hpp │ └── test │ ├── op │ ├── 1_mem.cpp │ └── CMakeLists.txt │ └── tensorfunc │ ├── 1_shape.cpp │ ├── 2_shape_combintion.cpp │ ├── 2_tensor_equal.cpp │ ├── 2_tensor_new.cpp │ ├── 2_tensor_range.cpp │ ├── 2_tensor_range.py │ ├── 3_tensor_print.cpp │ ├── 4_tensor_add.cpp │ ├── 4_tensor_matmul.cpp │ ├── 4_tensor_max.cpp │ ├── 4_tensor_mul.cpp │ ├── 4_tensor_sub.cpp │ ├── 5_tensor_sum.cpp │ ├── 6_tensor_broadcast.cpp │ ├── 7_tensor_transpose.cpp │ ├── 8_tensor_concat.cpp │ ├── CMakeLists.txt │ └── tensorutil.hpp ├── front ├── go │ ├── README.md │ ├── deepx │ │ ├── attention.go │ │ ├── graph_constarg.go │ │ ├── graph_opnode.go │ │ ├── graph_tensornode.go │ │ ├── graph_viz.go │ 
│ ├── linear.go │ │ ├── mlp.go │ │ ├── module.go │ │ ├── norm.go │ │ ├── tensor_activite.go │ │ ├── tensor_elementwise.go │ │ ├── tensor_matmul.go │ │ ├── tensor_musk.go │ │ ├── tensor_norm.go │ │ ├── tensor_normalization.go │ │ ├── tensor_reduce.go │ │ ├── tensor_shape.go │ │ ├── transformer.go │ │ ├── transformer │ │ │ ├── attention.go │ │ │ ├── config.go │ │ │ ├── model.go │ │ │ ├── qwen2.md │ │ │ ├── qwen2_causal_lm.go │ │ │ └── qwen2_model.go │ │ └── transformer_model.go │ ├── example │ │ ├── 1 │ │ │ ├── 1_app.dot │ │ │ ├── 1_app.go │ │ │ └── 1_app.svg │ │ └── 3 │ │ │ ├── 3_transformer.svg │ │ │ ├── 3_transformer_app.go │ │ │ └── transformer.dot │ └── go.mod └── py │ ├── .cursorrules │ ├── deepx │ ├── .cursorrules │ ├── .gitignore │ ├── README.md │ ├── __init__.py │ ├── nn │ │ ├── __init__.py │ │ ├── deepxir.py │ │ ├── functional │ │ │ ├── __init__.py │ │ │ ├── activite.py │ │ │ ├── authormap.py │ │ │ ├── changeshape.py │ │ │ ├── elementwise.py │ │ │ ├── leaffunc.py │ │ │ ├── leaffunc_changeshape.py │ │ │ ├── leaffunc_elementwise.py │ │ │ ├── leaffunc_init.py │ │ │ ├── leaffunc_io.py │ │ │ ├── leaffunc_life.py │ │ │ ├── leaffunc_matmul.py │ │ │ ├── leaffunc_reduce.py │ │ │ ├── normalization.py │ │ │ ├── reduce.py │ │ │ ├── rtf.py │ │ │ ├── rtf_changeshape.py │ │ │ ├── rtf_elementwise.py │ │ │ ├── rtf_init.py │ │ │ ├── rtf_io.py │ │ │ ├── rtf_life.py │ │ │ ├── rtf_matmul.py │ │ │ └── rtf_reduce.py │ │ ├── modules │ │ │ ├── __init__.py │ │ │ ├── activation.py │ │ │ ├── conv.py │ │ │ ├── dropout.py │ │ │ ├── linear.py │ │ │ ├── loss.py │ │ │ ├── module.py │ │ │ ├── normalization.py │ │ │ ├── padding.py │ │ │ ├── pooling.py │ │ │ ├── rmsnorm.py │ │ │ ├── rnn.py │ │ │ └── sparse.py │ │ └── parameter.py │ ├── optim │ │ ├── __init__.py │ │ ├── adam.py │ │ ├── optimizer.py │ │ └── sgd.py │ ├── requirements.txt │ ├── scheduler │ │ ├── __init__.py │ │ └── client │ │ │ ├── allclient.py │ │ │ ├── udpconn.py │ │ │ └── unixsocket.py │ ├── setup.py │ ├── tensor │ │ ├── 
__init__.py │ │ ├── changeshape.py │ │ ├── elementwise.py │ │ ├── init.py │ │ ├── io.py │ │ ├── matmul.py │ │ ├── reduce.py │ │ ├── shape.py │ │ └── tensor.py │ ├── transformer │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── decoder.py │ │ ├── modeling_rope_utils.py │ │ └── models │ │ │ ├── __init__.py │ │ │ └── llama │ │ │ ├── __init__.py │ │ │ ├── attention.py │ │ │ ├── embedding.py │ │ │ ├── groupedquery_attention.py │ │ │ ├── mlp.py │ │ │ ├── modeling_llama.py │ │ │ └── normalization.py │ └── utils │ │ ├── __init__.py │ │ ├── benchmark │ │ └── bench.py │ │ ├── checkpoint.py │ │ └── data │ │ ├── __init__.py │ │ ├── dataloader.py │ │ ├── dataset.py │ │ └── sampler.py │ ├── deepxutil │ ├── numpy │ │ ├── __init__.py │ │ └── io.py │ └── torch │ │ ├── __init__.py │ │ └── io.py │ ├── docs │ ├── api.rst │ ├── conf.py │ └── index.rst │ └── examples │ ├── 0_pyenv │ └── binsearch.py │ ├── 1_tensor │ ├── 1_clone.py │ ├── 1_copy.py │ ├── 1_new.py │ ├── 1_print.py │ ├── 2_newbig.py │ ├── 2_saveload.py │ └── getitem.py │ ├── 2_ir │ ├── 1_init_zeroones.py │ ├── 2_elementwise_add.py │ ├── 2_elementwise_bit.py │ ├── 2_elementwise_dropout.py │ ├── 2_elementwise_lessgreater.py │ ├── 2_elementwise_minmax.py │ ├── 2_elementwise_operator.py │ ├── 2_elementwise_sqrtlog.py │ ├── 2_elementwise_switchwhere.py │ ├── 3_matmul.py │ ├── 4_changeshape_broadcast.py │ ├── 4_changeshape_broadcast_add.py │ ├── 4_changeshape_concat.py │ ├── 4_changeshape_gather.py │ ├── 4_changeshape_repeat.py │ ├── 4_changeshape_reshape.py │ ├── 4_changeshape_transpose.py │ ├── 5_reduce_prod.py │ ├── 5_reduce_sum.py │ ├── 5_reduce_sum_keepdim.py │ ├── 6_tensorlife_to.py │ └── changeshape_repeat.py │ ├── 3_functional │ ├── activite_relu.py │ ├── activite_sigmoid.py │ ├── activite_swish.py │ ├── changeshape_broadcast.py │ ├── elementwise_dropout.py │ ├── elementwise_rsqrt.py │ ├── normalization_softmax.py │ └── reduce_mean.py │ ├── 3_module │ ├── 0_hg_tokenizer.py │ ├── 1_embedding.py │ └── 1_linear.py │ └── 
4_transformer │ └── llama │ ├── 1_llama_rmsnorm.py │ ├── llama_ │ ├── llama_rope.py │ └── llama_rope_torch.py ├── log.md ├── model ├── h5_deepx │ ├── h5_deepx │ │ ├── __init__.py │ │ ├── todeepx.py │ │ └── toh5.py │ ├── requirements.txt │ └── setup.py ├── onnx_deepx │ ├── README.md │ ├── onnx_deepx │ │ ├── __init__.py │ │ └── todeepx.py │ ├── requirements.txt │ └── setup.py └── safetensor_deepx │ ├── README.md │ ├── examples │ └── load_model.py │ ├── requirements.txt │ ├── safetensor_deepx.egg-info │ ├── PKG-INFO │ ├── SOURCES.txt │ ├── dependency_links.txt │ ├── requires.txt │ └── top_level.txt │ ├── safetensor_deepx │ ├── __init__.py │ └── loader.py │ └── setup.py ├── scheduler ├── README.md ├── autograd │ ├── __init__.py │ ├── function.py │ └── graph │ │ ├── _controlflownode.py │ │ ├── _datanode.py │ │ ├── _opnode.py │ │ ├── graph.py │ │ ├── graph_viz.py │ │ ├── node.py │ │ └── nodetype.py └── common │ ├── pass_register.cpp │ └── pass_register.hpp ├── todo ├── infer.py └── qwen2_infer.py └── tool └── deepxctl ├── .gitignore ├── cmd └── tensor │ ├── print.go │ └── tensor.go ├── go.mod ├── go.sum ├── main.go └── tensor ├── fp16.go ├── io.go ├── print.go └── tensor.go /.cursorrules: -------------------------------------------------------------------------------- 1 | Always respond in 中文 2 | 不要回答重复的内容(如我提问中的代码) 3 | 4 | 此项目名为deepx 5 | 项目路径为/home/lipeng/code/ai/deepx 6 | 项目分为3部分 7 | 1. 前端。python库的接口风格参考pytorch,其他语言如go,java,c,rust等,后续设计完善。 8 | 2. 调度器,待设计 9 | 3. 
执行器,使用c++,cuda,metal,omp simd等,实现不同excuter的算子的前向和反向 10 | 11 | 关于概念 12 | deepx.Tensor仅仅就是一个tensor,不像pytorch的tensor,一个tensor其实包含了自身和梯度2个tensor的数据 13 | 14 | 关于任何编程语言 15 | 注重设计函数时,通过多级的子函数,实现层级模块化分解 16 | 17 | 关于c++ 18 | 我的环境为ubuntu22,项目是c++17,使用cmake编译, 19 | 返回c++代码区分header和source文件 20 | 由于作者是c++新手,请仔细检查指针和引用,对deepx这种密集计算任务,不要使用智能指针,但注意内存泄漏,函数返回对象等 21 | 22 | 关于python 23 | 贴近pytorch的接口风格,不要增加任何注释,我会手动添加注释 24 | 25 | 关于doc目录 26 | 采用Sphinx构建,使用reStructuredText格式 -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/compile.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: 算子等价替换 3 | about: 以等价替换的方式,优化执行效率,新增其他能力支持 4 | title: '[deepx(compile)] ' 5 | labels: compile, 6 | assignees: '' 7 | --- 8 | 9 | ## 你的思路 10 | 11 | ## 影响组件 12 | 13 | ## 其他叙述 14 | 15 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/excuter.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: 执行引擎 3 | about: 按照给定计算图,负责存储、计算、网络传输的执行 4 | title: '[excuter] ' 5 | labels: excuter, 6 | assignees: '' 7 | --- 8 | 9 | ## 支持的硬件、操作系统 10 | 11 | ## 你的思路 12 | 13 | ## 其他叙述 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/operator.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: 算子新增、修改、删除 3 | about: 用于提交新的算子实现请求 4 | title: '[算子] ' 5 | labels: enhancement, operator 6 | assignees: '' 7 | --- 8 | 9 | ## 算子新增 10 | 该算子数学表达为 11 | 12 | ## 影响组件 13 | 14 | ### front 15 | 1. 16 | 2. 17 | 18 | ### 引擎 19 | 1. 20 | 2. 
21 | 22 | ## 其他叙述 23 | 24 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/py_deepx.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: deepx的python主库 3 | about: 优化deepx(python)的前端api 4 | title: '[deepx(python)] ' 5 | labels: python, 6 | assignees: '' 7 | --- 8 | 9 | ## 你的思路 10 | 11 | ## 影响组件 12 | 13 | ## 其他叙述 14 | 15 | -------------------------------------------------------------------------------- /.github/workflows/auto-merge.yml: -------------------------------------------------------------------------------- 1 | name: 自动合并PR 2 | 3 | on: 4 | workflow_run: 5 | workflows: ["Excuter/ompsimd-linux Build", "Excuter/cuda-linux Build"] # 列出您所有需要等待完成的CI工作流 6 | types: 7 | - completed 8 | branches: 9 | - main # 仅在针对main分支的PR上运行 10 | 11 | permissions: 12 | contents: write 13 | pull-requests: write 14 | 15 | jobs: 16 | auto-merge: 17 | runs-on: ubuntu-latest 18 | if: ${{ github.event.workflow_run.conclusion == 'success' }} 19 | steps: 20 | - name: 自动合并PR 21 | uses: pascalgn/automerge-action@v0.15.6 22 | env: 23 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 24 | MERGE_LABELS: "auto-merge,!work-in-progress,!do-not-merge" 25 | MERGE_METHOD: "squash" 26 | MERGE_COMMIT_MESSAGE: "自动合并: PR #{pullRequest.number} {pullRequest.title}" 27 | MERGE_FORKS: "true" 28 | MERGE_RETRIES: "6" 29 | MERGE_RETRY_SLEEP: "10000" 30 | UPDATE_LABELS: "auto-merge" 31 | UPDATE_METHOD: "rebase" -------------------------------------------------------------------------------- /.github/workflows/excuter-cppcommon.yml: -------------------------------------------------------------------------------- 1 | name: Excuter/cppcommon Build 2 | on: [push, pull_request] 3 | 4 | env: 5 | HIGHWAY_VERSION: 1.2.0 6 | 7 | jobs: 8 | build: 9 | strategy: 10 | matrix: 11 | os: [ubuntu-22.04] # 只保留 Ubuntu 12 | backend: [ompsimd] 13 | runs-on: ${{ matrix.os }} 14 | 15 | steps: 16 | - uses: actions/checkout@v4 17 | with: 
18 | fetch-depth: 0 19 | 20 | # 系统依赖安装 21 | - name: Install Dependencies (Ubuntu) 22 | if: matrix.os == 'ubuntu-22.04' 23 | env: 24 | DEBIAN_FRONTEND: noninteractive 25 | run: | 26 | sudo apt-get update 27 | sudo apt-get install -y \ 28 | build-essential \ 29 | cmake \ 30 | libopenblas-dev \ 31 | libyaml-cpp-dev \ 32 | libjemalloc-dev \ 33 | libgtest-dev \ 34 | clang \ 35 | git 36 | 37 | # 设置 ccache 38 | - name: Setup ccache 39 | uses: hendrikmuhs/ccache-action@v1.2 40 | 41 | # 构建缓存 42 | - name: Cache Build 43 | uses: actions/cache@v3 44 | with: 45 | path: | 46 | excuter/cpp-common/build 47 | ~/.ccache 48 | key: ${{ runner.os }}-build-${{ hashFiles('**/CMakeLists.txt') }} 49 | restore-keys: | 50 | ${{ runner.os }}-build- 51 | 52 | # 构建 cpp-common 库 53 | - name: Build Common Library 54 | run: | 55 | cd excuter/cpp-common 56 | mkdir -p build && cd build 57 | cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER_LAUNCHER=ccache .. 58 | cmake --build . --config Release -j$(nproc) 59 | -------------------------------------------------------------------------------- /.github/workflows/tool-deepxctl.yml: -------------------------------------------------------------------------------- 1 | name: Tool/deepxctl Build 2 | on: [push, pull_request] 3 | 4 | jobs: 5 | build: 6 | strategy: 7 | matrix: 8 | os: [ubuntu-22.04] # 只保留 Ubuntu 9 | go-version: [1.23.2] 10 | runs-on: ${{ matrix.os }} 11 | 12 | steps: 13 | - uses: actions/checkout@v4 14 | with: 15 | fetch-depth: 0 16 | 17 | # 系统依赖安装 18 | - name: 安装Go 19 | uses: actions/setup-go@v4 20 | with: 21 | go-version: ${{ matrix.go-version }} 22 | cache: true 23 | 24 | # 系统依赖安装 25 | - name: 安装依赖 (Ubuntu) 26 | env: 27 | DEBIAN_FRONTEND: noninteractive 28 | run: | 29 | sudo apt-get update 30 | sudo apt-get install -y git 31 | 32 | # 构建deepxctl工具 33 | - name: 构建deepxctl 34 | run: | 35 | cd tool/deepxctl 36 | go build -v -o deepxctl 37 | 38 | # 运行测试 39 | - name: 运行测试 40 | run: | 41 | cd tool/deepxctl 42 | ./deepxctl 43 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | **/build/ 3 | .idea 4 | **/.idea 5 | **/__pycache__/ 6 | **/dist/ 7 | **/egg.info/ 8 | front/py/deepx/deepx.egg-info/* 9 | *.pdf -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # DeepX 行为准则 2 | 3 | ## 我们的承诺 4 | 5 | 作为贡献者和维护者,我们承诺为每个人提供一个开放和欢迎的环境。 6 | 7 | ## 我们的标准 8 | 9 | 有助于创造积极环境的行为包括但不限于: 10 | 11 | - 使用友好和包容的语言 12 | - 尊重不同的观点和经验 13 | - 耐心地接受建设性的批评 14 | - 关注对社区最有利的事情 15 | - 友善对待其他社区成员 16 | 17 | 不可接受的行为包括但不限于: 18 | 19 | - 使用性化的语言或图像以及不受欢迎的性关注或挑逗 20 | - 捣乱/煽动/侮辱性/贬损的评论,人身攻击或政治攻击 21 | - 公开或私下的骚扰 22 | - 未经明确许可,发布他人的私人信息,如物理或电子地址 23 | - 其他可以合理地被认为不符合专业行为的行为 24 | 25 | ## 我们的责任 26 | 27 | 项目维护者有责任澄清可接受行为的标准,并应对任何不可接受的行为采取适当和公平的纠正措施。 28 | 29 | 项目维护者有权利和责任删除、编辑或拒绝与本行为准则不符的评论、提交、代码、wiki编辑、问题和其他贡献,并可暂时或永久禁止任何他们认为不适合、威胁、冒犯或有害的贡献者。 30 | 31 | ## 适用范围 32 | 33 | 当个人代表项目或其社区时,本行为准则适用于项目空间和公共空间。 34 | 35 | ## 执行 36 | 37 | 如有滥用、骚扰或其他不可接受的行为,请通过以下方式联系项目团队。所有投诉都将被审查和调查,并将导致认为必要和适当的回应。 38 | 39 | ## 联系信息 40 | 41 | 请通过 [您的联系信息] 联系我们。 42 | 43 | ## 归属 44 | 45 | 本行为准则改编自[贡献者公约](https://www.contributor-covenant.org),版本1.4。 -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # deepx 贡献指南 2 | 3 | deepx框架的发展,主要包括五大类方向 4 | 5 | + front: 新增模型、module、python类函数等 6 | + 中间层:包括计算图优化器,插件系统(自动KVcache系统),自动分布式化,栈tensor自动释放,自动Inplace化等操作 7 | + 新增或修改excuter 8 | + 增加或修改算子,进一步可以分为leaftensorfunc(不可分割的基础算子),fusedtensorfunc(融合算子) 9 | + 文档丰富: 10 | + 运维自动化方向 11 | 12 | 大家可以选择一个方向 13 | 14 | ## 步骤 15 | 16 | 第一次提交 17 | 1. Fork本仓库(github.com/array2d/deepx)的main分支,到你的github/yourname/deepx 18 | 2. 本地clone github/yourname/deepx 19 | 3. 
提交并推送您的更改到你的github:`git commit -m 'Add some feature'` 20 | 4. 创建一个Pull Request。 21 | 22 | 第N次提交 23 | 24 | 1. 保障你的本地和github/yourname/deepx中均已提pull request并得到merge 25 | 2. 在github/yourname/deepx中sync fork【危险操作,会删除你新增的代码】,拉取(github.com/array2d/deepx) main分支的最新代码 26 | 3. 本地clone github/yourname/deepx 27 | 4. 提交并推送您的更改到你的github:`git commit -m 'Add some feature'` 28 | 5. 创建一个Pull Request。 -------------------------------------------------------------------------------- /doc/.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .venv 3 | _build 4 | make.bat 5 | Makefile 6 | _static 7 | _templates -------------------------------------------------------------------------------- /doc/benchmark/broadcast.md: -------------------------------------------------------------------------------- 1 | 2 | [i] broadcastto [4i,2i,i] 3 | 4 | + 不开启omp 5 | 6 | | Size | Time (seconds) | mem | 7 | |------|------------------|--------| 8 | | 64 | 0.304582 | 8 MB | 9 | | 128 | 2.06795 | 64 MB | 10 | | 256 | 16.4505 | 512 MB | 11 | | 512 | 131.381 | 4 GB | 12 | 13 | + 开启omp 14 | 15 | 以下是整理后的表格,展示了不同大小的时间消耗: 16 | 17 | | Size | Time (seconds) | mem | 18 | |------|------------------|--------| 19 | | 64 | 0.062084 | 8 MB | 20 | | 128 | 0.132792 | 64 MB | 21 | | 256 | 1.21183 | 512 MB | 22 | | 512 | 8.89442 | 4 GB | 23 | 24 | -------------------------------------------------------------------------------- /doc/benchmark/matmul.md: -------------------------------------------------------------------------------- 1 | ## 矩阵乘法 2 | 3 | 平台 ubuntu 22.04 4 | cpu Intel(R) Core(TM) i9-14900K 5 | 内存 64GB 6 | 7 | | 矩阵大小 | 耗时(秒) | 内存占用 | 8 | |--------------|-----------|------------| 9 | | 64x64 | 0.000073 | 16 KB | 10 | | 128x128 | 0.007146 | 64 KB | 11 | | 256x256 | 0.002196 | 256 KB | 12 | | 512x512 | 0.007013 | 1 MB | 13 | | 1024x1024 | 0.027820 | 4 MB | 14 | | 2048x2048 | 0.058486 | 16 MB | 15 | | 4096x4096 | 0.249994 | 64 MB | 16 | | 8192x8192 | 1.973990 
| 256 MB | 17 | | 16384x16384 | 14.712000 | 1 GB | 18 | | 32768x32768 | 111.222000| 4 GB | 19 | 20 | 再大会段错误,待优化 21 | -------------------------------------------------------------------------------- /doc/benchmark/reduce.md: -------------------------------------------------------------------------------- 1 | ## reduce操作性能比较 2 | 3 | ### sum 4 | -------------------------------------------------------------------------------- /doc/deepxIR/readme.md: -------------------------------------------------------------------------------- 1 | ## 测试工具 2 | 3 | ### udp 4 | 5 | nc命令可以发送udp包 6 | 7 | ```bash 8 | nc -u 127.0.0.1 8080 9 | 10 | //然后,输入内容,回车,即可发送 11 | ``` 12 | 13 | ### unixsocket 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /doc/design.md: -------------------------------------------------------------------------------- 1 | # deepx默认原则 2 | 3 | ## 一.DeepxIR 4 | 5 | ### 1.deepIR结构 6 | ``` 7 | deepIR{ 8 | Meta{ 9 | int id 10 | string author 11 | } meta 12 | string name 13 | []Param args 14 | []Param returns 15 | } 16 | ``` 17 | 18 | excuter执行deepxIR的规则 19 | 20 | + excuter执行deepxIR时,不得修改args中的tensor 21 | + 但deepIR不限制args和returns中的Param同名,这样可以实现类似inplace的操作 22 | 23 | 24 | ## front/python规则 25 | 26 | ### 1.命名规则 27 | + inplace操作的函数,其名为_后缀, 返回值为空 28 | + 非inplace操作的函数,其名无_后缀 29 | -------------------------------------------------------------------------------- /doc/excuter/deepx.op.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/doc/excuter/deepx.op.jpg -------------------------------------------------------------------------------- /doc/excuter/excuter.md: -------------------------------------------------------------------------------- 1 | ## 如何给excuter添加一个新算子 2 | 3 | ### 层次结构图 4 | 5 | ![层次结构图](./deepx.op.drawio.svg) 6 | 7 | 8 | #### TensorFunction 9 | 10 | 
顾名思义,TensorFunction是操作Tensor的函数,可以是c++函数,也可以是python函数,cuda函数等。 11 | 12 | #### TensorFunction 特定精度特化,或混合精度实现 13 | 14 | 15 | #### Op 16 | 17 | Op是excuter的算子,是excuter的执行单元 18 | 19 | 在程序中,Op是基类,不同的Op有不同的实现,比如Add, Mul, MatMul等。 20 | 每个Op都需要override forward和backward函数 21 | 22 | 对同一个功能的Op如Matmul,可以有多种作者的实现 23 | 24 | Matmul会选择选择一个默认的实现 25 | 26 | 或者由MatmulOp的name属性来指定具体author的实现 27 | 28 | 29 | ### 具体步骤 30 | 31 | git clone https://github.com/deepx-org/deepx.git 32 | 33 | #### 1.cpu执行器 34 | cd deepx/excuter/op-mem-ompsimd 35 | 36 | 需要提前安装好依赖 37 | + highway需要源码安装 38 | + omp,openmp库 39 | + yaml-cpp 40 | make build && cd build && cmake .. && make 41 | 42 | 你可以在test目录下,验证或添加测试用例 43 | 44 | 45 | #### 2.cuda执行器 46 | cd deepx/excuter/op-mem-cuda 47 | 48 | 需要提前安装好依赖 49 | + cuda 50 | + cublas 51 | + yaml-cpp 52 | 53 | make build && cd build && cmake .. && make 54 | 55 | 56 | #### 3.jax执行器 57 | 58 | todo 59 | 60 | 61 | #### 4.front对接测试 62 | 63 | 1.先启动excuter可执行文件, 位于excuter/op-mem-{cuda/ompsimd}/build,可执行文件名同excuter名 64 | 2.然后测试front中py的对应算子脚本(front/py/examples 目录) 65 | 66 | 可以按照顺序,以此测试 67 | 68 | 1_tensor 69 | 70 | 2_ir 71 | 72 | 3_functional 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /doc/excuter/mix_precision.md: -------------------------------------------------------------------------------- 1 | # mix precision 2 | 3 | ## 1. 什么是 mix precision 4 | 5 | mix precision 是一种混合精度训练方法,它使用 16 位浮点数和 8 位整数来训练模型,从而在保持模型精度的同时,减少显存占用和计算时间。 6 | 7 | ## 2. 为什么需要 mix precision 8 | 9 | 在深度学习中,模型通常使用 32 位浮点数进行训练,这样可以确保模型的精度。但是,32 位浮点数占用的显存较大,计算时间较长。因此,为了减少显存占用和计算时间,可以使用 mix precision 训练方法。 10 | 11 | ## 3. 
关于excuter的mix precision的实现 12 | 13 | 如: 14 | 15 | matmul(A[float16],B[float16])->C[float32] //author=miaobyte id=1 create_time=1714512000 send_time=1714512000 16 | 17 | 我们在opfactory中,把实际参数用占位符替换,注册为 18 | 19 | matmul[authora] Tensor@float16 Tensor@float16 -> Tensor@float32 20 | 21 | 如: 22 | 23 | matmul[authora] A@float16 b@float16 -> C@float32 24 | 25 | 同样,在opfactory中,把实际参数用占位符替换,注册为 26 | 27 | muladd[authora] Tensor@float16 Scalar@float32-> Tensor@float16 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /doc/excuter/op-mem-cuda/cublaslt/api.md: -------------------------------------------------------------------------------- 1 | + cublasLtMatmul() 2 | 支持部分低精度 3 | 4 | + cublasLtMatmulEx() 5 | 6 | + cublasLtMatmulBatched() 7 | 8 | -------------------------------------------------------------------------------- /doc/excuter/op-mem-ompsimd/contribute.md: -------------------------------------------------------------------------------- 1 | ## excuter 2 | 3 | ### op-mem-ompsimd 4 | 5 | ompsimd是DeepX框架的cpu执行器进程 6 | 7 | + 采用OMP+SIMD加速tensor计算 8 | + double和float采用openblas加速 9 | + 使用了jemalloc内存池管理内存。 10 | 11 | 12 | #### 1. 安装依赖 13 | 14 | 安装apt依赖 15 | 16 | ``` 17 | sudo apt-get update 18 | sudo apt-get install -y \ 19 | build-essential \ 20 | cmake \ 21 | libopenblas-dev \ 22 | libyaml-cpp-dev \ 23 | libjemalloc-dev \ 24 | libgtest-dev \ 25 | clang \ 26 | git 27 | ``` 28 | 29 | 源码依赖安装 30 | 31 | ``` 32 | sudo apt-get install -y libgtest-dev 33 | 34 | # 克隆 Highway 35 | git clone --depth 1 --branch ${HIGHWAY_VERSION} https://github.com/google/highway.git 36 | cd highway 37 | mkdir -p build && cd build 38 | 39 | # 使用标准的 CMake 构建流程 40 | cmake .. 
\ 41 | -DCMAKE_BUILD_TYPE=Release \ 42 | -DBUILD_SHARED_LIBS=ON \ 43 | -DHWY_SYSTEM_GTEST=ON \ 44 | -DHWY_ENABLE_TESTS=OFF 45 | 46 | # 构建和安装 47 | make -j$(nproc) 48 | sudo make install 49 | sudo ldconfig # 更新动态链接库缓存 50 | 51 | # 确保头文件正确安装 52 | sudo cp -r ../hwy /usr/local/include/ 53 | ``` 54 | 55 | #### 2. 开发环境 56 | 57 | c++ 17 58 | 59 | -------------------------------------------------------------------------------- /doc/excuter/op-mem-ompsimd/range.md: -------------------------------------------------------------------------------- 1 | ## excuter 2 | 3 | ### op-mem-ompsimd 4 | 5 | #### cpu的range算子辅助函数 6 | 7 | range函数是shape类中的一个函数,用于根据shape对tensor进行omp线程并行遍历的方式 8 | 9 | 定义和实现分别在: 10 | 11 | excuter/common/src/deepx/shape.hpp 12 | 13 | excuter/common/src/deepx/shape_range.cpp 14 | 15 | | func | omp并行 | omp线程local局部对象 | 调用场景 | 16 | | ---- | ---- | ------ | ---------- | 17 | | | N | | print | 18 | | 函数 | 否 | 0 | 不需要并行 | 19 | | 函数 | 是 | 0 | 需要并行 | 20 | | 函数 | 否 | 0 | 不需要并行 | 21 | -------------------------------------------------------------------------------- /doc/excuter/welcome.md: -------------------------------------------------------------------------------- 1 | **DeepX高性能算子开发英雄帖** 2 | ——**挑战算力极限,定义下一代AI基础设施** 3 | 4 | --- 5 | 6 | ### **🔥 我们是谁?** 7 | **DeepX**——致力于打造**原生分布式并行**的深度学习训练推理一体化框架,以**极致性能**和**全场景覆盖**为目标! 8 | - **性能追求者**:深耕算子优化,目标达到业界一流水平 9 | - **异构计算先锋**:CUDA/Metal/沐熙/昇腾... 打造全平台支持 10 | - **开源共建者**:开放、透明的开发模式,欢迎全球开发者参与 11 | 12 | --- 13 | 14 | ### **⚡ 招募令:算力世界的角斗士** 15 | **如果你**: 16 | - 手握CUDA/Metal优化绝技,却苦于没有**工业级战场** 17 | - 精通硬件指令集,渴望打造**教科书级算子实现** 18 | - 梦想代码在**千万级GPU集群**上奔腾 19 | 20 | **加入我们,你将**: 21 | ✅ 挑战**纳米级指令优化**,与硬件共舞 22 | ✅ 设计**分布式算子原语**,定义行业标准 23 | ✅ 打造**训练-推理一体化**的终极架构 24 | 25 | --- 26 | 27 | ### **🏆 巅峰对决:算子性能挑战赛** 28 | **期待以下领域高手**: 29 | 30 | #### **1. CUDA核弹专家(NVIDIA全系)** 31 | - 战场:Ampere/Hopper架构深度调优 32 | - 必杀技: 33 | - Tensor Core极限压榨 34 | - Warp级同步黑魔法 35 | - 显存带宽利用率≥95% 36 | 37 | #### **2. 
Metal刀锋战士(Apple Silicon)** 38 | - 战场:M1/M2/M3系列芯片 39 | - 必杀技: 40 | - Metal Performance Shaders魔改 41 | - 苹果神经引擎(NE)指令直通 42 | - Unified Memory架构颠覆性优化 43 | 44 | #### **3. 异架构开荒者(沐熙/寒武纪/昇腾等)** 45 | - 战场:国产算力芯片深水区 46 | - 必杀技: 47 | - 自定义指令集破解 48 | - 存算一体架构适配 49 | - 自主IP核驱动开发 50 | 51 | --- 52 | 53 | ### **💎 你将获得** 54 | - **技术突破**:参与前沿AI基础设施开发 55 | - **开源贡献**:代码将服务广大开发者社区 56 | - **收益共享**:优秀算子作者将获得项目未来商业化的分成机会 57 | - **荣誉认可**:优秀贡献者将被记入项目贡献者名单 58 | 59 | --- 60 | 61 | ### **⚔️ 申请方式** 62 | **加入我们的两个步骤**: 63 | 64 | #### **步骤一:提交初始方案** 65 | 1. 访问 github.com/array2d.com 的 todo 目录 66 | 2. 提交 PR,详细描述: 67 | - 你的优化想法 68 | - 具体实现方案 69 | - 预期性能提升 70 | 3. PR通过后,我们会邀请你加入技术讨论群 71 | 72 | #### **步骤二:提交算子实现** 73 | 1. 提交算子代码 PR 74 | 2. 进行全面性能测试和验证 75 | 3. 通过验证后,你的算子将被合并进主分支 76 | 77 | --- 78 | 79 | **让我们一起,用代码构建AI基础设施的未来!** 🔥🚀 80 | 81 | > "在DeepX,我们追求的是每一个算子的极致优化" 82 | > —— DeepX开源项目发起人 李鹏 83 | 84 | --- 85 | **即刻加入,成为这个激动人心项目的重要一员!** 🚀 86 | -------------------------------------------------------------------------------- /doc/front/deepx.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/doc/front/deepx.jpg -------------------------------------------------------------------------------- /doc/front/front.md: -------------------------------------------------------------------------------- 1 | 2 | # deepx前端 3 | 4 | ## 对应关系 5 | 6 | | 前端 | pytorch | tensorflow | deepx| 7 | | --- | --- | --- | --- | 8 | | tensor库 | ATen | TensorFlow | deepx/tensorfunc | 9 | | 算子(支持forward和backward) | torch.nn.functional | ? 
| deepx/op | 10 | | 计算图子图| torch.nn.Module | tensorflow.nn.Module | deepx.nn.Module | 11 | | 抽象计算图 | torch.fx.graph.Graph | ?| deepx.nn.Graph | 12 | | 执行计算图| torch._inductor.graph.GraphLowering | tensorflow.Graph | deepx.nn.Graph | -------------------------------------------------------------------------------- /doc/front/graph.md: -------------------------------------------------------------------------------- 1 | # 计算图 2 | 3 | 4 | ## 抽象计算图 5 | 6 | 抽象计算图是计算图的抽象表示,它描述了计算的整体逻辑结构。 7 | 8 | ## 执行计算图 9 | 10 | 执行计算图是计算图的实际执行过程,它描述了计算的详细具体执行过程。 11 | 12 | 13 | 自动tensor并行 14 | 15 | + 根据tensor的shape和dtype,对tensor进行split,分解为n个小tensor 16 | + 对每个小tensor,调度到不同的存算执行器上进行计算 17 | + 根据tensor的shape和dtype,对tensor进行concat 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /doc/front/node.md: -------------------------------------------------------------------------------- 1 | # Node,计算图的设计思考 2 | 3 | ## 概念 4 | 5 | pytorch的计算图是动态的,tensorflow早期的计算图是静态的。 6 | 7 | pytorch在前向传播时,会构建一个计算图,在反向传播时,会根据计算图进行反向传播。 8 | 9 | 10 | ## Graph结构 11 | 12 | Node{ 13 | froms []*Node 14 | tos []*Node 15 | } 16 | 17 | Graph结构可以支持Residual的跳跃Node连接 18 | 19 | ## Tree结构 20 | 21 | Tree{ 22 | parent *Node 23 | children []*Node 24 | } 25 | 26 | Tree结构需要特别的实现Residual的跳跃Node连接 27 | 28 | Residual可以把跳跃连接的Node打平,都作为Residual的childs 29 | 30 | ## Deepx的设计实现 31 | 优先考虑Tree这种静态图结构,如果需要支持Residual的跳跃Node连接,可以在forward和backward中特别的实现。 -------------------------------------------------------------------------------- /doc/front/op.md: -------------------------------------------------------------------------------- 1 | # 基础算子 2 | 算术运算 3 | 一元运算:Abs, Acos, Acosh, Asin, Asinh, Atan, Atanh, Ceil, Cos, Cosh, Erf, Exp, Floor, Log, Neg, Reciprocal, Sign, Sin, Sinh, Sqrt, Tan, Tanh 4 | 二元运算:Add, Div, Mul, Pow, Sub 5 | 比较运算:Equal, Greater, GreaterOrEqual, Less, LessOrEqual, Not 6 | 逻辑运算:And, Or, Xor, BitwiseAnd, BitwiseNot, BitwiseOr, BitwiseXor, BitShift 7 | 激活函数:Elu, Gelu, HardSigmoid, 
HardSwish, Hardmax, LeakyRelu, Mish, PRelu, Relu, Selu, Sigmoid, Softmax, Softplus, Softsign, ThresholdedRelu 8 | 数据变换 9 | 形状变换:Cast, CastLike, Flatten, Reshape, Squeeze, Transpose, Unsqueeze 10 | 元素选择与索引:ArgMax, ArgMin, Gather, GatherElements, GatherND, Scatter, ScatterElements, ScatterND, Slice, TopK 11 | 数据生成:Constant, ConstantOfShape, EyeLike, Range, RandomNormal, RandomNormalLike, RandomUniform, RandomUniformLike 12 | 池化操作 13 | 普通池化:AveragePool, GlobalAveragePool, GlobalLpPool, GlobalMaxPool, LpPool, MaxPool, Mean, Min 14 | 特殊池化:MaxRoiPool, MaxUnpool, SpaceToDepth, DepthToSpace 15 | 归一化操作:BatchNormalization, GroupNormalization, InstanceNormalization, LayerNormalization, LpNormalization, MeanVarianceNormalization 16 | 统计运算:CumSum, ReduceL1, ReduceL2, ReduceLogSum, ReduceLogSumExp, ReduceMax, ReduceMean, ReduceMin, ReduceProd, ReduceSum, ReduceSumSquare 17 | 张量操作:Concat, ConcatFromSequence, Split, SplitToSequence, Expand, Pad, Resize, ReverseSequence, Shrink, Tile, Where 18 | 类型判断:IsInf, IsNaN 19 | 其他:Identity, OneHot, SequenceAt, SequenceConstruct, SequenceEmpty, SequenceErase, SequenceInsert, SequenceLength, SequenceMap, Shape, Size, StringConcat, StringNormalizer, StringSplit 20 | 融合算子 21 | 神经网络层:Conv, ConvInteger, ConvTranspose, DeformConv, GRU, LSTM, RNN, QLinearConv, QLinearMatMul 22 | 损失函数:NegativeLogLikelihoodLoss, SoftmaxCrossEntropyLoss 23 | 其他融合操作:AffineGrid, CenterCropPad, Col2Im, Compress, DFT, ImageDecoder, Loop, NonMaxSuppression, Optional, OptionalGetElement, OptionalHasElement, RegexFullMatch, Scan, TfIdfVectorizer, Upsample 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /doc/front/py/about.md: -------------------------------------------------------------------------------- 1 | ### deepx/front/py 2 | 3 | deepx-py库是DeepX框架的Python库,方便用户搭建深度学习模型,输出计算图,主要用于深度学习模型的开发和训练。 4 | 5 | #### 设计理念 6 | 7 | + deepx并不像pytorch那样,追求python first,而是为了原生分布式和并行,约束python的灵活性。 8 | + 
deepx的使用风格,基本贴近pytorch。尽量能做到 import deepx as torch,依然能正确的run起来 9 | + deepx的py进程,不参与tensor计算,但会参与一些简单的shape计算 10 | 11 | #### 待定 12 | 13 | -------------------------------------------------------------------------------- /doc/front/py/contribute.md: -------------------------------------------------------------------------------- 1 | ## front 2 | 3 | ### py 4 | 5 | deepx-py库是DeepX框架的Python库,方便用户搭建深度学习模型,输出计算图,主要用于深度学习模型的开发和训练。 6 | 7 | #### 1. 安装依赖 8 | 9 | deepx-py库依赖: 10 | 11 | ``` 12 | pip install graphviz 13 | ``` 14 | 15 | #### 2. 开发环境 16 | 17 | deepx-py库的开发环境是: 18 | 19 | python 3.8+ 20 | 21 | -------------------------------------------------------------------------------- /doc/front/py/deepx.rst: -------------------------------------------------------------------------------- 1 | DeepX Python 前端 2 | =============== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | :caption: Python API 7 | 8 | about 9 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. DeepX documentation master file, created by 2 | sphinx-quickstart on Tue Mar 4 12:21:01 2025. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | DeepX 原生分布式并行的深度学习训练推理一体框架 7 | ======================================= 8 | 9 | .. toctree:: 10 | :maxdepth: 4 11 | :caption: tutorials 教程 12 | 13 | benchmark/matmul 14 | benchmark/broadcast 15 | 16 | .. toctree:: 17 | :maxdepth: 6 18 | :caption: doc 文档 19 | 20 | front/py/deepx/about 21 | excuter/op-mem-ompsimd/list 22 | deepxIR/ir 23 | 24 | .. 
toctree:: 25 | :maxdepth: 6 26 | :caption: contrib 贡献指南 27 | 28 | front/py/contribute 29 | scheduler/scheduler 30 | excuter/excuter 31 | excuter/op-mem-ompsimd/contribute 32 | excuter/op-mem-ompsimd/range 33 | 34 | 索引和搜索 35 | ========== 36 | 37 | * :ref:`genindex` 38 | * :ref:`search` 39 | 40 | Add your content using ``reStructuredText`` syntax. See the 41 | `reStructuredText `_ 42 | documentation for details. 43 | 44 | 45 | .. toctree:: 46 | :maxdepth: 2 47 | :caption: Contents: 48 | -------------------------------------------------------------------------------- /doc/language.md: -------------------------------------------------------------------------------- 1 | ## c++:计算执行器(excuter) 2 | 3 | 负责实现tensor的具体计算过程,对接硬件如GPU、CPU的simd指令 4 | 5 | 除了c++,也就只有编译器能干这样的脏活累活了 6 | 7 | deepx用到了以下库,都是c++是实现 8 | 9 | cblas 10 | openmp 11 | c++可以和汇编结合,从而最大程度发挥cpu、gpu寄存器的性能 12 | 13 | cuda是c++的语言子集,也可以看作是c++ 14 | 15 | 16 | ## python:模型前端构建 17 | python提供了类似pytorch的库,便于调试和验证模型算法 18 | 19 | deepx/tensor/ 20 | deepx/nn/deepxIR 21 | deepx/nn.module/ 22 | deepx/nn.functional 23 | 通过这些库,我们可以快速的搭建一个模型结构 24 | 25 | ## golang:运维、监控、分布式,深度学习训推自动化的维护者 26 | 27 | 与pytorch、tensorflow不同,deepx追求分布式过程自动化,因此python侧不参与分布式 28 | 29 | deepxctl:提供对deepx体系的所有工具、库、模型、镜像的统一纳管 30 | 31 | 32 | 33 | ## deepxIR 34 | 虽然deepxIR不是独立的编程语言,但是deepx体系的程序格式标准 35 | 36 | excuter所执行的内容,就是deepxir的序列或deepxir计算图 37 | 38 | https://github.com/array2d/deepx/blob/main/doc/excuter/op-mem-cuda/list.md 39 | 40 | deepxir分为3类 41 | 42 | 计算:tensor这些系列elementwise、changeshape、tensorlife、io、reduce、init 43 | 指令结构: 44 | queue[deepxIR],串行指令,有前后执行顺序 45 | parallel[deepxIR],可并行的指令,无顺序依赖,可并行 46 | 以上指令为静态图所需的指令,运行过程是确定的。 47 | 48 | 分支:goto、ifelse 49 | 分支指令会让计算图行为不可预测,也就是动态部分 50 | 51 | 控制:parse、run等特殊自定义指令 52 | 控制指令是deepx分布式系统内置的各个组件控制指令 -------------------------------------------------------------------------------- /doc/scheduler/scheduler.md: -------------------------------------------------------------------------------- 1 | ### scheduler 2 | 3 | 
DeepX框架的scheduler,是front和excuter之间的桥梁。 4 | 5 | front只负责搭建抽象计算图,excuter负责执行算子,而scheduler负责将抽象计算图转换为执行计算图,并发送给excuter。 6 | 7 | #### 算子注册器 8 | 9 | 算子注册器,接收excuter的算子及精度列表。 10 | 11 | 12 | #### 调度器 13 | 14 | scheduler将实现以下能力: 15 | 16 | + 根据计算图的依赖关系,确定算子的执行顺序。 17 | + 算子融合。抽象计算图都是由最基础的算子组成,而执行计算图可以由多个基础算子融合而成。 18 | + 算子消除。根据数学链式法则,有些算子可以相互抵消,如log和exp,mul和div,add和sub。 19 | + TP:tensor 并行,tensor自动拆分计算 20 | + PP:pipeline 并行,包括 dual-mode:前向和后向 21 | + MP:model 并行,模型自动拆分计算 22 | + DP:data 并行,多路batch并行训练 23 | 24 | 25 | -------------------------------------------------------------------------------- /excuter/cpp-common/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.15...3.29) 2 | project(deepx-common LANGUAGES CXX ) 3 | 4 | # 设置 C++ 标准 5 | set(CMAKE_CXX_STANDARD 17) 6 | set(CMAKE_CXX_STANDARD_REQUIRED True) 7 | # 设置编译优化 8 | 9 | set(CMAKE_BUILD_TYPE Debug) 10 | # 设置 SIMD 编译选项 11 | 12 | # 包含头文件目录 13 | include_directories(src) 14 | 15 | # 源文件 16 | 17 | file(GLOB_RECURSE DEEPX_COMMON_SOURCES "src/*.cpp") 18 | 19 | add_library(deepx_common SHARED 20 | ${DEEPX_COMMON_SOURCES} 21 | ) 22 | 23 | 24 | find_package(yaml-cpp REQUIRED) 25 | 26 | target_link_libraries(deepx_common 27 | PUBLIC 28 | yaml-cpp 29 | ) 30 | 31 | target_include_directories(deepx_common PUBLIC 32 | $ 33 | $ 34 | ) 35 | 36 | 37 | add_subdirectory(test) 38 | -------------------------------------------------------------------------------- /excuter/cpp-common/src/client/udpserver.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __CLIENT_UDPSERVER_HPP__ 2 | #define __CLIENT_UDPSERVER_HPP__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "deepx/tf/tf.hpp" 13 | namespace client{ 14 | using namespace std; 15 | class udpserver 16 | { 17 | private: 18 | int port; 19 | int sockfd; 20 | struct sockaddr_in servaddr,cliaddr; 21 | 
char buffer[1024]; 22 | socklen_t len; 23 | ssize_t n; 24 | public: 25 | udpserver(int port); 26 | ~udpserver(); 27 | void start(queue &tasks); 28 | using handlefunc = std::function; 29 | handlefunc func; 30 | void resp(string str); 31 | }; 32 | } 33 | 34 | #endif -------------------------------------------------------------------------------- /excuter/cpp-common/src/client/unixsocketserver.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __CLIENT_UNIXSOCKETSERVER_HPP__ 2 | #define __CLIENT_UNIXSOCKETSERVER_HPP__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | namespace client 12 | { 13 | class unixsocketserver 14 | { 15 | private: 16 | std::string socket_path; 17 | int sockfd; 18 | struct sockaddr_un servaddr, cliaddr; // 修改为使用完整类型 19 | char* buffer; // 改为指针类型 20 | const int buffer_size; // 新增缓冲区大小成员 21 | socklen_t len; 22 | ssize_t n; 23 | 24 | public: 25 | unixsocketserver(const std::string &path, const int buffersize); 26 | ~unixsocketserver(); 27 | void start(); 28 | using handlefunc = std::function; 29 | handlefunc func; 30 | }; 31 | } 32 | 33 | #endif -------------------------------------------------------------------------------- /excuter/cpp-common/src/client/worker.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __WORKER_HPP__ 2 | #define __WORKER_HPP__ 3 | 4 | 5 | namespace client{ 6 | 7 | } 8 | #endif 9 | -------------------------------------------------------------------------------- /excuter/cpp-common/src/deepx/shape_matmul.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "deepx/shape_matmul.hpp" 4 | 5 | namespace deepx 6 | { 7 | Shape matmul_shape(const Shape &A, const Shape &B) 8 | { 9 | if (A.dim() < 2 || B.dim() < 2) 10 | { 11 | throw std::invalid_argument("A and B must >= 2D tensors"); 12 | } 13 | if (A[-1] != B[-2]) 14 | { 15 | throw 
std::invalid_argument("A[-1] must be equal to B[-2]"); 16 | } 17 | std::vector resultshape(A.dim()); 18 | std::copy(A.shape.begin(), A.shape.begin() + A.dim(), resultshape.begin()); 19 | Shape result(resultshape); 20 | result[-1] = B[-1]; 21 | return result; 22 | } 23 | } -------------------------------------------------------------------------------- /excuter/cpp-common/src/deepx/shape_matmul.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DEEPX_SHAPE_MATMUL_HPP 2 | #define DEEPX_SHAPE_MATMUL_HPP 3 | 4 | #include "deepx/shape.hpp" 5 | 6 | namespace deepx 7 | { 8 | Shape matmul_shape(const Shape &A, const Shape &B); 9 | } 10 | 11 | #endif // DEEPX_SHAPE_MATMUL_HPP -------------------------------------------------------------------------------- /excuter/cpp-common/src/deepx/shape_reduce.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DEEPX_SHAPE_SUM_HPP 2 | #define DEEPX_SHAPE_SUM_HPP 3 | 4 | #include "deepx/shape.hpp" 5 | 6 | namespace deepx 7 | { 8 | 9 | // 检查dims参数是否合法,返回整理后的dims 10 | std::vector checkedDims(const std::vector &inputshape, const std::vector &dims); 11 | 12 | // 返回求和后的形状 13 | std::vector reducedShape(const std::vector &inputshape, const std::vector &dims, const bool keepdim = false); 14 | 15 | // 返回需要求和的维度 16 | std::vector reducedDim(const std::vector &inputshape, const std::vector &dims ); 17 | } 18 | 19 | #endif // DEEPX_SHAPE_SUM_HPP -------------------------------------------------------------------------------- /excuter/cpp-common/src/deepx/shape_tensorinit.cpp: -------------------------------------------------------------------------------- 1 | #include "deepx/shape_tensorinit.hpp" 2 | 3 | namespace deepx 4 | { 5 | std::pair calculateFanInAndFanOut(const Shape &shape) 6 | { 7 | int fanIn, fanOut; 8 | if (shape.dim() < 2) 9 | { 10 | fanIn = 1; 11 | fanOut = 1; 12 | return std::make_pair(fanIn, fanOut); 13 | } 14 | 15 | int numInputFmaps = 
shape[1]; // 输入特征图数量 16 | int numOutputFmaps = shape[0]; // 输出特征图数量 17 | int receptiveFieldSize = 1; 18 | if (shape.dim() > 2) 19 | { 20 | for (int i = 2; i < shape.dim(); ++i) 21 | { 22 | receptiveFieldSize *= shape[i]; // 计算感受野大小 23 | } 24 | } 25 | 26 | fanIn = numInputFmaps * receptiveFieldSize; 27 | fanOut = numOutputFmaps * receptiveFieldSize; 28 | return std::make_pair(fanIn, fanOut); 29 | } 30 | 31 | } -------------------------------------------------------------------------------- /excuter/cpp-common/src/deepx/shape_tensorinit.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DEEPX_SHAPE_TENSORINIT_HPP 2 | #define DEEPX_SHAPE_TENSORINIT_HPP 3 | 4 | #include "deepx/shape.hpp" 5 | 6 | namespace deepx 7 | { 8 | std::pair calculateFanInAndFanOut(const Shape &shape); 9 | } 10 | 11 | #endif // DEEPX_SHAPE_TENSORINIT_HPP -------------------------------------------------------------------------------- /excuter/cpp-common/src/deepx/tensorbase.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DEEPX_TENSORBASE_HPP 2 | #define DEEPX_TENSORBASE_HPP 3 | 4 | #include "deepx/shape.hpp" 5 | 6 | namespace deepx 7 | { 8 | 9 | struct TensorBase 10 | { 11 | Shape shape; 12 | TensorBase() = default; 13 | // 拷贝构造函数 14 | TensorBase(const TensorBase &other) 15 | { 16 | shape = other.shape; 17 | } 18 | 19 | // 移动构造函数 20 | TensorBase(TensorBase &&other) noexcept 21 | { 22 | shape = std::move(other.shape); 23 | } 24 | 25 | // 拷贝赋值运算符 26 | TensorBase &operator=(const TensorBase &other) 27 | { 28 | if (this != &other) 29 | { 30 | shape = other.shape; 31 | } 32 | return *this; 33 | } 34 | 35 | // 移动赋值运算符 36 | TensorBase &operator=(TensorBase &&other) noexcept 37 | { 38 | if (this != &other) 39 | { 40 | shape = std::move(other.shape); 41 | } 42 | return *this; 43 | } 44 | }; 45 | 46 | } 47 | #endif 48 | -------------------------------------------------------------------------------- 
/excuter/cpp-common/src/deepx/tensorfunc/authors.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DEEPX_TENSORFUNC_AUTHORS_HPP 2 | #define DEEPX_TENSORFUNC_AUTHORS_HPP 3 | 4 | #include "string" 5 | 6 | namespace deepx::tensorfunc{ 7 | using namespace std; 8 | class default_{ 9 | public: 10 | static std::string name() { return "default"; } 11 | }; 12 | 13 | class miaobyte{ 14 | public: 15 | static std::string name() { return "miaobyte"; } 16 | }; 17 | 18 | class cblas{ 19 | public: 20 | static std::string name() { return "cblas"; } 21 | }; 22 | 23 | class cublas{ 24 | public: 25 | static std::string name() { return "cublas"; } 26 | }; 27 | } 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /excuter/cpp-common/src/deepx/tensorfunc/io.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DEEPX_TENSORFUNC_IO_HPP 2 | #define DEEPX_TENSORFUNC_IO_HPP 3 | 4 | #include "deepx/tensor.hpp" 5 | #include "stdutil/fs.hpp" 6 | 7 | namespace deepx::tensorfunc{ 8 | 9 | template 10 | struct printDispatcher{ 11 | static void print(const Tensor &t, const std::string &f="")=delete; 12 | }; 13 | 14 | template 15 | void print(const Tensor &t, const std::string &f=""){ 16 | printDispatcher::print(t, f); 17 | } 18 | 19 | 20 | 21 | 22 | } 23 | 24 | #endif // DEEPX_TENSORFUNC_IO_HPP 25 | -------------------------------------------------------------------------------- /excuter/cpp-common/src/deepx/tensorfunc/matmul.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DEEPX_TENSORFUNC_MATMUL_HPP 2 | #define DEEPX_TENSORFUNC_MATMUL_HPP 3 | 4 | #include "deepx/tensor.hpp" 5 | #include "deepx/tensorfunc/authors.hpp" 6 | #include "stdutil/error.hpp" 7 | namespace deepx::tensorfunc 8 | { 9 | bool check_matmul_shape(const Shape &a, const Shape &b) 10 | { 11 | if (a[-1] != b[-2]) 12 | { 13 | return false; 14 | } 
15 | if (a.dim() != b.dim()) 16 | { 17 | return false; 18 | } 19 | for (int i = 0; i < a.dim() - 2; ++i) 20 | { 21 | if (a[i] != b[i]) 22 | { 23 | return false; 24 | } 25 | } 26 | return true; 27 | } 28 | 29 | template 30 | struct matmulDispatcher 31 | { 32 | static void matmul(const Tensor &A, const Tensor &B, Tensor &C) 33 | { 34 | throw NotImplementError("matmul"); 35 | } 36 | }; 37 | 38 | template 39 | void matmul(const Tensor &A, const Tensor &B, Tensor &C) 40 | { 41 | matmulDispatcher::matmul(A, B, C); 42 | } 43 | } 44 | 45 | #endif 46 | -------------------------------------------------------------------------------- /excuter/cpp-common/src/deepx/tensorfunc/tensorlife.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DEEPX_TENSORFUNC_TENSORLIFE_HPP 2 | #define DEEPX_TENSORFUNC_TENSORLIFE_HPP 3 | 4 | #include "deepx/tensor.hpp" 5 | 6 | namespace deepx::tensorfunc 7 | { 8 | //New 9 | template < typename T> 10 | Tensor New(const std::vector &shape); 11 | 12 | template 13 | Tensor New(const std::initializer_list &shape){ 14 | std::vector shape_vec(shape); 15 | return New(shape_vec); 16 | } 17 | 18 | //copy 19 | template 20 | void copy(const Tensor &src,Tensor &dst); 21 | 22 | //rename 23 | //通过tf直接实现 24 | } 25 | #endif -------------------------------------------------------------------------------- /excuter/cpp-common/src/deepx/vector_combination.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "deepx/vector_combination.hpp" 4 | 5 | namespace deepx 6 | { 7 | using namespace std; 8 | vector> combination(int n, int k) 9 | { 10 | if (k > n || k < 0) 11 | { 12 | return {}; 13 | } 14 | if (k == 0) 15 | { 16 | return {{}}; 17 | } 18 | 19 | vector> result; 20 | vector path; 21 | 22 | // 递归函数 23 | function backtrack = [&](int start) 24 | { 25 | if (path.size() == k) 26 | { 27 | result.push_back(path); 28 | return; 29 | } 30 | for (int i = start; i < n; i++) 
namespace deepx
{
    using namespace std;
    // Returns the identity permutation [0, 1, ..., n-1].
    // A non-positive n yields an empty vector.
    vector<int> arrange(int n)
    {
        vector<int> indices;
        if (n > 0)
        {
            indices.reserve(static_cast<size_t>(n));
            for (int value = 0; value < n; ++value)
            {
                indices.push_back(value);
            }
        }
        return indices;
    }
}
// Returns true when str parses (via std::stoi) to a value >= 0.
// NOTE(review): despite the name, "0" yields true, and inputs with a
// numeric prefix such as "12abc" are accepted because std::stoi stops
// at the first non-digit character — confirm callers rely on this
// leniency before tightening it.
bool is_positive_integer(const std::string& str) {
    try {
        return std::stoi(str) >= 0;
    } catch (...) {
        // std::invalid_argument (no digits) or std::out_of_range (overflow).
        return false;
    }
}
namespace stdutil
{
    // Escapes backslashes, quotes and control characters (\n, \t, \r,
    // \b, \f) into their two-character backslash sequences so the text
    // can be embedded in a single-line markdown/log field.
    std::string escape_markdown(const std::string &str)
    {
        std::string out;
        out.reserve(str.size());
        for (char ch : str)
        {
            const char *replacement = nullptr;
            switch (ch)
            {
            case '\\': replacement = "\\\\"; break;
            case '\"': replacement = "\\\""; break;
            case '\'': replacement = "\\\'"; break;
            case '\n': replacement = "\\n";  break;
            case '\t': replacement = "\\t";  break;
            case '\r': replacement = "\\r";  break;
            case '\b': replacement = "\\b";  break;
            case '\f': replacement = "\\f";  break;
            }
            if (replacement)
            {
                out += replacement;
            }
            else
            {
                out += ch; // ordinary character: copy through unchanged
            }
        }
        return out;
    }
} // namespace stdutil
STDUTIL_STRING_HPP 3 | 4 | #include 5 | 6 | namespace stdutil 7 | { 8 | using std::string; 9 | 10 | void trimspace(string &str); 11 | void trim(string &str,const string &chars=" \t\n\r\f\v"); 12 | 13 | string escape_markdown(const string &str); 14 | } 15 | 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /excuter/cpp-common/src/stdutil/time.hpp: -------------------------------------------------------------------------------- 1 | #ifndef STDUTIL_TIME_HPP 2 | #define STDUTIL_TIME_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | namespace stdutil{ 11 | using namespace std::chrono; 12 | static std::string format_time(const system_clock::time_point &tp) 13 | { 14 | auto ms = duration_cast(tp.time_since_epoch()); 15 | auto sec = duration_cast(ms); 16 | ms -= sec; 17 | 18 | std::time_t t = sec.count(); 19 | std::tm tm; 20 | localtime_r(&t, &tm); // 线程安全版本 21 | 22 | std::ostringstream oss; 23 | oss << std::put_time(&tm, "%Y-%m-%d %H:%M:%S") 24 | << '.' 
namespace stdutil{
using namespace std::chrono;
// Formats a time_point as local time "YYYY-MM-DD HH:MM:SS.uuuuuu",
// where the fractional field is microseconds zero-padded to 6 digits.
//
// BUG FIX: the previous version cast to milliseconds (0-999) but printed
// the count into a setw(6) zero-padded field, so e.g. 123 ms rendered as
// ".000123" and read as 123 microseconds. We now format true microseconds.
static std::string format_time(const system_clock::time_point &tp)
{
    auto us = duration_cast<microseconds>(tp.time_since_epoch());
    auto sec = duration_cast<seconds>(us);
    us -= sec; // remaining sub-second part, in microseconds

    std::time_t t = sec.count();
    std::tm tm;
    localtime_r(&t, &tm); // thread-safe variant of localtime

    std::ostringstream oss;
    oss << std::put_time(&tm, "%Y-%m-%d %H:%M:%S")
        << '.' << std::setfill('0') << std::setw(6) << us.count();
    return oss.str();
}
}
c)"); 9 | cout<(float32 c)"); 12 | 13 | cout<<"checkdtype:"< 3 | using namespace stdutil; 4 | void test_save(int total_size){ 5 | stdutil::byte *data = new stdutil::byte[total_size]; 6 | for(int i=0;i 5 | #include 6 | 7 | #include "deepx/dtype.hpp" 8 | 9 | namespace deepx 10 | { 11 | using namespace std; 12 | // 获取类型对应的Precision 13 | template 14 | constexpr Precision precision() 15 | { 16 | if constexpr (std::is_same_v) 17 | return Precision::Float64; 18 | else if constexpr (std::is_same_v) 19 | return Precision::Float32; 20 | else if constexpr (std::is_same_v) return Precision::Float16; 21 | else if constexpr (std::is_same_v) return Precision::BFloat16; 22 | else if constexpr (std::is_same_v) 23 | return Precision::Int64; 24 | else if constexpr (std::is_same_v) 25 | return Precision::Int32; 26 | else if constexpr (std::is_same_v) 27 | return Precision::Int16; 28 | else if constexpr (std::is_same_v) 29 | return Precision::Int8; 30 | else if constexpr (std::is_same_v) 31 | return Precision::Bool; 32 | else if constexpr (std::is_same_v) 33 | return Precision::String; 34 | else 35 | return Precision::Any; 36 | } 37 | } 38 | 39 | #endif // DEEPX_DTYPE_CUDA_HPP 40 | -------------------------------------------------------------------------------- /excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.cuh: -------------------------------------------------------------------------------- 1 | #ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_SIN_CUH 2 | #define DEEPX_TENSORFUNC_ELEMENTWISE_MIAOBYTE_SIN_CUH 3 | 4 | #include 5 | #include 6 | 7 | 8 | #include "deepx/tensorfunc/elementwise.hpp" 9 | #include "deepx/tensorfunc/cuda.hpp" 10 | #include "deepx/tensorfunc/authors.hpp" 11 | 12 | namespace deepx::tensorfunc 13 | { 14 | // sin 15 | template 16 | __global__ void sin_kernel(const T* A, T* C, const int size); 17 | 18 | template 19 | void launch_sin(const T* a, T* c, const int size); 20 | 21 | 22 | template 23 | __global__ void cos_kernel(const T* A, T* C, const int 
size); 24 | 25 | template 26 | void launch_cos( const T* a, T* c, const int size); 27 | 28 | // tan 29 | template 30 | __global__ void tan_kernel(const T* A, T* C, const int size); 31 | 32 | template 33 | void launch_tan( const T* a, T* c, const int size); 34 | } 35 | 36 | #endif -------------------------------------------------------------------------------- /excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sin.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SIN_HPP 2 | #define DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SIN_HPP 3 | 4 | #include "deepx/tensorfunc/elementwise.hpp" 5 | #include "deepx/tensorfunc/cuda.hpp" 6 | #include "deepx/tensorfunc/authors.hpp" 7 | #include "deepx/tensorfunc/elementwise_miaobyte_sin.cuh" 8 | 9 | #include "stdutil/error.hpp" 10 | 11 | namespace deepx::tensorfunc 12 | { 13 | 14 | template 15 | struct sinDispatcher 16 | { 17 | static void sin(const Tensor &A, Tensor &C) 18 | { 19 | if (A.shape.size != C.shape.size) { 20 | throw TensorShapeError("sin"); 21 | } 22 | launch_sin(A.data, C.data, A.shape.size); 23 | } 24 | }; 25 | 26 | template 27 | struct cosDispatcher 28 | { 29 | static void cos(const Tensor &A, Tensor &C) 30 | { 31 | if (A.shape.size != C.shape.size) { 32 | throw TensorShapeError("cos"); 33 | } 34 | launch_cos(A.data, C.data, A.shape.size); 35 | } 36 | }; 37 | 38 | template 39 | struct tanDispatcher 40 | { 41 | static void tan(const Tensor &A, Tensor &C) 42 | { 43 | if (A.shape.size != C.shape.size) { 44 | throw TensorShapeError("tan"); 45 | } 46 | launch_tan(A.data, C.data, A.shape.size); 47 | } 48 | }; 49 | 50 | 51 | } 52 | 53 | #endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_BASIC_HPP 54 | -------------------------------------------------------------------------------- /excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_sqrt.cuh: 
-------------------------------------------------------------------------------- 1 | #ifndef DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_CUH 2 | #define DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_CUH 3 | 4 | #include "deepx/tensorfunc/cuda.hpp" 5 | #include "deepx/tensorfunc/authors.hpp" 6 | 7 | namespace deepx::tensorfunc 8 | { 9 | // sqrt 10 | template 11 | __global__ void sqrt_kernel(const T* A, T* C,const int size); 12 | 13 | template 14 | void launch_sqrt(const T* a, T* c,const int size); 15 | 16 | 17 | // pow 18 | template 19 | __global__ void pow_kernel(const T* A, const T* B, T* C,const int size); 20 | 21 | template 22 | void launch_pow(const T* a, const T* b, T* c,const int size); 23 | 24 | 25 | // powscalar 26 | template 27 | __global__ void powscalar_kernel(const T* A, const T scalar, T* C,const int size); 28 | 29 | template 30 | void launch_powscalar(const T* a, const T scalar, T* c,const int size); 31 | 32 | // rpowscalar 33 | template 34 | __global__ void rpowscalar_kernel(const T scalar, const T* A, T* C, const int size); 35 | 36 | template 37 | void launch_rpowscalar(const T scalar, const T* a, T* c, const int size); 38 | 39 | // log 40 | template 41 | __global__ void log_kernel(const T* A, T* C,const int size); 42 | 43 | template 44 | void launch_log(const T* a, T* c,const int size); 45 | 46 | // exp 47 | template 48 | __global__ void exp_kernel(const T* A, T* C,const int size); 49 | 50 | template 51 | void launch_exp(const T* a, T* c,const int size); 52 | 53 | 54 | 55 | } 56 | 57 | #endif // DEEPX_TENSORFUNC_ELEMENTWISE_MIAO_BYTE_SQRT_CUH 58 | -------------------------------------------------------------------------------- /excuter/op-mem-cuda/src/deepx/tensorfunc/init_miaobyte.cuh: -------------------------------------------------------------------------------- 1 | #ifndef DEEPX_TENSORFUNC_INIT_MIAO_BYTE_CUH 2 | #define DEEPX_TENSORFUNC_INIT_MIAO_BYTE_CUH 3 | 4 | #include 5 | #include 6 | #include 7 | #include "deepx/tensor.hpp" 8 | #include 
"deepx/tensorfunc/authors.hpp" 9 | #include "deepx/tensorfunc/init.hpp" 10 | 11 | namespace deepx::tensorfunc 12 | { 13 | //填充 14 | template 15 | __global__ void kernel_constant(T *data, const T value, const int size); 16 | 17 | template 18 | void launch_constant(T *a, const T value, const int size); 19 | 20 | //dropout 21 | template 22 | __global__ void dropout_kernel(T* A, const float p,const unsigned int seed,const int size); 23 | 24 | template 25 | void launch_dropout(T* a, const float p,const unsigned int seed,const int size); 26 | 27 | //初始化 28 | //arange 29 | template 30 | __global__ void kernel_arange(T *data, const float start, const float step, const int size); 31 | 32 | template 33 | void launch_arange(T *a, const T start, const T step, const int size); 34 | 35 | //uniform 36 | template 37 | __global__ void kernel_uniform(T *data, const float low, const float high, const unsigned int seed, const int size); 38 | 39 | template 40 | void launch_uniform(T *a, const T low, const T high, const unsigned int seed, const int size); 41 | 42 | //normal 43 | template 44 | __global__ void kernel_normal(T *data, const float mean, const float stddev, const unsigned int seed, const int size); 45 | 46 | template 47 | void launch_normal(T *a, const T mean, const T stddev, const unsigned int seed, const int size); 48 | 49 | } 50 | 51 | #endif -------------------------------------------------------------------------------- /excuter/op-mem-cuda/src/deepx/tensorfunc/init_miaobyte.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DEEPX_TENSORFUNC_INIT_MIAO_BYTE_HPP 2 | #define DEEPX_TENSORFUNC_INIT_MIAO_BYTE_HPP 3 | 4 | #include 5 | 6 | #include "deepx/tensorfunc/authors.hpp" 7 | #include "deepx/tensorfunc/init.hpp" 8 | #include "deepx/tensor.hpp" 9 | #include "deepx/tensorfunc/init_miaobyte.cuh" 10 | namespace deepx::tensorfunc 11 | { 12 | // constant 13 | template 14 | struct constantDispatcher 15 | { 16 | static void 
constant(Tensor &tensor, const T value) 17 | { 18 | launch_constant(tensor.data, value, tensor.shape.size); 19 | } 20 | }; 21 | 22 | 23 | 24 | template 25 | struct dropoutDispatcher 26 | { 27 | static void dropout(Tensor &A, const float p,const unsigned int seed) 28 | { 29 | launch_dropout(A.data, p, seed, A.shape.size); 30 | } 31 | }; 32 | 33 | // arange 34 | template 35 | struct arangeDispatcher 36 | { 37 | static void arange(Tensor &tensor, const T start, const T step) 38 | { 39 | launch_arange(tensor.data, start, step, tensor.shape.size); 40 | } 41 | }; 42 | 43 | // uniform 44 | template 45 | struct uniformDispatcher 46 | { 47 | static void uniform(Tensor &tensor, const T low, const T high, const unsigned int seed) 48 | { 49 | launch_uniform(tensor.data, low, high, seed, tensor.shape.size); 50 | } 51 | }; 52 | 53 | // normal 54 | template 55 | struct normalDispatcher 56 | { 57 | static void normal(Tensor &tensor, const T mean, const T stddev, const unsigned int seed) 58 | { 59 | launch_normal(tensor.data, mean, stddev, seed, tensor.shape.size); 60 | } 61 | }; 62 | } 63 | 64 | #endif 65 | -------------------------------------------------------------------------------- /excuter/op-mem-cuda/src/deepx/tensorfunc/new_mempool.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DEEPX_TENSORFUNC_NEW_MEMPOOL_HPP 2 | #define DEEPX_TENSORFUNC_NEW_MEMPOOL_HPP 3 | 4 | namespace deepx::tensorfunc 5 | { 6 | class MemoryPool 7 | { 8 | public: 9 | static void* Malloc(size_t size) { 10 | 11 | } 12 | 13 | static void Free(void* ptr) { 14 | 15 | } 16 | 17 | // Realloc: 重新分配内存并保留原数据,主要用于tensor形状改变时的内存重分配 18 | // 如果新的size小于原size,数据会被截断 19 | // 如果新的size大于原size,新分配的内存部分不会初始化 20 | // 如果ptr为nullptr,等同于Malloc 21 | // 如果size为0,等同于Free 22 | // 返回新分配的内存指针,如果分配失败返回nullptr 23 | 24 | static void* Realloc(void* ptr, size_t size) { 25 | 26 | } 27 | 28 | 29 | // GetAllocatedSize: 获取已分配内存的实际大小 30 | // 由于内存对齐,实际分配的内存可能大于请求的size 31 | // 主要用于内存使用统计和调试 32 
| // 如果ptr为nullptr,返回0 33 | // 重新分配内存,保留原数据 34 | static size_t GetAllocatedSize(void* ptr) { 35 | 36 | } 37 | }; 38 | 39 | } // namespace deepx::tensorfunc 40 | #endif // DEEPX_TENSORFUNC_NEW_MEMPOOL_HPP 41 | -------------------------------------------------------------------------------- /excuter/op-mem-cuda/src/deepx/tensorfunc/tensor_cuda.cuh: -------------------------------------------------------------------------------- 1 | #ifndef DEEPX_TENSORFUNC_TENSOR_CUDA_CUH 2 | #define DEEPX_TENSORFUNC_TENSOR_CUDA_CUH 3 | 4 | #include 5 | #include "deepx/tensor.hpp" 6 | 7 | namespace deepx::tensorfunc 8 | { 9 | inline __host__ __device__ void linearTo(const int *strides, const int dim, int *indices, const int id) 10 | { 11 | int linearIndex = id; 12 | for (int i = 0; i < dim; i++) 13 | { 14 | indices[i] = linearIndex / strides[i]; 15 | linearIndex %= strides[i]; 16 | } 17 | } 18 | 19 | inline __host__ __device__ int linearAt(const int *strides, const int dim,const int *indices) 20 | { 21 | int idx = 0; 22 | for (int i = 0; i < dim; i++) 23 | { 24 | idx += indices[i] * strides[i]; 25 | } 26 | return idx; 27 | } 28 | 29 | template 30 | __device__ __host__ void reorder(const T *order, const int *dimOrder, int dim, T *neworder) 31 | { 32 | for (int i = 0; i < dim; i++) 33 | { 34 | neworder[i] = order[dimOrder[i]]; 35 | } 36 | } 37 | 38 | const int MAX_DIM = 12; 39 | } 40 | 41 | #endif // DEEPX_TENSORFUNC_TENSOR_CUDA_CUH 42 | -------------------------------------------------------------------------------- /excuter/op-mem-cuda/test/op/CMakeLists.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/excuter/op-mem-cuda/test/op/CMakeLists.txt -------------------------------------------------------------------------------- /excuter/op-mem-cuda/test/tensorfunc/0_new.cpp: -------------------------------------------------------------------------------- 1 | 
#include "deepx/tensorfunc/init.hpp" 2 | #include "deepx/tensor.hpp" 3 | #include "deepx/tensorfunc/tensorlife_miaobyte.hpp" 4 | #include "deepx/tensorfunc/io_miaobyte.hpp" 5 | #include "deepx/tensorfunc/init_miaobyte.hpp" 6 | #include "deepx/tensorfunc/authors.hpp" 7 | 8 | using namespace deepx::tensorfunc; 9 | using namespace deepx; 10 | void test_new() 11 | { 12 | Tensor a=New({10, 10}); 13 | arange(a, 1.0f, 0.1f); 14 | print(a,"%.2f"); 15 | } 16 | 17 | int main() 18 | { 19 | test_new(); 20 | } -------------------------------------------------------------------------------- /excuter/op-mem-cuda/test/tensorfunc/1_cublas_add.cpp: -------------------------------------------------------------------------------- 1 | #include "deepx/tensorfunc/init_miaobyte.hpp" 2 | #include "deepx/tensor.hpp" 3 | #include "deepx/tensorfunc/tensorlife_miaobyte.hpp" 4 | #include "deepx/tensorfunc/io_miaobyte.hpp" 5 | #include "deepx/tensorfunc/elementwise.hpp" 6 | #include "deepx/tensorfunc/elementwise_cublas_basic.hpp" 7 | using namespace deepx::tensorfunc; 8 | using namespace deepx; 9 | void test_add() 10 | { 11 | Tensor a=New({10, 10}); 12 | arange(a, 1.0f, 0.1f); 13 | Tensor b=New({10, 10}); 14 | arange(b, 2.0f, 0.2f); 15 | Tensor c=New({10, 10}); 16 | constant(c, 0.0f); 17 | 18 | add(a, b, c); 19 | print(c,"%.2f"); 20 | } 21 | 22 | int main() 23 | { 24 | test_add(); 25 | } -------------------------------------------------------------------------------- /excuter/op-mem-cuda/test/tensorfunc/2_changeshape.cpp: -------------------------------------------------------------------------------- 1 | #include "deepx/tensorfunc/init_miaobyte.hpp" 2 | #include "deepx/tensor.hpp" 3 | #include "deepx/tensorfunc/tensorlife_miaobyte.hpp" 4 | #include "deepx/tensorfunc/io_miaobyte.hpp" 5 | #include "deepx/tensorfunc/changeshape_miaobyte.hpp" 6 | using namespace deepx::tensorfunc; 7 | using namespace deepx; 8 | void test_transpose() 9 | { 10 | Tensor a=New({3,4,6}); 11 | arange(a, 1.0f, 1.0f); 12 | 
print(a,"%.0f"); 13 | Tensor b=New({3,6,4}); 14 | transpose(a, {0,2,1}, b); 15 | print(b,"%.0f"); 16 | } 17 | 18 | void test_concat() 19 | { 20 | Tensor a=New({3,2,6}); 21 | arange(a, 1.0f, 1.0f); 22 | print(a,"%.0f"); 23 | Tensor b=New({3,4,6}); 24 | constant(b, 2.0f); 25 | print(b,"%.0f"); 26 | Tensor c=New({3,6,6}); 27 | constant(c, 3.0f); 28 | print(c,"%.0f"); 29 | Tensor d=New({3,12,6}); 30 | concat({&a,&b,&c},1,d); 31 | print(d,"%.0f"); 32 | } 33 | 34 | void test_broadcastTo() 35 | { 36 | Tensor a=New({3,2}); 37 | arange(a, 1.0f, 1.0f); 38 | Tensor b=New({4,3,2}); 39 | broadcastTo(a, b.shape.shape, b); 40 | print(b,"%.0f"); 41 | } 42 | int main(int argc, char **argv) 43 | { 44 | int casearg=atoi(argv[1]); 45 | switch (casearg) 46 | { 47 | case 0: 48 | test_transpose(); 49 | break; 50 | case 1: 51 | test_concat(); 52 | break; 53 | case 2: 54 | test_broadcastTo(); 55 | break; 56 | } 57 | return 0; 58 | } -------------------------------------------------------------------------------- /excuter/op-mem-cuda/test/tensorfunc/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(0_new 0_new.cpp) 2 | target_link_libraries(0_new deepx CUDA::cudart) 3 | 4 | add_executable(1_cublas_add 1_cublas_add.cpp) 5 | target_link_libraries(1_cublas_add deepx CUDA::cudart) 6 | 7 | add_executable(1_cublas_matmul 1_cublas_matmul.cpp) 8 | target_link_libraries(1_cublas_matmul deepx CUDA::cudart) 9 | 10 | add_executable(2_changeshape 2_changeshape.cpp) 11 | target_link_libraries(2_changeshape deepx CUDA::cudart) -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/.cursorignore: -------------------------------------------------------------------------------- 1 | # Add directories or file patterns to ignore during indexing (e.g. 
foo/ or *.csv) 2 | -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/.cursorrules: -------------------------------------------------------------------------------- 1 | excuter只实现基础的op,不要实现可以由基础op组合的op,如relu可以由max组合。 -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/.gitignore: -------------------------------------------------------------------------------- 1 | thirdlib -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.15...3.29) 2 | project(deepx-excuter-ompsimd LANGUAGES CXX ) 3 | 4 | # 设置 C++ 标准 5 | set(CMAKE_CXX_STANDARD 17) 6 | set(CMAKE_CXX_STANDARD_REQUIRED True) 7 | # 设置编译优化 8 | # set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address") 9 | set(CMAKE_BUILD_TYPE Debug) 10 | # 设置 SIMD 编译选项 11 | 12 | # 包含头文件目录 13 | include_directories(src) 14 | 15 | add_subdirectory(../cpp-common common) 16 | 17 | # 源文件 18 | 19 | file(GLOB_RECURSE DEEPX_SOURCES "src/*.cpp") 20 | file(GLOB_RECURSE CLIENT_SOURCES "src/client/*.cpp") 21 | 22 | # cpu 线性代数库 23 | list(APPEND CMAKE_PREFIX_PATH "/usr/lib/x86_64-linux-gnu/openblas-pthread/cmake") 24 | find_package(OpenBLAS REQUIRED) 25 | 26 | # include(FetchContent) 27 | 28 | # # Fetch OpenBLAS from GitHub 29 | # FetchContent_Declare( 30 | # OpenBLAS 31 | # GIT_REPOSITORY https://github.com/OpenMathLib/OpenBLAS 32 | # GIT_TAG v0.3.29 # 使用最新的稳定版本或您需要的特定版本 33 | # ) 34 | 35 | # # 下载并构建 OpenBLAS 36 | # FetchContent_MakeAvailable(OpenBLAS) 37 | 38 | # 线程并行 39 | find_package(OpenMP REQUIRED) 40 | # simd 41 | find_package(hwy REQUIRED) 42 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2 -msse4.2") 43 | # 内存池 44 | find_package(PkgConfig REQUIRED) 45 | pkg_check_modules(JEMALLOC REQUIRED jemalloc) 46 | 47 | 48 | find_package(yaml-cpp REQUIRED) 49 | 
50 | add_library(deepx_ompsimd SHARED 51 | ${DEEPX_SOURCES} 52 | ) 53 | 54 | target_link_libraries( deepx_ompsimd 55 | PUBLIC 56 | deepx_common 57 | yaml-cpp 58 | ${JEMALLOC_LIBRARIES} 59 | openblas 60 | OpenMP::OpenMP_CXX 61 | hwy 62 | ) 63 | add_executable(${PROJECT_NAME} ${CLIENT_SOURCES}) 64 | target_link_libraries(${PROJECT_NAME} 65 | PRIVATE 66 | deepx_ompsimd 67 | ) 68 | # 测试 69 | add_subdirectory(test/tensorfunc) 70 | add_subdirectory(test/op) 71 | # # 数据集测试 72 | # add_subdirectory(test/dataset) 73 | -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker.array2d.com/library/ubuntu:18.04 2 | 3 | # 基础构建环境 4 | RUN apt-get update && apt-get install -y \ 5 | build-essential \ 6 | cmake \ 7 | libopenblas-dev \ 8 | libyaml-cpp-dev \ 9 | libjemalloc-dev \ 10 | clang \ 11 | git \ 12 | wget \ 13 | && rm -rf /var/lib/apt/lists/* 14 | 15 | # 安装 Highway SIMD 库 16 | 17 | RUN mkdir -p thirdlib && \ 18 | cd thirdlib && \ 19 | git clone https://github.com/google/highway.git && \ 20 | cd highway && \ 21 | rm -rf build && mkdir build && cd build && \ 22 | cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local && \ 23 | make -j$(nproc) && make install && \ 24 | cd ../../ && rm -rf thirdlib/highway 25 | 26 | ADD cpp-common cpp-common 27 | ADD op-mem-ompsimd op-mem-ompsimd 28 | WORKDIR /home/op-mem-ompsimd 29 | 30 | RUN rm -rf build && mkdir build && cd build && \ 31 | cmake ..&& \ 32 | make -j$(nproc) 33 | 34 | CMD ["./build/bin/deepx-excuter-ompsimd"] -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/dockerbuild.sh: -------------------------------------------------------------------------------- 1 | cd ../ 2 | pwd 3 | ls -al 4 | docker build -t docker.array2d.com/deepx/ompsimd:latest . 
-f op-mem-ompsimd/Dockerfile 5 | -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/log.md: -------------------------------------------------------------------------------- 1 | ### 2025-01-9 2 | deepx第三次重构 3 | 目标:性能与特性并重 4 | 5 | 6 | ### 2025-01-17 7 | 尝试omp+highway的simd融合 8 | 9 | ### 2025-01-20 10 | 11 | layer.Node需要仔细设计forward和backward的接口 12 | 13 | + 输入输出用string作为key,从tensormanager中获取tensor 14 | + parallel结构 15 | 16 | ### 2025-01-21 17 | h5模型文件,转deepx格式 18 | 19 | ### 2025-02-06 20 | 21 | op完全重构 22 | 23 | + 输入输出用string作为key,从tensormanager中获取tensor 24 | 25 | + 对算子的精度进行了特化 26 | 27 | 28 | ### 2025-02-07 29 | 30 | + 关于simd对齐的3段式对齐 31 | ``` 32 | 头部未对齐:通过标量运算处理直到对齐边界 33 | 34 | const size_t adjust = (alignment - misalign) / sizeof(T); 35 | for (; j < adjust...) 36 | 37 | 38 | 主体对齐部分:使用对齐加载/存储指令 39 | 40 | 41 | Load(tag, a_start + j); // 对齐加载 42 | Store(...); // 对齐存储 43 | 44 | 45 | 尾部剩余元素:处理最后不足一个向量宽度的元素 46 | 47 | 48 | for (; j < len; ++j) 49 | ``` 50 | 51 | ### 2025-02-19 52 | 53 | + 增加、优化了部分tensorfunc的性能 54 | + 验证了一些列op的正确性 55 | 56 | -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/src/client/tfs.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CLIENT_TFS_HPP 2 | #define CLIENT_TFS_HPP 3 | 4 | #include "deepx/tf/tffactory.hpp" 5 | namespace deepx::tf{ 6 | 7 | int register_all(TfFactory &tfactory); 8 | } 9 | 10 | #endif 11 | -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/src/deepx/dtype_ompsimd.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DEEPX_DTYPE_OMPSIMD_HPP 2 | #define DEEPX_DTYPE_OMPSIMD_HPP 3 | 4 | #include "deepx/dtype.hpp" 5 | 6 | namespace deepx 7 | { 8 | using namespace std; 9 | // 获取类型对应的Precision 10 | template 11 | constexpr Precision precision() 12 | { 13 | if constexpr (std::is_same_v) 14 | 
return Precision::Float64; 15 | else if constexpr (std::is_same_v) 16 | return Precision::Float32; 17 | else if constexpr (std::is_same_v) 18 | return Precision::Int64; 19 | else if constexpr (std::is_same_v) 20 | return Precision::Int32; 21 | else if constexpr (std::is_same_v) 22 | return Precision::Int16; 23 | else if constexpr (std::is_same_v) 24 | return Precision::Int8; 25 | else if constexpr (std::is_same_v) 26 | return Precision::Bool; 27 | else if constexpr (std::is_same_v) 28 | return Precision::String; 29 | else 30 | return Precision::Any; 31 | } 32 | } 33 | 34 | #endif // DEEPX_DTYPE_OMPSIMD_HPP 35 | -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/src/deepx/tensorfunc/equal.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DEEPX_TENSORFUNC_EQUAL_HPP 2 | #define DEEPX_TENSORFUNC_EQUAL_HPP 3 | #include 4 | #include 5 | 6 | #include "deepx/tensor.hpp" 7 | #include "deepx/shape.hpp" 8 | namespace deepx::tensorfunc 9 | { 10 | template 11 | bool equal(Tensor &tensor, Tensor &other,float epsilon=1e-6) 12 | { 13 | bool result=true; 14 | if (tensor.shape.shape != other.shape.shape) 15 | return false; 16 | 17 | if constexpr (std::is_floating_point_v) 18 | { 19 | #pragma omp parallel for 20 | for (int i = 0; i < tensor.shape.size; ++i) 21 | { 22 | if (std::fabs(tensor.data[i] - other.data[i]) > epsilon) 23 | { 24 | #pragma omp atomic write 25 | result=false; 26 | } 27 | } 28 | 29 | return result; 30 | } 31 | else 32 | { 33 | return std::equal(tensor.data, tensor.data + tensor.shape.size, other.data); 34 | } 35 | }; 36 | } 37 | #endif // DEEPX_OP_CPU_EQUAL_HPP 38 | -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/src/deepx/tensorfunc/highway.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DEEPX_TENSORFUNC_HIGHWAY_HPP 2 | #define DEEPX_TENSORFUNC_HIGHWAY_HPP 3 
| 4 | #include 5 | 6 | namespace deepx::tensorfunc 7 | { 8 | using namespace hwy::HWY_NAMESPACE; 9 | 10 | template 11 | T ReduceMul(D d, Vec v) 12 | { 13 | T result = GetLane(v); 14 | for (size_t i = 1; i < Lanes(d); ++i) 15 | { 16 | result *= ExtractLane(v, i); 17 | } 18 | return result; 19 | } 20 | 21 | } 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/src/deepx/tensorfunc/matmul_miaobyte.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DEEPX_TENSORFUNC_MATMUL_MIAOBYTE_HPP 2 | #define DEEPX_TENSORFUNC_MATMUL_MIAOBYTE_HPP 3 | 4 | #include "deepx/tensorfunc/matmul.hpp" 5 | 6 | namespace deepx::tensorfunc 7 | { 8 | template 9 | struct matmulDispatcher 10 | { 11 | static void matmul(const Tensor &A, const Tensor &B, Tensor &C) 12 | { 13 | if (!check_matmul_shape(A.shape, B.shape)) 14 | { 15 | throw std::invalid_argument("A.shape could matmul with B.shape"); 16 | } 17 | //TODO 18 | //这里需要进一步优化 19 | C.shape.rangeParallel(C.shape.dim(), [&A,&B,&C](const int idx,const std::vector &indices,ThreadLocalVectors &tlv) { 20 | 21 | // int m=A.shape[-2]; 22 | int k=A.shape[-1]; 23 | // int n=B.shape[-1]; 24 | 25 | std::copy(indices.begin(),indices.end()-2,tlv.get(0).begin()); 26 | tlv.get(0)[indices.size()-2]=A.shape[-2]; 27 | tlv.get(0)[indices.size()-1]=indices[-1]; 28 | int aIdx=A.shape.linearat(tlv.get(0)); 29 | std::copy(indices.begin(),indices.end()-2,tlv.get(1).begin()); 30 | tlv.get(1)[indices.size()-2]=0; 31 | tlv.get(1)[indices.size()-1]=indices[-2]; 32 | int bIdx=B.shape.linearat(tlv.get(1)); 33 | int bstride=k; 34 | 35 | T sum=0; 36 | for(int l=0;l 5 | 6 | namespace deepx::tensorfunc 7 | { 8 | class MemoryPool 9 | { 10 | public: 11 | static void* Malloc(size_t size) { 12 | return mallocx(size, MALLOCX_ALIGN(64)); // 64字节对齐,适合SIMD 13 | } 14 | 15 | static void Free(void* ptr) { 16 | dallocx(ptr, 0); 17 | } 18 | 19 | // Realloc: 
重新分配内存并保留原数据,主要用于tensor形状改变时的内存重分配 20 | // 如果新的size小于原size,数据会被截断 21 | // 如果新的size大于原size,新分配的内存部分不会初始化 22 | // 如果ptr为nullptr,等同于Malloc 23 | // 如果size为0,等同于Free 24 | // 返回新分配的内存指针,如果分配失败返回nullptr 25 | 26 | static void* Realloc(void* ptr, size_t size) { 27 | return rallocx(ptr, size, MALLOCX_ALIGN(64)); 28 | } 29 | 30 | 31 | // GetAllocatedSize: 获取已分配内存的实际大小 32 | // 由于内存对齐,实际分配的内存可能大于请求的size 33 | // 主要用于内存使用统计和调试 34 | // 如果ptr为nullptr,返回0 35 | // 重新分配内存,保留原数据 36 | static size_t GetAllocatedSize(void* ptr) { 37 | return sallocx(ptr, 0); 38 | } 39 | }; 40 | 41 | } // namespace deepx::tensorfunc 42 | #endif // DEEPX_TENSORFUNC_NEW_MEMPOOL_HPP 43 | -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/test/op/1_mem.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "deepx/mem/mem_ompsimd.hpp" 3 | #include "deepx/tensor.hpp" 4 | #include "deepx/tensorfunc/tensorlife_miaobyte.hpp" 5 | #include "deepx/tensorfunc/init_miaobyte.hpp" 6 | #include "deepx/tensorfunc/io_miaobyte.hpp" 7 | #include "deepx/tensorfunc/authors.hpp" 8 | 9 | using namespace deepx::mem; 10 | using namespace deepx; 11 | using namespace deepx::tensorfunc; 12 | using namespace std; 13 | int main() 14 | { 15 | shared_ptr mem=make_shared(); 16 | for (int i = 0; i < 10; i++) 17 | { 18 | Tensor tensor = New({1, 2, 3}); 19 | uniform(tensor,0.0f,1.0f); 20 | mem->addtensor("tensor" + std::to_string(i), tensor ); 21 | } 22 | 23 | cout << mem->existstensor(string("tensor0")) << endl; 24 | print(*(mem->gettensor(string("tensor0")).get())); 25 | mem->clear(); 26 | 27 | return 0; 28 | } -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/test/op/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | add_executable(1_mem 1_mem.cpp) 3 | target_link_libraries(1_mem deepx_ompsimd) 4 | 5 | # 
add_executable(1_relu 1_relu.cpp) 6 | # target_link_libraries(1_relu deepx_ompsimd) 7 | 8 | 9 | # add_executable(2_add 2_add.cpp) 10 | # target_link_libraries(2_add deepx_ompsimd) 11 | 12 | # add_executable(3_matmul 3_matmul.cpp) 13 | # target_link_libraries(3_matmul deepx_ompsimd) 14 | 15 | # add_executable(4_sgd 4_sgd.cpp) 16 | # target_link_libraries(4_sgd deepx_ompsimd) -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/test/tensorfunc/1_shape.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | 5 | #include 6 | using namespace deepx; 7 | 8 | void test_tensor_shape() { 9 | Shape shape({2, 3, 4}); 10 | std::cout << "print shape: " << shape.size << std::endl; 11 | std::string yaml=shape.toYaml(); 12 | std::cout<<"yaml:"< 2 | #include 3 | #include "deepx/vector_combination.hpp" 4 | #include "stdutil/vector.hpp" 5 | using namespace deepx; 6 | 7 | void test_combination() 8 | { 9 | std::vector> result = combination(3); 10 | for (const auto &comb : result) 11 | { 12 | std::cout << "Combination:"< 2 | #include 3 | 4 | #include "deepx/tensorfunc/equal.hpp" 5 | #include "deepx/tensor.hpp" 6 | #include "deepx/tensorfunc/tensorlife_miaobyte.hpp" 7 | 8 | using namespace deepx; 9 | using namespace deepx::tensorfunc; 10 | void test_equal(){ 11 | Tensor tensor1=New({4096,4096}); 12 | std::iota(tensor1.data,tensor1.data+tensor1.shape.size,0); 13 | Tensor tensor2=New({4096,4096}); 14 | std::iota(tensor2.data,tensor2.data+tensor2.shape.size,0); 15 | std::cout< 3 | 4 | #include "deepx/tensor.hpp" 5 | 6 | #include "deepx/tensorfunc/tensorlife_miaobyte.hpp" 7 | #include "deepx/tensorfunc/init_miaobyte.hpp" 8 | #include "deepx/tensorfunc/authors.hpp" 9 | #include "deepx/tensorfunc/io_miaobyte.hpp" 10 | 11 | using namespace deepx; 12 | using namespace deepx::tensorfunc; 13 | void test_tensor_new(){ 14 | Tensor tensor=New({2, 3}); 15 | constant(tensor,1); 16 | 
print(tensor); 17 | tensor.save("tensor"); 18 | Tensor tensor2=New({2, 3}); 19 | constant(tensor2,2); 20 | print(tensor2); 21 | tensor2.save("tensor2"); 22 | } 23 | 24 | void test_arange() { 25 | Tensor tensor=New({2, 3}); 26 | arange(tensor,float(0),float(1)); 27 | print(tensor); 28 | } 29 | 30 | int main(int argc,char **argv){ 31 | int i=0; 32 | if (argc>1){ 33 | 34 | i=std::atoi(argv[1]); 35 | } 36 | switch (i) { 37 | case 1: 38 | test_tensor_new(); 39 | case 0: 40 | test_arange(); 41 | } 42 | return 0; 43 | } -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/test/tensorfunc/2_tensor_range.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | 4 | #include "deepx/tensor.hpp" 5 | #include "deepx/tensorfunc/tensorlife_miaobyte.hpp" 6 | #include "deepx/tensorfunc/init_miaobyte.hpp" 7 | #include "deepx/tensorfunc/io_miaobyte.hpp" 8 | #include "deepx/tensorfunc/authors.hpp" 9 | 10 | 11 | using namespace deepx; 12 | using namespace deepx::tensorfunc; 13 | void test_tensor_range(){ 14 | Tensor tensor=New({2, 3}); 15 | constant(tensor,1); 16 | print(tensor); 17 | 18 | Tensor tensor2=New({2, 3}); 19 | constant(tensor2,2); 20 | print(tensor2); 21 | 22 | } 23 | 24 | int main(){ 25 | test_tensor_range(); 26 | 27 | return 0; 28 | } -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/test/tensorfunc/2_tensor_range.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import yaml 4 | import os 5 | import sys 6 | def load_shape(path): 7 | with open(path + '.shape', 'r') as f: 8 | shape_data = f.read() 9 | shape = yaml.safe_load(shape_data) 10 | return shape['shape'], shape['dim'], shape['strides'], shape['size'] 11 | 12 | def load_tensor_data(path, shape): 13 | data = np.fromfile(path + '.data', dtype=np.float32) 14 | return data.reshape(shape) 15 
| 16 | def load_deepx_tensor(path): 17 | shape, dim, strides, size = load_shape(path) 18 | tensor_data = load_tensor_data(path, shape) 19 | return torch.tensor(tensor_data) 20 | 21 | # 使用示例 22 | if __name__ == "__main__": 23 | name=sys.argv[1] 24 | tensor = load_deepx_tensor(name) 25 | print("Tensor:", tensor) 26 | print("Shape:", tensor.size()) 27 | print("Strides:", tensor.stride()) -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/test/tensorfunc/3_tensor_print.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "deepx/tensor.hpp" 4 | #include "deepx/tensorfunc/io_miaobyte.hpp" 5 | #include "deepx/tensorfunc/tensorlife_miaobyte.hpp" 6 | #include "deepx/tensorfunc/authors.hpp" 7 | 8 | using namespace deepx::tensorfunc; 9 | int main(){ 10 | deepx::Tensor t=New({2, 3,4}); 11 | std::iota(t.data, t.data+t.shape.size, 0); 12 | print(t); 13 | return 0; 14 | } -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/test/tensorfunc/4_tensor_max.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "deepx/tensorfunc/elementwise.hpp" 3 | #include "deepx/tensorfunc/elementwise_miaobyte.hpp" 4 | #include "deepx/tensor.hpp" 5 | #include "deepx/tensorfunc/init_miaobyte.hpp" 6 | #include "deepx/tensorfunc/io_miaobyte.hpp" 7 | #include "deepx/tensorfunc/tensorlife_miaobyte.hpp" 8 | #include "deepx/tensorfunc/authors.hpp" 9 | #include "tensorutil.hpp" 10 | 11 | using namespace deepx; 12 | using namespace deepx::tensorfunc; 13 | 14 | void test_max(){ 15 | std::vector shape=randomshape(1,3,1,19); 16 | Tensor A=New(shape); 17 | std::iota(A.data,A.data+A.shape.size,0); 18 | print(A) ; 19 | Tensor B=New(shape); 20 | constant(B,float(55)); 21 | print(B); 22 | Tensor C=New(shape); 23 | Tensor D=New(shape); 24 | max(A,B,C); 25 | print(C); 26 | min(A,B,D); 27 | 
print(D); 28 | } 29 | int main(){ 30 | test_max(); 31 | } -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/test/tensorfunc/7_tensor_transpose.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "deepx/tensor.hpp" 6 | #include "deepx/tensorfunc/changeshape_miaobyte.hpp" 7 | #include "deepx/tensorfunc/tensorlife_miaobyte.hpp" 8 | #include "deepx/tensorfunc/authors.hpp" 9 | #include "deepx/tensorfunc/io_miaobyte.hpp" 10 | #include "stdutil/vector.hpp" 11 | #include "tensorutil.hpp" 12 | #include "deepx/shape_changeshape.hpp" 13 | 14 | using namespace deepx::tensorfunc; 15 | using namespace deepx; 16 | using namespace std; 17 | void test_transpose() 18 | { 19 | std::vector shape = randomshape(2, 4, 1, 6); 20 | Tensor tensor = New(shape); 21 | std::iota(tensor.data, tensor.data + tensor.shape.size, 1); 22 | print(tensor); 23 | 24 | vector dimOrder = swaplastTwoDimOrder(shape); 25 | 26 | std::vector resultshape = transposeShape(tensor.shape.shape, dimOrder); 27 | Tensor result = New(resultshape); 28 | transpose(tensor, dimOrder, result); 29 | print(result); 30 | } 31 | 32 | int main() 33 | { 34 | test_transpose(); 35 | } -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/test/tensorfunc/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(1_shape 1_shape.cpp) 2 | target_link_libraries(1_shape deepx_ompsimd) 3 | 4 | add_executable(2_shape_combintion 2_shape_combintion.cpp) 5 | target_link_libraries(2_shape_combintion deepx_ompsimd) 6 | 7 | add_executable(2_tensor_new 2_tensor_new.cpp) 8 | target_link_libraries(2_tensor_new deepx_ompsimd ) 9 | 10 | add_executable(2_tensor_range 2_tensor_range.cpp) 11 | target_link_libraries(2_tensor_range deepx_ompsimd ) 12 | 13 | add_executable(2_tensor_equal 2_tensor_equal.cpp) 
14 | target_link_libraries(2_tensor_equal deepx_ompsimd ) 15 | 16 | add_executable(3_tensor_print 3_tensor_print.cpp) 17 | target_link_libraries(3_tensor_print deepx_ompsimd ) 18 | 19 | 20 | add_executable(4_tensor_matmul 4_tensor_matmul.cpp) 21 | target_link_libraries(4_tensor_matmul deepx_ompsimd ) 22 | 23 | add_executable(4_tensor_add 4_tensor_add.cpp) 24 | target_link_libraries(4_tensor_add deepx_ompsimd ) 25 | 26 | add_executable(4_tensor_sub 4_tensor_sub.cpp) 27 | target_link_libraries(4_tensor_sub deepx_ompsimd ) 28 | 29 | add_executable(4_tensor_mul 4_tensor_mul.cpp) 30 | target_link_libraries(4_tensor_mul deepx_ompsimd ) 31 | 32 | add_executable(4_tensor_max 4_tensor_max.cpp) 33 | target_link_libraries(4_tensor_max deepx_ompsimd ) 34 | 35 | 36 | add_executable(5_tensor_sum 5_tensor_sum.cpp) 37 | target_link_libraries(5_tensor_sum deepx_ompsimd ) 38 | 39 | add_executable(6_tensor_broadcast 6_tensor_broadcast.cpp) 40 | target_link_libraries(6_tensor_broadcast deepx_ompsimd ) 41 | 42 | add_executable(7_tensor_transpose 7_tensor_transpose.cpp) 43 | target_link_libraries(7_tensor_transpose deepx_ompsimd ) 44 | 45 | add_executable(8_tensor_concat 8_tensor_concat.cpp) 46 | target_link_libraries(8_tensor_concat deepx_ompsimd ) -------------------------------------------------------------------------------- /excuter/op-mem-ompsimd/test/tensorfunc/tensorutil.hpp: -------------------------------------------------------------------------------- 1 | #ifndef TENSORUTIL_HPP 2 | #define TENSORUTIL_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "deepx/tensor.hpp" 11 | 12 | using namespace deepx; 13 | 14 | /* 15 | dimlen_min:shape.size()的最小维度长度 16 | dimlen_max:shape.size()的最大维度长度 17 | shape_min:shape[i]的最小维度数量 18 | shape_max:shape[i]的最大维度数量 19 | */ 20 | 21 | std::vector randomshape(size_t dimlen_min, size_t dimlen_max, size_t shape_min, size_t shape_max) { 22 | // 初始化随机数种子 23 | std::srand(static_cast(std::time(nullptr))); 24 | 25 
| // 随机生成维度长度 26 | size_t dimlen = dimlen_min + std::rand() % (dimlen_max - dimlen_min + 1); 27 | 28 | // 创建存储形状的向量 29 | std::vector shape(dimlen); 30 | 31 | // 为每个维度随机生成形状值 32 | for (size_t i = 0; i < dimlen; ++i) { 33 | shape[i] = static_cast(shape_min + std::rand() % (shape_max - shape_min + 1)); 34 | } 35 | 36 | return shape; 37 | } 38 | 39 | std::vector randomshape2(size_t dimlen_min, size_t dimlen_max, size_t dim_min, size_t dim_max) { 40 | std::random_device rd; 41 | std::mt19937 gen(rd()); 42 | 43 | // 生成维度数量 44 | std::uniform_int_distribution<> dim_dist(dimlen_min, dimlen_max); 45 | int dims = dim_dist(gen); 46 | 47 | // 生成每个维度的长度 48 | std::uniform_int_distribution<> len_dist(dim_min, dim_max); 49 | std::vector shape; 50 | shape.reserve(dims); 51 | 52 | for (int i = 0; i < dims; ++i) { 53 | shape.push_back(len_dist(gen)); 54 | } 55 | 56 | return shape; 57 | } 58 | 59 | 60 | #endif // TENSORUTIL_HPP 61 | -------------------------------------------------------------------------------- /front/go/README.md: -------------------------------------------------------------------------------- 1 | # deepx-go 2 | 3 | deepx-go是为了构建抽象计算图的golang接口库 4 | 5 | 供算法人员搭建模型使用 6 | 7 | ## 存算一体,控制分离 8 | 9 | deepx-py是控制侧,负责生成计算图,通过通信调用其他进程 10 | 11 | 在excuter中,实现了具体的存算引擎 12 | 13 | 14 | 15 | ## 接口设计 16 | deepx-py的接口设计尽量贴合pytorch,方便用户进行迁移 17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /front/go/deepx/graph_constarg.go: -------------------------------------------------------------------------------- 1 | package deepx 2 | 3 | type ArgType int 4 | 5 | const ( 6 | ArgTypeInt ArgType = iota 7 | ArgTypeFloat 8 | ArgTypeString 9 | ArgTypeIntVector 10 | ) 11 | 12 | type ConstArgNode struct { 13 | name string 14 | ntype NodeType 15 | inputs map[string]Node 16 | value any 17 | argType ArgType 18 | } 19 | 20 | func NewConstArgNode(name string) *ConstArgNode { 21 | return &ConstArgNode{ 22 | name: name, 23 | ntype: NodeConstArg, 24 | 
inputs: make(map[string]Node), 25 | } 26 | } 27 | func (n *ConstArgNode) Ntype() NodeType { 28 | return n.ntype 29 | } 30 | func (n *ConstArgNode) Name() string { 31 | return n.name 32 | } 33 | func (n *ConstArgNode) Input(name string) Node { 34 | return n.inputs[name] 35 | } 36 | func (n *ConstArgNode) Inputs() map[string]Node { 37 | return n.inputs 38 | } 39 | func (n *ConstArgNode) AddInput(name string, input Node) { 40 | n.inputs[name] = input 41 | } 42 | func (n *ConstArgNode) RemoveInput(name string) { 43 | delete(n.inputs, name) 44 | } 45 | func (n *ConstArgNode) Int() int { 46 | if n.argType != ArgTypeInt { 47 | panic("ConstArgNode is not an integer") 48 | } 49 | return n.value.(int) 50 | } 51 | func (n *ConstArgNode) Float() float64 { 52 | if n.argType != ArgTypeFloat { 53 | panic("ConstArgNode is not a float") 54 | } 55 | return n.value.(float64) 56 | } 57 | func (n *ConstArgNode) String() string { 58 | if n.argType != ArgTypeString { 59 | panic("ConstArgNode is not a string") 60 | } 61 | return n.value.(string) 62 | } 63 | func (n *ConstArgNode) SetInt(value int) { 64 | n.value = value 65 | n.argType = ArgTypeInt 66 | } 67 | func (n *ConstArgNode) SetInts(value []int) { 68 | n.value = value 69 | n.argType = ArgTypeIntVector 70 | } 71 | func (n *ConstArgNode) SetFloat(value float64) { 72 | n.value = value 73 | n.argType = ArgTypeFloat 74 | } 75 | func (n *ConstArgNode) SetString(value string) { 76 | n.value = value 77 | n.argType = ArgTypeString 78 | } 79 | -------------------------------------------------------------------------------- /front/go/deepx/graph_opnode.go: -------------------------------------------------------------------------------- 1 | package deepx 2 | 3 | type OpType struct { 4 | name string 5 | shortchar string 6 | } 7 | 8 | var ( 9 | opmaps = make(map[string]OpType) 10 | ) 11 | 12 | func RegistOpType(name string, shortchar string) { 13 | opmaps[name] = OpType{name, shortchar} 14 | } 15 | 16 | type OpNode struct { 17 | OpType 18 | 
ntype NodeType 19 | 20 | inputs map[string]Node 21 | } 22 | 23 | func NewOpNode(name string) *OpNode { 24 | return &OpNode{ 25 | OpType: opmaps[name], 26 | ntype: NodeOp, 27 | inputs: make(map[string]Node), 28 | } 29 | } 30 | func (n *OpNode) Ntype() NodeType { 31 | return n.ntype 32 | } 33 | func (n *OpNode) Name() string { 34 | return n.name 35 | } 36 | func (n *OpNode) Input(name string) Node { 37 | return n.inputs[name] 38 | } 39 | func (n *OpNode) Inputs() map[string]Node { 40 | return n.inputs 41 | } 42 | func (n *OpNode) AddInput(name string, input Node) { 43 | n.inputs[name] = input 44 | } 45 | func (n *OpNode) RemoveInput(name string) { 46 | delete(n.inputs, name) 47 | } 48 | func (n *OpNode) Shortchar() string { 49 | return n.shortchar 50 | } 51 | -------------------------------------------------------------------------------- /front/go/deepx/graph_tensornode.go: -------------------------------------------------------------------------------- 1 | package deepx 2 | 3 | type TensorNode struct { 4 | name string 5 | ntype NodeType 6 | inputs map[string]Node 7 | tensor *Tensor // 对于 NodeTensor 类型,存储实际的张量数据 8 | } 9 | 10 | func NewTensorNode(name string) *TensorNode { 11 | return &TensorNode{ 12 | name: name, 13 | ntype: NodeTensor, 14 | inputs: make(map[string]Node), 15 | } 16 | } 17 | func (n *TensorNode) Ntype() NodeType { 18 | return n.ntype 19 | } 20 | func (n *TensorNode) Name() string { 21 | return n.name 22 | } 23 | func (n *TensorNode) Input(name string) Node { 24 | return n.inputs[name] 25 | } 26 | func (n *TensorNode) Inputs() map[string]Node { 27 | return n.inputs 28 | } 29 | func (n *TensorNode) AddInput(name string, input Node) { 30 | n.inputs[name] = input 31 | } 32 | func (n *TensorNode) RemoveInput(name string) { 33 | delete(n.inputs, name) 34 | } 35 | func (n *TensorNode) Tensor() *Tensor { 36 | return n.tensor 37 | } 38 | func (n *TensorNode) SetTensor(tensor *Tensor) { 39 | n.tensor = tensor 40 | } 41 | 
package deepx

// Linear is a fully-connected layer: z = x·W + b.
type Linear struct {
	ModuleBase
	W *Tensor // weight, shape [in_features, out_features]
	b *Tensor // bias, shape [out_features]
}

// NewLinear registers a linear layer's parameters (W, bias) and their
// const-arg dimension nodes into graph g. A nil g creates a fresh graph;
// an empty name defaults to "linear". Both parameter tensors are created
// with requires-grad set to true.
func NewLinear(name string, in_features, out_features int, dtype Dtype, g *Graph) (m *Linear) {
	if g == nil {
		g = NewGraph()
	}
	if name == "" {
		name = "linear"
	}
	m = &Linear{
		ModuleBase: ModuleBase{
			g:    g,
			name: name,
		},
	}

	in_features_node := g.AddConstArg(name + ".in_features")
	in_features_node.SetInt(in_features)
	out_features_node := g.AddConstArg(name + ".out_features")
	out_features_node.SetInt(out_features)

	// If exploiting the shortcut computation for the weight gradient, W's
	// shape would instead be set to [out_features, in_features] to achieve
	// a pre-transposed layout.
	m.W = g.AddTensor(name+".W", dtype, []int{in_features, out_features}, true, in_features_node, out_features_node).Tensor()
	m.b = g.AddTensor(name+".bias", dtype, []int{out_features}, true, out_features_node).Tensor()
	return m
}

// Linear applies the layer to input: matmul with W, then add bias.
func (m *Linear) Linear(input *Tensor) *Tensor {
	y := input.Matmul(m.W)
	z := y.Add(m.b)
	return z
}
return x 25 | } 26 | -------------------------------------------------------------------------------- /front/go/deepx/module.go: -------------------------------------------------------------------------------- 1 | package deepx 2 | 3 | type Module interface { 4 | Graph() *Graph 5 | Name() string 6 | } 7 | type ModuleBase struct { 8 | name string 9 | g *Graph 10 | } 11 | -------------------------------------------------------------------------------- /front/go/deepx/tensor_activite.go: -------------------------------------------------------------------------------- 1 | package deepx 2 | 3 | func init() { 4 | RegistOpType("relu", "ReLU") 5 | } 6 | 7 | func (t *Tensor) Relu() *Tensor { 8 | result := t.graph.AddTensor("", t.Dtype, t.Shape.shape, t.requiresGrad) 9 | op := t.graph.AddOp("relu", t.node) 10 | result.AddInput(op.name, op) 11 | return result.tensor 12 | } 13 | -------------------------------------------------------------------------------- /front/go/deepx/tensor_matmul.go: -------------------------------------------------------------------------------- 1 | package deepx 2 | 3 | func init() { 4 | RegistOpType("matmul", "⊗") 5 | } 6 | 7 | func (t *Tensor) Matmul(other *Tensor) *Tensor { 8 | result := t.graph.AddTensor("", t.Dtype, t.Shape.shape, t.requiresGrad) 9 | op := t.graph.AddOp("matmul", t.node, other.node) 10 | result.AddInput(op.name, op) 11 | return result.tensor 12 | } 13 | -------------------------------------------------------------------------------- /front/go/deepx/tensor_musk.go: -------------------------------------------------------------------------------- 1 | package deepx 2 | 3 | func init() { 4 | RegistOpType("mask", "mask") 5 | } 6 | 7 | func (t *Tensor) ApplyMask(mask *Tensor) *Tensor { 8 | result := t.graph.AddTensor("", t.Dtype, t.Shape.shape, t.requiresGrad) 9 | op := t.graph.AddOp("mask", t.node, mask.node) 10 | result.AddInput(op.name, op) 11 | return result.tensor 12 | } 13 | 
-------------------------------------------------------------------------------- /front/go/deepx/tensor_norm.go: -------------------------------------------------------------------------------- 1 | package deepx 2 | 3 | func init() { 4 | RegistOpType("L2Norm", "l2norm") 5 | RegistOpType("L1Norm", "l1norm") 6 | } 7 | 8 | // L2Norm 计算L2范数 9 | // ||x||₂ = sqrt(Σx²) 10 | func (t *Tensor) L2Norm() *Tensor { 11 | result := t.graph.AddTensor("", t.Dtype, t.Shape.shape, t.requiresGrad) 12 | op := t.graph.AddOp("l2norm", t.node) 13 | result.AddInput(op.name, op) 14 | return result.tensor 15 | } 16 | 17 | // L1Norm 计算L1范数 18 | // ||x||₁ = Σ|x| 19 | func (t *Tensor) L1Norm() *Tensor { 20 | result := t.graph.AddTensor("", t.Dtype, t.Shape.shape, t.requiresGrad) 21 | op := t.graph.AddOp("l1norm", t.node) 22 | result.AddInput(op.name, op) 23 | return result.tensor 24 | } 25 | -------------------------------------------------------------------------------- /front/go/deepx/tensor_normalization.go: -------------------------------------------------------------------------------- 1 | package deepx 2 | 3 | func init() { 4 | RegistOpType("softmax", "softmax") 5 | RegistOpType("layernorm", "LN") 6 | RegistOpType("batchnorm", "BN") 7 | RegistOpType("instancenorm", "IN") 8 | RegistOpType("groupnorm", "GN") 9 | RegistOpType("rmsnorm", "RMS") 10 | } 11 | func (t *Tensor) Softmax(axis int) *Tensor { 12 | // 1. 计算最大值 13 | x_max := t.Max([]int{axis}) 14 | // 2. 减去最大值 15 | shifted := t.Sub(x_max) 16 | // 3. 计算指数 17 | exp_x := shifted.Exp() 18 | // 4. 计算和 19 | sum_exp := exp_x.Sum([]int{axis}) 20 | // 5. 归一化 21 | result := exp_x.Div(sum_exp) 22 | return result 23 | } 24 | 25 | func (t *Tensor) MinMax(axis int) *Tensor { 26 | // 1. 计算最大值 27 | x_max := t.Max([]int{axis}) 28 | // 2. 计算最小值 29 | x_min := t.Min([]int{axis}) 30 | // 3. 计算范围 31 | ranged := x_max.Sub(x_min) 32 | // 4. 
归一化 33 | result := t.Sub(x_min).Div(ranged) 34 | return result 35 | } 36 | -------------------------------------------------------------------------------- /front/go/deepx/tensor_shape.go: -------------------------------------------------------------------------------- 1 | package deepx 2 | 3 | func init() { 4 | RegistOpType("reshape", "reshape") 5 | RegistOpType("transpose", "T") 6 | } 7 | 8 | func (t *Tensor) Reshape(shape []int) *Tensor { 9 | result := t.graph.AddTensor("", t.Dtype, shape, t.requiresGrad) 10 | op := t.graph.AddOp("reshape", t.node) 11 | result.AddInput(op.name, op) 12 | return result.tensor 13 | } 14 | 15 | func (t *Tensor) Transpose(axes []int) *Tensor { 16 | result := t.graph.AddTensor("", t.Dtype, t.Shape.shape, t.requiresGrad) 17 | op := t.graph.AddOp("transpose", t.node) 18 | result.AddInput(op.name, op) 19 | return result.tensor 20 | } 21 | -------------------------------------------------------------------------------- /front/go/deepx/transformer.go: -------------------------------------------------------------------------------- 1 | package deepx 2 | 3 | type TransformerLayer struct { 4 | ModuleBase 5 | attention *MultiHeadAttention 6 | mlp *MLP 7 | ln1 *LayerNorm 8 | ln2 *LayerNorm 9 | } 10 | 11 | func NewTransformerLayer(name string, hidden_size, num_heads, mlp_ratio int, dtype Dtype, g *Graph) *TransformerLayer { 12 | if name == "" { 13 | name = "transformer_layer" 14 | } 15 | 16 | return &TransformerLayer{ 17 | ModuleBase: ModuleBase{ 18 | g: g, 19 | name: name, 20 | }, 21 | attention: NewMultiHeadAttention(name+".attn", hidden_size, num_heads, dtype, g), 22 | mlp: NewMLP(name+".mlp", hidden_size, mlp_ratio*hidden_size, dtype, g), 23 | ln1: NewLayerNorm(name+".ln1", hidden_size, dtype, g), 24 | ln2: NewLayerNorm(name+".ln2", hidden_size, dtype, g), 25 | } 26 | } 27 | 28 | func (m *TransformerLayer) Forward(x *Tensor) *Tensor { 29 | // 1. 
Self Attention 30 | h := m.ln1.LayerNorm(x) 31 | h = m.attention.Forward(h, h, h) 32 | x = x.Add(h) // residual 33 | 34 | // 2. MLP 35 | h = m.ln2.LayerNorm(x) 36 | h = m.mlp.Forward(h) 37 | x = x.Add(h) // residual 38 | 39 | return x 40 | } 41 | -------------------------------------------------------------------------------- /front/go/deepx/transformer/attention.go: -------------------------------------------------------------------------------- 1 | package transformer 2 | 3 | // Qwen2Attention 模拟注意力层 4 | type Qwen2Attention struct { 5 | // 注意力层相关权重等参数 6 | } 7 | 8 | func (a *Qwen2Attention) Forward(hiddenStates interface{}, pastKV interface{}) (interface{}, interface{}, error) { 9 | // 计算查询、键、值以及 RoPE 位置编码 10 | // 如果存在 pastKV,则进行拼接 11 | // 计算注意力分数并返回注意力输出及新的 KV 缓存 12 | return nil, nil, nil 13 | } 14 | -------------------------------------------------------------------------------- /front/go/deepx/transformer/config.go: -------------------------------------------------------------------------------- 1 | package transformer 2 | 3 | // Config 定义模型配置 4 | type Config struct { 5 | // 模型基本配置 6 | HiddenSize int 7 | NumLayers int 8 | NumHeads int 9 | MLPRatio int 10 | VocabSize int 11 | MaxSeqLength int 12 | InitializerRange float32 13 | 14 | // 注意力相关配置 15 | AttentionImpl string 16 | SlidingWindow int 17 | UseFlashAttn bool 18 | 19 | // 生成相关配置 20 | UseCache bool 21 | BeamSize int 22 | TopK int 23 | TopP float32 24 | Temperature float32 25 | } 26 | -------------------------------------------------------------------------------- /front/go/deepx/transformer/model.go: -------------------------------------------------------------------------------- 1 | package transformer 2 | 3 | // PreTrainedModel 定义了基础模型接口 4 | type PreTrainedModel interface { 5 | Forward(inputs ...interface{}) (interface{}, error) 6 | Generate(inputs ...interface{}) (interface{}, error) 7 | SavePretrained(path string) error 8 | FromPretrained(path string) (PreTrainedModel, error) 9 | } 10 | 11 | // 
Qwen2PreTrainedModel 实现了基类的一部分功能 12 | type Qwen2PreTrainedModel struct { 13 | Config *Config 14 | } 15 | 16 | func (m *Qwen2PreTrainedModel) Forward(args ...interface{}) (interface{}, error) { 17 | // 实现前向传播逻辑,可留空或返回默认值 18 | return nil, nil 19 | } 20 | 21 | func (m *Qwen2PreTrainedModel) Generate(inputs ...interface{}) (interface{}, error) { 22 | // 实现生成逻辑,例如自回归生成 23 | return nil, nil 24 | } 25 | -------------------------------------------------------------------------------- /front/go/deepx/transformer/qwen2_causal_lm.go: -------------------------------------------------------------------------------- 1 | package transformer 2 | 3 | // Qwen2ForCausalLM 为生成模型入口 4 | type Qwen2ForCausalLM struct { 5 | *Qwen2PreTrainedModel 6 | Model *Qwen2Model 7 | LMHead interface{} 8 | } 9 | 10 | func (m *Qwen2ForCausalLM) Forward(inputIDs []int, pastKV [][]interface{}) (interface{}, error) { 11 | outputs, err := m.Model.Forward(inputIDs, nil, nil, pastKV) 12 | if err != nil { 13 | return nil, err 14 | } 15 | // 根据主干网络输出生成 logits 16 | hiddenStates := outputs.(struct { 17 | LastHiddenState interface{} 18 | PastKeyValues [][]interface{} 19 | }).LastHiddenState 20 | logits := m.lmHeadForward(hiddenStates) 21 | return struct { 22 | Logits interface{} 23 | PastKeyValues [][]interface{} 24 | }{Logits: logits, PastKeyValues: outputs.(struct { 25 | LastHiddenState interface{} 26 | PastKeyValues [][]interface{} 27 | }).PastKeyValues}, nil 28 | } 29 | 30 | func (m *Qwen2ForCausalLM) PrepareInputsForGeneration(inputIDs []int, pastKV [][]interface{}) map[string]interface{} { 31 | if pastKV != nil && len(inputIDs) > 0 { 32 | // 仅保留最后一个 token 33 | inputIDs = inputIDs[len(inputIDs)-1:] 34 | } 35 | return map[string]interface{}{ 36 | "input_ids": inputIDs, 37 | "past_key_values": pastKV, 38 | "use_cache": true, 39 | } 40 | } 41 | 42 | // lmHeadForward 模拟 lm_head 的前向传播 43 | func (m *Qwen2ForCausalLM) lmHeadForward(hiddenStates interface{}) interface{} { 44 | // 实现将 hiddenStates 投影到词表维度的逻辑 45 | 
return nil 46 | } 47 | -------------------------------------------------------------------------------- /front/go/deepx/transformer/qwen2_model.go: -------------------------------------------------------------------------------- 1 | package transformer 2 | 3 | // Qwen2DecoderLayer 定义单层 Decoder 的接口 4 | type Qwen2DecoderLayer interface { 5 | Forward(hiddenStates interface{}, attentionMask interface{}, 6 | positionIds interface{}, pastKV interface{}) (output interface{}, newKV interface{}, err error) 7 | } 8 | 9 | // Qwen2Model 为主网络 10 | type Qwen2Model struct { 11 | *Qwen2PreTrainedModel // 组合方式复用基类功能 12 | EmbedTokens interface{} // Token 嵌入层 13 | Layers []Qwen2DecoderLayer 14 | } 15 | 16 | func (m *Qwen2Model) Forward(inputIDs []int, attentionMask []int, positionIDs []int, pastKV [][]interface{}) (interface{}, error) { 17 | // 模拟 token 嵌入 18 | hiddenStates := m.embedTokensForward(inputIDs) 19 | var updatedKV [][]interface{} 20 | // 遍历每一层 Decoder 21 | for i, layer := range m.Layers { 22 | var pastKVLayer interface{} 23 | if pastKV != nil && i < len(pastKV) { 24 | pastKVLayer = pastKV[i] 25 | } 26 | output, newKV, err := layer.Forward(hiddenStates, attentionMask, positionIDs, pastKVLayer) 27 | if err != nil { 28 | return nil, err 29 | } 30 | hiddenStates = output 31 | updatedKV = append(updatedKV, newKV) 32 | } 33 | return struct { 34 | LastHiddenState interface{} 35 | PastKeyValues [][]interface{} 36 | }{LastHiddenState: hiddenStates, PastKeyValues: updatedKV}, nil 37 | } 38 | 39 | // embedTokensForward 为嵌入层的模拟实现 40 | func (m *Qwen2Model) embedTokensForward(inputIDs []int) interface{} { 41 | // 根据 inputIDs 返回对应的嵌入向量 42 | return nil 43 | } 44 | -------------------------------------------------------------------------------- /front/go/deepx/transformer_model.go: -------------------------------------------------------------------------------- 1 | package deepx 2 | 3 | import "fmt" 4 | 5 | type Transformer struct { 6 | ModuleBase 7 | embedding *Linear 8 | layers 
[]*TransformerLayer 9 | ln_final *LayerNorm 10 | } 11 | 12 | func NewTransformer(name string, num_layers, hidden_size, num_heads, mlp_ratio int, dtype Dtype, g *Graph) *Transformer { 13 | if name == "" { 14 | name = "transformer" 15 | } 16 | 17 | m := &Transformer{ 18 | ModuleBase: ModuleBase{ 19 | g: g, 20 | name: name, 21 | }, 22 | embedding: NewLinear(name+".embedding", hidden_size, hidden_size, dtype, g), 23 | layers: make([]*TransformerLayer, num_layers), 24 | ln_final: NewLayerNorm(name+".ln_final", hidden_size, dtype, g), 25 | } 26 | 27 | for i := 0; i < num_layers; i++ { 28 | m.layers[i] = NewTransformerLayer( 29 | fmt.Sprintf("%s.layer_%d", name, i), 30 | hidden_size, 31 | num_heads, 32 | mlp_ratio, 33 | dtype, 34 | g, 35 | ) 36 | } 37 | 38 | return m 39 | } 40 | 41 | func (m *Transformer) Forward(x *Tensor) *Tensor { 42 | x = m.embedding.Linear(x) 43 | 44 | for _, layer := range m.layers { 45 | x = layer.Forward(x) 46 | } 47 | 48 | x = m.ln_final.LayerNorm(x) 49 | return x 50 | } 51 | -------------------------------------------------------------------------------- /front/go/example/1/1_app.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "os" 5 | 6 | "github.com/array2d/deepx/front/go/deepx" 7 | ) 8 | 9 | type Module1 struct { 10 | g *deepx.Graph 11 | } 12 | 13 | func (m *Module1) Linear(input *deepx.Tensor) *deepx.Tensor { 14 | // 创建输入节点 15 | w_node := m.g.AddTensor("W", deepx.DtypeFloat32, []int{3, 4, 5}, true) 16 | 17 | // 自动构建计算图 18 | y := input.Matmul(w_node.Tensor()) 19 | 20 | b_node := m.g.AddTensor("b", deepx.DtypeFloat32, []int{1, 4, 5}, true) 21 | z := y.Add(b_node.Tensor()) 22 | return z 23 | } 24 | func (m *Module1) Forward() (z *deepx.Tensor) { 25 | x_node := m.g.AddTensor("Input", deepx.DtypeFloat32, []int{1, 2, 3}, true) 26 | z = x_node.Tensor() 27 | for i := 0; i < 2; i++ { 28 | z = m.Linear(z) 29 | } 30 | 31 | return z 32 | } 33 | 34 | func main() { 35 | module := 
&Module1{ 36 | g: deepx.NewGraph(), 37 | } 38 | module.Forward() 39 | 40 | dot := module.g.ToDOT() 41 | os.WriteFile("1_app.dot", []byte(dot), 0644) 42 | } 43 | -------------------------------------------------------------------------------- /front/go/example/3/3_transformer_app.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "os" 5 | 6 | "github.com/array2d/deepx/front/go/deepx" 7 | ) 8 | 9 | func main() { 10 | // 创建计算图 11 | g := deepx.NewGraph() 12 | 13 | // 创建 Transformer 配置 14 | config := struct { 15 | hidden_size int 16 | num_heads int 17 | num_layers int 18 | mlp_ratio int 19 | dtype deepx.Dtype 20 | }{ 21 | hidden_size: 256, 22 | num_heads: 4, 23 | num_layers: 2, 24 | mlp_ratio: 4, 25 | dtype: deepx.DtypeFloat32, 26 | } 27 | 28 | // 创建 Transformer 模型 29 | transformer := deepx.NewTransformer( 30 | "transformer", 31 | config.num_layers, 32 | config.hidden_size, 33 | config.num_heads, 34 | config.mlp_ratio, 35 | config.dtype, 36 | g, 37 | ) 38 | 39 | // 创建输入张量 40 | batch_size := 1 41 | seq_len := 32 42 | input := g.AddTensor( 43 | "input", 44 | config.dtype, 45 | []int{batch_size, seq_len, config.hidden_size}, 46 | true, 47 | ) 48 | 49 | // 前向计算,构建计算图 50 | transformer.Forward(input.Tensor()) 51 | 52 | // 将计算图导出为 DOT 格式 53 | dot := g.ToDOT() 54 | os.WriteFile("transformer.dot", []byte(dot), 0644) 55 | } 56 | -------------------------------------------------------------------------------- /front/go/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/array2d/deepx/front/go 2 | 3 | go 1.23.2 4 | -------------------------------------------------------------------------------- /front/py/.cursorrules: -------------------------------------------------------------------------------- 1 | 回答要求: 2 | Always respond in 中文 3 | 不要回答重复的内容(如我提问中的代码) 4 | 由于作者是py新手,请多注释python语法和库的用法 5 | 不要增加"假设Tensor类在这个路径下"这种一眼就被看出是AI写的注释,不要让我的代码看起来像AI生成 6 | 7 | 
项目介绍: 8 | 此项目名为deepx的py部分 9 | 是deepx的python部分,项目路径为/home/lipeng/code/ai/deepx/front/py 10 | deepx的对外暴露的设计,尽可能接近pytorch的API 11 | deepx_py是前端,负责搭建深度学习模型,但并不参与实际数据存储和计算 12 | deepx_py虽然有类似pytorch的API,但只是调用excuter内的对应实现 13 | deepx_py依赖网络api调用后端excuter的计算 14 | deepx_py的每个OP,都会在excuter内找到对应的实现 -------------------------------------------------------------------------------- /front/py/deepx/.cursorrules: -------------------------------------------------------------------------------- 1 | deepx-py计划像素级复刻pytorch,照顾开发者的使用习惯 2 | -------------------------------------------------------------------------------- /front/py/deepx/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .vscode 3 | -------------------------------------------------------------------------------- /front/py/deepx/README.md: -------------------------------------------------------------------------------- 1 | # deepx-py 2 | 3 | deepx-py是为了构建抽象计算图的py接口库 4 | 5 | 供算法人员搭建模型使用 6 | 7 | ## 存算一体,控制分离 8 | 9 | deepx-py是控制侧,负责生成计算图,通过通信调用其他进程 10 | 11 | 在excuter中,实现了具体的存算引擎 12 | 13 | 14 | 15 | ## 接口设计 16 | deepx-py计划像素级复刻pytorch,照顾开发者的使用习惯 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /front/py/deepx/__init__.py: -------------------------------------------------------------------------------- 1 | from .tensor import Tensor,Shape,Number 2 | from deepx.nn.functional import * # 导入所有functional函数 3 | from deepx.nn.functional import __all__ as _func_all # 获取functional的导出列表 4 | 5 | __all__ = [ 6 | #tensor 7 | 'Tensor','Shape','Number', 8 | *_func_all 9 | ] 10 | 11 | # 为了支持 import deepx as dx 的用法 12 | tensor = Tensor -------------------------------------------------------------------------------- /front/py/deepx/nn/__init__.py: -------------------------------------------------------------------------------- 1 | from .deepxir import * 2 | 3 | __all__ = [ 4 | "DeepxIR","DeepxIRResp" 5 | ] 

from .leaffunc_life import *
from .leaffunc_io import *
from .leaffunc_init import *
from .leaffunc_changeshape import *
from .leaffunc_elementwise import *
from .leaffunc_matmul import matmul
from .leaffunc_reduce import reducemax, reducemin, sum, prod

from .authormap import defaultauthor

from .reduce import mean
from .activite import *
from .elementwise import *
from .normalization import *
from .changeshape import *

# Public API of deepx.nn.functional, re-exported by the package root.
# Fix: "printtensor" was previously listed twice (under both the life and io
# groups); __all__ entries should be unique, so it is kept only under io.
__all__ = [

    # leaffunc
    "newtensor", "rnewtensor", "load",  # life
    "printtensor", "save",  # io
    "constant", "constant_", "dropout", "full", "zeros", "ones", "uniform", "uniform_", "arange", "arange_",
    "kaiming_uniform", "kaiming_uniform_",
    "add", "sub", "mul", "div",
    "sqrt", "pow", "exp", "log",
    "min", "max",
    "less", "greater", "equal", "notequal",
    "switch", "where",
    "todtype",
    "invert",
    "matmul",
    "reducemax", "reducemin", "sum", "prod",
    "reshape", "permute", "transpose", "concat", "broadcastTo", "broadcast_to", "indexselect", "repeat",

    # functional
    "relu", "sigmoid", "swish", "silu",
    "mean",
    "rsqrt",
    "softmax",
    "squeeze", "unsqueeze",

    # other
    "calculate_fan_in_and_fan_out",
]
from deepx import Tensor
from .leaffunc_changeshape import reshape


def squeeze(t: Tensor, dim: int) -> Tensor:
    """Remove dimension ``dim`` from ``t``'s shape if it has size 1.

    Follows torch.squeeze semantics: when ``t.shape[dim] != 1`` the tensor is
    returned unchanged. (The original popped the dim unconditionally, which
    silently produced a wrong shape for non-size-1 dims.)
    Negative ``dim`` is normalized with ``dim % t.ndim``.
    """
    assert isinstance(dim, int)
    assert isinstance(t, Tensor)
    dim = dim % t.ndim
    if t.shape[dim] != 1:
        # torch.squeeze is a no-op for dims whose size is not 1
        return t
    newshape = list(t.shape)
    newshape.pop(dim)
    return reshape(t, tuple(newshape))


def unsqueeze(t: Tensor, dim: int) -> Tensor:
    """Insert a size-1 dimension at position ``dim``.

    Fix: normalize with ``t.ndim + 1`` so ``dim == t.ndim`` (and ``dim == -1``)
    appends a trailing dimension, matching torch.unsqueeze's accepted range of
    [-ndim-1, ndim]; the original's ``dim % t.ndim`` wrapped ``ndim`` to 0.
    """
    assert isinstance(dim, int)
    assert isinstance(t, Tensor)
    dim = dim % (t.ndim + 1)
    newshape = list(t.shape)
    newshape.insert(dim, 1)
    return reshape(t, tuple(newshape))
newtensor(shape:tuple[int,...],dtype:str='float32',name:str=None): 5 | assert isinstance(shape,tuple) 6 | for i in shape: 7 | assert isinstance(i,int) 8 | assert isinstance(dtype,str) 9 | assert isinstance(name,str) or name is None 10 | 11 | t=Tensor(shape=shape,dtype=dtype,name=name) 12 | from .rtf_life import rtf_newtensor 13 | rtf_newtensor(t) 14 | return t 15 | 16 | def rnewtensor(t:Tensor): 17 | from .rtf_life import rtf_newtensor 18 | rtf_newtensor(t) 19 | return t 20 | 21 | def copytensor(t:Tensor,out:Tensor): 22 | from .rtf_life import rtf_copytensor 23 | rtf_copytensor(t,out) 24 | 25 | 26 | def deltensor(t:Tensor): 27 | from .rtf_life import rtf_deltensor 28 | rtf_deltensor(t) 29 | def renametensor(t:Tensor,new_name:str): 30 | assert isinstance(t,Tensor) 31 | assert isinstance(new_name,str) and new_name != '' 32 | assert t.name is not None and t.name != '' 33 | 34 | from .rtf_life import rtf_renametensor 35 | rtf_renametensor(t,new_name) 36 | 37 | def load(path:str)->Tensor: 38 | from .rtf_io import rtf_load 39 | return rtf_load(path) 40 | -------------------------------------------------------------------------------- /front/py/deepx/nn/functional/leaffunc_matmul.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | from deepx import Tensor,Shape 4 | from .leaffunc_life import newtensor 5 | from .authormap import defaultauthor 6 | 7 | def matmul(a:Tensor,b:Tensor,out:Union[Tensor,str]='',bench:int=None)->Tensor: 8 | outtensor=out 9 | if isinstance(out,str) or out is None: 10 | outshape=Shape.matmul(a.shape,b.shape) 11 | outtensor=newtensor(outshape,dtype=a.dtype,name=out) 12 | from .rtf_matmul import rtf_matmul 13 | rtf_matmul(a,b,outtensor,defaultauthor['matmul'],bench) 14 | return outtensor 15 | -------------------------------------------------------------------------------- /front/py/deepx/nn/functional/leaffunc_reduce.py: 
from deepx.tensor import Tensor, Shape
from .leaffunc_reduce import sum
from .leaffunc_life import newtensor


def mean(a: Tensor, dim: tuple[int, ...] = None, keepdim: bool = False) -> Tensor:
    """Mean of ``a`` over dimensions ``dim`` (all dimensions when None).

    Implemented as sum over ``dim`` followed by in-place division by the
    number of reduced elements. ``keepdim`` keeps reduced dims as size 1.
    """
    assert isinstance(a, Tensor)
    if dim is None:
        dim = list(range(a.ndim))
    else:
        # Fix: normalize negative axes by VALUE. The original looped
        # `for i in dim: dim[i] = i + a.dim()`, using the axis value as a
        # list index, which scrambles entries for inputs like dim=(0, -2).
        dim = [d % a.ndim for d in dim]
    # Number of elements being averaged over.
    total = 1
    for d in dim:
        total *= a.shape[d]
    reduceshape = Shape.reduceshape(a.shape, dim, keepdim)
    out = newtensor(reduceshape, dtype=a.dtype)
    sum(a, tuple(dim), keepdim, out)
    out.div_(total)
    return out
-------------------------------------------------------------------------------- /front/py/deepx/nn/functional/rtf.py: -------------------------------------------------------------------------------- 1 | from deepx.tensor import Tensor 2 | from deepx.nn.deepxir import DeepxIR,Param 3 | from deepx.scheduler import send 4 | from typing import Union 5 | def A_B_op_C(op:str,a:Tensor,b:Tensor,out:Tensor,author='miaobyte'): 6 | args=[Param.tensor(a),Param.tensor(b)] 7 | returns=[Param.tensor(out)] 8 | ir=DeepxIR(op, args, returns,author) 9 | send(ir) 10 | 11 | def A_B_c_op_D(op:str,a:Tensor,b:Tensor,c:Union[float,int],out:Tensor,author='miaobyte'): 12 | args=[Param.tensor(a),Param.tensor(b),Param.varnum(c)] 13 | returns=[Param.tensor(out)] 14 | ir=DeepxIR(op, args, returns,author) 15 | send(ir) 16 | def A_scalar_c_op_D(op:str,a:Tensor,scalar:Union[float,int],c:Union[float,int],out:Tensor,author='miaobyte'): 17 | args=[Param.tensor(a),Param.varnum(scalar),Param.varnum(c)] 18 | returns=[Param.tensor(out)] 19 | ir=DeepxIR(op, args, returns,author) 20 | send(ir) 21 | 22 | def A_scalar_op(op:str,a:Tensor,b:Union[float,int],author='miaobyte'): 23 | args=[Param.tensor(a),Param.varnum(b)] 24 | returns=[] 25 | ir=DeepxIR(op, args, returns,author) 26 | send(ir) 27 | 28 | def A_scalar_op_C(op:str,a:Tensor,b:Union[float,int],out:Tensor,author='miaobyte'): 29 | args=[Param.tensor(a),Param.varnum(b)] 30 | returns=[Param.tensor(out)] 31 | ir=DeepxIR(op, args, returns,author) 32 | send(ir) 33 | 34 | def A_op_C(op:str,a:Tensor,out:Tensor,author='miaobyte'): 35 | args=[Param.tensor(a)] 36 | returns=[Param.tensor(out)] 37 | ir=DeepxIR(op, args, returns,author) 38 | send(ir) 39 | 40 | def A_b1_b2_op_C(op:str,a:Tensor,b1:tuple[int],b2:bool,out:Tensor,author='miaobyte'): 41 | args=[Param.tensor(a),Param.vector(b1,'int32'),Param.varbool(b2)] 42 | returns=[Param.tensor(out)] 43 | ir=DeepxIR(op, args, returns,author) 44 | send(ir) 45 | 46 | 
-------------------------------------------------------------------------------- /front/py/deepx/nn/functional/rtf_changeshape.py: -------------------------------------------------------------------------------- 1 | from deepx.tensor import Tensor 2 | from deepx.nn.deepxir import DeepxIR,Param 3 | from deepx.scheduler import send 4 | 5 | def rtf_reshape(t:Tensor,shape:tuple[int],out:Tensor,author='miaobyte'): 6 | args=[Param.tensor(t),Param.vector(shape,'int32')] 7 | returns=[Param.tensor(out)] 8 | ir=DeepxIR("reshape", args, returns,author) 9 | send(ir) 10 | 11 | 12 | def rtf_transpose(t:Tensor,dimorder:tuple[int],out:Tensor,author='miaobyte'): 13 | args=[Param.tensor(t),Param.vector(dimorder,'int32')] 14 | returns=[Param.tensor(out)] 15 | ir=DeepxIR("transpose", args, returns,author) 16 | send(ir) 17 | 18 | def rtf_concat(tensors:tuple[Tensor],dim:int,out:Tensor,author='miaobyte'): 19 | args=[Param.listtensor(tensors),Param.varnum(dim)] 20 | returns=[Param.tensor(out)] 21 | ir=DeepxIR("concat", args, returns,author) 22 | send(ir) 23 | 24 | 25 | def rtf_broadcastTo(t:Tensor,new_shape:tuple[int],out:Tensor,author='miaobyte'): 26 | args=[Param.tensor(t),Param.vector(new_shape,'int32')] 27 | returns=[Param.tensor(out)] 28 | ir=DeepxIR("broadcastTo", args, returns,author) 29 | send(ir) 30 | 31 | def rtf_indexselect(input:Tensor,indices:Tensor,axis:int,out:Tensor,author='miaobyte'): 32 | assert axis>=0 and axis0 42 | args=[Param.tensor(input),Param.vector(repeats,'int32')] 43 | returns=[Param.tensor(out)] 44 | ir=DeepxIR("repeat", args, returns,author) 45 | send(ir) -------------------------------------------------------------------------------- /front/py/deepx/nn/functional/rtf_init.py: -------------------------------------------------------------------------------- 1 | from deepx.tensor import Tensor 2 | from deepx.nn.deepxir import DeepxIR,Param 3 | from deepx.scheduler import send 4 | from typing import Union,Optional 5 | from .rtf import A_scalar_op 6 | 7 | # 初始化 
8 | def rtf_arange(t:Tensor,start:Optional[Union[float,int]]=0,step:Optional[Union[float,int]]=1,author='miaobyte')->Tensor: 9 | args=[Param.varnum(start),Param.varnum(step)] 10 | returns=[Param.tensor(t)] 11 | ir=DeepxIR("arange", args, returns,author) 12 | send(ir) 13 | return t 14 | 15 | def rtf_uniform(t:Tensor,low=0, high=1,seed:int=0,author='miaobyte')->Tensor: 16 | args=[Param.varnum(low),Param.varnum(high),Param.varnum(seed)] 17 | returns=[Param.tensor(t)] 18 | ir=DeepxIR("uniform", args, returns,author) 19 | send(ir) 20 | return t 21 | 22 | def rtf_normal(t:Tensor,mean:float=0, stddev:float=1,seed:int=0,author='miaobyte')->Tensor: 23 | args=[Param.varnum(mean),Param.varnum(stddev),Param.varnum(seed)] 24 | returns=[Param.tensor(t)] 25 | ir=DeepxIR("normal", args, returns,author) 26 | send(ir) 27 | return t 28 | 29 | # 填充 30 | def rtf_constant(t:Tensor,value:Union[float,int]=0,author='miaobyte')->Tensor: 31 | args=[Param.varnum(value)] 32 | returns=[Param.tensor(t)] 33 | ir=DeepxIR("constant", args, returns,author) 34 | send(ir) 35 | return t 36 | 37 | def rtf_dropout(a:Tensor, p:float, seed:int, author='miaobyte')->Tensor: 38 | assert isinstance(p,float) and 0<=p<=1 39 | assert isinstance(seed,int) 40 | 41 | args=[Param.varnum(p),Param.varnum(seed)] 42 | returns=[Param.tensor(a)] 43 | ir=DeepxIR("dropout",args,returns,author) 44 | send(ir) 45 | return a -------------------------------------------------------------------------------- /front/py/deepx/nn/functional/rtf_io.py: -------------------------------------------------------------------------------- 1 | from deepx.tensor import Tensor,loadShape 2 | from deepx.nn import DeepxIR,Param 3 | from deepx.scheduler import send 4 | 5 | def rtf_printtensor(t:Tensor,format='',author='miaobyte'): 6 | args=[Param.tensor(t),Param.varstr(format)] 7 | returns=[] 8 | ir=DeepxIR("print", args, returns,author) 9 | send(ir) 10 | return '' 11 | 12 | def rtf_save(t:Tensor,path:str): 13 | 
args=[Param.tensor(t),Param.varstr(path)] 14 | returns=[] 15 | ir=DeepxIR("save", args, returns) 16 | send(ir) 17 | return t 18 | 19 | def rtf_load(path:str)->Tensor: 20 | args=[Param.varstr(path)] 21 | returns=[] 22 | ir=DeepxIR("load", args, returns) 23 | send(ir) 24 | shapefile=path+'.shape' 25 | tensor_name,shape,dtype=loadShape(shapefile) 26 | return Tensor(shape.shape,dtype,tensor_name) 27 | 28 | def rtf_loadtensordata(t:Tensor,path:str)->Tensor: 29 | args=[Param.varstr(path)] 30 | returns=[Param.tensor(t)] 31 | ir=DeepxIR("loadtensordata", args, returns) 32 | send(ir) 33 | return t -------------------------------------------------------------------------------- /front/py/deepx/nn/functional/rtf_life.py: -------------------------------------------------------------------------------- 1 | from deepx.tensor import Tensor 2 | from deepx.nn.deepxir import DeepxIR,Param 3 | from deepx.scheduler import send 4 | 5 | def rtf_newtensor(t:Tensor): 6 | assert isinstance(t,Tensor) 7 | args=[Param.vector(t.shape,'int32')] 8 | returns=[Param.tensor(t)] 9 | ir=DeepxIR("newtensor", args, returns,'') 10 | send(ir) 11 | 12 | 13 | def rtf_copytensor(t:Tensor,out:Tensor): 14 | assert isinstance(t,Tensor) 15 | assert isinstance(out,Tensor) 16 | assert t.shape==out.shape 17 | assert t.dtype==out.dtype 18 | 19 | args=[Param.tensor(t)] 20 | returns=[Param.tensor(out)] 21 | ir=DeepxIR("copytensor", args, returns,'') 22 | send(ir) 23 | 24 | 25 | 26 | def rtf_deltensor(t:Tensor): 27 | assert isinstance(t,Tensor) 28 | args=[] 29 | returns=[Param.tensor(t)] 30 | ir=DeepxIR("deltensor", args, returns,'') 31 | send(ir) 32 | 33 | def rtf_renametensor(t:Tensor,new_name:str): 34 | args=[Param.varstr(new_name)] 35 | returns=[Param.tensor(t)] 36 | ir=DeepxIR("renametensor", args, returns,'') 37 | send(ir) 38 | -------------------------------------------------------------------------------- /front/py/deepx/nn/functional/rtf_matmul.py: 
-------------------------------------------------------------------------------- 1 | from deepx.tensor import Tensor 2 | from deepx.nn import DeepxIR,Param 3 | from deepx.scheduler import send 4 | 5 | def rtf_matmul(a:Tensor,b:Tensor,out: Tensor ,author='cublas',bench:int=None): 6 | args=[Param.tensor(a),Param.tensor(b)] 7 | returns=[Param.tensor(out)] 8 | ir=DeepxIR("matmul", args, returns, author) 9 | if bench is not None: 10 | ir._metadata.openbench(bench) 11 | send(ir) 12 | return out -------------------------------------------------------------------------------- /front/py/deepx/nn/functional/rtf_reduce.py: -------------------------------------------------------------------------------- 1 | from deepx.tensor import Tensor 2 | from .rtf import A_b1_b2_op_C 3 | 4 | def rtf_sum(a:Tensor,dim:tuple[int],keepdim:bool,out: Tensor, author:str='miaobyte')->Tensor: 5 | A_b1_b2_op_C("sum",a,dim,keepdim,out,author) 6 | 7 | 8 | def rtf_prod(a:Tensor,dim:tuple[int],keepdim:bool,out:Tensor, author:str='miaobyte')->Tensor: 9 | A_b1_b2_op_C("prod",a,dim,keepdim,out,author) 10 | 11 | 12 | def rtf_reducemax(a:Tensor,dim:tuple[int],keepdim:bool,out:Tensor, author:str='miaobyte')->Tensor: 13 | A_b1_b2_op_C("reducemax",a,dim,keepdim,out,author) 14 | 15 | 16 | def rtf_reducemin(a:Tensor,dim:tuple[int],keepdim:bool,out:Tensor, author:str='miaobyte')->Tensor: 17 | A_b1_b2_op_C("reducemin",a,dim,keepdim,out,author) 18 | -------------------------------------------------------------------------------- /front/py/deepx/nn/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .module import Module, Sequential 2 | from .linear import Linear 3 | from .sparse import Embedding 4 | __all__ = [ 5 | "Module", 6 | "Linear", 7 | "Sequential", 8 | "Embedding", 9 | ] 10 | -------------------------------------------------------------------------------- /front/py/deepx/nn/modules/activation.py: 
class Swiglu(Module):
    """SwiGLU activation module: swish(x @ W, beta) * (x @ V)."""

    def __init__(self):
        super().__init__()
        # Learnable projection matrices (placeholder (1,1) ones-init; sized by caller).
        self.W = ones(shape=(1,1),name=self.full_name+"_W")
        self.V = ones(shape=(1,1),name=self.full_name+"_V")

    @staticmethod
    def swiglu(
            x: Tensor,
            W: Tensor,  # first projection matrix
            V: Tensor,  # second projection matrix
            beta: float = 1.0,  # scaling factor inside the swish gate
            out: Union[Tensor,str] = '') -> Tensor:
        """Compute swish(x @ W, beta) elementwise-multiplied by (x @ V)."""
        from deepx.nn.functional import swish
        result = swish(x@W, beta=beta).mul(x@V, out=out)
        return result

    def forward(self, input: Tensor) -> Tensor:
        # @staticmethod above is the bug fix: as a plain instance method,
        # `self.swiglu(input, self.W, self.V)` bound `self` to the `x`
        # parameter, shifting every argument by one (input became W, etc.).
        return self.swiglu(input, self.W, self.V)
-------------------------------------------------------------------------------- /front/py/deepx/nn/modules/normalization.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/front/py/deepx/nn/modules/normalization.py -------------------------------------------------------------------------------- /front/py/deepx/nn/modules/padding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/front/py/deepx/nn/modules/padding.py -------------------------------------------------------------------------------- /front/py/deepx/nn/modules/pooling.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/front/py/deepx/nn/modules/pooling.py -------------------------------------------------------------------------------- /front/py/deepx/nn/modules/rmsnorm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/front/py/deepx/nn/modules/rmsnorm.py -------------------------------------------------------------------------------- /front/py/deepx/nn/modules/rnn.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/front/py/deepx/nn/modules/rnn.py -------------------------------------------------------------------------------- /front/py/deepx/nn/parameter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/front/py/deepx/nn/parameter.py 
-------------------------------------------------------------------------------- /front/py/deepx/optim/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/front/py/deepx/optim/__init__.py -------------------------------------------------------------------------------- /front/py/deepx/optim/adam.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/front/py/deepx/optim/adam.py -------------------------------------------------------------------------------- /front/py/deepx/optim/optimizer.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | from deepx.tensor import Tensor 3 | class Optimizer: 4 | def __init__(self, 5 | params:list[Tensor], 6 | defaults: dict[str, Any]) -> None: 7 | self.params = params 8 | self.defaults = defaults 9 | 10 | def step(self): 11 | pass -------------------------------------------------------------------------------- /front/py/deepx/optim/sgd.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | from .optimizer import Optimizer 3 | from deepx.tensor import Tensor 4 | 5 | class SGD(Optimizer): 6 | def __init__(self, 7 | params:list[Tensor], 8 | defaults: dict[str, Any]) -> None: 9 | super().__init__(params, defaults) 10 | 11 | def step(self): 12 | for param in self.params: 13 | param.data -= self.defaults['lr'] * param.grad 14 | -------------------------------------------------------------------------------- /front/py/deepx/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/front/py/deepx/requirements.txt 
-------------------------------------------------------------------------------- /front/py/deepx/scheduler/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .client.allclient import send 3 | 4 | __all__ = [ 5 | "send", 6 | ] 7 | 8 | -------------------------------------------------------------------------------- /front/py/deepx/scheduler/client/allclient.py: -------------------------------------------------------------------------------- 1 | from .udpconn import _default_udpconn 2 | from typing import Optional 3 | from deepx.nn import DeepxIR,DeepxIRResp 4 | import time 5 | default_client = _default_udpconn 6 | 7 | 8 | _id_counter=0 9 | def send(ir:DeepxIR) -> DeepxIRResp: 10 | ir._sent_at=time.time() 11 | global _id_counter 12 | _id_counter=_id_counter+1 13 | ir._id=_id_counter 14 | s=str(ir) 15 | respstr=default_client.send(s) 16 | respir=DeepxIRResp(respstr) 17 | return respir 18 | -------------------------------------------------------------------------------- /front/py/deepx/scheduler/client/udpconn.py: -------------------------------------------------------------------------------- 1 | import socket 2 | from typing import Optional, Tuple 3 | import select 4 | 5 | class UDPConn: 6 | def __init__(self, endpoint: str = "localhost:9090"): 7 | # 解析endpoint 8 | self._host, port_str = endpoint.split(':') 9 | self._port = int(port_str) 10 | # 创建UDP socket 11 | self._sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) 12 | # 设置非阻塞模式 13 | self._sock.setblocking(False) 14 | # 设置接收缓冲区 15 | self._sock.setsockopt(socket.SOL_SOCKET, socket.SO_RCVBUF, 1024*1024) 16 | 17 | def send(self, ir: str) -> Optional[dict]: 18 | 19 | # 发送IR字符串 20 | try: 21 | # 将IR字符串编码为bytes并发送 22 | data = ir.encode('utf-8') 23 | self._sock.sendto(data, (self._host, self._port)) 24 | # 等待响应 25 | return self._wait_response() 26 | 27 | except Exception as e: 28 | print(f"发送IR失败: {e}") 29 | return None 30 | 31 | def _wait_response(self, 
timeout: float =10000) -> any: 32 | """等待并接收响应 33 | 34 | Args: 35 | timeout: 超时时间(秒) 36 | """ 37 | try: 38 | # 使用select实现超时等待 39 | ready = select.select([self._sock], [], [], timeout) 40 | if ready[0]: 41 | data, addr = self._sock.recvfrom(65536) # 64KB缓冲区 42 | response = data.decode('utf-8') 43 | return response 44 | return None 45 | 46 | except Exception as e: 47 | print(f"接收响应失败: {e}") 48 | return None 49 | 50 | def __del__(self): 51 | """确保socket正确关闭""" 52 | if hasattr(self, '_sock'): 53 | self._sock.close() 54 | 55 | # 全局单例实例 56 | _default_udpconn = UDPConn() 57 | -------------------------------------------------------------------------------- /front/py/deepx/scheduler/client/unixsocket.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/front/py/deepx/scheduler/client/unixsocket.py -------------------------------------------------------------------------------- /front/py/deepx/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='deepx', 5 | version='0.1.0', 6 | description='DeepX - 高性能深度学习框架的Python接口', 7 | author='igor.li', 8 | author_email='lipeng@mirrorsoft.cn', 9 | packages=find_packages(), 10 | install_requires=[ 11 | 'graphviz>=0.20.1', # 用于计算图可视化 12 | ], 13 | long_description=open("README.md").read(), 14 | long_description_content_type="text/markdown", 15 | url="https://github.com/array2d/deepx", 16 | classifiers=[ 17 | "Programming Language :: Python :: 3", 18 | "License :: OSI Approved :: MIT License", 19 | "Operating System :: OS Independent", 20 | ], 21 | python_requires='>=3.7', # 确保支持数据类型注解 22 | ) -------------------------------------------------------------------------------- /front/py/deepx/tensor/__init__.py: -------------------------------------------------------------------------------- 1 | from .tensor import * 2 
| from .shape import Shape 3 | from .elementwise import * # 导入所有包含@tensor_method装饰的方法 4 | from .matmul import * # 导入矩阵乘法相关方法 5 | from .changeshape import * # 导入转置方法 6 | from .init import * 7 | from .reduce import * 8 | from .io import * 9 | __all__ = [ 10 | 'Shape', 11 | 'Tensor', 12 | 'tensor_method', 13 | 'Number', 14 | 'loadShape', 15 | # 'lt', 'gt', 'eq', 16 | # 'sin', 'cos', 'tan', 17 | # 'DType', 18 | # '_dtype_to_typestr' 19 | ] -------------------------------------------------------------------------------- /front/py/deepx/tensor/init.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | from deepx.tensor import tensor_method 3 | 4 | # 填充 5 | @tensor_method 6 | def full_(self,value:Union[float,int]): 7 | from deepx.nn.functional import constant_ as constant_func 8 | constant_func(self,value=value) 9 | 10 | @tensor_method 11 | def dropout_(self,p:float=0.5,seed:int=None): 12 | from deepx.nn.functional import dropout as dropout_func 13 | dropout_func(self,p,seed) 14 | return self 15 | 16 | 17 | @tensor_method 18 | def zeros_(self): 19 | from deepx.nn.functional import constant_ as constant_func 20 | constant_func(self,value=0) 21 | 22 | @tensor_method 23 | def ones_(self): 24 | from deepx.nn.functional import constant_ as constant_func 25 | constant_func(self,value=1) 26 | 27 | @tensor_method 28 | def uniform_(self,low=0, high=1,seed:int=None): 29 | from deepx.nn.functional import uniform_ as uniform_func 30 | uniform_func(self,low=low, high=high,seed=seed) 31 | 32 | @tensor_method 33 | def arange_(self,start=0,step=1): 34 | from deepx.nn.functional import arange_ as arange_func 35 | arange_func(self,start,step) 36 | 37 | @tensor_method 38 | def normal_(self,mean=0, stddev=1,seed:int=None): 39 | from deepx.nn.functional import normal_ as normal_func 40 | normal_func(self,mean,stddev,seed) 41 | 42 | @tensor_method 43 | def rand_(self): 44 | #todo 45 | pass 46 | 47 | @tensor_method 48 | def 
randn_(self): 49 | #todo 50 | pass 51 | @tensor_method 52 | def eye_(self,n,m=None): 53 | #todo 54 | pass 55 | -------------------------------------------------------------------------------- /front/py/deepx/tensor/io.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import os 3 | from deepx.tensor import Shape,Tensor,tensor_method 4 | 5 | def loadShape(path:str)->tuple[str,Shape,str]: 6 | filename = os.path.basename(path) 7 | if filename.endswith('.shape'): 8 | with open(path, 'r') as f: 9 | shape = yaml.safe_load(f) 10 | else: 11 | raise ValueError("文件名必须以.shape结尾") 12 | 13 | tensor_name = filename[:-6] # 移除'.shape'后缀 14 | return (tensor_name,Shape(tuple(shape['shape'])),shape['dtype']) 15 | @tensor_method 16 | def loadData(self,path:str): 17 | from deepx.nn.functional import loadData as loadData_func 18 | loadData_func(self,path) 19 | 20 | @tensor_method 21 | def save(self,path:str): 22 | from deepx.nn.functional import save as save_func 23 | save_func(self,path) 24 | 25 | 26 | -------------------------------------------------------------------------------- /front/py/deepx/tensor/matmul.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | from .tensor import Tensor,tensor_method 4 | 5 | @tensor_method 6 | def matmul(self:Tensor,other:Tensor,out:Union[Tensor,str]=''): 7 | from deepx.nn.functional import matmul as matmul_func 8 | return matmul_func(self,other,out) -------------------------------------------------------------------------------- /front/py/deepx/tensor/reduce.py: -------------------------------------------------------------------------------- 1 | 2 | from typing import Union 3 | 4 | from deepx.tensor import Tensor,tensor_method 5 | 6 | @tensor_method 7 | def reducemax(self, dim:tuple[int,...],keepdim:bool=False,out:Union[Tensor,str]='')->Tensor: 8 | assert isinstance(dim,tuple) 9 | for i in dim: 10 | assert isinstance(i,int) 11 
| from deepx.nn.functional import reducemax as reduce_max_func 12 | return reduce_max_func(self,dim,keepdim,out) 13 | 14 | @tensor_method 15 | def reducemin(self, dim:tuple[int,...],keepdim:bool=False,out:Union[Tensor,str]='')->Tensor: 16 | assert isinstance(dim,tuple) 17 | for i in dim: 18 | assert isinstance(i,int) 19 | from deepx.nn.functional import reducemin as reduce_min_func 20 | return reduce_min_func(self,dim,keepdim,out) 21 | 22 | 23 | @tensor_method 24 | def sum(self, dim:tuple[int,...],keepdim:bool=False,out:Union[Tensor,str]='')->Tensor: 25 | assert isinstance(dim,tuple) 26 | for i in dim: 27 | assert isinstance(i,int) 28 | from deepx.nn.functional import sum as sum_func 29 | return sum_func(self,dim,keepdim,out) 30 | 31 | @tensor_method 32 | def prod(self, dim:tuple[int,...],keepdim:bool=False,out:Union[Tensor,str]='')->Tensor: 33 | assert isinstance(dim,tuple) 34 | for i in dim: 35 | assert isinstance(i,int) 36 | from deepx.nn.functional import prod as prod_func 37 | return prod_func(self,dim,keepdim,out) 38 | 39 | @tensor_method 40 | def mean(self,dim:tuple[int,...],keepdim:bool=False)->Tensor: 41 | assert isinstance(dim,tuple) 42 | for i in dim: 43 | assert isinstance(i,int) 44 | from deepx.nn.functional import mean as mean_func 45 | return mean_func(self,dim,keepdim) 46 | -------------------------------------------------------------------------------- /front/py/deepx/transformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/front/py/deepx/transformer/__init__.py -------------------------------------------------------------------------------- /front/py/deepx/transformer/attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/front/py/deepx/transformer/attention.py 
-------------------------------------------------------------------------------- /front/py/deepx/transformer/decoder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/front/py/deepx/transformer/decoder.py -------------------------------------------------------------------------------- /front/py/deepx/transformer/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/front/py/deepx/transformer/models/__init__.py -------------------------------------------------------------------------------- /front/py/deepx/transformer/models/llama/__init__.py: -------------------------------------------------------------------------------- 1 | from .embedding import * 2 | __all__ = [ 3 | "LlamaRotaryEmbedding" 4 | ] -------------------------------------------------------------------------------- /front/py/deepx/transformer/models/llama/groupedquery_attention.py: -------------------------------------------------------------------------------- 1 | from typing import Optional,Tuple 2 | from deepx.nn.modules import Module,Linear 3 | from deepx import Tensor,matmul,softmax,concat,arange,dropout as dropout_func 4 | 5 | 6 | def repeat_kv(hidden_states: Tensor, n_rep: int) -> Tensor: 7 | batch, num_key_value_heads, slen, head_dim = hidden_states.shape 8 | if n_rep == 1: 9 | return hidden_states 10 | hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) 11 | return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) 12 | 13 | -------------------------------------------------------------------------------- /front/py/deepx/transformer/models/llama/mlp.py: -------------------------------------------------------------------------------- 1 | from deepx.nn.functional 
class LlamaMLP(Module):
    """Llama feed-forward block: down_proj( act(gate_proj(x)) * up_proj(x) )."""

    def __init__(self, config:dict):
        super().__init__()
        # NOTE(review): the original mixed attribute access (config.hidden_size,
        # config.mlp_bias, config.hidden_act) with mapping access
        # (config["intermediate_size"]). The parameter is annotated `dict`, and a
        # plain dict has no such attributes, so attribute access raised
        # AttributeError on the first line. Mapping access is used consistently.
        # 输入层大小 (input/hidden width)
        self.hidden_size = config["hidden_size"]
        # 中间层大小 (intermediate width)
        self.intermediate_size = config["intermediate_size"]
        # 门控投影层 (gate projection)
        self.gate_proj = Linear(self.hidden_size, self.intermediate_size, bias=config["mlp_bias"])
        # 上投影层 (up projection)
        self.up_proj = Linear(self.hidden_size, self.intermediate_size, bias=config["mlp_bias"])
        # 下投影层 (down projection)
        self.down_proj = Linear(self.intermediate_size, self.hidden_size, bias=config["mlp_bias"])
        # 激活函数 (activation, looked up by name — e.g. "silu" -> swish)
        self.act_fn = ACT2FN[config["hidden_act"]]

    def forward(self, x):
        # SwiGLU-style gating: elementwise product of the activated gate path
        # and the up projection, then projected back down.
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        return down_proj
class Bench:
    """Decorator that prints how long each call to the wrapped function takes.

    Usage:
        @Bench("matmul")
        def f(...): ...
    """

    def __init__(self, name: str):
        # Label included in every timing report.
        self.name = name

    def __call__(self, func):
        def wrapper(*args, **kwargs):
            # Bug fixes vs original: `start_time` was never assigned (NameError
            # on the first call), `result` was never returned by `wrapper`, and
            # `wrapper` was never returned by `__call__`, so any decorated
            # function silently became None.
            start_time = time.perf_counter()
            result = func(*args, **kwargs)
            print(f"{self.name} took {time.perf_counter() - start_time} seconds to run")
            return result
        return wrapper
| from .trigonometric import sin, cos, tan 9 | 10 | __all__ = [ 11 | 'Tensor', 12 | 'zeros', 'ones', 'arange', 13 | 'add', 'sub', 'mul', 'div', 14 | 'matmul', 'dot', 15 | 'sum', 'mean', 'max', 'min', 16 | 'reshape', 'transpose', 17 | 'lt', 'gt', 'eq', 18 | 'sin', 'cos', 'tan' 19 | ] -------------------------------------------------------------------------------- /front/py/deepx/utils/data/dataloader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/front/py/deepx/utils/data/dataloader.py -------------------------------------------------------------------------------- /front/py/deepx/utils/data/dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/front/py/deepx/utils/data/dataset.py -------------------------------------------------------------------------------- /front/py/deepx/utils/data/sampler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/front/py/deepx/utils/data/sampler.py -------------------------------------------------------------------------------- /front/py/deepxutil/numpy/__init__.py: -------------------------------------------------------------------------------- 1 | from .io import * 2 | __all__ = [ 3 | 'save_numpy', 4 | ] 5 | -------------------------------------------------------------------------------- /front/py/deepxutil/numpy/io.py: -------------------------------------------------------------------------------- 1 | from deepx.tensor import Shape 2 | 3 | def save_numpy(t,tensorpath:str): 4 | r''' 5 | 保存numpy.ndarray为deepx.tensor格式 6 | t:numpy.ndarray 7 | tensorpath:str, 8 | ''' 9 | from numpy import ascontiguousarray,ndarray 10 | assert isinstance(t,ndarray) 11 | 
shape=Shape(t.shape) 12 | shape._dtype=str(t.dtype) 13 | shape.save(tensorpath+".shape") 14 | 15 | array = ascontiguousarray(t) 16 | array.tofile(tensorpath+'.data') 17 | return t 18 | -------------------------------------------------------------------------------- /front/py/deepxutil/torch/__init__.py: -------------------------------------------------------------------------------- 1 | from .io import * 2 | __all__ = [ 3 | 'save_torch', 4 | ] 5 | -------------------------------------------------------------------------------- /front/py/deepxutil/torch/io.py: -------------------------------------------------------------------------------- 1 | def save_torch(t,path:str): 2 | r''' 3 | 保存torch.Tensor为deepx.tensor格式 4 | ''' 5 | from torch import Tensor as torch_Tensor 6 | assert isinstance(t,torch_Tensor) 7 | t=t.detach().cpu().numpy() 8 | from deepxutil.numpy.io import save_numpy 9 | save_numpy(t,path) 10 | -------------------------------------------------------------------------------- /front/py/docs/api.rst: -------------------------------------------------------------------------------- 1 | API 文档 2 | ======== 3 | 4 | 激活函数 5 | -------- 6 | 7 | .. 
# Sphinx build configuration for the deepx Python documentation.
import os
import sys
sys.path.insert(0, os.path.abspath('..'))  # add the project root so autodoc can import deepx

# Extension configuration
extensions = [
    'sphinx.ext.autodoc',    # pull API docs from docstrings automatically
    'sphinx.ext.napoleon',   # support Google- and NumPy-style docstrings
    'sphinx.ext.mathjax',    # render math formulas
    'sphinx.ext.viewcode',   # link rendered docs to highlighted source
]

# Theme settings
html_theme = 'sphinx_rtd_theme'  # use the Read the Docs theme

# Project information
project = 'deepx'
copyright = '2024, Your Name'  # NOTE(review): placeholder metadata — fill in the real author
author = 'Your Name'
############-------PyTorch-------################
# Reference run: elementwise min/max of two int8 tensors in PyTorch.

print()
import torch
torch_t1 = torch.full((2,3,4, ), 10, dtype=torch.int8)
torch_t2 = torch.arange(24,dtype=torch.int8).reshape(2,3,4)
torch_t3= torch.min(torch_t2,torch_t1)
print(torch_t3)
torch_t4= torch.max(torch_t2,torch_t1)
print(torch_t4)


############-------DEEPX-------################
# Same computation expressed with deepx; printed output should match
# the PyTorch run above.

from deepx import Tensor,full,arange,min,max

print()

t1 = full((2,3,4), value=10,dtype="int8")
t2 = arange(0,24,dtype="int8").reshape_((2,3,4))
t3 = min(t2,t1)
t3.print()
t4 = max(t2,t1)
t4.print()
############-------PyTorch-------################
# Reference run: sqrt / log / exp / pow on float32 tensors in PyTorch.
print()

import torch
torch_t1 = torch.arange(3*4*5, dtype=torch.float32)
torch_t2 = torch.full((3*4*5,),2, dtype=torch.float32)

torch_t3 = torch.sqrt(torch_t1)
print(torch_t3)
torch_t4 = torch.log(torch_t2)
print(torch_t4)
torch_t5 = torch.exp(torch_t4)
print(torch_t5)
torch_t6 = torch.pow(torch_t5,torch_t3)
print(torch_t6)
torch_t7 = 2**torch_t1
print(torch_t7)
############-------DEEPX-------################
# Same pipeline in deepx; results should match the PyTorch run above.

import deepx
print()

t1 = deepx.arange(start=0,end=3*4*5,dtype='float32',name="t1")
t2 = deepx.full((3*4*5,),value=2,dtype='float32',name="t2")
t3 = deepx.sqrt(t1,out='t3')
t3.print()
t4 = deepx.log(t2,out='t4')
t4.print()
t5 = deepx.exp(t4,out='t5')
t5.print()
t6 = deepx.pow(t5,t3,out='t6')
t6.print()
t7 = 2**t1
t7.print()
def init_tokenizer(model_path):
    """Load a HuggingFace tokenizer from a local model directory.

    Assigns ``eos_token`` as the padding token, since the tokenizer used
    here may not define a dedicated pad token.

    Args:
        model_path: path of the pretrained model/tokenizer directory.

    Returns:
        The configured AutoTokenizer instance.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    tokenizer.pad_token = tokenizer.eos_token
    return tokenizer
############-------PyTorch-------################
import torch
import torch.nn as nn

# Reference: a 64 -> 4 linear layer applied to a batch of ones.
net = nn.Linear(64, 4)
torch_input = torch.ones(1, 64)
torch_output = net(torch_input)
print()
print(torch_output)


############-------DEEPX-------################
# Same network built with deepx modules. Weights are initialized
# independently, so printed values will differ from the PyTorch run.
from deepx.nn.modules import Linear
from deepx import ones

net = Linear(64, 4)
input=ones(1,64,name='input')
out=net.forward(input)
out.print()
dir='/home/lipeng/model/deepxmodel/llama/' 4 | 5 | 6 | 7 | ############### PyTorch 实现部分 ############### 8 | import torch 9 | # 使用小规模数据以便打印完整结果 10 | pt_input = torch.arange(48, dtype=torch.float32).reshape(2, 3, hidden_size) / 10.0 - 2.0 11 | print("PyTorch 输入:") 12 | print(pt_input) 13 | 14 | from transformers.models.llama.modeling_llama import LlamaRMSNorm as TransformersLlamaRMSNorm 15 | from deepxutil.torch import save_torch 16 | save_torch(pt_input,dir+'rmsnorm_input') 17 | # 使用transformers库中的官方LlamaRMSNorm实现 18 | pt_norm = TransformersLlamaRMSNorm(hidden_size, eps=eps) 19 | # 设置权重为固定值0.5 20 | with torch.no_grad(): 21 | pt_norm.weight.fill_(0.5) 22 | # 前向传播 23 | pt_output = pt_norm(pt_input) 24 | 25 | 26 | print("\nPyTorch RMSNorm 结果:") 27 | print(pt_output.shape) 28 | print(pt_output) 29 | 30 | 31 | ############### DeepX 实现部分 ############### 32 | from deepx import constant_,load 33 | from deepx.transformer.models.llama.normalization import LlamaRMSNorm 34 | 35 | input=load(dir+'rmsnorm_input') 36 | 37 | # DeepX计算流程 38 | norm = LlamaRMSNorm(hidden_size=hidden_size, eps=eps) 39 | # 设置相同的权重 40 | constant_(norm.weight, 0.5) 41 | # 前向计算 42 | output = norm(input) 43 | output.print() 44 | -------------------------------------------------------------------------------- /front/py/examples/4_transformer/llama/llama_: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/array2d/deepx/104c7b326251e8d20efbcdf4aad584ad6d179aa9/front/py/examples/4_transformer/llama/llama_ -------------------------------------------------------------------------------- /front/py/examples/4_transformer/llama/llama_rope.py: -------------------------------------------------------------------------------- 1 | from llama_rope_torch import dir,config 2 | 3 | ############-------DEEPX-------################ 4 | from deepx.nn.modules import Embedding,Module 5 | from deepx import load,arange 6 | from deepx.transformer.models.llama import 
class NetDeepx(Module):
    """Minimal deepx module: token embedding followed by rotary embedding.

    Mirrors the embedding + RoPE front end of a Llama-style model so its
    output can be compared against the PyTorch reference run.
    """

    def __init__(self, configdict: dict):
        super().__init__()
        # Embedding table is preloaded from the reference weights on disk.
        self.embed_tokens = Embedding(
            configdict["vocab_size"],
            configdict["hidden_size"],
            weight=embed_tokens_weight,
        )
        self.rotary_emb = LlamaRotaryEmbedding(config=configdict)
        # Dump the inverse-frequency buffer for manual comparison.
        print("rotary_emb.inv_freq")
        self.rotary_emb.inv_freq.print()

    def forward(self, x):
        # Embed token ids, then build 0..seq_len-1 position ids with a
        # leading batch axis and apply the rotary embedding.
        hidden_states = self.embed_tokens(x)
        seq_len = hidden_states.shape[1]
        position_ids = arange(start=0, end=seq_len).unsqueeze(0)
        return self.rotary_emb(hidden_states, position_ids)
h5py
numpy
pyyaml
# NOTE: "os" and "sys" were removed — they are Python standard-library
# modules, not pip-installable packages, and break `pip install -r`.
from setuptools import setup, find_packages

# Packaging metadata for the onnx_deepx converter tool.
setup(
    name='onnx_deepx',
    version='0.1.0',
    description='A simple ONNX model extractor',
    author='Lipeng',
    author_email='lipeng@mirrorsoft.cn',
    packages=find_packages(),
    install_requires=[
        'onnx',  # ONNX model loading dependency
    ],
    entry_points={
        'console_scripts': [
            # NOTE(review): both console scripts target extract_onnx_info, and
            # no toonnx module is visible in the repository tree — confirm that
            # these entry-point targets are intended.
            'todeepx=onnx_deepx.todeepx:extract_onnx_info',
            'toonnx=onnx_deepx.toonnx:extract_onnx_info',
        ],
    },
)
def main():
    """Load a safetensors model, report tensor statistics, and render its graph.

    Side effects: prints the model config and per-tensor parameter counts,
    then writes model_analysis/model_graph.png via graphviz.
    """
    # NOTE(review): hardcoded local model path — only runs on the author's machine.
    model_dir = "/home/lipeng/model/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

    # Load every tensor plus model-level metadata.
    loader = SafeTensorLoader(model_dir)
    tensors, metadata = loader.load()

    print("\nModel Configuration:")
    for key, value in metadata.get("model_config", {}).items():
        print(f"{key}: {value}")

    print("\nTensor Statistics:")
    total_params = 0
    for name, tensor in tensors.items():
        shape = tensor.shape
        num_params = tensor.data.size
        total_params += num_params
        print(f"{name}: shape={shape}, params={num_params:,}")

    print(f"\nTotal Parameters: {total_params:,}")

    # Build the computation graph from the same model directory.
    builder = SafeTensorGraphBuilder(model_dir)
    graph, _, _ = builder.build_graph()

    # Export a PNG visualization of the graph.
    output_dir = "model_analysis"
    os.makedirs(output_dir, exist_ok=True)

    dot = graph.to_dot()
    dot.render(os.path.join(output_dir, "model_graph"), format="png", cleanup=True)

    print(f"\n计算图已保存到 {output_dir}/model_graph.png")

if __name__ == "__main__":
    main()
11 | UNKNOWN 12 | 13 | -------------------------------------------------------------------------------- /model/safetensor_deepx/safetensor_deepx.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | README.md 2 | setup.py 3 | safetensor_deepx/__init__.py 4 | safetensor_deepx/graph.py 5 | safetensor_deepx/loader.py 6 | safetensor_deepx.egg-info/PKG-INFO 7 | safetensor_deepx.egg-info/SOURCES.txt 8 | safetensor_deepx.egg-info/dependency_links.txt 9 | safetensor_deepx.egg-info/requires.txt 10 | safetensor_deepx.egg-info/top_level.txt -------------------------------------------------------------------------------- /model/safetensor_deepx/safetensor_deepx.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /model/safetensor_deepx/safetensor_deepx.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.19.0 2 | safetensors>=0.3.0 3 | -------------------------------------------------------------------------------- /model/safetensor_deepx/safetensor_deepx.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | safetensor_deepx 2 | -------------------------------------------------------------------------------- /model/safetensor_deepx/safetensor_deepx/__init__.py: -------------------------------------------------------------------------------- 1 | from .loader import SafeTensorLoader, SafeTensorSaver 2 | from .graph import SafeTensorGraphBuilder 3 | 4 | __all__ = [ 5 | 'SafeTensorLoader', 6 | 'SafeTensorSaver', 7 | 'SafeTensorGraphBuilder' 8 | ] -------------------------------------------------------------------------------- /model/safetensor_deepx/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import 
class Context:
    """State container passed between a Function's forward and backward.

    Stores tensors and arbitrary keyed data saved during forward so the
    backward pass can retrieve them, plus an author map and the
    requires_grad flag.
    """

    def __init__(self, requires_grad=False):
        self._requires_grad = requires_grad
        self._saved_tensors = []
        self._non_tensor_data = {}
        self._authormap = {}

    def save_tensors(self, *tensors):
        # Preserve call order; get_tensor hands them back as a tuple.
        for tensor in tensors:
            self._saved_tensors.append(tensor)

    @property
    def get_tensor(self):
        """All tensors saved so far, as an immutable tuple."""
        return tuple(self._saved_tensors)

    def save_data(self, key, value):
        """Stash a non-tensor value under ``key`` for the backward pass."""
        self._non_tensor_data[key] = value

    def get_data(self, key):
        """Return the value stored under ``key``, or None when absent."""
        return self._non_tensor_data.get(key)

    def set_authormap(self, authormap: dict):
        self._authormap = authormap

    @property
    def authormap(self):
        return self._authormap

    @property
    def requires_grad(self):
        return self._requires_grad
class ControlFlowNode(Node):
    """Graph node representing a control-flow construct.

    Carries NodeType.CONTROL_FLOW so graph passes can distinguish it from
    data and op nodes.
    """

    def __init__(self, name=None):
        # Bug fix: the name argument was previously ignored and every
        # instance was hard-named "control_flow". Honor a caller-supplied
        # name, keeping "control_flow" as the default for compatibility.
        super().__init__(
            name=name if name is not None else "control_flow",
            ntype=NodeType.CONTROL_FLOW,
        )
class Node:
    """Base node of the autograd graph.

    A node belongs to a Graph (the default graph when none is given), has a
    NodeType, an optional owning Module, and a list of input nodes.
    """

    def __init__(self,
                 ntype:NodeType=None,
                 name:str=None,
                 graph=None,
                 ):
        from .graph import Graph
        # Bug fix: compare with `is None` (identity), not `== None`, which
        # would invoke Graph.__eq__ and can misbehave for custom classes.
        if graph is None:
            self._graph = Graph.get_default()
        else:
            self._graph = graph
        self._module = None
        self._ntype = ntype
        self._name = name
        self._inputs = []

    @property
    def ntype(self):
        return self._ntype

    @property
    def graph(self):
        return self._graph

    @property
    def name(self):
        return self._name

    def rename(self,name:str):
        """Replace this node's name."""
        self._name = name

    @property
    def fullname(self):
        """Dotted name including the owning module's path, when set."""
        if self._module is None:
            return self._name
        else:
            # NOTE(review): relies on Module exposing `full_name` — confirm
            # against the Module class definition.
            return f"{self._module.full_name}.{self._name}"

    def set_module(self,module):
        """Attach the owning Module; raises ValueError for other types."""
        from deepx.nn.modules import Module
        if isinstance(module,Module):
            self._module = module
        else:
            raise ValueError("module must be a Module")

    @property
    def module(self):
        return self._module

    @property
    def inputs(self):
        return self._inputs

    def add_input(self, input_node):
        """Append a predecessor node to this node's input list."""
        self._inputs.append(input_node)
// Registry of graph-optimization passes; use the REGISTER_PASS macro to
// register a pass at static-initialization time.
#pragma once

#include <functional>
#include <string>
#include <unordered_map>

namespace deepx
{
    // NOTE(review): the template arguments of this alias were lost when the
    // source was exported (angle-bracket content stripped). Reconstructed as
    // a nullary callable — confirm the real pass signature against callers.
    using pass_func = std::function<void()>;

    // Process-wide singleton mapping pass names to their functions.
    class PassRegistry
    {
    public:
        // Returns the single shared registry instance (defined in the .cpp).
        static PassRegistry &instance();

        // Associates `name` with `func`, overwriting any previous entry.
        void register_pass(const std::string &name, pass_func func);

        // Non-copyable: there must be exactly one registry.
        PassRegistry(const PassRegistry &) = delete;
        PassRegistry &operator=(const PassRegistry &) = delete;

    private:
        PassRegistry() = default;

    private:
        std::unordered_map<std::string, pass_func> registry_;
    };
}

// Registers `func` under `name` via a static object whose constructor runs
// before main().
#define REGISTER_PASS(name, func) \
    struct Register##name { \
        Register##name() { \
            PassRegistry::instance().register_pass(#name, func); \
        } \
    }; \
    static Register##name register_##name;
// printUsage writes the top-level deepxctl help text to stdout.
func printUsage() {
	execName := filepath.Base(os.Args[0])
	fmt.Printf("用法: %s [命令] [参数]\n\n", execName)
	fmt.Println("可用命令:")
	fmt.Println("  tensor    张量操作相关命令")
	fmt.Println("  version   显示版本信息")
	fmt.Println("  help      显示帮助信息")
	// Bug fix: fmt.Println does not interpolate format verbs, so the original
	// printed a literal "%s". Use Printf with an explicit trailing newline.
	fmt.Printf("\n使用 '%s help [命令]' 获取命令的详细信息\n", execName)
}
fmt.Printf("deepxctl 版本 %s\n", version) 44 | 45 | case "help": 46 | if len(os.Args) > 2 { 47 | helpCmd := os.Args[2] 48 | switch helpCmd { 49 | case "tensor": 50 | tensor.PrintUsage() 51 | default: 52 | fmt.Printf("未知命令: %s\n", helpCmd) 53 | printUsage() 54 | } 55 | } else { 56 | printUsage() 57 | } 58 | 59 | default: 60 | fmt.Printf("未知命令: %s\n", cmd) 61 | printUsage() 62 | os.Exit(1) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /tool/deepxctl/tensor/fp16.go: -------------------------------------------------------------------------------- 1 | package tensor 2 | 3 | import ( 4 | "encoding/binary" 5 | "math" 6 | ) 7 | 8 | func Byte2ToFloat16(value []byte) float32 { 9 | bits := binary.BigEndian.Uint16(value) 10 | // 这里需要实现float16到float32的转换 11 | // 简化实现,实际项目中需要更完整的实现 12 | sign := float32(1) 13 | if bits&0x8000 != 0 { 14 | sign = -1 15 | } 16 | exp := int((bits & 0x7C00) >> 10) 17 | frac := float32(bits&0x03FF) / 1024.0 18 | 19 | if exp == 0 { 20 | return sign * frac * float32(1.0/16384.0) // 非规格化数 21 | } else if exp == 31 { 22 | if frac == 0 { 23 | return sign * float32(math.Inf(1)) // 无穷大 24 | } 25 | return float32(math.NaN()) // NaN 26 | } 27 | return sign * float32(math.Pow(2, float64(exp-15))) * (1.0 + frac) // 规格化数 28 | } 29 | -------------------------------------------------------------------------------- /tool/deepxctl/tensor/io.go: -------------------------------------------------------------------------------- 1 | package tensor 2 | 3 | import ( 4 | "encoding/binary" 5 | "os" 6 | 7 | "gopkg.in/yaml.v2" 8 | ) 9 | 10 | func LoadShape(filePath string) (shape Shape, err error) { 11 | var shapeData []byte 12 | shapeData, err = os.ReadFile(filePath + ".shape") 13 | if err != nil { 14 | return 15 | } 16 | 17 | err = yaml.Unmarshal(shapeData, &shape) 18 | if err != nil { 19 | return 20 | } 21 | return 22 | } 23 | func LoadTensor[T Number](filePath string) (tensor Tensor[T], err error) { 24 | 25 | _, err = 
os.ReadFile(filePath + ".shape") 26 | if err != nil { 27 | return 28 | } 29 | var shape Shape 30 | shape, err = LoadShape(filePath) 31 | if err != nil { 32 | return 33 | } 34 | file, err := os.Open(filePath + ".data") 35 | if err != nil { 36 | return 37 | } 38 | defer file.Close() 39 | data := make([]T, shape.Size) 40 | 41 | err = binary.Read(file, binary.LittleEndian, data) 42 | if err != nil { 43 | return 44 | } 45 | tensor = Tensor[T]{Data: data, Shape: shape} 46 | return 47 | } 48 | -------------------------------------------------------------------------------- /tool/deepxctl/tensor/tensor.go: -------------------------------------------------------------------------------- 1 | package tensor 2 | 3 | import ( 4 | "fmt" 5 | ) 6 | 7 | type Shape struct { 8 | Shape []int `json:"shape"` 9 | Stride []int `json:"stride"` 10 | Dim int `json:"ndim"` 11 | Size int `json:"size"` 12 | Dtype string `json:"dtype"` 13 | } 14 | 15 | func NewTensorShape(shape []int) (s Shape) { 16 | s.Dim = len(shape) 17 | s.Shape = make([]int, len(shape)) 18 | copy(s.Shape, shape) 19 | s.Stride = make([]int, len(shape)) 20 | s.Stride[len(shape)-1] = 1 21 | for i := len(shape) - 2; i >= 0; i-- { 22 | s.Stride[i] = s.Stride[i+1] * shape[i+1] 23 | } 24 | s.Size = s.Stride[0] * shape[0] 25 | return s 26 | } 27 | func (s Shape) String() string { 28 | return fmt.Sprintf("%v", s.Shape) 29 | } 30 | 31 | func (s Shape) At(i int) int { 32 | return s.Shape[i] 33 | } 34 | 35 | func (s Shape) LinearAt(indices []int) int { 36 | idx := 0 37 | for i := 0; i < len(indices); i++ { 38 | idx += indices[i] * s.Stride[i] 39 | } 40 | return idx 41 | } 42 | func (s Shape) LinearTo(idx int) (indices []int) { 43 | linearIndex := idx 44 | indices = make([]int, s.Dim) 45 | for i := 0; i < s.Dim; i++ { 46 | indices[i] = linearIndex / s.Stride[i] 47 | linearIndex %= s.Stride[i] 48 | } 49 | return indices 50 | } 51 | 52 | func BitSize(Dtype string) int { 53 | switch Dtype { 54 | case "bool": 55 | return 8 56 | case "int8": 
57 | return 8 58 | case "int16": 59 | return 16 60 | case "int32": 61 | return 32 62 | case "int64": 63 | return 64 64 | case "float16": 65 | return 16 66 | case "float32": 67 | return 32 68 | case "float64": 69 | return 64 70 | default: 71 | return 0 72 | } 73 | } 74 | 75 | type Number interface { 76 | comparable 77 | float64 | float32 | int64 | int32 | int16 | int8 | bool 78 | } 79 | 80 | type Tensor[T Number] struct { 81 | Data []T 82 | Shape 83 | } 84 | 85 | // Get 获取Tensor的值 86 | func (t *Tensor[T]) Get(indices ...int) T { 87 | idx := t.Shape.LinearAt(indices) 88 | return t.Data[idx] 89 | 90 | } 91 | --------------------------------------------------------------------------------