├── .github └── workflows │ └── docker-image.yml ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── Dockerfile ├── LICENSE ├── README.md ├── conanfile.txt ├── configs ├── booksim2_configs │ ├── anynet.icnt │ ├── anynet_file │ ├── fly_c16_m8.icnt │ ├── fly_c1_m2.icnt │ ├── fly_c1_m8.icnt │ ├── fly_c2_m8.icnt │ ├── fly_c4_m16.icnt │ ├── fly_c4_m2.icnt │ ├── fly_c4_m32.icnt │ ├── fly_c4_m8.icnt │ ├── fly_c64_m8.icnt │ ├── fly_c64_m8_sif-age.icnt │ ├── fly_c64_m8_sif-rr.icnt │ ├── make_anynet_topology.py │ ├── mesh_sif-age.icnt │ └── mesh_sif-rr.icnt ├── ramulator2_configs │ ├── DDR4.yaml │ └── HBM2.yaml ├── ramulator_configs │ ├── ALDRAM-config.cfg │ ├── DDR3-config.cfg │ ├── DDR4-config.cfg │ ├── DSARP-config.cfg │ ├── GDDR5-config.cfg │ ├── HBM-config.cfg │ ├── HBM-config_ChRaBaRoCo.cfg │ ├── HBM-config_FCFS.cfg │ ├── HBM-config_FRFCFS.cfg │ ├── HBM-config_FRFCFS_Cap.cfg │ ├── HBM-config_FRFCFS_PriorHit.cfg │ ├── HBM-config_RoBaRaCoCh.cfg │ ├── HBM-config_RoCoBaRaCh.cfg │ ├── HBMx0.5ch-config.cfg │ ├── HBMx2ch-config.cfg │ ├── LPDDR3-config.cfg │ ├── LPDDR4-config.cfg │ ├── PCM-config.cfg │ ├── SALP-config.cfg │ ├── STTMRAM-config.cfg │ ├── TLDRAM-config.cfg │ ├── WideIO-config.cfg │ └── WideIO2-config.cfg ├── systolic_ws_128x128_c4_booksim2_tpuv4.json ├── systolic_ws_128x128_c4_simple_noc_tpuv4.json ├── systolic_ws_128x128_c4_simple_noc_tpuv4_half_ramulator2.json ├── systolic_ws_128x128_c4_simple_noc_tpuv4_partition_quad.json ├── systolic_ws_8x8_c1_booksim2_transformer.json ├── systolic_ws_8x8_c1_simple_noc_transformer.json ├── systolic_ws_8x8_c4_booksim2_transformer.json ├── systolic_ws_8x8_c4_simple_noc_transformer.json ├── test.json └── timeloop_configs │ ├── systolic_os_32x32 │ ├── README.md │ ├── arch │ │ ├── components │ │ │ ├── reg_storage.yaml │ │ │ ├── smartbuffer_RF.yaml │ │ │ └── smartbuffer_SRAM.yaml │ │ ├── simple_output_stationary.yaml │ │ └── simple_output_stationary.yaml.tmp │ ├── constraints │ │ ├── simple_output_stationary_arch_constraints.yaml │ │ └── simple_output_stationary_map_constraints.yaml │ └── mapper │ │ └── mapper.yaml │ └── systolic_ws_8x8 │ ├── README.md │ ├── arch │ ├── components │ │ ├── reg_storage.yaml │ │ ├── smartbuffer_RF.yaml │ │ └── smartbuffer_SRAM.yaml │ └── simple_weight_stationary.yaml │ ├── constraints │ ├── simple_weight_stationary_arch_constraints.yaml │ └── simple_weight_stationary_map_constraints.yaml │ ├── example_AlexNet_layer1_outputs │ ├── timeloop-mapper.ART.yaml │ ├── timeloop-mapper.ART_summary.yaml │ ├── timeloop-mapper.ERT.yaml │ ├── timeloop-mapper.ERT_summary.yaml │ ├── timeloop-mapper.accelergy.log │ ├── timeloop-mapper.defined_input_architecture.yaml │ ├── timeloop-mapper.flattened_architecture.yaml │ ├── timeloop-mapper.log │ ├── timeloop-mapper.map+stats.xml │ ├── timeloop-mapper.map.txt │ └── timeloop-mapper.stats.txt │ ├── mapper │ └── mapper.yaml │ ├── timeloop-mapper.map+stats.xml │ ├── timeloop-mapper.map.txt │ └── timeloop-mapper.stats.txt ├── example ├── language_models.json └── models_list.json ├── extern └── ramulator_custom │ ├── .gitignore │ ├── CMakeLists.txt │ ├── include │ └── ramulator │ │ └── Ramulator.hpp │ └── src │ ├── Config.cpp │ ├── Config.h │ ├── Controller.h │ ├── DDR4.cpp │ ├── DDR4.h │ ├── DRAM.h │ ├── HBM.cpp │ ├── HBM.h │ ├── Memory.h │ ├── MemoryFactory.cpp │ ├── MemoryFactory.h │ ├── Ramulator.cpp │ ├── Refresh.cpp │ ├── Refresh.h │ ├── Request.cpp │ ├── Request.h │ ├── Scheduler.h │ ├── SpeedyController.h │ ├── StatType.cpp │ ├── StatType.h │ └── Statistics.h ├── img ├── ONNXim_demo.png └── 
speedup.png ├── models ├── language_models │ ├── llama3-8b.json │ ├── opt-125m.json │ └── opt-66b.json └── resnet18 │ └── resnet18.onnx ├── scripts ├── aggregate_results.sh ├── generate_cnn_onnx.py ├── generate_conv_onnx.py ├── generate_matmul_onnx.py ├── generate_multi-tenancy_onnx.py ├── generate_multi-tenancy_onnx2.py ├── generate_transformer_onnx.py ├── onnxim_sbatch.sh ├── run_matmul_conv.sh ├── run_multi-tenancy.sh ├── run_sbatch.sh ├── run_simulation.sh ├── run_timeloop.sh ├── run_transformer.sh └── timeloop_slurm_job.sh ├── src ├── CMakeLists.txt ├── Common.cc ├── Common.h ├── Core.cc ├── Core.h ├── Dram.cc ├── Dram.h ├── Hashing.cc ├── Hashing.h ├── Instruction.h ├── Interconnect.cc ├── Interconnect.h ├── Mapping.cc ├── Mapping.h ├── Model.cc ├── Model.h ├── SimulationConfig.h ├── Simulator.cc ├── Simulator.h ├── Sram.cc ├── Sram.h ├── Stat.h ├── SystolicOS.cc ├── SystolicOS.h ├── SystolicWS.cc ├── SystolicWS.h ├── Tensor.cc ├── Tensor.h ├── allocator │ └── AddressAllocator.h ├── helper │ ├── CommandLineParser.cc │ ├── CommandLineParser.h │ └── HelperFunctions.h ├── main.cc ├── models │ ├── LanguageModel.cc │ └── LanguageModel.h ├── operations │ ├── AdaptiveAvgPool.cc │ ├── AdaptiveAvgPool.h │ ├── Attention.cc │ ├── Attention.h │ ├── BiasAct.cc │ ├── BiasAct.h │ ├── BiasGelu.cc │ ├── BiasGelu.h │ ├── Concat.cc │ ├── Concat.h │ ├── Conv.cc │ ├── Conv.h │ ├── ConvOS.cc │ ├── ConvOS.h │ ├── ConvWS.cc │ ├── ConvWS.h │ ├── Dummy.cc │ ├── Dummy.h │ ├── EmbedLayerNorm.cc │ ├── EmbedLayerNorm.h │ ├── Flatten.cc │ ├── Flatten.h │ ├── Gemm.cc │ ├── Gemm.h │ ├── GemmOS.cc │ ├── GemmOS.h │ ├── GemmWS.cc │ ├── GemmWS.h │ ├── GlobalAvgPool.cc │ ├── GlobalAvgPool.h │ ├── KVCacheConcat.cc │ ├── KVCacheConcat.h │ ├── MaxPool.cc │ ├── MaxPool.h │ ├── Operation.cc │ ├── Operation.h │ ├── OperationFactory.cc │ ├── OperationFactory.h │ ├── SkipLayerNorm.cc │ ├── SkipLayerNorm.h │ ├── Softmax.cc │ └── Softmax.h └── scheduler │ ├── IterLevelScheduler.cc │ ├── IterLevelScheduler.h │ ├── LanguageScheduler.cc │ ├── LanguageScheduler.h │ ├── Scheduler.cc │ └── Scheduler.h ├── tests ├── CMakeLists.txt ├── MappingTest.cc ├── SystolicOsTest.cc ├── SystolicWsTest.cc ├── main.cc └── operatons │ ├── ConvWSTest.cc │ └── GemmWSTest.cc └── traces └── input.csv /.github/workflows/docker-image.yml: -------------------------------------------------------------------------------- 1 | name: Docker Build and Test 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - master 10 | 11 | jobs: 12 | docker-build: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - name: Checkout repository 17 | uses: actions/checkout@v3 18 | 19 | - name: Build Docker image 20 | run: | 21 | docker build -t onxxim-test . 22 | 23 | - name: Test Docker image 24 | run: | 25 | docker run --rm onxxim-test echo "Docker build successful!" 
26 | 27 | - name: Test generating onnx file - GPT2 28 | run: | 29 | docker run --rm onxxim-test python3 /workspace/ONNXim/scripts/generate_transformer_onnx.py --model gpt2 30 | 31 | - name: Test generating onnx file - BERT 32 | run: | 33 | docker run --rm onxxim-test python3 /workspace/ONNXim/scripts/generate_transformer_onnx.py --model bert 34 | 35 | - name: Cache Docker layers 36 | uses: actions/cache@v3 37 | with: 38 | path: /var/lib/docker 39 | key: ${{ runner.os }}-docker-${{ github.sha }} 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | models/* 2 | model_lists/* 3 | build/* -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "extern/onnx"] 2 | path = extern/onnx 3 | url = https://github.com/onnx/onnx.git 4 | [submodule "extern/protobuf"] 5 | path = extern/protobuf 6 | url = https://github.com/protocolbuffers/protobuf.git 7 | [submodule "extern/torch2timeloop"] 8 | path = extern/torch2timeloop 9 | url = https://github.com/Accelergy-Project/pytorch2timeloop-converter.git 10 | [submodule "extern/booksim"] 11 | path = extern/booksim 12 | url = https://github.com/PSAL-POSTECH/booksim.git 13 | [submodule "extern/ramulator2"] 14 | path = extern/ramulator2 15 | url = https://github.com/PSAL-POSTECH/ramulator2.git 16 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.15.0) 2 | set(project_name "AiFrameworkSim") 3 | project(${project_name}) 4 | 5 | set(CMAKE_EXPORT_COMPILE_COMMANDS ON) 6 | include("${CMAKE_SOURCE_DIR}/build/conanbuildinfo.cmake") 7 | conan_basic_setup() 8 | 9 | # find_package(Boost 1.70 REQUIRED COMPONENTS program_options) 10 | # message("LIB ${Boost_LIBRARY_DIRS}") 11 | option(USE_RAMULATOR "USE_RAMULATOR" ON) 12 | list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/extern) 13 | 14 | # C++ settings 15 | set(CMAKE_CXX_STANDARD 20) 16 | set(ONNX_ML 1) 17 | set(JSON_BuildTests OFF CACHE INTERNAL "") 18 | set(EXECUTABLE_OUTPUT_PATH "${CMAKE_SOURCE_DIR}/build/bin") 19 | set(LIBRARY_OUTPUT_PATH "${CMAKE_SOURCE_DIR}/build/lib") 20 | add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) 21 | message("BINARY DIR ${CMAKE_BINARY_DIR}") 22 | if(CMAKE_BUILD_TYPE STREQUAL "Debug") 23 | add_compile_options(-fsanitize=address) 24 | add_link_options(-fsanitize=address) 25 | endif() 26 | 27 | # Build source 28 | add_subdirectory("${PROJECT_SOURCE_DIR}/src") 29 | 30 | # Add library ramulator 31 | add_subdirectory("${PROJECT_SOURCE_DIR}/extern/ramulator_custom") 32 | 33 | # Add library ramulator2 34 | add_subdirectory("${PROJECT_SOURCE_DIR}/extern/ramulator2") 35 | include_directories("${PROJECT_SOURCE_DIR}/extern/ramulator2/src") 36 | include_directories("${PROJECT_SOURCE_DIR}/extern/ramulator2/resources/ndp_wrappers") 37 | 38 | # Add library booksim 39 | add_subdirectory("${PROJECT_SOURCE_DIR}/extern/booksim") 40 | 41 | # Add library protobuf 42 | add_subdirectory("${PROJECT_SOURCE_DIR}/extern/protobuf/cmake" EXCLUDE_FROM_ALL) 43 | set_target_properties(libprotoc PROPERTIES FOLDER "external/protobuf") 44 | set_target_properties(protoc PROPERTIES FOLDER "external/protobuf") 45 | 46 | # Add library onnx 47 | add_definitions("-DONNX_NAMESPACE=onnx") 48 |
add_subdirectory("${PROJECT_SOURCE_DIR}/extern/onnx" EXCLUDE_FROM_ALL) 49 | set_target_properties(onnx PROPERTIES FOLDER "extern/onnx") 50 | set_target_properties(onnx_proto PROPERTIES FOLDER "extern/onnx") 51 | 52 | target_include_directories(Simulator PUBLIC ${ONNX_INCLUDE_DIRS}) 53 | target_link_libraries(Simulator ramulator1 booksim2 ramulator) 54 | target_link_libraries(Simulator ${PROTOBUF_LIB} onnx_proto ${CONAN_LIBS} stdc++fs) 55 | 56 | target_include_directories(Simulator_lib PUBLIC ${ONNX_INCLUDE_DIRS}) 57 | target_link_libraries(Simulator_lib ramulator1 booksim2 ramulator) 58 | target_link_libraries(Simulator_lib ${PROTOBUF_LIB} onnx_proto ${CONAN_LIBS} stdc++fs) 59 | 60 | enable_testing() 61 | add_subdirectory("${PROJECT_SOURCE_DIR}/tests") 62 | 63 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Use Ubuntu 20.04 as the base image, where GCC 10 is available 2 | FROM ubuntu:20.04 3 | 4 | # Avoid prompts during package installation 5 | ARG DEBIAN_FRONTEND=noninteractive 6 | 7 | # Update and install software 8 | RUN apt-get update && apt-get install -y \ 9 | gcc-10 g++-10 python3.8 python3-pip git wget make \ 10 | libssl-dev libasan5 libubsan1 11 | 12 | # Set GCC 10 as the default gcc and g++ compilers 13 | RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 100 && \ 14 | update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 100 15 | 16 | # Set the working directory 17 | WORKDIR /workspace 18 | 19 | # Install CMake 3.22.0 from source 20 | RUN wget https://github.com/Kitware/CMake/releases/download/v3.22.0/cmake-3.22.0.tar.gz && \ 21 | tar -xvzf cmake-3.22.0.tar.gz && \ 22 | cd cmake-3.22.0 && \ 23 | ./bootstrap && \ 24 | make -j$(nproc) && \ 25 | make install 26 | 27 | # Install specific Python packages with pip 28 | RUN pip3 install conan==1.57.0 transformers==4.40.1 onnx onnxruntime torch==2.3.1 torchvision optimum 29 | 30 | # Copy your project files into the image 31 | COPY ./ ONNXim 32 | 33 | # Prepare ONNXim project 34 | RUN cd ONNXim && \ 35 | git submodule update --recursive --init && \ 36 | mkdir -p build && \ 37 | cd build && \ 38 | conan install .. --build=missing && \ 39 | cmake .. && \ 40 | make -j$(nproc) 41 | 42 | # Set environment variable 43 | ENV ONNXIM_HOME /workspace/ONNXim 44 | 45 | # Final command 46 | CMD ["echo", "Welcome to ONNXim!"] 47 | 48 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Parallel System Architecture Lab at POSTECH 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /conanfile.txt: -------------------------------------------------------------------------------- 1 | [requires] 2 | boost/1.79.0 3 | robin-hood-hashing/3.11.5 4 | spdlog/1.11.0 5 | nlohmann_json/3.11.2 6 | [generators] 7 | cmake 8 | -------------------------------------------------------------------------------- /configs/booksim2_configs/anynet.icnt: -------------------------------------------------------------------------------- 1 | [config] 2 | use_map = 0 3 | flit_size = 64 4 | topology = anynet 5 | network_file = /home/hhk971/ai_framework/my-project/ai-framwork-sim/configs/booksim2_configs/anynet_file 6 | routing_function = min 7 | subnets = 1 8 | 9 | vc_buf_size = 64 10 | input_buffer_size = 256 11 | ejection_buffer_size = 64 12 | boundary_buffer_size = 64 13 | wait_for_tail_credit = 0 14 | vc_allocator = islip 15 | sw_allocator = islip 16 | alloc_iters = 1 17 | deadlock_warn_timeout = 10000 -------------------------------------------------------------------------------- /configs/booksim2_configs/anynet_file: -------------------------------------------------------------------------------- 1 | router 0 node 0 router 1 router 8 2 | router 1 node 1 router 2 router 9 3 | router 2 node 2 router 3 router 10 4 | router 3 node 3 router 4 router 11 5 | router 4 node 4 router 5 router 12 6 | router 5 node 5 router 6 router 13 7 | router 6 node 6 router 7 router 14 8 | router 7 node 7 router 15 9 | router 8 node 8 router 9 router 16 10 | router 9 node 9 router 10 router 17 11 | router 10 node 10 router 11 router 18 12 | router 11 node 11 router 12 router 19 13 | router 12 node 12 router 13 router 20 14 | router 13 node 13 router 14 router 21 15 | router 14 node 14 router 15 router 22 16 | router 15 node 15 router 23 17 | router 16 node 16 router 17 router 24 18 | router 17 node 17 router 18 router 25 19 | router 18 node 18 router 19 router 26 20 | router 19 node 19 router 20 router 27 21 | router 20 node 20 router 21 router 28 22 | router 21 node 21 router 22 router 29 23 | router 22 node 22 router 23 router 30 24 | router 23 node 23 router 31 25 | router 24 node 24 router 25 router 32 26 | router 25 node 25 router 26 router 33 27 | router 26 node 26 router 27 router 34 28 | router 27 node 27 router 28 router 35 29 | router 28 node 28 router 29 router 36 30 | router 29 node 29 router 30 router 37 31 | router 30 node 30 router 31 router 38 32 | router 31 node 31 router 39 33 | router 32 node 32 router 33 router 40 34 | router 33 node 33 router 34 router 41 35 | router 34 node 34 router 35 router 42 36 | router 35 node 35 router 36 router 43 37 | router 36 node 36 router 37 router 44 38 | router 37 node 37 router 38 router 45 39 | router 38 node 38 router 39 router 46 40 | router 39 node 39 router 47 41 | router 40 node 40 router 41 router 48 42 | router 41 node 41 router 42 router 49 43 | router 42 node 42 router 43 router 50 44 | router 43 node 43 router 44 router 51 45 | router 44 node 44 router 45 router 52 46 | 
router 45 node 45 router 46 router 53 47 | router 46 node 46 router 47 router 54 48 | router 47 node 47 router 55 49 | router 48 node 48 router 49 router 56 50 | router 49 node 49 router 50 router 57 51 | router 50 node 50 router 51 router 58 52 | router 51 node 51 router 52 router 59 53 | router 52 node 52 router 53 router 60 54 | router 53 node 53 router 54 router 61 55 | router 54 node 54 router 55 router 62 56 | router 55 node 55 router 63 57 | router 56 node 56 router 57 router 64 58 | router 57 node 57 router 58 router 65 59 | router 58 node 58 router 59 router 66 60 | router 59 node 59 router 60 router 67 61 | router 60 node 60 router 61 router 68 62 | router 61 node 61 router 62 router 69 63 | router 62 node 62 router 63 router 70 64 | router 63 node 63 router 71 65 | router 64 node 64 router 65 66 | router 65 node 65 router 66 67 | router 66 node 66 router 67 68 | router 67 node 67 router 68 69 | router 68 node 68 router 69 70 | router 69 node 69 router 70 71 | router 70 node 70 router 71 72 | router 71 node 71 -------------------------------------------------------------------------------- /configs/booksim2_configs/fly_c16_m8.icnt: -------------------------------------------------------------------------------- 1 | [config] 2 | use_map = 0 3 | flit_size = 64 4 | topology = fly 5 | k = 24 6 | n = 1 7 | routing_function = dest_tag 8 | subnets = 1 9 | 10 | vc_buf_size = 64 11 | input_buffer_size = 256 12 | ejection_buffer_size = 64 13 | boundary_buffer_size = 64 14 | wait_for_tail_credit = 0 15 | vc_allocator = islip 16 | sw_allocator = islip 17 | alloc_iters = 1 18 | deadlock_warn_timeout = 10000 -------------------------------------------------------------------------------- /configs/booksim2_configs/fly_c1_m2.icnt: -------------------------------------------------------------------------------- 1 | [config] 2 | use_map = 0 3 | flit_size = 64 4 | topology = fly 5 | k = 3 6 | n = 1 7 | routing_function = dest_tag 8 | subnets = 1 9 | 10 | vc_buf_size = 64 11 | input_buffer_size = 256 12 | ejection_buffer_size = 64 13 | boundary_buffer_size = 64 14 | wait_for_tail_credit = 0 15 | vc_allocator = islip 16 | sw_allocator = islip 17 | alloc_iters = 1 18 | -------------------------------------------------------------------------------- /configs/booksim2_configs/fly_c1_m8.icnt: -------------------------------------------------------------------------------- 1 | [config] 2 | use_map = 0 3 | flit_size = 64 4 | topology = fly 5 | k = 9 6 | n = 1 7 | routing_function = dest_tag 8 | subnets = 1 9 | 10 | vc_buf_size = 64 11 | input_buffer_size = 256 12 | ejection_buffer_size = 64 13 | boundary_buffer_size = 64 14 | wait_for_tail_credit = 0 15 | vc_allocator = islip 16 | sw_allocator = islip 17 | alloc_iters = 1 18 | -------------------------------------------------------------------------------- /configs/booksim2_configs/fly_c2_m8.icnt: -------------------------------------------------------------------------------- 1 | [config] 2 | use_map = 0 3 | flit_size = 64 4 | topology = fly 5 | k = 10 6 | n = 1 7 | routing_function = dest_tag 8 | subnets = 1 9 | 10 | vc_buf_size = 64 11 | input_buffer_size = 256 12 | ejection_buffer_size = 64 13 | boundary_buffer_size = 64 14 | wait_for_tail_credit = 0 15 | vc_allocator = islip 16 | sw_allocator = islip 17 | alloc_iters = 1 18 | -------------------------------------------------------------------------------- /configs/booksim2_configs/fly_c4_m16.icnt: -------------------------------------------------------------------------------- 1 | [config] 2 | use_map 
= 0 3 | flit_size = 64 4 | topology = fly 5 | k = 20 6 | n = 1 7 | routing_function = dest_tag 8 | subnets = 1 9 | 10 | vc_buf_size = 64 11 | input_buffer_size = 256 12 | ejection_buffer_size = 64 13 | boundary_buffer_size = 64 14 | wait_for_tail_credit = 0 15 | vc_allocator = islip 16 | sw_allocator = islip 17 | alloc_iters = 1 18 | deadlock_warn_timeout = 10000 -------------------------------------------------------------------------------- /configs/booksim2_configs/fly_c4_m2.icnt: -------------------------------------------------------------------------------- 1 | [config] 2 | use_map = 0 3 | flit_size = 64 4 | topology = fly 5 | k = 6 6 | n = 1 7 | routing_function = dest_tag 8 | subnets = 1 9 | 10 | vc_buf_size = 64 11 | input_buffer_size = 256 12 | ejection_buffer_size = 64 13 | boundary_buffer_size = 64 14 | wait_for_tail_credit = 0 15 | vc_allocator = islip 16 | sw_allocator = islip 17 | alloc_iters = 1 18 | -------------------------------------------------------------------------------- /configs/booksim2_configs/fly_c4_m32.icnt: -------------------------------------------------------------------------------- 1 | [config] 2 | use_map = 0 3 | flit_size = 64 4 | topology = fly 5 | k = 36 6 | n = 1 7 | routing_function = dest_tag 8 | subnets = 1 9 | 10 | vc_buf_size = 64 11 | input_buffer_size = 256 12 | ejection_buffer_size = 64 13 | boundary_buffer_size = 64 14 | wait_for_tail_credit = 0 15 | vc_allocator = islip 16 | sw_allocator = islip 17 | alloc_iters = 1 18 | deadlock_warn_timeout = 10000 -------------------------------------------------------------------------------- /configs/booksim2_configs/fly_c4_m8.icnt: -------------------------------------------------------------------------------- 1 | [config] 2 | use_map = 0 3 | flit_size = 64 4 | topology = fly 5 | k = 12 6 | n = 1 7 | routing_function = dest_tag 8 | subnets = 1 9 | 10 | vc_buf_size = 64 11 | input_buffer_size = 256 12 | ejection_buffer_size = 64 13 | boundary_buffer_size = 64 14 | wait_for_tail_credit = 0 15 | vc_allocator = islip 16 | sw_allocator = islip 17 | alloc_iters = 1 18 | -------------------------------------------------------------------------------- /configs/booksim2_configs/fly_c64_m8.icnt: -------------------------------------------------------------------------------- 1 | [config] 2 | use_map = 0 3 | flit_size = 64 4 | topology = fly 5 | k = 72 6 | n = 1 7 | routing_function = dest_tag 8 | subnets = 1 9 | 10 | vc_buf_size = 64 11 | input_buffer_size = 256 12 | ejection_buffer_size = 64 13 | boundary_buffer_size = 64 14 | wait_for_tail_credit = 0 15 | vc_allocator = islip 16 | sw_allocator = islip 17 | alloc_iters = 1 18 | deadlock_warn_timeout = 10000 -------------------------------------------------------------------------------- /configs/booksim2_configs/fly_c64_m8_sif-age.icnt: -------------------------------------------------------------------------------- 1 | [config] 2 | use_map = 0 3 | flit_size = 64 4 | topology = fly 5 | k = 72 6 | n = 1 7 | routing_function = dest_tag 8 | subnets = 1 9 | 10 | vc_buf_size = 64 11 | input_buffer_size = 256 12 | ejection_buffer_size = 64 13 | boundary_buffer_size = 64 14 | wait_for_tail_credit = 0 15 | vc_allocator = separable_input_first 16 | sw_allocator = separable_input_first 17 | alloc_iters = 1 18 | priority = age 19 | deadlock_warn_timeout = 10000 -------------------------------------------------------------------------------- /configs/booksim2_configs/fly_c64_m8_sif-rr.icnt: -------------------------------------------------------------------------------- 
1 | [config] 2 | use_map = 0 3 | flit_size = 64 4 | topology = fly 5 | k = 72 6 | n = 1 7 | routing_function = dest_tag 8 | subnets = 1 9 | 10 | vc_buf_size = 64 11 | input_buffer_size = 256 12 | ejection_buffer_size = 64 13 | boundary_buffer_size = 64 14 | wait_for_tail_credit = 0 15 | vc_allocator = separable_input_first 16 | sw_allocator = separable_input_first 17 | alloc_iters = 1 18 | deadlock_warn_timeout = 10000 -------------------------------------------------------------------------------- /configs/booksim2_configs/make_anynet_topology.py: -------------------------------------------------------------------------------- 1 | RowSize = 8 2 | ColSize = 9 3 | for col in range(ColSize): 4 | for i in range(RowSize): 5 | id = RowSize * col + i 6 | if col < ColSize - 1: 7 | if i < RowSize - 1: 8 | print(f'router {id} node {id} router {id+1} router {id+RowSize}') 9 | else: 10 | print(f'router {id} node {id} router {id+RowSize}') 11 | else: 12 | if i < RowSize - 1: 13 | print(f'router {id} node {id} router {id+1}') 14 | else: 15 | print(f'router {id} node {id}') -------------------------------------------------------------------------------- /configs/booksim2_configs/mesh_sif-age.icnt: -------------------------------------------------------------------------------- 1 | [config] 2 | use_map = 0 3 | flit_size = 64 4 | topology = anynet 5 | network_file = /home/hhk971/ai_framework/my-project/ai-framwork-sim/configs/booksim2_configs/anynet_file 6 | routing_function = min 7 | subnets = 1 8 | 9 | vc_buf_size = 64 10 | input_buffer_size = 256 11 | ejection_buffer_size = 64 12 | boundary_buffer_size = 64 13 | wait_for_tail_credit = 0 14 | vc_allocator = separable_input_first 15 | sw_allocator = separable_input_first 16 | alloc_iters = 1 17 | priority = age 18 | deadlock_warn_timeout = 10000 -------------------------------------------------------------------------------- /configs/booksim2_configs/mesh_sif-rr.icnt: -------------------------------------------------------------------------------- 1 | [config] 2 | use_map = 0 3 | flit_size = 64 4 | topology = anynet 5 | network_file = /home/hhk971/ai_framework/my-project/ai-framwork-sim/configs/booksim2_configs/anynet_file 6 | routing_function = min 7 | subnets = 1 8 | 9 | vc_buf_size = 64 10 | input_buffer_size = 256 11 | ejection_buffer_size = 64 12 | boundary_buffer_size = 64 13 | wait_for_tail_credit = 0 14 | vc_allocator = separable_input_first 15 | sw_allocator = separable_input_first 16 | alloc_iters = 1 17 | deadlock_warn_timeout = 10000 -------------------------------------------------------------------------------- /configs/ramulator2_configs/DDR4.yaml: -------------------------------------------------------------------------------- 1 | Frontend: 2 | impl: GEM5 3 | 4 | MemorySystem: 5 | impl: GenericDRAM 6 | clock_ratio: 1 7 | 8 | DRAM: 9 | impl: DDR4 10 | org: 11 | preset: DDR4_16Gb_x4 12 | channel: 1 13 | timing: 14 | preset: DDR4_1600J 15 | 16 | Controller: 17 | impl: Generic 18 | Scheduler: 19 | impl: FRFCFS 20 | RefreshManager: 21 | impl: AllBank 22 | plugins: 23 | 24 | AddrMapper: 25 | impl: RoBaRaCoCh -------------------------------------------------------------------------------- /configs/ramulator2_configs/HBM2.yaml: -------------------------------------------------------------------------------- 1 | Frontend: 2 | impl: GEM5 3 | 4 | MemorySystem: 5 | impl: GenericDRAM 6 | clock_ratio: 1 7 | 8 | DRAM: 9 | impl: HBM2 10 | org: 11 | preset: HBM2_8Gb 12 | channel: 1 13 | timing: 14 | preset: HBM2_2.5Gbps 15 | 16 | Controller: 17 | impl: 
Generic 18 | Scheduler: 19 | impl: FRFCFS 20 | RefreshManager: 21 | impl: AllBank 22 | plugins: 23 | 24 | AddrMapper: 25 | impl: RoBaRaCoCh -------------------------------------------------------------------------------- /configs/ramulator_configs/ALDRAM-config.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = ALDRAM 6 | channels = 1 7 | ranks = 1 8 | speed = ALDRAM_1600K 9 | org = ALDRAM_4Gb_x8 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 4 17 | mem_tick = 1 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 23 | expected_limit_insts = 200000000 24 | warmup_insts = 100000000 25 | cache = no 26 | # cache = no, L1L2, L3, all (default value is no) 27 | translation = None 28 | # translation = None, Random (default value is None) 29 | # 30 | ######################## 31 | -------------------------------------------------------------------------------- /configs/ramulator_configs/DDR3-config.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = DDR3 6 | channels = 1 7 | ranks = 1 8 | speed = DDR3_1600K 9 | org = DDR3_2Gb_x8 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 4 17 | mem_tick = 1 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
23 | expected_limit_insts = 200000000 24 | # warmup_insts = 100000000 25 | warmup_insts = 0 26 | cache = no 27 | # cache = no, L1L2, L3, all (default value is no) 28 | translation = None 29 | # translation = None, Random (default value is None) 30 | # 31 | ######################## 32 | -------------------------------------------------------------------------------- /configs/ramulator_configs/DDR4-config.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = DDR4 6 | channels = 2 7 | ranks = 1 8 | speed = DDR4_3200 9 | org = DDR4_4Gb_x8 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 8 17 | mem_tick = 3 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 23 | expected_limit_insts = 200000000 24 | # warmup_insts = 100000000 25 | warmup_insts = 0 26 | cache = no 27 | # cache = no, L1L2, L3, all (default value is no) 28 | translation = None 29 | # translation = None, Random (default value is None) 30 | # 31 | ######################## 32 | -------------------------------------------------------------------------------- /configs/ramulator_configs/DSARP-config.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = DSARP 6 | subarrays = 8 7 | channels = 1 8 | ranks = 1 9 | speed = DSARP_1333 10 | org = DSARP_8Gb_x8 11 | # record_cmd_trace: (default is off): on, off 12 | record_cmd_trace = off 13 | # print_cmd_trace: (default is off): on, off 14 | print_cmd_trace = off 15 | 16 | ### Below are parameters only for CPU trace 17 | cpu_tick = 4 18 | mem_tick = 1 19 | ### Below are parameters only for multicore mode 20 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 21 | early_exit = on 22 | # early_exit = on, off (default value is on) 23 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
24 | expected_limit_insts = 200000000 25 | warmup_insts = 100000000 26 | cache = no 27 | # cache = no, L1L2, L3, all (default value is no) 28 | translation = None 29 | # translation = None, Random (default value is None) 30 | # 31 | ######################## 32 | -------------------------------------------------------------------------------- /configs/ramulator_configs/GDDR5-config.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = GDDR5 6 | channels = 1 7 | ranks = 1 8 | speed = GDDR5_6000 9 | org = GDDR5_8Gb_x16 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 2 17 | mem_tick = 1 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 23 | expected_limit_insts = 200000000 24 | warmup_insts = 100000000 25 | cache = no 26 | # cache = no, L1L2, L3, all (default value is no) 27 | translation = None 28 | # translation = None, Random (default value is None) 29 | # 30 | ######################## 31 | -------------------------------------------------------------------------------- /configs/ramulator_configs/HBM-config.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = HBM 6 | channels = 16 7 | ranks = 1 8 | speed = HBM_2_5Gbps 9 | org = HBM_4Gb 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 32 17 | mem_tick = 5 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
23 | expected_limit_insts = 200000000 24 | warmup_insts = 100000000 25 | cache = no 26 | # cache = no, L1L2, L3, all (default value is no) 27 | translation = None 28 | mapping = RoBaRaCoCh 29 | scheduler = FRFCFS 30 | # translation = None, Random (default value is None) 31 | # 32 | ######################## 33 | -------------------------------------------------------------------------------- /configs/ramulator_configs/HBM-config_ChRaBaRoCo.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = HBM 6 | channels = 8 7 | ranks = 1 8 | speed = HBM_1Gbps 9 | org = HBM_4Gb 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 32 17 | mem_tick = 5 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 23 | expected_limit_insts = 200000000 24 | warmup_insts = 100000000 25 | cache = no 26 | # cache = no, L1L2, L3, all (default value is no) 27 | translation = None 28 | mapping = ChRaBaRoCo 29 | scheduler = FRFCFS 30 | # translation = None, Random (default value is None) 31 | # 32 | ######################## 33 | -------------------------------------------------------------------------------- /configs/ramulator_configs/HBM-config_FCFS.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = HBM 6 | channels = 8 7 | ranks = 1 8 | speed = HBM_1Gbps 9 | org = HBM_4Gb 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 32 17 | mem_tick = 5 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
23 | expected_limit_insts = 200000000 24 | warmup_insts = 100000000 25 | cache = no 26 | # cache = no, L1L2, L3, all (default value is no) 27 | translation = None 28 | mapping = RoBaRaCoCh 29 | scheduler = FCFS 30 | # translation = None, Random (default value is None) 31 | # 32 | ######################## 33 | -------------------------------------------------------------------------------- /configs/ramulator_configs/HBM-config_FRFCFS.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = HBM 6 | channels = 8 7 | ranks = 1 8 | speed = HBM_1Gbps 9 | org = HBM_4Gb 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 32 17 | mem_tick = 5 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 23 | expected_limit_insts = 200000000 24 | warmup_insts = 100000000 25 | cache = no 26 | # cache = no, L1L2, L3, all (default value is no) 27 | translation = None 28 | mapping = RoBaRaCoCh 29 | scheduler = FRFCFS 30 | # translation = None, Random (default value is None) 31 | # 32 | ######################## 33 | -------------------------------------------------------------------------------- /configs/ramulator_configs/HBM-config_FRFCFS_Cap.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = HBM 6 | channels = 8 7 | ranks = 1 8 | speed = HBM_1Gbps 9 | org = HBM_4Gb 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 32 17 | mem_tick = 5 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
23 | expected_limit_insts = 200000000 24 | warmup_insts = 100000000 25 | cache = no 26 | # cache = no, L1L2, L3, all (default value is no) 27 | translation = None 28 | mapping = RoBaRaCoCh 29 | scheduler = FRFCFS_Cap 30 | # translation = None, Random (default value is None) 31 | # 32 | ######################## 33 | -------------------------------------------------------------------------------- /configs/ramulator_configs/HBM-config_FRFCFS_PriorHit.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = HBM 6 | channels = 8 7 | ranks = 1 8 | speed = HBM_1Gbps 9 | org = HBM_4Gb 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 32 17 | mem_tick = 5 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 23 | expected_limit_insts = 200000000 24 | warmup_insts = 100000000 25 | cache = no 26 | # cache = no, L1L2, L3, all (default value is no) 27 | translation = None 28 | mapping = RoBaRaCoCh 29 | scheduler = FRFCFS_PriorHit 30 | # translation = None, Random (default value is None) 31 | # 32 | ######################## 33 | -------------------------------------------------------------------------------- /configs/ramulator_configs/HBM-config_RoBaRaCoCh.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = HBM 6 | channels = 8 7 | ranks = 1 8 | speed = HBM_1Gbps 9 | org = HBM_4Gb 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 32 17 | mem_tick = 5 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
23 | expected_limit_insts = 200000000 24 | warmup_insts = 100000000 25 | cache = no 26 | # cache = no, L1L2, L3, all (default value is no) 27 | translation = None 28 | mapping = RoBaRaCoCh 29 | scheduler = FRFCFS 30 | # translation = None, Random (default value is None) 31 | # 32 | ######################## 33 | -------------------------------------------------------------------------------- /configs/ramulator_configs/HBM-config_RoCoBaRaCh.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = HBM 6 | channels = 8 7 | ranks = 1 8 | speed = HBM_1Gbps 9 | org = HBM_4Gb 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 32 17 | mem_tick = 5 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 23 | expected_limit_insts = 200000000 24 | warmup_insts = 100000000 25 | cache = no 26 | # cache = no, L1L2, L3, all (default value is no) 27 | translation = None 28 | mapping = RoCoBaRaCh 29 | scheduler = FRFCFS 30 | # translation = None, Random (default value is None) 31 | # 32 | ######################## 33 | -------------------------------------------------------------------------------- /configs/ramulator_configs/HBMx0.5ch-config.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = HBM 6 | channels = 4 7 | ranks = 1 8 | speed = HBM_1Gbps 9 | org = HBM_4Gb 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 32 17 | mem_tick = 5 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
23 | expected_limit_insts = 200000000 24 | warmup_insts = 100000000 25 | cache = no 26 | # cache = no, L1L2, L3, all (default value is no) 27 | translation = None 28 | # translation = None, Random (default value is None) 29 | # 30 | ######################## 31 | -------------------------------------------------------------------------------- /configs/ramulator_configs/HBMx2ch-config.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = HBM 6 | channels = 16 7 | ranks = 1 8 | speed = HBM_1Gbps 9 | org = HBM_4Gb 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 32 17 | mem_tick = 5 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 23 | expected_limit_insts = 200000000 24 | warmup_insts = 100000000 25 | cache = no 26 | # cache = no, L1L2, L3, all (default value is no) 27 | translation = None 28 | # translation = None, Random (default value is None) 29 | # 30 | ######################## 31 | -------------------------------------------------------------------------------- /configs/ramulator_configs/LPDDR3-config.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = LPDDR3 6 | channels = 1 7 | ranks = 1 8 | speed = LPDDR3_1600 9 | org = LPDDR3_8Gb_x16 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 4 17 | mem_tick = 1 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
23 | expected_limit_insts = 200000000 24 | warmup_insts = 100000000 25 | cache = no 26 | # cache = no, L1L2, L3, all (default value is no) 27 | translation = None 28 | # translation = None, Random (default value is None) 29 | # 30 | ######################## 31 | -------------------------------------------------------------------------------- /configs/ramulator_configs/LPDDR4-config.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = LPDDR4 6 | channels = 2 7 | ranks = 1 8 | speed = LPDDR4_2400 9 | org = LPDDR4_8Gb_x16 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 8 17 | mem_tick = 3 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 23 | expected_limit_insts = 200000000 24 | warmup_insts = 100000000 25 | cache = no 26 | # cache = no, L1L2, L3, all (default value is no) 27 | translation = None 28 | # translation = None, Random (default value is None) 29 | # 30 | ######################## 31 | -------------------------------------------------------------------------------- /configs/ramulator_configs/PCM-config.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = PCM 6 | channels = 1 7 | ranks = 1 8 | speed = PCM_800D 9 | org = PCM_2Gb_x8 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 4 17 | mem_tick = 1 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
23 | expected_limit_insts = 200000000 24 | warmup_insts = 100000000 25 | cache = no 26 | # cache = no, L1L2, L3, all (default value is no) 27 | translation = None 28 | # translation = None, Random (default value is None) 29 | # 30 | ######################## 31 | -------------------------------------------------------------------------------- /configs/ramulator_configs/SALP-config.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = SALP-MASA 6 | subarrays = 8 7 | channels = 1 8 | ranks = 1 9 | speed = SALP_1600K 10 | org = SALP_4Gb_x8 11 | # record_cmd_trace: (default is off): on, off 12 | record_cmd_trace = off 13 | # print_cmd_trace: (default is off): on, off 14 | print_cmd_trace = off 15 | 16 | ### Below are parameters only for CPU trace 17 | cpu_tick = 4 18 | mem_tick = 1 19 | ### Below are parameters only for multicore mode 20 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 21 | early_exit = on 22 | # early_exit = on, off (default value is on) 23 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 24 | expected_limit_insts = 200000000 25 | warmup_insts = 100000000 26 | cache = no 27 | # cache = no, L1L2, L3, all (default value is no) 28 | translation = None 29 | # translation = None, Random (default value is None) 30 | # 31 | ######################## 32 | -------------------------------------------------------------------------------- /configs/ramulator_configs/STTMRAM-config.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = STTMRAM 6 | channels = 4 7 | ranks = 1 8 | speed = STT_1600_1_2 9 | org = STTMRAM_2Gb_x8 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 4 17 | mem_tick = 1 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
23 | expected_limit_insts = 200000000 24 | warmup_insts = 100000000 25 | cache = no 26 | # cache = no, L1L2, L3, all (default value is no) 27 | translation = None 28 | # translation = None, Random (default value is None) 29 | # 30 | ######################## 31 | -------------------------------------------------------------------------------- /configs/ramulator_configs/TLDRAM-config.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = TLDRAM 6 | subarrays = 16 7 | channels = 1 8 | ranks = 1 9 | speed = TLDRAM_1600K 10 | org = TLDRAM_4Gb_x8 11 | # record_cmd_trace: (default is off): on, off 12 | record_cmd_trace = off 13 | # print_cmd_trace: (default is off): on, off 14 | print_cmd_trace = off 15 | 16 | ### Below are parameters only for CPU trace 17 | cpu_tick = 4 18 | mem_tick = 1 19 | ### Below are parameters only for multicore mode 20 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 21 | early_exit = on 22 | # early_exit = on, off (default value is on) 23 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 24 | expected_limit_insts = 200000000 25 | warmup_insts = 100000000 26 | cache = no 27 | # cache = no, L1L2, L3, all (default value is no) 28 | translation = None 29 | # translation = None, Random (default value is None) 30 | # 31 | ######################## 32 | -------------------------------------------------------------------------------- /configs/ramulator_configs/WideIO-config.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = WideIO 6 | channels = 4 7 | ranks = 1 8 | speed = WideIO_266 9 | org = WideIO_8Gb 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 4 17 | mem_tick = 1 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
23 | expected_limit_insts = 200000000 24 | warmup_insts = 100000000 25 | cache = no 26 | # cache = no, L1L2, L3, all (default value is no) 27 | translation = None 28 | # translation = None, Random (default value is None) 29 | # 30 | ######################## 31 | -------------------------------------------------------------------------------- /configs/ramulator_configs/WideIO2-config.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = WideIO2 6 | channels = 8 7 | ranks = 1 8 | speed = WideIO2_1066 9 | org = WideIO2_8Gb 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 6 17 | mem_tick = 1 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 23 | expected_limit_insts = 200000000 24 | warmup_insts = 100000000 25 | cache = no 26 | # cache = no, L1L2, L3, all (default value is no) 27 | translation = None 28 | # translation = None, Random (default value is None) 29 | # 30 | ######################## 31 | -------------------------------------------------------------------------------- /configs/systolic_ws_128x128_c4_booksim2_tpuv4.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_cores" : 4, 3 | "core_freq" : 1000, 4 | "core_print_interval" : 8000, 5 | "core_config" : { 6 | "core_0" : { 7 | "core_type" : "systolic_ws", 8 | "core_width" : 128, 9 | "core_height" : 128, 10 | 11 | "spad_size" : 32768, 12 | "accum_spad_size" : 4096, 13 | "sram_width" : 32, 14 | 15 | "vector_process_bit" : 65536, 16 | "add_latency" : 1, 17 | "mul_latency" : 1, 18 | "mac_latency" : 1, 19 | "exp_latency" : 1, 20 | "gelu_latency" : 1, 21 | "div_latency" : 1, 22 | "add_tree_latency" : 1, 23 | "scalar_sqrt_latency" : 1, 24 | "scalar_add_latency" : 1, 25 | "scalar_mul_latency" : 1 26 | }, 27 | "core_1" : { 28 | "core_type" : "systolic_ws", 29 | "core_width" : 128, 30 | "core_height" : 128, 31 | 32 | "spad_size" : 32768, 33 | "accum_spad_size" : 4096, 34 | "sram_width" : 32, 35 | 36 | "vector_process_bit" : 65536, 37 | "add_latency" : 1, 38 | "mul_latency" : 1, 39 | "mac_latency" : 1, 40 | "exp_latency" : 1, 41 | "gelu_latency" : 1, 42 | "div_latency" : 1, 43 | "add_tree_latency" : 1, 44 | "scalar_sqrt_latency" : 1, 45 | "scalar_add_latency" : 1, 46 | "scalar_mul_latency" : 1 47 | }, 48 | "core_2" : { 49 | "core_type" : "systolic_ws", 50 | "core_width" : 128, 51 | "core_height" : 128, 52 | 53 | "spad_size" : 32768, 54 | "accum_spad_size" : 4096, 55 | "sram_width" : 32, 56 | 57 | "vector_process_bit" : 65536, 58 | "add_latency" : 1, 59 | "mul_latency" : 1, 60 | "mac_latency" : 1, 61 | "exp_latency" : 1, 62 | "gelu_latency" : 1, 63 | "div_latency" : 1, 64 | "add_tree_latency" : 1, 65 | "scalar_sqrt_latency" : 1, 66 | "scalar_add_latency" : 1, 67 | "scalar_mul_latency" : 1 68 | }, 69 |
"core_3" : { 70 | "core_type" : "systolic_ws", 71 | "core_width" : 128, 72 | "core_height" : 128, 73 | 74 | "spad_size" : 32768, 75 | "accum_spad_size" : 4096, 76 | "sram_width" : 32, 77 | 78 | "vector_process_bit" : 65536, 79 | "add_latency" : 1, 80 | "mul_latency" : 1, 81 | "mac_latency" : 1, 82 | "exp_latency" : 1, 83 | "gelu_latency" : 1, 84 | "div_latency" : 1, 85 | "add_tree_latency" : 1, 86 | "scalar_sqrt_latency" : 1, 87 | "scalar_add_latency" : 1, 88 | "scalar_mul_latency" : 1 89 | } 90 | }, 91 | 92 | "dram_type" : "ramulator2", 93 | "dram_freq" :1200, 94 | "dram_channels": 16, 95 | "dram_req_size": 32, 96 | "dram_latency" : 10, 97 | "dram_size" : 16, 98 | "dram_nbl" : 1, 99 | "dram_print_interval": 4800, 100 | "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml", 101 | 102 | "icnt_type" : "booksim2", 103 | "icnt_latency" : 1, 104 | "icnt_freq" : 8000, 105 | "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m16.icnt", 106 | 107 | "precision" : 2, 108 | "layout" : "NHWC", 109 | "scheduler" : "simple" 110 | } -------------------------------------------------------------------------------- /configs/systolic_ws_128x128_c4_simple_noc_tpuv4.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_cores" : 4, 3 | "core_freq" : 1000, 4 | "core_print_interval" : 8000, 5 | "core_config" : { 6 | "core_0" : { 7 | "core_type" : "systolic_ws", 8 | "core_width" : 128, 9 | "core_height" : 128, 10 | 11 | "spad_size" : 32768, 12 | "accum_spad_size" : 4096, 13 | "sram_width" : 32, 14 | 15 | "vector_process_bit" : 65536, 16 | "add_latency" : 1, 17 | "mul_latency" : 1, 18 | "mac_latency" : 1, 19 | "exp_latency" : 1, 20 | "gelu_latency" : 1, 21 | "div_latency" : 1, 22 | "add_tree_latency" : 1, 23 | "scalar_sqrt_latency" : 1, 24 | "scalar_add_latency" : 1, 25 | "scalar_mul_latency" : 1 26 | }, 27 | "core_1" : { 28 | "core_type" : "systolic_ws", 29 | "core_width" : 128, 30 | "core_height" : 128, 31 | 32 | "spad_size" : 32768, 33 | "accum_spad_size" : 4096, 34 | "sram_width" : 32, 35 | 36 | "vector_process_bit" : 65536, 37 | "add_latency" : 1, 38 | "mul_latency" : 1, 39 | "mac_latency" : 1, 40 | "exp_latency" : 1, 41 | "gelu_latency" : 1, 42 | "div_latency" : 1, 43 | "add_tree_latency" : 1, 44 | "scalar_sqrt_latency" : 1, 45 | "scalar_add_latency" : 1, 46 | "scalar_mul_latency" : 1 47 | }, 48 | "core_2" : { 49 | "core_type" : "systolic_ws", 50 | "core_width" : 128, 51 | "core_height" : 128, 52 | 53 | "spad_size" : 32768, 54 | "accum_spad_size" : 4096, 55 | "sram_width" : 32, 56 | 57 | "vector_process_bit" : 65536, 58 | "add_latency" : 1, 59 | "mul_latency" : 1, 60 | "mac_latency" : 1, 61 | "exp_latency" : 1, 62 | "gelu_latency" : 1, 63 | "div_latency" : 1, 64 | "add_tree_latency" : 1, 65 | "scalar_sqrt_latency" : 1, 66 | "scalar_add_latency" : 1, 67 | "scalar_mul_latency" : 1 68 | }, 69 | "core_3" : { 70 | "core_type" : "systolic_ws", 71 | "core_width" : 128, 72 | "core_height" : 128, 73 | 74 | "spad_size" : 32768, 75 | "accum_spad_size" : 4096, 76 | "sram_width" : 32, 77 | 78 | "vector_process_bit" : 65536, 79 | "add_latency" : 1, 80 | "mul_latency" : 1, 81 | "mac_latency" : 1, 82 | "exp_latency" : 1, 83 | "gelu_latency" : 1, 84 | "div_latency" : 1, 85 | "add_tree_latency" : 1, 86 | "scalar_sqrt_latency" : 1, 87 | "scalar_add_latency" : 1, 88 | "scalar_mul_latency" : 1 89 | } 90 | }, 91 | "dram_type" : "ramulator2", 92 | "dram_freq" :1200, 93 | "dram_channels": 16, 94 | "dram_req_size": 32, 95 | "dram_latency" : 10, 96 | "dram_size" : 16, 97 | 
"dram_nbl" : 1, 98 | "dram_print_interval": 9600, 99 | "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml", 100 | 101 | "icnt_type" : "simple", 102 | "icnt_latency" : 1, 103 | "icnt_freq" : 8000, 104 | "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", 105 | 106 | "precision" : 2, 107 | "layout" : "NHWC", 108 | "scheduler" : "simple" 109 | } -------------------------------------------------------------------------------- /configs/systolic_ws_128x128_c4_simple_noc_tpuv4_half_ramulator2.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_cores" : 4, 3 | "core_freq" : 1000, 4 | "core_print_interval" : 8000, 5 | "core_config" : { 6 | "core_0" : { 7 | "core_type" : "systolic_ws", 8 | "core_width" : 128, 9 | "core_height" : 128, 10 | 11 | "spad_size" : 16384, 12 | "accum_spad_size" : 4096, 13 | "sram_width" : 32, 14 | 15 | "vector_process_bit" : 16384, 16 | "add_latency" : 1, 17 | "mul_latency" : 1, 18 | "mac_latency" : 1, 19 | "exp_latency" : 1, 20 | "gelu_latency" : 1, 21 | "div_latency" : 1, 22 | "add_tree_latency" : 1, 23 | "scalar_sqrt_latency" : 1, 24 | "scalar_add_latency" : 1, 25 | "scalar_mul_latency" : 1 26 | }, 27 | "core_1" : { 28 | "core_type" : "systolic_ws", 29 | "core_width" : 128, 30 | "core_height" : 128, 31 | 32 | "spad_size" : 16384, 33 | "accum_spad_size" : 4096, 34 | "sram_width" : 32, 35 | 36 | "vector_process_bit" : 16384, 37 | "add_latency" : 1, 38 | "mul_latency" : 1, 39 | "mac_latency" : 1, 40 | "exp_latency" : 1, 41 | "gelu_latency" : 1, 42 | "div_latency" : 1, 43 | "add_tree_latency" : 1, 44 | "scalar_sqrt_latency" : 1, 45 | "scalar_add_latency" : 1, 46 | "scalar_mul_latency" : 1 47 | }, 48 | "core_2" : { 49 | "core_type" : "systolic_ws", 50 | "core_width" : 128, 51 | "core_height" : 128, 52 | 53 | "spad_size" : 16384, 54 | "accum_spad_size" : 4096, 55 | "sram_width" : 32, 56 | 57 | "vector_process_bit" : 16384, 58 | "add_latency" : 1, 59 | "mul_latency" : 1, 60 | "mac_latency" : 1, 61 | "exp_latency" : 1, 62 | "gelu_latency" : 1, 63 | "div_latency" : 1, 64 | "add_tree_latency" : 1, 65 | "scalar_sqrt_latency" : 1, 66 | "scalar_add_latency" : 1, 67 | "scalar_mul_latency" : 1 68 | }, 69 | "core_3" : { 70 | "core_type" : "systolic_ws", 71 | "core_width" : 128, 72 | "core_height" : 128, 73 | 74 | "spad_size" : 16384, 75 | "accum_spad_size" : 4096, 76 | "sram_width" : 32, 77 | 78 | "vector_process_bit" : 16384, 79 | "add_latency" : 1, 80 | "mul_latency" : 1, 81 | "mac_latency" : 1, 82 | "exp_latency" : 1, 83 | "gelu_latency" : 1, 84 | "div_latency" : 1, 85 | "add_tree_latency" : 1, 86 | "scalar_sqrt_latency" : 1, 87 | "scalar_add_latency" : 1, 88 | "scalar_mul_latency" : 1 89 | } 90 | }, 91 | 92 | "dram_type" : "ramulator2", 93 | "dram_freq" :1200, 94 | "dram_channels": 16, 95 | "dram_req_size": 32, 96 | "dram_latency" : 10, 97 | "dram_size" : 16, 98 | "dram_nbl" : 1, 99 | "dram_print_interval": 10000, 100 | "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml", 101 | 102 | "icnt_type" : "simple", 103 | "icnt_latency" : 1, 104 | "icnt_freq" : 8400, 105 | "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", 106 | 107 | "precision" : 2, 108 | "layout" : "NHWC", 109 | "scheduler" : "simple" 110 | } -------------------------------------------------------------------------------- /configs/systolic_ws_8x8_c1_booksim2_transformer.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_cores" : 1, 3 | "core_freq" : 1000, 4 | 
"core_print_interval" : 10000, 5 | "core_config" : { 6 | "core_0": { 7 | "core_type" : "systolic_ws", 8 | "core_width" : 8, 9 | "core_height" : 8, 10 | 11 | "spad_size" : 64, 12 | "accum_spad_size" : 16, 13 | "sram_width" : 32, 14 | 15 | "vector_process_bit" : 2048, 16 | "add_latency" : 1, 17 | "mul_latency" : 1, 18 | "mac_latency" : 1, 19 | "exp_latency" : 1, 20 | "gelu_latency" : 1, 21 | "div_latency" : 1, 22 | "add_tree_latency" : 1, 23 | "scalar_sqrt_latency" : 1, 24 | "scalar_add_latency" : 1, 25 | "scalar_mul_latency" : 1 26 | } 27 | }, 28 | 29 | "dram_type" : "ramulator2", 30 | "dram_freq" : 400, 31 | "dram_channels": 2, 32 | "dram_req_size": 32, 33 | "dram_latency" : 10, 34 | "dram_print_interval": 10000, 35 | "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", 36 | 37 | "icnt_type" : "booksim2", 38 | "icnt_latency" : 1, 39 | "icnt_freq" : 2000, 40 | "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m2.icnt", 41 | 42 | "precision" : 2, 43 | "layout" : "NHWC", 44 | "scheduler" : "simple" 45 | } -------------------------------------------------------------------------------- /configs/systolic_ws_8x8_c1_simple_noc_transformer.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_cores" : 1, 3 | "core_freq" : 1000, 4 | "core_print_interval" : 10000, 5 | "core_config" : { 6 | "core_0": { 7 | "core_type" : "systolic_ws", 8 | "core_width" : 8, 9 | "core_height" : 8, 10 | 11 | "spad_size" : 64, 12 | "accum_spad_size" : 16, 13 | "sram_width" : 32, 14 | 15 | "vector_process_bit" : 2048, 16 | "add_latency" : 1, 17 | "mul_latency" : 1, 18 | "mac_latency" : 1, 19 | "exp_latency" : 1, 20 | "gelu_latency" : 1, 21 | "div_latency" : 1, 22 | "add_tree_latency" : 1, 23 | "scalar_sqrt_latency" : 1, 24 | "scalar_add_latency" : 1, 25 | "scalar_mul_latency" : 1 26 | } 27 | }, 28 | 29 | "dram_type" : "ramulator2", 30 | "dram_freq" : 400, 31 | "dram_channels": 2, 32 | "dram_req_size": 32, 33 | "dram_latency" : 10, 34 | "dram_print_interval": 160000, 35 | "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", 36 | 37 | "icnt_type" : "simple", 38 | "icnt_latency" : 1, 39 | "icnt_freq" : 2000, 40 | "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m2.icnt", 41 | 42 | "precision" : 1, 43 | "layout" : "NHWC", 44 | "scheduler" : "simple" 45 | } -------------------------------------------------------------------------------- /configs/systolic_ws_8x8_c4_booksim2_transformer.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_cores" : 4, 3 | "core_freq" : 1000, 4 | "core_print_interval" : 10000, 5 | "core_config" : { 6 | "core_0": { 7 | "core_type" : "systolic_ws", 8 | "core_width" : 8, 9 | "core_height" : 8, 10 | 11 | "spad_size" : 64, 12 | "accum_spad_size" : 16, 13 | "sram_width" : 32, 14 | 15 | "vector_process_bit" : 2048, 16 | "add_latency" : 1, 17 | "mul_latency" : 1, 18 | "mac_latency" : 1, 19 | "exp_latency" : 1, 20 | "gelu_latency" : 1, 21 | "div_latency" : 1, 22 | "add_tree_latency" : 1, 23 | "scalar_sqrt_latency" : 1, 24 | "scalar_add_latency" : 1, 25 | "scalar_mul_latency" : 1 26 | }, 27 | "core_1": { 28 | "core_type" : "systolic_ws", 29 | "core_width" : 8, 30 | "core_height" : 8, 31 | 32 | "spad_size" : 64, 33 | "accum_spad_size" : 16, 34 | "sram_width" : 32, 35 | 36 | "vector_process_bit" : 2048, 37 | "add_latency" : 1, 38 | "mul_latency" : 1, 39 | "mac_latency" : 1, 40 | "exp_latency" : 1, 41 | "gelu_latency" : 1, 42 | "div_latency" : 1, 43 | "add_tree_latency" : 1, 44 
| "scalar_sqrt_latency" : 1, 45 | "scalar_add_latency" : 1, 46 | "scalar_mul_latency" : 1 47 | }, 48 | "core_2": { 49 | "core_type" : "systolic_ws", 50 | "core_width" : 8, 51 | "core_height" : 8, 52 | 53 | "spad_size" : 64, 54 | "accum_spad_size" : 16, 55 | "sram_width" : 32, 56 | 57 | "vector_process_bit" : 2048, 58 | "add_latency" : 1, 59 | "mul_latency" : 1, 60 | "mac_latency" : 1, 61 | "exp_latency" : 1, 62 | "gelu_latency" : 1, 63 | "div_latency" : 1, 64 | "add_tree_latency" : 1, 65 | "scalar_sqrt_latency" : 1, 66 | "scalar_add_latency" : 1, 67 | "scalar_mul_latency" : 1 68 | }, 69 | "core_3": { 70 | "core_type" : "systolic_ws", 71 | "core_width" : 8, 72 | "core_height" : 8, 73 | 74 | "spad_size" : 64, 75 | "accum_spad_size" : 16, 76 | "sram_width" : 32, 77 | 78 | "vector_process_bit" : 2048, 79 | "add_latency" : 1, 80 | "mul_latency" : 1, 81 | "mac_latency" : 1, 82 | "exp_latency" : 1, 83 | "gelu_latency" : 1, 84 | "div_latency" : 1, 85 | "add_tree_latency" : 1, 86 | "scalar_sqrt_latency" : 1, 87 | "scalar_add_latency" : 1, 88 | "scalar_mul_latency" : 1 89 | } 90 | }, 91 | 92 | "dram_type" : "ramulator2", 93 | "dram_freq" : 400, 94 | "dram_channels": 2, 95 | "dram_req_size": 32, 96 | "dram_latency" : 10, 97 | "dram_print_interval": 10000, 98 | "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", 99 | 100 | "icnt_type" : "booksim2", 101 | "icnt_latency" : 1, 102 | "icnt_freq" : 2000, 103 | "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m2.icnt", 104 | 105 | "precision" : 2, 106 | "layout" : "NHWC", 107 | "scheduler" : "simple" 108 | } -------------------------------------------------------------------------------- /configs/systolic_ws_8x8_c4_simple_noc_transformer.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_cores" : 4, 3 | "core_freq" : 1000, 4 | "core_print_interval" : 10000, 5 | "core_config" : { 6 | "core_0": { 7 | "core_type" : "systolic_ws", 8 | "core_width" : 8, 9 | "core_height" : 8, 10 | 11 | "spad_size" : 64, 12 | "accum_spad_size" : 16, 13 | "sram_width" : 32, 14 | 15 | "vector_process_bit" : 2048, 16 | "add_latency" : 1, 17 | "mul_latency" : 1, 18 | "mac_latency" : 1, 19 | "exp_latency" : 1, 20 | "gelu_latency" : 1, 21 | "div_latency" : 1, 22 | "add_tree_latency" : 1, 23 | "scalar_sqrt_latency" : 1, 24 | "scalar_add_latency" : 1, 25 | "scalar_mul_latency" : 1 26 | }, 27 | "core_1": { 28 | "core_type" : "systolic_ws", 29 | "core_width" : 8, 30 | "core_height" : 8, 31 | 32 | "spad_size" : 64, 33 | "accum_spad_size" : 16, 34 | "sram_width" : 32, 35 | 36 | "vector_process_bit" : 2048, 37 | "add_latency" : 1, 38 | "mul_latency" : 1, 39 | "mac_latency" : 1, 40 | "exp_latency" : 1, 41 | "gelu_latency" : 1, 42 | "div_latency" : 1, 43 | "add_tree_latency" : 1, 44 | "scalar_sqrt_latency" : 1, 45 | "scalar_add_latency" : 1, 46 | "scalar_mul_latency" : 1 47 | }, 48 | "core_2": { 49 | "core_type" : "systolic_ws", 50 | "core_width" : 8, 51 | "core_height" : 8, 52 | 53 | "spad_size" : 64, 54 | "accum_spad_size" : 16, 55 | "sram_width" : 32, 56 | 57 | "vector_process_bit" : 2048, 58 | "add_latency" : 1, 59 | "mul_latency" : 1, 60 | "mac_latency" : 1, 61 | "exp_latency" : 1, 62 | "gelu_latency" : 1, 63 | "div_latency" : 1, 64 | "add_tree_latency" : 1, 65 | "scalar_sqrt_latency" : 1, 66 | "scalar_add_latency" : 1, 67 | "scalar_mul_latency" : 1 68 | }, 69 | "core_3": { 70 | "core_type" : "systolic_ws", 71 | "core_width" : 8, 72 | "core_height" : 8, 73 | 74 | "spad_size" : 64, 75 | "accum_spad_size" : 16, 76 | 
"sram_width" : 32, 77 | 78 | "vector_process_bit" : 2048, 79 | "add_latency" : 1, 80 | "mul_latency" : 1, 81 | "mac_latency" : 1, 82 | "exp_latency" : 1, 83 | "gelu_latency" : 1, 84 | "div_latency" : 1, 85 | "add_tree_latency" : 1, 86 | "scalar_sqrt_latency" : 1, 87 | "scalar_add_latency" : 1, 88 | "scalar_mul_latency" : 1 89 | } 90 | }, 91 | 92 | "dram_type" : "ramulator2", 93 | "dram_freq" : 400, 94 | "dram_channels": 2, 95 | "dram_req_size": 32, 96 | "dram_latency" : 10, 97 | "dram_print_interval": 10000, 98 | "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", 99 | 100 | "icnt_type" : "simple", 101 | "icnt_latency" : 1, 102 | "icnt_freq" : 2000, 103 | "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m2.icnt", 104 | 105 | "precision" : 2, 106 | "layout" : "NHWC", 107 | "scheduler" : "simple" 108 | } -------------------------------------------------------------------------------- /configs/test.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_cores" : 1, 3 | "core_type" : "systolic_ws", 4 | "core_freq" : 1000, 5 | "core_width" : 8, 6 | "core_height" : 8, 7 | 8 | "vector_process_bit" : 32, 9 | 10 | "spad_size" : 448, 11 | "sram_width" : 32, 12 | 13 | "dram_type" : "ramulator", 14 | "dram_freq" : 877, 15 | "dram_channels": 8, 16 | "dram_req_size": 32, 17 | "dram_latency" : 10, 18 | "dram_config_path" : "../configs/ramulator_configs/HBM-config.cfg", 19 | 20 | "icnt_type" : "simple", 21 | "icnt_latency" : 1, 22 | "icnt_freq" : 2000, 23 | "icnt_config_path" : "../configs/booksim2_configs/fly_c64_m8.icnt", 24 | 25 | "precision" : 1, 26 | "layout" : "NHWC", 27 | "scheduler" : "simple" 28 | } -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_os_32x32/README.md: -------------------------------------------------------------------------------- 1 | Simple Output Stationary Architecture 2 | ---------------------------- 3 | This folder contains a simple output stationary architecture. 4 | 5 | Q&As: 6 | ---------------------------- 7 | 1. How long do the Timeloop simulations take? 8 | 9 | Depending on your workload, the simulation takes various amount of time to finish. Generally, they should 10 | converge within 30 mins. You can manually stop the exploration when you see things are converging by 11 | pressing `ctrl + C`. They sometimes will take much longer to 12 | automaticaly stop as we set the converging cretiria to be pretty high to avoid early-stop with subooptimal mappings. Use you own 13 | judgement. 
-------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_os_32x32/arch/components/reg_storage.yaml: -------------------------------------------------------------------------------- 1 | compound_components: 2 | version: 0.3 3 | classes: 4 | - name: reg_storage 5 | attributes: 6 | technology: 45nm 7 | latency: 1ns 8 | width: 16 9 | depth: 1 10 | subcomponents: 11 | - name: storage 12 | class: reg 13 | attributes: 14 | technology: technology 15 | latency: latency 16 | datawidth : width 17 | actions: 18 | - name: access 19 | subcomponents: 20 | - name: storage 21 | actions: 22 | - name: access -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_os_32x32/arch/components/smartbuffer_RF.yaml: -------------------------------------------------------------------------------- 1 | compound_components: 2 | version: 0.3 3 | classes: 4 | - name: smartbuffer_RF 5 | attributes: 6 | technology: 45nm 7 | memory_depth: 12 8 | memory_width: 16 9 | n_rdwr_ports: 2 10 | n_banks: 1 11 | n_buffets: 1 12 | subcomponents: 13 | - name: storage 14 | class: regfile 15 | attributes: 16 | technology: technology 17 | width: memory_width 18 | depth: memory_depth 19 | n_rdwr_ports: n_rdwr_ports 20 | n_banks: n_banks 21 | - name: address_generators[0..1] 22 | class: intadder 23 | attributes: 24 | technology: technology 25 | width: log(memory_depth) 26 | actions: 27 | - name: write 28 | arguments: 29 | data_delta: 0..1 30 | address_delta: 0..n_banks 31 | subcomponents: 32 | - name: storage 33 | actions: 34 | - name: write 35 | arguments: 36 | data_delta: data_delta 37 | address_delta: address_delta 38 | - name: address_generators[0] 39 | actions: 40 | - name: add 41 | - name: address_generators[1] 42 | actions: 43 | - name: idle 44 | - name: read 45 | arguments: 46 | data_delta: 0..1 47 | address_delta: 0..n_banks 48 | subcomponents: 49 | - name: storage 50 | actions: 51 | - name: read 52 | arguments: 53 | data_delta: data_delta 54 | address_delta: address_delta 55 | - name: address_generators[1] 56 | actions: 57 | - name: add 58 | - name: address_generators[0] 59 | actions: 60 | - name: idle 61 | - name: idle 62 | subcomponents: 63 | - name: storage 64 | actions: 65 | - name: idle 66 | - name: address_generators[0..1] 67 | actions: 68 | - name: idle 69 | -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_os_32x32/arch/components/smartbuffer_SRAM.yaml: -------------------------------------------------------------------------------- 1 | compound_components: 2 | version: 0.3 3 | classes: 4 | - name: smartbuffer_SRAM 5 | attributes: 6 | technology: 45nm 7 | memory_depth: 12 8 | memory_width: 16 9 | n_rdwr_ports: 2 10 | n_banks: 1 11 | n_buffets: 1 12 | subcomponents: 13 | - name: storage 14 | class: SRAM 15 | attributes: 16 | technology: technology 17 | width: memory_width 18 | depth: memory_depth 19 | n_rdwr_ports: n_rdwr_ports 20 | n_banks: n_banks 21 | - name: address_generators[0..1] 22 | class: intadder 23 | attributes: 24 | technology: technology 25 | width: log(memory_depth) 26 | actions: 27 | - name: write 28 | arguments: 29 | data_delta: 0..1 30 | address_delta: 0..n_banks 31 | subcomponents: 32 | - name: storage 33 | actions: 34 | - name: write 35 | arguments: 36 | data_delta: data_delta 37 | address_delta: address_delta 38 | - name: address_generators[0] 39 | actions: 40 | - name: count 41 | - name: address_generators[1] 42 | 
actions: 43 | - name: idle 44 | - name: read 45 | arguments: 46 | data_delta: 0..1 47 | address_delta: 0..n_banks 48 | subcomponents: 49 | - name: storage 50 | actions: 51 | - name: read 52 | arguments: 53 | data_delta: data_delta 54 | address_delta: address_delta 55 | - name: address_generators[1] 56 | actions: 57 | - name: add 58 | - name: address_generators[0] 59 | actions: 60 | - name: idle 61 | - name: idle 62 | subcomponents: 63 | - name: storage 64 | actions: 65 | - name: idle 66 | - name: address_generators[0..1] 67 | actions: 68 | - name: idle 69 | -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_os_32x32/arch/simple_output_stationary.yaml: -------------------------------------------------------------------------------- 1 | architecture: 2 | # ============================================================ 3 | # Architecture Description 4 | # ============================================================ 5 | version: 0.3 6 | subtree: 7 | - name: system 8 | local: 9 | - name: DRAM 10 | class: DRAM 11 | attributes: 12 | type: HBM2 13 | width: 32 14 | block-size: 2 15 | word-bits: 16 16 | subtree: 17 | - name: simple_ws[0] 18 | attributes: 19 | technology: 45nm 20 | local: 21 | - name: shared_glb 22 | class: smartbuffer_SRAM 23 | attributes: 24 | memory_depth: 458752 25 | memory_width: 32 26 | n_banks: 64 27 | block-size: 2 28 | word-bits: 16 29 | read_bandwidth: 32 30 | write_bandwidth: 32 31 | meshX: 1 32 | subtree: 33 | - name: PE[0..1023] 34 | attributes: 35 | meshX: 1 36 | local: 37 | - name: pe_spad 38 | class: smartbuffer_SRAM 39 | attributes: 40 | memory_depth: 1 41 | memory_width: 16 42 | block-size: 1 43 | word-bits: 16 44 | meshX: 32 45 | - name: mac 46 | class: intmac 47 | attributes: 48 | datawidth: 16 49 | meshX : 32 50 | # input and output registers for the mac unit 51 | - name: weight_reg 52 | class: reg_storage 53 | attributes: 54 | depth: 16 55 | width: 16 # width in bits 56 | meshX: 32 57 | - name: input_activation_reg 58 | class: reg_storage 59 | attributes: 60 | depth: 1 61 | width: 16 # width in bits 62 | meshX: 32 63 | - name: output_activation_reg 64 | class: reg_storage 65 | attributes: 66 | depth: 1 67 | width: 16 # width in bits 68 | meshX: 32 69 | -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_os_32x32/arch/simple_output_stationary.yaml.tmp: -------------------------------------------------------------------------------- 1 | architecture: 2 | # ============================================================ 3 | # Architecture Description 4 | # ============================================================ 5 | version: 0.3 6 | subtree: 7 | - name: system 8 | local: 9 | - name: DRAM 10 | class: DRAM 11 | attributes: 12 | type: LPDDR4 13 | width: 32 14 | block-size: 2 15 | word-bits: 16 16 | subtree: 17 | - name: simple_ws 18 | attributes: 19 | technology: 45nm 20 | local: 21 | - name: shared_glb 22 | class: smartbuffer_SRAM 23 | attributes: 24 | memory_depth: 458752 25 | memory_width: 32 26 | n_banks: 64 27 | block-size: 2 28 | word-bits: 16 29 | read_bandwidth: 32 30 | write_bandwidth: 32 31 | subtree: 32 | - name: PE[0..1023] 33 | local: 34 | - name: pe_spad 35 | class: smartbuffer_SRAM 36 | attributes: 37 | memory_depth: 1 38 | memory_width: 16 39 | block-size: 1 40 | word-bits: 16 41 | meshX: 32 42 | - name: mac 43 | class: intmac 44 | attributes: 45 | datawidth: 16 46 | meshX : 32 47 | # input and output registers for the mac 
unit 48 | - name: weight_reg 49 | class: reg_storage 50 | attributes: 51 | depth: 16 52 | width: 16 # width in bits 53 | meshX: 32 54 | - name: input_activation_reg 55 | class: reg_storage 56 | attributes: 57 | depth: 1 58 | width: 16 # width in bits 59 | meshX: 32 60 | - name: output_activation_reg 61 | class: reg_storage 62 | attributes: 63 | depth: 1 64 | width: 16 # width in bits 65 | meshX: 32 66 | -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_os_32x32/constraints/simple_output_stationary_arch_constraints.yaml: -------------------------------------------------------------------------------- 1 | # 2 | # The following constraints are limitations of the hardware architecture and dataflow 3 | # 4 | 5 | architecture_constraints: 6 | targets: 7 | - target: DRAM 8 | type: temporal 9 | permutation: CPQ 10 | # pe spad only stored outputs 11 | - target: pe_spad 12 | type: bypass 13 | bypass: [Inputs, Weights] 14 | keep: [Outputs] 15 | # pe spad keeps outputs stationary 16 | - target: pe_spad 17 | type: temporal 18 | permutation: CRSPQ 19 | # NoC sending C in x direction, M in y direction; parallel-for loops for C and M only 20 | - target: shared_glb 21 | type: bypass 22 | bypass: [Outputs] 23 | keep: [Inputs, Weights] 24 | - target: shared_glb 25 | type: spatial 26 | permutation: MPQ 27 | split: 1 28 | factors: R=1 S=1 C=1 29 | # enforce the registers to only store 1 data of the datatype it stores 30 | - target: weight_reg 31 | type: temporal 32 | factors: R=1 S=1 P=1 Q=1 C=16 33 | - target: weight_reg 34 | type: bypass 35 | keep: [Weights] 36 | bypass: [Inputs, Outputs] 37 | - target: input_activation_reg 38 | type: temporal 39 | factors: P=1 Q=1 C=1 N=1 40 | - target: input_activation_reg 41 | type: bypass 42 | keep: [Inputs] 43 | bypass: [Outputs, Weights] 44 | - target: output_activation_reg 45 | type: temporal 46 | factors: P=1 Q=1 M=1 N=1 47 | - target: output_activation_reg 48 | type: bypass 49 | keep: [Outputs] 50 | bypass: [Inputs, Weights] 51 | -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_os_32x32/constraints/simple_output_stationary_map_constraints.yaml: -------------------------------------------------------------------------------- 1 | # 2 | # The following constraints are not limitations of the hardware architecture and dataflow, 3 | # but help limit the search space to speed up search 4 | # 5 | 6 | mapspace_constraints: 7 | targets: 8 | # intuitive optimization to not tile R and S at the GLB level 9 | - target: shared_glb 10 | type: temporal 11 | factors: R=1 S=1 12 | # intuitive optimization according to architecture dimensions 13 | #- target: shared_glb 14 | #type: spatial 15 | # factors: M=16 N=16 16 | # intuitive optimization to not tile R and S at the DRAM level 17 | - target: DRAM 18 | type: temporal 19 | factors: R=1 S=1 20 | # optimization to constrain the amplification factor of R and S to only one register 21 | - target: output_activation_reg 22 | type: temporal 23 | factors: R=1 S=1 24 | -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_os_32x32/mapper/mapper.yaml: -------------------------------------------------------------------------------- 1 | mapper: 2 | optimization-metrics: [ delay, energy ] 3 | live-status: False 4 | num-threads: 8 5 | timeout: 15000 6 | victory-condition: 3000 7 | algorithm: random-pruned 8 | max-permutations-per-if-visit: 16 
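As a concrete follow-up to the README's note on running Timeloop, here is a minimal, hedged sketch of a manual mapper invocation that uses the architecture, constraint, and mapper files in this systolic_os_32x32 folder. It assumes `timeloop-mapper` (with Accelergy) is installed and on your PATH; `my_layer.yaml` is a hypothetical placeholder for a workload description, since no problem file ships in this folder, and the repository's `scripts/run_timeloop.sh` is presumably the supported driver for full runs.

```sh
# Hedged sketch, not the project's official flow: my_layer.yaml is a placeholder
# workload description that you must generate or supply yourself.
cd configs/timeloop_configs/systolic_os_32x32
timeloop-mapper \
    arch/simple_output_stationary.yaml \
    arch/components/*.yaml \
    constraints/*.yaml \
    mapper/mapper.yaml \
    my_layer.yaml
# Outputs (timeloop-mapper.stats.txt, timeloop-mapper.map.txt, ...) are written to
# the working directory, as in the example_AlexNet_layer1_outputs folder under
# systolic_ws_8x8.
```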
-------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_ws_8x8/README.md: -------------------------------------------------------------------------------- 1 | Simple Weight Stationary Architecture 2 | ---------------------------- 3 | This folder contains a simple weight stationary architecture. 4 | 5 | Q&As: 6 | ---------------------------- 7 | 1. How long do the Timeloop simulations take? 8 | 9 | Depending on your workload, the simulations take varying amounts of time to finish. Generally, they should 10 | converge within 30 minutes. You can manually stop the exploration when you see things converging by 11 | pressing `ctrl + C`. They sometimes take much longer to 12 | stop automatically because the convergence criteria are set fairly high to avoid stopping early with suboptimal mappings. Use your own 13 | judgement. 14 | 15 | 2. How do I get started using the architecture skeleton to model architectures with advanced technologies? 16 | 17 | You generally need to modify the definitions of the compound components. If needed, you will also likely need to 18 | update the architecture description to include the additional setup for your architecture. 19 | 20 | An example design for a compute-in-memory architecture using ReRAM can be found 21 | [here](https://github.com/Accelergy-Project/processing-in-memory-design) 22 | -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_ws_8x8/arch/components/reg_storage.yaml: -------------------------------------------------------------------------------- 1 | compound_components: 2 | version: 0.3 3 | classes: 4 | - name: reg_storage 5 | attributes: 6 | technology: 45nm 7 | latency: 1ns 8 | width: 16 9 | depth: 1 10 | subcomponents: 11 | - name: storage 12 | class: reg 13 | attributes: 14 | technology: technology 15 | latency: latency 16 | datawidth : width 17 | actions: 18 | - name: access 19 | subcomponents: 20 | - name: storage 21 | actions: 22 | - name: access -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_ws_8x8/arch/components/smartbuffer_RF.yaml: -------------------------------------------------------------------------------- 1 | compound_components: 2 | version: 0.3 3 | classes: 4 | - name: smartbuffer_RF 5 | attributes: 6 | technology: 45nm 7 | memory_depth: 12 8 | memory_width: 16 9 | n_rdwr_ports: 2 10 | n_banks: 1 11 | n_buffets: 1 12 | subcomponents: 13 | - name: storage 14 | class: regfile 15 | attributes: 16 | technology: technology 17 | width: memory_width 18 | depth: memory_depth 19 | n_rdwr_ports: n_rdwr_ports 20 | n_banks: n_banks 21 | - name: address_generators[0..1] 22 | class: intadder 23 | attributes: 24 | technology: technology 25 | width: log(memory_depth) 26 | actions: 27 | - name: write 28 | arguments: 29 | data_delta: 0..1 30 | address_delta: 0..n_banks 31 | subcomponents: 32 | - name: storage 33 | actions: 34 | - name: write 35 | arguments: 36 | data_delta: data_delta 37 | address_delta: address_delta 38 | - name: address_generators[0] 39 | actions: 40 | - name: add 41 | - name: address_generators[1] 42 | actions: 43 | - name: idle 44 | - name: read 45 | arguments: 46 | data_delta: 0..1 47 | address_delta: 0..n_banks 48 | subcomponents: 49 | - name: storage 50 | actions: 51 | - name: read 52 | arguments: 53 | data_delta: data_delta 54 | address_delta: address_delta 55 | - name: address_generators[1] 56 | actions: 57 | - name: add 58 |
- name: address_generators[0] 59 | actions: 60 | - name: idle 61 | - name: idle 62 | subcomponents: 63 | - name: storage 64 | actions: 65 | - name: idle 66 | - name: address_generators[0..1] 67 | actions: 68 | - name: idle 69 | -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_ws_8x8/arch/components/smartbuffer_SRAM.yaml: -------------------------------------------------------------------------------- 1 | compound_components: 2 | version: 0.3 3 | classes: 4 | - name: smartbuffer_SRAM 5 | attributes: 6 | technology: 45nm 7 | memory_depth: 12 8 | memory_width: 16 9 | n_rdwr_ports: 2 10 | n_banks: 1 11 | n_buffets: 1 12 | subcomponents: 13 | - name: storage 14 | class: SRAM 15 | attributes: 16 | technology: technology 17 | width: memory_width 18 | depth: memory_depth 19 | n_rdwr_ports: n_rdwr_ports 20 | n_banks: n_banks 21 | - name: address_generators[0..1] 22 | class: intadder 23 | attributes: 24 | technology: technology 25 | width: log(memory_depth) 26 | actions: 27 | - name: write 28 | arguments: 29 | data_delta: 0..1 30 | address_delta: 0..n_banks 31 | subcomponents: 32 | - name: storage 33 | actions: 34 | - name: write 35 | arguments: 36 | data_delta: data_delta 37 | address_delta: address_delta 38 | - name: address_generators[0] 39 | actions: 40 | - name: count 41 | - name: address_generators[1] 42 | actions: 43 | - name: idle 44 | - name: read 45 | arguments: 46 | data_delta: 0..1 47 | address_delta: 0..n_banks 48 | subcomponents: 49 | - name: storage 50 | actions: 51 | - name: read 52 | arguments: 53 | data_delta: data_delta 54 | address_delta: address_delta 55 | - name: address_generators[1] 56 | actions: 57 | - name: add 58 | - name: address_generators[0] 59 | actions: 60 | - name: idle 61 | - name: idle 62 | subcomponents: 63 | - name: storage 64 | actions: 65 | - name: idle 66 | - name: address_generators[0..1] 67 | actions: 68 | - name: idle 69 | -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_ws_8x8/arch/simple_weight_stationary.yaml: -------------------------------------------------------------------------------- 1 | architecture: 2 | # ============================================================ 3 | # Architecture Description 4 | # ============================================================ 5 | version: 0.3 6 | subtree: 7 | - name: system 8 | local: 9 | - name: DRAM 10 | class: DRAM 11 | attributes: 12 | type: HBM2 13 | width: 256 14 | block-size: 8 15 | word-bits: 32 16 | subtree: 17 | - name: simple_ws 18 | attributes: 19 | technology: 45nm 20 | local: 21 | - name: accum_spad 22 | class: smartbuffer_SRAM 23 | attributes: 24 | memory_depth: 2048 25 | memory_width: 256 26 | n_banks: 2 27 | block-size: 8 28 | word-bits: 32 29 | read_bandwidth: 8 30 | write_bandwidth: 8 31 | multiple-buffering: 2 32 | - name: spad 33 | class: smartbuffer_SRAM 34 | attributes: 35 | memory_depth: 8192 36 | memory_width: 256 37 | n_banks: 4 38 | block-size: 8 39 | word-bits: 32 40 | read_bandwidth: 8 41 | write_bandwidth: 8 42 | multiple-buffering: 2 43 | subtree: 44 | - name: PE[0..63] 45 | local: 46 | - name: pe_spad 47 | class: smartbuffer_RF 48 | attributes: 49 | # memory_depth: 192 50 | memory_depth: 1 51 | memory_width: 32 52 | block-size: 1 53 | word-bits: 32 54 | meshX: 8 55 | - name: mac 56 | class: intmac 57 | attributes: 58 | datawidth: 32 59 | meshX : 8 60 | # input and output registers for the mac unit 61 | - name: weight_reg 62 | class: reg_storage 63 
| attributes: 64 | depth: 1 65 | width: 32 # width in bits 66 | meshX: 8 67 | - name: input_activation_reg 68 | class: reg_storage 69 | attributes: 70 | depth: 1 71 | width: 32 # width in bits 72 | meshX: 8 73 | - name: output_activation_reg 74 | class: reg_storage 75 | attributes: 76 | depth: 1 77 | width: 32 # width in bits 78 | meshX: 8 79 | -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_ws_8x8/constraints/simple_weight_stationary_arch_constraints.yaml: -------------------------------------------------------------------------------- 1 | # 2 | # The following constraints are limitations of the hardware architecture and dataflow 3 | # 4 | 5 | architecture_constraints: 6 | targets: 7 | # pe spad only stored weights 8 | - target: pe_spad 9 | type: bypass 10 | bypass: [Inputs, Outputs] 11 | keep: [Weights] 12 | # pe spad keeps weights stationary 13 | - target: pe_spad 14 | type: temporal 15 | permutation: PQCRS 16 | # NoC sending C in x direction, M in y direction; parallel-for loops for C and M only 17 | - target: spad 18 | type: spatial 19 | permutation: MCRS 20 | split: 1 21 | factors: P=1 Q=1 22 | # enforce the registers to only store 1 data of the datatype it stores 23 | - target: weight_reg 24 | type: temporal 25 | factors: R=1 S=1 M=1 C=1 26 | - target: weight_reg 27 | type: bypass 28 | keep: [Weights] 29 | bypass: [Inputs, Outputs] 30 | - target: input_activation_reg 31 | type: temporal 32 | factors: P=1 Q=1 C=1 N=1 33 | - target: input_activation_reg 34 | type: bypass 35 | keep: [Inputs] 36 | bypass: [Outputs, Weights] 37 | - target: output_activation_reg 38 | type: temporal 39 | factors: P=1 Q=1 M=1 N=1 40 | - target: output_activation_reg 41 | type: bypass 42 | keep: [Outputs] 43 | bypass: [Inputs, Weights] 44 | - target: spad 45 | type: bypass 46 | keep: [Inputs, Weights] 47 | bypass: [Outputs] 48 | - target: accum_spad 49 | type: bypass 50 | keep: [Outputs] 51 | bypass: [Inputs, Weights] 52 | -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_ws_8x8/constraints/simple_weight_stationary_map_constraints.yaml: -------------------------------------------------------------------------------- 1 | # 2 | # The following constraints are not limitations of the hardware architecture and dataflow, 3 | # but help limit the search space to speed up search 4 | # 5 | 6 | mapspace_constraints: 7 | targets: 8 | # intuitive optimization to not tile R and S at the GLB level 9 | # - target: shared_glb 10 | # type: temporal 11 | # factors: R=1 S=1 12 | # intuitive optimization according to architecture dimensions 13 | - target: spad 14 | type: spatial 15 | factors: M=8 C=8 16 | # intuitive optimization to not tile R and S at the DRAM level 17 | - target: DRAM 18 | type: temporal 19 | factors: R=1 S=1 20 | # optimization to constrain the amplification factor of R and S to only one register 21 | - target: output_activation_reg 22 | type: temporal 23 | factors: R=1 S=1 24 | -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_ws_8x8/example_AlexNet_layer1_outputs/timeloop-mapper.ART.yaml: -------------------------------------------------------------------------------- 1 | ART: 2 | version: 0.3 3 | tables: 4 | - name: system.simple_ws.PE[0..255].mac 5 | area: 1239.5 6 | - name: system.DRAM 7 | area: 1 8 | - name: system.simple_ws.PE[0..255].pe_spad 9 | area: 3634.68 10 | - name: 
system.simple_ws.PE[0..255].weight_reg 11 | area: 5.98 12 | - name: system.simple_ws.PE[0..255].input_activation_reg 13 | area: 5.98 14 | - name: system.simple_ws.PE[0..255].output_activation_reg 15 | area: 5.98 16 | - name: system.simple_ws.shared_glb 17 | area: 1162549.0 18 | -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_ws_8x8/example_AlexNet_layer1_outputs/timeloop-mapper.ART_summary.yaml: -------------------------------------------------------------------------------- 1 | ART_summary: 2 | version: 0.3 3 | table_summary: 4 | - name: system.simple_ws.PE[0..255].mac 5 | area: 1239.5 6 | primitive_estimations: Aladdin_table 7 | - name: system.DRAM 8 | area: 1 9 | primitive_estimations: dummy_table 10 | - name: system.simple_ws.PE[0..255].pe_spad 11 | area: 3634.68 12 | primitive_estimations: 13 | - name: storage 14 | estimator: Aladdin_table 15 | - name: address_generators[0..1] 16 | estimator: Aladdin_table 17 | - name: system.simple_ws.PE[0..255].weight_reg 18 | area: 5.98 19 | primitive_estimations: 20 | - name: storage 21 | estimator: Aladdin_table 22 | - name: system.simple_ws.PE[0..255].input_activation_reg 23 | area: 5.98 24 | primitive_estimations: 25 | - name: storage 26 | estimator: Aladdin_table 27 | - name: system.simple_ws.PE[0..255].output_activation_reg 28 | area: 5.98 29 | primitive_estimations: 30 | - name: storage 31 | estimator: Aladdin_table 32 | - name: system.simple_ws.shared_glb 33 | area: 1162549.0 34 | primitive_estimations: 35 | - name: storage 36 | estimator: Cacti 37 | - name: address_generators[0..1] 38 | estimator: Aladdin_table 39 | -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_ws_8x8/example_AlexNet_layer1_outputs/timeloop-mapper.ERT_summary.yaml: -------------------------------------------------------------------------------- 1 | ERT_summary: 2 | version: 0.3 3 | table_summary: 4 | - name: system.simple_ws.PE[0..255].mac 5 | actions: 6 | - name: mac_random 7 | energy: 2.2 8 | - name: mac_reused 9 | energy: 1.877 10 | - name: mac_gated 11 | energy: 0.103 12 | - name: idle 13 | energy: 0.066 14 | primitive_estimation(s): 15 | - name: system.simple_ws.PE[0..255].mac 16 | estimator: Aladdin_table 17 | - name: system.DRAM 18 | actions: 19 | - name: read 20 | energy: 512 21 | - name: write 22 | energy: 512 23 | - name: idle 24 | energy: 0 25 | primitive_estimation(s): 26 | - name: system.DRAM 27 | estimator: Cacti 28 | - name: system.simple_ws.PE[0..255].pe_spad 29 | actions: 30 | - name: write 31 | average_energy: 0.824 32 | max_energy: 1.586 33 | min_energy: 0.061 34 | - name: read 35 | average_energy: 0.824 36 | max_energy: 1.586 37 | min_energy: 0.061 38 | - name: idle 39 | energy: 0.024 40 | primitive_estimation(s): 41 | - name: storage 42 | estimator: Aladdin_table 43 | - name: address_generators[0] 44 | estimator: Aladdin_table 45 | - name: address_generators[1] 46 | estimator: Aladdin_table 47 | - name: address_generators[0..1] 48 | estimator: Aladdin_table 49 | - name: system.simple_ws.PE[0..255].weight_reg 50 | actions: 51 | - name: access 52 | energy: 0.009 53 | primitive_estimation(s): 54 | - name: storage 55 | estimator: Aladdin_table 56 | - name: system.simple_ws.PE[0..255].input_activation_reg 57 | actions: 58 | - name: access 59 | energy: 0.009 60 | primitive_estimation(s): 61 | - name: storage 62 | estimator: Aladdin_table 63 | - name: system.simple_ws.PE[0..255].output_activation_reg 64 | actions: 65 | - 
name: access 66 | energy: 0.009 67 | primitive_estimation(s): 68 | - name: storage 69 | estimator: Aladdin_table 70 | - name: system.simple_ws.shared_glb 71 | actions: 72 | - name: write 73 | average_energy: 37.635 74 | max_energy: 75.215 75 | min_energy: 0.055 76 | - name: read 77 | average_energy: 37.1 78 | max_energy: 74.144 79 | min_energy: 0.055 80 | - name: idle 81 | energy: 0.018 82 | primitive_estimation(s): 83 | - name: storage 84 | estimator: Cacti 85 | - name: address_generators[0] 86 | estimator: Aladdin_table 87 | - name: address_generators[1] 88 | estimator: Aladdin_table 89 | - name: address_generators[0..1] 90 | estimator: Aladdin_table 91 | -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_ws_8x8/example_AlexNet_layer1_outputs/timeloop-mapper.defined_input_architecture.yaml: -------------------------------------------------------------------------------- 1 | architecture: 2 | version: 0.3 3 | subtree: 4 | - name: system 5 | local: 6 | - name: DRAM 7 | class: DRAM 8 | attributes: 9 | block-size: 4 10 | technology: 65nm 11 | type: LPDDR4 12 | width: 64 13 | word-bits: 16 14 | subtree: 15 | - name: simple_ws 16 | attributes: 17 | technology: 45nm 18 | local: 19 | - name: shared_glb 20 | class: smartbuffer_SRAM 21 | attributes: 22 | block-size: 4 23 | memory_depth: 16384 24 | memory_width: 64 25 | n_banks: 32 26 | n_buffets: 1 27 | n_rdwr_ports: 2 28 | read_bandwidth: 16 29 | technology: 45nm 30 | word-bits: 16 31 | write_bandwidth: 16 32 | subtree: 33 | - name: PE[0..255] 34 | local: 35 | - name: pe_spad 36 | class: smartbuffer_RF 37 | attributes: 38 | block-size: 1 39 | memory_depth: 192 40 | memory_width: 16 41 | meshX: 16 42 | n_banks: 1 43 | n_buffets: 1 44 | n_rdwr_ports: 2 45 | technology: 45nm 46 | word-bits: 16 47 | - name: mac 48 | class: intmac 49 | attributes: 50 | datawidth: 16 51 | latency: 5ns 52 | meshX: 16 53 | num_pipeline_stages: 2 54 | technology: 45nm 55 | - name: weight_reg 56 | class: reg_storage 57 | attributes: 58 | depth: 1 59 | latency: 1ns 60 | meshX: 16 61 | technology: 45nm 62 | width: 16 63 | - name: input_activation_reg 64 | class: reg_storage 65 | attributes: 66 | depth: 1 67 | latency: 1ns 68 | meshX: 16 69 | technology: 45nm 70 | width: 16 71 | - name: output_activation_reg 72 | class: reg_storage 73 | attributes: 74 | depth: 1 75 | latency: 1ns 76 | meshX: 16 77 | technology: 45nm 78 | width: 16 79 | -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_ws_8x8/example_AlexNet_layer1_outputs/timeloop-mapper.flattened_architecture.yaml: -------------------------------------------------------------------------------- 1 | architecture: 2 | version: 0.3 3 | local: 4 | - name: system.simple_ws.PE[0..255].pe_spad 5 | class: smartbuffer_RF 6 | attributes: 7 | block-size: 1 8 | memory_depth: 192 9 | memory_width: 16 10 | meshX: 16 11 | n_banks: 1 12 | n_buffets: 1 13 | n_rdwr_ports: 2 14 | technology: 45nm 15 | word-bits: 16 16 | - name: system.simple_ws.PE[0..255].mac 17 | class: intmac 18 | attributes: 19 | datawidth: 16 20 | latency: 5ns 21 | meshX: 16 22 | num_pipeline_stages: 2 23 | technology: 45nm 24 | - name: system.simple_ws.PE[0..255].weight_reg 25 | class: reg_storage 26 | attributes: 27 | depth: 1 28 | latency: 1ns 29 | meshX: 16 30 | technology: 45nm 31 | width: 16 32 | - name: system.simple_ws.PE[0..255].input_activation_reg 33 | class: reg_storage 34 | attributes: 35 | depth: 1 36 | latency: 1ns 37 | meshX: 16 
38 | technology: 45nm 39 | width: 16 40 | - name: system.simple_ws.PE[0..255].output_activation_reg 41 | class: reg_storage 42 | attributes: 43 | depth: 1 44 | latency: 1ns 45 | meshX: 16 46 | technology: 45nm 47 | width: 16 48 | - name: system.simple_ws.shared_glb 49 | class: smartbuffer_SRAM 50 | attributes: 51 | block-size: 4 52 | memory_depth: 16384 53 | memory_width: 64 54 | n_banks: 32 55 | n_buffets: 1 56 | n_rdwr_ports: 2 57 | read_bandwidth: 16 58 | technology: 45nm 59 | word-bits: 16 60 | write_bandwidth: 16 61 | - name: system.DRAM 62 | class: DRAM 63 | attributes: 64 | block-size: 4 65 | technology: 65nm 66 | type: LPDDR4 67 | width: 64 68 | word-bits: 16 69 | -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_ws_8x8/example_AlexNet_layer1_outputs/timeloop-mapper.map.txt: -------------------------------------------------------------------------------- 1 | 2 | DRAM [ Weights:34848 Inputs:154587 Outputs:290400 ] 3 | --------------------------------------------------- 4 | | for Q in [0:5) 5 | 6 | shared_glb [ Inputs:34731 ] 7 | --------------------------- 8 | | for M in [0:6) 9 | | for Q in [0:11) 10 | | for P in [0:55) 11 | | for M in [0:16) (Spatial-Y) 12 | | for C in [0:3) (Spatial-X) 13 | 14 | pe_spad [ Weights:121 ] 15 | ----------------------- 16 | | for S in [0:11) 17 | | for R in [0:11) 18 | 19 | weight_reg [ Weights:1 ] 20 | ------------------------ 21 | | for Q in [0:1) 22 | 23 | input_activation_reg [ Inputs:1 ] 24 | --------------------------------- 25 | | for Q in [0:1) 26 | 27 | output_activation_reg [ Outputs:1 ] 28 | ----------------------------------- 29 | | for Q in [0:1) 30 | 31 | -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_ws_8x8/mapper/mapper.yaml: -------------------------------------------------------------------------------- 1 | mapper: 2 | optimization-metrics: [ delay, energy ] 3 | live-status: False 4 | num-threads: 8 5 | timeout: 15000 6 | victory-condition: 3000 7 | algorithm: random-pruned 8 | max-permutations-per-if-visit: 16 9 | -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_ws_8x8/timeloop-mapper.map.txt: -------------------------------------------------------------------------------- 1 | 2 | DRAM [ Weights:147456 (147456) Inputs:1663488 (1663488) Outputs:1605632 (1605632) ] 3 | ----------------------------------------------------------------------------------- 4 | | for Q in [0:2) 5 | | for C in [0:8) 6 | | for P in [0:2) 7 | | for M in [0:2) 8 | 9 | shared_glb [ Inputs:53824 (53824) ] 10 | ----------------------------------- 11 | | for Q in [0:2) 12 | | for M in [0:16) (Spatial-Y) 13 | | for C in [0:16) (Spatial-X) 14 | 15 | pe_spad [ Weights:36 (36) ] 16 | --------------------------- 17 | | for M in [0:4) 18 | | for S in [0:3) 19 | | for R in [0:3) 20 | | for P in [0:28) 21 | 22 | weight_reg [ Weights:1 (1) ] 23 | ---------------------------- 24 | | for P in [0:2) 25 | | for Q in [0:28) 26 | 27 | input_activation_reg [ Inputs:1 (1) ] 28 | ------------------------------------- 29 | | for Q in [0:1) 30 | 31 | output_activation_reg [ Outputs:1 (1) ] 32 | --------------------------------------- 33 | | for Q in [0:1) 34 | 35 | -------------------------------------------------------------------------------- /example/language_models.json: -------------------------------------------------------------------------------- 1 | { 2 | "models": [ 3 | { 4 | 
"name": "opt-125m", 5 | "trace_file" : "input.csv", 6 | "scheduler" : "simple", 7 | "scheduler_config": { 8 | "max_batch_size": 8 9 | } 10 | } 11 | ] 12 | } -------------------------------------------------------------------------------- /example/models_list.json: -------------------------------------------------------------------------------- 1 | { 2 | "models": [ 3 | { 4 | "name": "resnet18", 5 | "batch_size" : 1 6 | } 7 | ] 8 | } 9 | -------------------------------------------------------------------------------- /extern/ramulator_custom/.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | 3 | # Compiled Object files 4 | obj/ 5 | 6 | # Compiled target executable files 7 | -------------------------------------------------------------------------------- /extern/ramulator_custom/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.16) 2 | project(ramulator_project) 3 | 4 | file(GLOB_RECURSE RAMULATOR_SRCS CONFIGURE_DEPENDS src/*.cpp) 5 | add_library(ramulator1 STATIC ${RAMULATOR_SRCS}) 6 | target_include_directories(ramulator1 7 | PUBLIC include 8 | PRIVATE include/ramulator 9 | PRIVATE src 10 | ) 11 | target_compile_options(ramulator1 PRIVATE -Wall -O3) 12 | -------------------------------------------------------------------------------- /extern/ramulator_custom/include/ramulator/Ramulator.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __RAMULATOR_H 2 | #define __RAMULATOR_H 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | namespace ram { 12 | class MemoryBase; 13 | class Request; 14 | class Ramulator { 15 | public: 16 | Ramulator(const std::string ConfigFilePath, uint32_t num_core, bool is_pim = false); 17 | ~Ramulator(); 18 | void tick(); 19 | bool isAvailable(int CtrlID, uint64_t Addr, bool IsWrite) const; 20 | bool isAvailable(uint64_t Addr, bool IsWrite) const; 21 | void push(int CtrlID, uint64_t Addr, bool IsWrite, uint32_t core_id, void* original_req); 22 | void push(uint64_t Addr, bool IsWrite, uint32_t core_id, void* original_req); 23 | bool isEmpty(int CtrlID) const; 24 | const void* top(int CtrlID) const; 25 | void pop(int CtrlID); 26 | int getAtomicBytes() const; 27 | int getNumChannels() const; 28 | int getChannel(uint64_t Addr) const; 29 | void print_stats(); 30 | private: 31 | std::unique_ptr MemBase; 32 | class OutputPendingQueue; 33 | std::vector OutputPendingQueues; 34 | using CallbackMap = 35 | std::unordered_map>; 36 | CallbackMap Callbacks; 37 | robin_hood::unordered_flat_set hot_vids; 38 | bool is_pim; 39 | static std::unique_ptr createMemory(std::string ConfigFilePath, uint32_t num_core); 40 | }; 41 | class Ramulator::OutputPendingQueue { 42 | public: 43 | OutputPendingQueue(int Size); 44 | bool isAvailable() const; 45 | bool isAvailable(uint32_t count) const; 46 | bool isEmpty() const; 47 | void reserve(); 48 | void push(void* original_req); 49 | const void* top() const; 50 | void pop(); 51 | private: 52 | const int Size; 53 | int NumReserved; 54 | std::queue PendingQueue; 55 | }; 56 | } // end namespace 57 | #endif 58 | -------------------------------------------------------------------------------- /extern/ramulator_custom/src/Config.cpp: -------------------------------------------------------------------------------- 1 | #include "Config.h" 2 | 3 | using namespace std; 4 | using namespace ram; 5 | 6 | 
RamulatorConfig::RamulatorConfig(const std::string& fname) { 7 | options["mapping"] = "RoBaRaCoCh"; 8 | options["scheduler"] = "FRFCFS"; 9 | parse(fname); 10 | } 11 | 12 | void RamulatorConfig::parse(const string& fname) 13 | { 14 | ifstream file(fname); 15 | assert(file.good() && "Bad config file"); 16 | string line; 17 | while (getline(file, line)) { 18 | char delim[] = " \t="; 19 | vector tokens; 20 | 21 | while (true) { 22 | size_t start = line.find_first_not_of(delim); 23 | if (start == string::npos) 24 | break; 25 | 26 | size_t end = line.find_first_of(delim, start); 27 | if (end == string::npos) { 28 | tokens.push_back(line.substr(start)); 29 | break; 30 | } 31 | 32 | tokens.push_back(line.substr(start, end - start)); 33 | line = line.substr(end); 34 | } 35 | 36 | // empty line 37 | if (!tokens.size()) 38 | continue; 39 | 40 | // comment line 41 | if (tokens[0][0] == '#') 42 | continue; 43 | 44 | // parameter line 45 | assert(tokens.size() == 2 && "Only allow two tokens in one line"); 46 | 47 | options[tokens[0]] = tokens[1]; 48 | 49 | if (tokens[0] == "channels") { 50 | channels = atoi(tokens[1].c_str()); 51 | } else if (tokens[0] == "ranks") { 52 | ranks = atoi(tokens[1].c_str()); 53 | } else if (tokens[0] == "subarrays") { 54 | subarrays = atoi(tokens[1].c_str()); 55 | } else if (tokens[0] == "cpu_tick") { 56 | cpu_tick = atoi(tokens[1].c_str()); 57 | } else if (tokens[0] == "mem_tick") { 58 | mem_tick = atoi(tokens[1].c_str()); 59 | } else if (tokens[0] == "expected_limit_insts") { 60 | expected_limit_insts = atoi(tokens[1].c_str()); 61 | } else if (tokens[0] == "warmup_insts") { 62 | warmup_insts = atoi(tokens[1].c_str()); 63 | } 64 | } 65 | file.close(); 66 | } 67 | 68 | 69 | -------------------------------------------------------------------------------- /extern/ramulator_custom/src/MemoryFactory.cpp: -------------------------------------------------------------------------------- 1 | #include "MemoryFactory.h" 2 | // #include "LPDDR4.h" 3 | // #include "WideIO.h" 4 | // #include "WideIO2.h" 5 | #include "HBM.h" 6 | //#include "SALP.h" 7 | 8 | using namespace ram; 9 | 10 | namespace ram 11 | { 12 | // 13 | // template <> 14 | // void MemoryFactory::validate(int channels, int ranks, RamulatorConfig& configs) { 15 | // assert(channels >= 2 && "LPDDR4 requires 2, 4, 8 ... 
channels"); 16 | // } 17 | // 18 | // template <> 19 | // void MemoryFactory::validate(int channels, int ranks, RamulatorConfig& configs) { 20 | // assert(channels == 4 && "WideIO comes with 4 channels"); 21 | // } 22 | // 23 | // template <> 24 | // void MemoryFactory::validate(int channels, int ranks, RamulatorConfig& configs) { 25 | // assert((channels == 4 || channels == 8) && "WideIO2 comes with 4 or 8 channels"); 26 | // assert((ranks == 1 || ranks == 2) && "WideIO2 comes with 1 or 2 ranks"); 27 | // } 28 | 29 | template <> 30 | void MemoryFactory::validate(int channels, int ranks, RamulatorConfig& configs) { 31 | assert(channels == 8 && "HBM comes with 8 channels"); 32 | } 33 | 34 | // template <> 35 | // MemoryBase *MemoryFactory::create(RamulatorConfig& configs, int cacheline) { 36 | // int channels = stoi(configs["channels"], NULL, 0); 37 | // int ranks = stoi(configs["ranks"], NULL, 0); 38 | // validate(channels, ranks, configs); 39 | // 40 | // const string& org_name = configs["org"]; 41 | // const string& speed_name = configs["speed"]; 42 | // 43 | // WideIO2 *spec = new WideIO2(org_name, speed_name, channels); 44 | // 45 | // extend_channel_width(spec, cacheline); 46 | // 47 | // return (MemoryBase *)populate_memory(configs, spec, channels, ranks); 48 | // } 49 | // 50 | // 51 | // template <> 52 | // MemoryBase *MemoryFactory::create(RamulatorConfig& configs, int cacheline) { 53 | // int channels = stoi(configs["channels"], NULL, 0); 54 | // int ranks = stoi(configs["ranks"], NULL, 0); 55 | // int subarrays = stoi(configs["subarrays"], NULL, 0); 56 | // validate(channels, ranks, configs); 57 | // 58 | // const string& std_name = configs["standard"]; 59 | // const string& org_name = configs["org"]; 60 | // const string& speed_name = configs["speed"]; 61 | // 62 | // SALP *spec = new SALP(org_name, speed_name, std_name, subarrays); 63 | // 64 | // extend_channel_width(spec, cacheline); 65 | // 66 | // return (MemoryBase *)populate_memory(configs, spec, channels, ranks); 67 | // } 68 | 69 | } 70 | 71 | // This function can be used by autoconf AC_CHECK_LIB since 72 | // apparently it can't detect C++ functions. 
73 | // Basically just an entry in the symbol table 74 | // extern "C" 75 | // { 76 | // void libramulator_is_present(void) 77 | // { 78 | // ; 79 | // } 80 | // } 81 | -------------------------------------------------------------------------------- /extern/ramulator_custom/src/MemoryFactory.h: -------------------------------------------------------------------------------- 1 | #ifndef __MEMORY_FACTORY_H 2 | #define __MEMORY_FACTORY_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "Memory.h" 10 | #include "DRAM.h" 11 | #include "Controller.h" 12 | #include "Config.h" 13 | 14 | using namespace std; 15 | 16 | namespace ram 17 | { 18 | template 19 | class MemoryFactory { 20 | public: 21 | static void extend_channel_width(T* spec, int cacheline) 22 | { 23 | int channel_unit = spec->prefetch_size * spec->channel_width / 8; 24 | int gang_number = cacheline / channel_unit; 25 | 26 | assert(gang_number >= 1 && 27 | "cacheline size must be greater or equal to minimum channel width"); 28 | 29 | assert(cacheline == gang_number * channel_unit && 30 | "cacheline size must be a multiple of minimum channel width"); 31 | 32 | spec->channel_width *= gang_number; 33 | } 34 | 35 | static std::unique_ptr> populate_memory(RamulatorConfig& configs, 36 | T *spec, 37 | int channels, int ranks) { 38 | int& default_ranks = spec->org_entry.count[int(T::Level::Rank)]; 39 | int& default_channels = spec->org_entry.count[int(T::Level::Channel)]; 40 | 41 | if (default_channels == 0) default_channels = channels; 42 | if (default_ranks == 0) default_ranks = ranks; 43 | 44 | vector *> ctrls; 45 | for (int c = 0; c < channels; c++){ 46 | DRAM* channel = new DRAM(spec, T::Level::Channel); 47 | channel->id = c; 48 | channel->regStats(""); 49 | ctrls.push_back(new Controller(configs, channel)); 50 | } 51 | return std::make_unique>(configs, ctrls); 52 | } 53 | 54 | static void validate(int channels, int ranks, RamulatorConfig& configs) { 55 | assert(channels > 0 && ranks > 0); 56 | } 57 | 58 | static std::unique_ptr create(RamulatorConfig& configs, 59 | int cacheline) { 60 | int channels = stoi(configs["channels"], NULL, 0); 61 | int ranks = stoi(configs["ranks"], NULL, 0); 62 | 63 | validate(channels, ranks, configs); 64 | 65 | const string& org_name = configs["org"]; 66 | const string& speed_name = configs["speed"]; 67 | 68 | T *spec = new T(org_name, speed_name); 69 | 70 | // Set channel width statically in the header file 71 | //extend_channel_width(spec, cacheline); 72 | 73 | return populate_memory(configs, spec, channels, ranks); 74 | } 75 | }; 76 | 77 | // template <> 78 | // MemoryBase *MemoryFactory::create(RamulatorConfig& configs, int cacheline); 79 | // template <> 80 | // MemoryBase *MemoryFactory::create(RamulatorConfig& configs, int cacheline); 81 | 82 | } /*namespace ram*/ 83 | 84 | #endif /*__MEMORY_FACTORY_H*/ 85 | -------------------------------------------------------------------------------- /extern/ramulator_custom/src/Request.cpp: -------------------------------------------------------------------------------- 1 | #include "Request.h" 2 | 3 | namespace ram { 4 | 5 | Request::Request() {} 6 | 7 | Request::Request(Type Type, uint64_t Addr, std::vector AddrVec, 8 | function &cb) 9 | : type(Type), 10 | is_first_command(true), 11 | addr(Addr), 12 | addr_vec(AddrVec), 13 | coreid(0), 14 | arrive(0), 15 | depart(0), 16 | callback(cb) {} 17 | 18 | Request::Request(Type Type, uint64_t Addr, std::vector AddrVec, 19 | function &cb, void* original_req) 20 | : type(Type), 21 | 
is_first_command(true), 22 | addr(Addr), 23 | addr_vec(AddrVec), 24 | coreid(0), 25 | arrive(0), 26 | depart(0), 27 | callback(cb), 28 | orignal_request(original_req) {} 29 | 30 | Request::Request(Type Type, uint64_t Addr, std::vector AddrVec, 31 | function &cb, int vid) 32 | : type(Type), 33 | is_first_command(true), 34 | addr(Addr), 35 | addr_vec(AddrVec), 36 | coreid(0), 37 | arrive(0), 38 | depart(0), 39 | vid(vid), 40 | callback(cb) {} 41 | 42 | Request::Request(std::vector addr_vec, Type type, 43 | function cb) 44 | : type(type), 45 | is_first_command(true), 46 | addr(-1), 47 | BaseAddr(-1), 48 | addr_vec(addr_vec), 49 | coreid(0), 50 | arrive(0), 51 | depart(0), 52 | callback(cb) {} 53 | 54 | Request::Request(std::vector addr_vec, Type type, 55 | function cb, void* original_req) 56 | : type(type), 57 | is_first_command(true), 58 | addr(-1), 59 | BaseAddr(-1), 60 | addr_vec(addr_vec), 61 | coreid(0), 62 | arrive(0), 63 | depart(0), 64 | callback(cb), 65 | orignal_request(original_req) {} 66 | 67 | Request::Request(Type Type, uint64_t BaseAddr, uint64_t Addr, 68 | std::vector AddrVec, function &cb) 69 | : type(Type), 70 | is_first_command(true), 71 | addr(Addr), 72 | BaseAddr(BaseAddr), 73 | addr_vec(AddrVec), 74 | coreid(0), 75 | arrive(0), 76 | depart(0), 77 | callback(cb) {} 78 | 79 | bool Request::isRead() const { 80 | return type == Type::READ; 81 | } 82 | bool Request::isWrite() const { 83 | return type == Type::WRITE; 84 | } 85 | int Request::getChannelID() const { 86 | return addr_vec[0]; 87 | } 88 | 89 | } // end namespace 90 | 91 | -------------------------------------------------------------------------------- /extern/ramulator_custom/src/Request.h: -------------------------------------------------------------------------------- 1 | #ifndef __REQUEST_H 2 | #define __REQUEST_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | using namespace std; 9 | 10 | namespace ram { 11 | class Request { 12 | public: 13 | enum class Type { 14 | READ, WRITE, PIM_WRITE, REFRESH, POWERDOWN, SELFREFRESH, EXTENSION, MAX 15 | }; 16 | Type type; 17 | bool is_first_command; 18 | uint64_t addr; 19 | uint64_t BaseAddr; 20 | //int HandlerID; 21 | 22 | vector addr_vec; 23 | // specify which node this request sent from 24 | int coreid; // to remove compile errors 25 | 26 | uint64_t arrive; 27 | uint64_t depart; 28 | 29 | int vid = -1; 30 | void* orignal_request; 31 | function callback; // call back with more info 32 | 33 | bool isRead() const; 34 | bool isWrite() const; 35 | int getChannelID() const; 36 | 37 | // Used to generate refresh request 38 | Request(); 39 | Request(std::vector addr_vec, Type type, function cb); 40 | Request(std::vector addr_vec, Type type, function cb, void* original_req); 41 | Request(Type type, uint64_t Addr, 42 | std::vector AddrVec, function &cb); 43 | Request(Type type, uint64_t Addr, 44 | std::vector AddrVec, function &cb, void* orignal_req); 45 | Request(Type type, uint64_t Addr, 46 | std::vector AddrVec, function &cb, int vid); 47 | Request(Type type, uint64_t BaseAddr, uint64_t Addr, 48 | std::vector AddrVec, function &cb); 49 | }; 50 | 51 | } /*namespace ram*/ 52 | 53 | #endif /*__REQUEST_H*/ 54 | 55 | -------------------------------------------------------------------------------- /img/ONNXim_demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PSAL-POSTECH/ONNXim/44b83bdb7b1987d3a01b867cb3a03326c5644aa2/img/ONNXim_demo.png 
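
The ram::Ramulator class declared in extern/ramulator_custom/include/ramulator/Ramulator.hpp above is the frontend that the simulator's DRAM wrapper (DramRamulator in src/Dram.h) ticks once per memory cycle. The sketch below is a hypothetical standalone driver, not code from this repository: the config path, the plain int payload standing in for the simulator's MemoryAccess pointer, and the busy-wait polling loop are all assumptions made only to show how the push/tick/top/pop calls from that header fit together.

#include <cstdint>
#include <cstdio>
#include "ramulator/Ramulator.hpp"

int main() {
  // Hypothetical config path; assumed to follow the key = value format parsed by Config.cpp.
  ram::Ramulator mem("configs/ramulator_configs/HBM-config.cfg", /*num_core=*/1);

  int payload = 42;               // stand-in for the simulator's MemoryAccess*
  uint64_t addr = 0x1000;
  int ch = mem.getChannel(addr);  // requests and completions are tracked per channel

  if (mem.isAvailable(ch, addr, /*IsWrite=*/false))
    mem.push(ch, addr, /*IsWrite=*/false, /*core_id=*/0, &payload);

  // Advance DRAM time until the read surfaces on its channel's output queue.
  while (mem.isEmpty(ch))
    mem.tick();

  const int* done = static_cast<const int*>(mem.top(ch));  // same pointer handed to push()
  mem.pop(ch);
  std::printf("completed request carries %d\n", *done);
  mem.print_stats();
  return 0;
}

The void* accepted by push() is returned untouched by top(), which is presumably what lets the wrapper map a completed DRAM transaction back to the access object it queued.
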
-------------------------------------------------------------------------------- /img/speedup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PSAL-POSTECH/ONNXim/44b83bdb7b1987d3a01b867cb3a03326c5644aa2/img/speedup.png -------------------------------------------------------------------------------- /models/language_models/llama3-8b.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation_function" : "swish", 3 | "num_attention_heads" : 32, 4 | "num_kv_heads" : 8, 5 | "vocab_size" : 128256, 6 | "num_hidden_layers" : 32, 7 | "hidden_size" : 4096, 8 | "intermediate_size" : 14336, 9 | "ffn_type" : "llama", 10 | "max_seq_length" : 8192, 11 | "run_single_layer": true, 12 | "tensor_parallel_size" : 1, 13 | "pipeline_parallel_size" : 1 14 | } -------------------------------------------------------------------------------- /models/language_models/opt-125m.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation_function" : "relu", 3 | "num_attention_heads" : 12, 4 | "num_kv_heads" : 12, 5 | "vocab_size" : 50272, 6 | "num_hidden_layers" : 1, 7 | "hidden_size" : 768, 8 | "intermediate_size" : 3072, 9 | "ffn_type" : "default", 10 | "max_seq_length" : 2048, 11 | "run_single_layer": true, 12 | "tensor_parallel_size" : 1, 13 | "pipeline_parallel_size" : 1 14 | } -------------------------------------------------------------------------------- /models/language_models/opt-66b.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation_function" : "relu", 3 | "num_attention_heads" : 72, 4 | "num_kv_heads" : 72, 5 | "vocab_size" : 50272, 6 | "num_hidden_layers" : 64, 7 | "hidden_size" : 9216, 8 | "intermediate_size" : 36864, 9 | "ffn_type" : "default", 10 | "max_seq_length" : 2048, 11 | "run_single_layer": true, 12 | "tensor_parallel_size" : 1, 13 | "pipeline_parallel_size" : 1 14 | } -------------------------------------------------------------------------------- /models/resnet18/resnet18.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PSAL-POSTECH/ONNXim/44b83bdb7b1987d3a01b867cb3a03326c5644aa2/models/resnet18/resnet18.onnx -------------------------------------------------------------------------------- /scripts/aggregate_results.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # DATAES=`ls workspace2/` 4 | # for date in $DATAES; do 5 | # for config in `ls workspace2/$date`; do 6 | # MODEL=($(cut -d- -f1 <<< $config)) 7 | # CONFIG=($(cut -d- -f2 <<< $config)) 8 | # grep " finish at" workspace2/${date}/${config}/log.out | while read line; do 9 | # CYCLE=`echo $line | awk '{print $8}'` 10 | # LAYER=`echo $line | awk '{print $5}'` 11 | # echo "$date,$MODEL,$CONFIG,$LAYER,$CYCLE" 12 | # done 13 | 14 | # # grep " Model finish at " workspace/${date}/${config}/log.out | awk '{print $}' 15 | 16 | 17 | # done 18 | # done 19 | 20 | DATAES=`ls workspace/` 21 | for date in $DATAES; do 22 | for model in `ls workspace/$date`; do 23 | for config in `ls workspace/$date/$model`; do 24 | CONFIG=($(cut -d- -f2 <<< $config)) 25 | grep " finish at " workspace/${date}/$model/${config}/log.out | tail -n1 | while read line; do 26 | CYCLE=`echo $line | awk '{print $7}'` 27 | MODEL=`echo $line | awk '{print $5}'` 28 | LAYER=`echo $line | awk '{print $6}'` 29 | echo 
"$date,$model,$config,$CYCLE" 30 | # echo $line 31 | done 32 | done 33 | 34 | # grep " Model finish at " workspace/${date}/${config}/log.out | awk '{print $}' 35 | 36 | 37 | done 38 | done 39 | 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /scripts/generate_cnn_onnx.py: -------------------------------------------------------------------------------- 1 | """ 2 | ONNX File generator 3 | Optimizer onnx graph for inference 4 | """ 5 | 6 | # pylint: disable=unused-argument,missing-docstring,useless-super-delegation 7 | 8 | import onnxruntime as rt 9 | import torch 10 | import torchvision.models as models 11 | import argparse 12 | import pathlib 13 | import os 14 | import json 15 | 16 | size_list = [1, 2, 4, 8, 16, 32] 17 | 18 | HOME = os.getenv("ONNXIM_HOME", default="../") 19 | parser = argparse.ArgumentParser(prog = 'ONNX generator') 20 | parser.add_argument('--model', required=True, help="resnet18, resnet50, alexnet, vgg16, inception") 21 | parser.add_argument('--weight', type=int, default=1, help="export weight, defulat=True") 22 | args = parser.parse_args() 23 | 24 | torchvision_models = { 25 | 'resnet18' : models.resnet18(), 26 | 'resnet50' : models.resnet50(), 27 | 'alexnet' : models.alexnet(), 28 | 'vgg16' : models.vgg16(), 29 | 'squeezenet' : models.squeezenet1_0(), 30 | 'densenet' : models.densenet161(), 31 | 'inception' : models.inception_v3(), 32 | 'googlenet' : models.googlenet(), 33 | 'shufflenet' : models.shufflenet_v2_x1_0(), 34 | 'mobilenet' : models.mobilenet_v2(), 35 | 'resnext50_32x4d' : models.resnext50_32x4d(), 36 | 'wide_resnet50_2' : models.wide_resnet50_2(), 37 | 'mnasnet' : models.mnasnet1_0(), 38 | } 39 | 40 | model = torchvision_models[args.model] 41 | batch_size = 1 42 | if args.model != 'inception': 43 | input = torch.randn(1, 3, 224, 224, requires_grad=True) 44 | input_shape = (3, 224, 224) 45 | else: 46 | input = torch.randn(1, 3, 299, 299, requires_grad=True) 47 | input_shape = (3, 299, 299) 48 | 49 | # Export PyTorch model to onnx 50 | torch.onnx.export( 51 | model, 52 | input, 53 | 'tmp.onnx', 54 | export_params = bool(args.weight), 55 | input_names = ['input'], 56 | output_names = ['output'], 57 | dynamic_axes = { 58 | 'input' : {0 : 'batch_size'}, 59 | 'output' : {0 : 'batch_size'}} 60 | ) 61 | 62 | # Create output folder 63 | pathlib.Path(f'{HOME}/models/{args.model}/').mkdir(parents=True, exist_ok=True) 64 | pathlib.Path(f"{HOME}/model_lists").mkdir(parents=True, exist_ok=True) 65 | 66 | # Optimzied exported onnx file 67 | print(f"Converting ONNX FILE: {args.model}") 68 | opt = rt.SessionOptions() 69 | opt.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL 70 | opt.optimized_model_filepath = f'{HOME}/models/{args.model}/{args.model}.onnx' 71 | sess = rt.InferenceSession('tmp.onnx', sess_options=opt) 72 | 73 | # Generate model_list json file 74 | for size in size_list: 75 | config = { 76 | "models": [ 77 | { 78 | "name": f"{args.model}", 79 | "batch_size": size, 80 | "request_time": 0 81 | } 82 | ] 83 | } 84 | with open(f"{HOME}/model_lists/{args.model}_{size}.json", "w") as json_file: 85 | json.dump(config, json_file, indent=4) 86 | print("DONE") -------------------------------------------------------------------------------- /scripts/generate_conv_onnx.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from pathlib import Path 3 | import json 4 | import os 5 | 6 | size_list = [128]#64, 256, 1024] 7 | dtype = torch.float32 8 | 
C_in = 128 9 | C_out = 128 10 | K_sz = 3 11 | padding = 1 12 | H = 14 * 4 13 | W = 14 * 4 14 | stride=2 15 | HOME = os.getenv("ONNXIM_HOME", default="../") 16 | 17 | size_name = f"{C_in}_{C_out}_{K_sz}_{H}_{W}" 18 | # Test Convolution model 19 | class size_conv(torch.nn.Module): 20 | def __init__(self, C_in, C_out, K_sz, padding=padding): 21 | super().__init__() 22 | self.fc = torch.nn.Conv2d(C_in, C_out, K_sz, stride=stride, padding=padding, bias=False, dtype=dtype) 23 | 24 | def forward(self, x): 25 | return self.fc(x) 26 | 27 | # Create output folder 28 | Path(f"{HOME}/model_lists").mkdir(parents=True, exist_ok=True) 29 | for size in size_list: 30 | 31 | # Export PyTorch model to onnx 32 | Path(f"{HOME}/models/conv_{size_name}").mkdir(parents=True, exist_ok=True) 33 | m = size_conv(C_in, C_out, K_sz, padding) 34 | A = torch.zeros([1,C_in, H, W], dtype=dtype) 35 | onnx_path = Path(f"{HOME}/models/conv_{size_name}/conv_{size_name}.onnx") 36 | torch.onnx.export(m, A, onnx_path, export_params=True, input_names = ['input'], output_names=['output']) 37 | 38 | # Generate model_list json file 39 | config = { 40 | "models": [ 41 | { 42 | "name": f"conv_{size_name}", 43 | "request_time": 0 44 | } 45 | ] 46 | } 47 | with open(f"{HOME}/model_lists/conv_{size_name}.json", "w") as json_file: 48 | json.dump(config, json_file, indent=4) 49 | -------------------------------------------------------------------------------- /scripts/generate_matmul_onnx.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from pathlib import Path 3 | import json 4 | import os 5 | 6 | #size_list = [[512, 768, 2304],[512, 768, 512],[512, 768, 768], [512, 512, 768], [512, 768, 50257]]#32, 64, 128, 256, 512, 1024, 2048] 7 | #size_list = [[512, 512, 1024],[512, 1024, 2],[512, 1024, 512], [512, 1024, 1024], [512, 1024, 3072], [512, 768, 3072], [512, 1024, 4096], [512, 4096, 1024]]#32, 64, 128, 256, 512, 1024, 2048] 8 | size_list = [[1, 1024*8, 1024*8]] #[32,32,32], [64,64,64],[128]*3, [256]*3, [512]*3, [1024]*3, [2048]*3, [4096]*3, [8192]*3] 9 | dtype = torch.float16 10 | 11 | HOME = os.getenv("ONNXIM_HOME", default="../") 12 | 13 | # Test matmul model 14 | class size_matmul(torch.nn.Module): 15 | def __init__(self, size2, size3): 16 | super().__init__() 17 | self.fc = torch.nn.Linear(size2, size3, dtype=dtype, bias=False) #size, size, dtype=dtype, bias=False) 18 | 19 | def forward(self, x): 20 | return self.fc(x) 21 | 22 | # Create output folder 23 | Path(f"{HOME}/model_lists").mkdir(parents=True, exist_ok=True) 24 | for size1, size2, size3 in size_list: 25 | # Export PyTorch model to onnx 26 | Path(f"{HOME}/models/matmul_{size1}_{size2}_{size3}").mkdir(parents=True, exist_ok=True) 27 | m = size_matmul(size2, size3) 28 | A = torch.zeros([size1, size2], dtype=dtype) 29 | onnx_path = Path(f"{HOME}/models/matmul_{size1}_{size2}_{size3}/matmul_{size1}_{size2}_{size3}.onnx") 30 | torch.onnx.export(m, A, onnx_path, export_params=True, input_names = ['input'], output_names=['output']) 31 | 32 | # Generate model_list json file 33 | config = { 34 | "models": [ 35 | { 36 | "name": f"matmul_{size1}_{size2}_{size3}", 37 | "request_time": 0 38 | } 39 | ] 40 | } 41 | with open(f"{HOME}/model_lists/matmul_{size1}_{size2}_{size3}.json", "w") as json_file: 42 | json.dump(config, json_file, indent=4) 43 | -------------------------------------------------------------------------------- /scripts/generate_multi-tenancy_onnx.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | ONNX File generator 3 | Optimizer onnx graph for inference 4 | """ 5 | 6 | # pylint: disable=unused-argument,missing-docstring,useless-super-delegation 7 | 8 | import onnxruntime as rt 9 | import torch 10 | import torchvision.models as models 11 | # import pytorch2timeloop 12 | import argparse 13 | import pathlib 14 | import os 15 | import json 16 | 17 | size_list = [1, 2, 4, 8, 16, 32] 18 | 19 | HOME = os.getenv("ONNXIM_HOME", default="../") 20 | parser = argparse.ArgumentParser(prog = 'ONNX generator') 21 | parser.add_argument('--models') 22 | parser.add_argument('--weight', type=int, default=1) 23 | args = parser.parse_args() 24 | torchvision_models = { 25 | 'resnet18' : models.resnet18(), 26 | 'resnet50' : models.resnet50(), 27 | 'alexnet' : models.alexnet(), 28 | 'vgg16' : models.vgg16(), 29 | 'squeezenet' : models.squeezenet1_0(), 30 | 'densenet' : models.densenet161(), 31 | 'inception' : models.inception_v3(), 32 | 'googlenet' : models.googlenet(), 33 | 'shufflenet' : models.shufflenet_v2_x1_0(), 34 | 'mobilenet' : models.mobilenet_v2(), 35 | 'resnext50_32x4d' : models.resnext50_32x4d(), 36 | 'wide_resnet50_2' : models.wide_resnet50_2(), 37 | 'mnasnet' : models.mnasnet1_0(), 38 | } 39 | 40 | model_list = args.models.split(',') 41 | for model_name in model_list: 42 | model = torchvision_models[model_name] 43 | batch_size = 1 44 | if model_name != 'inception': 45 | input = torch.randn(1, 3, 224, 224, requires_grad=True) 46 | input_shape = (3, 224, 224) 47 | else: 48 | input = torch.randn(1, 3, 299, 299, requires_grad=True) 49 | input_shape = (3, 299, 299) 50 | 51 | top_dir = os.path.join(HOME, "models") 52 | convert_fc = True 53 | exception_module_names = [] 54 | 55 | # pytorch2timeloop.convert_model(model, input_shape, batch_size, args.model, top_dir, convert_fc, exception_module_names) 56 | 57 | torch.onnx.export( 58 | model, 59 | input, 60 | 'tmp.onnx', 61 | export_params = bool(args.weight), 62 | input_names = ['input'], 63 | output_names = ['output'], 64 | dynamic_axes = { 65 | 'input' : {0 : 'batch_size'}, 66 | 'output' : {0 : 'batch_size'}} 67 | ) 68 | 69 | opt = rt.SessionOptions() 70 | # enable level 3 optimizations 71 | print(f"Converting ONNX FILE: {model_name}") 72 | pathlib.Path(f'{HOME}/models/{model_name}/').mkdir(parents=True, exist_ok=True) 73 | opt.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL 74 | opt.optimized_model_filepath = f'{HOME}/models/{model_name}/{model_name}.onnx' 75 | sess = rt.InferenceSession('tmp.onnx', sess_options=opt) 76 | 77 | pathlib.Path(f"{HOME}/model_lists").mkdir(parents=True, exist_ok=True) 78 | config = { 79 | "models": [ 80 | ] 81 | } 82 | for model_name in model_list: 83 | config["models"].append( 84 | { 85 | "name": f"{model_name}", 86 | "batch_size": 1, 87 | } 88 | ) 89 | 90 | file_name = '_'.join(model_list) 91 | 92 | with open(f"{HOME}/model_lists/{file_name}.json", "w") as json_file: 93 | json.dump(config, json_file, indent=4) 94 | print("DONE") -------------------------------------------------------------------------------- /scripts/onnxim_sbatch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -p allcpu 4 | #SBATCH --nodes=1 5 | #SBATCH --nodelist=n10 6 | #SBATCH --ntasks-per-node=1 7 | 8 | ml swap gnu8 gnu13 9 | which gcc 10 | 11 | echo "config: $1 model: $2" 12 | echo "$ONNXIM_HOME/build/bin/Simulator --config $ONNXIM_HOME/configs/$1.json --model 
$ONNXIM_HOME/model_lists/$2.json" 13 | $ONNXIM_HOME/build/bin/Simulator --config $ONNXIM_HOME/configs/$1.json --model $ONNXIM_HOME/model_lists/$2.json 14 | 15 | exit 0 -------------------------------------------------------------------------------- /scripts/run_matmul_conv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #models=("matmul_1024_1024_1024" "matmul_2048_2048_2048" "matmul_4096_4096_4096" "matmul_8192_8192_8192") 3 | #models=("matmul_2048") #"matmul_32" "matmul_64" "matmul_128" "matmul_256" "matmul_512" "matmul_1024" "matmul_2048" "conv_64" "conv_256" "conv_1024") 4 | #models=("matmul_32_32_32" "matmul_64_64_64" "matmul_128_128_128" "matmul_256_256_256" "matmul_512_512_512") 5 | models=("multi_1_0_2_0_100_1121_1000_once" "multi_2_0_2_0_100_8121_1000_once" "multi_8_0_2_0_100_32121_1000_once") 6 | #models=("matmul_512_512_1024" "matmul_512_1024_2" "matmul_512_1024_512" "matmul_512_1024_3072" "matmul_512_1024_4096" "matmul_512_4096_1024") 7 | #models=("conv_64" "conv_256" "conv_1024") 8 | #configs=("systolic_ws_8x8_c1_simple_noc_transformer") # "systolic_ws_8x8_c1_booksim2_transformer" "systolic_ws_8x8_c4_simple_noc_transformer" "systolic_ws_8x8_c4_booksim2_transformer") 9 | #configs=("systolic_ws_8x8_c4_simple_noc_transformer" "systolic_ws_8x8_c4_booksim2_transformer") 10 | configs=("systolic_ws_128x128_c4_simple_noc_tpuv4_partition_quad") #"systolic_ws_128x128_c4_booksim2_tpuv4") 11 | #models=("matmul_4096_4096_4096" "matmul_8192_8192_8192") #("matmul_1024_1024_1024" "matmul_2048_2048_2048" "matmul_4096_4096_4096" "matmul_8192_8192_8192") 12 | i=5 13 | 14 | #python3 $ONNXIM_HOME/scripts/generate_matmul_onnx.py 15 | #python3 $ONNXIM_HOME/scripts/generate_conv_onnx.py 16 | 17 | if [ ! -d "$ONNXIM_HOME/results" ]; then 18 | mkdir $ONNXIM_HOME/results 19 | fi 20 | 21 | for model_file in "${models[@]}"; do 22 | if [ ! -d "$ONNXIM_HOME/results/$model_file" ]; then 23 | mkdir $ONNXIM_HOME/results/$model_file 24 | fi 25 | for config in "${configs[@]}"; do 26 | if [ ! -d "$ONNXIM_HOME/results/$model_file/$config" ]; then 27 | mkdir $ONNXIM_HOME/results/$model_file/$config 28 | fi 29 | total_time=0 30 | for (( j=0; j $ONNXIM_HOME/results/$model_file/$config/result_$j" 32 | $ONNXIM_HOME/build/bin/Simulator --config ./configs/$config.json --model $ONNXIM_HOME/model_lists/$model_file.json > $ONNXIM_HOME/results/$model_file/$config/result_$j & 33 | simulation_time=$(grep "Simulation time:" "$ONNXIM_HOME/results/$model_file/$config/result_$j" | awk '{print $(NF-1)}') 34 | if [[ ! -z "$simulation_time" ]]; then 35 | total_time=$(echo "$total_time + $simulation_time" | bc) 36 | fi 37 | done 38 | mean_time=$(awk "BEGIN {print $total_time / $i}") 39 | echo "Mean Simulation time: $mean_time seconds" 40 | done 41 | done 42 | wait -------------------------------------------------------------------------------- /scripts/run_multi-tenancy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | models=("resnet50_vgg16") 4 | i=1 5 | configs=("systolic_ws_128x128_c4_simple_noc_tpuv4") # "systolic_ws_128x128_c4_booksim2_tpuv4") 6 | 7 | if [ ! -d "$ONNXIM_HOME/results" ]; then 8 | mkdir $ONNXIM_HOME/results 9 | fi 10 | 11 | for model_file in "${models[@]}"; do 12 | if [ ! 
-d "$ONNXIM_HOME/results/$model_file" ]; then 13 | mkdir $ONNXIM_HOME/results/$model_file 14 | fi 15 | if [[ $model_file == "gpt2_g" ]] || [[ $model_file == "gpt2_s" ]]; then 16 | onnx_file="gpt2" 17 | elif [[ $model_file == "bert" ]]; then 18 | onnx_file="$model_file" 19 | else 20 | onnx_file="$model_file" 21 | fi 22 | if [ ! -d "$ONNXIM_HOME/results/$model_file" ]; then 23 | mkdir $ONNXIM_HOME/results/$model_file 24 | fi 25 | for config in "${configs[@]}"; do 26 | if [ ! -d "$ONNXIM_HOME/results/$model_file/$config" ]; then 27 | mkdir $ONNXIM_HOME/results/$model_file/$config 28 | fi 29 | for (( j=0; j&2 24 | exit 1 25 | fi 26 | ;; 27 | -c| --config) 28 | if [ -n "$2" ] && [ ${2:0:1} != "-" ]; then 29 | CONFIG_PATH=$2 30 | shift 2 31 | else 32 | echo "Error: Arcument for $1 is missing" >&2 33 | exit 1 34 | fi 35 | ;; 36 | -t| --two) 37 | MODE=two_model 38 | shift 1 39 | ;; 40 | -h| --help) 41 | echo "Usage: $0 -i [options]" >&2 42 | echo " -m | --model % (set input model for simulation)" >&2 43 | echo " -c | --config % (set configuration for simulation)" >&2 44 | exit 0 45 | ;; 46 | esac 47 | done 48 | 49 | if [ -z "$MODEL_PATH" ] || [ -z "$CONFIG_PATH" ]; then 50 | echo "Error: --model and --config option must be set" >&2 51 | exit 1 52 | fi 53 | CONFIG_PATH=`realpath $CONFIG_PATH` 54 | 55 | MODEL_TMP=$MODEL_PATH 56 | unset MODEL_PATH 57 | unset MODEL_NAME 58 | for model in ${MODEL_TMP//,/$'\n'}; do 59 | model=`realpath $model` 60 | model_name=`basename $model` 61 | model_name=${model_name%.*} 62 | if [[ -n $MODEL_PATH ]]; then 63 | MODEL_PATH="${MODEL_PATH}," 64 | MODEL_NAME="${MODEL_NAME}-" 65 | fi 66 | MODEL_PATH="${MODEL_PATH}${model}" 67 | MODEL_NAME="${MODEL_NAME}${model_name}" 68 | done 69 | 70 | #Make simulation workspace 71 | CURRENTDATE=`date +"%Y-%m-%d"` 72 | CURRENTTIME=`date +"%H-%M"` 73 | CONFIG_NAME=`basename $CONFIG_PATH` 74 | # MODEL_NAME=`basename $MODEL_PATH` 75 | echo $MODEL_PATH 76 | echo ./workspace/$CURRENTDATE/${MODEL_NAME%.*}/${CONFIG_NAME%.*}-$CURRENTTIME 77 | WORKSPACE=./workspace/$CURRENTDATE/${MODEL_NAME%.*}/${CONFIG_NAME%.*}-$CURRENTTIME 78 | mkdir -p $WORKSPACE 79 | run_simulator $WORKSPACE -------------------------------------------------------------------------------- /scripts/run_timeloop.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MODEL=$1 3 | CONFIG=$2 4 | MODEL_PATH=../models/$MODEL 5 | CONFIG_PATH=../configs/timeloop_configs/$CONFIG 6 | 7 | pids="" 8 | for LAYER in `ls $MODEL_PATH/*.yaml`; do 9 | srun -p allcpu ./timeloop_slurm_job.sh $MODEL $CONFIG $LAYER & 10 | pids="$pids $!" 11 | done 12 | 13 | wait $pids 14 | 15 | for MAP_FILE in `ls $MODEL_PATH/*.map`; do 16 | echo $MAP_FILE 17 | MAP_FILE_BASE=`basename $MAP_FILE` 18 | ID="${MAP_FILE_BASE%.*}" 19 | MAPPING=`cat $MAP_FILE` 20 | echo $ID, $MAPPING >> $MODEL_PATH/$MODEL.mapping 21 | rm $MAP_FILE 22 | done 23 | 24 | echo "DONE" 25 | -------------------------------------------------------------------------------- /scripts/run_transformer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | models=("gpt2_s" "gpt2_g" "bert") 4 | batch_list=("1" "2" "4" "8" "16" "32") 5 | 6 | configs=("systolic_ws_128x128_c4_simple_noc_tpuv4" "systolic_ws_128x128_c4_booksim2_tpuv4") 7 | i=1 8 | 9 | #python3.8 $ONNXIM_HOME/scripts/generate_transformer_onnx.py --model gpt2 10 | #python3 $ONNXIM_HOME/scripts/generate_transformer_onnx.py --model bert 11 | 12 | if [ ! 
-d "$ONNXIM_HOME/results" ]; then 13 | mkdir $ONNXIM_HOME/results 14 | fi 15 | 16 | for model_file in "${models[@]}"; do 17 | if [ ! -d "$ONNXIM_HOME/results/$model_file" ]; then 18 | mkdir $ONNXIM_HOME/results/$model_file 19 | fi 20 | if [[ $model_file == "gpt2_g" ]] || [[ $model_file == "gpt2_s" ]]; then 21 | onnx_file="gpt2" 22 | elif [[ $model_file == "bert" ]]; then 23 | onnx_file="$model_file" 24 | fi 25 | for batch in "${batch_list[@]}"; do 26 | if [ ! -d "$ONNXIM_HOME/results/$model_file/$batch" ]; then 27 | mkdir $ONNXIM_HOME/results/$model_file/$batch 28 | fi 29 | for config in "${configs[@]}"; do 30 | if [ ! -d "$ONNXIM_HOME/results/$model_file/$batch/$config" ]; then 31 | mkdir $ONNXIM_HOME/results/$model_file/$batch/$config 32 | fi 33 | total_time=0 34 | for (( j=0; j $ONNXIM_HOME/results/$model_file/$batch/$config/result_$j 2>&1" 36 | $ONNXIM_HOME/build/bin/Simulator --config $ONNXIM_HOME/configs/$config.json --model $ONNXIM_HOME/model_lists/"$model_file"_$batch.json > $ONNXIM_HOME/results/$model_file/$batch/$config/result_$j 2>&1 37 | simulation_time=$(grep "Simulation time:" "$ONNXIM_HOME/results/$model_file/$batch/$config/result_$j" | awk '{print $(NF-1)}') 38 | if [[ ! -z "$simulation_time" ]]; then 39 | total_time=$(echo "$total_time + $simulation_time" | bc) 40 | fi 41 | done 42 | mean_time=$(awk "BEGIN {print $total_time / $i}") 43 | echo "Mean Simulation time: $mean_time seconds" 44 | done 45 | done 46 | done -------------------------------------------------------------------------------- /scripts/timeloop_slurm_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MODEL=$1 3 | CONFIG=$2 4 | LAYER=$3 5 | MODEL_PATH=../models/$MODEL 6 | CONFIG_PATH=../configs/timeloop_configs/$CONFIG 7 | 8 | execute_timeloop() { 9 | echo $LAYER 10 | LAYER_FILE=`basename $LAYER` 11 | ID="${LAYER_FILE%.*}" 12 | TMP_DIR=tmp-$ID 13 | echo $TMP_DIR 14 | mkdir $TMP_DIR 15 | pushd $TMP_DIR 16 | ../timeloop-mapper ../$CONFIG_PATH/arch/*.yaml ../$CONFIG_PATH/arch/components/*.yaml ../$CONFIG_PATH/mapper/mapper.yaml ../$CONFIG_PATH/constraints/*.yaml ../$LAYER > /dev/null 2>/dev/null 17 | mv map.tmp.txt ../$MODEL_PATH/$ID.map 18 | popd 19 | rm -rf $TMP_DIR 20 | } 21 | 22 | execute_timeloop 23 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 3.15) 2 | 3 | # project setting 4 | set(LIB_NAME "Simulator") 5 | 6 | # set source and headers 7 | file(GLOB_RECURSE SRC_FILES 8 | "${CMAKE_SOURCE_DIR}/src/*.h" 9 | "${CMAKE_SOURCE_DIR}/src/*.cc" 10 | ) 11 | 12 | # build 13 | add_executable(${LIB_NAME} ${SRC_FILES}) 14 | add_library(${LIB_NAME}_lib ${SRC_FILES}) 15 | -------------------------------------------------------------------------------- /src/Core.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include 5 | #include 6 | 7 | #include "Dram.h" 8 | #include "SimulationConfig.h" 9 | #include "Sram.h" 10 | #include "Stat.h" 11 | 12 | class Core { 13 | public: 14 | static std::unique_ptr create(uint32_t id, SimulationConfig config); 15 | Core(uint32_t id, SimulationConfig config); 16 | virtual ~Core() = default; 17 | virtual bool running(); 18 | virtual bool can_issue(bool is_accum_tile=false); 19 | virtual void issue(std::unique_ptr tile); 20 | virtual std::unique_ptr pop_finished_tile(); 21 | 22 | virtual void 
cycle(); 23 | 24 | virtual bool has_memory_request(); 25 | virtual void pop_memory_request(); 26 | virtual MemoryAccess* top_memory_request() { return _request_queue.front(); } 27 | virtual void push_memory_response(MemoryAccess* response); 28 | virtual void print_stats(); 29 | virtual void print_current_stats(); 30 | 31 | virtual cycle_type get_compute_cycles() { return _stat_tot_compute_cycle; } 32 | 33 | protected: 34 | virtual bool can_issue_compute(std::unique_ptr& inst); 35 | virtual cycle_type get_inst_compute_cycles(std::unique_ptr& inst) = 0; 36 | virtual void update_stats(); 37 | virtual void finish_compute_pipeline(); 38 | virtual void finish_vector_pipeline(); 39 | virtual void handle_ld_inst_queue(); 40 | virtual void handle_st_inst_queue(); 41 | virtual cycle_type calculate_add_tree_iterations(uint32_t vector_size); 42 | virtual cycle_type calculate_vector_op_iterations(uint32_t vector_size); 43 | 44 | const uint32_t _id; 45 | const SimulationConfig _config; 46 | 47 | cycle_type _core_cycle; 48 | 49 | cycle_type _stat_idle_cycle; 50 | cycle_type _stat_tot_idle_cycle = 0; 51 | 52 | cycle_type _stat_systolic_bubble_cycle = 0; 53 | cycle_type _stat_tot_systolic_bubble_cycle = 0; 54 | 55 | cycle_type _stat_memory_idle_cycle; 56 | cycle_type _stat_tot_memory_idle_cycle = 0; 57 | 58 | cycle_type _stat_compute_cycle = 0; 59 | cycle_type _stat_tot_compute_cycle = 0; 60 | 61 | cycle_type _accum_request_rr_cycle; 62 | cycle_type _max_request_rr_cycle; 63 | cycle_type _min_request_rr_cycle; 64 | 65 | /* Vector Unit Params */ 66 | cycle_type _stat_vec_compute_cycle; 67 | cycle_type _stat_tot_vec_compute_cycle = 0; 68 | 69 | cycle_type _stat_systolic_active_cycle = 0; 70 | cycle_type _stat_tot_systolic_active_cycle = 0; 71 | double _stat_matmul_cycle = 0; 72 | double _stat_tot_matmul_cycle = 0; 73 | 74 | int _running_layer; 75 | uint32_t tile_rr = 0; 76 | std::deque> _tiles; 77 | std::queue> _finished_tiles; 78 | 79 | std::queue> _compute_pipeline; 80 | std::queue> _vector_pipeline; 81 | 82 | std::queue> _ld_inst_queue; 83 | std::queue> _st_inst_queue; 84 | std::queue> _ex_inst_queue; 85 | 86 | std::queue _request_queue; 87 | std::queue _response_queue; 88 | uint32_t _waiting_write_reqs; 89 | 90 | uint32_t _current_layer_id; 91 | uint32_t _current_fused_op_id; 92 | Sram _spad; 93 | Sram _acc_spad; 94 | }; -------------------------------------------------------------------------------- /src/Dram.h: -------------------------------------------------------------------------------- 1 | #ifndef DRAM_H 2 | #define DRAM_H 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "Common.h" 9 | #include "ramulator/Ramulator.hpp" 10 | #include "ramulator2.hh" 11 | 12 | 13 | class Dram { 14 | public: 15 | virtual ~Dram() = default; 16 | virtual bool running() = 0; 17 | virtual void cycle() = 0; 18 | virtual bool is_full(uint32_t cid, MemoryAccess* request) = 0; 19 | virtual void push(uint32_t cid, MemoryAccess* request) = 0; 20 | virtual bool is_empty(uint32_t cid) = 0; 21 | virtual MemoryAccess* top(uint32_t cid) = 0; 22 | virtual void pop(uint32_t cid) = 0; 23 | uint32_t get_channel_id(MemoryAccess* request); 24 | virtual void print_stat() {} 25 | 26 | protected: 27 | SimulationConfig _config; 28 | uint32_t _n_ch; 29 | cycle_type _cycles; 30 | }; 31 | 32 | class SimpleDram : public Dram { 33 | public: 34 | SimpleDram(SimulationConfig config); 35 | virtual bool running() override; 36 | virtual void cycle() override; 37 | virtual bool is_full(uint32_t cid, MemoryAccess* request) override; 
38 | virtual void push(uint32_t cid, MemoryAccess* request) override; 39 | virtual bool is_empty(uint32_t cid) override; 40 | virtual MemoryAccess* top(uint32_t cid) override; 41 | virtual void pop(uint32_t cid) override; 42 | 43 | private: 44 | uint32_t _latency; 45 | double _bandwidth; 46 | 47 | uint64_t _last_finish_cycle; 48 | std::vector>> _waiting_queue; 49 | std::vector> _response_queue; 50 | }; 51 | 52 | class DramRamulator : public Dram { 53 | public: 54 | DramRamulator(SimulationConfig config); 55 | 56 | virtual bool running() override; 57 | virtual void cycle() override; 58 | virtual bool is_full(uint32_t cid, MemoryAccess* request) override; 59 | virtual void push(uint32_t cid, MemoryAccess* request) override; 60 | virtual bool is_empty(uint32_t cid) override; 61 | virtual MemoryAccess* top(uint32_t cid) override; 62 | virtual void pop(uint32_t cid) override; 63 | virtual void print_stat() override; 64 | 65 | private: 66 | std::unique_ptr _mem; 67 | robin_hood::unordered_flat_map _waiting_mem_access; 68 | std::queue _responses; 69 | 70 | std::vector _total_processed_requests; 71 | std::vector _processed_requests; 72 | }; 73 | 74 | class DramRamulator2 : public Dram { 75 | public: 76 | DramRamulator2(SimulationConfig config); 77 | 78 | virtual bool running() override; 79 | virtual void cycle() override; 80 | virtual bool is_full(uint32_t cid, MemoryAccess* request) override; 81 | virtual void push(uint32_t cid, MemoryAccess* request) override; 82 | virtual bool is_empty(uint32_t cid) override; 83 | virtual MemoryAccess* top(uint32_t cid) override; 84 | virtual void pop(uint32_t cid) override; 85 | virtual void print_stat() override; 86 | 87 | private: 88 | std::vector> _mem; 89 | int _tx_ch_log2; 90 | int _tx_log2; 91 | int _req_size; 92 | }; 93 | #endif 94 | -------------------------------------------------------------------------------- /src/Hashing.h: -------------------------------------------------------------------------------- 1 | // author: Mahmoud Khairy, (Purdue Univ) 2 | // email: abdallm@purdue.edu 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #ifndef HASHING_H 9 | #define HASHING_H 10 | 11 | typedef unsigned long long new_addr_type; 12 | 13 | unsigned ipoly_hash_function(new_addr_type higher_bits, unsigned index, 14 | unsigned bank_set_num); 15 | 16 | #endif -------------------------------------------------------------------------------- /src/Instruction.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | enum class Opcode { MOVIN, MOVOUT, GEMM_PRELOAD, GEMM, GEMM_WRITE, COMP, BAR }; 13 | 14 | #define SPAD_BASE 0x10000000 15 | #define ASPAD_BASE 0x20000000 16 | typedef uint64_t addr_type; 17 | typedef uint64_t cycle_type; 18 | 19 | class Instruction { 20 | public: 21 | Instruction(); 22 | std::string toString(); 23 | 24 | private: 25 | enum class Type { 26 | LD_INST, ST_INST, EXE_INST 27 | }; 28 | uint32_t id; 29 | Opcode opcode; 30 | Type type; 31 | size_t tile_size; 32 | cycle_type start_cycle; 33 | cycle_type finish_cycle; 34 | std::vector dependent_ids; 35 | std::string dest_id; 36 | addr_type spad_addr; 37 | uint32_t spad_size; 38 | std::vector dram_addrs; 39 | }; -------------------------------------------------------------------------------- /src/Interconnect.h: -------------------------------------------------------------------------------- 1 | #ifndef INTERCONNECT_H 2 | #define INTERCONNECT_H 3 | 
#include "Common.h" 4 | #include "booksim2/Interconnect.hpp" 5 | #include "helper/HelperFunctions.h" 6 | 7 | class Interconnect { 8 | public: 9 | virtual ~Interconnect() = default; 10 | virtual bool running() = 0; 11 | virtual void cycle() = 0; 12 | virtual void push(uint32_t src, uint32_t dest, MemoryAccess* request) = 0; 13 | virtual bool is_full(uint32_t src, MemoryAccess* request) = 0; 14 | virtual bool is_empty(uint32_t nid) = 0; 15 | virtual MemoryAccess* top(uint32_t nid) = 0; 16 | virtual void pop(uint32_t nid) = 0; 17 | virtual void print_stats() = 0; 18 | 19 | protected: 20 | SimulationConfig _config; 21 | uint32_t _n_nodes; 22 | uint64_t _cycles; 23 | }; 24 | 25 | // Simple without conflict interconnect 26 | class SimpleInterconnect : public Interconnect { 27 | public: 28 | SimpleInterconnect(SimulationConfig config); 29 | virtual bool running() override; 30 | virtual void cycle() override; 31 | virtual void push(uint32_t src, uint32_t dest, 32 | MemoryAccess* request) override; 33 | virtual bool is_full(uint32_t src, MemoryAccess* request) override; 34 | virtual bool is_empty(uint32_t nid) override; 35 | virtual MemoryAccess* top(uint32_t nid) override; 36 | virtual void pop(uint32_t nid) override; 37 | virtual void print_stats() override {} 38 | 39 | private: 40 | uint32_t _latency; 41 | double _bandwidth; 42 | uint32_t _rr_start; 43 | uint32_t _buffer_size; 44 | 45 | struct Entity { 46 | cycle_type finish_cycle; 47 | uint32_t dest; 48 | MemoryAccess* access; 49 | }; 50 | 51 | std::vector> _in_buffers; 52 | std::vector> _out_buffers; 53 | std::vector _busy_node; 54 | }; 55 | 56 | class Booksim2Interconnect : public Interconnect { 57 | public: 58 | Booksim2Interconnect(SimulationConfig config); 59 | virtual bool running() override; 60 | virtual void cycle() override; 61 | virtual void push(uint32_t src, uint32_t dest, 62 | MemoryAccess* request) override; 63 | virtual bool is_full(uint32_t src, MemoryAccess* request) override; 64 | virtual bool is_empty(uint32_t nid) override; 65 | virtual MemoryAccess* top(uint32_t nid) override; 66 | virtual void pop(uint32_t nid) override; 67 | virtual void print_stats() override; 68 | 69 | private: 70 | uint32_t _ctrl_size; 71 | std::string _config_path; 72 | std::unique_ptr _booksim; 73 | 74 | booksim2::Interconnect::Type get_booksim_type(MemoryAccess* access); 75 | uint32_t get_packet_size(MemoryAccess* access); 76 | }; 77 | #endif -------------------------------------------------------------------------------- /src/Model.h: -------------------------------------------------------------------------------- 1 | #ifndef INSTRUCTION_H 2 | #define INSTRUCTION_H 3 | 4 | #include "Common.h" 5 | #include "helper/HelperFunctions.h" 6 | #include "operations/Operation.h" 7 | #include "Tensor.h" 8 | #include "Mapping.h" 9 | class Model { 10 | public: 11 | Model(std::string onnx_path, json model_config, SimulationConfig config, std::string name, MappingTable& map); 12 | Model(json model_config, SimulationConfig config, std::string name); 13 | virtual ~Model() = default; 14 | uint32_t get_id() { return _id; } 15 | json get_model_config() { return _model_config; } 16 | Tensor* get_tensor(uint32_t id); 17 | Tensor* find_tensor(std::string name); 18 | uint32_t get_root_node_id() { return _root_node_id; } 19 | void add_tensor(std::unique_ptr tensor); 20 | void set_layer_finish(uint32_t id); 21 | 22 | std::string get_name() { return _name; } 23 | uint32_t executable_layer_size(); 24 | Operation* get_executable_tile(); 25 | uint64_t get_request_time() const { 
return _request_time; } 26 | void set_request_time(uint64_t request_time) { _request_time=request_time; } 27 | uint64_t get_start_time() const { return _start_time; } 28 | void update_start_time(uint64_t start_time); 29 | bool check_finish(); 30 | uint32_t get_partition_id() { return _partition_id; } 31 | 32 | virtual bool check_language_model() { return false; } 33 | virtual bool check_regressive(); 34 | virtual void prepare_regressive(); 35 | 36 | virtual void initialize_model(std::vector>& weight_table); 37 | virtual void initialize_weight(std::vector>& weight_table); 38 | protected: 39 | 40 | uint32_t _id; 41 | MappingTable _mapping_table; 42 | json _model_config; 43 | std::string _onnx_path; 44 | std::string _name; 45 | uint32_t _root_node_id; 46 | std::map> _operation_map; 47 | std::map> _tensor_map; 48 | std::map _axis_map; 49 | std::vector _executable_layer; 50 | SimulationConfig _config; 51 | uint32_t _partition_id = 0; 52 | uint32_t _target_core = 0; 53 | 54 | /* Number of simulating attention block */ 55 | int nr_skip = 0; // NR_SKIP == 2 * NR_ATTEN 56 | uint64_t _request_time = 0; // pico second 57 | uint64_t _start_time = 0; // pico second 58 | bool _started = false; 59 | bool check_exist_in_exeutable(uint32_t id); 60 | }; 61 | 62 | #endif -------------------------------------------------------------------------------- /src/SimulationConfig.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | using json = nlohmann::json; 7 | 8 | enum class CoreType { SYSTOLIC_OS, SYSTOLIC_WS }; 9 | 10 | enum class DramType { SIMPLE, RAMULATOR1, RAMULATOR2 }; 11 | 12 | enum class IcntType { SIMPLE, BOOKSIM2 }; 13 | 14 | struct CoreConfig { 15 | CoreType core_type; 16 | uint32_t core_width; 17 | uint32_t core_height; 18 | 19 | /* Vector config*/ 20 | uint32_t vector_process_bit; 21 | uint32_t layernorm_latency = 1; 22 | uint32_t softmax_latency = 1; 23 | uint32_t add_latency = 1; 24 | uint32_t mul_latency = 1; 25 | uint32_t mac_latency = 1; 26 | uint32_t div_latency = 1; 27 | uint32_t exp_latency = 1; 28 | uint32_t gelu_latency = 1; 29 | uint32_t add_tree_latency = 1; 30 | uint32_t scalar_sqrt_latency = 1; 31 | uint32_t scalar_add_latency = 1; 32 | uint32_t scalar_mul_latency = 1; 33 | 34 | /* SRAM config */ 35 | uint32_t sram_width; 36 | uint32_t spad_size; 37 | uint32_t accum_spad_size; 38 | }; 39 | 40 | struct SimulationConfig { 41 | /* Core config */ 42 | uint32_t num_cores; 43 | uint32_t core_freq; 44 | uint32_t core_print_interval; 45 | struct CoreConfig *core_config; 46 | 47 | /* DRAM config */ 48 | DramType dram_type; 49 | uint32_t dram_freq; 50 | uint32_t dram_channels; 51 | uint32_t dram_req_size; 52 | uint32_t dram_latency; 53 | uint32_t dram_size; // in GB 54 | uint32_t dram_nbl = 1; // busrt length in clock cycles (bust_length 8 in DDR -> 4 nbl) 55 | uint32_t dram_print_interval; 56 | std::string dram_config_path; 57 | 58 | /* ICNT config */ 59 | IcntType icnt_type; 60 | std::string icnt_config_path; 61 | uint32_t icnt_freq; 62 | uint32_t icnt_latency; 63 | uint32_t icnt_print_interval=0; 64 | 65 | /* Sheduler config */ 66 | std::string scheduler_type; 67 | 68 | /* Other configs */ 69 | uint32_t precision; 70 | uint32_t full_precision = 4; 71 | std::string layout; 72 | 73 | /* 74 | * This map stores the partition information: 75 | * 76 | * Note: Each core belongs to one partition. Through these partition IDs, 77 | * it is possible to assign a specific DNN model to a particular group of cores. 
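*
* Illustrative example (hypothetical values, not taken from any shipped config):
* on a four-core chip split into two partitions,
*     partition 0 -> cores {0, 1}   (could be dedicated to one DNN model)
*     partition 1 -> cores {2, 3}   (could be dedicated to another)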
78 | */ 79 | std::map> partiton_map; 80 | 81 | uint64_t align_address(uint64_t addr) { 82 | return addr - (addr % dram_req_size); 83 | } 84 | 85 | float max_systolic_flops(uint32_t id) { 86 | return core_config[id].core_width * core_config[id].core_height * core_freq * 2 * num_cores / 1000; // GFLOPS 87 | } 88 | 89 | float max_vector_flops(uint32_t id) { 90 | return (core_config[id].vector_process_bit >> 3) / precision * 2 * core_freq / 1000; // GFLOPS 91 | } 92 | 93 | float max_dram_bandwidth() { 94 | return dram_freq * dram_channels * dram_req_size / dram_nbl / 1000; // GB/s 95 | } 96 | 97 | }; -------------------------------------------------------------------------------- /src/Simulator.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "Common.h" 4 | #include "Core.h" 5 | #include "Dram.h" 6 | #include "Interconnect.h" 7 | #include "Model.h" 8 | #include "scheduler/Scheduler.h" 9 | #include "scheduler/LanguageScheduler.h" 10 | #include 11 | 12 | #define CORE_MASK 0x1 << 1 13 | #define DRAM_MASK 0x1 << 2 14 | #define ICNT_MASK 0x1 << 3 15 | 16 | class Simulator { 17 | public: 18 | Simulator(SimulationConfig config, bool language_mode); 19 | void register_model(std::unique_ptr model); 20 | void register_language_model(json info, std::unique_ptr model); 21 | void finish_language_model(uint32_t model_id); 22 | void run_simulator(); 23 | const double get_tile_ops(); 24 | const size_t get_number_tile() { return _tile_timestamp.size(); } 25 | // void run_offline(std::string model_name, uint32_t sample_count); 26 | // void run_multistream(std::string model_name, uint32_t sample_count, 27 | // uint32_t ); void run_server(std::string trace_path); 28 | private: 29 | void cycle(); 30 | bool running(); 31 | void set_cycle_mask(); 32 | void handle_model(); 33 | uint32_t get_dest_node(MemoryAccess* access); 34 | SimulationConfig _config; 35 | uint32_t _n_cores; 36 | uint32_t _n_memories; 37 | uint32_t _memory_req_size; 38 | 39 | // Components 40 | std::vector> _cores; 41 | std::unique_ptr _icnt; 42 | std::unique_ptr _dram; 43 | std::unique_ptr _scheduler; 44 | 45 | // period information (ps) 46 | uint64_t _core_period; 47 | uint64_t _icnt_period; 48 | uint64_t _dram_period; 49 | // 50 | uint64_t _core_time; 51 | uint64_t _icnt_time; 52 | uint64_t _dram_time; 53 | 54 | addr_type _dram_ch_stride_size; 55 | 56 | uint64_t _core_cycles; 57 | 58 | uint32_t _cycle_mask; 59 | bool _single_run; 60 | bool _language_mode; 61 | std::unique_ptr _lang_scheduler; 62 | 63 | // Icnt stat 64 | uint64_t _nr_from_core=0; 65 | uint64_t _nr_to_core=0; 66 | uint64_t _nr_from_mem=0; 67 | uint64_t _nr_to_mem=0; 68 | cycle_type _icnt_cycle=0; 69 | uint64_t _icnt_interval=0; 70 | 71 | struct CompareModel { 72 | bool operator()(const std::unique_ptr& a, const std::unique_ptr& b) const { 73 | return a->get_request_time() > b->get_request_time(); 74 | } 75 | }; 76 | robin_hood::unordered_map>> _weight_table; 78 | std::vector> _models; 79 | robin_hood::unordered_map> _language_models; 80 | std::vector> _tile_timestamp; 81 | 82 | bool check_defined_model(std::string model_name); 83 | }; -------------------------------------------------------------------------------- /src/Sram.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Common.h" 3 | 4 | class Sram { 5 | public: 6 | Sram(SimulationConfig config, const cycle_type& core_cycle, bool accum, uint32_t core_id); 7 | 8 | bool check_hit(addr_type address, 
int buffer_id); 9 | bool check_full(int buffer_id); 10 | bool check_remain(size_t size, int buffer_id); 11 | bool check_allocated(addr_type address, int buffer_id); 12 | 13 | void cycle(); 14 | void flush(int buffer_id); 15 | int prefetch(addr_type address, int buffer_id, size_t allocated_size, size_t count); 16 | void count_up(addr_type, int buffer_id); 17 | void fill(addr_type address, int buffer_id); 18 | int get_size() { return _size; } 19 | int get_current_size(int buffer_id) { return _current_size[buffer_id]; } 20 | void print_all(int buffer_id); 21 | private: 22 | struct SramEntry { 23 | bool valid; 24 | addr_type address; 25 | size_t size; 26 | size_t remain_req_count; 27 | cycle_type timestamp; 28 | }; 29 | 30 | int _size; 31 | int _data_width; 32 | int _current_size[2]; 33 | bool _accum; 34 | 35 | const cycle_type& _core_cycle; 36 | 37 | robin_hood::unordered_map _cache_table[2]; 38 | }; 39 | -------------------------------------------------------------------------------- /src/Stat.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | 7 | typedef struct { 8 | uint64_t start_cycle; 9 | uint64_t cycles; 10 | uint64_t compute_cycles; 11 | uint64_t memory_stall; 12 | uint64_t dependency_stall; 13 | uint64_t sram_reads; 14 | uint64_t sram_writes; 15 | } TileStat; 16 | 17 | typedef struct { 18 | uint64_t op_cycles; 19 | std::vector tile_stats; 20 | } OpStat; 21 | 22 | typedef struct { 23 | uint64_t total_cycles; 24 | std::vector op_stats; 25 | } ModelStat; 26 | -------------------------------------------------------------------------------- /src/SystolicOS.cc: -------------------------------------------------------------------------------- 1 | #include "SystolicOS.h" 2 | 3 | SystolicOS::SystolicOS(uint32_t id, SimulationConfig config) 4 | : Core(id, config) {} 5 | 6 | void SystolicOS::cycle() { 7 | // Todo: Impement this; 8 | assert(0); 9 | } 10 | 11 | cycle_type SystolicOS::get_inst_compute_cycles(std::unique_ptr& inst) { 12 | return _config.core_config[_id].core_height + _config.core_config[_id].core_width - 2 + inst->size; 13 | } -------------------------------------------------------------------------------- /src/SystolicOS.h: -------------------------------------------------------------------------------- 1 | #include "Core.h" 2 | 3 | class SystolicOS : public Core { 4 | public: 5 | SystolicOS(uint32_t id, SimulationConfig config); 6 | virtual void cycle() override; 7 | protected: 8 | virtual cycle_type get_inst_compute_cycles(std::unique_ptr& inst); 9 | }; -------------------------------------------------------------------------------- /src/SystolicWS.h: -------------------------------------------------------------------------------- 1 | #include "Core.h" 2 | 3 | class SystolicWS : public Core { 4 | public: 5 | SystolicWS(uint32_t id, SimulationConfig config); 6 | virtual void cycle() override; 7 | virtual void print_stats() override; 8 | 9 | protected: 10 | virtual bool can_issue_compute(std::unique_ptr& inst) override; 11 | virtual cycle_type get_inst_compute_cycles(std::unique_ptr& inst) override; 12 | uint32_t _stat_systolic_inst_issue_count = 0; 13 | uint32_t _stat_systolic_preload_issue_count = 0; 14 | cycle_type get_vector_compute_cycles(std::unique_ptr& inst); 15 | }; -------------------------------------------------------------------------------- /src/Tensor.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Common.h" 3 | 4 
| class Model; 5 | class Operation; 6 | 7 | class Tensor { 8 | public: 9 | Tensor(uint32_t src_node, onnx::TensorProto &tensor_proto, int precision, bool produced); 10 | Tensor(uint32_t src_node, std::string name, std::vector &dims, 11 | int precision, bool produced); 12 | Tensor(uint32_t src_node, std::string name, int precision); 13 | Tensor(const Tensor &tensor); 14 | 15 | void define_tensor(addr_type address, std::vector &dims); 16 | void redefine_tensor(uint32_t src_node, std::vector &dims); 17 | void resize_tensor(std::vector &dims); 18 | void add_child_node(Operation *op); 19 | 20 | uint32_t get_id() { return _id; } 21 | std::string get_name() { return _name; } 22 | uint32_t get_src_node() { return _src_node; } 23 | std::vector get_dims() { return _dims; } 24 | void set_produced() { _produced = true; } 25 | bool get_produced() { return _produced; } 26 | uint32_t num_child_nodes() { return _child_nodes.size(); } 27 | uint32_t get_child_node(uint32_t id) { return _child_nodes[id]; } 28 | 29 | void allocate_tensor(int precision); 30 | addr_type get_address() { return _address; } 31 | uint64_t get_size() { return _size; } 32 | void print_tensor(); 33 | 34 | private: 35 | bool _temporal; 36 | uint32_t _precision; 37 | bool _produced; 38 | uint32_t _id; 39 | std::string _name; 40 | std::vector _dims; 41 | uint32_t _src_node; 42 | std::vector _child_nodes; 43 | addr_type _address; 44 | uint64_t _size; 45 | friend Model; 46 | }; -------------------------------------------------------------------------------- /src/allocator/AddressAllocator.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "../Common.h" 3 | 4 | class AddressAllocator { 5 | virtual addr_type allocate(std::vector shape, uint32_t data_size) = 0; 6 | }; -------------------------------------------------------------------------------- /src/helper/CommandLineParser.cc: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | This source code is licensed under the MIT license found in the 3 | LICENSE file in the root directory of this source tree. 4 | *******************************************************************************/ 5 | 6 | #include "CommandLineParser.h" 7 | 8 | namespace po = boost::program_options; 9 | 10 | void CommandLineParser::parse(int argc, char** argv) noexcept(false) { 11 | po::store(po::parse_command_line(argc, argv, options_description), 12 | variables_map); 13 | po::notify(variables_map); 14 | } 15 | 16 | void CommandLineParser::print_help_message_if_required() const noexcept { 17 | if (variables_map.count("help") > 0) { 18 | std::cout << options_description << std::endl; 19 | exit(0); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/helper/HelperFunctions.h: -------------------------------------------------------------------------------- 1 | #ifndef HELPER_FUNCTIONS_H 2 | #define HELPER_FUNCTIONS_H 3 | 4 | #define MIN(x, y) (((x) > (y)) ? (y) : (x)) 5 | #define MIN3(x, y, z) MIN(MIN(x, y), z) 6 | #define MAX(x, y) (((x) > (y)) ? 
(x) : (y)) 7 | #define GB * 1024 * 1024 * 1024 8 | #define MHz * 1000 * 1000 9 | 10 | #endif -------------------------------------------------------------------------------- /src/operations/AdaptiveAvgPool.cc: -------------------------------------------------------------------------------- 1 | #include "AdaptiveAvgPool.h" 2 | 3 | #include "../Model.h" 4 | #include "../Tensor.h" 5 | 6 | AdaptiveAvgPool::AdaptiveAvgPool(SimulationConfig config, Model* model, 7 | onnx::NodeProto& node_proto, uint32_t target_core) 8 | : Operation(config, model, node_proto, target_core) { 9 | int kernel_dim = 0; 10 | for (auto attribute : node_proto.attribute()) { 11 | if (attribute.name() == "kernel_shape") { 12 | spdlog::trace(" kernel_shape {}", attribute.ints_size()); 13 | for (int i = 0; i < attribute.ints_size(); i++) { 14 | _kernel_shape.push_back(attribute.ints(i)); 15 | } 16 | kernel_dim = attribute.ints_size(); 17 | } else if (attribute.name() == "strides") { 18 | spdlog::trace("stride_shape {}", attribute.ints_size()); 19 | for (int i = 0; i < attribute.ints_size(); i++) { 20 | _strides.push_back(attribute.ints(i)); 21 | } 22 | } 23 | } 24 | 25 | /* We assume AdaptiveAvgPool2d */ 26 | assert(kernel_dim == 2); 27 | std::vector<uint32_t> input_shape = get_input(0)->get_dims(); 28 | std::vector<uint32_t> output_shape = input_shape; 29 | 30 | /* Assuming input H and W sizes are multiples of output H and W */ 31 | assert(!(input_shape[Hdim] % _kernel_shape[0]) && 32 | !(input_shape[Wdim] % _kernel_shape[1])); 33 | 34 | output_shape[Hdim] = (input_shape[Hdim] - _kernel_shape[0]) / _strides[0] + 1; 35 | output_shape[Wdim] = (input_shape[Wdim] - _kernel_shape[1]) / _strides[1] + 1; 36 | 37 | spdlog::trace("output name : {} {}", node_proto.output(0).c_str(), 38 | output_shape); 39 | 40 | Tensor* predefined_tensor = _model->find_tensor(node_proto.output(0)); 41 | if (predefined_tensor == nullptr) { 42 | std::unique_ptr<Tensor> output_tensor = std::make_unique<Tensor>( 43 | _id, node_proto.output(0), output_shape, _config.precision, false); 44 | _outputs.push_back(output_tensor.get()->get_id()); 45 | _model->add_tensor(std::move(output_tensor)); 46 | } else { 47 | predefined_tensor->redefine_tensor(_id, output_shape); 48 | } 49 | } 50 | 51 | AdaptiveAvgPool::AdaptiveAvgPool(const AdaptiveAvgPool& src) : Operation(src) { 52 | _kernel_shape = src._kernel_shape; 53 | _strides = src._strides; 54 | _skip = src._skip; 55 | } 56 | 57 | void AdaptiveAvgPool::initialize_tiles(MappingTable& mapping_table) { 58 | spdlog::trace("initialize_tile {}", _name); 59 | std::vector<uint32_t> output_shape = get_output(0)->get_dims(); 60 | if (_skip) { 61 | _tiles.push_back(std::make_unique<Tile>(Tile{.status = Tile::Status::INITIALIZED, .skip = true})); 62 | return; 63 | } 64 | 65 | std::unique_ptr<Tile> tile = std::make_unique<Tile>(Tile{ 66 | .status = Tile::Status::INITIALIZED, 67 | .optype = "AdaptiveAvgPool", 68 | .layer_id = _id, 69 | .skip = true}); 70 | _tiles.push_back(std::move(tile)); 71 | initialize_instructions(_tiles.back().get(), Mapping{}); 72 | } 73 | 74 | void AdaptiveAvgPool::initialize_instructions(Tile* tile, Mapping mapping) { 75 | return; 76 | } -------------------------------------------------------------------------------- /src/operations/AdaptiveAvgPool.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "Operation.h" 4 | 5 | class AdaptiveAvgPool : public Operation { 6 | public: 7 | AdaptiveAvgPool(SimulationConfig config, Model* model, 8 | onnx::NodeProto& node_proto, uint32_t target_core=0); 9 |
AdaptiveAvgPool(const AdaptiveAvgPool& src); 10 | 11 | virtual void initialize_tiles(MappingTable& mapping_table) override; 12 | 13 | protected: 14 | virtual void initialize_instructions(Tile* tile, Mapping mapping); 15 | 16 | private: 17 | std::vector _kernel_shape; 18 | std::vector _strides; 19 | bool _skip = false; 20 | }; -------------------------------------------------------------------------------- /src/operations/Attention.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | //#include "../tensor/NPUTensor.h" 3 | #include "Operation.h" 4 | #include "GemmWS.h" 5 | 6 | class Attention : public Operation { 7 | public: 8 | Attention(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 9 | Attention(SimulationConfig config, Model* model, std::string name, std::map& attributes, uint32_t target_core=0); 10 | //std::vector> get_outputs(std::vector> inputs) override; 11 | 12 | uint32_t _batch_size; 13 | /* q,k,v shape : (nh,{1,l},dk) / (nh,{l,l+1},dk) / (nh,{l,l+1},dk) */ 14 | std::vector _query_shape; 15 | std::vector _key_shape; 16 | std::vector _value_shape; 17 | 18 | std::vector _weight_shape; 19 | std::vector _bias_shape; 20 | std::vector _mask_shape; 21 | std::vector _kv_cache_shape; 22 | std::vector _input_shape; 23 | std::vector _output_shape; 24 | std::vector _liner_output_shape; 25 | std::vector _projection_output_shape; 26 | 27 | GemmWS* _projection_node; 28 | uint32_t _seq; 29 | uint32_t _q_len; 30 | uint32_t _dmodel; 31 | uint32_t _nh; 32 | uint32_t _nkvh; 33 | uint32_t _dk; 34 | 35 | uint32_t _key_projection_id; 36 | uint32_t _query_projection_id; 37 | uint32_t _value_projection_id; 38 | /* For kv cache */ 39 | bool onnx = false; 40 | bool has_kv_cache = false; 41 | bool use_fused = true; 42 | bool need_scale = false; 43 | 44 | std::vector _heads_per_tile; 45 | std::vector _tiles_per_head; 46 | std::vector _scale_tiles_per_head; 47 | 48 | void calculate_loops(); 49 | void calculate_loops(Mapping& mapping); 50 | 51 | //void initialize_tiles(); 52 | //void initialize_instructions(Tile &tile, int req_idx, int head_idx, int num_heads); 53 | void initialize_tiles(MappingTable& mapping_table) override; 54 | void initialize_onnx_tiles(MappingTable& mapping_table); 55 | void initialize_non_fused_tiles(MappingTable& mapping_table); 56 | void initialize_instructions(Tile* tile, Mapping mapping, int head_idx, int num_heads); 57 | void initialize_instructions(Tile* tile, int head_idx, int num_heads); 58 | 59 | void initialize_scale_instructions(Tile* tile, Mapping mapping, int head_idx, int num_tiles, int query_idx, int num_queries); 60 | protected: 61 | uint32_t sram_size_needed(); 62 | }; -------------------------------------------------------------------------------- /src/operations/BiasAct.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Operation.h" 3 | 4 | class BiasAct : public Operation { 5 | public: 6 | BiasAct(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 7 | BiasAct(SimulationConfig config, Model* model, std::string name, 8 | std::map& attributes, uint32_t target_core=0); 9 | 10 | void initialize_tiles(MappingTable& mapping_table) override; 11 | 12 | private: 13 | void calculate_loops(); 14 | void initialize_instructions(Tile* tile, Mapping mapping, 15 | uint32_t token_offset, uint32_t tokens); 16 | std::vector _bias_shape; 17 | 18 | std::vector _input_shape; 19 | std::vector 
_output_shape; 20 | 21 | uint32_t _batch_size; 22 | uint32_t _seq; 23 | uint32_t _dk; 24 | uint32_t _tokens_per_tile; 25 | bool _llama_mlp; 26 | bool _use_bias; 27 | Opcode _activation; 28 | }; -------------------------------------------------------------------------------- /src/operations/BiasGelu.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Operation.h" 3 | 4 | class BiasGelu : public Operation { 5 | public: 6 | BiasGelu(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 7 | BiasGelu(SimulationConfig config, Model* model, std::string name, std::map& attributes, uint32_t target_core=0); 8 | 9 | std::vector _bias_shape; 10 | 11 | std::vector _input_shape; 12 | std::vector _output_shape; 13 | 14 | uint32_t _batch_size; 15 | uint32_t _seq; 16 | uint32_t _dk; 17 | uint32_t _tokens_per_tile; 18 | 19 | void calculate_loops(); 20 | void initialize_tiles(MappingTable& mapping_table) override; 21 | void initialize_instructions(Tile* tile, Mapping mapping, uint32_t token_offset, uint32_t tokens); 22 | }; -------------------------------------------------------------------------------- /src/operations/Concat.cc: -------------------------------------------------------------------------------- 1 | /*TODO: implement this */ 2 | #include "Concat.h" 3 | 4 | #include "../Model.h" 5 | #include "../Tensor.h" 6 | 7 | Concat::Concat(SimulationConfig config, Model* model, 8 | onnx::NodeProto& node_proto, uint32_t target_core) 9 | : Operation(config, model, node_proto, target_core) { 10 | for (auto attribute : node_proto.attribute()) { 11 | if (attribute.name() == "axis") { 12 | spdlog::trace("concat axis {}", attribute.ints(0)); 13 | _axis = attribute.ints(0); 14 | } 15 | } 16 | 17 | assert(_axis>=0 && _axis<4); 18 | std::vector output_shape; 19 | std::vector input0_shape = get_input(0)->get_dims(); 20 | std::vector input1_shape = get_input(1)->get_dims(); 21 | output_shape.resize(input0_shape.size()); 22 | for (int i = 0; i < input0_shape.size(); i++) { 23 | if (i == _axis) 24 | continue; 25 | assert(input0_shape[i] == input1_shape[i]); 26 | output_shape[i] = input0_shape[i]; 27 | } 28 | output_shape[_axis] = input0_shape[_axis] + input1_shape[_axis]; 29 | 30 | spdlog::trace("output name : {} {}", node_proto.output(0).c_str(), 31 | output_shape); 32 | Tensor* predefined_tensor = _model->find_tensor(node_proto.output(0)); 33 | if (predefined_tensor == nullptr) { 34 | std::unique_ptr output_tensor = std::make_unique( 35 | _id, node_proto.output(0), output_shape, _config.precision, false); 36 | _outputs.push_back(output_tensor.get()->get_id()); 37 | _model->add_tensor(std::move(output_tensor)); 38 | } else { 39 | predefined_tensor->redefine_tensor(_id, output_shape); 40 | } 41 | } 42 | 43 | Concat::Concat(const Concat& src) : Operation(src) { 44 | _axis = src._axis; 45 | } 46 | 47 | Concat::Concat(SimulationConfig config, Model* model, 48 | std::string name, std::map &attributes, uint32_t target_core) 49 | : Operation(config, model, name, attributes, target_core) { 50 | //TODO:implement this 51 | _axis = std::stoi(get_attribute("axis")); 52 | } 53 | 54 | void Concat::initialize_tiles(MappingTable& mapping_table) { 55 | if(_outputs.size() == 0) { 56 | std::vector output_shape = _model->get_tensor(_inputs[0])->get_dims(); 57 | output_shape[_axis] = 0; 58 | for(uint32_t input : _inputs) { 59 | Tensor* tensor = _model->get_tensor(input); 60 | output_shape[_axis] += tensor->get_dims()[_axis]; 61 | } 62 | 
auto output_tensor = std::make_unique(_id, name_gen(_name, "output"), output_shape, _config.precision, false); 63 | _outputs.push_back(output_tensor->get_id()); 64 | _model->add_tensor(std::move(output_tensor)); 65 | } 66 | spdlog::trace("initialize_tile {} ", _name); 67 | std::unique_ptr tile = std::make_unique(Tile{ 68 | .status = Tile::Status::INITIALIZED, 69 | .optype = "Concat", 70 | .layer_id = _id, 71 | .skip = true 72 | }); 73 | _tiles.push_back(std::move(tile)); 74 | } 75 | 76 | void Concat::initialize_instructions(Tile* tile, Mapping mapping) { 77 | } 78 | 79 | -------------------------------------------------------------------------------- /src/operations/Concat.h: -------------------------------------------------------------------------------- 1 | /*TODO: implement this */ 2 | #pragma once 3 | 4 | #include "Operation.h" 5 | 6 | class Concat : public Operation { 7 | public: 8 | Concat(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 9 | Concat(const Concat& src); 10 | Concat(SimulationConfig config, Model* model, std::string name, 11 | std::map& attributes, uint32_t target_core=0); 12 | virtual void initialize_tiles(MappingTable& mapping_table) override; 13 | virtual void initialize_instructions(Tile* tile, Mapping mapping) override; 14 | protected: 15 | 16 | private: 17 | // std::vector _kernel_shape; 18 | // std::vector _strides; 19 | // std::vector _dilations; 20 | // std::vector _pads; 21 | 22 | uint32_t _axis; 23 | }; -------------------------------------------------------------------------------- /src/operations/Conv.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "Operation.h" 4 | 5 | struct convInfo{ 6 | std::vector kernel_shape; 7 | std::vector strides; 8 | std::vector dilations; 9 | std::vector pads; 10 | std::vector input_shape; 11 | std::vector weight_shape; 12 | std::vector conv_out_shape; 13 | std::vector pool_out_shape; 14 | uint32_t group; 15 | bool activation_fused; 16 | std::string activation_type; 17 | bool bathnorm_fused; 18 | bool skip_connection_fused; 19 | bool pool_fused; 20 | std::string pool_type; 21 | std::vector pool_kernel_shape; 22 | std::vector pool_strides; 23 | std::vector pool_pads; 24 | }; 25 | 26 | class Conv : public Operation { 27 | public: 28 | Conv(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 29 | Conv(const Conv& src); 30 | Conv(SimulationConfig config, MappingTable& mapping_table, convInfo info, uint32_t target_core=0); 31 | // virtual void initialize_tiles(MappingTable& mapping_table) override; 32 | protected: 33 | virtual void im2col_nhwc(); 34 | // void init(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 35 | 36 | protected: 37 | std::vector _kernel_shape; 38 | std::vector _strides; 39 | std::vector _dilations; 40 | std::vector _pads; 41 | std::vector _input_shape; 42 | std::vector _weight_shape; 43 | std::vector _conv_out_shape; 44 | std::vector _pool_out_shape; 45 | uint32_t _group; 46 | bool _activation_fused; 47 | std::string _activation_type; 48 | bool _bathnorm_fused; 49 | bool _skip_connection_fused; 50 | bool _pool_fused; 51 | std::string _pool_type; 52 | std::vector _pool_kernel_shape; 53 | std::vector _pool_strides; 54 | std::vector _pool_pads; 55 | 56 | }; -------------------------------------------------------------------------------- /src/operations/ConvOS.h: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "Conv.h" 4 | 5 | class ConvOS : public Conv { 6 | public: 7 | ConvOS(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 8 | ConvOS(const Conv& src); 9 | 10 | virtual void initialize_tiles(MappingTable& mapping_table) override; 11 | protected: 12 | virtual void initialize_instructions(Tile* tile, Mapping mapping); 13 | void init(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 14 | }; -------------------------------------------------------------------------------- /src/operations/ConvWS.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "Conv.h" 4 | 5 | class ConvWS : public Conv { 6 | public: 7 | ConvWS(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 8 | ConvWS(const Conv& src); 9 | ConvWS(SimulationConfig config, MappingTable& mapping_table, convInfo info); 10 | virtual void initialize_tiles(MappingTable& mapping_table) override; 11 | 12 | protected: 13 | virtual void initialize_instructions(Tile* tile, Mapping mapping); 14 | virtual void initialize_matmul_instructions(Tile* tile); 15 | virtual addr_type make_weight_address(uint32_t S, uint32_t R, uint32_t M, uint32_t C, 16 | std::vector<uint32_t> shape); 17 | virtual addr_type make_activation_address(uint32_t N, uint32_t H, uint32_t W, 18 | uint32_t C, std::vector<uint32_t> shape); 19 | void init(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 20 | Instruction make_weight_instruction(int m_offset, int s_offset, int r_offset, 21 | int c_offset, Mapping mapping); 22 | Instruction make_input_instruction(int m_offset, int s_offset, int r_offset, 23 | int c_offset, Mapping mapping); 24 | }; -------------------------------------------------------------------------------- /src/operations/Dummy.cc: -------------------------------------------------------------------------------- 1 | #include "Dummy.h" 2 | #include "../Model.h" 3 | #include "../Tensor.h" 4 | 5 | Dummy::Dummy(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core) 6 | : Operation(config, model, node_proto, target_core) { 7 | _input_shape = get_input(0)->get_dims(); 8 | _output_shape = _input_shape; 9 | spdlog::trace("output_shape : {}", _output_shape); 10 | spdlog::trace("output name : {}", node_proto.output(0).c_str()); 11 | 12 | for (int i=0;i<node_proto.output().size();i++) { 13 | Tensor* pre_defind_tensor = _model->find_tensor(node_proto.output(i)); 14 | if (pre_defind_tensor == nullptr) { 15 | std::unique_ptr<Tensor> output_tensor = std::make_unique<Tensor>( 16 | _id, node_proto.output(i), _output_shape, _config.precision, false); 17 | _outputs.push_back(output_tensor.get()->get_id()); 18 | _model->add_tensor(std::move(output_tensor)); 19 | } else { 20 | pre_defind_tensor->redefine_tensor(_id, _output_shape); 21 | } 22 | } 23 | } 24 | 25 | void Dummy::initialize_tiles(MappingTable& mapping_table) { 26 | std::unique_ptr<Tile> tile = std::make_unique<Tile>(Tile{ 27 | .status = Tile::Status::INITIALIZED, 28 | .optype="Dummy", 29 | .layer_id=_id, 30 | .skip = true}); 31 | _tiles.push_back(std::move(tile)); 32 | initialize_instructions(_tiles.back().get(), Mapping{}); 33 | } 34 | 35 | void Dummy::initialize_instructions(Tile* tile, Mapping mapping) { 36 | } -------------------------------------------------------------------------------- /src/operations/Dummy.h:
-------------------------------------------------------------------------------- 1 | #pragma once 2 | //#include "../tensor/NPUTensor.h" 3 | #include "Operation.h" 4 | #include 5 | 6 | class Dummy: public Operation { 7 | public: 8 | Dummy(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 9 | 10 | std::vector _input_shape; 11 | std::vector _output_shape; 12 | void initialize_tiles(MappingTable& mapping_table); 13 | void initialize_instructions(Tile* tile, Mapping mapping); 14 | uint64_t _total_loop; 15 | uint32_t _element_in_tile; 16 | }; -------------------------------------------------------------------------------- /src/operations/EmbedLayerNorm.cc: -------------------------------------------------------------------------------- 1 | #include "EmbedLayerNorm.h" 2 | #include "../Model.h" 3 | #include "../Tensor.h" 4 | 5 | EmbedLayerNorm::EmbedLayerNorm(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core) 6 | : Operation(config, model, node_proto, target_core) { 7 | _input_shape = get_input(0)->get_dims(); 8 | _weight_shape = get_input(2)->get_dims(); 9 | 10 | assert(_input_shape.size()==2); 11 | _output_shape.push_back(_input_shape.at(0)); 12 | _output_shape.push_back(_input_shape.at(1)); 13 | _output_shape.push_back(_weight_shape.at(1)); 14 | spdlog::trace("output_shape : {}", _output_shape); 15 | 16 | Tensor* embed_output = _model->find_tensor(node_proto.output(0)); 17 | if (embed_output == nullptr) { 18 | std::unique_ptr output_tensor = std::make_unique( 19 | _id, node_proto.output(0), _output_shape, _config.precision, false); 20 | _outputs.push_back(output_tensor.get()->get_id()); 21 | _model->add_tensor(std::move(output_tensor)); 22 | } else { 23 | embed_output->redefine_tensor(_id, _output_shape); 24 | } 25 | 26 | /* mask */ 27 | Tensor* mask_output = _model->find_tensor(node_proto.output(1)); 28 | if (mask_output == nullptr) { 29 | std::unique_ptr output_tensor = std::make_unique( 30 | _id, node_proto.output(1), _output_shape, _config.precision, false); 31 | _outputs.push_back(output_tensor.get()->get_id()); 32 | _model->add_tensor(std::move(output_tensor)); 33 | } else { 34 | mask_output->redefine_tensor(_id, _output_shape); 35 | } 36 | if (node_proto.output().size()==3) { 37 | Tensor* embed_sum = _model->find_tensor(node_proto.output(2)); 38 | if (embed_sum == nullptr) { 39 | std::unique_ptr output_tensor = std::make_unique( 40 | _id, node_proto.output(2), _output_shape, _config.precision, false); 41 | _outputs.push_back(output_tensor.get()->get_id()); 42 | _model->add_tensor(std::move(output_tensor)); 43 | } else { 44 | embed_sum->redefine_tensor(_id, _output_shape); 45 | } 46 | } 47 | } 48 | 49 | void EmbedLayerNorm::initialize_tiles(MappingTable& mapping_table) { 50 | std::unique_ptr tile = std::make_unique(Tile{ 51 | .status = Tile::Status::INITIALIZED, 52 | .optype="EmbedLayerNorm", 53 | .layer_id=_id, 54 | .skip=true}); 55 | _tiles.push_back(std::move(tile)); 56 | } 57 | 58 | void EmbedLayerNorm::initialize_instructions(Tile* tile, Mapping mapping) { 59 | } -------------------------------------------------------------------------------- /src/operations/EmbedLayerNorm.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | //#include "../tensor/NPUTensor.h" 3 | #include "Operation.h" 4 | 5 | class EmbedLayerNorm: public Operation { 6 | public: 7 | EmbedLayerNorm(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t 
target_core=0); 8 | 9 | std::vector _input_shape; 10 | std::vector _output_shape; 11 | std::vector _weight_shape; 12 | std::vector _position_weight_shape; 13 | std::vector _token_type_weight; 14 | std::vector _ln_weight_shape; 15 | std::vector _ln_bias_shape; 16 | void initialize_tiles(MappingTable& mapping_table); 17 | void initialize_instructions(Tile* tile, Mapping mapping); 18 | protected: 19 | }; -------------------------------------------------------------------------------- /src/operations/Flatten.cc: -------------------------------------------------------------------------------- 1 | /*TODO: implement this */ 2 | #include "Flatten.h" 3 | 4 | #include "../Model.h" 5 | #include "../Tensor.h" 6 | 7 | Flatten::Flatten(SimulationConfig config, Model* model, 8 | onnx::NodeProto& node_proto, uint32_t target_core) 9 | : Operation(config, model, node_proto, target_core) { 10 | for (auto attribute : node_proto.attribute()) { 11 | if (attribute.name() == "axis") { 12 | spdlog::trace("flatten axis {}", attribute.i()); 13 | _axis = attribute.i(); 14 | } 15 | } 16 | 17 | assert(_axis >= 0 && _axis < 4); 18 | std::vector input_shape = get_input(0)->get_dims(); 19 | std::vector output_shape(_axis + 1, 1); 20 | 21 | for (int i = 0; i < input_shape.size(); i++) { 22 | if (i < _axis) { 23 | output_shape[i] = input_shape[i]; 24 | } else { 25 | output_shape[_axis] *= input_shape[i]; 26 | } 27 | } 28 | 29 | spdlog::trace("output name : {} {}", node_proto.output(0).c_str(), output_shape); 30 | 31 | Tensor* predefined_tensor = _model->find_tensor(node_proto.output(0)); 32 | if (predefined_tensor == nullptr) { 33 | std::unique_ptr output_tensor = std::make_unique( 34 | _id, node_proto.output(0), output_shape, _config.precision, false); 35 | _outputs.push_back(output_tensor.get()->get_id()); 36 | _model->add_tensor(std::move(output_tensor)); 37 | } else { 38 | predefined_tensor->redefine_tensor(_id, output_shape); 39 | } 40 | } 41 | 42 | Flatten::Flatten(const Flatten& src) : Operation(src) { _axis = src._axis; } 43 | 44 | void Flatten::initialize_tiles(MappingTable& mapping_table) { 45 | spdlog::trace("initialize_tile {}", _name); 46 | 47 | _tiles.push_back(std::make_unique(Tile{.status = Tile::Status::INITIALIZED, 48 | .optype = "Flatten", 49 | .layer_id = _id, 50 | .skip = true})); 51 | initialize_instructions(_tiles.back().get(), Mapping{}); 52 | } 53 | 54 | void Flatten::initialize_instructions(Tile* tile, Mapping mapping) { 55 | } 56 | -------------------------------------------------------------------------------- /src/operations/Flatten.h: -------------------------------------------------------------------------------- 1 | /*TODO: implement this */ 2 | #pragma once 3 | 4 | #include "Operation.h" 5 | 6 | class Flatten : public Operation { 7 | public: 8 | Flatten(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 9 | Flatten(const Flatten& src); 10 | virtual void initialize_tiles(MappingTable& mapping_table) override; 11 | virtual void initialize_instructions(Tile* tile, Mapping mapping) override; 12 | protected: 13 | 14 | private: 15 | uint32_t _axis; 16 | }; -------------------------------------------------------------------------------- /src/operations/Gemm.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Operation.h" 3 | 4 | class Gemm : public Operation { 5 | public: 6 | Gemm(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 7 | 
Gemm(SimulationConfig config, MappingTable& mapping_table, 8 | std::vector output_shape, std::vector input_shape, 9 | std::vector weight_shape, uint32_t target_core=0); 10 | Gemm(SimulationConfig config, Model* model, std::string name, 11 | std::map& attributes, uint32_t target_core=0); 12 | 13 | protected: 14 | 15 | std::vector _output_shape; 16 | std::vector _input_shape; 17 | std::vector _weight_shape; 18 | int _batch_size; 19 | 20 | private: 21 | uint32_t _alpha; 22 | uint32_t _beta; 23 | bool _transA; 24 | bool _transB; 25 | }; -------------------------------------------------------------------------------- /src/operations/GemmOS.cc: -------------------------------------------------------------------------------- 1 | #include "GemmOS.h" 2 | 3 | #include "../Model.h" 4 | 5 | GemmOS::GemmOS(SimulationConfig config, Model* model, 6 | onnx::NodeProto& node_proto, uint32_t target_core) 7 | : Gemm(config, model, node_proto, target_core) {} 8 | 9 | /* TODO : Implement this */ 10 | void GemmOS::initialize_tiles(MappingTable& mapping_table) { 11 | 12 | } -------------------------------------------------------------------------------- /src/operations/GemmOS.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Gemm.h" 3 | 4 | class GemmOS : public Gemm { 5 | public: 6 | GemmOS(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 7 | void initialize_tiles(MappingTable& mapping_table) override; 8 | private: 9 | }; -------------------------------------------------------------------------------- /src/operations/GemmWS.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Gemm.h" 3 | 4 | class GemmWS : public Gemm { 5 | public: 6 | GemmWS(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 7 | GemmWS(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, bool has_true, uint32_t target_core=0); 8 | GemmWS(SimulationConfig config, MappingTable& mapping_table, 9 | std::vector input_shape, std::vector weight_shape, 10 | std::vector output_shape, uint32_t target_core); 11 | GemmWS(SimulationConfig config, Model* model, std::string name, std::map& attribute, uint32_t target_core); 12 | virtual void initialize_tiles(MappingTable& mapping_table) override; 13 | bool has_bias = true; 14 | protected: 15 | virtual void initialize_instructions(Tile* tile, Mapping mapping); 16 | private: 17 | }; -------------------------------------------------------------------------------- /src/operations/GlobalAvgPool.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Operation.h" 3 | 4 | class GlobalAvgPool : public Operation { 5 | public: 6 | GlobalAvgPool(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 7 | GlobalAvgPool(const GlobalAvgPool& src); 8 | virtual void initialize_tiles(MappingTable& mapping_table) override; 9 | 10 | protected: 11 | virtual void initialize_instructions(Tile* tile, Mapping mapping) override; 12 | private: 13 | std::vector _kernel_shape; 14 | std::vector _strides; 15 | // std::vector _dilations; 16 | // std::vector _pads; 17 | }; -------------------------------------------------------------------------------- /src/operations/KVCacheConcat.h: -------------------------------------------------------------------------------- 1 | #ifndef KV_CACHE_CONCAT_H 2 | #define KV_CACHE_CONCAT_H 
3 | #include "Operation.h" 4 | 5 | class KVCacheConcat : public Operation { 6 | public: 7 | KVCacheConcat(SimulationConfig config, Model* model, 8 | onnx::NodeProto& node_proto, uint32_t target_core=0); 9 | KVCacheConcat(const KVCacheConcat& src); 10 | KVCacheConcat(SimulationConfig config, Model* model, std::string name, 11 | std::map& attributes, uint32_t target_core=0); 12 | void initialize_tiles(MappingTable& mapping_table) override; 13 | private: 14 | void calculate_loops(); 15 | void initialize_instructions(Tile* tile, uint32_t idx); 16 | 17 | uint32_t _num_batches; 18 | std::vector _input_token_lengths; 19 | uint32_t _num_kv_heads; 20 | uint32_t _num_attention_heads; 21 | uint32_t _hidden_size; 22 | uint32_t _cache_dim; 23 | uint32_t _outter_loops; 24 | uint32_t _inner_loops; 25 | }; 26 | 27 | #endif -------------------------------------------------------------------------------- /src/operations/MaxPool.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "Operation.h" 4 | 5 | class MaxPool : public Operation { 6 | public: 7 | MaxPool(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 8 | MaxPool(const MaxPool& src); 9 | virtual void initialize_tiles(MappingTable& mapping_table) override; 10 | virtual void initialize_instructions(Tile* tile, Mapping mapping) override; 11 | 12 | protected: 13 | // virtual void initialize_instructions(SimulationConfig config, Tile& tile) override; 14 | private: 15 | std::vector _kernel_shape; 16 | std::vector _strides; 17 | std::vector _dilations; 18 | std::vector _pads; 19 | }; -------------------------------------------------------------------------------- /src/operations/Operation.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "../Common.h" 4 | #include "../Mapping.h" 5 | #include "../Tensor.h" 6 | 7 | class Model; 8 | class OpParser; 9 | 10 | // Graph Node 11 | class Operation { 12 | public: 13 | Operation(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, 14 | uint32_t id, uint32_t target_core); 15 | Operation(SimulationConfig config, MappingTable& mapping_table, uint32_t target_core); 16 | Operation(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core); 17 | Operation(const Operation& operation); 18 | Operation(SimulationConfig config, Model* model, 19 | std::string name, std::map&attribute, uint32_t target_core); 20 | virtual ~Operation() = default; 21 | virtual void set_finish(); 22 | 23 | virtual std::string get_name() { return _name; } 24 | virtual std::string get_optype() { return _optype; } 25 | virtual uint32_t get_id() { return _id; } 26 | virtual uint32_t num_inputs() { return _inputs.size(); } 27 | virtual Tensor* get_input(int id); 28 | virtual void add_input(int id); 29 | virtual void add_output(int id); 30 | virtual uint32_t num_outputs() { return _outputs.size(); } 31 | virtual Tensor* get_output(int id); 32 | virtual void set_model(Model* model) { _model=model; } 33 | virtual std::vector get_child_nodes(); 34 | virtual std::deque>& get_tiles(); 35 | virtual void clear_tiles(); 36 | virtual void initialize_tiles(MappingTable& mapping_table) = 0; 37 | virtual bool check_executable(); 38 | bool check_finish() { return _finish; }; 39 | uint32_t target_core=0; // Targeted core id 40 | 41 | protected: 42 | virtual void initialize_instructions(Tile* tile, Mapping mapping) {} 43 | addr_type 
make_address(std::vector<uint32_t> index, std::vector<uint32_t> dims); 44 | addr_type get_operand_addr(uint32_t operand_id); 45 | std::string get_attribute(std::string key); 46 | protected: 47 | static const uint32_t _NO_OPERAND = 0; 48 | static const uint32_t _INPUT_OPERAND = 100; 49 | static const uint32_t _OUTPUT_OPERAND = 200; 50 | uint32_t _id; 51 | std::string _name; 52 | std::string _optype; 53 | SimulationConfig _config; 54 | Model* _model; 55 | onnx::NodeProto _proto; 56 | std::vector<uint32_t> _inputs; 57 | std::vector<uint32_t> _outputs; 58 | std::map<std::string, std::string> _attributes; 59 | std::deque<std::unique_ptr<Tile>> _tiles; 60 | std::vector>> _weight_addrs; 61 | std::vector>>> _input_addrs; 62 | std::vector>>> _output_addrs; 63 | 64 | int Ndim; // Batch dimension of activation tensor (commonly 0) 65 | int Hdim; // Height dimension of activation tensor 66 | int Wdim; // Width dimension of activation tensor 67 | int Cdim; // Channel dimension of activation tensor 68 | int Cdim_w; // Channel dimension of weight tensor 69 | int Mdim; // Output channel dimension of weight tensor 70 | int Sdim; // Height dimension of weight tensor 71 | int Rdim; // Width dimension of weight tensor 72 | 73 | bool _finish; 74 | friend Model; 75 | }; -------------------------------------------------------------------------------- /src/operations/OperationFactory.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "../Common.h" 3 | #include "Operation.h" 4 | 5 | class Model; 6 | 7 | class OperationFactory { 8 | public: 9 | static void initialize(SimulationConfig config); 10 | static std::unique_ptr<Operation> create_operation(Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 11 | static std::unique_ptr<Operation> copy_operation(Operation* op); 12 | 13 | private: 14 | static SimulationConfig _config; 15 | }; -------------------------------------------------------------------------------- /src/operations/SkipLayerNorm.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Operation.h" 3 | 4 | class SkipLayerNorm : public Operation { 5 | public: 6 | SkipLayerNorm(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 7 | SkipLayerNorm(SimulationConfig config, Model* model, std::string name, std::map<std::string, std::string>& attributes, uint32_t target_core=0); 8 | 9 | std::vector<uint32_t> _weight_shape; 10 | std::vector<uint32_t> _bias_shape; 11 | std::vector<uint32_t> _dense_bias_shape; 12 | 13 | std::vector<uint32_t> _input_shape; 14 | std::vector<uint32_t> _skip_shape; 15 | std::vector<uint32_t> _output_shape; 16 | 17 | uint32_t _batch_size; 18 | uint32_t _seq; 19 | uint32_t _dk; 20 | uint32_t _tokens_per_tile; 21 | 22 | void calculate_loops(); 23 | void initialize_tiles(MappingTable& mapping_table) override; 24 | void initialize_instructions(Tile* tile, Mapping mapping, uint32_t token_offset, uint32_t tokens); 25 | }; -------------------------------------------------------------------------------- /src/operations/Softmax.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Operation.h" 3 | 4 | class Softmax : public Operation { 5 | public: 6 | Softmax(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 7 | Softmax(SimulationConfig config, MappingTable& mapping_table, 8 | std::vector<uint32_t> input_shape, uint32_t target_core=0); 9 | std::vector<uint32_t> _input_shape; 10 | std::vector<uint32_t> _output_shape; 11 | 12 | uint32_t _seq; 13 | uint32_t _dk; 14 | uint32_t _tokens_per_tile; 15 | 16 | void calculate_loops(); 17 | void
initialize_tiles(MappingTable& mapping_table) override; 18 | void initialize_instructions(Tile* tile, Mapping mapping, uint32_t token_offset, uint32_t tokens); 19 | }; -------------------------------------------------------------------------------- /src/scheduler/IterLevelScheduler.cc: -------------------------------------------------------------------------------- 1 | #include "IterLevelScheduler.h" 2 | 3 | IterLevelScheduler::IterLevelScheduler(std::string name, std::string path, 4 | std::unique_ptr model, 5 | SimulationConfig config, 6 | json scheduler_config) 7 | : LangScheduler(name, path, std::move(model), config, scheduler_config) { 8 | } 9 | 10 | void IterLevelScheduler::cycle() { 11 | _cycle++; 12 | if(_active_requests.size() <= _max_batch_size || _max_batch_size == 0) { 13 | while(!_request_queue.empty()) { 14 | if(_request_queue.front()->request_time <= _cycle) { 15 | init_request(_request_queue.front()); 16 | _active_requests[_request_queue.front()->request_id] = std::move(_request_queue.front()); 17 | _request_queue.pop(); 18 | } 19 | else { 20 | break; 21 | } 22 | if(_max_batch_size > 0 && _active_requests.size() >= _max_batch_size) { 23 | break; 24 | } 25 | } 26 | } 27 | 28 | if(_model_queue.empty() && _requests_in_model.empty()) { 29 | init_inputs_and_model(); 30 | } 31 | } 32 | 33 | -------------------------------------------------------------------------------- /src/scheduler/IterLevelScheduler.h: -------------------------------------------------------------------------------- 1 | #ifndef ITER_LEVEL_SCHEDULER_H 2 | #define ITER_LEVEL_SCHEDULER_H 3 | #include "LanguageScheduler.h" 4 | 5 | class IterLevelScheduler : public LangScheduler { 6 | public: 7 | IterLevelScheduler(std::string name, std::string path, 8 | std::unique_ptr model, 9 | SimulationConfig config, 10 | json scheduler_config); 11 | virtual void cycle() override; 12 | }; 13 | 14 | 15 | #endif -------------------------------------------------------------------------------- /src/scheduler/LanguageScheduler.h: -------------------------------------------------------------------------------- 1 | #ifndef LANGUAGE_SCHEDULER_H 2 | #define LANGUAGE_SCHEDULER_H 3 | #include 4 | 5 | #include "../Common.h" 6 | #include "../models/LanguageModel.h" 7 | 8 | struct LangRequest { 9 | uint32_t request_id; 10 | bool running; 11 | bool gen_phase; 12 | uint64_t request_time; 13 | uint64_t start_time; 14 | uint64_t finish_time; 15 | uint32_t prompt_length; 16 | uint32_t current_length; 17 | uint32_t target_length; 18 | std::vector> key_cache; 19 | std::vector> value_cache; 20 | }; 21 | 22 | class LangScheduler { 23 | public: 24 | static std::unique_ptr create(std::string name, std::string path, 25 | std::unique_ptr model, 26 | SimulationConfig config, 27 | json scheduler_config); 28 | LangScheduler(std::string name, std::string path, 29 | std::unique_ptr model, 30 | SimulationConfig config, 31 | json scheduler_config); 32 | bool can_schedule_model(); 33 | virtual std::unique_ptr pop_model(); 34 | virtual void finish_model(uint32_t model_id); 35 | virtual void cycle(); 36 | virtual bool busy(); 37 | virtual uint64_t get_kv_memory_size(); 38 | protected: 39 | SimulationConfig _config; 40 | json _scheduler_config; 41 | std::string _name; 42 | std::unique_ptr _language_model; 43 | std::queue> _request_queue; 44 | std::map> _active_requests; 45 | std::map> _requests_in_model; 46 | std::queue> _model_queue; 47 | uint64_t _cycle; 48 | 49 | uint32_t _num_layers; 50 | uint32_t _num_sim_layers; 51 | uint32_t _num_attention_heads; 52 | 
uint32_t _num_kv_heads; 53 | uint32_t _hidden_size; 54 | uint32_t _cache_dim; 55 | uint32_t _max_seq_length; 56 | uint32_t _max_batch_size; 57 | bool _run_single_layer; 58 | bool _check_mem_size; 59 | 60 | 61 | 62 | std::vector _max_dims; 63 | 64 | void parse_request_trace(std::string trace_path); 65 | void init_request(std::unique_ptr& request); 66 | void init_inputs_and_model(); 67 | }; 68 | 69 | #endif -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | SET(BINARY Simulator_test) 2 | SET(EXECUTABLE_OUTPUT_PATH ${CMAKE_SOURCE_DIR}) 3 | 4 | # 5 | # import googletest as an external project 6 | # 7 | include(ExternalProject) 8 | include(GoogleTest) 9 | externalproject_add( 10 | GoogleTest 11 | URL https://github.com/google/googletest/archive/release-1.8.1.zip 12 | PREFIX ${CMAKE_CURRENT_BINARY_DIR}/lib 13 | CMAKE_ARGS -DCMAKE_CXX_FLAGS:STRING="-D_GLIBCXX_USE_CXX11_ABI=0" 14 | INSTALL_COMMAND "" 15 | ) 16 | 17 | externalproject_get_property(GoogleTest source_dir) 18 | include_directories(${source_dir}/googletest/include) 19 | include_directories(${source_dir}/googlemock/include) 20 | 21 | externalproject_get_property(GoogleTest binary_dir) 22 | set(GTEST_LIBRARY_PATH ${binary_dir}/googlemock/gtest/${CMAKE_FIND_LIBRARY_PREFIXES}gtest.a) 23 | set(GTEST_LIBRARY GTest::GTest) 24 | add_library(${GTEST_LIBRARY} UNKNOWN IMPORTED) 25 | set_target_properties(${GTEST_LIBRARY} PROPERTIES 26 | IMPORTED_LOCATION ${GTEST_LIBRARY_PATH}) 27 | add_dependencies(${GTEST_LIBRARY} GoogleTest) 28 | 29 | set(GMOCK_LIBRARY_PATH ${binary_dir}/googlemock/${CMAKE_FIND_LIBRARY_PREFIXES}gmock.a) 30 | set(GMOCK_LIBRARY GTest::GMock) 31 | add_library(${GMOCK_LIBRARY} UNKNOWN IMPORTED) 32 | set_target_properties(${GMOCK_LIBRARY} PROPERTIES 33 | IMPORTED_LOCATION ${GMOCK_LIBRARY_PATH}) 34 | add_dependencies(${GMOCK_LIBRARY} GoogleTest) 35 | 36 | file(GLOB_RECURSE TEST_SOURCES LIST_DIRECTORIES false *.h *.cc) 37 | SET(SOURCES ${TEST_SOURCES}) 38 | add_executable(${BINARY} ${TEST_SOURCES}) 39 | 40 | target_include_directories(Simulator_test PUBLIC ${ONNX_INCLUDE_DIRS}) 41 | target_include_directories(Simulator_test PUBLIC ${PROJECT_SOURCE_DIR}/src) 42 | target_link_libraries(Simulator_test Simulator_lib) 43 | # target_link_libraries(Simulator_test ramulator booksim2) 44 | # target_link_libraries(Simulator_test nlohmann_json::nlohmann_json ${PROTOBUF_LIB} onnx_proto ${CONAN_LIBS} stdc++fs spdlog::spdlog) 45 | target_link_libraries(Simulator_test GTest::GTest GTest::GMock) 46 | 47 | # gtest_discover_tests( 48 | # Simulator_test 49 | # WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH} 50 | # ) 51 | add_test(NAME Simulator_test COMMAND Simulator_test) 52 | -------------------------------------------------------------------------------- /tests/MappingTest.cc: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | #include "Mapping.h" 3 | 4 | TEST(OSMappingParsingTest, BasicAssertions) { 5 | /* Parse mapping for output stationary accelerator */ 6 | Mapping mapping("T N1 C128 M128 Q28 P28 S3 R3 - O P4 - I S3 R3 C128 P7 M4 Q28Y M32X"); 7 | /*Total loop count check*/ 8 | EXPECT_EQ(mapping.total_loop.N, 1); 9 | EXPECT_EQ(mapping.total_loop.C, 128); 10 | EXPECT_EQ(mapping.total_loop.M, 128); 11 | EXPECT_EQ(mapping.total_loop.Q, 28); 12 | EXPECT_EQ(mapping.total_loop.P, 28); 13 | EXPECT_EQ(mapping.total_loop.S, 3); 14 | EXPECT_EQ(mapping.total_loop.R, 
3); 15 | 16 | /*Spatial parsing check*/ 17 | EXPECT_EQ(mapping.spatial_M, 32); 18 | EXPECT_EQ(mapping.spatial_Q, 28); 19 | EXPECT_EQ(mapping.spatial_P, 1); 20 | EXPECT_EQ(mapping.spatial_C, 1); 21 | EXPECT_EQ(mapping.spatial_R, 1); 22 | EXPECT_EQ(mapping.spatial_S, 1); 23 | 24 | } 25 | 26 | TEST(WSMappingParsingTest, BasicAssertions) { 27 | /* Parse mapping for weight stationary accelerator */ 28 | Mapping mapping("T N1 C64 M256 Q56 P56 S1 R1 - O C8 - I M32 Q28 C8Y M8X P14 Q2 P4"); 29 | /*Total loop count check*/ 30 | EXPECT_EQ(mapping.total_loop.N, 1); 31 | EXPECT_EQ(mapping.total_loop.C, 64); 32 | EXPECT_EQ(mapping.total_loop.M, 256); 33 | EXPECT_EQ(mapping.total_loop.Q, 56); 34 | EXPECT_EQ(mapping.total_loop.P, 56); 35 | EXPECT_EQ(mapping.total_loop.S, 1); 36 | EXPECT_EQ(mapping.total_loop.R, 1); 37 | 38 | /*Spatial parsing check*/ 39 | EXPECT_EQ(mapping.spatial_M, 8); 40 | EXPECT_EQ(mapping.spatial_Q, 1); 41 | EXPECT_EQ(mapping.spatial_P, 1); 42 | EXPECT_EQ(mapping.spatial_C, 8); 43 | EXPECT_EQ(mapping.spatial_R, 1); 44 | EXPECT_EQ(mapping.spatial_S, 1); 45 | 46 | } -------------------------------------------------------------------------------- /tests/SystolicOsTest.cc: -------------------------------------------------------------------------------- 1 | 2 | #include "Common.h" 3 | #include "Core.h" 4 | #include "SimulationConfig.h" 5 | #include "SystolicOS.h" 6 | #include "gtest/gtest.h" 7 | #include "operations/ConvOS.h" 8 | 9 | TEST(SystolicOSTileExecutionTest, BasicAssertions) { 10 | // /* Weight statinary config*/ 11 | // SimulationConfig config; 12 | // config.core_type = CoreType::SYSTOLIC_OS; 13 | // config.core_height = 8; 14 | // config.core_width = 8; 15 | // config.spad_size = 192; 16 | // config.precision = 4; 17 | // config.dram_req_size = 32; 18 | 19 | // SystolicOS core(0, config); 20 | // Tile tile{.status = Tile::Status::INITIALIZED, .layer_id = 0}; 21 | // tile.instructions.push( 22 | // Instruction{.opcode = Opcode::MOVIN, 23 | // .id = "WEIGHT-0", 24 | // .addrs = std::vector{0x00, 0x20}}); 25 | // tile.instructions.push( 26 | // Instruction{.opcode = Opcode::GEMM, 27 | // .tile_size = 100, 28 | // .dependent_ids = std::vector{"WEIGHT-0"}}); 29 | // core.issue(&tile); 30 | // cycle_type cycle = 0; 31 | // while (tile.status != Tile::Status::FINISH) { 32 | // core.cycle(); 33 | // if (core.has_memory_request()) { 34 | // MemoryAccess* access = core.top_memory_request(); 35 | // access->request = false; 36 | // core.pop_memory_request(); 37 | // core.push_memory_response(access); 38 | // } 39 | // cycle++; 40 | // if (cycle > 1000) break; 41 | // } 42 | // /*TODO: insert cycle count from GEMMINI */ 43 | // ASSERT_EQ(cycle, 125); 44 | } -------------------------------------------------------------------------------- /tests/main.cc: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | 3 | GTEST_API_ int main(int argc, char** argv) { 4 | int status = 0; 5 | ::testing::InitGoogleTest(&argc, argv); 6 | try { 7 | const bool create_default_logger = false; 8 | status = RUN_ALL_TESTS(); 9 | } catch (const std::exception& ex) { 10 | std::cerr << ex.what(); 11 | status = -1; 12 | } 13 | return status; 14 | } -------------------------------------------------------------------------------- /traces/input.csv: -------------------------------------------------------------------------------- 1 | time, prompt_length, target_length, cached_length 2 | 0, 100, 16, 0 3 | 100, 100, 15, 0 4 | 
--------------------------------------------------------------------------------
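
Note: the following is a minimal, self-contained sketch of how one row of traces/input.csv (columns: time, prompt_length, target_length, cached_length) could be turned into a request record such as the LangRequest struct declared in src/scheduler/LanguageScheduler.h. It is illustrative only: the TraceRequest struct below is a simplified stand-in, the column-to-field mapping is an assumption, and this is not the repository's actual parse_request_trace implementation.

#include <cstdint>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Simplified stand-in for LangRequest, holding only the fields the trace supplies.
struct TraceRequest {
  uint32_t request_id;
  uint64_t request_time;   // "time" column: cycle at which the request arrives
  uint32_t prompt_length;  // "prompt_length" column
  uint32_t target_length;  // "target_length" column: number of tokens to generate
  uint32_t cached_length;  // "cached_length" column (assumed: pre-filled KV-cache length)
};

// Parse a CSV trace that starts with a header line; malformed lines are skipped.
std::vector<TraceRequest> parse_trace(const std::string& path) {
  std::vector<TraceRequest> requests;
  std::ifstream file(path);
  std::string line;
  std::getline(file, line);  // skip header: "time, prompt_length, target_length, cached_length"
  uint32_t next_id = 0;
  while (std::getline(file, line)) {
    std::stringstream ss(line);
    TraceRequest req{};
    char comma;
    if (ss >> req.request_time >> comma >> req.prompt_length >> comma
           >> req.target_length >> comma >> req.cached_length) {
      req.request_id = next_id++;
      requests.push_back(req);
    }
  }
  return requests;
}

int main() {
  // With the bundled trace, this prints two requests arriving at cycles 0 and 100.
  for (const auto& req : parse_trace("traces/input.csv")) {
    std::cout << "request " << req.request_id << ": arrives at cycle " << req.request_time
              << ", prompt " << req.prompt_length << ", target " << req.target_length << "\n";
  }
  return 0;
}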