├── .github └── workflows │ └── docker-image.yml ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── Dockerfile ├── LICENSE ├── README.md ├── conanfile.txt ├── configs ├── booksim2_configs │ ├── anynet.icnt │ ├── anynet_file │ ├── fly_c16_m8.icnt │ ├── fly_c1_m2.icnt │ ├── fly_c1_m8.icnt │ ├── fly_c2_m8.icnt │ ├── fly_c4_m16.icnt │ ├── fly_c4_m2.icnt │ ├── fly_c4_m32.icnt │ ├── fly_c4_m8.icnt │ ├── fly_c64_m8.icnt │ ├── fly_c64_m8_sif-age.icnt │ ├── fly_c64_m8_sif-rr.icnt │ ├── make_anynet_topology.py │ ├── mesh_sif-age.icnt │ └── mesh_sif-rr.icnt ├── ramulator2_configs │ ├── DDR4.yaml │ └── HBM2.yaml ├── ramulator_configs │ ├── ALDRAM-config.cfg │ ├── DDR3-config.cfg │ ├── DDR4-config.cfg │ ├── DSARP-config.cfg │ ├── GDDR5-config.cfg │ ├── HBM-config.cfg │ ├── HBM-config_ChRaBaRoCo.cfg │ ├── HBM-config_FCFS.cfg │ ├── HBM-config_FRFCFS.cfg │ ├── HBM-config_FRFCFS_Cap.cfg │ ├── HBM-config_FRFCFS_PriorHit.cfg │ ├── HBM-config_RoBaRaCoCh.cfg │ ├── HBM-config_RoCoBaRaCh.cfg │ ├── HBMx0.5ch-config.cfg │ ├── HBMx2ch-config.cfg │ ├── LPDDR3-config.cfg │ ├── LPDDR4-config.cfg │ ├── PCM-config.cfg │ ├── SALP-config.cfg │ ├── STTMRAM-config.cfg │ ├── TLDRAM-config.cfg │ ├── WideIO-config.cfg │ └── WideIO2-config.cfg ├── systolic_ws_128x128_c4_booksim2_tpuv4.json ├── systolic_ws_128x128_c4_simple_noc_tpuv4.json ├── systolic_ws_128x128_c4_simple_noc_tpuv4_half_ramulator2.json ├── systolic_ws_128x128_c4_simple_noc_tpuv4_partition_quad.json ├── systolic_ws_8x8_c1_booksim2_transformer.json ├── systolic_ws_8x8_c1_simple_noc_transformer.json ├── systolic_ws_8x8_c4_booksim2_transformer.json ├── systolic_ws_8x8_c4_simple_noc_transformer.json ├── test.json └── timeloop_configs │ ├── systolic_os_32x32 │ ├── README.md │ ├── arch │ │ ├── components │ │ │ ├── reg_storage.yaml │ │ │ ├── smartbuffer_RF.yaml │ │ │ └── smartbuffer_SRAM.yaml │ │ ├── simple_output_stationary.yaml │ │ └── simple_output_stationary.yaml.tmp │ ├── constraints │ │ ├── simple_output_stationary_arch_constraints.yaml │ │ └── simple_output_stationary_map_constraints.yaml │ └── mapper │ │ └── mapper.yaml │ └── systolic_ws_8x8 │ ├── README.md │ ├── arch │ ├── components │ │ ├── reg_storage.yaml │ │ ├── smartbuffer_RF.yaml │ │ └── smartbuffer_SRAM.yaml │ └── simple_weight_stationary.yaml │ ├── constraints │ ├── simple_weight_stationary_arch_constraints.yaml │ └── simple_weight_stationary_map_constraints.yaml │ ├── example_AlexNet_layer1_outputs │ ├── timeloop-mapper.ART.yaml │ ├── timeloop-mapper.ART_summary.yaml │ ├── timeloop-mapper.ERT.yaml │ ├── timeloop-mapper.ERT_summary.yaml │ ├── timeloop-mapper.accelergy.log │ ├── timeloop-mapper.defined_input_architecture.yaml │ ├── timeloop-mapper.flattened_architecture.yaml │ ├── timeloop-mapper.log │ ├── timeloop-mapper.map+stats.xml │ ├── timeloop-mapper.map.txt │ └── timeloop-mapper.stats.txt │ ├── mapper │ └── mapper.yaml │ ├── timeloop-mapper.map+stats.xml │ ├── timeloop-mapper.map.txt │ └── timeloop-mapper.stats.txt ├── example ├── language_models.json └── models_list.json ├── extern └── ramulator_custom │ ├── .gitignore │ ├── CMakeLists.txt │ ├── include │ └── ramulator │ │ └── Ramulator.hpp │ └── src │ ├── Config.cpp │ ├── Config.h │ ├── Controller.h │ ├── DDR4.cpp │ ├── DDR4.h │ ├── DRAM.h │ ├── HBM.cpp │ ├── HBM.h │ ├── Memory.h │ ├── MemoryFactory.cpp │ ├── MemoryFactory.h │ ├── Ramulator.cpp │ ├── Refresh.cpp │ ├── Refresh.h │ ├── Request.cpp │ ├── Request.h │ ├── Scheduler.h │ ├── SpeedyController.h │ ├── StatType.cpp │ ├── StatType.h │ └── Statistics.h ├── img ├── ONNXim_demo.png └── 
speedup.png ├── models ├── language_models │ ├── llama3-8b.json │ ├── opt-125m.json │ └── opt-66b.json └── resnet18 │ └── resnet18.onnx ├── scripts ├── aggregate_results.sh ├── generate_cnn_onnx.py ├── generate_conv_onnx.py ├── generate_matmul_onnx.py ├── generate_multi-tenancy_onnx.py ├── generate_multi-tenancy_onnx2.py ├── generate_transformer_onnx.py ├── onnxim_sbatch.sh ├── run_matmul_conv.sh ├── run_multi-tenancy.sh ├── run_sbatch.sh ├── run_simulation.sh ├── run_timeloop.sh ├── run_transformer.sh └── timeloop_slurm_job.sh ├── src ├── CMakeLists.txt ├── Common.cc ├── Common.h ├── Core.cc ├── Core.h ├── Dram.cc ├── Dram.h ├── Hashing.cc ├── Hashing.h ├── Instruction.h ├── Interconnect.cc ├── Interconnect.h ├── Mapping.cc ├── Mapping.h ├── Model.cc ├── Model.h ├── SimulationConfig.h ├── Simulator.cc ├── Simulator.h ├── Sram.cc ├── Sram.h ├── Stat.h ├── SystolicOS.cc ├── SystolicOS.h ├── SystolicWS.cc ├── SystolicWS.h ├── Tensor.cc ├── Tensor.h ├── allocator │ └── AddressAllocator.h ├── helper │ ├── CommandLineParser.cc │ ├── CommandLineParser.h │ └── HelperFunctions.h ├── main.cc ├── models │ ├── LanguageModel.cc │ └── LanguageModel.h ├── operations │ ├── AdaptiveAvgPool.cc │ ├── AdaptiveAvgPool.h │ ├── Attention.cc │ ├── Attention.h │ ├── BiasAct.cc │ ├── BiasAct.h │ ├── BiasGelu.cc │ ├── BiasGelu.h │ ├── Concat.cc │ ├── Concat.h │ ├── Conv.cc │ ├── Conv.h │ ├── ConvOS.cc │ ├── ConvOS.h │ ├── ConvWS.cc │ ├── ConvWS.h │ ├── Dummy.cc │ ├── Dummy.h │ ├── EmbedLayerNorm.cc │ ├── EmbedLayerNorm.h │ ├── Flatten.cc │ ├── Flatten.h │ ├── Gemm.cc │ ├── Gemm.h │ ├── GemmOS.cc │ ├── GemmOS.h │ ├── GemmWS.cc │ ├── GemmWS.h │ ├── GlobalAvgPool.cc │ ├── GlobalAvgPool.h │ ├── KVCacheConcat.cc │ ├── KVCacheConcat.h │ ├── MaxPool.cc │ ├── MaxPool.h │ ├── Operation.cc │ ├── Operation.h │ ├── OperationFactory.cc │ ├── OperationFactory.h │ ├── SkipLayerNorm.cc │ ├── SkipLayerNorm.h │ ├── Softmax.cc │ └── Softmax.h └── scheduler │ ├── IterLevelScheduler.cc │ ├── IterLevelScheduler.h │ ├── LanguageScheduler.cc │ ├── LanguageScheduler.h │ ├── Scheduler.cc │ └── Scheduler.h ├── tests ├── CMakeLists.txt ├── MappingTest.cc ├── SystolicOsTest.cc ├── SystolicWsTest.cc ├── main.cc └── operatons │ ├── ConvWSTest.cc │ └── GemmWSTest.cc └── traces └── input.csv /.github/workflows/docker-image.yml: -------------------------------------------------------------------------------- 1 | name: Docker Build and Test 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - master 10 | 11 | jobs: 12 | docker-build: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - name: Checkout repository 17 | uses: actions/checkout@v3 18 | 19 | - name: Build Docker image 20 | run: | 21 | docker build -t onxxim-test . 22 | 23 | - name: Test Docker image 24 | run: | 25 | docker run --rm onxxim-test echo "Docker build successful!" 
26 | 27 | - name: Test generating onnx file - GPT2 28 | run: | 29 | docker run --rm onxxim-test python3 /workspace/ONNXim/scripts/generate_transformer_onnx.py --model gpt2 30 | 31 | - name: Test generating onnx file - BERT 32 | run: | 33 | docker run --rm onxxim-test python3 /workspace/ONNXim/scripts/generate_transformer_onnx.py --model bert 34 | 35 | - name: Cache Docker layers 36 | uses: actions/cache@v3 37 | with: 38 | path: /var/lib/docker 39 | key: ${{ runner.os }}-docker-${{ github.sha }} 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | models/* 2 | model_lists/* 3 | build/* -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "extern/onnx"] 2 | path = extern/onnx 3 | url = https://github.com/onnx/onnx.git 4 | [submodule "extern/protobuf"] 5 | path = extern/protobuf 6 | url = https://github.com/protocolbuffers/protobuf.git 7 | [submodule "extern/torch2timeloop"] 8 | path = extern/torch2timeloop 9 | url = https://github.com/Accelergy-Project/pytorch2timeloop-converter.git 10 | [submodule "extern/booksim"] 11 | path = extern/booksim 12 | url = https://github.com/PSAL-POSTECH/booksim.git 13 | [submodule "extern/ramulator2"] 14 | path = extern/ramulator2 15 | url = https://github.com/PSAL-POSTECH/ramulator2.git 16 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.15.0) 2 | set(project_name "AiFrameworkSim") 3 | project(${project_name}) 4 | 5 | set(CMAKE_EXPORT_COMPILE_COMMANDS ON) 6 | include("${CMAKE_SOURCE_DIR}/build/conanbuildinfo.cmake") 7 | conan_basic_setup() 8 | 9 | # find_package(Boost 1.70 REQUIRED COMPONENTS program_options) 10 | # message("LIB ${Boost_LIBRARY_DIRS}") 11 | option(USE_RAMULATOR "USE_RAMULATOR" ON) 12 | list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/extern) 13 | 14 | # C++ settings 15 | set(CMAKE_CXX_STANDARD 20) 16 | set(ONNX_ML 1) 17 | set(JSON_BuildTests OFF CACHE INTERNAL "") 18 | set(EXECUTABLE_OUTPUT_PATH "${CMAKE_SOURCE_DIR}/build/bin") 19 | set(LIBRARY_OUTPUT_PATH "${CMAKE_SOURCE_DIR}/build/lib") 20 | add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) 21 | message("BINARY DIR ${CMAKE_BINARY_DIR}") 22 | if(CMAKE_BUILD_TYPE STREQUAL "Debug") 23 | add_compile_options(-fsanitize=address) 24 | add_link_options(-fsanitize=address) 25 | endif() 26 | 27 | # Build source 28 | add_subdirectory("${PROJECT_SOURCE_DIR}/src") 29 | 30 | # Add library ramulator 31 | add_subdirectory("${PROJECT_SOURCE_DIR}/extern/ramulator_custom") 32 | 33 | # Add library ramulator2 34 | add_subdirectory("${PROJECT_SOURCE_DIR}/extern/ramulator2") 35 | include_directories("${PROJECT_SOURCE_DIR}/extern/ramulator2/src") 36 | include_directories("${PROJECT_SOURCE_DIR}/extern/ramulator2/resources/ndp_wrappers") 37 | 38 | # Add library booksim 39 | add_subdirectory("${PROJECT_SOURCE_DIR}/extern/booksim") 40 | 41 | # Add library protobuf 42 | add_subdirectory("${PROJECT_SOURCE_DIR}/extern/protobuf/cmake" EXCLUDE_FROM_ALL) 43 | set_target_properties(libprotoc PROPERTIES FOLDER "external/protobuf") 44 | set_target_properties(protoc PROPERTIES FOLDER "external/protobuf") 45 | 46 | # Add library onnx 47 | add_definitions("-DONNX_NAMESPACE=onnx") 48 |
add_subdirectory("${PROJECT_SOURCE_DIR}/extern/onnx" EXCLUDE_FROM_ALL) 49 | set_target_properties(onnx PROPERTIES FOLDER "extern/onnx") 50 | set_target_properties(onnx_proto PROPERTIES FOLDER "extern/onnx") 51 | 52 | target_include_directories(Simulator PUBLIC ${ONNX_INCLUDE_DIRS}) 53 | target_link_libraries(Simulator ramulator1 booksim2 ramulator) 54 | target_link_libraries(Simulator ${PROTOBUF_LIB} onnx_proto ${CONAN_LIBS} stdc++fs) 55 | 56 | target_include_directories(Simulator_lib PUBLIC ${ONNX_INCLUDE_DIRS}) 57 | target_link_libraries(Simulator_lib ramulator1 booksim2 ramulator) 58 | target_link_libraries(Simulator_lib ${PROTOBUF_LIB} onnx_proto ${CONAN_LIBS} stdc++fs) 59 | 60 | enable_testing() 61 | add_subdirectory("${PROJECT_SOURCE_DIR}/tests") 62 | 63 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Use Ubuntu 20.04 as the base image, where GCC 10 is available 2 | FROM ubuntu:20.04 3 | 4 | # Avoid prompts during package installation 5 | ARG DEBIAN_FRONTEND=noninteractive 6 | 7 | # Update and install software 8 | RUN apt-get update && apt-get install -y \ 9 | gcc-10 g++-10 python3.8 python3-pip git wget make \ 10 | libssl-dev libasan5 libubsan1 11 | 12 | # Set GCC 10 as the default gcc and g++ compilers 13 | RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 100 && \ 14 | update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 100 15 | 16 | # Set the working directory 17 | WORKDIR /workspace 18 | 19 | # Install CMake 3.22.0 from source 20 | RUN wget https://github.com/Kitware/CMake/releases/download/v3.22.0/cmake-3.22.0.tar.gz && \ 21 | tar -xvzf cmake-3.22.0.tar.gz && \ 22 | cd cmake-3.22.0 && \ 23 | ./bootstrap && \ 24 | make -j$(nproc) && \ 25 | make install 26 | 27 | # Install specific Python packages with pip 28 | RUN pip3 install conan==1.57.0 transformers==4.40.1 onnx onnxruntime torch==2.3.1 torchvision optimum 29 | 30 | # Copy your project files into the image 31 | COPY ./ ONNXim 32 | 33 | # Prepare ONNXim project 34 | RUN cd ONNXim && \ 35 | git submodule update --recursive --init && \ 36 | mkdir -p build && \ 37 | cd build && \ 38 | conan install .. --build=missing && \ 39 | cmake .. && \ 40 | make -j$(nproc) 41 | 42 | # Set environment variable 43 | ENV ONNXIM_HOME /workspace/ONNXim 44 | 45 | # Final command 46 | CMD ["echo", "Welcome to ONNXim!"] 47 | 48 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Parallel System Architecture Lab at POSTECH 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /conanfile.txt: -------------------------------------------------------------------------------- 1 | [requires] 2 | boost/1.79.0 3 | robin-hood-hashing/3.11.5 4 | spdlog/1.11.0 5 | nlohmann_json/3.11.2 6 | [generators] 7 | cmake 8 | -------------------------------------------------------------------------------- /configs/booksim2_configs/anynet.icnt: -------------------------------------------------------------------------------- 1 | [config] 2 | use_map = 0 3 | flit_size = 64 4 | topology = anynet 5 | network_file = /home/hhk971/ai_framework/my-project/ai-framwork-sim/configs/booksim2_configs/anynet_file 6 | routing_function = min 7 | subnets = 1 8 | 9 | vc_buf_size = 64 10 | input_buffer_size = 256 11 | ejection_buffer_size = 64 12 | boundary_buffer_size = 64 13 | wait_for_tail_credit = 0 14 | vc_allocator = islip 15 | sw_allocator = islip 16 | alloc_iters = 1 17 | deadlock_warn_timeout = 10000 -------------------------------------------------------------------------------- /configs/booksim2_configs/anynet_file: -------------------------------------------------------------------------------- 1 | router 0 node 0 router 1 router 8 2 | router 1 node 1 router 2 router 9 3 | router 2 node 2 router 3 router 10 4 | router 3 node 3 router 4 router 11 5 | router 4 node 4 router 5 router 12 6 | router 5 node 5 router 6 router 13 7 | router 6 node 6 router 7 router 14 8 | router 7 node 7 router 15 9 | router 8 node 8 router 9 router 16 10 | router 9 node 9 router 10 router 17 11 | router 10 node 10 router 11 router 18 12 | router 11 node 11 router 12 router 19 13 | router 12 node 12 router 13 router 20 14 | router 13 node 13 router 14 router 21 15 | router 14 node 14 router 15 router 22 16 | router 15 node 15 router 23 17 | router 16 node 16 router 17 router 24 18 | router 17 node 17 router 18 router 25 19 | router 18 node 18 router 19 router 26 20 | router 19 node 19 router 20 router 27 21 | router 20 node 20 router 21 router 28 22 | router 21 node 21 router 22 router 29 23 | router 22 node 22 router 23 router 30 24 | router 23 node 23 router 31 25 | router 24 node 24 router 25 router 32 26 | router 25 node 25 router 26 router 33 27 | router 26 node 26 router 27 router 34 28 | router 27 node 27 router 28 router 35 29 | router 28 node 28 router 29 router 36 30 | router 29 node 29 router 30 router 37 31 | router 30 node 30 router 31 router 38 32 | router 31 node 31 router 39 33 | router 32 node 32 router 33 router 40 34 | router 33 node 33 router 34 router 41 35 | router 34 node 34 router 35 router 42 36 | router 35 node 35 router 36 router 43 37 | router 36 node 36 router 37 router 44 38 | router 37 node 37 router 38 router 45 39 | router 38 node 38 router 39 router 46 40 | router 39 node 39 router 47 41 | router 40 node 40 router 41 router 48 42 | router 41 node 41 router 42 router 49 43 | router 42 node 42 router 43 router 50 44 | router 43 node 43 router 44 router 51 45 | router 44 node 44 router 45 router 52 46 | 
router 45 node 45 router 46 router 53 47 | router 46 node 46 router 47 router 54 48 | router 47 node 47 router 55 49 | router 48 node 48 router 49 router 56 50 | router 49 node 49 router 50 router 57 51 | router 50 node 50 router 51 router 58 52 | router 51 node 51 router 52 router 59 53 | router 52 node 52 router 53 router 60 54 | router 53 node 53 router 54 router 61 55 | router 54 node 54 router 55 router 62 56 | router 55 node 55 router 63 57 | router 56 node 56 router 57 router 64 58 | router 57 node 57 router 58 router 65 59 | router 58 node 58 router 59 router 66 60 | router 59 node 59 router 60 router 67 61 | router 60 node 60 router 61 router 68 62 | router 61 node 61 router 62 router 69 63 | router 62 node 62 router 63 router 70 64 | router 63 node 63 router 71 65 | router 64 node 64 router 65 66 | router 65 node 65 router 66 67 | router 66 node 66 router 67 68 | router 67 node 67 router 68 69 | router 68 node 68 router 69 70 | router 69 node 69 router 70 71 | router 70 node 70 router 71 72 | router 71 node 71 -------------------------------------------------------------------------------- /configs/booksim2_configs/fly_c16_m8.icnt: -------------------------------------------------------------------------------- 1 | [config] 2 | use_map = 0 3 | flit_size = 64 4 | topology = fly 5 | k = 24 6 | n = 1 7 | routing_function = dest_tag 8 | subnets = 1 9 | 10 | vc_buf_size = 64 11 | input_buffer_size = 256 12 | ejection_buffer_size = 64 13 | boundary_buffer_size = 64 14 | wait_for_tail_credit = 0 15 | vc_allocator = islip 16 | sw_allocator = islip 17 | alloc_iters = 1 18 | deadlock_warn_timeout = 10000 -------------------------------------------------------------------------------- /configs/booksim2_configs/fly_c1_m2.icnt: -------------------------------------------------------------------------------- 1 | [config] 2 | use_map = 0 3 | flit_size = 64 4 | topology = fly 5 | k = 3 6 | n = 1 7 | routing_function = dest_tag 8 | subnets = 1 9 | 10 | vc_buf_size = 64 11 | input_buffer_size = 256 12 | ejection_buffer_size = 64 13 | boundary_buffer_size = 64 14 | wait_for_tail_credit = 0 15 | vc_allocator = islip 16 | sw_allocator = islip 17 | alloc_iters = 1 18 | -------------------------------------------------------------------------------- /configs/booksim2_configs/fly_c1_m8.icnt: -------------------------------------------------------------------------------- 1 | [config] 2 | use_map = 0 3 | flit_size = 64 4 | topology = fly 5 | k = 9 6 | n = 1 7 | routing_function = dest_tag 8 | subnets = 1 9 | 10 | vc_buf_size = 64 11 | input_buffer_size = 256 12 | ejection_buffer_size = 64 13 | boundary_buffer_size = 64 14 | wait_for_tail_credit = 0 15 | vc_allocator = islip 16 | sw_allocator = islip 17 | alloc_iters = 1 18 | -------------------------------------------------------------------------------- /configs/booksim2_configs/fly_c2_m8.icnt: -------------------------------------------------------------------------------- 1 | [config] 2 | use_map = 0 3 | flit_size = 64 4 | topology = fly 5 | k = 10 6 | n = 1 7 | routing_function = dest_tag 8 | subnets = 1 9 | 10 | vc_buf_size = 64 11 | input_buffer_size = 256 12 | ejection_buffer_size = 64 13 | boundary_buffer_size = 64 14 | wait_for_tail_credit = 0 15 | vc_allocator = islip 16 | sw_allocator = islip 17 | alloc_iters = 1 18 | -------------------------------------------------------------------------------- /configs/booksim2_configs/fly_c4_m16.icnt: -------------------------------------------------------------------------------- 1 | [config] 2 | use_map 
= 0 3 | flit_size = 64 4 | topology = fly 5 | k = 20 6 | n = 1 7 | routing_function = dest_tag 8 | subnets = 1 9 | 10 | vc_buf_size = 64 11 | input_buffer_size = 256 12 | ejection_buffer_size = 64 13 | boundary_buffer_size = 64 14 | wait_for_tail_credit = 0 15 | vc_allocator = islip 16 | sw_allocator = islip 17 | alloc_iters = 1 18 | deadlock_warn_timeout = 10000 -------------------------------------------------------------------------------- /configs/booksim2_configs/fly_c4_m2.icnt: -------------------------------------------------------------------------------- 1 | [config] 2 | use_map = 0 3 | flit_size = 64 4 | topology = fly 5 | k = 6 6 | n = 1 7 | routing_function = dest_tag 8 | subnets = 1 9 | 10 | vc_buf_size = 64 11 | input_buffer_size = 256 12 | ejection_buffer_size = 64 13 | boundary_buffer_size = 64 14 | wait_for_tail_credit = 0 15 | vc_allocator = islip 16 | sw_allocator = islip 17 | alloc_iters = 1 18 | -------------------------------------------------------------------------------- /configs/booksim2_configs/fly_c4_m32.icnt: -------------------------------------------------------------------------------- 1 | [config] 2 | use_map = 0 3 | flit_size = 64 4 | topology = fly 5 | k = 36 6 | n = 1 7 | routing_function = dest_tag 8 | subnets = 1 9 | 10 | vc_buf_size = 64 11 | input_buffer_size = 256 12 | ejection_buffer_size = 64 13 | boundary_buffer_size = 64 14 | wait_for_tail_credit = 0 15 | vc_allocator = islip 16 | sw_allocator = islip 17 | alloc_iters = 1 18 | deadlock_warn_timeout = 10000 -------------------------------------------------------------------------------- /configs/booksim2_configs/fly_c4_m8.icnt: -------------------------------------------------------------------------------- 1 | [config] 2 | use_map = 0 3 | flit_size = 64 4 | topology = fly 5 | k = 12 6 | n = 1 7 | routing_function = dest_tag 8 | subnets = 1 9 | 10 | vc_buf_size = 64 11 | input_buffer_size = 256 12 | ejection_buffer_size = 64 13 | boundary_buffer_size = 64 14 | wait_for_tail_credit = 0 15 | vc_allocator = islip 16 | sw_allocator = islip 17 | alloc_iters = 1 18 | -------------------------------------------------------------------------------- /configs/booksim2_configs/fly_c64_m8.icnt: -------------------------------------------------------------------------------- 1 | [config] 2 | use_map = 0 3 | flit_size = 64 4 | topology = fly 5 | k = 72 6 | n = 1 7 | routing_function = dest_tag 8 | subnets = 1 9 | 10 | vc_buf_size = 64 11 | input_buffer_size = 256 12 | ejection_buffer_size = 64 13 | boundary_buffer_size = 64 14 | wait_for_tail_credit = 0 15 | vc_allocator = islip 16 | sw_allocator = islip 17 | alloc_iters = 1 18 | deadlock_warn_timeout = 10000 -------------------------------------------------------------------------------- /configs/booksim2_configs/fly_c64_m8_sif-age.icnt: -------------------------------------------------------------------------------- 1 | [config] 2 | use_map = 0 3 | flit_size = 64 4 | topology = fly 5 | k = 72 6 | n = 1 7 | routing_function = dest_tag 8 | subnets = 1 9 | 10 | vc_buf_size = 64 11 | input_buffer_size = 256 12 | ejection_buffer_size = 64 13 | boundary_buffer_size = 64 14 | wait_for_tail_credit = 0 15 | vc_allocator = separable_input_first 16 | sw_allocator = separable_input_first 17 | alloc_iters = 1 18 | priority = age 19 | deadlock_warn_timeout = 10000 -------------------------------------------------------------------------------- /configs/booksim2_configs/fly_c64_m8_sif-rr.icnt: -------------------------------------------------------------------------------- 
1 | [config] 2 | use_map = 0 3 | flit_size = 64 4 | topology = fly 5 | k = 72 6 | n = 1 7 | routing_function = dest_tag 8 | subnets = 1 9 | 10 | vc_buf_size = 64 11 | input_buffer_size = 256 12 | ejection_buffer_size = 64 13 | boundary_buffer_size = 64 14 | wait_for_tail_credit = 0 15 | vc_allocator = separable_input_first 16 | sw_allocator = separable_input_first 17 | alloc_iters = 1 18 | deadlock_warn_timeout = 10000 -------------------------------------------------------------------------------- /configs/booksim2_configs/make_anynet_topology.py: -------------------------------------------------------------------------------- 1 | RowSize = 8 2 | ColSize = 9 3 | for col in range(ColSize): 4 | for i in range(RowSize): 5 | id = RowSize * col + i 6 | if col < ColSize - 1: 7 | if i < RowSize - 1: 8 | print(f'router {id} node {id} router {id+1} router {id+RowSize}') 9 | else: 10 | print(f'router {id} node {id} router {id+RowSize}') 11 | else: 12 | if i < RowSize - 1: 13 | print(f'router {id} node {id} router {id+1}') 14 | else: 15 | print(f'router {id} node {id}') -------------------------------------------------------------------------------- /configs/booksim2_configs/mesh_sif-age.icnt: -------------------------------------------------------------------------------- 1 | [config] 2 | use_map = 0 3 | flit_size = 64 4 | topology = anynet 5 | network_file = /home/hhk971/ai_framework/my-project/ai-framwork-sim/configs/booksim2_configs/anynet_file 6 | routing_function = min 7 | subnets = 1 8 | 9 | vc_buf_size = 64 10 | input_buffer_size = 256 11 | ejection_buffer_size = 64 12 | boundary_buffer_size = 64 13 | wait_for_tail_credit = 0 14 | vc_allocator = separable_input_first 15 | sw_allocator = separable_input_first 16 | alloc_iters = 1 17 | priority = age 18 | deadlock_warn_timeout = 10000 -------------------------------------------------------------------------------- /configs/booksim2_configs/mesh_sif-rr.icnt: -------------------------------------------------------------------------------- 1 | [config] 2 | use_map = 0 3 | flit_size = 64 4 | topology = anynet 5 | network_file = /home/hhk971/ai_framework/my-project/ai-framwork-sim/configs/booksim2_configs/anynet_file 6 | routing_function = min 7 | subnets = 1 8 | 9 | vc_buf_size = 64 10 | input_buffer_size = 256 11 | ejection_buffer_size = 64 12 | boundary_buffer_size = 64 13 | wait_for_tail_credit = 0 14 | vc_allocator = separable_input_first 15 | sw_allocator = separable_input_first 16 | alloc_iters = 1 17 | deadlock_warn_timeout = 10000 -------------------------------------------------------------------------------- /configs/ramulator2_configs/DDR4.yaml: -------------------------------------------------------------------------------- 1 | Frontend: 2 | impl: GEM5 3 | 4 | MemorySystem: 5 | impl: GenericDRAM 6 | clock_ratio: 1 7 | 8 | DRAM: 9 | impl: DDR4 10 | org: 11 | preset: DDR4_16Gb_x4 12 | channel: 1 13 | timing: 14 | preset: DDR4_1600J 15 | 16 | Controller: 17 | impl: Generic 18 | Scheduler: 19 | impl: FRFCFS 20 | RefreshManager: 21 | impl: AllBank 22 | plugins: 23 | 24 | AddrMapper: 25 | impl: RoBaRaCoCh -------------------------------------------------------------------------------- /configs/ramulator2_configs/HBM2.yaml: -------------------------------------------------------------------------------- 1 | Frontend: 2 | impl: GEM5 3 | 4 | MemorySystem: 5 | impl: GenericDRAM 6 | clock_ratio: 1 7 | 8 | DRAM: 9 | impl: HBM2 10 | org: 11 | preset: HBM2_8Gb 12 | channel: 1 13 | timing: 14 | preset: HBM2_2.5Gbps 15 | 16 | Controller: 17 | impl: 
Generic 18 | Scheduler: 19 | impl: FRFCFS 20 | RefreshManager: 21 | impl: AllBank 22 | plugins: 23 | 24 | AddrMapper: 25 | impl: RoBaRaCoCh -------------------------------------------------------------------------------- /configs/ramulator_configs/ALDRAM-config.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = ALDRAM 6 | channels = 1 7 | ranks = 1 8 | speed = ALDRAM_1600K 9 | org = ALDRAM_4Gb_x8 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 4 17 | mem_tick = 1 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 23 | expected_limit_insts = 200000000 24 | warmup_insts = 100000000 25 | cache = no 26 | # cache = no, L1L2, L3, all (default value is no) 27 | translation = None 28 | # translation = None, Random (default value is None) 29 | # 30 | ######################## 31 | -------------------------------------------------------------------------------- /configs/ramulator_configs/DDR3-config.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = DDR3 6 | channels = 1 7 | ranks = 1 8 | speed = DDR3_1600K 9 | org = DDR3_2Gb_x8 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 4 17 | mem_tick = 1 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
23 | expected_limit_insts = 200000000 24 | # warmup_insts = 100000000 25 | warmup_insts = 0 26 | cache = no 27 | # cache = no, L1L2, L3, all (default value is no) 28 | translation = None 29 | # translation = None, Random (default value is None) 30 | # 31 | ######################## 32 | -------------------------------------------------------------------------------- /configs/ramulator_configs/DDR4-config.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = DDR4 6 | channels = 2 7 | ranks = 1 8 | speed = DDR4_3200 9 | org = DDR4_4Gb_x8 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 8 17 | mem_tick = 3 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 23 | expected_limit_insts = 200000000 24 | # warmup_insts = 100000000 25 | warmup_insts = 0 26 | cache = no 27 | # cache = no, L1L2, L3, all (default value is no) 28 | translation = None 29 | # translation = None, Random (default value is None) 30 | # 31 | ######################## 32 | -------------------------------------------------------------------------------- /configs/ramulator_configs/DSARP-config.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = DSARP 6 | subarrays = 8 7 | channels = 1 8 | ranks = 1 9 | speed = DSARP_1333 10 | org = DSARP_8Gb_x8 11 | # record_cmd_trace: (default is off): on, off 12 | record_cmd_trace = off 13 | # print_cmd_trace: (default is off): on, off 14 | print_cmd_trace = off 15 | 16 | ### Below are parameters only for CPU trace 17 | cpu_tick = 4 18 | mem_tick = 1 19 | ### Below are parameters only for multicore mode 20 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 21 | early_exit = on 22 | # early_exit = on, off (default value is on) 23 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
24 | expected_limit_insts = 200000000 25 | warmup_insts = 100000000 26 | cache = no 27 | # cache = no, L1L2, L3, all (default value is no) 28 | translation = None 29 | # translation = None, Random (default value is None) 30 | # 31 | ######################## 32 | -------------------------------------------------------------------------------- /configs/ramulator_configs/GDDR5-config.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = GDDR5 6 | channels = 1 7 | ranks = 1 8 | speed = GDDR5_6000 9 | org = GDDR5_8Gb_x16 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 2 17 | mem_tick = 1 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 23 | expected_limit_insts = 200000000 24 | warmup_insts = 100000000 25 | cache = no 26 | # cache = no, L1L2, L3, all (default value is no) 27 | translation = None 28 | # translation = None, Random (default value is None) 29 | # 30 | ######################## 31 | -------------------------------------------------------------------------------- /configs/ramulator_configs/HBM-config.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = HBM 6 | channels = 16 7 | ranks = 1 8 | speed = HBM_2_5Gbps 9 | org = HBM_4Gb 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 32 17 | mem_tick = 5 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
23 | expected_limit_insts = 200000000 24 | warmup_insts = 100000000 25 | cache = no 26 | # cache = no, L1L2, L3, all (default value is no) 27 | translation = None 28 | mapping = RoBaRaCoCh 29 | scheduler = FRFCFS 30 | # translation = None, Random (default value is None) 31 | # 32 | ######################## 33 | -------------------------------------------------------------------------------- /configs/ramulator_configs/HBM-config_ChRaBaRoCo.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = HBM 6 | channels = 8 7 | ranks = 1 8 | speed = HBM_1Gbps 9 | org = HBM_4Gb 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 32 17 | mem_tick = 5 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 23 | expected_limit_insts = 200000000 24 | warmup_insts = 100000000 25 | cache = no 26 | # cache = no, L1L2, L3, all (default value is no) 27 | translation = None 28 | mapping = ChRaBaRoCo 29 | scheduler = FRFCFS 30 | # translation = None, Random (default value is None) 31 | # 32 | ######################## 33 | -------------------------------------------------------------------------------- /configs/ramulator_configs/HBM-config_FCFS.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = HBM 6 | channels = 8 7 | ranks = 1 8 | speed = HBM_1Gbps 9 | org = HBM_4Gb 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 32 17 | mem_tick = 5 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
23 | expected_limit_insts = 200000000 24 | warmup_insts = 100000000 25 | cache = no 26 | # cache = no, L1L2, L3, all (default value is no) 27 | translation = None 28 | mapping = RoBaRaCoCh 29 | scheduler = FCFS 30 | # translation = None, Random (default value is None) 31 | # 32 | ######################## 33 | -------------------------------------------------------------------------------- /configs/ramulator_configs/HBM-config_FRFCFS.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = HBM 6 | channels = 8 7 | ranks = 1 8 | speed = HBM_1Gbps 9 | org = HBM_4Gb 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 32 17 | mem_tick = 5 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 23 | expected_limit_insts = 200000000 24 | warmup_insts = 100000000 25 | cache = no 26 | # cache = no, L1L2, L3, all (default value is no) 27 | translation = None 28 | mapping = RoBaRaCoCh 29 | scheduler = FRFCFS 30 | # translation = None, Random (default value is None) 31 | # 32 | ######################## 33 | -------------------------------------------------------------------------------- /configs/ramulator_configs/HBM-config_FRFCFS_Cap.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = HBM 6 | channels = 8 7 | ranks = 1 8 | speed = HBM_1Gbps 9 | org = HBM_4Gb 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 32 17 | mem_tick = 5 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
23 | expected_limit_insts = 200000000 24 | warmup_insts = 100000000 25 | cache = no 26 | # cache = no, L1L2, L3, all (default value is no) 27 | translation = None 28 | mapping = RoBaRaCoCh 29 | scheduler = FRFCFS_Cap 30 | # translation = None, Random (default value is None) 31 | # 32 | ######################## 33 | -------------------------------------------------------------------------------- /configs/ramulator_configs/HBM-config_FRFCFS_PriorHit.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = HBM 6 | channels = 8 7 | ranks = 1 8 | speed = HBM_1Gbps 9 | org = HBM_4Gb 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 32 17 | mem_tick = 5 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 23 | expected_limit_insts = 200000000 24 | warmup_insts = 100000000 25 | cache = no 26 | # cache = no, L1L2, L3, all (default value is no) 27 | translation = None 28 | mapping = RoBaRaCoCh 29 | scheduler = FRFCFS_PriorHit 30 | # translation = None, Random (default value is None) 31 | # 32 | ######################## 33 | -------------------------------------------------------------------------------- /configs/ramulator_configs/HBM-config_RoBaRaCoCh.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = HBM 6 | channels = 8 7 | ranks = 1 8 | speed = HBM_1Gbps 9 | org = HBM_4Gb 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 32 17 | mem_tick = 5 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
23 | expected_limit_insts = 200000000 24 | warmup_insts = 100000000 25 | cache = no 26 | # cache = no, L1L2, L3, all (default value is no) 27 | translation = None 28 | mapping = RoBaRaCoCh 29 | scheduler = FRFCFS 30 | # translation = None, Random (default value is None) 31 | # 32 | ######################## 33 | -------------------------------------------------------------------------------- /configs/ramulator_configs/HBM-config_RoCoBaRaCh.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = HBM 6 | channels = 8 7 | ranks = 1 8 | speed = HBM_1Gbps 9 | org = HBM_4Gb 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 32 17 | mem_tick = 5 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 23 | expected_limit_insts = 200000000 24 | warmup_insts = 100000000 25 | cache = no 26 | # cache = no, L1L2, L3, all (default value is no) 27 | translation = None 28 | mapping = RoCoBaRaCh 29 | scheduler = FRFCFS 30 | # translation = None, Random (default value is None) 31 | # 32 | ######################## 33 | -------------------------------------------------------------------------------- /configs/ramulator_configs/HBMx0.5ch-config.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = HBM 6 | channels = 4 7 | ranks = 1 8 | speed = HBM_1Gbps 9 | org = HBM_4Gb 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 32 17 | mem_tick = 5 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
23 | expected_limit_insts = 200000000 24 | warmup_insts = 100000000 25 | cache = no 26 | # cache = no, L1L2, L3, all (default value is no) 27 | translation = None 28 | # translation = None, Random (default value is None) 29 | # 30 | ######################## 31 | -------------------------------------------------------------------------------- /configs/ramulator_configs/HBMx2ch-config.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = HBM 6 | channels = 16 7 | ranks = 1 8 | speed = HBM_1Gbps 9 | org = HBM_4Gb 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 32 17 | mem_tick = 5 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 23 | expected_limit_insts = 200000000 24 | warmup_insts = 100000000 25 | cache = no 26 | # cache = no, L1L2, L3, all (default value is no) 27 | translation = None 28 | # translation = None, Random (default value is None) 29 | # 30 | ######################## 31 | -------------------------------------------------------------------------------- /configs/ramulator_configs/LPDDR3-config.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = LPDDR3 6 | channels = 1 7 | ranks = 1 8 | speed = LPDDR3_1600 9 | org = LPDDR3_8Gb_x16 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 4 17 | mem_tick = 1 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
23 | expected_limit_insts = 200000000 24 | warmup_insts = 100000000 25 | cache = no 26 | # cache = no, L1L2, L3, all (default value is no) 27 | translation = None 28 | # translation = None, Random (default value is None) 29 | # 30 | ######################## 31 | -------------------------------------------------------------------------------- /configs/ramulator_configs/LPDDR4-config.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = LPDDR4 6 | channels = 2 7 | ranks = 1 8 | speed = LPDDR4_2400 9 | org = LPDDR4_8Gb_x16 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 8 17 | mem_tick = 3 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 23 | expected_limit_insts = 200000000 24 | warmup_insts = 100000000 25 | cache = no 26 | # cache = no, L1L2, L3, all (default value is no) 27 | translation = None 28 | # translation = None, Random (default value is None) 29 | # 30 | ######################## 31 | -------------------------------------------------------------------------------- /configs/ramulator_configs/PCM-config.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = PCM 6 | channels = 1 7 | ranks = 1 8 | speed = PCM_800D 9 | org = PCM_2Gb_x8 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 4 17 | mem_tick = 1 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
23 | expected_limit_insts = 200000000 24 | warmup_insts = 100000000 25 | cache = no 26 | # cache = no, L1L2, L3, all (default value is no) 27 | translation = None 28 | # translation = None, Random (default value is None) 29 | # 30 | ######################## 31 | -------------------------------------------------------------------------------- /configs/ramulator_configs/SALP-config.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = SALP-MASA 6 | subarrays = 8 7 | channels = 1 8 | ranks = 1 9 | speed = SALP_1600K 10 | org = SALP_4Gb_x8 11 | # record_cmd_trace: (default is off): on, off 12 | record_cmd_trace = off 13 | # print_cmd_trace: (default is off): on, off 14 | print_cmd_trace = off 15 | 16 | ### Below are parameters only for CPU trace 17 | cpu_tick = 4 18 | mem_tick = 1 19 | ### Below are parameters only for multicore mode 20 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 21 | early_exit = on 22 | # early_exit = on, off (default value is on) 23 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 24 | expected_limit_insts = 200000000 25 | warmup_insts = 100000000 26 | cache = no 27 | # cache = no, L1L2, L3, all (default value is no) 28 | translation = None 29 | # translation = None, Random (default value is None) 30 | # 31 | ######################## 32 | -------------------------------------------------------------------------------- /configs/ramulator_configs/STTMRAM-config.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = STTMRAM 6 | channels = 4 7 | ranks = 1 8 | speed = STT_1600_1_2 9 | org = STTMRAM_2Gb_x8 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 4 17 | mem_tick = 1 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
23 | expected_limit_insts = 200000000 24 | warmup_insts = 100000000 25 | cache = no 26 | # cache = no, L1L2, L3, all (default value is no) 27 | translation = None 28 | # translation = None, Random (default value is None) 29 | # 30 | ######################## 31 | -------------------------------------------------------------------------------- /configs/ramulator_configs/TLDRAM-config.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = TLDRAM 6 | subarrays = 16 7 | channels = 1 8 | ranks = 1 9 | speed = TLDRAM_1600K 10 | org = TLDRAM_4Gb_x8 11 | # record_cmd_trace: (default is off): on, off 12 | record_cmd_trace = off 13 | # print_cmd_trace: (default is off): on, off 14 | print_cmd_trace = off 15 | 16 | ### Below are parameters only for CPU trace 17 | cpu_tick = 4 18 | mem_tick = 1 19 | ### Below are parameters only for multicore mode 20 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 21 | early_exit = on 22 | # early_exit = on, off (default value is on) 23 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 24 | expected_limit_insts = 200000000 25 | warmup_insts = 100000000 26 | cache = no 27 | # cache = no, L1L2, L3, all (default value is no) 28 | translation = None 29 | # translation = None, Random (default value is None) 30 | # 31 | ######################## 32 | -------------------------------------------------------------------------------- /configs/ramulator_configs/WideIO-config.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = WideIO 6 | channels = 4 7 | ranks = 1 8 | speed = WideIO_266 9 | org = WideIO_8Gb 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 4 17 | mem_tick = 1 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 
23 | expected_limit_insts = 200000000 24 | warmup_insts = 100000000 25 | cache = no 26 | # cache = no, L1L2, L3, all (default value is no) 27 | translation = None 28 | # translation = None, Random (default value is None) 29 | # 30 | ######################## 31 | -------------------------------------------------------------------------------- /configs/ramulator_configs/WideIO2-config.cfg: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Example config file 3 | # Comments start with # 4 | # There are restrictions for valid channel/rank numbers 5 | standard = WideIO2 6 | channels = 8 7 | ranks = 1 8 | speed = WideIO2_1066 9 | org = WideIO2_8Gb 10 | # record_cmd_trace: (default is off): on, off 11 | record_cmd_trace = off 12 | # print_cmd_trace: (default is off): on, off 13 | print_cmd_trace = off 14 | 15 | ### Below are parameters only for CPU trace 16 | cpu_tick = 6 17 | mem_tick = 1 18 | ### Below are parameters only for multicore mode 19 | # When early_exit is on, all cores will be terminated when the earliest one finishes. 20 | early_exit = on 21 | # early_exit = on, off (default value is on) 22 | # If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit. 23 | expected_limit_insts = 200000000 24 | warmup_insts = 100000000 25 | cache = no 26 | # cache = no, L1L2, L3, all (default value is no) 27 | translation = None 28 | # translation = None, Random (default value is None) 29 | # 30 | ######################## 31 | -------------------------------------------------------------------------------- /configs/systolic_ws_128x128_c4_booksim2_tpuv4.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_cores" : 4, 3 | "core_freq" : 1000, 4 | "core_print_interval" : 8000, 5 | "core_config" : { 6 | "core_0" : { 7 | "core_type" : "systolic_ws", 8 | "core_width" : 128, 9 | "core_height" : 128, 10 | 11 | "spad_size" : 32768, 12 | "accum_spad_size" : 4096, 13 | "sram_width" : 32, 14 | 15 | "vector_process_bit" : 65536, 16 | "add_latency" : 1, 17 | "mul_latency" : 1, 18 | "mac_latency" : 1, 19 | "exp_latency" : 1, 20 | "gelu_latency" : 1, 21 | "div_latency" : 1, 22 | "add_tree_latency" : 1, 23 | "scalar_sqrt_latency" : 1, 24 | "scalar_add_latency" : 1, 25 | "scalar_mul_latency" : 1 26 | }, 27 | "core_1" : { 28 | "core_type" : "systolic_ws", 29 | "core_width" : 128, 30 | "core_height" : 128, 31 | 32 | "spad_size" : 32768, 33 | "accum_spad_size" : 4096, 34 | "sram_width" : 32, 35 | 36 | "vector_process_bit" : 65536, 37 | "add_latency" : 1, 38 | "mul_latency" : 1, 39 | "mac_latency" : 1, 40 | "exp_latency" : 1, 41 | "gelu_latency" : 1, 42 | "div_latency" : 1, 43 | "add_tree_latency" : 1, 44 | "scalar_sqrt_latency" : 1, 45 | "scalar_add_latency" : 1, 46 | "scalar_mul_latency" : 1 47 | }, 48 | "core_2" : { 49 | "core_type" : "systolic_ws", 50 | "core_width" : 128, 51 | "core_height" : 128, 52 | 53 | "spad_size" : 32768, 54 | "accum_spad_size" : 4096, 55 | "sram_width" : 32, 56 | 57 | "vector_process_bit" : 65536, 58 | "add_latency" : 1, 59 | "mul_latency" : 1, 60 | "mac_latency" : 1, 61 | "exp_latency" : 1, 62 | "gelu_latency" : 1, 63 | "div_latency" : 1, 64 | "add_tree_latency" : 1, 65 | "scalar_sqrt_latency" : 1, 66 | "scalar_add_latency" : 1, 67 | "scalar_mul_latency" : 1 68 | }, 69 |
"core_3" : { 70 | "core_type" : "systolic_ws", 71 | "core_width" : 128, 72 | "core_height" : 128, 73 | 74 | "spad_size" : 32768, 75 | "accum_spad_size" : 4096, 76 | "sram_width" : 32, 77 | 78 | "vector_process_bit" : 65536, 79 | "add_latency" : 1, 80 | "mul_latency" : 1, 81 | "mac_latency" : 1, 82 | "exp_latency" : 1, 83 | "gelu_latency" : 1, 84 | "div_latency" : 1, 85 | "add_tree_latency" : 1, 86 | "scalar_sqrt_latency" : 1, 87 | "scalar_add_latency" : 1, 88 | "scalar_mul_latency" : 1 89 | } 90 | }, 91 | 92 | "dram_type" : "ramulator2", 93 | "dram_freq" :1200, 94 | "dram_channels": 16, 95 | "dram_req_size": 32, 96 | "dram_latency" : 10, 97 | "dram_size" : 16, 98 | "dram_nbl" : 1, 99 | "dram_print_interval": 4800, 100 | "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml", 101 | 102 | "icnt_type" : "booksim2", 103 | "icnt_latency" : 1, 104 | "icnt_freq" : 8000, 105 | "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m16.icnt", 106 | 107 | "precision" : 2, 108 | "layout" : "NHWC", 109 | "scheduler" : "simple" 110 | } -------------------------------------------------------------------------------- /configs/systolic_ws_128x128_c4_simple_noc_tpuv4.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_cores" : 4, 3 | "core_freq" : 1000, 4 | "core_print_interval" : 8000, 5 | "core_config" : { 6 | "core_0" : { 7 | "core_type" : "systolic_ws", 8 | "core_width" : 128, 9 | "core_height" : 128, 10 | 11 | "spad_size" : 32768, 12 | "accum_spad_size" : 4096, 13 | "sram_width" : 32, 14 | 15 | "vector_process_bit" : 65536, 16 | "add_latency" : 1, 17 | "mul_latency" : 1, 18 | "mac_latency" : 1, 19 | "exp_latency" : 1, 20 | "gelu_latency" : 1, 21 | "div_latency" : 1, 22 | "add_tree_latency" : 1, 23 | "scalar_sqrt_latency" : 1, 24 | "scalar_add_latency" : 1, 25 | "scalar_mul_latency" : 1 26 | }, 27 | "core_1" : { 28 | "core_type" : "systolic_ws", 29 | "core_width" : 128, 30 | "core_height" : 128, 31 | 32 | "spad_size" : 32768, 33 | "accum_spad_size" : 4096, 34 | "sram_width" : 32, 35 | 36 | "vector_process_bit" : 65536, 37 | "add_latency" : 1, 38 | "mul_latency" : 1, 39 | "mac_latency" : 1, 40 | "exp_latency" : 1, 41 | "gelu_latency" : 1, 42 | "div_latency" : 1, 43 | "add_tree_latency" : 1, 44 | "scalar_sqrt_latency" : 1, 45 | "scalar_add_latency" : 1, 46 | "scalar_mul_latency" : 1 47 | }, 48 | "core_2" : { 49 | "core_type" : "systolic_ws", 50 | "core_width" : 128, 51 | "core_height" : 128, 52 | 53 | "spad_size" : 32768, 54 | "accum_spad_size" : 4096, 55 | "sram_width" : 32, 56 | 57 | "vector_process_bit" : 65536, 58 | "add_latency" : 1, 59 | "mul_latency" : 1, 60 | "mac_latency" : 1, 61 | "exp_latency" : 1, 62 | "gelu_latency" : 1, 63 | "div_latency" : 1, 64 | "add_tree_latency" : 1, 65 | "scalar_sqrt_latency" : 1, 66 | "scalar_add_latency" : 1, 67 | "scalar_mul_latency" : 1 68 | }, 69 | "core_3" : { 70 | "core_type" : "systolic_ws", 71 | "core_width" : 128, 72 | "core_height" : 128, 73 | 74 | "spad_size" : 32768, 75 | "accum_spad_size" : 4096, 76 | "sram_width" : 32, 77 | 78 | "vector_process_bit" : 65536, 79 | "add_latency" : 1, 80 | "mul_latency" : 1, 81 | "mac_latency" : 1, 82 | "exp_latency" : 1, 83 | "gelu_latency" : 1, 84 | "div_latency" : 1, 85 | "add_tree_latency" : 1, 86 | "scalar_sqrt_latency" : 1, 87 | "scalar_add_latency" : 1, 88 | "scalar_mul_latency" : 1 89 | } 90 | }, 91 | "dram_type" : "ramulator2", 92 | "dram_freq" :1200, 93 | "dram_channels": 16, 94 | "dram_req_size": 32, 95 | "dram_latency" : 10, 96 | "dram_size" : 16, 97 | 
"dram_nbl" : 1, 98 | "dram_print_interval": 9600, 99 | "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml", 100 | 101 | "icnt_type" : "simple", 102 | "icnt_latency" : 1, 103 | "icnt_freq" : 8000, 104 | "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", 105 | 106 | "precision" : 2, 107 | "layout" : "NHWC", 108 | "scheduler" : "simple" 109 | } -------------------------------------------------------------------------------- /configs/systolic_ws_128x128_c4_simple_noc_tpuv4_half_ramulator2.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_cores" : 4, 3 | "core_freq" : 1000, 4 | "core_print_interval" : 8000, 5 | "core_config" : { 6 | "core_0" : { 7 | "core_type" : "systolic_ws", 8 | "core_width" : 128, 9 | "core_height" : 128, 10 | 11 | "spad_size" : 16384, 12 | "accum_spad_size" : 4096, 13 | "sram_width" : 32, 14 | 15 | "vector_process_bit" : 16384, 16 | "add_latency" : 1, 17 | "mul_latency" : 1, 18 | "mac_latency" : 1, 19 | "exp_latency" : 1, 20 | "gelu_latency" : 1, 21 | "div_latency" : 1, 22 | "add_tree_latency" : 1, 23 | "scalar_sqrt_latency" : 1, 24 | "scalar_add_latency" : 1, 25 | "scalar_mul_latency" : 1 26 | }, 27 | "core_1" : { 28 | "core_type" : "systolic_ws", 29 | "core_width" : 128, 30 | "core_height" : 128, 31 | 32 | "spad_size" : 16384, 33 | "accum_spad_size" : 4096, 34 | "sram_width" : 32, 35 | 36 | "vector_process_bit" : 16384, 37 | "add_latency" : 1, 38 | "mul_latency" : 1, 39 | "mac_latency" : 1, 40 | "exp_latency" : 1, 41 | "gelu_latency" : 1, 42 | "div_latency" : 1, 43 | "add_tree_latency" : 1, 44 | "scalar_sqrt_latency" : 1, 45 | "scalar_add_latency" : 1, 46 | "scalar_mul_latency" : 1 47 | }, 48 | "core_2" : { 49 | "core_type" : "systolic_ws", 50 | "core_width" : 128, 51 | "core_height" : 128, 52 | 53 | "spad_size" : 16384, 54 | "accum_spad_size" : 4096, 55 | "sram_width" : 32, 56 | 57 | "vector_process_bit" : 16384, 58 | "add_latency" : 1, 59 | "mul_latency" : 1, 60 | "mac_latency" : 1, 61 | "exp_latency" : 1, 62 | "gelu_latency" : 1, 63 | "div_latency" : 1, 64 | "add_tree_latency" : 1, 65 | "scalar_sqrt_latency" : 1, 66 | "scalar_add_latency" : 1, 67 | "scalar_mul_latency" : 1 68 | }, 69 | "core_3" : { 70 | "core_type" : "systolic_ws", 71 | "core_width" : 128, 72 | "core_height" : 128, 73 | 74 | "spad_size" : 16384, 75 | "accum_spad_size" : 4096, 76 | "sram_width" : 32, 77 | 78 | "vector_process_bit" : 16384, 79 | "add_latency" : 1, 80 | "mul_latency" : 1, 81 | "mac_latency" : 1, 82 | "exp_latency" : 1, 83 | "gelu_latency" : 1, 84 | "div_latency" : 1, 85 | "add_tree_latency" : 1, 86 | "scalar_sqrt_latency" : 1, 87 | "scalar_add_latency" : 1, 88 | "scalar_mul_latency" : 1 89 | } 90 | }, 91 | 92 | "dram_type" : "ramulator2", 93 | "dram_freq" :1200, 94 | "dram_channels": 16, 95 | "dram_req_size": 32, 96 | "dram_latency" : 10, 97 | "dram_size" : 16, 98 | "dram_nbl" : 1, 99 | "dram_print_interval": 10000, 100 | "dram_config_path" : "../configs/ramulator2_configs/HBM2.yaml", 101 | 102 | "icnt_type" : "simple", 103 | "icnt_latency" : 1, 104 | "icnt_freq" : 8400, 105 | "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", 106 | 107 | "precision" : 2, 108 | "layout" : "NHWC", 109 | "scheduler" : "simple" 110 | } -------------------------------------------------------------------------------- /configs/systolic_ws_8x8_c1_booksim2_transformer.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_cores" : 1, 3 | "core_freq" : 1000, 4 | 
"core_print_interval" : 10000, 5 | "core_config" : { 6 | "core_0": { 7 | "core_type" : "systolic_ws", 8 | "core_width" : 8, 9 | "core_height" : 8, 10 | 11 | "spad_size" : 64, 12 | "accum_spad_size" : 16, 13 | "sram_width" : 32, 14 | 15 | "vector_process_bit" : 2048, 16 | "add_latency" : 1, 17 | "mul_latency" : 1, 18 | "mac_latency" : 1, 19 | "exp_latency" : 1, 20 | "gelu_latency" : 1, 21 | "div_latency" : 1, 22 | "add_tree_latency" : 1, 23 | "scalar_sqrt_latency" : 1, 24 | "scalar_add_latency" : 1, 25 | "scalar_mul_latency" : 1 26 | } 27 | }, 28 | 29 | "dram_type" : "ramulator2", 30 | "dram_freq" : 400, 31 | "dram_channels": 2, 32 | "dram_req_size": 32, 33 | "dram_latency" : 10, 34 | "dram_print_interval": 10000, 35 | "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", 36 | 37 | "icnt_type" : "booksim2", 38 | "icnt_latency" : 1, 39 | "icnt_freq" : 2000, 40 | "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m2.icnt", 41 | 42 | "precision" : 2, 43 | "layout" : "NHWC", 44 | "scheduler" : "simple" 45 | } -------------------------------------------------------------------------------- /configs/systolic_ws_8x8_c1_simple_noc_transformer.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_cores" : 1, 3 | "core_freq" : 1000, 4 | "core_print_interval" : 10000, 5 | "core_config" : { 6 | "core_0": { 7 | "core_type" : "systolic_ws", 8 | "core_width" : 8, 9 | "core_height" : 8, 10 | 11 | "spad_size" : 64, 12 | "accum_spad_size" : 16, 13 | "sram_width" : 32, 14 | 15 | "vector_process_bit" : 2048, 16 | "add_latency" : 1, 17 | "mul_latency" : 1, 18 | "mac_latency" : 1, 19 | "exp_latency" : 1, 20 | "gelu_latency" : 1, 21 | "div_latency" : 1, 22 | "add_tree_latency" : 1, 23 | "scalar_sqrt_latency" : 1, 24 | "scalar_add_latency" : 1, 25 | "scalar_mul_latency" : 1 26 | } 27 | }, 28 | 29 | "dram_type" : "ramulator2", 30 | "dram_freq" : 400, 31 | "dram_channels": 2, 32 | "dram_req_size": 32, 33 | "dram_latency" : 10, 34 | "dram_print_interval": 160000, 35 | "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", 36 | 37 | "icnt_type" : "simple", 38 | "icnt_latency" : 1, 39 | "icnt_freq" : 2000, 40 | "icnt_config_path" : "../configs/booksim2_configs/fly_c1_m2.icnt", 41 | 42 | "precision" : 1, 43 | "layout" : "NHWC", 44 | "scheduler" : "simple" 45 | } -------------------------------------------------------------------------------- /configs/systolic_ws_8x8_c4_booksim2_transformer.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_cores" : 4, 3 | "core_freq" : 1000, 4 | "core_print_interval" : 10000, 5 | "core_config" : { 6 | "core_0": { 7 | "core_type" : "systolic_ws", 8 | "core_width" : 8, 9 | "core_height" : 8, 10 | 11 | "spad_size" : 64, 12 | "accum_spad_size" : 16, 13 | "sram_width" : 32, 14 | 15 | "vector_process_bit" : 2048, 16 | "add_latency" : 1, 17 | "mul_latency" : 1, 18 | "mac_latency" : 1, 19 | "exp_latency" : 1, 20 | "gelu_latency" : 1, 21 | "div_latency" : 1, 22 | "add_tree_latency" : 1, 23 | "scalar_sqrt_latency" : 1, 24 | "scalar_add_latency" : 1, 25 | "scalar_mul_latency" : 1 26 | }, 27 | "core_1": { 28 | "core_type" : "systolic_ws", 29 | "core_width" : 8, 30 | "core_height" : 8, 31 | 32 | "spad_size" : 64, 33 | "accum_spad_size" : 16, 34 | "sram_width" : 32, 35 | 36 | "vector_process_bit" : 2048, 37 | "add_latency" : 1, 38 | "mul_latency" : 1, 39 | "mac_latency" : 1, 40 | "exp_latency" : 1, 41 | "gelu_latency" : 1, 42 | "div_latency" : 1, 43 | "add_tree_latency" : 1, 44 
| "scalar_sqrt_latency" : 1, 45 | "scalar_add_latency" : 1, 46 | "scalar_mul_latency" : 1 47 | }, 48 | "core_2": { 49 | "core_type" : "systolic_ws", 50 | "core_width" : 8, 51 | "core_height" : 8, 52 | 53 | "spad_size" : 64, 54 | "accum_spad_size" : 16, 55 | "sram_width" : 32, 56 | 57 | "vector_process_bit" : 2048, 58 | "add_latency" : 1, 59 | "mul_latency" : 1, 60 | "mac_latency" : 1, 61 | "exp_latency" : 1, 62 | "gelu_latency" : 1, 63 | "div_latency" : 1, 64 | "add_tree_latency" : 1, 65 | "scalar_sqrt_latency" : 1, 66 | "scalar_add_latency" : 1, 67 | "scalar_mul_latency" : 1 68 | }, 69 | "core_3": { 70 | "core_type" : "systolic_ws", 71 | "core_width" : 8, 72 | "core_height" : 8, 73 | 74 | "spad_size" : 64, 75 | "accum_spad_size" : 16, 76 | "sram_width" : 32, 77 | 78 | "vector_process_bit" : 2048, 79 | "add_latency" : 1, 80 | "mul_latency" : 1, 81 | "mac_latency" : 1, 82 | "exp_latency" : 1, 83 | "gelu_latency" : 1, 84 | "div_latency" : 1, 85 | "add_tree_latency" : 1, 86 | "scalar_sqrt_latency" : 1, 87 | "scalar_add_latency" : 1, 88 | "scalar_mul_latency" : 1 89 | } 90 | }, 91 | 92 | "dram_type" : "ramulator2", 93 | "dram_freq" : 400, 94 | "dram_channels": 2, 95 | "dram_req_size": 32, 96 | "dram_latency" : 10, 97 | "dram_print_interval": 10000, 98 | "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", 99 | 100 | "icnt_type" : "booksim2", 101 | "icnt_latency" : 1, 102 | "icnt_freq" : 2000, 103 | "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m2.icnt", 104 | 105 | "precision" : 2, 106 | "layout" : "NHWC", 107 | "scheduler" : "simple" 108 | } -------------------------------------------------------------------------------- /configs/systolic_ws_8x8_c4_simple_noc_transformer.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_cores" : 4, 3 | "core_freq" : 1000, 4 | "core_print_interval" : 10000, 5 | "core_config" : { 6 | "core_0": { 7 | "core_type" : "systolic_ws", 8 | "core_width" : 8, 9 | "core_height" : 8, 10 | 11 | "spad_size" : 64, 12 | "accum_spad_size" : 16, 13 | "sram_width" : 32, 14 | 15 | "vector_process_bit" : 2048, 16 | "add_latency" : 1, 17 | "mul_latency" : 1, 18 | "mac_latency" : 1, 19 | "exp_latency" : 1, 20 | "gelu_latency" : 1, 21 | "div_latency" : 1, 22 | "add_tree_latency" : 1, 23 | "scalar_sqrt_latency" : 1, 24 | "scalar_add_latency" : 1, 25 | "scalar_mul_latency" : 1 26 | }, 27 | "core_1": { 28 | "core_type" : "systolic_ws", 29 | "core_width" : 8, 30 | "core_height" : 8, 31 | 32 | "spad_size" : 64, 33 | "accum_spad_size" : 16, 34 | "sram_width" : 32, 35 | 36 | "vector_process_bit" : 2048, 37 | "add_latency" : 1, 38 | "mul_latency" : 1, 39 | "mac_latency" : 1, 40 | "exp_latency" : 1, 41 | "gelu_latency" : 1, 42 | "div_latency" : 1, 43 | "add_tree_latency" : 1, 44 | "scalar_sqrt_latency" : 1, 45 | "scalar_add_latency" : 1, 46 | "scalar_mul_latency" : 1 47 | }, 48 | "core_2": { 49 | "core_type" : "systolic_ws", 50 | "core_width" : 8, 51 | "core_height" : 8, 52 | 53 | "spad_size" : 64, 54 | "accum_spad_size" : 16, 55 | "sram_width" : 32, 56 | 57 | "vector_process_bit" : 2048, 58 | "add_latency" : 1, 59 | "mul_latency" : 1, 60 | "mac_latency" : 1, 61 | "exp_latency" : 1, 62 | "gelu_latency" : 1, 63 | "div_latency" : 1, 64 | "add_tree_latency" : 1, 65 | "scalar_sqrt_latency" : 1, 66 | "scalar_add_latency" : 1, 67 | "scalar_mul_latency" : 1 68 | }, 69 | "core_3": { 70 | "core_type" : "systolic_ws", 71 | "core_width" : 8, 72 | "core_height" : 8, 73 | 74 | "spad_size" : 64, 75 | "accum_spad_size" : 16, 76 | 
"sram_width" : 32, 77 | 78 | "vector_process_bit" : 2048, 79 | "add_latency" : 1, 80 | "mul_latency" : 1, 81 | "mac_latency" : 1, 82 | "exp_latency" : 1, 83 | "gelu_latency" : 1, 84 | "div_latency" : 1, 85 | "add_tree_latency" : 1, 86 | "scalar_sqrt_latency" : 1, 87 | "scalar_add_latency" : 1, 88 | "scalar_mul_latency" : 1 89 | } 90 | }, 91 | 92 | "dram_type" : "ramulator2", 93 | "dram_freq" : 400, 94 | "dram_channels": 2, 95 | "dram_req_size": 32, 96 | "dram_latency" : 10, 97 | "dram_print_interval": 10000, 98 | "dram_config_path" : "../configs/ramulator2_configs/DDR4.yaml", 99 | 100 | "icnt_type" : "simple", 101 | "icnt_latency" : 1, 102 | "icnt_freq" : 2000, 103 | "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m2.icnt", 104 | 105 | "precision" : 2, 106 | "layout" : "NHWC", 107 | "scheduler" : "simple" 108 | } -------------------------------------------------------------------------------- /configs/test.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_cores" : 1, 3 | "core_type" : "systolic_ws", 4 | "core_freq" : 1000, 5 | "core_width" : 8, 6 | "core_height" : 8, 7 | 8 | "vector_process_bit" : 32, 9 | 10 | "spad_size" : 448, 11 | "sram_width" : 32, 12 | 13 | "dram_type" : "ramulator", 14 | "dram_freq" : 877, 15 | "dram_channels": 8, 16 | "dram_req_size": 32, 17 | "dram_latency" : 10, 18 | "dram_config_path" : "../configs/ramulator_configs/HBM-config.cfg", 19 | 20 | "icnt_type" : "simple", 21 | "icnt_latency" : 1, 22 | "icnt_freq" : 2000, 23 | "icnt_config_path" : "../configs/booksim2_configs/fly_c64_m8.icnt", 24 | 25 | "precision" : 1, 26 | "layout" : "NHWC", 27 | "scheduler" : "simple" 28 | } -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_os_32x32/README.md: -------------------------------------------------------------------------------- 1 | Simple Output Stationary Architecture 2 | ---------------------------- 3 | This folder contains a simple output stationary architecture. 4 | 5 | Q&As: 6 | ---------------------------- 7 | 1. How long do the Timeloop simulations take? 8 | 9 | Depending on your workload, the simulation takes various amount of time to finish. Generally, they should 10 | converge within 30 mins. You can manually stop the exploration when you see things are converging by 11 | pressing `ctrl + C`. They sometimes will take much longer to 12 | automaticaly stop as we set the converging cretiria to be pretty high to avoid early-stop with subooptimal mappings. Use you own 13 | judgement. 
-------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_os_32x32/arch/components/reg_storage.yaml: -------------------------------------------------------------------------------- 1 | compound_components: 2 | version: 0.3 3 | classes: 4 | - name: reg_storage 5 | attributes: 6 | technology: 45nm 7 | latency: 1ns 8 | width: 16 9 | depth: 1 10 | subcomponents: 11 | - name: storage 12 | class: reg 13 | attributes: 14 | technology: technology 15 | latency: latency 16 | datawidth : width 17 | actions: 18 | - name: access 19 | subcomponents: 20 | - name: storage 21 | actions: 22 | - name: access -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_os_32x32/arch/components/smartbuffer_RF.yaml: -------------------------------------------------------------------------------- 1 | compound_components: 2 | version: 0.3 3 | classes: 4 | - name: smartbuffer_RF 5 | attributes: 6 | technology: 45nm 7 | memory_depth: 12 8 | memory_width: 16 9 | n_rdwr_ports: 2 10 | n_banks: 1 11 | n_buffets: 1 12 | subcomponents: 13 | - name: storage 14 | class: regfile 15 | attributes: 16 | technology: technology 17 | width: memory_width 18 | depth: memory_depth 19 | n_rdwr_ports: n_rdwr_ports 20 | n_banks: n_banks 21 | - name: address_generators[0..1] 22 | class: intadder 23 | attributes: 24 | technology: technology 25 | width: log(memory_depth) 26 | actions: 27 | - name: write 28 | arguments: 29 | data_delta: 0..1 30 | address_delta: 0..n_banks 31 | subcomponents: 32 | - name: storage 33 | actions: 34 | - name: write 35 | arguments: 36 | data_delta: data_delta 37 | address_delta: address_delta 38 | - name: address_generators[0] 39 | actions: 40 | - name: add 41 | - name: address_generators[1] 42 | actions: 43 | - name: idle 44 | - name: read 45 | arguments: 46 | data_delta: 0..1 47 | address_delta: 0..n_banks 48 | subcomponents: 49 | - name: storage 50 | actions: 51 | - name: read 52 | arguments: 53 | data_delta: data_delta 54 | address_delta: address_delta 55 | - name: address_generators[1] 56 | actions: 57 | - name: add 58 | - name: address_generators[0] 59 | actions: 60 | - name: idle 61 | - name: idle 62 | subcomponents: 63 | - name: storage 64 | actions: 65 | - name: idle 66 | - name: address_generators[0..1] 67 | actions: 68 | - name: idle 69 | -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_os_32x32/arch/components/smartbuffer_SRAM.yaml: -------------------------------------------------------------------------------- 1 | compound_components: 2 | version: 0.3 3 | classes: 4 | - name: smartbuffer_SRAM 5 | attributes: 6 | technology: 45nm 7 | memory_depth: 12 8 | memory_width: 16 9 | n_rdwr_ports: 2 10 | n_banks: 1 11 | n_buffets: 1 12 | subcomponents: 13 | - name: storage 14 | class: SRAM 15 | attributes: 16 | technology: technology 17 | width: memory_width 18 | depth: memory_depth 19 | n_rdwr_ports: n_rdwr_ports 20 | n_banks: n_banks 21 | - name: address_generators[0..1] 22 | class: intadder 23 | attributes: 24 | technology: technology 25 | width: log(memory_depth) 26 | actions: 27 | - name: write 28 | arguments: 29 | data_delta: 0..1 30 | address_delta: 0..n_banks 31 | subcomponents: 32 | - name: storage 33 | actions: 34 | - name: write 35 | arguments: 36 | data_delta: data_delta 37 | address_delta: address_delta 38 | - name: address_generators[0] 39 | actions: 40 | - name: count 41 | - name: address_generators[1] 42 | 
actions: 43 | - name: idle 44 | - name: read 45 | arguments: 46 | data_delta: 0..1 47 | address_delta: 0..n_banks 48 | subcomponents: 49 | - name: storage 50 | actions: 51 | - name: read 52 | arguments: 53 | data_delta: data_delta 54 | address_delta: address_delta 55 | - name: address_generators[1] 56 | actions: 57 | - name: add 58 | - name: address_generators[0] 59 | actions: 60 | - name: idle 61 | - name: idle 62 | subcomponents: 63 | - name: storage 64 | actions: 65 | - name: idle 66 | - name: address_generators[0..1] 67 | actions: 68 | - name: idle 69 | -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_os_32x32/arch/simple_output_stationary.yaml: -------------------------------------------------------------------------------- 1 | architecture: 2 | # ============================================================ 3 | # Architecture Description 4 | # ============================================================ 5 | version: 0.3 6 | subtree: 7 | - name: system 8 | local: 9 | - name: DRAM 10 | class: DRAM 11 | attributes: 12 | type: HBM2 13 | width: 32 14 | block-size: 2 15 | word-bits: 16 16 | subtree: 17 | - name: simple_ws[0] 18 | attributes: 19 | technology: 45nm 20 | local: 21 | - name: shared_glb 22 | class: smartbuffer_SRAM 23 | attributes: 24 | memory_depth: 458752 25 | memory_width: 32 26 | n_banks: 64 27 | block-size: 2 28 | word-bits: 16 29 | read_bandwidth: 32 30 | write_bandwidth: 32 31 | meshX: 1 32 | subtree: 33 | - name: PE[0..1023] 34 | attributes: 35 | meshX: 1 36 | local: 37 | - name: pe_spad 38 | class: smartbuffer_SRAM 39 | attributes: 40 | memory_depth: 1 41 | memory_width: 16 42 | block-size: 1 43 | word-bits: 16 44 | meshX: 32 45 | - name: mac 46 | class: intmac 47 | attributes: 48 | datawidth: 16 49 | meshX : 32 50 | # input and output registers for the mac unit 51 | - name: weight_reg 52 | class: reg_storage 53 | attributes: 54 | depth: 16 55 | width: 16 # width in bits 56 | meshX: 32 57 | - name: input_activation_reg 58 | class: reg_storage 59 | attributes: 60 | depth: 1 61 | width: 16 # width in bits 62 | meshX: 32 63 | - name: output_activation_reg 64 | class: reg_storage 65 | attributes: 66 | depth: 1 67 | width: 16 # width in bits 68 | meshX: 32 69 | -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_os_32x32/arch/simple_output_stationary.yaml.tmp: -------------------------------------------------------------------------------- 1 | architecture: 2 | # ============================================================ 3 | # Architecture Description 4 | # ============================================================ 5 | version: 0.3 6 | subtree: 7 | - name: system 8 | local: 9 | - name: DRAM 10 | class: DRAM 11 | attributes: 12 | type: LPDDR4 13 | width: 32 14 | block-size: 2 15 | word-bits: 16 16 | subtree: 17 | - name: simple_ws 18 | attributes: 19 | technology: 45nm 20 | local: 21 | - name: shared_glb 22 | class: smartbuffer_SRAM 23 | attributes: 24 | memory_depth: 458752 25 | memory_width: 32 26 | n_banks: 64 27 | block-size: 2 28 | word-bits: 16 29 | read_bandwidth: 32 30 | write_bandwidth: 32 31 | subtree: 32 | - name: PE[0..1023] 33 | local: 34 | - name: pe_spad 35 | class: smartbuffer_SRAM 36 | attributes: 37 | memory_depth: 1 38 | memory_width: 16 39 | block-size: 1 40 | word-bits: 16 41 | meshX: 32 42 | - name: mac 43 | class: intmac 44 | attributes: 45 | datawidth: 16 46 | meshX : 32 47 | # input and output registers for the mac 
unit 48 | - name: weight_reg 49 | class: reg_storage 50 | attributes: 51 | depth: 16 52 | width: 16 # width in bits 53 | meshX: 32 54 | - name: input_activation_reg 55 | class: reg_storage 56 | attributes: 57 | depth: 1 58 | width: 16 # width in bits 59 | meshX: 32 60 | - name: output_activation_reg 61 | class: reg_storage 62 | attributes: 63 | depth: 1 64 | width: 16 # width in bits 65 | meshX: 32 66 | -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_os_32x32/constraints/simple_output_stationary_arch_constraints.yaml: -------------------------------------------------------------------------------- 1 | # 2 | # The following constraints are limitations of the hardware architecture and dataflow 3 | # 4 | 5 | architecture_constraints: 6 | targets: 7 | - target: DRAM 8 | type: temporal 9 | permutation: CPQ 10 | # pe spad only stored outputs 11 | - target: pe_spad 12 | type: bypass 13 | bypass: [Inputs, Weights] 14 | keep: [Outputs] 15 | # pe spad keeps outputs stationary 16 | - target: pe_spad 17 | type: temporal 18 | permutation: CRSPQ 19 | # NoC sending C in x direction, M in y direction; parallel-for loops for C and M only 20 | - target: shared_glb 21 | type: bypass 22 | bypass: [Outputs] 23 | keep: [Inputs, Weights] 24 | - target: shared_glb 25 | type: spatial 26 | permutation: MPQ 27 | split: 1 28 | factors: R=1 S=1 C=1 29 | # enforce the registers to only store 1 data of the datatype it stores 30 | - target: weight_reg 31 | type: temporal 32 | factors: R=1 S=1 P=1 Q=1 C=16 33 | - target: weight_reg 34 | type: bypass 35 | keep: [Weights] 36 | bypass: [Inputs, Outputs] 37 | - target: input_activation_reg 38 | type: temporal 39 | factors: P=1 Q=1 C=1 N=1 40 | - target: input_activation_reg 41 | type: bypass 42 | keep: [Inputs] 43 | bypass: [Outputs, Weights] 44 | - target: output_activation_reg 45 | type: temporal 46 | factors: P=1 Q=1 M=1 N=1 47 | - target: output_activation_reg 48 | type: bypass 49 | keep: [Outputs] 50 | bypass: [Inputs, Weights] 51 | -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_os_32x32/constraints/simple_output_stationary_map_constraints.yaml: -------------------------------------------------------------------------------- 1 | # 2 | # The following constraints are not limitations of the hardware architecture and dataflow, 3 | # but help limit the search space to speed up search 4 | # 5 | 6 | mapspace_constraints: 7 | targets: 8 | # intuitive optimization to not tile R and S at the GLB level 9 | - target: shared_glb 10 | type: temporal 11 | factors: R=1 S=1 12 | # intuitive optimization according to architecture dimensions 13 | #- target: shared_glb 14 | #type: spatial 15 | # factors: M=16 N=16 16 | # intuitive optimization to not tile R and S at the DRAM level 17 | - target: DRAM 18 | type: temporal 19 | factors: R=1 S=1 20 | # optimization to constrain the amplification factor of R and S to only one register 21 | - target: output_activation_reg 22 | type: temporal 23 | factors: R=1 S=1 24 | -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_os_32x32/mapper/mapper.yaml: -------------------------------------------------------------------------------- 1 | mapper: 2 | optimization-metrics: [ delay, energy ] 3 | live-status: False 4 | num-threads: 8 5 | timeout: 15000 6 | victory-condition: 3000 7 | algorithm: random-pruned 8 | max-permutations-per-if-visit: 16 
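As a concrete follow-up to the README's note on running Timeloop, here is a minimal, hedged sketch of a manual mapper invocation that uses the architecture, constraint, and mapper files in this systolic_os_32x32 folder. It assumes `timeloop-mapper` (with Accelergy) is installed and on your PATH; `my_layer.yaml` is a hypothetical placeholder for a workload description, since no problem file ships in this folder, and the repository's `scripts/run_timeloop.sh` is presumably the supported driver for full runs.

```sh
# Hedged sketch, not the project's official flow: my_layer.yaml is a placeholder
# workload description that you must generate or supply yourself.
cd configs/timeloop_configs/systolic_os_32x32
timeloop-mapper \
    arch/simple_output_stationary.yaml \
    arch/components/*.yaml \
    constraints/*.yaml \
    mapper/mapper.yaml \
    my_layer.yaml
# Outputs (timeloop-mapper.stats.txt, timeloop-mapper.map.txt, ...) are written to
# the working directory, as in the example_AlexNet_layer1_outputs folder under
# systolic_ws_8x8.
```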
-------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_ws_8x8/README.md: -------------------------------------------------------------------------------- 1 | Simple Weight Stationary Architecture 2 | ---------------------------- 3 | This folder contains a simple weight stationary architecture. 4 | 5 | Q&As: 6 | ---------------------------- 7 | 1. How long do the Timeloop simulations take? 8 | 9 | Depending on your workload, the simulations take varying amounts of time to finish. Generally, they should 10 | converge within 30 minutes. You can manually stop the exploration when you see things converging by 11 | pressing `ctrl + C`. They sometimes take much longer to 12 | stop automatically because the convergence criteria are set fairly high to avoid stopping early with suboptimal mappings. Use your own 13 | judgement. 14 | 15 | 2. How do I get started using the architecture skeleton to model architectures with advanced technologies? 16 | 17 | You generally need to modify the definitions of the compound components. If needed, you will also likely need to 18 | update the architecture description to include the additional setup for your architecture. 19 | 20 | An example design for a compute-in-memory architecture using ReRAM can be found 21 | [here](https://github.com/Accelergy-Project/processing-in-memory-design) 22 | -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_ws_8x8/arch/components/reg_storage.yaml: -------------------------------------------------------------------------------- 1 | compound_components: 2 | version: 0.3 3 | classes: 4 | - name: reg_storage 5 | attributes: 6 | technology: 45nm 7 | latency: 1ns 8 | width: 16 9 | depth: 1 10 | subcomponents: 11 | - name: storage 12 | class: reg 13 | attributes: 14 | technology: technology 15 | latency: latency 16 | datawidth : width 17 | actions: 18 | - name: access 19 | subcomponents: 20 | - name: storage 21 | actions: 22 | - name: access -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_ws_8x8/arch/components/smartbuffer_RF.yaml: -------------------------------------------------------------------------------- 1 | compound_components: 2 | version: 0.3 3 | classes: 4 | - name: smartbuffer_RF 5 | attributes: 6 | technology: 45nm 7 | memory_depth: 12 8 | memory_width: 16 9 | n_rdwr_ports: 2 10 | n_banks: 1 11 | n_buffets: 1 12 | subcomponents: 13 | - name: storage 14 | class: regfile 15 | attributes: 16 | technology: technology 17 | width: memory_width 18 | depth: memory_depth 19 | n_rdwr_ports: n_rdwr_ports 20 | n_banks: n_banks 21 | - name: address_generators[0..1] 22 | class: intadder 23 | attributes: 24 | technology: technology 25 | width: log(memory_depth) 26 | actions: 27 | - name: write 28 | arguments: 29 | data_delta: 0..1 30 | address_delta: 0..n_banks 31 | subcomponents: 32 | - name: storage 33 | actions: 34 | - name: write 35 | arguments: 36 | data_delta: data_delta 37 | address_delta: address_delta 38 | - name: address_generators[0] 39 | actions: 40 | - name: add 41 | - name: address_generators[1] 42 | actions: 43 | - name: idle 44 | - name: read 45 | arguments: 46 | data_delta: 0..1 47 | address_delta: 0..n_banks 48 | subcomponents: 49 | - name: storage 50 | actions: 51 | - name: read 52 | arguments: 53 | data_delta: data_delta 54 | address_delta: address_delta 55 | - name: address_generators[1] 56 | actions: 57 | - name: add 58 |
- name: address_generators[0] 59 | actions: 60 | - name: idle 61 | - name: idle 62 | subcomponents: 63 | - name: storage 64 | actions: 65 | - name: idle 66 | - name: address_generators[0..1] 67 | actions: 68 | - name: idle 69 | -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_ws_8x8/arch/components/smartbuffer_SRAM.yaml: -------------------------------------------------------------------------------- 1 | compound_components: 2 | version: 0.3 3 | classes: 4 | - name: smartbuffer_SRAM 5 | attributes: 6 | technology: 45nm 7 | memory_depth: 12 8 | memory_width: 16 9 | n_rdwr_ports: 2 10 | n_banks: 1 11 | n_buffets: 1 12 | subcomponents: 13 | - name: storage 14 | class: SRAM 15 | attributes: 16 | technology: technology 17 | width: memory_width 18 | depth: memory_depth 19 | n_rdwr_ports: n_rdwr_ports 20 | n_banks: n_banks 21 | - name: address_generators[0..1] 22 | class: intadder 23 | attributes: 24 | technology: technology 25 | width: log(memory_depth) 26 | actions: 27 | - name: write 28 | arguments: 29 | data_delta: 0..1 30 | address_delta: 0..n_banks 31 | subcomponents: 32 | - name: storage 33 | actions: 34 | - name: write 35 | arguments: 36 | data_delta: data_delta 37 | address_delta: address_delta 38 | - name: address_generators[0] 39 | actions: 40 | - name: count 41 | - name: address_generators[1] 42 | actions: 43 | - name: idle 44 | - name: read 45 | arguments: 46 | data_delta: 0..1 47 | address_delta: 0..n_banks 48 | subcomponents: 49 | - name: storage 50 | actions: 51 | - name: read 52 | arguments: 53 | data_delta: data_delta 54 | address_delta: address_delta 55 | - name: address_generators[1] 56 | actions: 57 | - name: add 58 | - name: address_generators[0] 59 | actions: 60 | - name: idle 61 | - name: idle 62 | subcomponents: 63 | - name: storage 64 | actions: 65 | - name: idle 66 | - name: address_generators[0..1] 67 | actions: 68 | - name: idle 69 | -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_ws_8x8/arch/simple_weight_stationary.yaml: -------------------------------------------------------------------------------- 1 | architecture: 2 | # ============================================================ 3 | # Architecture Description 4 | # ============================================================ 5 | version: 0.3 6 | subtree: 7 | - name: system 8 | local: 9 | - name: DRAM 10 | class: DRAM 11 | attributes: 12 | type: HBM2 13 | width: 256 14 | block-size: 8 15 | word-bits: 32 16 | subtree: 17 | - name: simple_ws 18 | attributes: 19 | technology: 45nm 20 | local: 21 | - name: accum_spad 22 | class: smartbuffer_SRAM 23 | attributes: 24 | memory_depth: 2048 25 | memory_width: 256 26 | n_banks: 2 27 | block-size: 8 28 | word-bits: 32 29 | read_bandwidth: 8 30 | write_bandwidth: 8 31 | multiple-buffering: 2 32 | - name: spad 33 | class: smartbuffer_SRAM 34 | attributes: 35 | memory_depth: 8192 36 | memory_width: 256 37 | n_banks: 4 38 | block-size: 8 39 | word-bits: 32 40 | read_bandwidth: 8 41 | write_bandwidth: 8 42 | multiple-buffering: 2 43 | subtree: 44 | - name: PE[0..63] 45 | local: 46 | - name: pe_spad 47 | class: smartbuffer_RF 48 | attributes: 49 | # memory_depth: 192 50 | memory_depth: 1 51 | memory_width: 32 52 | block-size: 1 53 | word-bits: 32 54 | meshX: 8 55 | - name: mac 56 | class: intmac 57 | attributes: 58 | datawidth: 32 59 | meshX : 8 60 | # input and output registers for the mac unit 61 | - name: weight_reg 62 | class: reg_storage 63 
| attributes: 64 | depth: 1 65 | width: 32 # width in bits 66 | meshX: 8 67 | - name: input_activation_reg 68 | class: reg_storage 69 | attributes: 70 | depth: 1 71 | width: 32 # width in bits 72 | meshX: 8 73 | - name: output_activation_reg 74 | class: reg_storage 75 | attributes: 76 | depth: 1 77 | width: 32 # width in bits 78 | meshX: 8 79 | -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_ws_8x8/constraints/simple_weight_stationary_arch_constraints.yaml: -------------------------------------------------------------------------------- 1 | # 2 | # The following constraints are limitations of the hardware architecture and dataflow 3 | # 4 | 5 | architecture_constraints: 6 | targets: 7 | # pe spad only stored weights 8 | - target: pe_spad 9 | type: bypass 10 | bypass: [Inputs, Outputs] 11 | keep: [Weights] 12 | # pe spad keeps weights stationary 13 | - target: pe_spad 14 | type: temporal 15 | permutation: PQCRS 16 | # NoC sending C in x direction, M in y direction; parallel-for loops for C and M only 17 | - target: spad 18 | type: spatial 19 | permutation: MCRS 20 | split: 1 21 | factors: P=1 Q=1 22 | # enforce the registers to only store 1 data of the datatype it stores 23 | - target: weight_reg 24 | type: temporal 25 | factors: R=1 S=1 M=1 C=1 26 | - target: weight_reg 27 | type: bypass 28 | keep: [Weights] 29 | bypass: [Inputs, Outputs] 30 | - target: input_activation_reg 31 | type: temporal 32 | factors: P=1 Q=1 C=1 N=1 33 | - target: input_activation_reg 34 | type: bypass 35 | keep: [Inputs] 36 | bypass: [Outputs, Weights] 37 | - target: output_activation_reg 38 | type: temporal 39 | factors: P=1 Q=1 M=1 N=1 40 | - target: output_activation_reg 41 | type: bypass 42 | keep: [Outputs] 43 | bypass: [Inputs, Weights] 44 | - target: spad 45 | type: bypass 46 | keep: [Inputs, Weights] 47 | bypass: [Outputs] 48 | - target: accum_spad 49 | type: bypass 50 | keep: [Outputs] 51 | bypass: [Inputs, Weights] 52 | -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_ws_8x8/constraints/simple_weight_stationary_map_constraints.yaml: -------------------------------------------------------------------------------- 1 | # 2 | # The following constraints are not limitations of the hardware architecture and dataflow, 3 | # but help limit the search space to speed up search 4 | # 5 | 6 | mapspace_constraints: 7 | targets: 8 | # intuitive optimization to not tile R and S at the GLB level 9 | # - target: shared_glb 10 | # type: temporal 11 | # factors: R=1 S=1 12 | # intuitive optimization according to architecture dimensions 13 | - target: spad 14 | type: spatial 15 | factors: M=8 C=8 16 | # intuitive optimization to not tile R and S at the DRAM level 17 | - target: DRAM 18 | type: temporal 19 | factors: R=1 S=1 20 | # optimization to constrain the amplification factor of R and S to only one register 21 | - target: output_activation_reg 22 | type: temporal 23 | factors: R=1 S=1 24 | -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_ws_8x8/example_AlexNet_layer1_outputs/timeloop-mapper.ART.yaml: -------------------------------------------------------------------------------- 1 | ART: 2 | version: 0.3 3 | tables: 4 | - name: system.simple_ws.PE[0..255].mac 5 | area: 1239.5 6 | - name: system.DRAM 7 | area: 1 8 | - name: system.simple_ws.PE[0..255].pe_spad 9 | area: 3634.68 10 | - name: 
system.simple_ws.PE[0..255].weight_reg 11 | area: 5.98 12 | - name: system.simple_ws.PE[0..255].input_activation_reg 13 | area: 5.98 14 | - name: system.simple_ws.PE[0..255].output_activation_reg 15 | area: 5.98 16 | - name: system.simple_ws.shared_glb 17 | area: 1162549.0 18 | -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_ws_8x8/example_AlexNet_layer1_outputs/timeloop-mapper.ART_summary.yaml: -------------------------------------------------------------------------------- 1 | ART_summary: 2 | version: 0.3 3 | table_summary: 4 | - name: system.simple_ws.PE[0..255].mac 5 | area: 1239.5 6 | primitive_estimations: Aladdin_table 7 | - name: system.DRAM 8 | area: 1 9 | primitive_estimations: dummy_table 10 | - name: system.simple_ws.PE[0..255].pe_spad 11 | area: 3634.68 12 | primitive_estimations: 13 | - name: storage 14 | estimator: Aladdin_table 15 | - name: address_generators[0..1] 16 | estimator: Aladdin_table 17 | - name: system.simple_ws.PE[0..255].weight_reg 18 | area: 5.98 19 | primitive_estimations: 20 | - name: storage 21 | estimator: Aladdin_table 22 | - name: system.simple_ws.PE[0..255].input_activation_reg 23 | area: 5.98 24 | primitive_estimations: 25 | - name: storage 26 | estimator: Aladdin_table 27 | - name: system.simple_ws.PE[0..255].output_activation_reg 28 | area: 5.98 29 | primitive_estimations: 30 | - name: storage 31 | estimator: Aladdin_table 32 | - name: system.simple_ws.shared_glb 33 | area: 1162549.0 34 | primitive_estimations: 35 | - name: storage 36 | estimator: Cacti 37 | - name: address_generators[0..1] 38 | estimator: Aladdin_table 39 | -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_ws_8x8/example_AlexNet_layer1_outputs/timeloop-mapper.ERT_summary.yaml: -------------------------------------------------------------------------------- 1 | ERT_summary: 2 | version: 0.3 3 | table_summary: 4 | - name: system.simple_ws.PE[0..255].mac 5 | actions: 6 | - name: mac_random 7 | energy: 2.2 8 | - name: mac_reused 9 | energy: 1.877 10 | - name: mac_gated 11 | energy: 0.103 12 | - name: idle 13 | energy: 0.066 14 | primitive_estimation(s): 15 | - name: system.simple_ws.PE[0..255].mac 16 | estimator: Aladdin_table 17 | - name: system.DRAM 18 | actions: 19 | - name: read 20 | energy: 512 21 | - name: write 22 | energy: 512 23 | - name: idle 24 | energy: 0 25 | primitive_estimation(s): 26 | - name: system.DRAM 27 | estimator: Cacti 28 | - name: system.simple_ws.PE[0..255].pe_spad 29 | actions: 30 | - name: write 31 | average_energy: 0.824 32 | max_energy: 1.586 33 | min_energy: 0.061 34 | - name: read 35 | average_energy: 0.824 36 | max_energy: 1.586 37 | min_energy: 0.061 38 | - name: idle 39 | energy: 0.024 40 | primitive_estimation(s): 41 | - name: storage 42 | estimator: Aladdin_table 43 | - name: address_generators[0] 44 | estimator: Aladdin_table 45 | - name: address_generators[1] 46 | estimator: Aladdin_table 47 | - name: address_generators[0..1] 48 | estimator: Aladdin_table 49 | - name: system.simple_ws.PE[0..255].weight_reg 50 | actions: 51 | - name: access 52 | energy: 0.009 53 | primitive_estimation(s): 54 | - name: storage 55 | estimator: Aladdin_table 56 | - name: system.simple_ws.PE[0..255].input_activation_reg 57 | actions: 58 | - name: access 59 | energy: 0.009 60 | primitive_estimation(s): 61 | - name: storage 62 | estimator: Aladdin_table 63 | - name: system.simple_ws.PE[0..255].output_activation_reg 64 | actions: 65 | - 
name: access 66 | energy: 0.009 67 | primitive_estimation(s): 68 | - name: storage 69 | estimator: Aladdin_table 70 | - name: system.simple_ws.shared_glb 71 | actions: 72 | - name: write 73 | average_energy: 37.635 74 | max_energy: 75.215 75 | min_energy: 0.055 76 | - name: read 77 | average_energy: 37.1 78 | max_energy: 74.144 79 | min_energy: 0.055 80 | - name: idle 81 | energy: 0.018 82 | primitive_estimation(s): 83 | - name: storage 84 | estimator: Cacti 85 | - name: address_generators[0] 86 | estimator: Aladdin_table 87 | - name: address_generators[1] 88 | estimator: Aladdin_table 89 | - name: address_generators[0..1] 90 | estimator: Aladdin_table 91 | -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_ws_8x8/example_AlexNet_layer1_outputs/timeloop-mapper.defined_input_architecture.yaml: -------------------------------------------------------------------------------- 1 | architecture: 2 | version: 0.3 3 | subtree: 4 | - name: system 5 | local: 6 | - name: DRAM 7 | class: DRAM 8 | attributes: 9 | block-size: 4 10 | technology: 65nm 11 | type: LPDDR4 12 | width: 64 13 | word-bits: 16 14 | subtree: 15 | - name: simple_ws 16 | attributes: 17 | technology: 45nm 18 | local: 19 | - name: shared_glb 20 | class: smartbuffer_SRAM 21 | attributes: 22 | block-size: 4 23 | memory_depth: 16384 24 | memory_width: 64 25 | n_banks: 32 26 | n_buffets: 1 27 | n_rdwr_ports: 2 28 | read_bandwidth: 16 29 | technology: 45nm 30 | word-bits: 16 31 | write_bandwidth: 16 32 | subtree: 33 | - name: PE[0..255] 34 | local: 35 | - name: pe_spad 36 | class: smartbuffer_RF 37 | attributes: 38 | block-size: 1 39 | memory_depth: 192 40 | memory_width: 16 41 | meshX: 16 42 | n_banks: 1 43 | n_buffets: 1 44 | n_rdwr_ports: 2 45 | technology: 45nm 46 | word-bits: 16 47 | - name: mac 48 | class: intmac 49 | attributes: 50 | datawidth: 16 51 | latency: 5ns 52 | meshX: 16 53 | num_pipeline_stages: 2 54 | technology: 45nm 55 | - name: weight_reg 56 | class: reg_storage 57 | attributes: 58 | depth: 1 59 | latency: 1ns 60 | meshX: 16 61 | technology: 45nm 62 | width: 16 63 | - name: input_activation_reg 64 | class: reg_storage 65 | attributes: 66 | depth: 1 67 | latency: 1ns 68 | meshX: 16 69 | technology: 45nm 70 | width: 16 71 | - name: output_activation_reg 72 | class: reg_storage 73 | attributes: 74 | depth: 1 75 | latency: 1ns 76 | meshX: 16 77 | technology: 45nm 78 | width: 16 79 | -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_ws_8x8/example_AlexNet_layer1_outputs/timeloop-mapper.flattened_architecture.yaml: -------------------------------------------------------------------------------- 1 | architecture: 2 | version: 0.3 3 | local: 4 | - name: system.simple_ws.PE[0..255].pe_spad 5 | class: smartbuffer_RF 6 | attributes: 7 | block-size: 1 8 | memory_depth: 192 9 | memory_width: 16 10 | meshX: 16 11 | n_banks: 1 12 | n_buffets: 1 13 | n_rdwr_ports: 2 14 | technology: 45nm 15 | word-bits: 16 16 | - name: system.simple_ws.PE[0..255].mac 17 | class: intmac 18 | attributes: 19 | datawidth: 16 20 | latency: 5ns 21 | meshX: 16 22 | num_pipeline_stages: 2 23 | technology: 45nm 24 | - name: system.simple_ws.PE[0..255].weight_reg 25 | class: reg_storage 26 | attributes: 27 | depth: 1 28 | latency: 1ns 29 | meshX: 16 30 | technology: 45nm 31 | width: 16 32 | - name: system.simple_ws.PE[0..255].input_activation_reg 33 | class: reg_storage 34 | attributes: 35 | depth: 1 36 | latency: 1ns 37 | meshX: 16 
38 | technology: 45nm 39 | width: 16 40 | - name: system.simple_ws.PE[0..255].output_activation_reg 41 | class: reg_storage 42 | attributes: 43 | depth: 1 44 | latency: 1ns 45 | meshX: 16 46 | technology: 45nm 47 | width: 16 48 | - name: system.simple_ws.shared_glb 49 | class: smartbuffer_SRAM 50 | attributes: 51 | block-size: 4 52 | memory_depth: 16384 53 | memory_width: 64 54 | n_banks: 32 55 | n_buffets: 1 56 | n_rdwr_ports: 2 57 | read_bandwidth: 16 58 | technology: 45nm 59 | word-bits: 16 60 | write_bandwidth: 16 61 | - name: system.DRAM 62 | class: DRAM 63 | attributes: 64 | block-size: 4 65 | technology: 65nm 66 | type: LPDDR4 67 | width: 64 68 | word-bits: 16 69 | -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_ws_8x8/example_AlexNet_layer1_outputs/timeloop-mapper.map.txt: -------------------------------------------------------------------------------- 1 | 2 | DRAM [ Weights:34848 Inputs:154587 Outputs:290400 ] 3 | --------------------------------------------------- 4 | | for Q in [0:5) 5 | 6 | shared_glb [ Inputs:34731 ] 7 | --------------------------- 8 | | for M in [0:6) 9 | | for Q in [0:11) 10 | | for P in [0:55) 11 | | for M in [0:16) (Spatial-Y) 12 | | for C in [0:3) (Spatial-X) 13 | 14 | pe_spad [ Weights:121 ] 15 | ----------------------- 16 | | for S in [0:11) 17 | | for R in [0:11) 18 | 19 | weight_reg [ Weights:1 ] 20 | ------------------------ 21 | | for Q in [0:1) 22 | 23 | input_activation_reg [ Inputs:1 ] 24 | --------------------------------- 25 | | for Q in [0:1) 26 | 27 | output_activation_reg [ Outputs:1 ] 28 | ----------------------------------- 29 | | for Q in [0:1) 30 | 31 | -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_ws_8x8/mapper/mapper.yaml: -------------------------------------------------------------------------------- 1 | mapper: 2 | optimization-metrics: [ delay, energy ] 3 | live-status: False 4 | num-threads: 8 5 | timeout: 15000 6 | victory-condition: 3000 7 | algorithm: random-pruned 8 | max-permutations-per-if-visit: 16 9 | -------------------------------------------------------------------------------- /configs/timeloop_configs/systolic_ws_8x8/timeloop-mapper.map.txt: -------------------------------------------------------------------------------- 1 | 2 | DRAM [ Weights:147456 (147456) Inputs:1663488 (1663488) Outputs:1605632 (1605632) ] 3 | ----------------------------------------------------------------------------------- 4 | | for Q in [0:2) 5 | | for C in [0:8) 6 | | for P in [0:2) 7 | | for M in [0:2) 8 | 9 | shared_glb [ Inputs:53824 (53824) ] 10 | ----------------------------------- 11 | | for Q in [0:2) 12 | | for M in [0:16) (Spatial-Y) 13 | | for C in [0:16) (Spatial-X) 14 | 15 | pe_spad [ Weights:36 (36) ] 16 | --------------------------- 17 | | for M in [0:4) 18 | | for S in [0:3) 19 | | for R in [0:3) 20 | | for P in [0:28) 21 | 22 | weight_reg [ Weights:1 (1) ] 23 | ---------------------------- 24 | | for P in [0:2) 25 | | for Q in [0:28) 26 | 27 | input_activation_reg [ Inputs:1 (1) ] 28 | ------------------------------------- 29 | | for Q in [0:1) 30 | 31 | output_activation_reg [ Outputs:1 (1) ] 32 | --------------------------------------- 33 | | for Q in [0:1) 34 | 35 | -------------------------------------------------------------------------------- /example/language_models.json: -------------------------------------------------------------------------------- 1 | { 2 | "models": [ 3 | { 4 | 
"name": "opt-125m", 5 | "trace_file" : "input.csv", 6 | "scheduler" : "simple", 7 | "scheduler_config": { 8 | "max_batch_size": 8 9 | } 10 | } 11 | ] 12 | } -------------------------------------------------------------------------------- /example/models_list.json: -------------------------------------------------------------------------------- 1 | { 2 | "models": [ 3 | { 4 | "name": "resnet18", 5 | "batch_size" : 1 6 | } 7 | ] 8 | } 9 | -------------------------------------------------------------------------------- /extern/ramulator_custom/.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | 3 | # Compiled Object files 4 | obj/ 5 | 6 | # Compiled target executable files 7 | -------------------------------------------------------------------------------- /extern/ramulator_custom/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.16) 2 | project(ramulator_project) 3 | 4 | file(GLOB_RECURSE RAMULATOR_SRCS CONFIGURE_DEPENDS src/*.cpp) 5 | add_library(ramulator1 STATIC ${RAMULATOR_SRCS}) 6 | target_include_directories(ramulator1 7 | PUBLIC include 8 | PRIVATE include/ramulator 9 | PRIVATE src 10 | ) 11 | target_compile_options(ramulator1 PRIVATE -Wall -O3) 12 | -------------------------------------------------------------------------------- /extern/ramulator_custom/include/ramulator/Ramulator.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __RAMULATOR_H 2 | #define __RAMULATOR_H 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | namespace ram { 12 | class MemoryBase; 13 | class Request; 14 | class Ramulator { 15 | public: 16 | Ramulator(const std::string ConfigFilePath, uint32_t num_core, bool is_pim = false); 17 | ~Ramulator(); 18 | void tick(); 19 | bool isAvailable(int CtrlID, uint64_t Addr, bool IsWrite) const; 20 | bool isAvailable(uint64_t Addr, bool IsWrite) const; 21 | void push(int CtrlID, uint64_t Addr, bool IsWrite, uint32_t core_id, void* original_req); 22 | void push(uint64_t Addr, bool IsWrite, uint32_t core_id, void* original_req); 23 | bool isEmpty(int CtrlID) const; 24 | const void* top(int CtrlID) const; 25 | void pop(int CtrlID); 26 | int getAtomicBytes() const; 27 | int getNumChannels() const; 28 | int getChannel(uint64_t Addr) const; 29 | void print_stats(); 30 | private: 31 | std::unique_ptr MemBase; 32 | class OutputPendingQueue; 33 | std::vector OutputPendingQueues; 34 | using CallbackMap = 35 | std::unordered_map>; 36 | CallbackMap Callbacks; 37 | robin_hood::unordered_flat_set hot_vids; 38 | bool is_pim; 39 | static std::unique_ptr createMemory(std::string ConfigFilePath, uint32_t num_core); 40 | }; 41 | class Ramulator::OutputPendingQueue { 42 | public: 43 | OutputPendingQueue(int Size); 44 | bool isAvailable() const; 45 | bool isAvailable(uint32_t count) const; 46 | bool isEmpty() const; 47 | void reserve(); 48 | void push(void* original_req); 49 | const void* top() const; 50 | void pop(); 51 | private: 52 | const int Size; 53 | int NumReserved; 54 | std::queue PendingQueue; 55 | }; 56 | } // end namespace 57 | #endif 58 | -------------------------------------------------------------------------------- /extern/ramulator_custom/src/Config.cpp: -------------------------------------------------------------------------------- 1 | #include "Config.h" 2 | 3 | using namespace std; 4 | using namespace ram; 5 | 6 | 
RamulatorConfig::RamulatorConfig(const std::string& fname) { 7 | options["mapping"] = "RoBaRaCoCh"; 8 | options["scheduler"] = "FRFCFS"; 9 | parse(fname); 10 | } 11 | 12 | void RamulatorConfig::parse(const string& fname) 13 | { 14 | ifstream file(fname); 15 | assert(file.good() && "Bad config file"); 16 | string line; 17 | while (getline(file, line)) { 18 | char delim[] = " \t="; 19 | vector tokens; 20 | 21 | while (true) { 22 | size_t start = line.find_first_not_of(delim); 23 | if (start == string::npos) 24 | break; 25 | 26 | size_t end = line.find_first_of(delim, start); 27 | if (end == string::npos) { 28 | tokens.push_back(line.substr(start)); 29 | break; 30 | } 31 | 32 | tokens.push_back(line.substr(start, end - start)); 33 | line = line.substr(end); 34 | } 35 | 36 | // empty line 37 | if (!tokens.size()) 38 | continue; 39 | 40 | // comment line 41 | if (tokens[0][0] == '#') 42 | continue; 43 | 44 | // parameter line 45 | assert(tokens.size() == 2 && "Only allow two tokens in one line"); 46 | 47 | options[tokens[0]] = tokens[1]; 48 | 49 | if (tokens[0] == "channels") { 50 | channels = atoi(tokens[1].c_str()); 51 | } else if (tokens[0] == "ranks") { 52 | ranks = atoi(tokens[1].c_str()); 53 | } else if (tokens[0] == "subarrays") { 54 | subarrays = atoi(tokens[1].c_str()); 55 | } else if (tokens[0] == "cpu_tick") { 56 | cpu_tick = atoi(tokens[1].c_str()); 57 | } else if (tokens[0] == "mem_tick") { 58 | mem_tick = atoi(tokens[1].c_str()); 59 | } else if (tokens[0] == "expected_limit_insts") { 60 | expected_limit_insts = atoi(tokens[1].c_str()); 61 | } else if (tokens[0] == "warmup_insts") { 62 | warmup_insts = atoi(tokens[1].c_str()); 63 | } 64 | } 65 | file.close(); 66 | } 67 | 68 | 69 | -------------------------------------------------------------------------------- /extern/ramulator_custom/src/MemoryFactory.cpp: -------------------------------------------------------------------------------- 1 | #include "MemoryFactory.h" 2 | // #include "LPDDR4.h" 3 | // #include "WideIO.h" 4 | // #include "WideIO2.h" 5 | #include "HBM.h" 6 | //#include "SALP.h" 7 | 8 | using namespace ram; 9 | 10 | namespace ram 11 | { 12 | // 13 | // template <> 14 | // void MemoryFactory::validate(int channels, int ranks, RamulatorConfig& configs) { 15 | // assert(channels >= 2 && "LPDDR4 requires 2, 4, 8 ... 
channels"); 16 | // } 17 | // 18 | // template <> 19 | // void MemoryFactory::validate(int channels, int ranks, RamulatorConfig& configs) { 20 | // assert(channels == 4 && "WideIO comes with 4 channels"); 21 | // } 22 | // 23 | // template <> 24 | // void MemoryFactory::validate(int channels, int ranks, RamulatorConfig& configs) { 25 | // assert((channels == 4 || channels == 8) && "WideIO2 comes with 4 or 8 channels"); 26 | // assert((ranks == 1 || ranks == 2) && "WideIO2 comes with 1 or 2 ranks"); 27 | // } 28 | 29 | template <> 30 | void MemoryFactory::validate(int channels, int ranks, RamulatorConfig& configs) { 31 | assert(channels == 8 && "HBM comes with 8 channels"); 32 | } 33 | 34 | // template <> 35 | // MemoryBase *MemoryFactory::create(RamulatorConfig& configs, int cacheline) { 36 | // int channels = stoi(configs["channels"], NULL, 0); 37 | // int ranks = stoi(configs["ranks"], NULL, 0); 38 | // validate(channels, ranks, configs); 39 | // 40 | // const string& org_name = configs["org"]; 41 | // const string& speed_name = configs["speed"]; 42 | // 43 | // WideIO2 *spec = new WideIO2(org_name, speed_name, channels); 44 | // 45 | // extend_channel_width(spec, cacheline); 46 | // 47 | // return (MemoryBase *)populate_memory(configs, spec, channels, ranks); 48 | // } 49 | // 50 | // 51 | // template <> 52 | // MemoryBase *MemoryFactory::create(RamulatorConfig& configs, int cacheline) { 53 | // int channels = stoi(configs["channels"], NULL, 0); 54 | // int ranks = stoi(configs["ranks"], NULL, 0); 55 | // int subarrays = stoi(configs["subarrays"], NULL, 0); 56 | // validate(channels, ranks, configs); 57 | // 58 | // const string& std_name = configs["standard"]; 59 | // const string& org_name = configs["org"]; 60 | // const string& speed_name = configs["speed"]; 61 | // 62 | // SALP *spec = new SALP(org_name, speed_name, std_name, subarrays); 63 | // 64 | // extend_channel_width(spec, cacheline); 65 | // 66 | // return (MemoryBase *)populate_memory(configs, spec, channels, ranks); 67 | // } 68 | 69 | } 70 | 71 | // This function can be used by autoconf AC_CHECK_LIB since 72 | // apparently it can't detect C++ functions. 
73 | // Basically just an entry in the symbol table 74 | // extern "C" 75 | // { 76 | // void libramulator_is_present(void) 77 | // { 78 | // ; 79 | // } 80 | // } 81 | -------------------------------------------------------------------------------- /extern/ramulator_custom/src/MemoryFactory.h: -------------------------------------------------------------------------------- 1 | #ifndef __MEMORY_FACTORY_H 2 | #define __MEMORY_FACTORY_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "Memory.h" 10 | #include "DRAM.h" 11 | #include "Controller.h" 12 | #include "Config.h" 13 | 14 | using namespace std; 15 | 16 | namespace ram 17 | { 18 | template 19 | class MemoryFactory { 20 | public: 21 | static void extend_channel_width(T* spec, int cacheline) 22 | { 23 | int channel_unit = spec->prefetch_size * spec->channel_width / 8; 24 | int gang_number = cacheline / channel_unit; 25 | 26 | assert(gang_number >= 1 && 27 | "cacheline size must be greater or equal to minimum channel width"); 28 | 29 | assert(cacheline == gang_number * channel_unit && 30 | "cacheline size must be a multiple of minimum channel width"); 31 | 32 | spec->channel_width *= gang_number; 33 | } 34 | 35 | static std::unique_ptr> populate_memory(RamulatorConfig& configs, 36 | T *spec, 37 | int channels, int ranks) { 38 | int& default_ranks = spec->org_entry.count[int(T::Level::Rank)]; 39 | int& default_channels = spec->org_entry.count[int(T::Level::Channel)]; 40 | 41 | if (default_channels == 0) default_channels = channels; 42 | if (default_ranks == 0) default_ranks = ranks; 43 | 44 | vector *> ctrls; 45 | for (int c = 0; c < channels; c++){ 46 | DRAM* channel = new DRAM(spec, T::Level::Channel); 47 | channel->id = c; 48 | channel->regStats(""); 49 | ctrls.push_back(new Controller(configs, channel)); 50 | } 51 | return std::make_unique>(configs, ctrls); 52 | } 53 | 54 | static void validate(int channels, int ranks, RamulatorConfig& configs) { 55 | assert(channels > 0 && ranks > 0); 56 | } 57 | 58 | static std::unique_ptr create(RamulatorConfig& configs, 59 | int cacheline) { 60 | int channels = stoi(configs["channels"], NULL, 0); 61 | int ranks = stoi(configs["ranks"], NULL, 0); 62 | 63 | validate(channels, ranks, configs); 64 | 65 | const string& org_name = configs["org"]; 66 | const string& speed_name = configs["speed"]; 67 | 68 | T *spec = new T(org_name, speed_name); 69 | 70 | // Set channel width statically in the header file 71 | //extend_channel_width(spec, cacheline); 72 | 73 | return populate_memory(configs, spec, channels, ranks); 74 | } 75 | }; 76 | 77 | // template <> 78 | // MemoryBase *MemoryFactory::create(RamulatorConfig& configs, int cacheline); 79 | // template <> 80 | // MemoryBase *MemoryFactory::create(RamulatorConfig& configs, int cacheline); 81 | 82 | } /*namespace ram*/ 83 | 84 | #endif /*__MEMORY_FACTORY_H*/ 85 | -------------------------------------------------------------------------------- /extern/ramulator_custom/src/Request.cpp: -------------------------------------------------------------------------------- 1 | #include "Request.h" 2 | 3 | namespace ram { 4 | 5 | Request::Request() {} 6 | 7 | Request::Request(Type Type, uint64_t Addr, std::vector AddrVec, 8 | function &cb) 9 | : type(Type), 10 | is_first_command(true), 11 | addr(Addr), 12 | addr_vec(AddrVec), 13 | coreid(0), 14 | arrive(0), 15 | depart(0), 16 | callback(cb) {} 17 | 18 | Request::Request(Type Type, uint64_t Addr, std::vector AddrVec, 19 | function &cb, void* original_req) 20 | : type(Type), 21 | 
is_first_command(true), 22 | addr(Addr), 23 | addr_vec(AddrVec), 24 | coreid(0), 25 | arrive(0), 26 | depart(0), 27 | callback(cb), 28 | orignal_request(original_req) {} 29 | 30 | Request::Request(Type Type, uint64_t Addr, std::vector AddrVec, 31 | function &cb, int vid) 32 | : type(Type), 33 | is_first_command(true), 34 | addr(Addr), 35 | addr_vec(AddrVec), 36 | coreid(0), 37 | arrive(0), 38 | depart(0), 39 | vid(vid), 40 | callback(cb) {} 41 | 42 | Request::Request(std::vector addr_vec, Type type, 43 | function cb) 44 | : type(type), 45 | is_first_command(true), 46 | addr(-1), 47 | BaseAddr(-1), 48 | addr_vec(addr_vec), 49 | coreid(0), 50 | arrive(0), 51 | depart(0), 52 | callback(cb) {} 53 | 54 | Request::Request(std::vector addr_vec, Type type, 55 | function cb, void* original_req) 56 | : type(type), 57 | is_first_command(true), 58 | addr(-1), 59 | BaseAddr(-1), 60 | addr_vec(addr_vec), 61 | coreid(0), 62 | arrive(0), 63 | depart(0), 64 | callback(cb), 65 | orignal_request(original_req) {} 66 | 67 | Request::Request(Type Type, uint64_t BaseAddr, uint64_t Addr, 68 | std::vector AddrVec, function &cb) 69 | : type(Type), 70 | is_first_command(true), 71 | addr(Addr), 72 | BaseAddr(BaseAddr), 73 | addr_vec(AddrVec), 74 | coreid(0), 75 | arrive(0), 76 | depart(0), 77 | callback(cb) {} 78 | 79 | bool Request::isRead() const { 80 | return type == Type::READ; 81 | } 82 | bool Request::isWrite() const { 83 | return type == Type::WRITE; 84 | } 85 | int Request::getChannelID() const { 86 | return addr_vec[0]; 87 | } 88 | 89 | } // end namespace 90 | 91 | -------------------------------------------------------------------------------- /extern/ramulator_custom/src/Request.h: -------------------------------------------------------------------------------- 1 | #ifndef __REQUEST_H 2 | #define __REQUEST_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | using namespace std; 9 | 10 | namespace ram { 11 | class Request { 12 | public: 13 | enum class Type { 14 | READ, WRITE, PIM_WRITE, REFRESH, POWERDOWN, SELFREFRESH, EXTENSION, MAX 15 | }; 16 | Type type; 17 | bool is_first_command; 18 | uint64_t addr; 19 | uint64_t BaseAddr; 20 | //int HandlerID; 21 | 22 | vector addr_vec; 23 | // specify which node this request sent from 24 | int coreid; // to remove compile errors 25 | 26 | uint64_t arrive; 27 | uint64_t depart; 28 | 29 | int vid = -1; 30 | void* orignal_request; 31 | function callback; // call back with more info 32 | 33 | bool isRead() const; 34 | bool isWrite() const; 35 | int getChannelID() const; 36 | 37 | // Used to generate refresh request 38 | Request(); 39 | Request(std::vector addr_vec, Type type, function cb); 40 | Request(std::vector addr_vec, Type type, function cb, void* original_req); 41 | Request(Type type, uint64_t Addr, 42 | std::vector AddrVec, function &cb); 43 | Request(Type type, uint64_t Addr, 44 | std::vector AddrVec, function &cb, void* orignal_req); 45 | Request(Type type, uint64_t Addr, 46 | std::vector AddrVec, function &cb, int vid); 47 | Request(Type type, uint64_t BaseAddr, uint64_t Addr, 48 | std::vector AddrVec, function &cb); 49 | }; 50 | 51 | } /*namespace ram*/ 52 | 53 | #endif /*__REQUEST_H*/ 54 | 55 | -------------------------------------------------------------------------------- /img/ONNXim_demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PSAL-POSTECH/ONNXim/44b83bdb7b1987d3a01b867cb3a03326c5644aa2/img/ONNXim_demo.png 
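
The ram::Ramulator class declared in extern/ramulator_custom/include/ramulator/Ramulator.hpp above is the frontend that the simulator's DRAM wrapper (DramRamulator in src/Dram.h) ticks once per memory cycle. The sketch below is a hypothetical standalone driver, not code from this repository: the config path, the plain int payload standing in for the simulator's MemoryAccess pointer, and the busy-wait polling loop are all assumptions made only to show how the push/tick/top/pop calls from that header fit together.

#include <cstdint>
#include <cstdio>
#include "ramulator/Ramulator.hpp"

int main() {
  // Hypothetical config path; assumed to follow the key = value format parsed by Config.cpp.
  ram::Ramulator mem("configs/ramulator_configs/HBM-config.cfg", /*num_core=*/1);

  int payload = 42;               // stand-in for the simulator's MemoryAccess*
  uint64_t addr = 0x1000;
  int ch = mem.getChannel(addr);  // requests and completions are tracked per channel

  if (mem.isAvailable(ch, addr, /*IsWrite=*/false))
    mem.push(ch, addr, /*IsWrite=*/false, /*core_id=*/0, &payload);

  // Advance DRAM time until the read surfaces on its channel's output queue.
  while (mem.isEmpty(ch))
    mem.tick();

  const int* done = static_cast<const int*>(mem.top(ch));  // same pointer handed to push()
  mem.pop(ch);
  std::printf("completed request carries %d\n", *done);
  mem.print_stats();
  return 0;
}

The void* accepted by push() is returned untouched by top(), which is presumably what lets the wrapper map a completed DRAM transaction back to the access object it queued.
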
-------------------------------------------------------------------------------- /img/speedup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PSAL-POSTECH/ONNXim/44b83bdb7b1987d3a01b867cb3a03326c5644aa2/img/speedup.png -------------------------------------------------------------------------------- /models/language_models/llama3-8b.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation_function" : "swish", 3 | "num_attention_heads" : 32, 4 | "num_kv_heads" : 8, 5 | "vocab_size" : 128256, 6 | "num_hidden_layers" : 32, 7 | "hidden_size" : 4096, 8 | "intermediate_size" : 14336, 9 | "ffn_type" : "llama", 10 | "max_seq_length" : 8192, 11 | "run_single_layer": true, 12 | "tensor_parallel_size" : 1, 13 | "pipeline_parallel_size" : 1 14 | } -------------------------------------------------------------------------------- /models/language_models/opt-125m.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation_function" : "relu", 3 | "num_attention_heads" : 12, 4 | "num_kv_heads" : 12, 5 | "vocab_size" : 50272, 6 | "num_hidden_layers" : 1, 7 | "hidden_size" : 768, 8 | "intermediate_size" : 3072, 9 | "ffn_type" : "default", 10 | "max_seq_length" : 2048, 11 | "run_single_layer": true, 12 | "tensor_parallel_size" : 1, 13 | "pipeline_parallel_size" : 1 14 | } -------------------------------------------------------------------------------- /models/language_models/opt-66b.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation_function" : "relu", 3 | "num_attention_heads" : 72, 4 | "num_kv_heads" : 72, 5 | "vocab_size" : 50272, 6 | "num_hidden_layers" : 64, 7 | "hidden_size" : 9216, 8 | "intermediate_size" : 36864, 9 | "ffn_type" : "default", 10 | "max_seq_length" : 2048, 11 | "run_single_layer": true, 12 | "tensor_parallel_size" : 1, 13 | "pipeline_parallel_size" : 1 14 | } -------------------------------------------------------------------------------- /models/resnet18/resnet18.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PSAL-POSTECH/ONNXim/44b83bdb7b1987d3a01b867cb3a03326c5644aa2/models/resnet18/resnet18.onnx -------------------------------------------------------------------------------- /scripts/aggregate_results.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # DATAES=`ls workspace2/` 4 | # for date in $DATAES; do 5 | # for config in `ls workspace2/$date`; do 6 | # MODEL=($(cut -d- -f1 <<< $config)) 7 | # CONFIG=($(cut -d- -f2 <<< $config)) 8 | # grep " finish at" workspace2/${date}/${config}/log.out | while read line; do 9 | # CYCLE=`echo $line | awk '{print $8}'` 10 | # LAYER=`echo $line | awk '{print $5}'` 11 | # echo "$date,$MODEL,$CONFIG,$LAYER,$CYCLE" 12 | # done 13 | 14 | # # grep " Model finish at " workspace/${date}/${config}/log.out | awk '{print $}' 15 | 16 | 17 | # done 18 | # done 19 | 20 | DATAES=`ls workspace/` 21 | for date in $DATAES; do 22 | for model in `ls workspace/$date`; do 23 | for config in `ls workspace/$date/$model`; do 24 | CONFIG=($(cut -d- -f2 <<< $config)) 25 | grep " finish at " workspace/${date}/$model/${config}/log.out | tail -n1 | while read line; do 26 | CYCLE=`echo $line | awk '{print $7}'` 27 | MODEL=`echo $line | awk '{print $5}'` 28 | LAYER=`echo $line | awk '{print $6}'` 29 | echo 
"$date,$model,$config,$CYCLE" 30 | # echo $line 31 | done 32 | done 33 | 34 | # grep " Model finish at " workspace/${date}/${config}/log.out | awk '{print $}' 35 | 36 | 37 | done 38 | done 39 | 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /scripts/generate_cnn_onnx.py: -------------------------------------------------------------------------------- 1 | """ 2 | ONNX File generator 3 | Optimizer onnx graph for inference 4 | """ 5 | 6 | # pylint: disable=unused-argument,missing-docstring,useless-super-delegation 7 | 8 | import onnxruntime as rt 9 | import torch 10 | import torchvision.models as models 11 | import argparse 12 | import pathlib 13 | import os 14 | import json 15 | 16 | size_list = [1, 2, 4, 8, 16, 32] 17 | 18 | HOME = os.getenv("ONNXIM_HOME", default="../") 19 | parser = argparse.ArgumentParser(prog = 'ONNX generator') 20 | parser.add_argument('--model', required=True, help="resnet18, resnet50, alexnet, vgg16, inception") 21 | parser.add_argument('--weight', type=int, default=1, help="export weight, defulat=True") 22 | args = parser.parse_args() 23 | 24 | torchvision_models = { 25 | 'resnet18' : models.resnet18(), 26 | 'resnet50' : models.resnet50(), 27 | 'alexnet' : models.alexnet(), 28 | 'vgg16' : models.vgg16(), 29 | 'squeezenet' : models.squeezenet1_0(), 30 | 'densenet' : models.densenet161(), 31 | 'inception' : models.inception_v3(), 32 | 'googlenet' : models.googlenet(), 33 | 'shufflenet' : models.shufflenet_v2_x1_0(), 34 | 'mobilenet' : models.mobilenet_v2(), 35 | 'resnext50_32x4d' : models.resnext50_32x4d(), 36 | 'wide_resnet50_2' : models.wide_resnet50_2(), 37 | 'mnasnet' : models.mnasnet1_0(), 38 | } 39 | 40 | model = torchvision_models[args.model] 41 | batch_size = 1 42 | if args.model != 'inception': 43 | input = torch.randn(1, 3, 224, 224, requires_grad=True) 44 | input_shape = (3, 224, 224) 45 | else: 46 | input = torch.randn(1, 3, 299, 299, requires_grad=True) 47 | input_shape = (3, 299, 299) 48 | 49 | # Export PyTorch model to onnx 50 | torch.onnx.export( 51 | model, 52 | input, 53 | 'tmp.onnx', 54 | export_params = bool(args.weight), 55 | input_names = ['input'], 56 | output_names = ['output'], 57 | dynamic_axes = { 58 | 'input' : {0 : 'batch_size'}, 59 | 'output' : {0 : 'batch_size'}} 60 | ) 61 | 62 | # Create output folder 63 | pathlib.Path(f'{HOME}/models/{args.model}/').mkdir(parents=True, exist_ok=True) 64 | pathlib.Path(f"{HOME}/model_lists").mkdir(parents=True, exist_ok=True) 65 | 66 | # Optimzied exported onnx file 67 | print(f"Converting ONNX FILE: {args.model}") 68 | opt = rt.SessionOptions() 69 | opt.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL 70 | opt.optimized_model_filepath = f'{HOME}/models/{args.model}/{args.model}.onnx' 71 | sess = rt.InferenceSession('tmp.onnx', sess_options=opt) 72 | 73 | # Generate model_list json file 74 | for size in size_list: 75 | config = { 76 | "models": [ 77 | { 78 | "name": f"{args.model}", 79 | "batch_size": size, 80 | "request_time": 0 81 | } 82 | ] 83 | } 84 | with open(f"{HOME}/model_lists/{args.model}_{size}.json", "w") as json_file: 85 | json.dump(config, json_file, indent=4) 86 | print("DONE") -------------------------------------------------------------------------------- /scripts/generate_conv_onnx.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from pathlib import Path 3 | import json 4 | import os 5 | 6 | size_list = [128]#64, 256, 1024] 7 | dtype = torch.float32 8 | 
C_in = 128 9 | C_out = 128 10 | K_sz = 3 11 | padding = 1 12 | H = 14 * 4 13 | W = 14 * 4 14 | stride=2 15 | HOME = os.getenv("ONNXIM_HOME", default="../") 16 | 17 | size_name = f"{C_in}_{C_out}_{K_sz}_{H}_{W}" 18 | # Test Convolution model 19 | class size_conv(torch.nn.Module): 20 | def __init__(self, C_in, C_out, K_sz, padding=padding): 21 | super().__init__() 22 | self.fc = torch.nn.Conv2d(C_in, C_out, K_sz, stride=stride, padding=padding, bias=False, dtype=dtype) 23 | 24 | def forward(self, x): 25 | return self.fc(x) 26 | 27 | # Create output folder 28 | Path(f"{HOME}/model_lists").mkdir(parents=True, exist_ok=True) 29 | for size in size_list: 30 | 31 | # Export PyTorch model to onnx 32 | Path(f"{HOME}/models/conv_{size_name}").mkdir(parents=True, exist_ok=True) 33 | m = size_conv(C_in, C_out, K_sz, padding) 34 | A = torch.zeros([1,C_in, H, W], dtype=dtype) 35 | onnx_path = Path(f"{HOME}/models/conv_{size_name}/conv_{size_name}.onnx") 36 | torch.onnx.export(m, A, onnx_path, export_params=True, input_names = ['input'], output_names=['output']) 37 | 38 | # Generate model_list json file 39 | config = { 40 | "models": [ 41 | { 42 | "name": f"conv_{size_name}", 43 | "request_time": 0 44 | } 45 | ] 46 | } 47 | with open(f"{HOME}/model_lists/conv_{size_name}.json", "w") as json_file: 48 | json.dump(config, json_file, indent=4) 49 | -------------------------------------------------------------------------------- /scripts/generate_matmul_onnx.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from pathlib import Path 3 | import json 4 | import os 5 | 6 | #size_list = [[512, 768, 2304],[512, 768, 512],[512, 768, 768], [512, 512, 768], [512, 768, 50257]]#32, 64, 128, 256, 512, 1024, 2048] 7 | #size_list = [[512, 512, 1024],[512, 1024, 2],[512, 1024, 512], [512, 1024, 1024], [512, 1024, 3072], [512, 768, 3072], [512, 1024, 4096], [512, 4096, 1024]]#32, 64, 128, 256, 512, 1024, 2048] 8 | size_list = [[1, 1024*8, 1024*8]] #[32,32,32], [64,64,64],[128]*3, [256]*3, [512]*3, [1024]*3, [2048]*3, [4096]*3, [8192]*3] 9 | dtype = torch.float16 10 | 11 | HOME = os.getenv("ONNXIM_HOME", default="../") 12 | 13 | # Test matmul model 14 | class size_matmul(torch.nn.Module): 15 | def __init__(self, size2, size3): 16 | super().__init__() 17 | self.fc = torch.nn.Linear(size2, size3, dtype=dtype, bias=False) #size, size, dtype=dtype, bias=False) 18 | 19 | def forward(self, x): 20 | return self.fc(x) 21 | 22 | # Create output folder 23 | Path(f"{HOME}/model_lists").mkdir(parents=True, exist_ok=True) 24 | for size1, size2, size3 in size_list: 25 | # Export PyTorch model to onnx 26 | Path(f"{HOME}/models/matmul_{size1}_{size2}_{size3}").mkdir(parents=True, exist_ok=True) 27 | m = size_matmul(size2, size3) 28 | A = torch.zeros([size1, size2], dtype=dtype) 29 | onnx_path = Path(f"{HOME}/models/matmul_{size1}_{size2}_{size3}/matmul_{size1}_{size2}_{size3}.onnx") 30 | torch.onnx.export(m, A, onnx_path, export_params=True, input_names = ['input'], output_names=['output']) 31 | 32 | # Generate model_list json file 33 | config = { 34 | "models": [ 35 | { 36 | "name": f"matmul_{size1}_{size2}_{size3}", 37 | "request_time": 0 38 | } 39 | ] 40 | } 41 | with open(f"{HOME}/model_lists/matmul_{size1}_{size2}_{size3}.json", "w") as json_file: 42 | json.dump(config, json_file, indent=4) 43 | -------------------------------------------------------------------------------- /scripts/generate_multi-tenancy_onnx.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | ONNX File generator 3 | Optimizer onnx graph for inference 4 | """ 5 | 6 | # pylint: disable=unused-argument,missing-docstring,useless-super-delegation 7 | 8 | import onnxruntime as rt 9 | import torch 10 | import torchvision.models as models 11 | # import pytorch2timeloop 12 | import argparse 13 | import pathlib 14 | import os 15 | import json 16 | 17 | size_list = [1, 2, 4, 8, 16, 32] 18 | 19 | HOME = os.getenv("ONNXIM_HOME", default="../") 20 | parser = argparse.ArgumentParser(prog = 'ONNX generator') 21 | parser.add_argument('--models') 22 | parser.add_argument('--weight', type=int, default=1) 23 | args = parser.parse_args() 24 | torchvision_models = { 25 | 'resnet18' : models.resnet18(), 26 | 'resnet50' : models.resnet50(), 27 | 'alexnet' : models.alexnet(), 28 | 'vgg16' : models.vgg16(), 29 | 'squeezenet' : models.squeezenet1_0(), 30 | 'densenet' : models.densenet161(), 31 | 'inception' : models.inception_v3(), 32 | 'googlenet' : models.googlenet(), 33 | 'shufflenet' : models.shufflenet_v2_x1_0(), 34 | 'mobilenet' : models.mobilenet_v2(), 35 | 'resnext50_32x4d' : models.resnext50_32x4d(), 36 | 'wide_resnet50_2' : models.wide_resnet50_2(), 37 | 'mnasnet' : models.mnasnet1_0(), 38 | } 39 | 40 | model_list = args.models.split(',') 41 | for model_name in model_list: 42 | model = torchvision_models[model_name] 43 | batch_size = 1 44 | if model_name != 'inception': 45 | input = torch.randn(1, 3, 224, 224, requires_grad=True) 46 | input_shape = (3, 224, 224) 47 | else: 48 | input = torch.randn(1, 3, 299, 299, requires_grad=True) 49 | input_shape = (3, 299, 299) 50 | 51 | top_dir = os.path.join(HOME, "models") 52 | convert_fc = True 53 | exception_module_names = [] 54 | 55 | # pytorch2timeloop.convert_model(model, input_shape, batch_size, args.model, top_dir, convert_fc, exception_module_names) 56 | 57 | torch.onnx.export( 58 | model, 59 | input, 60 | 'tmp.onnx', 61 | export_params = bool(args.weight), 62 | input_names = ['input'], 63 | output_names = ['output'], 64 | dynamic_axes = { 65 | 'input' : {0 : 'batch_size'}, 66 | 'output' : {0 : 'batch_size'}} 67 | ) 68 | 69 | opt = rt.SessionOptions() 70 | # enable level 3 optimizations 71 | print(f"Converting ONNX FILE: {model_name}") 72 | pathlib.Path(f'{HOME}/models/{model_name}/').mkdir(parents=True, exist_ok=True) 73 | opt.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL 74 | opt.optimized_model_filepath = f'{HOME}/models/{model_name}/{model_name}.onnx' 75 | sess = rt.InferenceSession('tmp.onnx', sess_options=opt) 76 | 77 | pathlib.Path(f"{HOME}/model_lists").mkdir(parents=True, exist_ok=True) 78 | config = { 79 | "models": [ 80 | ] 81 | } 82 | for model_name in model_list: 83 | config["models"].append( 84 | { 85 | "name": f"{model_name}", 86 | "batch_size": 1, 87 | } 88 | ) 89 | 90 | file_name = '_'.join(model_list) 91 | 92 | with open(f"{HOME}/model_lists/{file_name}.json", "w") as json_file: 93 | json.dump(config, json_file, indent=4) 94 | print("DONE") -------------------------------------------------------------------------------- /scripts/onnxim_sbatch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -p allcpu 4 | #SBATCH --nodes=1 5 | #SBATCH --nodelist=n10 6 | #SBATCH --ntasks-per-node=1 7 | 8 | ml swap gnu8 gnu13 9 | which gcc 10 | 11 | echo "config: $1 model: $2" 12 | echo "$ONNXIM_HOME/build/bin/Simulator --config $ONNXIM_HOME/configs/$1.json --model 
$ONNXIM_HOME/model_lists/$2.json" 13 | $ONNXIM_HOME/build/bin/Simulator --config $ONNXIM_HOME/configs/$1.json --model $ONNXIM_HOME/model_lists/$2.json 14 | 15 | exit 0 -------------------------------------------------------------------------------- /scripts/run_matmul_conv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #models=("matmul_1024_1024_1024" "matmul_2048_2048_2048" "matmul_4096_4096_4096" "matmul_8192_8192_8192") 3 | #models=("matmul_2048") #"matmul_32" "matmul_64" "matmul_128" "matmul_256" "matmul_512" "matmul_1024" "matmul_2048" "conv_64" "conv_256" "conv_1024") 4 | #models=("matmul_32_32_32" "matmul_64_64_64" "matmul_128_128_128" "matmul_256_256_256" "matmul_512_512_512") 5 | models=("multi_1_0_2_0_100_1121_1000_once" "multi_2_0_2_0_100_8121_1000_once" "multi_8_0_2_0_100_32121_1000_once") 6 | #models=("matmul_512_512_1024" "matmul_512_1024_2" "matmul_512_1024_512" "matmul_512_1024_3072" "matmul_512_1024_4096" "matmul_512_4096_1024") 7 | #models=("conv_64" "conv_256" "conv_1024") 8 | #configs=("systolic_ws_8x8_c1_simple_noc_transformer") # "systolic_ws_8x8_c1_booksim2_transformer" "systolic_ws_8x8_c4_simple_noc_transformer" "systolic_ws_8x8_c4_booksim2_transformer") 9 | #configs=("systolic_ws_8x8_c4_simple_noc_transformer" "systolic_ws_8x8_c4_booksim2_transformer") 10 | configs=("systolic_ws_128x128_c4_simple_noc_tpuv4_partition_quad") #"systolic_ws_128x128_c4_booksim2_tpuv4") 11 | #models=("matmul_4096_4096_4096" "matmul_8192_8192_8192") #("matmul_1024_1024_1024" "matmul_2048_2048_2048" "matmul_4096_4096_4096" "matmul_8192_8192_8192") 12 | i=5 13 | 14 | #python3 $ONNXIM_HOME/scripts/generate_matmul_onnx.py 15 | #python3 $ONNXIM_HOME/scripts/generate_conv_onnx.py 16 | 17 | if [ ! -d "$ONNXIM_HOME/results" ]; then 18 | mkdir $ONNXIM_HOME/results 19 | fi 20 | 21 | for model_file in "${models[@]}"; do 22 | if [ ! -d "$ONNXIM_HOME/results/$model_file" ]; then 23 | mkdir $ONNXIM_HOME/results/$model_file 24 | fi 25 | for config in "${configs[@]}"; do 26 | if [ ! -d "$ONNXIM_HOME/results/$model_file/$config" ]; then 27 | mkdir $ONNXIM_HOME/results/$model_file/$config 28 | fi 29 | total_time=0 30 | for (( j=0; j $ONNXIM_HOME/results/$model_file/$config/result_$j" 32 | $ONNXIM_HOME/build/bin/Simulator --config ./configs/$config.json --model $ONNXIM_HOME/model_lists/$model_file.json > $ONNXIM_HOME/results/$model_file/$config/result_$j & 33 | simulation_time=$(grep "Simulation time:" "$ONNXIM_HOME/results/$model_file/$config/result_$j" | awk '{print $(NF-1)}') 34 | if [[ ! -z "$simulation_time" ]]; then 35 | total_time=$(echo "$total_time + $simulation_time" | bc) 36 | fi 37 | done 38 | mean_time=$(awk "BEGIN {print $total_time / $i}") 39 | echo "Mean Simulation time: $mean_time seconds" 40 | done 41 | done 42 | wait -------------------------------------------------------------------------------- /scripts/run_multi-tenancy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | models=("resnet50_vgg16") 4 | i=1 5 | configs=("systolic_ws_128x128_c4_simple_noc_tpuv4") # "systolic_ws_128x128_c4_booksim2_tpuv4") 6 | 7 | if [ ! -d "$ONNXIM_HOME/results" ]; then 8 | mkdir $ONNXIM_HOME/results 9 | fi 10 | 11 | for model_file in "${models[@]}"; do 12 | if [ ! 
-d "$ONNXIM_HOME/results/$model_file" ]; then 13 | mkdir $ONNXIM_HOME/results/$model_file 14 | fi 15 | if [[ $model_file == "gpt2_g" ]] || [[ $model_file == "gpt2_s" ]]; then 16 | onnx_file="gpt2" 17 | elif [[ $model_file == "bert" ]]; then 18 | onnx_file="$model_file" 19 | else 20 | onnx_file="$model_file" 21 | fi 22 | if [ ! -d "$ONNXIM_HOME/results/$model_file" ]; then 23 | mkdir $ONNXIM_HOME/results/$model_file 24 | fi 25 | for config in "${configs[@]}"; do 26 | if [ ! -d "$ONNXIM_HOME/results/$model_file/$config" ]; then 27 | mkdir $ONNXIM_HOME/results/$model_file/$config 28 | fi 29 | for (( j=0; j&2 24 | exit 1 25 | fi 26 | ;; 27 | -c| --config) 28 | if [ -n "$2" ] && [ ${2:0:1} != "-" ]; then 29 | CONFIG_PATH=$2 30 | shift 2 31 | else 32 | echo "Error: Arcument for $1 is missing" >&2 33 | exit 1 34 | fi 35 | ;; 36 | -t| --two) 37 | MODE=two_model 38 | shift 1 39 | ;; 40 | -h| --help) 41 | echo "Usage: $0 -i [options]" >&2 42 | echo " -m | --model % (set input model for simulation)" >&2 43 | echo " -c | --config % (set configuration for simulation)" >&2 44 | exit 0 45 | ;; 46 | esac 47 | done 48 | 49 | if [ -z "$MODEL_PATH" ] || [ -z "$CONFIG_PATH" ]; then 50 | echo "Error: --model and --config option must be set" >&2 51 | exit 1 52 | fi 53 | CONFIG_PATH=`realpath $CONFIG_PATH` 54 | 55 | MODEL_TMP=$MODEL_PATH 56 | unset MODEL_PATH 57 | unset MODEL_NAME 58 | for model in ${MODEL_TMP//,/$'\n'}; do 59 | model=`realpath $model` 60 | model_name=`basename $model` 61 | model_name=${model_name%.*} 62 | if [[ -n $MODEL_PATH ]]; then 63 | MODEL_PATH="${MODEL_PATH}," 64 | MODEL_NAME="${MODEL_NAME}-" 65 | fi 66 | MODEL_PATH="${MODEL_PATH}${model}" 67 | MODEL_NAME="${MODEL_NAME}${model_name}" 68 | done 69 | 70 | #Make simulation workspace 71 | CURRENTDATE=`date +"%Y-%m-%d"` 72 | CURRENTTIME=`date +"%H-%M"` 73 | CONFIG_NAME=`basename $CONFIG_PATH` 74 | # MODEL_NAME=`basename $MODEL_PATH` 75 | echo $MODEL_PATH 76 | echo ./workspace/$CURRENTDATE/${MODEL_NAME%.*}/${CONFIG_NAME%.*}-$CURRENTTIME 77 | WORKSPACE=./workspace/$CURRENTDATE/${MODEL_NAME%.*}/${CONFIG_NAME%.*}-$CURRENTTIME 78 | mkdir -p $WORKSPACE 79 | run_simulator $WORKSPACE -------------------------------------------------------------------------------- /scripts/run_timeloop.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MODEL=$1 3 | CONFIG=$2 4 | MODEL_PATH=../models/$MODEL 5 | CONFIG_PATH=../configs/timeloop_configs/$CONFIG 6 | 7 | pids="" 8 | for LAYER in `ls $MODEL_PATH/*.yaml`; do 9 | srun -p allcpu ./timeloop_slurm_job.sh $MODEL $CONFIG $LAYER & 10 | pids="$pids $!" 11 | done 12 | 13 | wait $pids 14 | 15 | for MAP_FILE in `ls $MODEL_PATH/*.map`; do 16 | echo $MAP_FILE 17 | MAP_FILE_BASE=`basename $MAP_FILE` 18 | ID="${MAP_FILE_BASE%.*}" 19 | MAPPING=`cat $MAP_FILE` 20 | echo $ID, $MAPPING >> $MODEL_PATH/$MODEL.mapping 21 | rm $MAP_FILE 22 | done 23 | 24 | echo "DONE" 25 | -------------------------------------------------------------------------------- /scripts/run_transformer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | models=("gpt2_s" "gpt2_g" "bert") 4 | batch_list=("1" "2" "4" "8" "16" "32") 5 | 6 | configs=("systolic_ws_128x128_c4_simple_noc_tpuv4" "systolic_ws_128x128_c4_booksim2_tpuv4") 7 | i=1 8 | 9 | #python3.8 $ONNXIM_HOME/scripts/generate_transformer_onnx.py --model gpt2 10 | #python3 $ONNXIM_HOME/scripts/generate_transformer_onnx.py --model bert 11 | 12 | if [ ! 
-d "$ONNXIM_HOME/results" ]; then 13 | mkdir $ONNXIM_HOME/results 14 | fi 15 | 16 | for model_file in "${models[@]}"; do 17 | if [ ! -d "$ONNXIM_HOME/results/$model_file" ]; then 18 | mkdir $ONNXIM_HOME/results/$model_file 19 | fi 20 | if [[ $model_file == "gpt2_g" ]] || [[ $model_file == "gpt2_s" ]]; then 21 | onnx_file="gpt2" 22 | elif [[ $model_file == "bert" ]]; then 23 | onnx_file="$model_file" 24 | fi 25 | for batch in "${batch_list[@]}"; do 26 | if [ ! -d "$ONNXIM_HOME/results/$model_file/$batch" ]; then 27 | mkdir $ONNXIM_HOME/results/$model_file/$batch 28 | fi 29 | for config in "${configs[@]}"; do 30 | if [ ! -d "$ONNXIM_HOME/results/$model_file/$batch/$config" ]; then 31 | mkdir $ONNXIM_HOME/results/$model_file/$batch/$config 32 | fi 33 | total_time=0 34 | for (( j=0; j $ONNXIM_HOME/results/$model_file/$batch/$config/result_$j 2>&1" 36 | $ONNXIM_HOME/build/bin/Simulator --config $ONNXIM_HOME/configs/$config.json --model $ONNXIM_HOME/model_lists/"$model_file"_$batch.json > $ONNXIM_HOME/results/$model_file/$batch/$config/result_$j 2>&1 37 | simulation_time=$(grep "Simulation time:" "$ONNXIM_HOME/results/$model_file/$batch/$config/result_$j" | awk '{print $(NF-1)}') 38 | if [[ ! -z "$simulation_time" ]]; then 39 | total_time=$(echo "$total_time + $simulation_time" | bc) 40 | fi 41 | done 42 | mean_time=$(awk "BEGIN {print $total_time / $i}") 43 | echo "Mean Simulation time: $mean_time seconds" 44 | done 45 | done 46 | done -------------------------------------------------------------------------------- /scripts/timeloop_slurm_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MODEL=$1 3 | CONFIG=$2 4 | LAYER=$3 5 | MODEL_PATH=../models/$MODEL 6 | CONFIG_PATH=../configs/timeloop_configs/$CONFIG 7 | 8 | execute_timeloop() { 9 | echo $LAYER 10 | LAYER_FILE=`basename $LAYER` 11 | ID="${LAYER_FILE%.*}" 12 | TMP_DIR=tmp-$ID 13 | echo $TMP_DIR 14 | mkdir $TMP_DIR 15 | pushd $TMP_DIR 16 | ../timeloop-mapper ../$CONFIG_PATH/arch/*.yaml ../$CONFIG_PATH/arch/components/*.yaml ../$CONFIG_PATH/mapper/mapper.yaml ../$CONFIG_PATH/constraints/*.yaml ../$LAYER > /dev/null 2>/dev/null 17 | mv map.tmp.txt ../$MODEL_PATH/$ID.map 18 | popd 19 | rm -rf $TMP_DIR 20 | } 21 | 22 | execute_timeloop 23 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 3.15) 2 | 3 | # project setting 4 | set(LIB_NAME "Simulator") 5 | 6 | # set source and headers 7 | file(GLOB_RECURSE SRC_FILES 8 | "${CMAKE_SOURCE_DIR}/src/*.h" 9 | "${CMAKE_SOURCE_DIR}/src/*.cc" 10 | ) 11 | 12 | # build 13 | add_executable(${LIB_NAME} ${SRC_FILES}) 14 | add_library(${LIB_NAME}_lib ${SRC_FILES}) 15 | -------------------------------------------------------------------------------- /src/Core.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include 5 | #include 6 | 7 | #include "Dram.h" 8 | #include "SimulationConfig.h" 9 | #include "Sram.h" 10 | #include "Stat.h" 11 | 12 | class Core { 13 | public: 14 | static std::unique_ptr create(uint32_t id, SimulationConfig config); 15 | Core(uint32_t id, SimulationConfig config); 16 | virtual ~Core() = default; 17 | virtual bool running(); 18 | virtual bool can_issue(bool is_accum_tile=false); 19 | virtual void issue(std::unique_ptr tile); 20 | virtual std::unique_ptr pop_finished_tile(); 21 | 22 | virtual void 
cycle(); 23 | 24 | virtual bool has_memory_request(); 25 | virtual void pop_memory_request(); 26 | virtual MemoryAccess* top_memory_request() { return _request_queue.front(); } 27 | virtual void push_memory_response(MemoryAccess* response); 28 | virtual void print_stats(); 29 | virtual void print_current_stats(); 30 | 31 | virtual cycle_type get_compute_cycles() { return _stat_tot_compute_cycle; } 32 | 33 | protected: 34 | virtual bool can_issue_compute(std::unique_ptr& inst); 35 | virtual cycle_type get_inst_compute_cycles(std::unique_ptr& inst) = 0; 36 | virtual void update_stats(); 37 | virtual void finish_compute_pipeline(); 38 | virtual void finish_vector_pipeline(); 39 | virtual void handle_ld_inst_queue(); 40 | virtual void handle_st_inst_queue(); 41 | virtual cycle_type calculate_add_tree_iterations(uint32_t vector_size); 42 | virtual cycle_type calculate_vector_op_iterations(uint32_t vector_size); 43 | 44 | const uint32_t _id; 45 | const SimulationConfig _config; 46 | 47 | cycle_type _core_cycle; 48 | 49 | cycle_type _stat_idle_cycle; 50 | cycle_type _stat_tot_idle_cycle = 0; 51 | 52 | cycle_type _stat_systolic_bubble_cycle = 0; 53 | cycle_type _stat_tot_systolic_bubble_cycle = 0; 54 | 55 | cycle_type _stat_memory_idle_cycle; 56 | cycle_type _stat_tot_memory_idle_cycle = 0; 57 | 58 | cycle_type _stat_compute_cycle = 0; 59 | cycle_type _stat_tot_compute_cycle = 0; 60 | 61 | cycle_type _accum_request_rr_cycle; 62 | cycle_type _max_request_rr_cycle; 63 | cycle_type _min_request_rr_cycle; 64 | 65 | /* Vector Unit Params */ 66 | cycle_type _stat_vec_compute_cycle; 67 | cycle_type _stat_tot_vec_compute_cycle = 0; 68 | 69 | cycle_type _stat_systolic_active_cycle = 0; 70 | cycle_type _stat_tot_systolic_active_cycle = 0; 71 | double _stat_matmul_cycle = 0; 72 | double _stat_tot_matmul_cycle = 0; 73 | 74 | int _running_layer; 75 | uint32_t tile_rr = 0; 76 | std::deque> _tiles; 77 | std::queue> _finished_tiles; 78 | 79 | std::queue> _compute_pipeline; 80 | std::queue> _vector_pipeline; 81 | 82 | std::queue> _ld_inst_queue; 83 | std::queue> _st_inst_queue; 84 | std::queue> _ex_inst_queue; 85 | 86 | std::queue _request_queue; 87 | std::queue _response_queue; 88 | uint32_t _waiting_write_reqs; 89 | 90 | uint32_t _current_layer_id; 91 | uint32_t _current_fused_op_id; 92 | Sram _spad; 93 | Sram _acc_spad; 94 | }; -------------------------------------------------------------------------------- /src/Dram.h: -------------------------------------------------------------------------------- 1 | #ifndef DRAM_H 2 | #define DRAM_H 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "Common.h" 9 | #include "ramulator/Ramulator.hpp" 10 | #include "ramulator2.hh" 11 | 12 | 13 | class Dram { 14 | public: 15 | virtual ~Dram() = default; 16 | virtual bool running() = 0; 17 | virtual void cycle() = 0; 18 | virtual bool is_full(uint32_t cid, MemoryAccess* request) = 0; 19 | virtual void push(uint32_t cid, MemoryAccess* request) = 0; 20 | virtual bool is_empty(uint32_t cid) = 0; 21 | virtual MemoryAccess* top(uint32_t cid) = 0; 22 | virtual void pop(uint32_t cid) = 0; 23 | uint32_t get_channel_id(MemoryAccess* request); 24 | virtual void print_stat() {} 25 | 26 | protected: 27 | SimulationConfig _config; 28 | uint32_t _n_ch; 29 | cycle_type _cycles; 30 | }; 31 | 32 | class SimpleDram : public Dram { 33 | public: 34 | SimpleDram(SimulationConfig config); 35 | virtual bool running() override; 36 | virtual void cycle() override; 37 | virtual bool is_full(uint32_t cid, MemoryAccess* request) override; 
38 | virtual void push(uint32_t cid, MemoryAccess* request) override; 39 | virtual bool is_empty(uint32_t cid) override; 40 | virtual MemoryAccess* top(uint32_t cid) override; 41 | virtual void pop(uint32_t cid) override; 42 | 43 | private: 44 | uint32_t _latency; 45 | double _bandwidth; 46 | 47 | uint64_t _last_finish_cycle; 48 | std::vector>> _waiting_queue; 49 | std::vector> _response_queue; 50 | }; 51 | 52 | class DramRamulator : public Dram { 53 | public: 54 | DramRamulator(SimulationConfig config); 55 | 56 | virtual bool running() override; 57 | virtual void cycle() override; 58 | virtual bool is_full(uint32_t cid, MemoryAccess* request) override; 59 | virtual void push(uint32_t cid, MemoryAccess* request) override; 60 | virtual bool is_empty(uint32_t cid) override; 61 | virtual MemoryAccess* top(uint32_t cid) override; 62 | virtual void pop(uint32_t cid) override; 63 | virtual void print_stat() override; 64 | 65 | private: 66 | std::unique_ptr _mem; 67 | robin_hood::unordered_flat_map _waiting_mem_access; 68 | std::queue _responses; 69 | 70 | std::vector _total_processed_requests; 71 | std::vector _processed_requests; 72 | }; 73 | 74 | class DramRamulator2 : public Dram { 75 | public: 76 | DramRamulator2(SimulationConfig config); 77 | 78 | virtual bool running() override; 79 | virtual void cycle() override; 80 | virtual bool is_full(uint32_t cid, MemoryAccess* request) override; 81 | virtual void push(uint32_t cid, MemoryAccess* request) override; 82 | virtual bool is_empty(uint32_t cid) override; 83 | virtual MemoryAccess* top(uint32_t cid) override; 84 | virtual void pop(uint32_t cid) override; 85 | virtual void print_stat() override; 86 | 87 | private: 88 | std::vector> _mem; 89 | int _tx_ch_log2; 90 | int _tx_log2; 91 | int _req_size; 92 | }; 93 | #endif 94 | -------------------------------------------------------------------------------- /src/Hashing.h: -------------------------------------------------------------------------------- 1 | // author: Mahmoud Khairy, (Purdue Univ) 2 | // email: abdallm@purdue.edu 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #ifndef HASHING_H 9 | #define HASHING_H 10 | 11 | typedef unsigned long long new_addr_type; 12 | 13 | unsigned ipoly_hash_function(new_addr_type higher_bits, unsigned index, 14 | unsigned bank_set_num); 15 | 16 | #endif -------------------------------------------------------------------------------- /src/Instruction.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | enum class Opcode { MOVIN, MOVOUT, GEMM_PRELOAD, GEMM, GEMM_WRITE, COMP, BAR }; 13 | 14 | #define SPAD_BASE 0x10000000 15 | #define ASPAD_BASE 0x20000000 16 | typedef uint64_t addr_type; 17 | typedef uint64_t cycle_type; 18 | 19 | class Instruction { 20 | public: 21 | Instruction(); 22 | std::string toString(); 23 | 24 | private: 25 | enum class Type { 26 | LD_INST, ST_INST, EXE_INST 27 | }; 28 | uint32_t id; 29 | Opcode opcode; 30 | Type type; 31 | size_t tile_size; 32 | cycle_type start_cycle; 33 | cycle_type finish_cycle; 34 | std::vector dependent_ids; 35 | std::string dest_id; 36 | addr_type spad_addr; 37 | uint32_t spad_size; 38 | std::vector dram_addrs; 39 | }; -------------------------------------------------------------------------------- /src/Interconnect.h: -------------------------------------------------------------------------------- 1 | #ifndef INTERCONNECT_H 2 | #define INTERCONNECT_H 3 | 
#include "Common.h" 4 | #include "booksim2/Interconnect.hpp" 5 | #include "helper/HelperFunctions.h" 6 | 7 | class Interconnect { 8 | public: 9 | virtual ~Interconnect() = default; 10 | virtual bool running() = 0; 11 | virtual void cycle() = 0; 12 | virtual void push(uint32_t src, uint32_t dest, MemoryAccess* request) = 0; 13 | virtual bool is_full(uint32_t src, MemoryAccess* request) = 0; 14 | virtual bool is_empty(uint32_t nid) = 0; 15 | virtual MemoryAccess* top(uint32_t nid) = 0; 16 | virtual void pop(uint32_t nid) = 0; 17 | virtual void print_stats() = 0; 18 | 19 | protected: 20 | SimulationConfig _config; 21 | uint32_t _n_nodes; 22 | uint64_t _cycles; 23 | }; 24 | 25 | // Simple without conflict interconnect 26 | class SimpleInterconnect : public Interconnect { 27 | public: 28 | SimpleInterconnect(SimulationConfig config); 29 | virtual bool running() override; 30 | virtual void cycle() override; 31 | virtual void push(uint32_t src, uint32_t dest, 32 | MemoryAccess* request) override; 33 | virtual bool is_full(uint32_t src, MemoryAccess* request) override; 34 | virtual bool is_empty(uint32_t nid) override; 35 | virtual MemoryAccess* top(uint32_t nid) override; 36 | virtual void pop(uint32_t nid) override; 37 | virtual void print_stats() override {} 38 | 39 | private: 40 | uint32_t _latency; 41 | double _bandwidth; 42 | uint32_t _rr_start; 43 | uint32_t _buffer_size; 44 | 45 | struct Entity { 46 | cycle_type finish_cycle; 47 | uint32_t dest; 48 | MemoryAccess* access; 49 | }; 50 | 51 | std::vector> _in_buffers; 52 | std::vector> _out_buffers; 53 | std::vector _busy_node; 54 | }; 55 | 56 | class Booksim2Interconnect : public Interconnect { 57 | public: 58 | Booksim2Interconnect(SimulationConfig config); 59 | virtual bool running() override; 60 | virtual void cycle() override; 61 | virtual void push(uint32_t src, uint32_t dest, 62 | MemoryAccess* request) override; 63 | virtual bool is_full(uint32_t src, MemoryAccess* request) override; 64 | virtual bool is_empty(uint32_t nid) override; 65 | virtual MemoryAccess* top(uint32_t nid) override; 66 | virtual void pop(uint32_t nid) override; 67 | virtual void print_stats() override; 68 | 69 | private: 70 | uint32_t _ctrl_size; 71 | std::string _config_path; 72 | std::unique_ptr _booksim; 73 | 74 | booksim2::Interconnect::Type get_booksim_type(MemoryAccess* access); 75 | uint32_t get_packet_size(MemoryAccess* access); 76 | }; 77 | #endif -------------------------------------------------------------------------------- /src/Model.h: -------------------------------------------------------------------------------- 1 | #ifndef INSTRUCTION_H 2 | #define INSTRUCTION_H 3 | 4 | #include "Common.h" 5 | #include "helper/HelperFunctions.h" 6 | #include "operations/Operation.h" 7 | #include "Tensor.h" 8 | #include "Mapping.h" 9 | class Model { 10 | public: 11 | Model(std::string onnx_path, json model_config, SimulationConfig config, std::string name, MappingTable& map); 12 | Model(json model_config, SimulationConfig config, std::string name); 13 | virtual ~Model() = default; 14 | uint32_t get_id() { return _id; } 15 | json get_model_config() { return _model_config; } 16 | Tensor* get_tensor(uint32_t id); 17 | Tensor* find_tensor(std::string name); 18 | uint32_t get_root_node_id() { return _root_node_id; } 19 | void add_tensor(std::unique_ptr tensor); 20 | void set_layer_finish(uint32_t id); 21 | 22 | std::string get_name() { return _name; } 23 | uint32_t executable_layer_size(); 24 | Operation* get_executable_tile(); 25 | uint64_t get_request_time() const { 
return _request_time; } 26 | void set_request_time(uint64_t request_time) { _request_time=request_time; } 27 | uint64_t get_start_time() const { return _start_time; } 28 | void update_start_time(uint64_t start_time); 29 | bool check_finish(); 30 | uint32_t get_partition_id() { return _partition_id; } 31 | 32 | virtual bool check_language_model() { return false; } 33 | virtual bool check_regressive(); 34 | virtual void prepare_regressive(); 35 | 36 | virtual void initialize_model(std::vector>& weight_table); 37 | virtual void initialize_weight(std::vector>& weight_table); 38 | protected: 39 | 40 | uint32_t _id; 41 | MappingTable _mapping_table; 42 | json _model_config; 43 | std::string _onnx_path; 44 | std::string _name; 45 | uint32_t _root_node_id; 46 | std::map> _operation_map; 47 | std::map> _tensor_map; 48 | std::map _axis_map; 49 | std::vector _executable_layer; 50 | SimulationConfig _config; 51 | uint32_t _partition_id = 0; 52 | uint32_t _target_core = 0; 53 | 54 | /* Number of simulating attention block */ 55 | int nr_skip = 0; // NR_SKIP == 2 * NR_ATTEN 56 | uint64_t _request_time = 0; // pico second 57 | uint64_t _start_time = 0; // pico second 58 | bool _started = false; 59 | bool check_exist_in_exeutable(uint32_t id); 60 | }; 61 | 62 | #endif -------------------------------------------------------------------------------- /src/SimulationConfig.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | using json = nlohmann::json; 7 | 8 | enum class CoreType { SYSTOLIC_OS, SYSTOLIC_WS }; 9 | 10 | enum class DramType { SIMPLE, RAMULATOR1, RAMULATOR2 }; 11 | 12 | enum class IcntType { SIMPLE, BOOKSIM2 }; 13 | 14 | struct CoreConfig { 15 | CoreType core_type; 16 | uint32_t core_width; 17 | uint32_t core_height; 18 | 19 | /* Vector config*/ 20 | uint32_t vector_process_bit; 21 | uint32_t layernorm_latency = 1; 22 | uint32_t softmax_latency = 1; 23 | uint32_t add_latency = 1; 24 | uint32_t mul_latency = 1; 25 | uint32_t mac_latency = 1; 26 | uint32_t div_latency = 1; 27 | uint32_t exp_latency = 1; 28 | uint32_t gelu_latency = 1; 29 | uint32_t add_tree_latency = 1; 30 | uint32_t scalar_sqrt_latency = 1; 31 | uint32_t scalar_add_latency = 1; 32 | uint32_t scalar_mul_latency = 1; 33 | 34 | /* SRAM config */ 35 | uint32_t sram_width; 36 | uint32_t spad_size; 37 | uint32_t accum_spad_size; 38 | }; 39 | 40 | struct SimulationConfig { 41 | /* Core config */ 42 | uint32_t num_cores; 43 | uint32_t core_freq; 44 | uint32_t core_print_interval; 45 | struct CoreConfig *core_config; 46 | 47 | /* DRAM config */ 48 | DramType dram_type; 49 | uint32_t dram_freq; 50 | uint32_t dram_channels; 51 | uint32_t dram_req_size; 52 | uint32_t dram_latency; 53 | uint32_t dram_size; // in GB 54 | uint32_t dram_nbl = 1; // busrt length in clock cycles (bust_length 8 in DDR -> 4 nbl) 55 | uint32_t dram_print_interval; 56 | std::string dram_config_path; 57 | 58 | /* ICNT config */ 59 | IcntType icnt_type; 60 | std::string icnt_config_path; 61 | uint32_t icnt_freq; 62 | uint32_t icnt_latency; 63 | uint32_t icnt_print_interval=0; 64 | 65 | /* Sheduler config */ 66 | std::string scheduler_type; 67 | 68 | /* Other configs */ 69 | uint32_t precision; 70 | uint32_t full_precision = 4; 71 | std::string layout; 72 | 73 | /* 74 | * This map stores the partition information: 75 | * 76 | * Note: Each core belongs to one partition. Through these partition IDs, 77 | * it is possible to assign a specific DNN model to a particular group of cores. 
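*
* Illustrative example (hypothetical values, not taken from any shipped config):
* on a four-core chip split into two partitions,
*     partition 0 -> cores {0, 1}   (could be dedicated to one DNN model)
*     partition 1 -> cores {2, 3}   (could be dedicated to another)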
78 | */ 79 | std::map> partiton_map; 80 | 81 | uint64_t align_address(uint64_t addr) { 82 | return addr - (addr % dram_req_size); 83 | } 84 | 85 | float max_systolic_flops(uint32_t id) { 86 | return core_config[id].core_width * core_config[id].core_height * core_freq * 2 * num_cores / 1000; // GFLOPS 87 | } 88 | 89 | float max_vector_flops(uint32_t id) { 90 | return (core_config[id].vector_process_bit >> 3) / precision * 2 * core_freq / 1000; // GFLOPS 91 | } 92 | 93 | float max_dram_bandwidth() { 94 | return dram_freq * dram_channels * dram_req_size / dram_nbl / 1000; // GB/s 95 | } 96 | 97 | }; -------------------------------------------------------------------------------- /src/Simulator.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "Common.h" 4 | #include "Core.h" 5 | #include "Dram.h" 6 | #include "Interconnect.h" 7 | #include "Model.h" 8 | #include "scheduler/Scheduler.h" 9 | #include "scheduler/LanguageScheduler.h" 10 | #include 11 | 12 | #define CORE_MASK 0x1 << 1 13 | #define DRAM_MASK 0x1 << 2 14 | #define ICNT_MASK 0x1 << 3 15 | 16 | class Simulator { 17 | public: 18 | Simulator(SimulationConfig config, bool language_mode); 19 | void register_model(std::unique_ptr model); 20 | void register_language_model(json info, std::unique_ptr model); 21 | void finish_language_model(uint32_t model_id); 22 | void run_simulator(); 23 | const double get_tile_ops(); 24 | const size_t get_number_tile() { return _tile_timestamp.size(); } 25 | // void run_offline(std::string model_name, uint32_t sample_count); 26 | // void run_multistream(std::string model_name, uint32_t sample_count, 27 | // uint32_t ); void run_server(std::string trace_path); 28 | private: 29 | void cycle(); 30 | bool running(); 31 | void set_cycle_mask(); 32 | void handle_model(); 33 | uint32_t get_dest_node(MemoryAccess* access); 34 | SimulationConfig _config; 35 | uint32_t _n_cores; 36 | uint32_t _n_memories; 37 | uint32_t _memory_req_size; 38 | 39 | // Components 40 | std::vector> _cores; 41 | std::unique_ptr _icnt; 42 | std::unique_ptr _dram; 43 | std::unique_ptr _scheduler; 44 | 45 | // period information (ps) 46 | uint64_t _core_period; 47 | uint64_t _icnt_period; 48 | uint64_t _dram_period; 49 | // 50 | uint64_t _core_time; 51 | uint64_t _icnt_time; 52 | uint64_t _dram_time; 53 | 54 | addr_type _dram_ch_stride_size; 55 | 56 | uint64_t _core_cycles; 57 | 58 | uint32_t _cycle_mask; 59 | bool _single_run; 60 | bool _language_mode; 61 | std::unique_ptr _lang_scheduler; 62 | 63 | // Icnt stat 64 | uint64_t _nr_from_core=0; 65 | uint64_t _nr_to_core=0; 66 | uint64_t _nr_from_mem=0; 67 | uint64_t _nr_to_mem=0; 68 | cycle_type _icnt_cycle=0; 69 | uint64_t _icnt_interval=0; 70 | 71 | struct CompareModel { 72 | bool operator()(const std::unique_ptr& a, const std::unique_ptr& b) const { 73 | return a->get_request_time() > b->get_request_time(); 74 | } 75 | }; 76 | robin_hood::unordered_map>> _weight_table; 78 | std::vector> _models; 79 | robin_hood::unordered_map> _language_models; 80 | std::vector> _tile_timestamp; 81 | 82 | bool check_defined_model(std::string model_name); 83 | }; -------------------------------------------------------------------------------- /src/Sram.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Common.h" 3 | 4 | class Sram { 5 | public: 6 | Sram(SimulationConfig config, const cycle_type& core_cycle, bool accum, uint32_t core_id); 7 | 8 | bool check_hit(addr_type address, 
int buffer_id); 9 | bool check_full(int buffer_id); 10 | bool check_remain(size_t size, int buffer_id); 11 | bool check_allocated(addr_type address, int buffer_id); 12 | 13 | void cycle(); 14 | void flush(int buffer_id); 15 | int prefetch(addr_type address, int buffer_id, size_t allocated_size, size_t count); 16 | void count_up(addr_type, int buffer_id); 17 | void fill(addr_type address, int buffer_id); 18 | int get_size() { return _size; } 19 | int get_current_size(int buffer_id) { return _current_size[buffer_id]; } 20 | void print_all(int buffer_id); 21 | private: 22 | struct SramEntry { 23 | bool valid; 24 | addr_type address; 25 | size_t size; 26 | size_t remain_req_count; 27 | cycle_type timestamp; 28 | }; 29 | 30 | int _size; 31 | int _data_width; 32 | int _current_size[2]; 33 | bool _accum; 34 | 35 | const cycle_type& _core_cycle; 36 | 37 | robin_hood::unordered_map _cache_table[2]; 38 | }; 39 | -------------------------------------------------------------------------------- /src/Stat.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | 7 | typedef struct { 8 | uint64_t start_cycle; 9 | uint64_t cycles; 10 | uint64_t compute_cycles; 11 | uint64_t memory_stall; 12 | uint64_t dependency_stall; 13 | uint64_t sram_reads; 14 | uint64_t sram_writes; 15 | } TileStat; 16 | 17 | typedef struct { 18 | uint64_t op_cycles; 19 | std::vector tile_stats; 20 | } OpStat; 21 | 22 | typedef struct { 23 | uint64_t total_cycles; 24 | std::vector op_stats; 25 | } ModelStat; 26 | -------------------------------------------------------------------------------- /src/SystolicOS.cc: -------------------------------------------------------------------------------- 1 | #include "SystolicOS.h" 2 | 3 | SystolicOS::SystolicOS(uint32_t id, SimulationConfig config) 4 | : Core(id, config) {} 5 | 6 | void SystolicOS::cycle() { 7 | // Todo: Impement this; 8 | assert(0); 9 | } 10 | 11 | cycle_type SystolicOS::get_inst_compute_cycles(std::unique_ptr& inst) { 12 | return _config.core_config[_id].core_height + _config.core_config[_id].core_width - 2 + inst->size; 13 | } -------------------------------------------------------------------------------- /src/SystolicOS.h: -------------------------------------------------------------------------------- 1 | #include "Core.h" 2 | 3 | class SystolicOS : public Core { 4 | public: 5 | SystolicOS(uint32_t id, SimulationConfig config); 6 | virtual void cycle() override; 7 | protected: 8 | virtual cycle_type get_inst_compute_cycles(std::unique_ptr& inst); 9 | }; -------------------------------------------------------------------------------- /src/SystolicWS.h: -------------------------------------------------------------------------------- 1 | #include "Core.h" 2 | 3 | class SystolicWS : public Core { 4 | public: 5 | SystolicWS(uint32_t id, SimulationConfig config); 6 | virtual void cycle() override; 7 | virtual void print_stats() override; 8 | 9 | protected: 10 | virtual bool can_issue_compute(std::unique_ptr& inst) override; 11 | virtual cycle_type get_inst_compute_cycles(std::unique_ptr& inst) override; 12 | uint32_t _stat_systolic_inst_issue_count = 0; 13 | uint32_t _stat_systolic_preload_issue_count = 0; 14 | cycle_type get_vector_compute_cycles(std::unique_ptr& inst); 15 | }; -------------------------------------------------------------------------------- /src/Tensor.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Common.h" 3 | 4 
| class Model; 5 | class Operation; 6 | 7 | class Tensor { 8 | public: 9 | Tensor(uint32_t src_node, onnx::TensorProto &tensor_proto, int precision, bool produced); 10 | Tensor(uint32_t src_node, std::string name, std::vector &dims, 11 | int precision, bool produced); 12 | Tensor(uint32_t src_node, std::string name, int precision); 13 | Tensor(const Tensor &tensor); 14 | 15 | void define_tensor(addr_type address, std::vector &dims); 16 | void redefine_tensor(uint32_t src_node, std::vector &dims); 17 | void resize_tensor(std::vector &dims); 18 | void add_child_node(Operation *op); 19 | 20 | uint32_t get_id() { return _id; } 21 | std::string get_name() { return _name; } 22 | uint32_t get_src_node() { return _src_node; } 23 | std::vector get_dims() { return _dims; } 24 | void set_produced() { _produced = true; } 25 | bool get_produced() { return _produced; } 26 | uint32_t num_child_nodes() { return _child_nodes.size(); } 27 | uint32_t get_child_node(uint32_t id) { return _child_nodes[id]; } 28 | 29 | void allocate_tensor(int precision); 30 | addr_type get_address() { return _address; } 31 | uint64_t get_size() { return _size; } 32 | void print_tensor(); 33 | 34 | private: 35 | bool _temporal; 36 | uint32_t _precision; 37 | bool _produced; 38 | uint32_t _id; 39 | std::string _name; 40 | std::vector _dims; 41 | uint32_t _src_node; 42 | std::vector _child_nodes; 43 | addr_type _address; 44 | uint64_t _size; 45 | friend Model; 46 | }; -------------------------------------------------------------------------------- /src/allocator/AddressAllocator.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "../Common.h" 3 | 4 | class AddressAllocator { 5 | virtual addr_type allocate(std::vector shape, uint32_t data_size) = 0; 6 | }; -------------------------------------------------------------------------------- /src/helper/CommandLineParser.cc: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | This source code is licensed under the MIT license found in the 3 | LICENSE file in the root directory of this source tree. 4 | *******************************************************************************/ 5 | 6 | #include "CommandLineParser.h" 7 | 8 | namespace po = boost::program_options; 9 | 10 | void CommandLineParser::parse(int argc, char** argv) noexcept(false) { 11 | po::store(po::parse_command_line(argc, argv, options_description), 12 | variables_map); 13 | po::notify(variables_map); 14 | } 15 | 16 | void CommandLineParser::print_help_message_if_required() const noexcept { 17 | if (variables_map.count("help") > 0) { 18 | std::cout << options_description << std::endl; 19 | exit(0); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/helper/HelperFunctions.h: -------------------------------------------------------------------------------- 1 | #ifndef HELPER_FUNCTIONS_H 2 | #define HELPER_FUNCTIONS_H 3 | 4 | #define MIN(x, y) (((x) > (y)) ? (y) : (x)) 5 | #define MIN3(x, y, z) MIN(MIN(x, y), z) 6 | #define MAX(x, y) (((x) > (y)) ? 
(x) : (y)) 7 | #define GB * 1024 * 1024 * 1024 8 | #define MHz * 1000 * 1000 9 | 10 | #endif -------------------------------------------------------------------------------- /src/operations/AdaptiveAvgPool.cc: -------------------------------------------------------------------------------- 1 | #include "AdaptiveAvgPool.h" 2 | 3 | #include "../Model.h" 4 | #include "../Tensor.h" 5 | 6 | AdaptiveAvgPool::AdaptiveAvgPool(SimulationConfig config, Model* model, 7 | onnx::NodeProto& node_proto, uint32_t target_core) 8 | : Operation(config, model, node_proto, target_core) { 9 | int kernel_dim = 0; 10 | for (auto attribute : node_proto.attribute()) { 11 | if (attribute.name() == "kernel_shape") { 12 | spdlog::trace(" kernel_shape {}", attribute.ints_size()); 13 | for (int i = 0; i < attribute.ints_size(); i++) { 14 | _kernel_shape.push_back(attribute.ints(i)); 15 | } 16 | kernel_dim = attribute.ints_size(); 17 | } else if (attribute.name() == "strides") { 18 | spdlog::trace("stride_shape {}", attribute.ints_size()); 19 | for (int i = 0; i < attribute.ints_size(); i++) { 20 | _strides.push_back(attribute.ints(i)); 21 | } 22 | } 23 | } 24 | 25 | /* We assume AdaptiveAvgPool2d */ 26 | assert(kernel_dim == 2); 27 | std::vector<uint32_t> input_shape = get_input(0)->get_dims(); 28 | std::vector<uint32_t> output_shape = input_shape; 29 | 30 | /* Assuming input H and W sizes are multiples of output H and W */ 31 | assert(!(input_shape[Hdim] % _kernel_shape[0]) && 32 | !(input_shape[Wdim] % _kernel_shape[1])); 33 | 34 | output_shape[Hdim] = (input_shape[Hdim] - _kernel_shape[0]) / _strides[0] + 1; 35 | output_shape[Wdim] = (input_shape[Wdim] - _kernel_shape[1]) / _strides[1] + 1; 36 | 37 | spdlog::trace("output name : {} {}", node_proto.output(0).c_str(), 38 | output_shape); 39 | 40 | Tensor* predefined_tensor = _model->find_tensor(node_proto.output(0)); 41 | if (predefined_tensor == nullptr) { 42 | std::unique_ptr<Tensor> output_tensor = std::make_unique<Tensor>( 43 | _id, node_proto.output(0), output_shape, _config.precision, false); 44 | _outputs.push_back(output_tensor.get()->get_id()); 45 | _model->add_tensor(std::move(output_tensor)); 46 | } else { 47 | predefined_tensor->redefine_tensor(_id, output_shape); 48 | } 49 | } 50 | 51 | AdaptiveAvgPool::AdaptiveAvgPool(const AdaptiveAvgPool& src) : Operation(src) { 52 | _kernel_shape = src._kernel_shape; 53 | _strides = src._strides; 54 | _skip = src._skip; 55 | } 56 | 57 | void AdaptiveAvgPool::initialize_tiles(MappingTable& mapping_table) { 58 | spdlog::trace("initialize_tile {}", _name); 59 | std::vector<uint32_t> output_shape = get_output(0)->get_dims(); 60 | if (_skip) { 61 | _tiles.push_back(std::make_unique<Tile>(Tile{.status = Tile::Status::INITIALIZED, .skip = true})); 62 | return; 63 | } 64 | 65 | std::unique_ptr<Tile> tile = std::make_unique<Tile>(Tile{ 66 | .status = Tile::Status::INITIALIZED, 67 | .optype = "AdaptiveAvgPool", 68 | .layer_id = _id, 69 | .skip = true}); 70 | _tiles.push_back(std::move(tile)); 71 | initialize_instructions(_tiles.back().get(), Mapping{}); 72 | } 73 | 74 | void AdaptiveAvgPool::initialize_instructions(Tile* tile, Mapping mapping) { 75 | return; 76 | } -------------------------------------------------------------------------------- /src/operations/AdaptiveAvgPool.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "Operation.h" 4 | 5 | class AdaptiveAvgPool : public Operation { 6 | public: 7 | AdaptiveAvgPool(SimulationConfig config, Model* model, 8 | onnx::NodeProto& node_proto, uint32_t target_core=0); 9 |
AdaptiveAvgPool(const AdaptiveAvgPool& src); 10 | 11 | virtual void initialize_tiles(MappingTable& mapping_table) override; 12 | 13 | protected: 14 | virtual void initialize_instructions(Tile* tile, Mapping mapping); 15 | 16 | private: 17 | std::vector _kernel_shape; 18 | std::vector _strides; 19 | bool _skip = false; 20 | }; -------------------------------------------------------------------------------- /src/operations/Attention.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | //#include "../tensor/NPUTensor.h" 3 | #include "Operation.h" 4 | #include "GemmWS.h" 5 | 6 | class Attention : public Operation { 7 | public: 8 | Attention(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 9 | Attention(SimulationConfig config, Model* model, std::string name, std::map& attributes, uint32_t target_core=0); 10 | //std::vector> get_outputs(std::vector> inputs) override; 11 | 12 | uint32_t _batch_size; 13 | /* q,k,v shape : (nh,{1,l},dk) / (nh,{l,l+1},dk) / (nh,{l,l+1},dk) */ 14 | std::vector _query_shape; 15 | std::vector _key_shape; 16 | std::vector _value_shape; 17 | 18 | std::vector _weight_shape; 19 | std::vector _bias_shape; 20 | std::vector _mask_shape; 21 | std::vector _kv_cache_shape; 22 | std::vector _input_shape; 23 | std::vector _output_shape; 24 | std::vector _liner_output_shape; 25 | std::vector _projection_output_shape; 26 | 27 | GemmWS* _projection_node; 28 | uint32_t _seq; 29 | uint32_t _q_len; 30 | uint32_t _dmodel; 31 | uint32_t _nh; 32 | uint32_t _nkvh; 33 | uint32_t _dk; 34 | 35 | uint32_t _key_projection_id; 36 | uint32_t _query_projection_id; 37 | uint32_t _value_projection_id; 38 | /* For kv cache */ 39 | bool onnx = false; 40 | bool has_kv_cache = false; 41 | bool use_fused = true; 42 | bool need_scale = false; 43 | 44 | std::vector _heads_per_tile; 45 | std::vector _tiles_per_head; 46 | std::vector _scale_tiles_per_head; 47 | 48 | void calculate_loops(); 49 | void calculate_loops(Mapping& mapping); 50 | 51 | //void initialize_tiles(); 52 | //void initialize_instructions(Tile &tile, int req_idx, int head_idx, int num_heads); 53 | void initialize_tiles(MappingTable& mapping_table) override; 54 | void initialize_onnx_tiles(MappingTable& mapping_table); 55 | void initialize_non_fused_tiles(MappingTable& mapping_table); 56 | void initialize_instructions(Tile* tile, Mapping mapping, int head_idx, int num_heads); 57 | void initialize_instructions(Tile* tile, int head_idx, int num_heads); 58 | 59 | void initialize_scale_instructions(Tile* tile, Mapping mapping, int head_idx, int num_tiles, int query_idx, int num_queries); 60 | protected: 61 | uint32_t sram_size_needed(); 62 | }; -------------------------------------------------------------------------------- /src/operations/BiasAct.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Operation.h" 3 | 4 | class BiasAct : public Operation { 5 | public: 6 | BiasAct(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 7 | BiasAct(SimulationConfig config, Model* model, std::string name, 8 | std::map& attributes, uint32_t target_core=0); 9 | 10 | void initialize_tiles(MappingTable& mapping_table) override; 11 | 12 | private: 13 | void calculate_loops(); 14 | void initialize_instructions(Tile* tile, Mapping mapping, 15 | uint32_t token_offset, uint32_t tokens); 16 | std::vector _bias_shape; 17 | 18 | std::vector _input_shape; 19 | std::vector 
_output_shape; 20 | 21 | uint32_t _batch_size; 22 | uint32_t _seq; 23 | uint32_t _dk; 24 | uint32_t _tokens_per_tile; 25 | bool _llama_mlp; 26 | bool _use_bias; 27 | Opcode _activation; 28 | }; -------------------------------------------------------------------------------- /src/operations/BiasGelu.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Operation.h" 3 | 4 | class BiasGelu : public Operation { 5 | public: 6 | BiasGelu(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 7 | BiasGelu(SimulationConfig config, Model* model, std::string name, std::map& attributes, uint32_t target_core=0); 8 | 9 | std::vector _bias_shape; 10 | 11 | std::vector _input_shape; 12 | std::vector _output_shape; 13 | 14 | uint32_t _batch_size; 15 | uint32_t _seq; 16 | uint32_t _dk; 17 | uint32_t _tokens_per_tile; 18 | 19 | void calculate_loops(); 20 | void initialize_tiles(MappingTable& mapping_table) override; 21 | void initialize_instructions(Tile* tile, Mapping mapping, uint32_t token_offset, uint32_t tokens); 22 | }; -------------------------------------------------------------------------------- /src/operations/Concat.cc: -------------------------------------------------------------------------------- 1 | /*TODO: implement this */ 2 | #include "Concat.h" 3 | 4 | #include "../Model.h" 5 | #include "../Tensor.h" 6 | 7 | Concat::Concat(SimulationConfig config, Model* model, 8 | onnx::NodeProto& node_proto, uint32_t target_core) 9 | : Operation(config, model, node_proto, target_core) { 10 | for (auto attribute : node_proto.attribute()) { 11 | if (attribute.name() == "axis") { 12 | spdlog::trace("concat axis {}", attribute.ints(0)); 13 | _axis = attribute.ints(0); 14 | } 15 | } 16 | 17 | assert(_axis>=0 && _axis<4); 18 | std::vector output_shape; 19 | std::vector input0_shape = get_input(0)->get_dims(); 20 | std::vector input1_shape = get_input(1)->get_dims(); 21 | output_shape.resize(input0_shape.size()); 22 | for (int i = 0; i < input0_shape.size(); i++) { 23 | if (i == _axis) 24 | continue; 25 | assert(input0_shape[i] == input1_shape[i]); 26 | output_shape[i] = input0_shape[i]; 27 | } 28 | output_shape[_axis] = input0_shape[_axis] + input1_shape[_axis]; 29 | 30 | spdlog::trace("output name : {} {}", node_proto.output(0).c_str(), 31 | output_shape); 32 | Tensor* predefined_tensor = _model->find_tensor(node_proto.output(0)); 33 | if (predefined_tensor == nullptr) { 34 | std::unique_ptr output_tensor = std::make_unique( 35 | _id, node_proto.output(0), output_shape, _config.precision, false); 36 | _outputs.push_back(output_tensor.get()->get_id()); 37 | _model->add_tensor(std::move(output_tensor)); 38 | } else { 39 | predefined_tensor->redefine_tensor(_id, output_shape); 40 | } 41 | } 42 | 43 | Concat::Concat(const Concat& src) : Operation(src) { 44 | _axis = src._axis; 45 | } 46 | 47 | Concat::Concat(SimulationConfig config, Model* model, 48 | std::string name, std::map &attributes, uint32_t target_core) 49 | : Operation(config, model, name, attributes, target_core) { 50 | //TODO:implement this 51 | _axis = std::stoi(get_attribute("axis")); 52 | } 53 | 54 | void Concat::initialize_tiles(MappingTable& mapping_table) { 55 | if(_outputs.size() == 0) { 56 | std::vector output_shape = _model->get_tensor(_inputs[0])->get_dims(); 57 | output_shape[_axis] = 0; 58 | for(uint32_t input : _inputs) { 59 | Tensor* tensor = _model->get_tensor(input); 60 | output_shape[_axis] += tensor->get_dims()[_axis]; 61 | } 62 | 
auto output_tensor = std::make_unique(_id, name_gen(_name, "output"), output_shape, _config.precision, false); 63 | _outputs.push_back(output_tensor->get_id()); 64 | _model->add_tensor(std::move(output_tensor)); 65 | } 66 | spdlog::trace("initialize_tile {} ", _name); 67 | std::unique_ptr tile = std::make_unique(Tile{ 68 | .status = Tile::Status::INITIALIZED, 69 | .optype = "Concat", 70 | .layer_id = _id, 71 | .skip = true 72 | }); 73 | _tiles.push_back(std::move(tile)); 74 | } 75 | 76 | void Concat::initialize_instructions(Tile* tile, Mapping mapping) { 77 | } 78 | 79 | -------------------------------------------------------------------------------- /src/operations/Concat.h: -------------------------------------------------------------------------------- 1 | /*TODO: implement this */ 2 | #pragma once 3 | 4 | #include "Operation.h" 5 | 6 | class Concat : public Operation { 7 | public: 8 | Concat(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 9 | Concat(const Concat& src); 10 | Concat(SimulationConfig config, Model* model, std::string name, 11 | std::map& attributes, uint32_t target_core=0); 12 | virtual void initialize_tiles(MappingTable& mapping_table) override; 13 | virtual void initialize_instructions(Tile* tile, Mapping mapping) override; 14 | protected: 15 | 16 | private: 17 | // std::vector _kernel_shape; 18 | // std::vector _strides; 19 | // std::vector _dilations; 20 | // std::vector _pads; 21 | 22 | uint32_t _axis; 23 | }; -------------------------------------------------------------------------------- /src/operations/Conv.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "Operation.h" 4 | 5 | struct convInfo{ 6 | std::vector kernel_shape; 7 | std::vector strides; 8 | std::vector dilations; 9 | std::vector pads; 10 | std::vector input_shape; 11 | std::vector weight_shape; 12 | std::vector conv_out_shape; 13 | std::vector pool_out_shape; 14 | uint32_t group; 15 | bool activation_fused; 16 | std::string activation_type; 17 | bool bathnorm_fused; 18 | bool skip_connection_fused; 19 | bool pool_fused; 20 | std::string pool_type; 21 | std::vector pool_kernel_shape; 22 | std::vector pool_strides; 23 | std::vector pool_pads; 24 | }; 25 | 26 | class Conv : public Operation { 27 | public: 28 | Conv(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 29 | Conv(const Conv& src); 30 | Conv(SimulationConfig config, MappingTable& mapping_table, convInfo info, uint32_t target_core=0); 31 | // virtual void initialize_tiles(MappingTable& mapping_table) override; 32 | protected: 33 | virtual void im2col_nhwc(); 34 | // void init(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 35 | 36 | protected: 37 | std::vector _kernel_shape; 38 | std::vector _strides; 39 | std::vector _dilations; 40 | std::vector _pads; 41 | std::vector _input_shape; 42 | std::vector _weight_shape; 43 | std::vector _conv_out_shape; 44 | std::vector _pool_out_shape; 45 | uint32_t _group; 46 | bool _activation_fused; 47 | std::string _activation_type; 48 | bool _bathnorm_fused; 49 | bool _skip_connection_fused; 50 | bool _pool_fused; 51 | std::string _pool_type; 52 | std::vector _pool_kernel_shape; 53 | std::vector _pool_strides; 54 | std::vector _pool_pads; 55 | 56 | }; -------------------------------------------------------------------------------- /src/operations/ConvOS.h: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "Conv.h" 4 | 5 | class ConvOS : public Conv { 6 | public: 7 | ConvOS(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 8 | ConvOS(const Conv& src); 9 | 10 | virtual void initialize_tiles(MappingTable& mapping_table) override; 11 | protected: 12 | virtual void initialize_instructions(Tile* tile, Mapping mapping); 13 | void init(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 14 | }; -------------------------------------------------------------------------------- /src/operations/ConvWS.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "Conv.h" 4 | 5 | class ConvWS : public Conv { 6 | public: 7 | ConvWS(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 8 | ConvWS(const Conv& src); 9 | ConvWS(SimulationConfig config, MappingTable& mapping_table, convInfo info); 10 | virtual void initialize_tiles(MappingTable& mapping_table) override; 11 | 12 | protected: 13 | virtual void initialize_instructions(Tile* tile, Mapping mapping); 14 | virtual void initialize_matmul_instructions(Tile* tile); 15 | virtual addr_type make_weight_address(uint32_t S, uint32_t R, uint32_t M, uint32_t C, 16 | std::vector<uint32_t> shape); 17 | virtual addr_type make_activation_address(uint32_t N, uint32_t H, uint32_t W, 18 | uint32_t C, std::vector<uint32_t> shape); 19 | void init(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 20 | Instruction make_weight_instruction(int m_offset, int s_offset, int r_offset, 21 | int c_offset, Mapping mapping); 22 | Instruction make_input_instruction(int m_offset, int s_offset, int r_offset, 23 | int c_offset, Mapping mapping); 24 | }; -------------------------------------------------------------------------------- /src/operations/Dummy.cc: -------------------------------------------------------------------------------- 1 | #include "Dummy.h" 2 | #include "../Model.h" 3 | #include "../Tensor.h" 4 | 5 | Dummy::Dummy(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core) 6 | : Operation(config, model, node_proto, target_core) { 7 | _input_shape = get_input(0)->get_dims(); 8 | _output_shape = _input_shape; 9 | spdlog::trace("output_shape : {}", _output_shape); 10 | spdlog::trace("output name : {}", node_proto.output(0).c_str()); 11 | 12 | for (int i=0;i<node_proto.output().size();i++) { 13 | Tensor* pre_defind_tensor = _model->find_tensor(node_proto.output(i)); 14 | if (pre_defind_tensor == nullptr) { 15 | std::unique_ptr<Tensor> output_tensor = std::make_unique<Tensor>( 16 | _id, node_proto.output(i), _output_shape, _config.precision, false); 17 | _outputs.push_back(output_tensor.get()->get_id()); 18 | _model->add_tensor(std::move(output_tensor)); 19 | } else { 20 | pre_defind_tensor->redefine_tensor(_id, _output_shape); 21 | } 22 | } 23 | } 24 | 25 | void Dummy::initialize_tiles(MappingTable& mapping_table) { 26 | std::unique_ptr<Tile> tile = std::make_unique<Tile>(Tile{ 27 | .status = Tile::Status::INITIALIZED, 28 | .optype="Dummy", 29 | .layer_id=_id, 30 | .skip = true}); 31 | _tiles.push_back(std::move(tile)); 32 | initialize_instructions(_tiles.back().get(), Mapping{}); 33 | } 34 | 35 | void Dummy::initialize_instructions(Tile* tile, Mapping mapping) { 36 | } -------------------------------------------------------------------------------- /src/operations/Dummy.h:
-------------------------------------------------------------------------------- 1 | #pragma once 2 | //#include "../tensor/NPUTensor.h" 3 | #include "Operation.h" 4 | #include 5 | 6 | class Dummy: public Operation { 7 | public: 8 | Dummy(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 9 | 10 | std::vector _input_shape; 11 | std::vector _output_shape; 12 | void initialize_tiles(MappingTable& mapping_table); 13 | void initialize_instructions(Tile* tile, Mapping mapping); 14 | uint64_t _total_loop; 15 | uint32_t _element_in_tile; 16 | }; -------------------------------------------------------------------------------- /src/operations/EmbedLayerNorm.cc: -------------------------------------------------------------------------------- 1 | #include "EmbedLayerNorm.h" 2 | #include "../Model.h" 3 | #include "../Tensor.h" 4 | 5 | EmbedLayerNorm::EmbedLayerNorm(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core) 6 | : Operation(config, model, node_proto, target_core) { 7 | _input_shape = get_input(0)->get_dims(); 8 | _weight_shape = get_input(2)->get_dims(); 9 | 10 | assert(_input_shape.size()==2); 11 | _output_shape.push_back(_input_shape.at(0)); 12 | _output_shape.push_back(_input_shape.at(1)); 13 | _output_shape.push_back(_weight_shape.at(1)); 14 | spdlog::trace("output_shape : {}", _output_shape); 15 | 16 | Tensor* embed_output = _model->find_tensor(node_proto.output(0)); 17 | if (embed_output == nullptr) { 18 | std::unique_ptr output_tensor = std::make_unique( 19 | _id, node_proto.output(0), _output_shape, _config.precision, false); 20 | _outputs.push_back(output_tensor.get()->get_id()); 21 | _model->add_tensor(std::move(output_tensor)); 22 | } else { 23 | embed_output->redefine_tensor(_id, _output_shape); 24 | } 25 | 26 | /* mask */ 27 | Tensor* mask_output = _model->find_tensor(node_proto.output(1)); 28 | if (mask_output == nullptr) { 29 | std::unique_ptr output_tensor = std::make_unique( 30 | _id, node_proto.output(1), _output_shape, _config.precision, false); 31 | _outputs.push_back(output_tensor.get()->get_id()); 32 | _model->add_tensor(std::move(output_tensor)); 33 | } else { 34 | mask_output->redefine_tensor(_id, _output_shape); 35 | } 36 | if (node_proto.output().size()==3) { 37 | Tensor* embed_sum = _model->find_tensor(node_proto.output(2)); 38 | if (embed_sum == nullptr) { 39 | std::unique_ptr output_tensor = std::make_unique( 40 | _id, node_proto.output(2), _output_shape, _config.precision, false); 41 | _outputs.push_back(output_tensor.get()->get_id()); 42 | _model->add_tensor(std::move(output_tensor)); 43 | } else { 44 | embed_sum->redefine_tensor(_id, _output_shape); 45 | } 46 | } 47 | } 48 | 49 | void EmbedLayerNorm::initialize_tiles(MappingTable& mapping_table) { 50 | std::unique_ptr tile = std::make_unique(Tile{ 51 | .status = Tile::Status::INITIALIZED, 52 | .optype="EmbedLayerNorm", 53 | .layer_id=_id, 54 | .skip=true}); 55 | _tiles.push_back(std::move(tile)); 56 | } 57 | 58 | void EmbedLayerNorm::initialize_instructions(Tile* tile, Mapping mapping) { 59 | } -------------------------------------------------------------------------------- /src/operations/EmbedLayerNorm.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | //#include "../tensor/NPUTensor.h" 3 | #include "Operation.h" 4 | 5 | class EmbedLayerNorm: public Operation { 6 | public: 7 | EmbedLayerNorm(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t 
target_core=0); 8 | 9 | std::vector _input_shape; 10 | std::vector _output_shape; 11 | std::vector _weight_shape; 12 | std::vector _position_weight_shape; 13 | std::vector _token_type_weight; 14 | std::vector _ln_weight_shape; 15 | std::vector _ln_bias_shape; 16 | void initialize_tiles(MappingTable& mapping_table); 17 | void initialize_instructions(Tile* tile, Mapping mapping); 18 | protected: 19 | }; -------------------------------------------------------------------------------- /src/operations/Flatten.cc: -------------------------------------------------------------------------------- 1 | /*TODO: implement this */ 2 | #include "Flatten.h" 3 | 4 | #include "../Model.h" 5 | #include "../Tensor.h" 6 | 7 | Flatten::Flatten(SimulationConfig config, Model* model, 8 | onnx::NodeProto& node_proto, uint32_t target_core) 9 | : Operation(config, model, node_proto, target_core) { 10 | for (auto attribute : node_proto.attribute()) { 11 | if (attribute.name() == "axis") { 12 | spdlog::trace("flatten axis {}", attribute.i()); 13 | _axis = attribute.i(); 14 | } 15 | } 16 | 17 | assert(_axis >= 0 && _axis < 4); 18 | std::vector input_shape = get_input(0)->get_dims(); 19 | std::vector output_shape(_axis + 1, 1); 20 | 21 | for (int i = 0; i < input_shape.size(); i++) { 22 | if (i < _axis) { 23 | output_shape[i] = input_shape[i]; 24 | } else { 25 | output_shape[_axis] *= input_shape[i]; 26 | } 27 | } 28 | 29 | spdlog::trace("output name : {} {}", node_proto.output(0).c_str(), output_shape); 30 | 31 | Tensor* predefined_tensor = _model->find_tensor(node_proto.output(0)); 32 | if (predefined_tensor == nullptr) { 33 | std::unique_ptr output_tensor = std::make_unique( 34 | _id, node_proto.output(0), output_shape, _config.precision, false); 35 | _outputs.push_back(output_tensor.get()->get_id()); 36 | _model->add_tensor(std::move(output_tensor)); 37 | } else { 38 | predefined_tensor->redefine_tensor(_id, output_shape); 39 | } 40 | } 41 | 42 | Flatten::Flatten(const Flatten& src) : Operation(src) { _axis = src._axis; } 43 | 44 | void Flatten::initialize_tiles(MappingTable& mapping_table) { 45 | spdlog::trace("initialize_tile {}", _name); 46 | 47 | _tiles.push_back(std::make_unique(Tile{.status = Tile::Status::INITIALIZED, 48 | .optype = "Flatten", 49 | .layer_id = _id, 50 | .skip = true})); 51 | initialize_instructions(_tiles.back().get(), Mapping{}); 52 | } 53 | 54 | void Flatten::initialize_instructions(Tile* tile, Mapping mapping) { 55 | } 56 | -------------------------------------------------------------------------------- /src/operations/Flatten.h: -------------------------------------------------------------------------------- 1 | /*TODO: implement this */ 2 | #pragma once 3 | 4 | #include "Operation.h" 5 | 6 | class Flatten : public Operation { 7 | public: 8 | Flatten(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 9 | Flatten(const Flatten& src); 10 | virtual void initialize_tiles(MappingTable& mapping_table) override; 11 | virtual void initialize_instructions(Tile* tile, Mapping mapping) override; 12 | protected: 13 | 14 | private: 15 | uint32_t _axis; 16 | }; -------------------------------------------------------------------------------- /src/operations/Gemm.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Operation.h" 3 | 4 | class Gemm : public Operation { 5 | public: 6 | Gemm(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 7 | 
Gemm(SimulationConfig config, MappingTable& mapping_table, 8 | std::vector output_shape, std::vector input_shape, 9 | std::vector weight_shape, uint32_t target_core=0); 10 | Gemm(SimulationConfig config, Model* model, std::string name, 11 | std::map& attributes, uint32_t target_core=0); 12 | 13 | protected: 14 | 15 | std::vector _output_shape; 16 | std::vector _input_shape; 17 | std::vector _weight_shape; 18 | int _batch_size; 19 | 20 | private: 21 | uint32_t _alpha; 22 | uint32_t _beta; 23 | bool _transA; 24 | bool _transB; 25 | }; -------------------------------------------------------------------------------- /src/operations/GemmOS.cc: -------------------------------------------------------------------------------- 1 | #include "GemmOS.h" 2 | 3 | #include "../Model.h" 4 | 5 | GemmOS::GemmOS(SimulationConfig config, Model* model, 6 | onnx::NodeProto& node_proto, uint32_t target_core) 7 | : Gemm(config, model, node_proto, target_core) {} 8 | 9 | /* TODO : Implement this */ 10 | void GemmOS::initialize_tiles(MappingTable& mapping_table) { 11 | 12 | } -------------------------------------------------------------------------------- /src/operations/GemmOS.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Gemm.h" 3 | 4 | class GemmOS : public Gemm { 5 | public: 6 | GemmOS(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 7 | void initialize_tiles(MappingTable& mapping_table) override; 8 | private: 9 | }; -------------------------------------------------------------------------------- /src/operations/GemmWS.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Gemm.h" 3 | 4 | class GemmWS : public Gemm { 5 | public: 6 | GemmWS(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 7 | GemmWS(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, bool has_true, uint32_t target_core=0); 8 | GemmWS(SimulationConfig config, MappingTable& mapping_table, 9 | std::vector input_shape, std::vector weight_shape, 10 | std::vector output_shape, uint32_t target_core); 11 | GemmWS(SimulationConfig config, Model* model, std::string name, std::map& attribute, uint32_t target_core); 12 | virtual void initialize_tiles(MappingTable& mapping_table) override; 13 | bool has_bias = true; 14 | protected: 15 | virtual void initialize_instructions(Tile* tile, Mapping mapping); 16 | private: 17 | }; -------------------------------------------------------------------------------- /src/operations/GlobalAvgPool.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Operation.h" 3 | 4 | class GlobalAvgPool : public Operation { 5 | public: 6 | GlobalAvgPool(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 7 | GlobalAvgPool(const GlobalAvgPool& src); 8 | virtual void initialize_tiles(MappingTable& mapping_table) override; 9 | 10 | protected: 11 | virtual void initialize_instructions(Tile* tile, Mapping mapping) override; 12 | private: 13 | std::vector _kernel_shape; 14 | std::vector _strides; 15 | // std::vector _dilations; 16 | // std::vector _pads; 17 | }; -------------------------------------------------------------------------------- /src/operations/KVCacheConcat.h: -------------------------------------------------------------------------------- 1 | #ifndef KV_CACHE_CONCAT_H 2 | #define KV_CACHE_CONCAT_H 
3 | #include "Operation.h" 4 | 5 | class KVCacheConcat : public Operation { 6 | public: 7 | KVCacheConcat(SimulationConfig config, Model* model, 8 | onnx::NodeProto& node_proto, uint32_t target_core=0); 9 | KVCacheConcat(const KVCacheConcat& src); 10 | KVCacheConcat(SimulationConfig config, Model* model, std::string name, 11 | std::map& attributes, uint32_t target_core=0); 12 | void initialize_tiles(MappingTable& mapping_table) override; 13 | private: 14 | void calculate_loops(); 15 | void initialize_instructions(Tile* tile, uint32_t idx); 16 | 17 | uint32_t _num_batches; 18 | std::vector _input_token_lengths; 19 | uint32_t _num_kv_heads; 20 | uint32_t _num_attention_heads; 21 | uint32_t _hidden_size; 22 | uint32_t _cache_dim; 23 | uint32_t _outter_loops; 24 | uint32_t _inner_loops; 25 | }; 26 | 27 | #endif -------------------------------------------------------------------------------- /src/operations/MaxPool.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "Operation.h" 4 | 5 | class MaxPool : public Operation { 6 | public: 7 | MaxPool(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 8 | MaxPool(const MaxPool& src); 9 | virtual void initialize_tiles(MappingTable& mapping_table) override; 10 | virtual void initialize_instructions(Tile* tile, Mapping mapping) override; 11 | 12 | protected: 13 | // virtual void initialize_instructions(SimulationConfig config, Tile& tile) override; 14 | private: 15 | std::vector _kernel_shape; 16 | std::vector _strides; 17 | std::vector _dilations; 18 | std::vector _pads; 19 | }; -------------------------------------------------------------------------------- /src/operations/Operation.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "../Common.h" 4 | #include "../Mapping.h" 5 | #include "../Tensor.h" 6 | 7 | class Model; 8 | class OpParser; 9 | 10 | // Graph Node 11 | class Operation { 12 | public: 13 | Operation(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, 14 | uint32_t id, uint32_t target_core); 15 | Operation(SimulationConfig config, MappingTable& mapping_table, uint32_t target_core); 16 | Operation(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core); 17 | Operation(const Operation& operation); 18 | Operation(SimulationConfig config, Model* model, 19 | std::string name, std::map&attribute, uint32_t target_core); 20 | virtual ~Operation() = default; 21 | virtual void set_finish(); 22 | 23 | virtual std::string get_name() { return _name; } 24 | virtual std::string get_optype() { return _optype; } 25 | virtual uint32_t get_id() { return _id; } 26 | virtual uint32_t num_inputs() { return _inputs.size(); } 27 | virtual Tensor* get_input(int id); 28 | virtual void add_input(int id); 29 | virtual void add_output(int id); 30 | virtual uint32_t num_outputs() { return _outputs.size(); } 31 | virtual Tensor* get_output(int id); 32 | virtual void set_model(Model* model) { _model=model; } 33 | virtual std::vector get_child_nodes(); 34 | virtual std::deque>& get_tiles(); 35 | virtual void clear_tiles(); 36 | virtual void initialize_tiles(MappingTable& mapping_table) = 0; 37 | virtual bool check_executable(); 38 | bool check_finish() { return _finish; }; 39 | uint32_t target_core=0; // Targeted core id 40 | 41 | protected: 42 | virtual void initialize_instructions(Tile* tile, Mapping mapping) {} 43 | addr_type 
make_address(std::vector<uint32_t> index, std::vector<uint32_t> dims); 44 | addr_type get_operand_addr(uint32_t operand_id); 45 | std::string get_attribute(std::string key); 46 | protected: 47 | static const uint32_t _NO_OPERAND = 0; 48 | static const uint32_t _INPUT_OPERAND = 100; 49 | static const uint32_t _OUTPUT_OPERAND = 200; 50 | uint32_t _id; 51 | std::string _name; 52 | std::string _optype; 53 | SimulationConfig _config; 54 | Model* _model; 55 | onnx::NodeProto _proto; 56 | std::vector<uint32_t> _inputs; 57 | std::vector<uint32_t> _outputs; 58 | std::map<std::string, std::string> _attributes; 59 | std::deque<std::unique_ptr<Tile>> _tiles; 60 | std::vector>> _weight_addrs; 61 | std::vector>>> _input_addrs; 62 | std::vector>>> _output_addrs; 63 | 64 | int Ndim; // Batch dimension of activation tensor (commonly 0) 65 | int Hdim; // Height dimension of activation tensor 66 | int Wdim; // Width dimension of activation tensor 67 | int Cdim; // Channel dimension of activation tensor 68 | int Cdim_w; // Channel dimension of weight tensor 69 | int Mdim; // Output channel dimension of weight tensor 70 | int Sdim; // Height dimension of weight tensor 71 | int Rdim; // Width dimension of weight tensor 72 | 73 | bool _finish; 74 | friend Model; 75 | }; -------------------------------------------------------------------------------- /src/operations/OperationFactory.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "../Common.h" 3 | #include "Operation.h" 4 | 5 | class Model; 6 | 7 | class OperationFactory { 8 | public: 9 | static void initialize(SimulationConfig config); 10 | static std::unique_ptr<Operation> create_operation(Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 11 | static std::unique_ptr<Operation> copy_operation(Operation* op); 12 | 13 | private: 14 | static SimulationConfig _config; 15 | }; -------------------------------------------------------------------------------- /src/operations/SkipLayerNorm.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Operation.h" 3 | 4 | class SkipLayerNorm : public Operation { 5 | public: 6 | SkipLayerNorm(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 7 | SkipLayerNorm(SimulationConfig config, Model* model, std::string name, std::map<std::string, std::string>& attributes, uint32_t target_core=0); 8 | 9 | std::vector<uint32_t> _weight_shape; 10 | std::vector<uint32_t> _bias_shape; 11 | std::vector<uint32_t> _dense_bias_shape; 12 | 13 | std::vector<uint32_t> _input_shape; 14 | std::vector<uint32_t> _skip_shape; 15 | std::vector<uint32_t> _output_shape; 16 | 17 | uint32_t _batch_size; 18 | uint32_t _seq; 19 | uint32_t _dk; 20 | uint32_t _tokens_per_tile; 21 | 22 | void calculate_loops(); 23 | void initialize_tiles(MappingTable& mapping_table) override; 24 | void initialize_instructions(Tile* tile, Mapping mapping, uint32_t token_offset, uint32_t tokens); 25 | }; -------------------------------------------------------------------------------- /src/operations/Softmax.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Operation.h" 3 | 4 | class Softmax : public Operation { 5 | public: 6 | Softmax(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0); 7 | Softmax(SimulationConfig config, MappingTable& mapping_table, 8 | std::vector<uint32_t> input_shape, uint32_t target_core=0); 9 | std::vector<uint32_t> _input_shape; 10 | std::vector<uint32_t> _output_shape; 11 | 12 | uint32_t _seq; 13 | uint32_t _dk; 14 | uint32_t _tokens_per_tile; 15 | 16 | void calculate_loops(); 17 | void
initialize_tiles(MappingTable& mapping_table) override; 18 | void initialize_instructions(Tile* tile, Mapping mapping, uint32_t token_offset, uint32_t tokens); 19 | }; -------------------------------------------------------------------------------- /src/scheduler/IterLevelScheduler.cc: -------------------------------------------------------------------------------- 1 | #include "IterLevelScheduler.h" 2 | 3 | IterLevelScheduler::IterLevelScheduler(std::string name, std::string path, 4 | std::unique_ptr model, 5 | SimulationConfig config, 6 | json scheduler_config) 7 | : LangScheduler(name, path, std::move(model), config, scheduler_config) { 8 | } 9 | 10 | void IterLevelScheduler::cycle() { 11 | _cycle++; 12 | if(_active_requests.size() <= _max_batch_size || _max_batch_size == 0) { 13 | while(!_request_queue.empty()) { 14 | if(_request_queue.front()->request_time <= _cycle) { 15 | init_request(_request_queue.front()); 16 | _active_requests[_request_queue.front()->request_id] = std::move(_request_queue.front()); 17 | _request_queue.pop(); 18 | } 19 | else { 20 | break; 21 | } 22 | if(_max_batch_size > 0 && _active_requests.size() >= _max_batch_size) { 23 | break; 24 | } 25 | } 26 | } 27 | 28 | if(_model_queue.empty() && _requests_in_model.empty()) { 29 | init_inputs_and_model(); 30 | } 31 | } 32 | 33 | -------------------------------------------------------------------------------- /src/scheduler/IterLevelScheduler.h: -------------------------------------------------------------------------------- 1 | #ifndef ITER_LEVEL_SCHEDULER_H 2 | #define ITER_LEVEL_SCHEDULER_H 3 | #include "LanguageScheduler.h" 4 | 5 | class IterLevelScheduler : public LangScheduler { 6 | public: 7 | IterLevelScheduler(std::string name, std::string path, 8 | std::unique_ptr model, 9 | SimulationConfig config, 10 | json scheduler_config); 11 | virtual void cycle() override; 12 | }; 13 | 14 | 15 | #endif -------------------------------------------------------------------------------- /src/scheduler/LanguageScheduler.h: -------------------------------------------------------------------------------- 1 | #ifndef LANGUAGE_SCHEDULER_H 2 | #define LANGUAGE_SCHEDULER_H 3 | #include 4 | 5 | #include "../Common.h" 6 | #include "../models/LanguageModel.h" 7 | 8 | struct LangRequest { 9 | uint32_t request_id; 10 | bool running; 11 | bool gen_phase; 12 | uint64_t request_time; 13 | uint64_t start_time; 14 | uint64_t finish_time; 15 | uint32_t prompt_length; 16 | uint32_t current_length; 17 | uint32_t target_length; 18 | std::vector> key_cache; 19 | std::vector> value_cache; 20 | }; 21 | 22 | class LangScheduler { 23 | public: 24 | static std::unique_ptr create(std::string name, std::string path, 25 | std::unique_ptr model, 26 | SimulationConfig config, 27 | json scheduler_config); 28 | LangScheduler(std::string name, std::string path, 29 | std::unique_ptr model, 30 | SimulationConfig config, 31 | json scheduler_config); 32 | bool can_schedule_model(); 33 | virtual std::unique_ptr pop_model(); 34 | virtual void finish_model(uint32_t model_id); 35 | virtual void cycle(); 36 | virtual bool busy(); 37 | virtual uint64_t get_kv_memory_size(); 38 | protected: 39 | SimulationConfig _config; 40 | json _scheduler_config; 41 | std::string _name; 42 | std::unique_ptr _language_model; 43 | std::queue> _request_queue; 44 | std::map> _active_requests; 45 | std::map> _requests_in_model; 46 | std::queue> _model_queue; 47 | uint64_t _cycle; 48 | 49 | uint32_t _num_layers; 50 | uint32_t _num_sim_layers; 51 | uint32_t _num_attention_heads; 52 | 
uint32_t _num_kv_heads; 53 | uint32_t _hidden_size; 54 | uint32_t _cache_dim; 55 | uint32_t _max_seq_length; 56 | uint32_t _max_batch_size; 57 | bool _run_single_layer; 58 | bool _check_mem_size; 59 | 60 | 61 | 62 | std::vector _max_dims; 63 | 64 | void parse_request_trace(std::string trace_path); 65 | void init_request(std::unique_ptr& request); 66 | void init_inputs_and_model(); 67 | }; 68 | 69 | #endif -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | SET(BINARY Simulator_test) 2 | SET(EXECUTABLE_OUTPUT_PATH ${CMAKE_SOURCE_DIR}) 3 | 4 | # 5 | # import googletest as an external project 6 | # 7 | include(ExternalProject) 8 | include(GoogleTest) 9 | externalproject_add( 10 | GoogleTest 11 | URL https://github.com/google/googletest/archive/release-1.8.1.zip 12 | PREFIX ${CMAKE_CURRENT_BINARY_DIR}/lib 13 | CMAKE_ARGS -DCMAKE_CXX_FLAGS:STRING="-D_GLIBCXX_USE_CXX11_ABI=0" 14 | INSTALL_COMMAND "" 15 | ) 16 | 17 | externalproject_get_property(GoogleTest source_dir) 18 | include_directories(${source_dir}/googletest/include) 19 | include_directories(${source_dir}/googlemock/include) 20 | 21 | externalproject_get_property(GoogleTest binary_dir) 22 | set(GTEST_LIBRARY_PATH ${binary_dir}/googlemock/gtest/${CMAKE_FIND_LIBRARY_PREFIXES}gtest.a) 23 | set(GTEST_LIBRARY GTest::GTest) 24 | add_library(${GTEST_LIBRARY} UNKNOWN IMPORTED) 25 | set_target_properties(${GTEST_LIBRARY} PROPERTIES 26 | IMPORTED_LOCATION ${GTEST_LIBRARY_PATH}) 27 | add_dependencies(${GTEST_LIBRARY} GoogleTest) 28 | 29 | set(GMOCK_LIBRARY_PATH ${binary_dir}/googlemock/${CMAKE_FIND_LIBRARY_PREFIXES}gmock.a) 30 | set(GMOCK_LIBRARY GTest::GMock) 31 | add_library(${GMOCK_LIBRARY} UNKNOWN IMPORTED) 32 | set_target_properties(${GMOCK_LIBRARY} PROPERTIES 33 | IMPORTED_LOCATION ${GMOCK_LIBRARY_PATH}) 34 | add_dependencies(${GMOCK_LIBRARY} GoogleTest) 35 | 36 | file(GLOB_RECURSE TEST_SOURCES LIST_DIRECTORIES false *.h *.cc) 37 | SET(SOURCES ${TEST_SOURCES}) 38 | add_executable(${BINARY} ${TEST_SOURCES}) 39 | 40 | target_include_directories(Simulator_test PUBLIC ${ONNX_INCLUDE_DIRS}) 41 | target_include_directories(Simulator_test PUBLIC ${PROJECT_SOURCE_DIR}/src) 42 | target_link_libraries(Simulator_test Simulator_lib) 43 | # target_link_libraries(Simulator_test ramulator booksim2) 44 | # target_link_libraries(Simulator_test nlohmann_json::nlohmann_json ${PROTOBUF_LIB} onnx_proto ${CONAN_LIBS} stdc++fs spdlog::spdlog) 45 | target_link_libraries(Simulator_test GTest::GTest GTest::GMock) 46 | 47 | # gtest_discover_tests( 48 | # Simulator_test 49 | # WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH} 50 | # ) 51 | add_test(NAME Simulator_test COMMAND Simulator_test) 52 | -------------------------------------------------------------------------------- /tests/MappingTest.cc: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | #include "Mapping.h" 3 | 4 | TEST(OSMappingParsingTest, BasicAssertions) { 5 | /* Parse mapping for output stationary accelerator */ 6 | Mapping mapping("T N1 C128 M128 Q28 P28 S3 R3 - O P4 - I S3 R3 C128 P7 M4 Q28Y M32X"); 7 | /*Total loop count check*/ 8 | EXPECT_EQ(mapping.total_loop.N, 1); 9 | EXPECT_EQ(mapping.total_loop.C, 128); 10 | EXPECT_EQ(mapping.total_loop.M, 128); 11 | EXPECT_EQ(mapping.total_loop.Q, 28); 12 | EXPECT_EQ(mapping.total_loop.P, 28); 13 | EXPECT_EQ(mapping.total_loop.S, 3); 14 | EXPECT_EQ(mapping.total_loop.R, 
3); 15 | 16 | /*Spatial parsing check*/ 17 | EXPECT_EQ(mapping.spatial_M, 32); 18 | EXPECT_EQ(mapping.spatial_Q, 28); 19 | EXPECT_EQ(mapping.spatial_P, 1); 20 | EXPECT_EQ(mapping.spatial_C, 1); 21 | EXPECT_EQ(mapping.spatial_R, 1); 22 | EXPECT_EQ(mapping.spatial_S, 1); 23 | 24 | } 25 | 26 | TEST(WSMappingParsingTest, BasicAssertions) { 27 | /* Parse mapping for weight stationary accelerator */ 28 | Mapping mapping("T N1 C64 M256 Q56 P56 S1 R1 - O C8 - I M32 Q28 C8Y M8X P14 Q2 P4"); 29 | /*Total loop count check*/ 30 | EXPECT_EQ(mapping.total_loop.N, 1); 31 | EXPECT_EQ(mapping.total_loop.C, 64); 32 | EXPECT_EQ(mapping.total_loop.M, 256); 33 | EXPECT_EQ(mapping.total_loop.Q, 56); 34 | EXPECT_EQ(mapping.total_loop.P, 56); 35 | EXPECT_EQ(mapping.total_loop.S, 1); 36 | EXPECT_EQ(mapping.total_loop.R, 1); 37 | 38 | /*Spatial parsing check*/ 39 | EXPECT_EQ(mapping.spatial_M, 8); 40 | EXPECT_EQ(mapping.spatial_Q, 1); 41 | EXPECT_EQ(mapping.spatial_P, 1); 42 | EXPECT_EQ(mapping.spatial_C, 8); 43 | EXPECT_EQ(mapping.spatial_R, 1); 44 | EXPECT_EQ(mapping.spatial_S, 1); 45 | 46 | } -------------------------------------------------------------------------------- /tests/SystolicOsTest.cc: -------------------------------------------------------------------------------- 1 | 2 | #include "Common.h" 3 | #include "Core.h" 4 | #include "SimulationConfig.h" 5 | #include "SystolicOS.h" 6 | #include "gtest/gtest.h" 7 | #include "operations/ConvOS.h" 8 | 9 | TEST(SystolicOSTileExecutionTest, BasicAssertions) { 10 | // /* Weight statinary config*/ 11 | // SimulationConfig config; 12 | // config.core_type = CoreType::SYSTOLIC_OS; 13 | // config.core_height = 8; 14 | // config.core_width = 8; 15 | // config.spad_size = 192; 16 | // config.precision = 4; 17 | // config.dram_req_size = 32; 18 | 19 | // SystolicOS core(0, config); 20 | // Tile tile{.status = Tile::Status::INITIALIZED, .layer_id = 0}; 21 | // tile.instructions.push( 22 | // Instruction{.opcode = Opcode::MOVIN, 23 | // .id = "WEIGHT-0", 24 | // .addrs = std::vector{0x00, 0x20}}); 25 | // tile.instructions.push( 26 | // Instruction{.opcode = Opcode::GEMM, 27 | // .tile_size = 100, 28 | // .dependent_ids = std::vector{"WEIGHT-0"}}); 29 | // core.issue(&tile); 30 | // cycle_type cycle = 0; 31 | // while (tile.status != Tile::Status::FINISH) { 32 | // core.cycle(); 33 | // if (core.has_memory_request()) { 34 | // MemoryAccess* access = core.top_memory_request(); 35 | // access->request = false; 36 | // core.pop_memory_request(); 37 | // core.push_memory_response(access); 38 | // } 39 | // cycle++; 40 | // if (cycle > 1000) break; 41 | // } 42 | // /*TODO: insert cycle count from GEMMINI */ 43 | // ASSERT_EQ(cycle, 125); 44 | } -------------------------------------------------------------------------------- /tests/main.cc: -------------------------------------------------------------------------------- 1 | #include "gtest/gtest.h" 2 | 3 | GTEST_API_ int main(int argc, char** argv) { 4 | int status = 0; 5 | ::testing::InitGoogleTest(&argc, argv); 6 | try { 7 | const bool create_default_logger = false; 8 | status = RUN_ALL_TESTS(); 9 | } catch (const std::exception& ex) { 10 | std::cerr << ex.what(); 11 | status = -1; 12 | } 13 | return status; 14 | } -------------------------------------------------------------------------------- /traces/input.csv: -------------------------------------------------------------------------------- 1 | time, prompt_length, target_length, cached_length 2 | 0, 100, 16, 0 3 | 100, 100, 15, 0 4 | 
--------------------------------------------------------------------------------
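
Note: the following is a minimal, self-contained sketch of how one row of traces/input.csv (columns: time, prompt_length, target_length, cached_length) could be turned into a request record such as the LangRequest struct declared in src/scheduler/LanguageScheduler.h. It is illustrative only: the TraceRequest struct below is a simplified stand-in, the column-to-field mapping is an assumption, and this is not the repository's actual parse_request_trace implementation.

#include <cstdint>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Simplified stand-in for LangRequest, holding only the fields the trace supplies.
struct TraceRequest {
  uint32_t request_id;
  uint64_t request_time;   // "time" column: cycle at which the request arrives
  uint32_t prompt_length;  // "prompt_length" column
  uint32_t target_length;  // "target_length" column: number of tokens to generate
  uint32_t cached_length;  // "cached_length" column (assumed: pre-filled KV-cache length)
};

// Parse a CSV trace that starts with a header line; malformed lines are skipped.
std::vector<TraceRequest> parse_trace(const std::string& path) {
  std::vector<TraceRequest> requests;
  std::ifstream file(path);
  std::string line;
  std::getline(file, line);  // skip header: "time, prompt_length, target_length, cached_length"
  uint32_t next_id = 0;
  while (std::getline(file, line)) {
    std::stringstream ss(line);
    TraceRequest req{};
    char comma;
    if (ss >> req.request_time >> comma >> req.prompt_length >> comma
           >> req.target_length >> comma >> req.cached_length) {
      req.request_id = next_id++;
      requests.push_back(req);
    }
  }
  return requests;
}

int main() {
  // With the bundled trace, this prints two requests arriving at cycles 0 and 100.
  for (const auto& req : parse_trace("traces/input.csv")) {
    std::cout << "request " << req.request_id << ": arrives at cycle " << req.request_time
              << ", prompt " << req.prompt_length << ", target " << req.target_length << "\n";
  }
  return 0;
}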