├── .gitignore
├── LICENSE
├── README.md
├── docker
│   ├── Dockerfile
│   └── install.sh
├── layerwise
│   ├── ALL_matmul.sh
│   ├── clean.sh
│   ├── inspect
│   ├── inspect_shape.py
│   ├── layerwise.py
│   ├── layerwise_matmul.py
│   ├── pim_codegen.sh
│   ├── process_csv.py
│   ├── profile.sh
│   ├── run
│   ├── run_matmul
│   ├── sim.sh
│   ├── sim_matmul.sh
│   ├── trace.sh
│   └── trace_matmul.sh
├── marker
│   ├── marker_cuda.cpp
│   ├── marker_cuda_kernel.cu
│   └── setup.py
├── pim
│   ├── Makefile
│   ├── main.cc
│   ├── pim_trace.cc
│   ├── pim_trace.h
│   ├── transform.py
│   └── util.py
├── pimflow
├── pipeline
│   ├── clean.sh
│   ├── extract_layers.py
│   ├── inspect
│   ├── layerwise.py
│   ├── pim_codegen.sh
│   ├── profile.sh
│   ├── run
│   ├── sim.sh
│   ├── to_full_layer.py
│   └── trace.sh
├── run.example.sh
├── run_matmul.example.sh
├── setup.py
├── setup.sh
└── solve
    ├── solve.py
    ├── solve.sh
    ├── stat.py
    └── stat.sh

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | pim.egg-info/
2 | *.pim
3 | __pycache__/
4 | pim_codegen
5 | compile-*.so
6 | tmp-*/
7 | *.onnx
8 | *.csv
9 | traces-*/
10 | trace-*/
11 | gpgpu_inst_stats.txt
12 | traces-*.txt
13 | data/mobilenet-v2/
14 | tmp/
15 | compile.so
16 | .pkl_memoize_py3
17 | compile-*.so
18 | *.pkl
19 | *.stats
20 | *.log
21 | *.err
22 | *_node_map.txt
23 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 | 
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 | 1. Definitions.
8 | 
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 | 
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 | 
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 | 
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 | 
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 | 
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 | 
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 | 
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PIMFlow 2 | 3 | ## File Storage Update 4 | Initially, the files were uploaded to GitHub using [Git Large File Storage (LFS)](https://git-lfs.github.com/), but due to potential recurrent billing issues, they have been removed. 5 | 6 | You can now find the files archived as `data.zip` on [Zenodo](https://zenodo.org/doi/10.5281/zenodo.7376801). 
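For example, a minimal download-and-unpack sketch (the DOI above is a concept DOI that redirects to the latest record, so the record URL below is an assumption; follow the link and copy the actual file URL if it differs):

```bash
# Fetch data.zip from the Zenodo record resolved by the DOI above and unpack
# it into the PIMFlow checkout; the record ID and target path are assumptions
# based on the setup script below, which expects the archives under ./data/.
wget -O data.zip "https://zenodo.org/records/7376801/files/data.zip?download=1"
unzip data.zip -d "$HOME/PIMFlow"
```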
Hosting on Zenodo ensures the data remains accessible without incurring additional costs.
7 | 
8 | 
9 | ## Pre-requisites
10 | ### Hardware dependencies
11 | We've tested the code on an NVIDIA GeForce RTX 2080 Ti GPU on an Ubuntu 20.04 amd64 system. The GPU should have the Turing architecture.
12 | 
13 | Note: GPU architectures newer than Turing may hit a tracing bug (an NVBit bug).
14 | 
15 | ### System dependencies
16 | We tested our code on an Ubuntu 20.04 amd64 system with CUDA 11.3.1 and cuDNN 8.
17 | 
18 | ### Software dependencies
19 | The software prerequisites for building the following repositories from source must be satisfied:
20 | - [TVM](https://github.com/apache/tvm)
21 | - [GPGPU-Sim](https://github.com/gpgpu-sim/gpgpu-sim_distribution)
22 | - [Accel-Sim](https://github.com/accel-sim/accel-sim-framework)
23 | - [Ramulator](https://github.com/CMU-SAFARI/ramulator)
24 | 
25 | You can install all dependencies by following this document.
26 | 
27 | First, make sure CUDA is installed on your system:
28 | ```bash
29 | export CUDA_INSTALL_PATH=/usr/local/cuda # set it to your CUDA installation path
30 | nvcc --version
31 | ```
32 | 
33 | On an Ubuntu 20.04 amd64 system, the following commands install the package dependencies:
34 | ```bash
35 | sudo apt-get update
36 | sudo apt-get install -y --no-install-recommends python3-dev ca-certificates g++ python3-numpy gcc make git python3-setuptools python3-wheel python3-pip aria2 wget build-essential xutils-dev bison zlib1g-dev flex libglu1-mesa-dev git libssl-dev libxml2-dev libboost-all-dev vim python-setuptools python-dev ninja-build bc git-lfs libtinfo-dev htop libedit-dev
37 | ```
38 | 
39 | Next, install the Python (>= 3.8) dependencies.
40 | 
41 | Note: you need to use the specific PyTorch version 1.11.0. Later versions may generate different node names that the current version cannot process.
42 | 
43 | ```bash
44 | python3 -m pip install -U --force-reinstall pip
45 | pip install torch==1.11.0+cu113 \
46 | torchvision==0.12.0+cu113 \
47 | torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cu113
48 | pip3 install pyyaml==5.1 onnx plotly psutil pandas decorator attrs scipy
49 | ```
50 | 
51 | Install CMake (>= 3.21):
52 | ```bash
53 | sudo aria2c -q -d /tmp -o cmake-3.21.0-linux-x86_64.tar.gz \
54 | https://github.com/Kitware/CMake/releases/download/v3.21.0/cmake-3.21.0-linux-x86_64.tar.gz
55 | sudo tar -zxf /tmp/cmake-3.21.0-linux-x86_64.tar.gz --strip=1 -C /usr
56 | ```
57 | 
58 | Install Clang and LLVM (>= 12):
59 | ```bash
60 | wget -c https://github.com/llvm/llvm-project/releases/download/llvmorg-13.0.0/clang+llvm-13.0.0-x86_64-linux-gnu-ubuntu-20.04.tar.xz
61 | tar -xvf clang+llvm-13.0.0-x86_64-linux-gnu-ubuntu-20.04.tar.xz
62 | sudo cp -rl clang+llvm-13.0.0-x86_64-linux-gnu-ubuntu-20.04/* /usr/local
63 | rm -rf clang+llvm-13.0.0-x86_64-linux-gnu-ubuntu-20.04 \
64 | clang+llvm-13.0.0-x86_64-linux-gnu-ubuntu-20.04.tar.xz
65 | ```
66 | 
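Before moving on, it can save time to sanity-check the toolchain; a quick verification sketch (the expected versions follow from the steps above):

```bash
# All of these should succeed and report the versions installed above.
cmake --version   # expect >= 3.21
clang --version   # expect 13.0.0 if you followed the steps above
python3 -c "import torch; print(torch.__version__)"          # expect 1.11.0+cu113
python3 -c "import torch; print(torch.cuda.is_available())"  # expect True
```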
67 | ## Setup
68 | 
69 | Install and build the PIMFlow repositories from source. We prepared an installation script (docker/install.sh):
70 | ```bash
71 | GIT_BRANCH="2023cgo-artifact"
72 | cd "$HOME"
73 | git clone -b "$GIT_BRANCH" https://github.com/yongwonshin/PIMFlow_tvm.git
74 | TVM_DIR="$HOME/PIMFlow_tvm"
75 | cd "$TVM_DIR"
76 | git submodule init && git submodule update
77 | BUILD_DIR="$TVM_DIR/build"
78 | mkdir -p "$BUILD_DIR" && cd "$BUILD_DIR"
79 | cp "$TVM_DIR/cmake/config.cmake" "$BUILD_DIR"
80 | cmake .. -G Ninja -DCMAKE_CXX_COMPILER=$(which g++) -DCMAKE_C_COMPILER=$(which gcc)
81 | ninja
82 | cd "$HOME"
83 | git clone -b "$GIT_BRANCH" https://github.com/yongwonshin/PIMFlow_accel-sim-framework.git
84 | GPU_DIR="$HOME/PIMFlow_accel-sim-framework/gpu-simulator"
85 | NVBIT_DIR="$HOME/PIMFlow_accel-sim-framework/util/tracer_nvbit"
86 | cd "$GPU_DIR"
87 | source setup_environment.sh
88 | # Generate binary file: $GPU_DIR/bin/release/accel-sim.out
89 | make -j
90 | # Install nvbit
91 | cd "$NVBIT_DIR" && ./install_nvbit.sh && make -j
92 | 
93 | cd "$HOME"
94 | git clone -b "$GIT_BRANCH" https://github.com/yongwonshin/PIMFlow_ramulator.git
95 | RAM_DIR="$HOME/PIMFlow_ramulator"
96 | cd "$RAM_DIR"
97 | # Generate binary file: $RAM_DIR/ramulator
98 | make -j
99 | cd "$HOME"
100 | git clone -b "$GIT_BRANCH" https://github.com/yongwonshin/PIMFlow.git
101 | PIMFLOW_DIR="$HOME/PIMFlow"
102 | cd "$PIMFLOW_DIR"
103 | pip install -e .
104 | cd "$PIMFLOW_DIR/pim"
105 | # Generate binary file: $PIMFLOW_DIR/pim/pim_codegen
106 | make -j
107 | # Extract network traces
108 | cd "$PIMFLOW_DIR"
109 | tar -xzf ./data/mobilenet-v2.tar.gz -C .
110 | 
111 | tar -xzf ./data/traces-mobilenet-v2-16-org.tar.gz -C .
112 | tar -xzf ./data/traces-mobilenet-v2-16-Newton+.tar.gz -C .
113 | tar -xzf ./data/traces-mobilenet-v2-16-Newton++.tar.gz -C .
114 | tar -xzf ./data/traces-mobilenet-v2-16-Pipeline.tar.gz -C .
115 | tar -xzf ./data/traces-mobilenet-v2-16-MDDP.tar.gz -C .
116 | tar -xzf ./data/traces-mobilenet-v2-16-PIMFlow.tar.gz -C .
117 | 
118 | tar -xzf ./data/mobilenet-v2-csv.tar.gz -C ../
119 | ```
120 | 
121 | Now the directory layout should look like this:
122 | ```text
123 | . ($HOME)
124 | ./PIMFlow
125 | ./PIMFlow_tvm
126 | ./PIMFlow_accel-sim-framework
127 | ./PIMFlow_ramulator
128 | ```
129 | 
130 | Finally, set the following environment variables, and add them to your .bashrc so they persist across sessions:
131 | ```bash
132 | export TVM_HOME=/root/PIMFlow_tvm
133 | export PYTHONPATH=/root/PIMFlow_tvm/python
134 | ```
135 | 
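You can quickly confirm that the freshly built TVM (and the PIMFlow package installed earlier via `pip install -e .`) are importable once these variables are set; a minimal check, not part of the original scripts:

```bash
# Should print the module path without raising ImportError.
python3 -c "import tvm; print(tvm.__file__)"
# The editable install presumably registers the package as `pim` (per pim.egg-info in .gitignore).
python3 -c "import pim"
```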
136 | ## Run
137 | You can manually perform profiling to find the optimal execution mode and task size.
138 | 
139 | Note: it takes about 8 hours on a server with 8x NVIDIA GeForce RTX 2080 Ti GPUs and 2x Intel Xeon Gold 6248R CPUs (24 cores each).
140 | 
141 | ```bash
142 | cd PIMFlow
143 | ./pimflow -m=profile -t=split -n=mobilenet-v2
144 | ./pimflow -m=profile -t=pipeline -n=mobilenet-v2
145 | ```
146 | Alternatively, for MobileNet-V2 you can simply use the profiled data we've prepared in PIMFlow/mobilenet-v2/.
147 | 
148 | Now you can compute the optimal solution from the profiled data and see the speedup:
149 | ```bash
150 | ./pimflow -m=stat --conv_only -n=mobilenet-v2
151 | ```
152 | The output should look like this:
153 | ```text
154 | === N_CHANNEL: 16, N_GWRITE: 4, ramulator_disable_gwrite_latency_hiding: False ===
155 | newton++ (vs baseline): 1.365 (-388549.76000000024)
156 | pipeline (vs baseline): 1.413 (-425128.2000000004)
157 | split (vs baseline): 1.436 (-441899.4400000004)
158 | all (vs baseline): 1.481 (-472070.72000000044)
159 | ====================
160 | ```
161 | 
162 | Next, you can get the end-to-end speedup with the following commands.
163 | Note: this takes about 8 hours on our system.
164 | The *policy* option is one of *Newton+*, *Newton++*, *Pipeline*, *MDDP*, or *PIMFlow*.
165 | ```bash
166 | ./pimflow -m=solve -n=mobilenet-v2
167 | ./pimflow -m=run --gpu_only -n=mobilenet-v2 # get gpu-only execution time
168 | ./pimflow -m=run -n=mobilenet-v2 # get pimflow execution time
169 | ./pimflow -m=stat -n=mobilenet-v2 --policy=PIMFlow # show end-to-end speedup
170 | ```
171 | Output:
172 | ```text
173 | GPU CYCLE: 1445620
174 | PIMFlow CYCLE: 1047831.4000000001
175 | PIMFlow SPEEDUP: 1.38
176 | ```
177 | 
178 | To test other networks, you can replace "mobilenet-v2" with "efficientnet-v1-b0", "mnasnet-1.0", "resnet-50" or "vgg-16".
179 | We also prepared a very simple network, "toy", for a quick but minimal test.
--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM nvidia/cuda:11.3.1-cudnn8-devel-ubuntu20.04
2 | ENV DEBIAN_FRONTEND=noninteractive
3 | ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH}
4 | RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
5 | apt-get update && \
6 | apt-get install -y --no-install-recommends \
7 | python3-dev \
8 | ca-certificates \
9 | g++ \
10 | python3-numpy \
11 | gcc \
12 | make \
13 | git \
14 | python3-setuptools \
15 | python3-wheel \
16 | python3-pip \
17 | aria2 \
18 | wget \
19 | build-essential \
20 | xutils-dev \
21 | bison \
22 | zlib1g-dev \
23 | flex \
24 | libglu1-mesa-dev \
25 | git \
26 | libssl-dev \
27 | libxml2-dev \
28 | libboost-all-dev \
29 | vim \
30 | python-setuptools \
31 | python-dev \
32 | ninja-build \
33 | bc \
34 | git-lfs \
35 | libtinfo-dev \
36 | htop \
37 | libedit-dev && \
38 | python3 -m pip install -U --force-reinstall pip && \
39 | #pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113 \
40 | pip install torch==1.11.0+cu113 \
41 | torchvision==0.12.0+cu113 \
42 | torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cu113 \
43 | pyyaml==5.1 \
44 | onnx \
45 | plotly \
46 | psutil \
47 | pandas \
48 | decorator \
49 | attrs \
50 | scipy \
51 | matplotlib && \
52 | aria2c -q -d /tmp -o cmake-3.21.0-linux-x86_64.tar.gz \
53 | https://github.com/Kitware/CMake/releases/download/v3.21.0/cmake-3.21.0-linux-x86_64.tar.gz && \
54 | tar -zxf /tmp/cmake-3.21.0-linux-x86_64.tar.gz --strip=1 -C /usr && \
55 | wget -c https://github.com/llvm/llvm-project/releases/download/llvmorg-13.0.0/clang+llvm-13.0.0-x86_64-linux-gnu-ubuntu-20.04.tar.xz && \
56 | tar -xvf clang+llvm-13.0.0-x86_64-linux-gnu-ubuntu-20.04.tar.xz && \
57 | cp -rl clang+llvm-13.0.0-x86_64-linux-gnu-ubuntu-20.04/* /usr/local && \
58 | rm -rf clang+llvm-13.0.0-x86_64-linux-gnu-ubuntu-20.04 \
59 | clang+llvm-13.0.0-x86_64-linux-gnu-ubuntu-20.04.tar.xz
60 | ENV HOME /root
61 | WORKDIR /root
62 | ENV TVM_HOME /root/PIMFlow_tvm
63 | ENV PYTHONPATH /root/PIMFlow_tvm/python
64 | ENV CUDA_INSTALL_PATH /usr/local/cuda
65 | COPY install.sh /root/
66 | CMD ["/bin/bash"]
67 | 
--------------------------------------------------------------------------------
/docker/install.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | GIT_BRANCH="2023cgo-artifact"
3 | cd "$HOME"
4 | git clone -b "$GIT_BRANCH" https://github.com/yongwonshin/PIMFlow_tvm.git
5 | TVM_DIR="$HOME/PIMFlow_tvm"
6 | cd "$TVM_DIR"
7 | git submodule init && git submodule update
8 | BUILD_DIR="$TVM_DIR/build"
9 | mkdir -p "$BUILD_DIR" && cd "$BUILD_DIR"
10 | cp 
"$TVM_DIR/cmake/config.cmake" "$BUILD_DIR" 11 | cmake .. -G Ninja -DCMAKE_CXX_COMPILER=$(which g++) -DCMAKE_C_COMPILER=$(which gcc) 12 | ninja 13 | cd "$HOME" 14 | git clone -b "$GIT_BRANCH" https://github.com/yongwonshin/PIMFlow_accel-sim-framework.git 15 | GPU_DIR="$HOME/PIMFlow_accel-sim-framework/gpu-simulator" 16 | NVBIT_DIR="$HOME/PIMFlow_accel-sim-framework/util/tracer_nvbit" 17 | cd "$GPU_DIR" 18 | source setup_environment.sh 19 | # Generate binary file: $GPU_DIR/bin/release/accel-sim.out 20 | make -j 21 | # Install nvbit 22 | cd "$NVBIT_DIR" && ./install_nvbit.sh && make -j 23 | 24 | cd "$HOME" 25 | git clone -b "$GIT_BRANCH" https://github.com/yongwonshin/PIMFlow_ramulator.git 26 | RAM_DIR="$HOME/PIMFlow_ramulator" 27 | cd "$RAM_DIR" 28 | # Generate binary file: $RAM_DIR/ramulator 29 | make -j 30 | cd "$HOME" 31 | git clone -b "$GIT_BRANCH" https://github.com/yongwonshin/PIMFlow.git 32 | PIMFLOW_DIR="$HOME/PIMFlow" 33 | cd "$PIMFLOW_DIR" 34 | pip install -e . 35 | cd "$PIMFLOW_DIR/pim" 36 | # Generate binary file: $PIMFLOW_DIR/pim/pim_codegen 37 | make -j 38 | # Extract network traces 39 | cd "$PIMFLOW_DIR" 40 | tar -xzf ./data/mobilenet-v2.tar.gz -C . 41 | 42 | tar -xzf ./data/traces-mobilenet-v2-16-org.tar.gz -C . 43 | tar -xzf ./data/traces-mobilenet-v2-16-Newton+.tar.gz -C . 44 | tar -xzf ./data/traces-mobilenet-v2-16-Newton++.tar.gz -C . 45 | tar -xzf ./data/traces-mobilenet-v2-16-Pipeline.tar.gz -C . 46 | tar -xzf ./data/traces-mobilenet-v2-16-MDDP.tar.gz -C . 47 | tar -xzf ./data/traces-mobilenet-v2-16-PIMFlow.tar.gz -C . 48 | 49 | tar -xzf ./data/mobilenet-v2-csv.tar.gz -C ../ 50 | -------------------------------------------------------------------------------- /layerwise/ALL_matmul.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MODEL=$1 3 | GPU=$2 4 | KERNEL_LAUNCH_LATENCY=$3 5 | N_CHANNEL=$4 6 | 7 | for (( i = 10 ; i < 101 ; i = i + 10 )); do 8 | python3 inspect_shape.py --model=$MODEL --split_ratio=$i --n_channel=$N_CHANNEL 9 | python3 run_matmul --trace --gpgpusim_config=$GPU --model=$MODEL --split_ratio=$i --kernel_launch_latency=$KERNEL_LAUNCH_LATENCY --n_channel=$N_CHANNEL 10 | python3 run_matmul --simulate --gpgpusim_config=$GPU --model=$MODEL --split_ratio=$i --kernel_launch_latency=$KERNEL_LAUNCH_LATENCY --n_channel=$N_CHANNEL 11 | python3 run_matmul --pim_codegen --model=$MODEL --split_ratio=$i --n_channel=$N_CHANNEL --n_gwrite=1 12 | python3 run_matmul --pim_codegen --model=$MODEL --split_ratio=$i --n_channel=$N_CHANNEL --n_gwrite=2 13 | python3 run_matmul --pim_codegen --model=$MODEL --split_ratio=$i --n_channel=$N_CHANNEL --n_gwrite=4 14 | python3 run_matmul --stat --gpgpusim_config=$GPU --model=$MODEL --split_ratio=$i --kernel_launch_latency=$KERNEL_LAUNCH_LATENCY --n_channel=$N_CHANNEL --n_gwrite=1 15 | python3 run_matmul --stat --gpgpusim_config=$GPU --model=$MODEL --split_ratio=$i --kernel_launch_latency=$KERNEL_LAUNCH_LATENCY --n_channel=$N_CHANNEL --n_gwrite=2 16 | python3 run_matmul --stat --gpgpusim_config=$GPU --model=$MODEL --split_ratio=$i --kernel_launch_latency=$KERNEL_LAUNCH_LATENCY --n_channel=$N_CHANNEL --n_gwrite=4 17 | if [ $i -eq 100 ]; then 18 | python3 run_matmul --pim --gpgpusim_config=$GPU --model=$MODEL --n_channel=$N_CHANNEL --n_gwrite=1 19 | python3 run_matmul --pim --gpgpusim_config=$GPU --model=$MODEL --n_channel=$N_CHANNEL --n_gwrite=2 20 | python3 run_matmul --pim --gpgpusim_config=$GPU --model=$MODEL --n_channel=$N_CHANNEL --n_gwrite=4 21 | fi 22 | sh clean.sh 
$MODEL-matmul $i 16 23 | done 24 | 25 | python3 process_csv.py --model=$MODEL --n_gwrite=1 26 | python3 process_csv.py --model=$MODEL --n_gwrite=1 --ramulator_disable_gwrite_latency_hiding 27 | python3 process_csv.py --model=$MODEL --n_gwrite=2 28 | python3 process_csv.py --model=$MODEL --n_gwrite=2 --ramulator_disable_gwrite_latency_hiding 29 | python3 process_csv.py --model=$MODEL --n_gwrite=4 30 | python3 process_csv.py --model=$MODEL --n_gwrite=4 --ramulator_disable_gwrite_latency_hiding 31 | -------------------------------------------------------------------------------- /layerwise/clean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MODEL=$1 3 | SPLIT=$2 4 | N_CHANNEL=$3 5 | 6 | mkdir -p result_simulate/$MODEL/${SPLIT}_${N_CHANNEL} 7 | mv trace*.txt result_simulate/$MODEL/${SPLIT}_${N_CHANNEL} 8 | mv *-matmul result_simulate/$MODEL/${SPLIT}_${N_CHANNEL} 9 | mv Conv_* result_simulate/$MODEL/${SPLIT}_${N_CHANNEL} 10 | mv Gemm_* result_simulate/$MODEL/${SPLIT}_${N_CHANNEL} 11 | mv accelwattch_power_report_*.log result_simulate/$MODEL/${SPLIT}_${N_CHANNEL} 12 | mv traces-* result_simulate/$MODEL/${SPLIT}_${N_CHANNEL} 13 | rm -r traces-* 14 | rm -r Conv_* 15 | rm -r tmp-* 16 | rm -r Gemm_* 17 | rm compile-*.so 18 | rm layer-*.onnx 19 | rm MatMul_*.onnx 20 | -------------------------------------------------------------------------------- /layerwise/inspect: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | parser = argparse.ArgumentParser() 4 | parser.add_argument("--path", help="trace path from accel-sim-framework/util/tracer_nvbit", required=True) 5 | parser.add_argument("--iteration", type=int, default=3, help="iteration") 6 | args = parser.parse_args() 7 | 8 | TRACE_PATH=args.path 9 | 10 | START=None 11 | 12 | def parse_kernel_number(l): 13 | return int(l.split("-")[1].split(".")[0]) 14 | 15 | # start, end 16 | r = [] 17 | skip = True # skip first interval 18 | with open(f"./{TRACE_PATH}/stats.csv") as f: 19 | lines = f.readlines() 20 | 21 | for i, l in enumerate(lines): 22 | if i == 0: 23 | continue 24 | 25 | l = l.split(",") 26 | 27 | n = parse_kernel_number(l[0].strip()) 28 | name = l[1].strip() 29 | 30 | if name.find("forward_kernel_cuda_start") != -1: 31 | # if skip: 32 | # skip = False 33 | # continue 34 | 35 | START = n 36 | 37 | if START is not None and name.find("forward_kernel_cuda_end") != -1: 38 | r.append((START, n)) 39 | 40 | intv = r[0][1] - r[0][0] 41 | for s, t in r: 42 | assert intv == t - s 43 | 44 | assert len(r) == args.iteration 45 | 46 | print(r[2][0]+1, r[2][1]-1) 47 | 48 | -------------------------------------------------------------------------------- /layerwise/inspect_shape.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import torchvision.models as models 4 | import onnx 5 | from google.protobuf.json_format import MessageToDict 6 | from pim.util import Net, activation_type, MODEL_LIST, get_random_input 7 | import os 8 | 9 | import argparse 10 | 11 | class Range(object): 12 | def __init__(self, start, end): 13 | self.start = start 14 | self.end = end 15 | def __eq__(self, other): 16 | return self.start <= other <= self.end 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("--model", help="model", choices=MODEL_LIST, required=True) 20 | parser.add_argument("--n_channel", type=int, default=16) 21 | parser.add_argument("--split_ratio", type=int, 
required=True)
22 | parser.add_argument("--full", action="store_true")
23 | args = parser.parse_args()
24 | 
25 | os.environ["CUDA_VISIBLE_DEVICES"]=f"{args.n_channel // 4 % torch.cuda.device_count()}"
26 | 
27 | model = None
28 | if args.model == "efficientnet-v1-b0":
29 |   model = models.efficientnet_b0(pretrained=True)
30 | elif args.model == "efficientnet-v1-b1":
31 |   model = models.efficientnet_b1(pretrained=True)
32 | elif args.model == "efficientnet-v1-b2":
33 |   model = models.efficientnet_b2(pretrained=True)
34 | elif args.model == "efficientnet-v1-b3":
35 |   model = models.efficientnet_b3(pretrained=True)
36 | elif args.model == "efficientnet-v1-b4":
37 |   model = models.efficientnet_b4(pretrained=True)
38 | elif args.model == "efficientnet-v1-b5":
39 |   model = models.efficientnet_b5(pretrained=True)
40 | elif args.model == "efficientnet-v1-b6":
41 |   model = models.efficientnet_b6(pretrained=True)
42 | elif args.model == "efficientnet-v1-b7":
43 |   model = models.efficientnet_b7(pretrained=True)
44 | elif args.model == "mobilenet-v2":
45 |   model = models.mobilenet_v2(pretrained=True)
46 | elif args.model == "mobilenet-v2-1.4": # https://arxiv.org/pdf/1801.04381.pdf
47 |   model = models.mobilenet_v2(pretrained=False, width_mult=1.4)
48 | elif args.model == "mobilenet-v3-small":
49 |   model = models.mobilenet_v3_small(pretrained=True)
50 | elif args.model == "mobilenet-v3-large":
51 |   model = models.mobilenet_v3_large(pretrained=True)
52 | elif args.model == "resnet-18":
53 |   model = models.resnet18(pretrained=True)
54 | elif args.model == "resnet-34":
55 |   model = models.resnet34(pretrained=True)
56 | elif args.model == "resnet-50":
57 |   model = models.resnet50(pretrained=True)
58 | elif args.model == "resnext-50":
59 |   model = models.resnext50_32x4d(pretrained=True)
60 | elif args.model == "inception-v3":
61 |   model = models.inception_v3(pretrained=True)
62 | elif args.model == "shufflenet-v2-x0.5":
63 |   model = models.shufflenet_v2_x0_5(pretrained=True)
64 | elif args.model == "shufflenet-v2-x1.0":
65 |   model = models.shufflenet_v2_x1_0(pretrained=True)
66 | elif args.model == "shufflenet-v2-x2.0":
67 |   model = models.shufflenet_v2_x2_0(pretrained=False) # pretrained model is not yet supported
68 | elif args.model == "mnasnet-0.5":
69 |   model = models.mnasnet0_5(pretrained=True)
70 | elif args.model == "mnasnet-1.0":
71 |   model = models.mnasnet1_0(pretrained=True)
72 | elif args.model == "mnasnet-1.3":
73 |   model = models.mnasnet1_3(pretrained=False) # pretrained model is not yet supported
74 | elif args.model == "vgg-16":
75 |   model = models.vgg16(pretrained=True)
76 | elif args.model == "regnet_y_400mf":
77 |   model = models.regnet_y_400mf(pretrained=True)
78 | elif args.model == "regnet_y_800mf":
79 |   model = models.regnet_y_800mf(pretrained=True)
80 | elif args.model == "regnet_y_1_6gf":
81 |   model = models.regnet_y_1_6gf(pretrained=True)
82 | elif args.model == "regnet_y_3_2gf":
83 |   model = models.regnet_y_3_2gf(pretrained=True)
84 | elif args.model == "regnet_y_8gf":
85 |   model = models.regnet_y_8gf(pretrained=True)
86 | elif args.model == "regnet_y_16gf":
87 |   model = models.regnet_y_16gf(pretrained=True)
88 | elif args.model == "regnet_y_32gf":
89 |   model = models.regnet_y_32gf(pretrained=True)
90 | elif args.model == "regnet_y_128gf":
91 |   model = models.regnet_y_128gf(pretrained=True)
92 | elif args.model == "regnet_x_400mf":
93 |   model = models.regnet_x_400mf(pretrained=True)
94 | elif args.model == "regnet_x_800mf":
95 |   model = models.regnet_x_800mf(pretrained=True)
96 | 
elif args.model == "regnet_x_1_6gf": 97 | model = models.regnet_x_1_6gf(pretrained=True) 98 | elif args.model == "regnet_x_3_2gf": 99 | model = models.regnet_x_3_2gf(pretrained=True) 100 | elif args.model == "regnet_x_8gf": 101 | model = models.regnet_x_8gf(pretrained=True) 102 | elif args.model == "regnet_x_16gf": 103 | model = models.regnet_x_16gf(pretrained=True) 104 | elif args.model == "regnet_x_32gf": 105 | model = models.regnet_x_32gf(pretrained=True) 106 | elif args.model == "regnet_x_128gf": 107 | model = models.regnet_x_128gf(pretrained=True) 108 | elif args.model == "vit-b-16": 109 | model = models.vit_b_16(pretrained=True) 110 | elif args.model == "vit-l-16": 111 | model = models.vit_l_16(pretrained=True) 112 | elif args.model == "swin-b": 113 | model = models.swin_b() 114 | elif args.model == "swin-s": 115 | model = models.swin_s() 116 | elif args.model == "convnext-tiny": 117 | model = models.convnext_tiny(pretrained=True) 118 | elif args.model == "convnext-small": 119 | model = models.convnext_small(pretrained=True) 120 | elif args.model == "convnext-base": 121 | model = models.convnext_base(pretrained=True) 122 | elif args.model == "convnext-large": 123 | model = models.convnext_large(pretrained=True) 124 | elif args.model == "toy": 125 | model = Net() 126 | elif args.model in ['bert-large-1x1', 'bert-large-1x3', 'bert-large-1x32', 'bert-large-1x64', 'bert-large-1x128', 'bert-base-1x1', 'bert-base-1x3', 'bert-base-1x32', 'bert-base-1x64', 'bert-base-1x128']: 127 | model = Net() # temporary dummy model 128 | else: 129 | raise Exception("MUST not reach here!") 130 | model.cuda() 131 | model.eval() 132 | model.half() 133 | 134 | x = get_random_input(args.model) 135 | x = x.half() 136 | 137 | if args.model in ['bert-large-1x3', 'bert-large-1x32', 'bert-large-1x64']: 138 | onnx_model = onnx.load(f"{args.model}_{args.n_channel}.onnx") 139 | # if args.model == 'bert-large-1x3': 140 | # x = {'input_ids': torch.LongTensor([[101, 3231, 102]]).cuda(), 'token_type_ids': torch.LongTensor([[0, 0, 0]]).cuda(), 'attention_mask': torch.LongTensor([[1, 1, 1]]).cuda()} 141 | # elif args.model == 'bert-large-1x32': 142 | # x = {'input_ids': torch.LongTensor([[101, 6748, 15756, 6125, 2024, 2062, 3697, 2000, 3345, 1012, 2057, 2556, 1037, 21961, 4083, 7705, 2000, 7496, 1996, 2731, 1997, 6125, 2008, 2024, 12381, 6748, 2084, 2216, 2109, 3130, 1012, 102]]).cuda(), 'token_type_ids': torch.LongTensor([[0]*32]).cuda(), 'attention_mask': torch.LongTensor([[1]*32]).cuda()} 143 | # elif args.model == 'bert-large-1x64': 144 | # x = {'input_ids': torch.LongTensor([101, 6748, 15756, 6125, 2024, 2062, 3697, 2000, 3345, 1012, 2057, 2556, 1037, 21961, 4083, 7705, 2000, 7496, 1996, 2731, 1997, 6125, 2008, 2024, 12381, 6748, 2084, 2216, 2109, 3130, 1012, 2057, 12045, 5290, 9869, 1996, 9014, 2004, 4083, 21961, 4972, 2007, 4431, 2000, 1996, 6741, 20407, 1010, 2612, 1997, 4083, 4895, 2890, 25523, 2094, 4972, 1012, 2057, 3073, 7721, 17537, 3350, 4760, 102]).cuda(), 'token_type_ids': torch.LongTensor([[0]*64]).cuda(), 'attention_mask': torch.LongTensor([[1]*64]).cuda()} 145 | else: 146 | torch_out = model(x) 147 | # Export the model 148 | torch.onnx.export(model, # model being run 149 | x, # model input (or a tuple for multiple inputs) 150 | f"{args.model}_{args.n_channel}.onnx", # where to save the model (can be a file or file-like object) 151 | export_params=True, # store the trained parameter weights inside the model file 152 | opset_version=13, # the ONNX version to export the model to 153 | 
do_constant_folding=True, # whether to execute constant folding for optimization 154 | # training=TrainingMode.TRAINING, 155 | input_names = ['input'], # the model's input names 156 | output_names = ['output']) # the model's output names 157 | # dynamic_axes={'input' : {0 : 'batch_size'}, # variable length axes 158 | # 'output' : {0 : 'batch_size'}}) 159 | 160 | onnx_model = onnx.load(f"{args.model}_{args.n_channel}.onnx") 161 | onnx.checker.check_model(onnx_model) 162 | 163 | from pim.util import find_initializer_by_arg_name, get_arg_shape 164 | 165 | class OperatorPrinter: 166 | def __init__(self, model_name): 167 | self.conv_path = f"{model_name}_conv.csv" 168 | self.matmul_path = f"{model_name}_matmul.csv" 169 | with open(self.conv_path, 'w') as f: 170 | pass 171 | # f.write("kernel_name,N,I_c,H,W,O_c,kernel_size,pads,strides,group,dilations,bias,activation\n") 172 | with open(self.matmul_path, 'w') as f: 173 | pass 174 | # f.write("kernel_name,row,col,bias,activation\n") 175 | self.conv_configs = set() 176 | self.fc_configs = set() 177 | self.pim_configs = set() 178 | 179 | def reset(self): 180 | self.conv_configs.clear() 181 | 182 | def print(self, op_type, config, debug=False): 183 | if op_type == "Conv": 184 | c = f"{config['input_shape'][0]},{config['input_shape'][1]},{config['input_shape'][2]},{config['input_shape'][3]},{config['weight_shape'][0]},\"({config['kernel_shape'][0]},{config['kernel_shape'][1]})\",\"({config['pads'][0]},{config['pads'][1]})\",{config['strides'][0]},{config['group']},{config['dilations'][0]},{int(config['bias'])},{config['activation']}" 185 | if not args.full: 186 | if config['kernel_name'].endswith("_pim_added"): 187 | if c in self.pim_configs: 188 | return 189 | else: 190 | self.pim_configs.add(c) 191 | else: 192 | if c in self.conv_configs: 193 | return 194 | else: 195 | self.conv_configs.add(c) 196 | 197 | with open(self.conv_path, 'a') as f: 198 | f.write(f"{config['kernel_name']},{c}\n") 199 | if debug: 200 | print(f"Conv: {config}") 201 | elif op_type == "MatMul": 202 | n = 1 203 | for d in config[5][:-1]: 204 | n *= d 205 | c = f"{config[1]},{config[2]},{config[3]},{config[4]},{d}" 206 | if not args.full: 207 | if c in self.fc_configs: 208 | return 209 | else: 210 | self.fc_configs.add(c) 211 | with open(self.matmul_path, 'a') as f: 212 | f.write(f"{config[0]},{c}\n") 213 | if debug: 214 | print(f"MatMul: {config}") 215 | else: 216 | raise Exception("Not implemented!") 217 | 218 | def run(graph): 219 | for input_ in graph.input: 220 | m_dict = MessageToDict(input_) 221 | dim_info = m_dict["type"]["tensorType"]["shape"]["dim"] # ugly but we have to live with this when using dict 222 | input_shape = [d.get("dimValue") for d in dim_info] # [4,3,384,640] 223 | 224 | skipped_optype = set() 225 | printer = OperatorPrinter(f"{args.model}_{args.n_channel}.onnx") 226 | for node in graph.node: 227 | if node.op_type == 'Conv': 228 | # (N, C, H, W) 229 | input_shape = get_arg_shape(graph, node, node.input[0]) 230 | if input_shape is None: # first node of the graph 231 | assert len(graph.input) < 2 # single input 232 | assert graph.node[0] == node # first node 233 | m_dict = MessageToDict(graph.input[0]) 234 | dim_info = m_dict["type"]["tensorType"]["shape"]["dim"] # ugly but we have to live with this when using dict 235 | input_shape = [d.get("dimValue") for d in dim_info] # [4,3,384,640] 236 | 237 | # (O_c, I_c, K_h, K_w) 238 | weight_shape = find_initializer_by_arg_name(graph, node.input[1]).dims 239 | # (kernel_size, stride, padding, dilation, groups) 
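    # Each entry of node.attribute is an ONNX AttributeProto: list-valued
    # attributes (pads, strides, dilations, kernel_shape) arrive in the repeated
    # 'ints' field, while scalars such as 'group' arrive in 'i'; hence the
    # fallback in the dict comprehension below.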
240 | attributes = {e.name:(e.ints if len(e.ints) > 0 else e.i) for e in node.attribute} 241 | assert all(e == attributes['dilations'][0] for e in attributes['dilations']) 242 | assert attributes['pads'][0] == attributes['pads'][2] and attributes['pads'][1] == attributes['pads'][3] 243 | assert all(e == attributes['strides'][0] for e in attributes['strides']) 244 | attributes = {k:v for k, v in attributes.items()} 245 | attributes['input_shape'] = input_shape 246 | attributes['weight_shape'] = weight_shape 247 | attributes['kernel_name'] = node.name 248 | attributes['bias'] = len(node.input) == 3 249 | attributes['activation'] = activation_type(graph, node) 250 | print(attributes) 251 | 252 | if input_shape[2] == 1 and input_shape[3] == 1 and weight_shape[2] == 1 and weight_shape[3] == 1: 253 | printer.print("MatMul", [node.name, weight_shape[0], weight_shape[1], int(len(node.input) > 2), activation_type(graph, node), input_shape]) 254 | continue 255 | 256 | printer.print("Conv", attributes, debug=True) 257 | elif node.op_type == 'MatMul': 258 | input_shape = get_arg_shape(graph, node, node.input[0]) 259 | weight = find_initializer_by_arg_name(graph, node.input[1]) 260 | if weight is None: 261 | continue 262 | weight_shape = weight.dims 263 | printer.print("MatMul", [node.name, weight_shape[0], weight_shape[1], 0, activation_type(graph, node), input_shape]) 264 | elif node.op_type == 'Gemm': 265 | input_shape = get_arg_shape(graph, node, node.input[0]) 266 | attributes = {e.name:e for e in node.attribute} 267 | if np.isclose(attributes['alpha'].f, 1.0) and np.isclose(attributes['beta'].f, 1.0) and attributes.get('transA', 0) == 0: 268 | weight = find_initializer_by_arg_name(graph, node.input[1]) 269 | if weight is None: 270 | print(f'Skipped Gemm: {node.input[1]}') 271 | continue 272 | weight_shape = weight.dims 273 | printer.print("MatMul", [node.name, weight_shape[0], weight_shape[1], int(len(node.input) > 2), activation_type(graph, node), input_shape]) 274 | else: 275 | skipped_optype.add(node.op_type) 276 | else: 277 | skipped_optype.add(node.op_type) 278 | 279 | print(f"Skipped: {skipped_optype}") 280 | 281 | from pim.transform import InputSplit 282 | 283 | onnx_model = onnx.shape_inference.infer_shapes(onnx_model) 284 | ratio = args.split_ratio/100 285 | 286 | onnx_model = InputSplit(ratio).transform(onnx_model) 287 | onnx.checker.check_model(onnx_model) 288 | onnx.save(onnx_model, f"{args.model}_{args.n_channel}_transformed.onnx") 289 | onnx_model = onnx.load(f"{args.model}_{args.n_channel}_transformed.onnx") 290 | run(onnx_model.graph) 291 | -------------------------------------------------------------------------------- /layerwise/layerwise.py: -------------------------------------------------------------------------------- 1 | from numpy import clip 2 | import torch 3 | import argparse 4 | import os 5 | import onnx 6 | import tvm 7 | import tvm.relay as relay 8 | from tvm.contrib import graph_executor 9 | from torch.utils.cpp_extension import load 10 | 11 | import argparse 12 | 13 | class Range(object): 14 | def __init__(self, start, end): 15 | self.start = start 16 | self.end = end 17 | def __eq__(self, other): 18 | return self.start <= other <= self.end 19 | 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument("--ic", type=int, required=True) 22 | parser.add_argument("--oc", type=int, required=True) 23 | parser.add_argument("--h", type=int, required=True) 24 | parser.add_argument("--w", type=int, required=True) 25 | parser.add_argument("--kh", type=int, 
required=True) 26 | parser.add_argument("--kw", type=int, required=True) 27 | parser.add_argument("--stride", type=int) 28 | parser.add_argument("--ph", type=int, required=True) 29 | parser.add_argument("--pw", type=int, required=True) 30 | parser.add_argument("--dilate", type=int, required=True) 31 | parser.add_argument("--g", type=int, required=True) 32 | parser.add_argument("--b", action="store_true", default=False) 33 | parser.add_argument("--dev", type=int, required=True) 34 | parser.add_argument("--activation", required=True) 35 | args = parser.parse_args() 36 | 37 | marker = load(name="marker", sources = ["/root/PIMFlow/marker/marker_cuda.cpp", "/root/PIMFlow/marker/marker_cuda_kernel.cu"]) 38 | 39 | class Net(torch.nn.Module): 40 | def __init__(self): 41 | super(Net, self).__init__() 42 | self.conv = torch.nn.Conv2d(args.ic, args.oc, (args.kh, args.kw), args.stride, (args.ph, args.pw), args.dilate, args.g, args.b) 43 | 44 | def forward(self, x): 45 | x = self.conv(x) 46 | return x 47 | 48 | class NetElemwiseAct(torch.nn.Module): 49 | def __init__(self, type): 50 | super(NetElemwiseAct, self).__init__() 51 | self.conv = torch.nn.Conv2d(args.ic, args.oc, (args.kh, args.kw), args.stride, (args.ph, args.pw), args.dilate, args.g, args.b) 52 | if type == "HardSigmoid": 53 | self.act = torch.nn.Hardsigmoid() 54 | elif type == "Sigmoid": 55 | self.act = torch.nn.Sigmoid() 56 | elif type == "Relu": 57 | self.act = torch.nn.ReLU() 58 | else: 59 | raise Exception(f"Unknown activation: {type}") 60 | 61 | def forward(self, x): 62 | x = self.conv(x) 63 | x = self.act(x) 64 | return x 65 | 66 | class NetSiLU(torch.nn.Module): 67 | def __init__(self): 68 | super(NetSiLU, self).__init__() 69 | self.conv = torch.nn.Conv2d(args.ic, args.oc, (args.kh, args.kw), args.stride, (args.ph, args.pw), args.dilate, args.g, args.b) 70 | 71 | def forward(self, x): 72 | x = self.conv(x) 73 | x2 = torch.sigmoid(x) 74 | x = torch.mul(x, x2) 75 | return x 76 | 77 | class NetHardSiLU(torch.nn.Module): 78 | def __init__(self): 79 | super(NetHardSiLU, self).__init__() 80 | self.conv = torch.nn.Conv2d(args.ic, args.oc, (args.kh, args.kw), args.stride, (args.ph, args.pw), args.dilate, args.g, args.b) 81 | self.hardsigmoid = torch.nn.Hardsigmoid() 82 | 83 | def forward(self, x): 84 | x = self.conv(x) 85 | x2 = self.hardsigmoid(x) 86 | x = torch.mul(x, x2) 87 | return x 88 | 89 | class NetClip(torch.nn.Module): 90 | def __init__(self, min, max): 91 | super(NetClip, self).__init__() 92 | self.conv = torch.nn.Conv2d(args.ic, args.oc, (args.kh, args.kw), args.stride, (args.ph, args.pw), args.dilate, args.g, args.b) 93 | self.min = min 94 | self.max = max 95 | 96 | def forward(self, x): 97 | x = self.conv(x) 98 | x = torch.clip(x, min=self.min, max=self.max) 99 | return x 100 | 101 | if args.activation == "SiLU": 102 | model = NetSiLU() 103 | elif args.activation == "HardSiLU": 104 | model = NetHardSiLU() 105 | elif args.activation in ["HardSigmoid", "Sigmoid", "Relu"]: 106 | model = NetElemwiseAct(type=args.activation) 107 | elif args.activation == "Clip": 108 | model = NetClip(min=0, max=6) 109 | elif args.activation == "Identity": 110 | model = Net() 111 | else: 112 | raise Exception("Not supported activation!") 113 | 114 | print(torch.cuda.device_count()) 115 | model.cuda() 116 | model.eval() 117 | model.half() 118 | x = torch.randn(1, args.ic, args.h, args.w).cuda().half() 119 | 120 | torch_out = model(x) 121 | 122 | # Export the model 123 | torch.onnx.export(model, # model being run 124 | x, # model input (or a tuple for 
multiple inputs) 125 | f"layer-{os.getpid()}.onnx", # where to save the model (can be a file or file-like object) 126 | export_params=True, # store the trained parameter weights inside the model file 127 | opset_version=11, # the ONNX version to export the model to 128 | do_constant_folding=True, # whether to execute constant folding for optimization 129 | # training=TrainingMode.TRAINING, 130 | input_names = ['input'], # the model's input names 131 | output_names = ['output']) # the model's output names 132 | # dynamic_axes={'input' : {0 : 'batch_size'}, # variable length axes 133 | # 'output' : {0 : 'batch_size'}}) 134 | 135 | onnx_model = onnx.load(f"layer-{os.getpid()}.onnx") 136 | onnx.checker.check_model(onnx_model) 137 | 138 | def to_numpy(tensor): 139 | return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy() 140 | 141 | shape_dict = {"input": x.shape} 142 | mod, params = relay.frontend.from_onnx(onnx_model, shape_dict) 143 | desired_layouts = { 144 | 'nn.conv2d': ['NHWC', 'OHWI'], 145 | 'nn.max_pool2d': ['NHWC'], 146 | 'nn.global_avg_pool2d': ['NHWC'], 147 | } 148 | seq = tvm.transform.Sequential([relay.transform.RemoveUnusedFunctions(), 149 | relay.transform.ConvertLayout(desired_layouts), 150 | relay.transform.FoldConstant()]) 151 | with tvm.transform.PassContext(opt_level=3): 152 | mod = seq(mod) 153 | 154 | # debug 155 | # print(mod) 156 | 157 | from tvm.relay.op.contrib.pim import partition_for_pim 158 | from tvm.contrib.pim import build_pim_kernels 159 | mod = partition_for_pim(mod) 160 | 161 | # debug 162 | # print(mod) 163 | 164 | target = "cuda -libs=cudnn,cublas" 165 | with tvm.transform.PassContext(opt_level=2): 166 | lib = relay.build(mod, target, params=params) 167 | os.system(f"mkdir -p tmp-{os.getpid()}") 168 | lib = build_pim_kernels(lib, f"./tmp-{os.getpid()}", f"compile-{os.getpid()}.so") 169 | dev = tvm.cuda(0) 170 | module = graph_executor.GraphModule(lib["default"](dev)) 171 | module.set_input(**{"input" : tvm.nd.array(to_numpy(x).astype("float16"), device=dev)}) 172 | 173 | for i in range(3): 174 | marker.forward(True) 175 | module.run() 176 | marker.forward(False) 177 | 178 | print("FINISH!!!") 179 | -------------------------------------------------------------------------------- /layerwise/layerwise_matmul.py: -------------------------------------------------------------------------------- 1 | from numpy import clip 2 | import torch 3 | import argparse 4 | import os 5 | import onnx 6 | import tvm 7 | import tvm.relay as relay 8 | from tvm.contrib import graph_executor 9 | from torch.utils.cpp_extension import load 10 | 11 | import argparse 12 | 13 | class Range(object): 14 | def __init__(self, start, end): 15 | self.start = start 16 | self.end = end 17 | def __eq__(self, other): 18 | return self.start <= other <= self.end 19 | 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument("--batch", type=int, default=1) 22 | parser.add_argument("--row", type=int, required=True) 23 | parser.add_argument("--col", type=int, required=True) 24 | parser.add_argument("--activation", choices=["SiLU", "Sigmoid", "Relu", "Identity"], required=True) 25 | parser.add_argument("--name", required=True) 26 | parser.add_argument("--bias", type=int, required=True) 27 | args = parser.parse_args() 28 | 29 | if args.bias != 0: 30 | args.bias = True 31 | else: 32 | args.bias = False 33 | 34 | marker = load(name="marker", sources = ["/root/PIMFlow/marker/marker_cuda.cpp", "/root/PIMFlow/marker/marker_cuda_kernel.cu"]) 35 | 36 | class 
Net(torch.nn.Module): 37 | def __init__(self): 38 | super(Net, self).__init__() 39 | self.conv = torch.nn.Linear(args.col, args.row, bias=args.bias) 40 | 41 | def forward(self, x): 42 | x = self.conv(x) 43 | return x 44 | 45 | class NetElemwiseAct(torch.nn.Module): 46 | def __init__(self, type): 47 | super(NetElemwiseAct, self).__init__() 48 | self.conv = torch.nn.Linear(args.col, args.row, bias=args.bias) 49 | if type == "HardSigmoid": 50 | self.act = torch.nn.Hardsigmoid() 51 | elif type == "Sigmoid": 52 | self.act = torch.nn.Sigmoid() 53 | elif type == "Relu": 54 | self.act = torch.nn.ReLU() 55 | else: 56 | raise Exception(f"Unknown activation: {type}") 57 | 58 | def forward(self, x): 59 | x = self.conv(x) 60 | x = self.act(x) 61 | return x 62 | 63 | class NetSiLU(torch.nn.Module): 64 | def __init__(self): 65 | super(NetSiLU, self).__init__() 66 | self.conv = torch.nn.Linear(args.col, args.row, bias=args.bias) 67 | 68 | def forward(self, x): 69 | x = self.conv(x) 70 | x2 = torch.sigmoid(x) 71 | x = torch.mul(x, x2) 72 | return x 73 | 74 | class NetHardSiLU(torch.nn.Module): 75 | def __init__(self): 76 | super(NetHardSiLU, self).__init__() 77 | self.conv = torch.nn.Linear(args.col, args.row, bias=args.bias) 78 | self.hardsigmoid = torch.nn.Hardsigmoid() 79 | 80 | def forward(self, x): 81 | x = self.conv(x) 82 | x2 = self.hardsigmoid(x) 83 | x = torch.mul(x, x2) 84 | return x 85 | 86 | class NetClip(torch.nn.Module): 87 | def __init__(self, min, max): 88 | super(NetClip, self).__init__() 89 | self.conv = torch.nn.Linear(args.col, args.row, bias=args.bias) 90 | self.min = min 91 | self.max = max 92 | 93 | def forward(self, x): 94 | x = self.conv(x) 95 | x = torch.clip(x, min=self.min, max=self.max) 96 | return x 97 | 98 | if args.activation == "SiLU": 99 | model = NetSiLU() 100 | elif args.activation == "HardSiLU": 101 | model = NetHardSiLU() 102 | elif args.activation in ["HardSigmoid", "Sigmoid", "Relu"]: 103 | model = NetElemwiseAct(type=args.activation) 104 | elif args.activation == "Clip": 105 | model = NetClip(min=0, max=6) 106 | elif args.activation == "Identity": 107 | model = Net() 108 | else: 109 | raise Exception("Not supported activation!") 110 | 111 | model.cuda() 112 | model.eval() 113 | model.half() 114 | x = torch.randn(args.batch, args.col).cuda().half() 115 | 116 | torch_out = model(x) 117 | 118 | # Export the model 119 | torch.onnx.export(model, # model being run 120 | x, # model input (or a tuple for multiple inputs) 121 | f"{args.name}-{os.getpid()}.onnx", # where to save the model (can be a file or file-like object) 122 | export_params=True, # store the trained parameter weights inside the model file 123 | opset_version=11, # the ONNX version to export the model to 124 | do_constant_folding=True, # whether to execute constant folding for optimization 125 | input_names = ['input'], # the model's input names 126 | output_names = ['output']) # the model's output names 127 | 128 | onnx_model = onnx.load(f"{args.name}-{os.getpid()}.onnx") 129 | onnx.checker.check_model(onnx_model) 130 | 131 | def to_numpy(tensor): 132 | return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy() 133 | 134 | shape_dict = {"input": x.shape} 135 | mod, params = relay.frontend.from_onnx(onnx_model, shape_dict) 136 | 137 | # print(mod) 138 | 139 | target = "cuda -libs=cudnn,cublas" 140 | with tvm.transform.PassContext(opt_level=2): 141 | lib = relay.build(mod, target, params=params) 142 | dev = tvm.cuda(0) 143 | module = graph_executor.GraphModule(lib["default"](dev)) 
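# marker.forward(True)/marker.forward(False) below launch tiny delimiter kernels
# around each run; layerwise/inspect later scans the trace's stats.csv for the
# corresponding forward_kernel_cuda_start / forward_kernel_cuda_end entries to
# isolate the kernels of a single iteration.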
144 | module.set_input(**{"input" : tvm.nd.array(to_numpy(x).astype("float16"), device=dev)}) 145 | 146 | for i in range(3): 147 | marker.forward(True) 148 | module.run() 149 | marker.forward(False) 150 | 151 | print("FINISH!!!") 152 | -------------------------------------------------------------------------------- /layerwise/pim_codegen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | NAME=$1 3 | OUT_CHANNELS=$2 4 | IN_CHANNELS=$3 5 | KH=$4 6 | KW=$5 7 | STRIDE=$6 8 | PH=$7 9 | PW=$8 10 | DILATE=$9 11 | GROUP=${10} 12 | BIAS=${11} 13 | IMAGE_HEIGHT=${12} 14 | IMAGE_WIDTH=${13} 15 | N_CHANNEL=${14} 16 | N_GWRITE=${15} 17 | 18 | ../pim/pim_codegen -oc $OUT_CHANNELS -ic $IN_CHANNELS -h $IMAGE_HEIGHT -w $IMAGE_WIDTH -kh $KH -kw $KW -ph $PH -pw $PW -stride $STRIDE -name PIM_trace_partition_${N_CHANNEL}_${N_GWRITE} -n_channel $N_CHANNEL -gw $N_GWRITE 19 | 20 | mkdir -p $NAME 21 | 22 | for i in ./PIM_trace_partition_${N_CHANNEL}_${N_GWRITE}-*.pim; do 23 | mv $i $NAME 24 | done 25 | -------------------------------------------------------------------------------- /layerwise/process_csv.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import pandas as pd 3 | import copy 4 | from pim.util import MODEL_LIST 5 | import argparse 6 | 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--model", help="model", choices=MODEL_LIST, required=True) 9 | parser.add_argument("--n_channel", type=int, default=16) 10 | parser.add_argument("--n_gwrite", type=int, default=4) 11 | parser.add_argument("--ramulator_disable_gwrite_latency_hiding", action="store_true") 12 | args = parser.parse_args() 13 | 14 | postfix = "" 15 | if args.ramulator_disable_gwrite_latency_hiding: 16 | postfix = "_noopt" 17 | 18 | TERM = "split" 19 | OFFSET = 0 20 | if args.model in ['bert-large-1x3', 'bert-large-1x32', 'bert-large-1x64', 'bert-large-1x128', 'bert-base-1x3', 'bert-base-1x32', 'bert-base-1x64', 'bert-base-1x128']: 21 | TERM = "matmul" 22 | OFFSET = -7 23 | 24 | def process(model): 25 | baseline = pd.read_csv(f'{model}_{TERM}100-baseline.csv', delimiter=',') 26 | BASE = [list(row) for row in baseline.values] 27 | 28 | if args.n_channel == 32: 29 | newton = pd.read_csv(f'{model}_{TERM}100_32_{args.n_gwrite}{postfix}.csv', delimiter=',') 30 | NEWTON = [list(row) for row in newton.values] 31 | else: 32 | newton = pd.read_csv(f'{model}_{TERM}0_{args.n_channel}_{args.n_gwrite}{postfix}.csv', delimiter=',') 33 | NEWTON = [list(row) for row in newton.values] 34 | 35 | Top = pd.read_csv(f'{model}_{TERM}100_{args.n_channel}_{args.n_gwrite}{postfix}.csv', delimiter=',') 36 | head = list(Top.columns) 37 | head.append("TOTAL_cycle") 38 | head.append("RATIO") 39 | head.append("SPEED_UP") 40 | MAX = [list(row) for row in Top.values] 41 | GPU = copy.deepcopy(MAX) 42 | 43 | assert len(BASE) == len(NEWTON) and len(NEWTON) == len(MAX) 44 | columns_len = len(MAX) 45 | 46 | for line in range(columns_len): 47 | MAX[line].append(max(MAX[line][12+OFFSET], MAX[line][13+OFFSET])) 48 | MAX[line].append('100') 49 | MAX[line].append(BASE[line][12+OFFSET] / MAX[line][14+OFFSET]) 50 | 51 | if args.n_channel < 32: 52 | for i in range(0,91,10): 53 | print(i) 54 | f = pd.read_csv(f'{model}_{TERM}{i}_{args.n_channel}_{args.n_gwrite}{postfix}.csv',delimiter=',') 55 | tmp = [list(row) for row in f.values] 56 | for line in range(columns_len): 57 | tmp[line].append(max(tmp[line][12+OFFSET],tmp[line][13+OFFSET])) 58 | tmp[line].append(i) 59 | if 
tmp[line][14+OFFSET]: 60 | tmp[line].append(BASE[line][12+OFFSET] / tmp[line][14+OFFSET]) 61 | for line in range(columns_len): 62 | if float(tmp[line][14+OFFSET]) < float(MAX[line][14+OFFSET]): 63 | MAX[line] = tmp[line] 64 | 65 | for line in range(columns_len): 66 | NEWTON[line][12+OFFSET] = BASE[line][12+OFFSET] 67 | NEWTON[line].append(MAX[line][14+OFFSET]) 68 | NEWTON[line].append(0) 69 | NEWTON[line].append(0) 70 | NEWTON[line].append(0) 71 | if NEWTON[line][13+OFFSET] < GPU[line][12+OFFSET]: 72 | NEWTON[line][15+OFFSET] = 0 73 | else: 74 | NEWTON[line][15+OFFSET] = 10 75 | if args.n_channel == 32 and NEWTON[line][13+OFFSET] == 0: 76 | NEWTON[line][13+OFFSET] = BASE[line][12+OFFSET] 77 | NEWTON[line][16+OFFSET] = BASE[line][12+OFFSET] / min(NEWTON[line][13+OFFSET], GPU[line][12+OFFSET]) 78 | NEWTON[line][17+OFFSET] = MAX[line][16+OFFSET] 79 | 80 | with open(f'max_performance_{model}_{args.n_channel}_{args.n_gwrite}{postfix}.csv', 'w',newline='') as f: 81 | write = csv.writer(f) 82 | write.writerow(head) 83 | write.writerows(MAX) 84 | 85 | with open(f'newton_performance_{model}_{args.n_channel}_{args.n_gwrite}{postfix}.csv', 'w',newline='') as f: 86 | write = csv.writer(f) 87 | write.writerow(head + ["SPEED_UP (SPLIT)"]) 88 | write.writerows(NEWTON) 89 | 90 | process(args.model) 91 | -------------------------------------------------------------------------------- /layerwise/profile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MODEL=$1 3 | GPU=$2 4 | KERNEL_LAUNCH_LATENCY=$3 5 | N_CHANNEL=$4 6 | 7 | for (( i = 10 ; i < 101 ; i = i + 10 )); do 8 | python3 inspect_shape.py --model=$MODEL --split_ratio=$i --n_channel=$N_CHANNEL 9 | python3 run --trace --gpgpusim_config=$GPU --model=$MODEL --split_ratio=$i --kernel_launch_latency=$KERNEL_LAUNCH_LATENCY --n_channel=$N_CHANNEL 10 | python3 run --simulate --gpgpusim_config=$GPU --model=$MODEL --split_ratio=$i --kernel_launch_latency=$KERNEL_LAUNCH_LATENCY --n_channel=$N_CHANNEL 11 | python3 run --pim_codegen --model=$MODEL --split_ratio=$i --n_channel=$N_CHANNEL --n_gwrite=1 12 | python3 run --pim_codegen --model=$MODEL --split_ratio=$i --n_channel=$N_CHANNEL --n_gwrite=2 13 | python3 run --pim_codegen --model=$MODEL --split_ratio=$i --n_channel=$N_CHANNEL --n_gwrite=4 14 | python3 run --stat --gpgpusim_config=$GPU --model=$MODEL --split_ratio=$i --kernel_launch_latency=$KERNEL_LAUNCH_LATENCY --n_channel=$N_CHANNEL --n_gwrite=1 15 | python3 run --stat --gpgpusim_config=$GPU --model=$MODEL --split_ratio=$i --kernel_launch_latency=$KERNEL_LAUNCH_LATENCY --n_channel=$N_CHANNEL --n_gwrite=2 16 | python3 run --stat --gpgpusim_config=$GPU --model=$MODEL --split_ratio=$i --kernel_launch_latency=$KERNEL_LAUNCH_LATENCY --n_channel=$N_CHANNEL --n_gwrite=4 17 | 18 | if [ $i -eq 100 ]; then 19 | python3 run --pim --gpgpusim_config=$GPU --model=$MODEL --n_channel=$N_CHANNEL --n_gwrite=1 20 | python3 run --pim --gpgpusim_config=$GPU --model=$MODEL --n_channel=$N_CHANNEL --n_gwrite=2 21 | python3 run --pim --gpgpusim_config=$GPU --model=$MODEL --n_channel=$N_CHANNEL --n_gwrite=4 22 | fi 23 | sh clean.sh $MODEL $i $N_CHANNEL 24 | done 25 | 26 | python3 process_csv.py --model=$MODEL --n_gwrite=1 27 | python3 process_csv.py --model=$MODEL --n_gwrite=1 --ramulator_disable_gwrite_latency_hiding 28 | python3 process_csv.py --model=$MODEL --n_gwrite=2 29 | python3 process_csv.py --model=$MODEL --n_gwrite=2 --ramulator_disable_gwrite_latency_hiding 30 | python3 process_csv.py --model=$MODEL --n_gwrite=4 31 
| python3 process_csv.py --model=$MODEL --n_gwrite=4 --ramulator_disable_gwrite_latency_hiding
32 | -------------------------------------------------------------------------------- /layerwise/run: --------------------------------------------------------------------------------
1 | from subprocess import Popen
2 | import time
3 | import argparse
4 | import multiprocessing
5 | import os
6 | import numpy as np
7 | import csv
8 | import re
9 | import pandas as pd
10 | from pim.util import MODEL_LIST
11 | from torch.cuda import device_count
12 | parser = argparse.ArgumentParser()
13 | group = parser.add_mutually_exclusive_group(required=True)
14 | group.add_argument("--trace", action="store_true", help="create trace")
15 | group.add_argument("--simulate", action="store_true", help="simulate")
16 | group.add_argument("--pim_codegen", action="store_true", help="pim codegen")
17 | group.add_argument("--stat", action="store_true", help="record statistics")
18 | group.add_argument("--pim", action="store_true", help="pim_cycle")
19 | group.add_argument("--make_max_shape", action="store_true", help="make max shape")
20 | group.add_argument("--update_pim", action="store_true", help="update pim")
21 | parser.add_argument("--power", type=int, default=0)
22 | parser.add_argument("--gpgpusim_config", help="gpgpusim config (e.g. SM75_RTX2060)")
23 | parser.add_argument("--split_ratio", type=int, default=0)
24 | parser.add_argument("--device_id", type=int, default=0)
25 | parser.add_argument("--kernel_launch_latency", type=int, default=5010)
26 | parser.add_argument("--n_channel", type=int, default=16)
27 | parser.add_argument("--n_gwrite", type=int, default=4)
28 | parser.add_argument("--model", help="model", choices=MODEL_LIST, required=True)
29 | args = parser.parse_args()
30 |
31 | if args.trace and (args.gpgpusim_config is None):
32 | parser.error("--trace requires --gpgpusim_config")
33 | if args.simulate and (args.gpgpusim_config is None):
34 | parser.error("--simulate requires --gpgpusim_config")
35 |
36 | NGPU=list(range(device_count()))
37 | NCPU=multiprocessing.cpu_count()
38 |
39 | CONFIG_ALL=[]
40 | CONFIG_GPU=[]
41 | CONFIG_PIM=[]
42 | if args.power:
43 | with open(f"{args.model}_max_ratio_{args.n_channel}_{args.n_gwrite}.onnx_conv.csv") as f:
44 | rdr = csv.reader(f)
45 | args.split_ratio = -2
46 | for line in rdr:
47 | CONFIG_ALL.append(line)
48 | if "pim" in line[0]:
49 | CONFIG_PIM.append(line)
50 | else:
51 | CONFIG_GPU.append(line)
52 | elif args.make_max_shape:
53 | print('pass')
54 | else:
55 | with open(f"{args.model}_{args.n_channel}.onnx_conv.csv") as f:
56 | rdr = csv.reader(f)
57 | for line in rdr:
58 | CONFIG_ALL.append(line)
59 | if "pim" in line[0]:
60 | CONFIG_PIM.append(line)
61 | else:
62 | CONFIG_GPU.append(line)
63 |
64 | def get_device(n):
65 | return NGPU[n % len(NGPU)]
66 |
67 | def make_args(config, n_mem):
68 | script = ""
69 | if args.trace:
70 | script = "trace.sh"
71 | elif args.simulate:
72 | script = "sim.sh"
73 | elif args.stat:
74 | script = "stat.sh"
75 |
76 | dev = get_device(n_mem)
77 | print(config)
78 | a = f"export CUDA_VISIBLE_DEVICES={dev} && export TRACES_PATH=traces-{config[0]} && echo $CUDA_VISIBLE_DEVICES && ./{script} {config[0]} {config[5]} {config[2]} {config[6][1]} {config[6][3]} {config[8]} {config[7][1]} {config[7][3]} {config[10]} {config[9]} {config[11]} {config[3]} {config[4]} {args.gpgpusim_config} {args.split_ratio} {config[12]} {dev} {args.n_channel}"
79 | print(a)
80 | return a
81 |
82 | def trace(configs):
83 | procs = []
84 | n = 0
85 | r =
0 86 | for i in range(min(len(NGPU), len(configs))): 87 | procs.append(Popen(make_args(configs[i], i), shell=True, start_new_session=True)) 88 | r += 1 89 | time.sleep(3) 90 | while True: 91 | if n >= len(configs): 92 | break 93 | for i, p in enumerate(procs): 94 | if p is None: 95 | continue 96 | if p.poll() is not None: 97 | if r < len(configs): 98 | procs[i] = Popen(make_args(configs[r], i), shell=True, start_new_session=True) 99 | r += 1 100 | else: 101 | procs[i] = None 102 | n += 1 103 | time.sleep(3) 104 | time.sleep(3) 105 | 106 | def simulate(configs_gpu,configs_all): 107 | procs = [] 108 | n = 0 109 | r = 0 110 | for i in range(min(NCPU,len(configs_gpu))): 111 | pim_name="" 112 | if "added" in configs_gpu[i][0]: 113 | for j in range(len(configs_all)): 114 | if configs_all[j][0] == configs_gpu[i][0]: 115 | pim_name = configs_all[j+1][0] 116 | print(configs_gpu[i][0]) 117 | print(pim_name) 118 | procs.append(Popen(make_args(configs_gpu[i],i), shell=True, start_new_session=True)) 119 | r += 1 120 | time.sleep(3) 121 | 122 | while True: 123 | if n >= len(configs_gpu): 124 | break 125 | 126 | for i, p in enumerate(procs): 127 | if p is None: 128 | continue 129 | 130 | if p.poll() is not None: 131 | if r < len(configs_gpu): 132 | if "added" in configs_gpu[r][0]: 133 | for j in range(len(configs_all)): 134 | if configs_all[j][0] == configs_gpu[r][0]: 135 | pim_name = configs_all[j+1][0] 136 | print(configs_gpu[i][0]) 137 | print(pim_name) 138 | procs[i] = Popen(make_args(configs_gpu[r],i), shell=True, start_new_session=True) 139 | else: 140 | procs[i] = None 141 | 142 | r += 1 143 | n += 1 144 | time.sleep(3) 145 | 146 | time.sleep(3) 147 | 148 | def statistics(configs_gpu,configs_pim,configs_all): 149 | if args.gpgpusim_config == "SM75_RTX2060" or args.gpgpusim_config== "SM75_RTX2060_S": 150 | # scale=1.605882353 # HBM 151 | scale = 1.56 # GDDR6 152 | elif args.gpgpusim_config == "SM7_TITANV": 153 | scale=1.411764706 154 | else: 155 | assert False 156 | 157 | pim_cycles = {} 158 | for c in range(len(configs_pim)): 159 | pname = f"{configs_pim[c][0]}_{args.n_channel}/PIM_trace_partition_{args.n_channel}_{args.n_gwrite}-{32-args.n_channel}.pim" 160 | cyc = os.popen(f"/root/PIMFlow_ramulator/ramulator /root/PIMFlow_ramulator/configs/GDDR6-config.cfg --mode=dram {pname} | grep Cycle").read() 161 | cyc = re.sub(r'[^0-9]', '', cyc) 162 | pim_cycles[configs_pim[c][0]]=int(cyc) * scale 163 | print(pim_cycles[configs_pim[c][0]]) 164 | 165 | pim_cycles_noopt = {} 166 | for c in range(len(configs_pim)): 167 | pname = f"{configs_pim[c][0]}_{args.n_channel}/PIM_trace_partition_{args.n_channel}_{args.n_gwrite}-{32-args.n_channel}.pim" 168 | cyc = os.popen(f"RAMULATOR_DISABLE_GWRITE_LATENCY_HIDING=1 /root/PIMFlow_ramulator/ramulator /root/PIMFlow_ramulator/configs/GDDR6-config.cfg --mode=dram {pname} | grep Cycle").read() 169 | cyc = re.sub(r'[^0-9]', '', cyc) 170 | pim_cycles_noopt[configs_pim[c][0]]=int(cyc) * scale 171 | print(pim_cycles_noopt[configs_pim[c][0]]) 172 | 173 | gpu_cycles = {} 174 | for c in range(len(configs_gpu)): 175 | fname = f"traces-{configs_gpu[c][0]}.txt" 176 | start = 0 177 | end = 0 178 | with open(fname) as f: 179 | lines = f.readlines() 180 | 181 | tot_runs = [] 182 | runs = [] 183 | energy = 0 184 | for i, l in enumerate(lines): 185 | if l.find("gpu_tot_sim_cycle") != -1: 186 | tot_runs.append(int(lines[i].split("=")[1])) 187 | if l.find("kernel_name") != -1 and l.find("forward_kernel_cuda") == -1: 188 | runs.append(int(lines[i+1].split("=")[1])) 189 | # assert len(runs) 
== 3 190 | gpu_cycles[configs_gpu[c][0]]=int(tot_runs[len(tot_runs)-1]) 191 | 192 | with open(f'{args.model}_split{args.split_ratio}_{args.n_channel}_{args.n_gwrite}.csv','w', newline='') as f: 193 | wr = csv.writer(f) 194 | wr.writerow(['kernel_name','I_c','O_c','H','W','kernel_size','pads','strides','group','dilations','bias','activation','GPU cycles','PIM cycles']) 195 | for c in range(len(configs_all)): 196 | if "pim"in configs_all[c][0]: 197 | continue 198 | elif "added" in configs_all[c][0]: 199 | wr.writerow([configs_all[c][0],configs_all[c][2],configs_all[c][5],configs_all[c][3],configs_all[c][4],configs_all[c][6],configs_all[c][7],configs_all[c][8],configs_all[c][9],configs_all[c][10],configs_all[c][11],configs_all[c][12],gpu_cycles.get(configs_all[c][0],0),pim_cycles.get(configs_all[c+1][0],0)]) 200 | else: 201 | wr.writerow([configs_all[c][0],configs_all[c][2],configs_all[c][5],configs_all[c][3],configs_all[c][4],configs_all[c][6],configs_all[c][7],configs_all[c][8],configs_all[c][9],configs_all[c][10],configs_all[c][11],configs_all[c][12],gpu_cycles.get(configs_all[c][0],0),0]) 202 | 203 | with open(f'{args.model}_split{args.split_ratio}_{args.n_channel}_{args.n_gwrite}_noopt.csv','w', newline='') as f: 204 | wr = csv.writer(f) 205 | wr.writerow(['kernel_name','I_c','O_c','H','W','kernel_size','pads','strides','group','dilations','bias','activation','GPU cycles','PIM cycles']) 206 | for c in range(len(configs_all)): 207 | if "pim"in configs_all[c][0]: 208 | continue 209 | elif "added" in configs_all[c][0]: 210 | wr.writerow([configs_all[c][0],configs_all[c][2],configs_all[c][5],configs_all[c][3],configs_all[c][4],configs_all[c][6],configs_all[c][7],configs_all[c][8],configs_all[c][9],configs_all[c][10],configs_all[c][11],configs_all[c][12],gpu_cycles.get(configs_all[c][0],0),pim_cycles_noopt.get(configs_all[c+1][0],0)]) 211 | else: 212 | wr.writerow([configs_all[c][0],configs_all[c][2],configs_all[c][5],configs_all[c][3],configs_all[c][4],configs_all[c][6],configs_all[c][7],configs_all[c][8],configs_all[c][9],configs_all[c][10],configs_all[c][11],configs_all[c][12],gpu_cycles.get(configs_all[c][0],0),0]) 213 | 214 | if args.split_ratio == 100: 215 | gpu_cycles = {} 216 | for c in range(len(configs_gpu)): 217 | fname = f"traces-{configs_gpu[c][0]}-baseline.txt" 218 | start = 0 219 | end = 0 220 | with open(fname) as f: 221 | lines = f.readlines() 222 | 223 | # if start != 3 or end != 3: 224 | # print(f"SKIP: {fname}") 225 | # continue 226 | tot_runs = [] 227 | runs = [] 228 | energy = 0 229 | for i, l in enumerate(lines): 230 | if l.find("gpu_tot_sim_cycle") != -1: 231 | tot_runs.append(int(lines[i].split("=")[1])) 232 | if l.find("kernel_name") != -1 and l.find("forward_kernel_cuda") == -1: 233 | runs.append(int(lines[i+1].split("=")[1])) 234 | # assert len(runs) == 3 235 | gpu_cycles[configs_gpu[c][0]]=int(tot_runs[len(tot_runs)-1]) 236 | 237 | with open(f'{args.model}_split{args.split_ratio}-baseline.csv','w', newline='') as f: 238 | wr = csv.writer(f) 239 | wr.writerow(['kernel_name','I_c','O_c','H','W','kernel_size','pads','strides','group','dilations','bias','activation','GPU cycles','PIM cycles']) 240 | for c in range(len(configs_all)): 241 | if "pim"in configs_all[c][0]: 242 | continue 243 | elif "added" in configs_all[c][0]: 244 | 
wr.writerow([configs_all[c][0],configs_all[c][2],configs_all[c][5],configs_all[c][3],configs_all[c][4],configs_all[c][6],configs_all[c][7],configs_all[c][8],configs_all[c][9],configs_all[c][10],configs_all[c][11],configs_all[c][12],gpu_cycles.get(configs_all[c][0],0),pim_cycles.get(configs_all[c+1][0],0)]) 245 | else: 246 | wr.writerow([configs_all[c][0],configs_all[c][2],configs_all[c][5],configs_all[c][3],configs_all[c][4],configs_all[c][6],configs_all[c][7],configs_all[c][8],configs_all[c][9],configs_all[c][10],configs_all[c][11],configs_all[c][12],gpu_cycles.get(configs_all[c][0],0),0]) 247 | 248 | def pim(configs_all): 249 | if args.gpgpusim_config== "SM75_RTX2060" or args.gpgpusim_config== "SM75_RTX2060_S": 250 | # scale=1.605882353 # HBM 251 | scale = 1.56 # GDDR6 252 | elif args.gpgpusim_config== "SM7_TITANV": 253 | scale=1.411764706 254 | else: 255 | assert False 256 | pim_cycles = {} 257 | with open(f'{args.model}_split0_{args.n_channel}_{args.n_gwrite}.csv','w', newline='') as f: 258 | wr = csv.writer(f) 259 | wr.writerow(['kernel_name','I_c','O_c','H','W','kernel_size','pads','strides','group','dilations','bias','activation','GPU cycles','PIM cycles']) 260 | for c in range(len(configs_all)): 261 | pname = f"{configs_all[c][0]}_{args.n_channel}/PIM_trace_partition_{args.n_channel}_{args.n_gwrite}-{32 - args.n_channel}.pim" 262 | cyc = os.popen(f"/root/PIMFlow_ramulator/ramulator /root/PIMFlow_ramulator/configs/GDDR6-config.cfg --mode=dram {pname} | grep Cycle").read() 263 | cyc = re.sub(r'[^0-9]', '', cyc) 264 | pim_cycles[configs_all[c][0]]=int(cyc) * scale 265 | print(pim_cycles[configs_all[c][0]]) 266 | wr.writerow([configs_all[c][0],configs_all[c][2],configs_all[c][5],configs_all[c][3],configs_all[c][4],configs_all[c][6],configs_all[c][7],configs_all[c][8],configs_all[c][9],configs_all[c][10],configs_all[c][11],configs_all[c][12],0,pim_cycles.get(configs_all[c][0],0)]) 267 | pim_cycles_noopt = {} 268 | with open(f'{args.model}_split0_{args.n_channel}_{args.n_gwrite}_noopt.csv','w', newline='') as f: 269 | wr = csv.writer(f) 270 | wr.writerow(['kernel_name','I_c','O_c','H','W','kernel_size','pads','strides','group','dilations','bias','activation','GPU cycles','PIM cycles']) 271 | for c in range(len(configs_all)): 272 | pname = f"{configs_all[c][0]}_{args.n_channel}/PIM_trace_partition_{args.n_channel}_{args.n_gwrite}-{32 - args.n_channel}.pim" 273 | cyc = os.popen(f"RAMULATOR_DISABLE_GWRITE_LATENCY_HIDING=1 /root/PIMFlow_ramulator/ramulator /root/PIMFlow_ramulator/configs/GDDR6-config.cfg --mode=dram {pname} | grep Cycle").read() 274 | cyc = re.sub(r'[^0-9]', '', cyc) 275 | pim_cycles_noopt[configs_all[c][0]]=int(cyc) * scale 276 | print(pim_cycles_noopt[configs_all[c][0]]) 277 | wr.writerow([configs_all[c][0],configs_all[c][2],configs_all[c][5],configs_all[c][3],configs_all[c][4],configs_all[c][6],configs_all[c][7],configs_all[c][8],configs_all[c][9],configs_all[c][10],configs_all[c][11],configs_all[c][12],0,pim_cycles_noopt.get(configs_all[c][0],0)]) 278 | 279 | def update_pim(configs_all, split_ratio): 280 | if args.gpgpusim_config == "SM75_RTX2060": 281 | # scale = 1.605882353 # HBM 282 | scale = 1.56 # GDDR6 283 | elif args.gpgpusim_config == "SM7_TITANV": 284 | scale = 1.411764706 285 | else: 286 | assert False 287 | 288 | f = pd.read_csv(f'{args.model}_split{split_ratio}_{args.n_channel}_{args.n_gwrite}.csv', delimiter=',') 289 | header = list(f.columns) 290 | rows = [list(row) for row in f.values] 291 | 292 | pim_cycles = {} 293 | for i, c in enumerate(configs_all): 
294 | if split_ratio == 0 or split_ratio == 100: 295 | idx = i 296 | elif "pim" in c[0]: 297 | continue 298 | elif "added" in c[0]: 299 | idx = i + 1 300 | else: 301 | idx = i 302 | 303 | pname = f"{configs_all[idx][0]}_{args.n_channel}/PIM_trace_partition_{args.n_channel}_{args.n_gwrite}-{32 - args.n_channel}.pim" 304 | cyc = os.popen(f"/root/PIMFlow_ramulator/ramulator /root/PIMFlow_ramulator/configs/GDDR6-config.cfg --mode=dram {pname} | grep Cycle").read() 305 | cyc = re.sub(r'[^0-9]', '', cyc) 306 | 307 | pim_cycles[c[0]] = int(cyc) * scale 308 | if split_ratio != 0 and int(c[9]) > 1: 309 | pim_cycles[c[0]] = 0 310 | 311 | with open(f'{args.model}_split{split_ratio}_{args.n_channel}_{args.n_gwrite}.csv', 'w', newline='') as g: 312 | writer = csv.writer(g) 313 | writer.writerow(header) 314 | for row in rows: 315 | row[13] = pim_cycles[row[0]] 316 | writer.writerow(row) 317 | 318 | f = pd.read_csv(f'{args.model}_split{split_ratio}_{args.n_channel}_{args.n_gwrite}_noopt.csv', delimiter=',') 319 | header = list(f.columns) 320 | rows = [list(row) for row in f.values] 321 | 322 | pim_cycles_noopt = {} 323 | for i, c in enumerate(configs_all): 324 | if split_ratio == 0 or split_ratio == 100: 325 | idx = i 326 | elif "pim" in c[0]: 327 | continue 328 | elif "added" in c[0]: 329 | idx = i + 1 330 | else: 331 | idx = i 332 | 333 | pname = f"{configs_all[idx][0]}_{args.n_channel}/PIM_trace_partition_{args.n_channel}_{args.n_gwrite}-{32 - args.n_channel}.pim" 334 | cyc = os.popen(f"RAMULATOR_DISABLE_GWRITE_LATENCY_HIDING=1 /root/PIMFlow_ramulator/ramulator /root/PIMFlow_ramulator/configs/GDDR6-config.cfg --mode=dram {pname} | grep Cycle").read() 335 | cyc = re.sub(r'[^0-9]', '', cyc) 336 | 337 | pim_cycles_noopt[c[0]] = int(cyc) * scale 338 | if split_ratio != 0 and int(c[9]) > 1: 339 | pim_cycles_noopt[c[0]] = 0 340 | 341 | with open(f'{args.model}_split{split_ratio}_{args.n_channel}_{args.n_gwrite}_noopt.csv', 'w', newline='') as g: 342 | writer = csv.writer(g) 343 | writer.writerow(header) 344 | for row in rows: 345 | row[13] = pim_cycles_noopt[row[0]] 346 | writer.writerow(row) 347 | 348 | if args.trace: 349 | trace(CONFIG_GPU) 350 | 351 | if args.simulate: 352 | simulate(CONFIG_GPU, CONFIG_ALL) 353 | 354 | if args.pim_codegen: 355 | for config in CONFIG_PIM: 356 | os.system(f'sh pim_codegen.sh {config[0]}_{args.n_channel} {config[5]} {config[2]} {config[6][1]} {config[6][3]} {config[8]} {config[7][1]} {config[7][3]} {config[10]} {config[9]} {config[11]} {config[3]} {config[4]} {args.n_channel} {args.n_gwrite}') 357 | 358 | if args.stat: 359 | statistics(CONFIG_GPU, CONFIG_PIM, CONFIG_ALL) 360 | 361 | if args.pim: 362 | for config in CONFIG_ALL: 363 | os.system(f'sh pim_codegen.sh {config[0]}_{args.n_channel} {config[5]} {config[2]} {config[6][1]} {config[6][3]} {config[8]} {config[7][1]} {config[7][3]} {config[10]} {config[9]} {config[11]} {config[3]} {config[4]} {args.n_channel} {args.n_gwrite}') 364 | pim(CONFIG_ALL) 365 | 366 | if args.make_max_shape: 367 | ratio=0 368 | CONFIG_RESULT=[] 369 | while(ratio<=100): 370 | if ratio == 0: 371 | os.system(f"python3 inspect_shape.py --model={args.model} --split_ratio=100") 372 | else: 373 | os.system(f"python3 inspect_shape.py --model={args.model} --split_ratio={ratio}") 374 | CONFIG_ALL_max_shape=[] 375 | CONFIG_GPU_max_shape=[] 376 | CONFIG_PIM_max_shape=[] 377 | with open(f"{args.model}_{args.n_channel}.onnx_conv.csv") as f: 378 | rdr = csv.reader(f) 379 | for line in rdr: 380 | line.append(ratio) 381 | CONFIG_ALL_max_shape.append(line) 382 | 
if "pim" in line[0]: 383 | CONFIG_PIM_max_shape.append(line) 384 | else: 385 | CONFIG_GPU_max_shape.append(line) 386 | 387 | max_csv = pd.read_csv(f"./max_performance_{args.model}_{args.n_channel}_{args.n_gwrite}.csv") 388 | ratio_test = max_csv['RATIO'] 389 | loc= np.where(ratio_test==ratio) 390 | 391 | for i in (loc[0]): 392 | if "added" in CONFIG_GPU_max_shape[i][0]: 393 | CONFIG_RESULT.append(CONFIG_GPU_max_shape[i]) 394 | CONFIG_RESULT.append(CONFIG_ALL_max_shape[CONFIG_ALL_max_shape.index(CONFIG_GPU_max_shape[i])+1]) 395 | else: 396 | CONFIG_RESULT.append(CONFIG_GPU_max_shape[i]) 397 | ratio = ratio+10 398 | print(CONFIG_RESULT) 399 | f = open(f'{args.model}_max_ratio_{args.n_channel}_{args.n_gwrite}.onnx_conv.csv','w', newline='') 400 | wr = csv.writer(f) 401 | for c in CONFIG_RESULT: 402 | wr.writerow(c) 403 | 404 | if args.update_pim: 405 | assert args.n_channel < 32 or args.split_ratio == 100 and args.n_channel == 32 406 | 407 | split_ratio = args.split_ratio 408 | if args.split_ratio == 100 and args.n_channel == 32: 409 | os.system(f"cp {args.model}_split100-baseline.csv {args.model}_split100_32.csv") 410 | else: 411 | if split_ratio == 100: 412 | split_ratio = 0 413 | for config in CONFIG_ALL: 414 | os.system(f'sh pim_codegen.sh {config[0]}_{args.n_channel} {config[5]} {config[2]} {config[6][1]} {config[6][3]} {config[8]} {config[7][1]} {config[7][3]} {config[10]} {config[9]} {config[11]} {config[3]} {config[4]} {args.n_channel} {args.n_gwrite}') 415 | update_pim(CONFIG_ALL, split_ratio) 416 | 417 | print("Finished!") 418 | -------------------------------------------------------------------------------- /layerwise/run_matmul: -------------------------------------------------------------------------------- 1 | from subprocess import Popen 2 | import time 3 | import argparse 4 | import multiprocessing 5 | import os 6 | import numpy as np 7 | import csv 8 | import re 9 | import math 10 | from pim.util import MODEL_LIST 11 | from torch.cuda import device_count 12 | parser = argparse.ArgumentParser() 13 | group = parser.add_mutually_exclusive_group(required=True) 14 | group.add_argument("--trace", action="store_true", help="create trace") 15 | group.add_argument("--simulate", action="store_true", help="simulate") 16 | group.add_argument("--pim_codegen", action="store_true", help="pim codegen") 17 | group.add_argument("--stat", action="store_true", help="record statistics") 18 | group.add_argument("--pim", action="store_true", help="pim_cycle") 19 | parser.add_argument("--gpgpusim_config", help="gpgpusim config (e.g. 
SM75_RTX2060") 20 | parser.add_argument("--split_ratio", type=int, default=0) 21 | parser.add_argument("--device_id", type=int, default=0) 22 | parser.add_argument("--kernel_launch_latency", type=int, default=5010) 23 | parser.add_argument("--n_channel", type=int, default=16) 24 | parser.add_argument("--n_gwrite", type=int, default=4) 25 | parser.add_argument("--model", help="model", choices=MODEL_LIST, required=True) 26 | args = parser.parse_args() 27 | 28 | if args.trace and (args.gpgpusim_config is None): 29 | parser.error("--trace requires --gpgpusim_config") 30 | if args.simulate and (args.gpgpusim_config is None): 31 | parser.error("--simulate requires --gpgpusim_config") 32 | 33 | NGPU=list(range(device_count())) 34 | NCPU=multiprocessing.cpu_count() 35 | 36 | CONFIG=[] 37 | with open(f"{args.model}_{args.n_channel}.onnx_matmul.csv") as f: 38 | rdr = csv.reader(f) 39 | for line in rdr: 40 | CONFIG.append(line) 41 | 42 | def get_device(n): 43 | return NGPU[n % len(NGPU)] 44 | 45 | def make_args(config, n_mem): 46 | script = "" 47 | if args.trace: 48 | script = "trace_matmul.sh" 49 | elif args.simulate: 50 | script = "sim_matmul.sh" 51 | 52 | dev = get_device(n_mem) 53 | print(config) 54 | 55 | batch_size = math.ceil(int(config[5]) * args.split_ratio / 100) 56 | a = f"export CUDA_VISIBLE_DEVICES={dev} && export TRACES_PATH=traces-matmul-{config[0]}-{args.split_ratio} && echo $CUDA_VISIBLE_DEVICES && ./{script} {config[0]}-{args.split_ratio} {config[1]} {config[2]} {config[3]} {config[4]} {batch_size} {args.gpgpusim_config} {args.split_ratio} {args.n_channel}" 57 | print(a) 58 | return a 59 | 60 | def trace(configs): 61 | procs = [] 62 | n = 0 63 | r = 0 64 | for i in range(min(len(NGPU), len(configs))): 65 | procs.append(Popen(make_args(configs[i], i), shell=True, start_new_session=True)) 66 | r += 1 67 | time.sleep(3) 68 | while True: 69 | if n >= len(configs): 70 | break 71 | for i, p in enumerate(procs): 72 | if p is None: 73 | continue 74 | if p.poll() is not None: 75 | if r < len(configs): 76 | procs[i] = Popen(make_args(configs[r], i), shell=True, start_new_session=True) 77 | r += 1 78 | else: 79 | procs[i] = None 80 | n += 1 81 | time.sleep(3) 82 | time.sleep(3) 83 | 84 | def simulate(configs): 85 | procs = [] 86 | n = 0 87 | r = 0 88 | for i in range(min(NCPU, len(configs))): 89 | procs.append(Popen(make_args(configs[i], i), shell=True, start_new_session=True)) 90 | r += 1 91 | time.sleep(3) 92 | 93 | while True: 94 | if n >= len(configs): 95 | break 96 | 97 | for i, p in enumerate(procs): 98 | if p is None: 99 | continue 100 | 101 | if p.poll() is not None: 102 | if r < len(configs): 103 | procs[i] = Popen(make_args(configs[r], i), shell=True, start_new_session=True) 104 | else: 105 | procs[i] = None 106 | 107 | r += 1 108 | n += 1 109 | time.sleep(3) 110 | 111 | time.sleep(3) 112 | 113 | def statistics(configs): 114 | if args.gpgpusim_config == "SM75_RTX2060": 115 | # scale=1.605882353 # HBM 116 | scale = 1.56 # GDDR6 117 | elif args.gpgpusim_config == "SM7_TITANV": 118 | scale=1.411764706 119 | else: 120 | assert False 121 | 122 | pim_cycles = {} 123 | for c in configs: 124 | pname = f"{c[0]}-{args.split_ratio}-matmul/PIM_trace_partition_{args.n_channel}_{args.n_gwrite}-{32-args.n_channel}.pim" 125 | cyc = os.popen(f"/root/PIMFlow_ramulator/ramulator /root/PIMFlow_ramulator/configs/GDDR6-config.cfg --mode=dram {pname} | grep Cycle").read() 126 | cyc = re.sub(r'[^0-9]', '', cyc) 127 | if int(cyc) == 1: # no trace 128 | continue 129 | pim_cycles[c[0]]=int(cyc) * scale 130 | 
print(pim_cycles[c[0]]) 131 | 132 | pim_cycles_noopt = {} 133 | for c in configs: 134 | pname = f"{c[0]}-{args.split_ratio}-matmul/PIM_trace_partition_{args.n_channel}_{args.n_gwrite}-{32-args.n_channel}.pim" 135 | cyc = os.popen(f"RAMULATOR_DISABLE_GWRITE_LATENCY_HIDING=1 /root/PIMFlow_ramulator/ramulator /root/PIMFlow_ramulator/configs/GDDR6-config.cfg --mode=dram {pname} | grep Cycle").read() 136 | cyc = re.sub(r'[^0-9]', '', cyc) 137 | if int(cyc) == 1: # no trace 138 | continue 139 | pim_cycles_noopt[c[0]]=int(cyc) * scale 140 | print(pim_cycles_noopt[c[0]]) 141 | 142 | gpu_cycles = {} 143 | for c in configs: 144 | fname = f"traces-matmul-{c[0]}-{args.split_ratio}.txt" 145 | start = 0 146 | end = 0 147 | with open(fname) as f: 148 | lines = f.readlines() 149 | 150 | tot_runs = [] 151 | runs = [] 152 | energy = 0 153 | for i, l in enumerate(lines): 154 | if l.find("gpu_tot_sim_cycle") != -1: 155 | tot_runs.append(int(lines[i].split("=")[1])) 156 | if l.find("kernel_name") != -1 and l.find("forward_kernel_cuda") == -1: 157 | runs.append(int(lines[i+1].split("=")[1])) 158 | # assert len(runs) == 3 159 | gpu_cycles[c[0]]=int(tot_runs[len(tot_runs)-1]) 160 | 161 | with open(f'{args.model}_matmul{args.split_ratio}_{args.n_channel}_{args.n_gwrite}.csv','w', newline='') as f: 162 | wr = csv.writer(f) 163 | wr.writerow(['kernel_name','row','col','bias','activation','GPU cycles','PIM cycles']) 164 | for c in configs: 165 | wr.writerow([c[0],c[1],c[2],c[3],c[4],gpu_cycles.get(c[0],0),pim_cycles.get(c[0],0)]) 166 | 167 | with open(f'{args.model}_matmul{args.split_ratio}_{args.n_channel}_{args.n_gwrite}_noopt.csv','w', newline='') as f: 168 | wr = csv.writer(f) 169 | wr.writerow(['kernel_name','row','col','bias','activation','GPU cycles','PIM cycles']) 170 | for c in configs: 171 | wr.writerow([c[0],c[1],c[2],c[3],c[4],gpu_cycles.get(c[0],0),pim_cycles_noopt.get(c[0],0)]) 172 | 173 | if args.split_ratio == 100: 174 | gpu_cycles = {} 175 | for c in configs: 176 | fname = f"traces-matmul-{c[0]}-{args.split_ratio}-baseline.txt" 177 | start = 0 178 | end = 0 179 | with open(fname) as f: 180 | lines = f.readlines() 181 | 182 | # if start != 3 or end != 3: 183 | # print(f"SKIP: {fname}") 184 | # continue 185 | tot_runs = [] 186 | runs = [] 187 | energy = 0 188 | for i, l in enumerate(lines): 189 | if l.find("gpu_tot_sim_cycle") != -1: 190 | tot_runs.append(int(lines[i].split("=")[1])) 191 | if l.find("kernel_name") != -1 and l.find("forward_kernel_cuda") == -1: 192 | runs.append(int(lines[i+1].split("=")[1])) 193 | # assert len(runs) == 3 194 | gpu_cycles[c[0]]=int(tot_runs[len(tot_runs)-1]) 195 | 196 | with open(f'{args.model}_matmul{args.split_ratio}-baseline.csv','w', newline='') as f: 197 | wr = csv.writer(f) 198 | wr.writerow(['kernel_name','row','col','bias','activation','GPU cycles','PIM cycles']) 199 | for c in configs: 200 | wr.writerow([c[0],c[1],c[2],c[3],c[4],gpu_cycles.get(c[0],0),0]) 201 | 202 | def pim(configs): 203 | if args.gpgpusim_config== "SM75_RTX2060": 204 | # scale=1.605882353 # HBM 205 | scale = 1.56 # GDDR6 206 | elif args.gpgpusim_config== "SM7_TITANV": 207 | scale=1.411764706 208 | else: 209 | assert False 210 | pim_cycles = {} 211 | with open(f'{args.model}_matmul0_{args.n_channel}_{args.n_gwrite}.csv','w', newline='') as f: 212 | wr = csv.writer(f) 213 | wr.writerow(['kernel_name','row','col','bias','activation','GPU cycles','PIM cycles']) 214 | for c in configs: 215 | pname = f"{c[0]}-0-matmul/PIM_trace_partition_{args.n_channel}_{args.n_gwrite}-{32 - args.n_channel}.pim" 
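# How one PIM cycle number is produced (sketch; assumes ramulator's dram mode
# prints a single summary line containing "Cycle"): pname is the command
# stream scheduled onto the first reserved PIM channel (channel id
# 32 - n_channel), ramulator replays it against the GDDR6 timing model, grep
# keeps the summary line, and re.sub below strips everything but the digits.
# The raw count is in DRAM clocks, so it is multiplied by `scale` (1.56 for
# GDDR6 vs. the RTX 2060 core clock) to express PIM latency in GPU cycles,
# comparable with the gpu_tot_sim_cycle values gathered in statistics().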
216 | cyc = os.popen(f"/root/PIMFlow_ramulator/ramulator /root/PIMFlow_ramulator/configs/GDDR6-config.cfg --mode=dram {pname} | grep Cycle").read() 217 | cyc = re.sub(r'[^0-9]', '', cyc) 218 | pim_cycles[c[0]]=int(cyc) * scale 219 | wr.writerow([c[0],c[1],c[2],c[3],c[4],0,pim_cycles.get(c[0],0)]) 220 | pim_cycles_noopt = {} 221 | with open(f'{args.model}_matmul0_{args.n_channel}_{args.n_gwrite}_noopt.csv','w', newline='') as f: 222 | wr = csv.writer(f) 223 | wr.writerow(['kernel_name','row','col','bias','activation','GPU cycles','PIM cycles']) 224 | for c in configs: 225 | pname = f"{c[0]}-0-matmul/PIM_trace_partition_{args.n_channel}_{args.n_gwrite}-{32 - args.n_channel}.pim" 226 | cyc = os.popen(f"RAMULATOR_DISABLE_GWRITE_LATENCY_HIDING=1 /root/PIMFlow_ramulator/ramulator /root/PIMFlow_ramulator/configs/GDDR6-config.cfg --mode=dram {pname} | grep Cycle").read() 227 | cyc = re.sub(r'[^0-9]', '', cyc) 228 | pim_cycles_noopt[c[0]]=int(cyc) * scale 229 | wr.writerow([c[0],c[1],c[2],c[3],c[4],0,pim_cycles_noopt.get(c[0],0)]) 230 | 231 | if args.trace: 232 | trace(CONFIG) 233 | 234 | if args.simulate: 235 | simulate(CONFIG) 236 | 237 | if args.pim_codegen: 238 | for config in CONFIG: 239 | batch_size = math.ceil(int(config[5]) * args.split_ratio / 100) 240 | os.system(f'sh pim_codegen.sh {config[0]}-{args.split_ratio}-matmul {config[1]} {config[2]} 1 1 1 0 0 1 1 {config[3]} {int(config[5]) - batch_size} 1 {args.n_channel} {args.n_gwrite}') 241 | 242 | if args.stat: 243 | statistics(CONFIG) 244 | 245 | if args.pim: 246 | for config in CONFIG: 247 | os.system(f'sh pim_codegen.sh {config[0]}-0-matmul {config[1]} {config[2]} 1 1 1 0 0 1 1 {config[3]} {int(config[5])} 1 {args.n_channel} {args.n_gwrite}') 248 | pim(CONFIG) 249 | print("Finished!") 250 | -------------------------------------------------------------------------------- /layerwise/sim.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | NAME=$1 4 | OUT_CHANNELS=$2 5 | IN_CHANNELS=$3 6 | KH=$4 7 | KW=$5 8 | STRIDE=$6 9 | PH=$7 10 | PW=$8 11 | DILATE=$9 12 | GROUP=${10} 13 | BIAS=${11} 14 | IMAGE_HEIGHT=${12} 15 | IMAGE_WIDTH=${13} 16 | GPU=${14} 17 | RATIO=${15} 18 | ACTIVATION=${16} 19 | DEVICE_ID=${17} 20 | N_CHANNEL=${18} 21 | 22 | EXTRA_GPU_CONFIG_1="-gpgpu_n_mem $((32-$N_CHANNEL)) -gpgpu_deadlock_detect 0" 23 | EXTRA_GPU_CONFIG_2="-gpgpu_n_mem 32 -gpgpu_deadlock_detect 0" 24 | # TODO: add PIM_PATH 25 | 26 | BASE_PATH="/root/PIMFlow_accel-sim-framework" 27 | 28 | export CUDA_INSTALL_PATH=/usr/local/cuda 29 | source "$BASE_PATH/gpu-simulator/setup_environment.sh" 30 | if [ $RATIO -eq 100 ] 31 | then 32 | timeout 21600 $BASE_PATH/gpu-simulator/bin/release/accel-sim.out -trace "traces-$NAME/kernelslist.g" -config "$BASE_PATH/gpu-simulator/configs/tested-cfgs/$GPU/trace.config" -config "$BASE_PATH/gpu-simulator/gpgpu-sim/configs/tested-cfgs/$GPU/gpgpusim.config" $EXTRA_GPU_CONFIG_2 | grep -E "kernel_name|gpu_sim_cycle|gpu_tot_sim_cycle" &> traces-$NAME-baseline.txt 33 | timeout 21600 $BASE_PATH/gpu-simulator/bin/release/accel-sim.out -trace "traces-$NAME/kernelslist.g" -config "$BASE_PATH/gpu-simulator/configs/tested-cfgs/$GPU/trace.config" -config "$BASE_PATH/gpu-simulator/gpgpu-sim/configs/tested-cfgs/$GPU/gpgpusim.config" $EXTRA_GPU_CONFIG_1 | grep -E "kernel_name|gpu_sim_cycle|gpu_tot_sim_cycle" &> traces-$NAME.txt 34 | else 35 | timeout 21600 $BASE_PATH/gpu-simulator/bin/release/accel-sim.out -trace "traces-$NAME/kernelslist.g" -config 
"$BASE_PATH/gpu-simulator/configs/tested-cfgs/$GPU/trace.config" -config "$BASE_PATH/gpu-simulator/gpgpu-sim/configs/tested-cfgs/$GPU/gpgpusim.config" $EXTRA_GPU_CONFIG_1 | grep -E "kernel_name|gpu_sim_cycle|gpu_tot_sim_cycle" &> traces-$NAME.txt 36 | fi 37 | -------------------------------------------------------------------------------- /layerwise/sim_matmul.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | NAME=$1 4 | ROW=$2 5 | COL=$3 6 | BIAS=$4 7 | ACTIVATION=$5 8 | BATCH_SIZE=$6 9 | GPU=$7 10 | RATIO=$8 11 | N_CHANNEL=$9 12 | 13 | EXTRA_GPU_CONFIG_1="-gpgpu_n_mem $((32-$N_CHANNEL)) -gpgpu_deadlock_detect 0" 14 | EXTRA_GPU_CONFIG_2="-gpgpu_n_mem 32 -gpgpu_deadlock_detect 0" 15 | 16 | BASE_PATH="/root/PIMFlow_accel-sim-framework" 17 | 18 | export CUDA_INSTALL_PATH=/usr/local/cuda 19 | source "$BASE_PATH/gpu-simulator/setup_environment.sh" 20 | if [ $RATIO -eq 100 ] 21 | then 22 | timeout 21600 $BASE_PATH/gpu-simulator/bin/release/accel-sim.out -trace "traces-matmul-$NAME/kernelslist.g" -config "$BASE_PATH/gpu-simulator/configs/tested-cfgs/$GPU/trace.config" -config "$BASE_PATH/gpu-simulator/gpgpu-sim/configs/tested-cfgs/$GPU/gpgpusim.config" $EXTRA_GPU_CONFIG_2 | grep -E "kernel_name|gpu_sim_cycle|gpu_tot_sim_cycle" &> traces-matmul-$NAME-baseline.txt 23 | timeout 21600 $BASE_PATH/gpu-simulator/bin/release/accel-sim.out -trace "traces-matmul-$NAME/kernelslist.g" -config "$BASE_PATH/gpu-simulator/configs/tested-cfgs/$GPU/trace.config" -config "$BASE_PATH/gpu-simulator/gpgpu-sim/configs/tested-cfgs/$GPU/gpgpusim.config" $EXTRA_GPU_CONFIG_1 | grep -E "kernel_name|gpu_sim_cycle|gpu_tot_sim_cycle" &> traces-matmul-$NAME.txt 24 | else 25 | timeout 21600 $BASE_PATH/gpu-simulator/bin/release/accel-sim.out -trace "traces-matmul-$NAME/kernelslist.g" -config "$BASE_PATH/gpu-simulator/configs/tested-cfgs/$GPU/trace.config" -config "$BASE_PATH/gpu-simulator/gpgpu-sim/configs/tested-cfgs/$GPU/gpgpusim.config" $EXTRA_GPU_CONFIG_1 | grep -E "kernel_name|gpu_sim_cycle|gpu_tot_sim_cycle" &> traces-matmul-$NAME.txt 26 | fi 27 | -------------------------------------------------------------------------------- /layerwise/trace.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | NAME=$1 4 | OUT_CHANNELS=$2 5 | IN_CHANNELS=$3 6 | KH=$4 7 | KW=$5 8 | STRIDE=$6 9 | PH=$7 10 | PW=$8 11 | DILATE=$9 12 | GROUP=${10} 13 | BIAS=${11} 14 | IMAGE_HEIGHT=${12} 15 | IMAGE_WIDTH=${13} 16 | GPU=${14} 17 | RATIO=${15} 18 | ACTIVATION=${16} 19 | DEVICE_ID=${17} 20 | N_CHANNEL=${18} 21 | 22 | export DYNAMIC_KERNEL_LIMIT_START=1000000000 23 | LD_PRELOAD=/root/PIMFlow_accel-sim-framework/util/tracer_nvbit/tracer_tool/tracer_tool.so python3 /root/PIMFlow/layerwise/layerwise.py --oc=$OUT_CHANNELS --ic=$IN_CHANNELS --kh=$KH --kw=$KW --stride=$STRIDE --ph=$PH --pw=$PW --dilate=$DILATE --g=$GROUP --b --h=$IMAGE_HEIGHT --w=$IMAGE_WIDTH --dev=$CUDA_VISIBLE_DEVICES --activation=$ACTIVATION 24 | 25 | START=$(python3 inspect --path=traces-$NAME | sed -r 's/([0-9]*)\s([0-9]*)/\1/g') 26 | END=$(python3 inspect --path=traces-$NAME | sed -r 's/([0-9]*)\s([0-9]*)/\2/g') 27 | export DYNAMIC_KERNEL_LIMIT_START=$START 28 | export DYNAMIC_KERNEL_LIMIT_END=$END 29 | 30 | LD_PRELOAD=/root/PIMFlow_accel-sim-framework/util/tracer_nvbit/tracer_tool/tracer_tool.so python3 /root/PIMFlow/layerwise/layerwise.py --oc=$OUT_CHANNELS --ic=$IN_CHANNELS --kh=$KH --kw=$KW --stride=$STRIDE --ph=$PH --pw=$PW --dilate=$DILATE --g=$GROUP --b 
--h=$IMAGE_HEIGHT --w=$IMAGE_WIDTH --dev=$CUDA_VISIBLE_DEVICES --activation=$ACTIVATION
31 |
32 | /root/PIMFlow_accel-sim-framework/util/tracer_nvbit/tracer_tool/traces-processing/post-traces-processing ./traces-$NAME/kernelslist
33 | -------------------------------------------------------------------------------- /layerwise/trace_matmul.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | NAME=$1
4 | ROW=$2
5 | COL=$3
6 | BIAS=$4
7 | ACTIVATION=$5
8 | BATCH_SIZE=$6
9 | GPU=$7
10 | RATIO=$8
11 | N_CHANNEL=$9
12 |
13 | export DYNAMIC_KERNEL_LIMIT_START=1000000000
14 | LD_PRELOAD=/root/PIMFlow_accel-sim-framework/util/tracer_nvbit/tracer_tool/tracer_tool.so python3 /root/PIMFlow/layerwise/layerwise_matmul.py --name=$NAME --batch=$BATCH_SIZE --row=$ROW --col=$COL --bias=$BIAS --activation=$ACTIVATION
15 |
16 | START=$(python3 inspect --path=traces-matmul-$NAME | sed -r 's/([0-9]*)\s([0-9]*)/\1/g')
17 | END=$(python3 inspect --path=traces-matmul-$NAME | sed -r 's/([0-9]*)\s([0-9]*)/\2/g')
18 | export DYNAMIC_KERNEL_LIMIT_START=$START
19 | export DYNAMIC_KERNEL_LIMIT_END=$END
20 |
21 | LD_PRELOAD=/root/PIMFlow_accel-sim-framework/util/tracer_nvbit/tracer_tool/tracer_tool.so python3 /root/PIMFlow/layerwise/layerwise_matmul.py --name=$NAME --batch=$BATCH_SIZE --row=$ROW --col=$COL --bias=$BIAS --activation=$ACTIVATION
22 |
23 | /root/PIMFlow_accel-sim-framework/util/tracer_nvbit/tracer_tool/traces-processing/post-traces-processing ./traces-matmul-$NAME/kernelslist
24 | -------------------------------------------------------------------------------- /marker/marker_cuda.cpp: --------------------------------------------------------------------------------
1 | #include <torch/extension.h>
2 | //#include
3 |
4 | void aggregation_kernel(bool is_start);
5 | void linear_kernel(bool is_start);
6 | void forward_kernel(bool is_start);
7 | void backward_kernel(bool is_start);
8 | void avgpool_kernel(bool is_start);
9 | void fc1_kernel(bool is_start);
10 | void fc2_kernel(bool is_start);
11 |
12 | void aggregation(bool is_start) {
13 | aggregation_kernel(is_start);
14 | }
15 | void linear(bool is_start) {
16 | linear_kernel(is_start);
17 | }
18 |
19 | void forward(bool is_start) {
20 | forward_kernel(is_start);
21 | }
22 | void backward(bool is_start) {
23 | backward_kernel(is_start);
24 | }
25 |
26 | void avgpool(bool is_start) {
27 | avgpool_kernel(is_start);
28 | }
29 | void fc1(bool is_start) {
30 | fc1_kernel(is_start);
31 | }
32 | void fc2(bool is_start) {
33 | fc2_kernel(is_start);
34 | }
35 |
36 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
37 | m.def("aggregation", &aggregation, "GCN aggregation");
38 | m.def("linear", &linear, "GCN linear");
39 | m.def("forward", &forward, "forward");
40 | m.def("backward", &backward, "backward");
41 | m.def("avgpool", &avgpool, "avgpool");
42 | m.def("fc1", &fc1, "fc1");
43 | m.def("fc2", &fc2, "fc2");
44 | }
45 | -------------------------------------------------------------------------------- /marker/marker_cuda_kernel.cu: --------------------------------------------------------------------------------
1 | #include <cuda_runtime.h>
2 |
3 | __global__ void aggregation_kernel_cuda_start(void) {}
4 | __global__ void aggregation_kernel_cuda_end(void) {}
5 | __global__ void linear_kernel_cuda_start(void) {}
6 | __global__ void linear_kernel_cuda_end(void) {}
7 | __global__ void forward_kernel_cuda_start(void) {}
8 | __global__ void forward_kernel_cuda_end(void) {}
9 | __global__ void backward_kernel_cuda_start(void) {}
10 | __global__ void backward_kernel_cuda_end(void) {}
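// Why these empty kernels exist (a sketch of the mechanism, inferred from the
// scripts in this repo): marker.forward(True/False) launches a no-op
// *_kernel_cuda_start / *_kernel_cuda_end kernel around the region of
// interest. The kernels do no work; they only deposit recognizable kernel
// names into the NVBit trace, so that `inspect` can derive the
// DYNAMIC_KERNEL_LIMIT_START/END window used by trace.sh, and layerwise/run
// can filter marker launches out of the cycle statistics (the
// forward_kernel_cuda check there).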
11 | __global__ void avgpool_kernel_cuda_start(void) {}
12 | __global__ void avgpool_kernel_cuda_end(void) {}
13 | __global__ void fc1_kernel_cuda_start(void) {}
14 | __global__ void fc1_kernel_cuda_end(void) {}
15 | __global__ void fc2_kernel_cuda_start(void) {}
16 | __global__ void fc2_kernel_cuda_end(void) {}
17 |
18 | void aggregation_kernel(bool is_start) {
19 | if (is_start)
20 | aggregation_kernel_cuda_start<<<1, 1>>>();
21 | else
22 | aggregation_kernel_cuda_end<<<1, 1>>>();
23 |
24 | }
25 | void linear_kernel(bool is_start) {
26 | if (is_start)
27 | linear_kernel_cuda_start<<<1, 1>>>();
28 | else
29 | linear_kernel_cuda_end<<<1, 1>>>();
30 | }
31 |
32 | void forward_kernel(bool is_start) {
33 | if (is_start)
34 | forward_kernel_cuda_start<<<1, 1>>>();
35 | else
36 | forward_kernel_cuda_end<<<1, 1>>>();
37 | }
38 |
39 | void backward_kernel(bool is_start) {
40 | if (is_start)
41 | backward_kernel_cuda_start<<<1, 1>>>();
42 | else
43 | backward_kernel_cuda_end<<<1, 1>>>();
44 | }
45 |
46 | void avgpool_kernel(bool is_start) {
47 | if (is_start)
48 | avgpool_kernel_cuda_start<<<1, 1>>>();
49 | else
50 | avgpool_kernel_cuda_end<<<1, 1>>>();
51 | }
52 |
53 | void fc1_kernel(bool is_start) {
54 | if (is_start)
55 | fc1_kernel_cuda_start<<<1, 1>>>();
56 | else
57 | fc1_kernel_cuda_end<<<1, 1>>>();
58 | }
59 |
60 | void fc2_kernel(bool is_start) {
61 | if (is_start)
62 | fc2_kernel_cuda_start<<<1, 1>>>();
63 | else
64 | fc2_kernel_cuda_end<<<1, 1>>>();
65 | }
66 | -------------------------------------------------------------------------------- /marker/setup.py: --------------------------------------------------------------------------------
1 | from setuptools import setup
2 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension
3 |
4 | setup(
5 | name='marker',
6 | ext_modules=[
7 | CUDAExtension('marker_cuda', [
8 | 'marker_cuda.cpp',
9 | 'marker_cuda_kernel.cu',
10 | ])
11 | ],
12 | cmdclass={
13 | 'build_ext': BuildExtension
14 | }
15 | )
16 | -------------------------------------------------------------------------------- /pim/Makefile: --------------------------------------------------------------------------------
1 | all: pim_codegen
2 |
3 | pim_codegen:
4 | g++ pim_trace.cc main.cc -O3 -o pim_codegen
5 | clean:
6 | rm -f pim_codegen
7 | -------------------------------------------------------------------------------- /pim/main.cc: --------------------------------------------------------------------------------
1 | #include <iostream>
2 | #include <fstream>
3 | #include <sstream>
4 | #include <string>
5 | #include <vector>
6 | #include <unordered_map>
7 | #include <cstring>
8 | #include <cstdio>
9 | #include <cstdlib>
10 | #include <algorithm>
11 |
12 | #include "pim_trace.h"
13 |
14 | using Str2StrMap = std::unordered_map<std::string, std::string>;
15 |
16 | int nextPow2(int n) {
17 | int power = 1;
18 | while (power < n) {
19 | power *= 2;
20 | }
21 | return power;
22 | }
23 |
24 | std::vector<std::string> PimCodeGen(std::string id, const Str2StrMap& attrs,
25 | const std::vector<std::string>& func_args) {
26 | int n_channel = std::stoi(attrs.at("N_CHANNEL"));
27 | std::vector<std::string> code;
28 | std::ostringstream OS;
29 |
30 | int h = std::stoi(attrs.at("H"));
31 | int w = std::stoi(attrs.at("W"));
32 | int kh = std::stoi(attrs.at("KH"));
33 | int kw = std::stoi(attrs.at("KW"));
34 | int ph = std::stoi(attrs.at("PH"));
35 | int pw = std::stoi(attrs.at("PW"));
36 | int stride_ = std::stoi(attrs.at("S"));
37 |
38 | int H = (h - kh + 2 * ph) / stride_ + 1;
39 | int W = (w - kw + 2 * pw) / stride_ + 1;
40 | int C_o = std::stoi(attrs.at("K"));
41 | int C_i = std::stoi(attrs.at("C"));
42 |
43 | auto i_c = (((C_i * kh * kw) + 15) / 16) * 16;
44 | auto o_c = C_o;
45 | auto row = ((o_c + 15) / 16) * 16;
46 | int fl, stride, n, col;
47 |
48 | // NOTE: Boundary condition is ignored (e.g., 3x3 kernel with (1, 1) padding)
49 | if (i_c <= 512) {
50 | fl = 512 / i_c;
51 | col = i_c * fl;
52 | stride = i_c;
53 | n = ((H * W) + fl - 1) / fl;
54 | for (int i = 0; i < n; i++) {
55 | pim::StrideInfo sinfo;
56 | if (kh > 1 || kw > 1) {
57 | sinfo.use_stride = true;
58 | sinfo.num_first_elem = C_i * kh;
59 | sinfo.stride = C_i * (h - kh);
60 | sinfo.num_after_elem = C_i * kh;
61 | sinfo.num_gwrite = 512 / (C_i * kh * kw);
62 | }
63 | pim::OutputNewtonTraceV2(OS, id, row, col, stride, sinfo);
64 | }
65 | code.push_back(OS.str());
66 | } else {
67 | int full = i_c / 512;
68 | col = 512;
69 | stride = 512;
70 | n = (H * W);
71 | // n = std::min(((n + n_channel - 1) / n_channel) * n_channel, nextPow2(n));
72 | for (int j = 0; j < full; j++) {
73 | for (int i = 0; i < n; i++) {
74 | pim::OutputNewtonTraceV2(OS, id, row, col, stride);
75 | }
76 | }
77 | code.push_back(OS.str());
78 | OS.str("");
79 | OS.clear();
80 | int i_c_remain = i_c - (512 * full);
81 | if (i_c_remain > 0) {
82 | // i_c_remain = nextPow2(i_c_remain);
83 | i_c_remain = ((i_c_remain + 15) / 16) * 16;
84 | fl = 512 / i_c_remain;
85 | col = fl * i_c_remain;
86 | stride = i_c_remain;
87 | n = (((H * W) + fl - 1) / fl);
88 | // n = std::min(((n + n_channel - 1) / n_channel) * n_channel, nextPow2(n));
89 | for (int i = 0; i < n; i++) {
90 | pim::OutputNewtonTraceV2(OS, id, row, col, stride);
91 | }
92 | }
93 | code.push_back(OS.str());
94 | }
95 | return code;
96 | }
97 |
98 | class Readres {
99 | std::vector<std::string> head;
100 |
101 | public:
102 | Readres(std::vector<std::string> cmds) {
103 | head = cmds;
104 | }
105 |
106 | std::string code() const {
107 | std::string code;
108 | for (auto h : head) {
109 | code += h + "\n";
110 | }
111 | return code;
112 | }
113 |
114 | int n_comp() {
115 | if (head.size() == 0)
116 | return 0;
117 | return head.size() - 1;
118 | }
119 |
120 | std::string at(int i) {
121 | return head[i];
122 | }
123 | };
124 |
125 | class GAct {
126 | std::vector<std::string> head;
127 | std::vector<Readres> readres;
128 | std::vector<std::string> buffer;
129 |
130 | public:
131 | GAct(std::string cmd) {
132 | head.push_back(cmd);
133 | }
134 | void add(std::string cmd) {
135 | if (cmd.find("G_ACT") != std::string::npos) {
136 | head.push_back(cmd);
137 | } else {
138 | buffer.push_back(cmd);
139 | if (cmd.find("READRES") != std::string::npos) {
140 | readres.push_back(Readres(buffer));
141 | buffer.clear();
142 | }
143 | }
144 | }
145 |
146 | Readres& at(int i) {
147 | return readres[i];
148 | }
149 |
150 | int n_readres() const {
151 | return readres.size();
152 | }
153 |
154 | std::string h() const {
155 | std::string code;
156 | for (auto& h : head) {
157 | code += h + "\n";
158 | }
159 | return code;
160 | }
161 |
162 | std::string code(bool include_head) const {
163 | std::string code;
164 | if (include_head) {
165 | for (auto h : head) {
166 | code += h + "\n";
167 | }
168 | }
169 | for (auto& r : readres) {
170 | code += r.code();
171 | }
172 | return code;
173 | }
174 | void split(int factor) {
175 | std::vector<Readres> new_readres;
176 | for (int i = 0, offset = 0; i < n_readres(); i++, offset++) {
177 | Readres& rr = readres[i];
178 | int chunk = rr.n_comp() / factor;
179 | for (int j = 0; j < factor; j++) {
180 | std::vector<std::string> buf;
181 | for (int k = 0; k < chunk; k++) {
182 | if (j*chunk + k + offset >= rr.n_comp()) {
183 | // std::cerr << "WARN: imbalance scheduling!" << std::endl;
<< std::endl; 184 | continue; 185 | } 186 | buf.push_back(rr.at(j*chunk + k + offset)); 187 | } 188 | buf.push_back("READRES"); 189 | new_readres.push_back(Readres(buf)); 190 | } 191 | } 192 | readres = new_readres; 193 | } 194 | }; 195 | 196 | class GWrite { 197 | std::string head; 198 | std::vector gacts; 199 | 200 | public: 201 | GWrite(std::string cmd) : head(cmd) { } 202 | 203 | void add(std::string cmd) { 204 | if (cmd.find("G_ACT0") != std::string::npos) { 205 | gacts.push_back(GAct(cmd)); 206 | } else { 207 | auto& gact = gacts.back(); 208 | gact.add(cmd); 209 | } 210 | } 211 | 212 | int n_gact() const { 213 | return gacts.size(); 214 | } 215 | 216 | int n_readres() const { 217 | int n = 0; 218 | for (auto& gact : gacts) { 219 | n += gact.n_readres(); 220 | } 221 | return n; 222 | } 223 | 224 | GAct& at(int i) { 225 | return gacts[i]; 226 | } 227 | 228 | std::string h() const { 229 | return head + "\n"; 230 | } 231 | 232 | std::string code(bool include_head) const { 233 | std::string code; 234 | if (include_head) { 235 | code += head + "\n"; 236 | } 237 | for (auto& gact : gacts) { 238 | code += gact.code(true); 239 | } 240 | return code; 241 | } 242 | }; 243 | 244 | class Command { 245 | std::vector gwrites; 246 | int n_channel; 247 | 248 | public: 249 | Command(int n_channel) : n_channel(n_channel) { } 250 | 251 | void add(std::string cmd) { 252 | if (cmd.find("GWRITE") != std::string::npos) { 253 | gwrites.push_back(GWrite(cmd)); 254 | } else { 255 | auto& gwrite = gwrites.back(); 256 | gwrite.add(cmd); 257 | } 258 | } 259 | 260 | void policy_readres_auto(GWrite& gwrite, std::vector& code, int n, int offset=0, int n_gwrite=1) { 261 | // write GWRITE for every channels 262 | for (int i = 0; i < n; i++) { 263 | std::string g = gwrite.h(); 264 | int s = g.find("GWRITE_"); 265 | if (s == std::string::npos) { 266 | // replace regular GWRITE with multiple version (GWRITE_2/GWRITE_4) 267 | g = g.substr(0, g.find("GWRITE")) + std::string("GWRITE_") + std::to_string(n_gwrite) + g.substr(g.find("GWRITE") + 6, g.length()); 268 | } 269 | code[i + offset] += g; 270 | } 271 | 272 | // TODO: pick right gact for validation by value 273 | GAct& gact = gwrite.at(0); 274 | 275 | // distribute readres 276 | int parallelism = gwrite.n_gact() * gact.n_readres(); 277 | 278 | // exploit finer-grained parallelism at the expense of energy increase. 
279 | while (parallelism <= n / 2) {
280 | int factor = n / parallelism;
281 | gact.split(factor);
282 | parallelism = gwrite.n_gact() * gact.n_readres();
283 | }
284 |
285 | // # READRES per G_ACT
286 | int stride = std::min(
287 | std::max((gwrite.n_gact() * gact.n_readres() + n - 1) / n, 1),
288 | gact.n_readres());
289 | int gw_n_readres = gwrite.n_gact() * gact.n_readres();
290 | for (int j = 0, idx = 0; j < gw_n_readres; j += stride, idx++) {
291 | code[idx % n + offset] += gact.h();
292 | for (int k = 0; k < stride; k++) {
293 | if (j + k >= gw_n_readres) {
294 | break;
295 | }
296 | auto* rr = &gact.at(gact.n_readres() - 1);
297 | if (j + k < gw_n_readres) {
298 | // TODO: pick right rr for validation by value, considering remainder
299 | // e.g., ./pim_codegen -oc 32 -ic 3 -h 113 -w 224 -kh 3 -kw 3 -ph 0 -pw 1 -stride 2 -name test12 -gw 4 -n_channel 12
300 | rr = &gact.at(k);
301 | }
302 | code[idx % n + offset] += rr->code();
303 | }
304 | }
305 | }
306 | std::vector<std::string> policy_auto(const Str2StrMap& attrs) {
307 | std::vector<std::string> best_code(n_channel);
308 | for (int n_gwrite = 1; n_gwrite <= std::stoi(attrs.at("GW")); n_gwrite *= 2) {
309 | std::vector<std::string> code(n_channel);
310 | int stride = n_channel / n_gwrite;
311 | int chunk = (gwrites.size() + n_gwrite - 1) / n_gwrite;
312 | for (int i = 0; i < n_gwrite; i++) {
313 | int offset = i * stride;
314 | for (int j = 0; j < chunk; j++) {
315 | if (i * chunk + j >= gwrites.size()) {
316 | break;
317 | }
318 | auto& gwrite = gwrites[i * chunk + j];
319 | // TODO: for validation by value, (stride) number of gwrites must be passed to the policy_readres_auto
320 | policy_readres_auto(gwrite, code, stride, offset, n_gwrite);
321 | }
322 | }
323 | std::string::size_type pos = 0;
324 | int gact_best = 0;
325 | while (true) {
326 | pos = best_code[0].find("G_ACT0", pos);
327 | if (pos == std::string::npos) {
328 | pos = 0;
329 | break;
330 | }
331 | ++gact_best;
332 | ++pos;
333 | }
334 | int gact_code = 0;
335 | while (true) {
336 | pos = code[0].find("G_ACT0", pos);
337 | if (pos == std::string::npos) {
338 | pos = 0;
339 | break;
340 | }
341 | ++gact_code;
342 | ++pos;
343 | }
344 | if (best_code[0].size() == 0 || gact_code < gact_best || (gact_code == gact_best && code[0].size() < best_code[0].size())) {
345 | best_code = code;
346 | }
347 | }
348 | return best_code;
349 | }
350 | };
351 |
352 | void PimSchedule(std::string id, const Str2StrMap& attrs,
353 | const std::vector<std::string>& func_args, std::string code, bool append=false) {
354 | int n_channel = std::stoi(attrs.at("N_CHANNEL"));
355 | int gpu_channel = 32 - n_channel;
356 | auto mode = std::ios_base::out;
357 | if (append) {
358 | mode = std::ios_base::app;
359 | }
360 |
361 | std::vector<std::string> traces;
362 | std::string token;
363 | std::stringstream ss(code);
364 |
365 | Command command(n_channel);
366 |
367 | int idx = 0;
368 | while (std::getline(ss, token, '\n')) {
369 | traces.push_back(token);
370 | command.add(token);
371 | idx++;
372 | }
373 |
374 | std::ofstream OS;
375 |
376 | OS.open(id + "-all.pim", mode);
377 | for (auto trace : traces) {
378 | OS << trace << "\n";
379 | }
380 | OS.flush();
381 | OS.close();
382 |
383 | std::vector<std::string> cmds = command.policy_auto(attrs);
384 |
385 | for (int i = gpu_channel; i < gpu_channel + n_channel; i++) {
386 | OS.open(id + "-" + std::to_string(i) + ".pim", mode);
387 | OS << cmds[i - gpu_channel];
388 | OS.flush();
389 | OS.close();
390 | }
391 | }
392 |
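// How the final schedule is chosen (summary of policy_auto above): candidate
// schedules are generated for n_gwrite = 1, 2, 4 (up to the -gw limit), the
// gwrites are partitioned into n_gwrite groups of channel slots, and the
// candidate whose channel-0 stream contains the fewest G_ACT0 commands wins,
// with total stream length as the tie-breaker. PimSchedule then writes the
// winning per-channel streams to disk.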
|| str[i] > '9') 396 | return false; 397 | return true; 398 | } 399 | 400 | void get_args(int argc, char *argv[], Str2StrMap& attrs) { 401 | int pos = 1; 402 | // set default value 403 | attrs["GW"] = "4"; 404 | attrs["N_CHANNEL"] = "16"; 405 | 406 | while (pos < argc) { 407 | if (pos + 1 < argc && !strcmp(argv[pos], "-oc") && 408 | isdigit(argv[pos + 1])) { 409 | attrs["K"] = argv[pos + 1]; 410 | pos += 2; 411 | continue; 412 | } 413 | if (pos + 1 < argc && !strcmp(argv[pos], "-ic") && 414 | isdigit(argv[pos + 1])) { 415 | attrs["C"] = argv[pos + 1]; 416 | pos += 2; 417 | continue; 418 | } 419 | if (pos + 1 < argc && !strcmp(argv[pos], "-h") && 420 | isdigit(argv[pos + 1])) { 421 | attrs["H"] = argv[pos + 1]; 422 | pos += 2; 423 | continue; 424 | } 425 | if (pos + 1 < argc && !strcmp(argv[pos], "-w") && 426 | isdigit(argv[pos + 1])) { 427 | attrs["W"] = argv[pos + 1]; 428 | pos += 2; 429 | continue; 430 | } 431 | if (pos + 1 < argc && !strcmp(argv[pos], "-kh") && 432 | isdigit(argv[pos + 1])) { 433 | attrs["KH"] = argv[pos + 1]; 434 | pos += 2; 435 | continue; 436 | } 437 | if (pos + 1 < argc && !strcmp(argv[pos], "-kw") && 438 | isdigit(argv[pos + 1])) { 439 | attrs["KW"] = argv[pos + 1]; 440 | pos += 2; 441 | continue; 442 | } 443 | if (pos + 1 < argc && !strcmp(argv[pos], "-ph") && 444 | isdigit(argv[pos + 1])) { 445 | attrs["PH"] = argv[pos + 1]; 446 | pos += 2; 447 | continue; 448 | } 449 | if (pos + 1 < argc && !strcmp(argv[pos], "-pw") && 450 | isdigit(argv[pos + 1])) { 451 | attrs["PW"] = argv[pos + 1]; 452 | pos += 2; 453 | continue; 454 | } 455 | if (pos + 1 < argc && !strcmp(argv[pos], "-stride") && 456 | isdigit(argv[pos + 1])) { 457 | attrs["S"] = argv[pos + 1]; 458 | pos += 2; 459 | continue; 460 | } 461 | if (pos + 1 < argc && !strcmp(argv[pos], "-name")) { 462 | attrs["name"] = argv[pos + 1]; 463 | pos += 2; 464 | } 465 | if (pos + 1 < argc && !strcmp(argv[pos], "-gw") && 466 | isdigit(argv[pos + 1])) { 467 | attrs["GW"] = argv[pos + 1]; 468 | pos += 2; 469 | continue; 470 | } 471 | if (pos + 1 < argc && !strcmp(argv[pos], "-n_channel") && 472 | isdigit(argv[pos + 1])) { 473 | attrs["N_CHANNEL"] = argv[pos + 1]; 474 | pos += 2; 475 | continue; 476 | } 477 | pos += 1; 478 | } 479 | } 480 | 481 | int main(int argc, char *argv[]) { 482 | Str2StrMap attrs{}; 483 | get_args(argc, argv, attrs); 484 | std::string kernel_name = attrs.at("name"); 485 | auto code = PimCodeGen(kernel_name, attrs, {}); 486 | 487 | for (int i = 0; i < code.size(); i++) { 488 | PimSchedule(kernel_name, attrs, {}, code[i], i > 0); 489 | } 490 | return 0; 491 | } 492 | -------------------------------------------------------------------------------- /pim/pim_trace.cc: -------------------------------------------------------------------------------- 1 | #include "pim_trace.h" 2 | #include 3 | #include 4 | #include 5 | 6 | namespace pim { 7 | std::string ToBinary(int n) { 8 | static char b[15]; // TB 9 | static char r[15]; 10 | b[0] = '\0'; 11 | while (n > 0) { 12 | strcat(b, ((n % 2) == 0) ? 
"0" : "1"); 13 | n /= 2; 14 | } 15 | for (size_t i = 0; i < strlen(b); i++) 16 | r[strlen(b)-1-i] = b[i]; 17 | r[strlen(b)] = '\0'; 18 | return r; 19 | } 20 | 21 | std::string FillZero(std::string s, size_t n) { 22 | if (s.size() > 100) std::invalid_argument("string size too large!"); 23 | 24 | static char buf[100]; // arbitrary size large enough 25 | for (size_t i = 0; i < n; i++) { 26 | if (i < s.size()) buf[n-s.size()+i] = s[i]; 27 | else buf[n-1-i] = '0'; 28 | } 29 | buf[n] = '\0'; 30 | return buf; 31 | } 32 | 33 | void GWrite(std::ostream& OS, char* buf, int r, StrideInfo sinfo) { 34 | sprintf(buf, "%02d%02d%s%05d%06d", 0, 0, FillZero(ToBinary(r), 14).c_str(), 0, 0); 35 | OS << "0x" << std::setfill('0') << std::setw(8) << std::hex << std::stoi(buf, nullptr, 2) << " GWRITE "; 36 | if (sinfo.use_stride) { 37 | OS << std::dec << sinfo.num_first_elem << " " << sinfo.stride << " " << sinfo.num_after_elem; 38 | } 39 | OS << "\n"; 40 | OS.flush(); 41 | } 42 | 43 | void GAct(std::ostream& OS, char* buf, int k, int r, int j, int num_act) { 44 | for (int i = 0; i < num_act; i++) { 45 | sprintf(buf, "%s%02d%s%05d%06d", FillZero(ToBinary(i), 2).c_str(), 0, FillZero(ToBinary(k*r+j), 14).c_str(), 0, 0); 46 | OS << "0x" << std::setfill('0') << std::setw(8) << std::hex << std::stoi(buf, nullptr, 2) << " G_ACT" << i << "\n"; 47 | } 48 | OS.flush(); 49 | } 50 | 51 | void Comp(std::ostream& OS, char* buf, int k, int r, int j, int h) { 52 | sprintf(buf, "%02d%02d%s%s%06d", 0, 0, FillZero(ToBinary(k*r+j), 14).c_str(), FillZero(ToBinary(h), 5).c_str(), 0); 53 | OS << "0x" << std::setfill('0') << std::setw(8) << std::hex << std::stoi(buf, nullptr, 2) << " COMP\n"; 54 | OS.flush(); 55 | } 56 | 57 | void ReadRes(std::ostream& OS) { 58 | OS << "READRES\n"; 59 | OS.flush(); 60 | } 61 | 62 | void OutputNewtonTrace(std::ostream& OS, std::string kernel_name, int64_t row, int64_t col) { 63 | char buf[100]; // fixed size 64 | 65 | int num_chunks = (col + 511) / 512; 66 | int r = (row + 15) / 16; 67 | 68 | for (int i = 0; i < num_chunks; i++) { 69 | GWrite(OS, buf, i); 70 | for (int j = 0; j < r; j++) { 71 | int num_act = 4; 72 | if (j == r - 1 && row % 16 != 0) { 73 | num_act = ((row - (row/16)*16) + 3) / 4; 74 | } 75 | GAct(OS, buf, i, r, j, num_act); 76 | int bound = 32; 77 | if (i == num_chunks - 1 && col % 512 != 0) { 78 | bound = ((col - (col/512)*512) + 15) / 16; 79 | } 80 | for (int k = 0; k < bound; k++) { 81 | Comp(OS, buf, i, r, j, k); 82 | } 83 | ReadRes(OS); 84 | } 85 | } 86 | OS.flush(); 87 | } 88 | 89 | void OutputNewtonTraceV2(std::ostream& OS, std::string kernel_name, int64_t row, int64_t col, int64_t stride, StrideInfo sinfo) { 90 | char buf[100]; // fixed size 91 | 92 | int num_chunks = (col + 511) / 512; 93 | int r = (row + 15) / 16; 94 | 95 | for (int i = 0; i < num_chunks; i++) { 96 | int elem = 0; 97 | for (int g = 0; g < sinfo.num_gwrite; g++) { 98 | GWrite(OS, buf, i, sinfo); 99 | break; // TODO: allow multiple gwrites 100 | } 101 | for (int j = 0; j < r; j++) { 102 | int num_act = 4; 103 | // if (j == r - 1 && row % 16 != 0) { 104 | // num_act = ((row - (row/16)*16) + 3) / 4; 105 | // } 106 | GAct(OS, buf, i, r, j, num_act); 107 | int bound = 32; 108 | if (col % 512 != 0) { 109 | bound = ((col - (col/512)*512) + 15) / 16; 110 | } 111 | for (int k = 0; k < bound; k++) { 112 | Comp(OS, buf, i, r, j, k); 113 | elem += 16; 114 | 115 | if (k == bound - 1 || elem % stride == 0) { 116 | ReadRes(OS); 117 | } 118 | } 119 | } 120 | } 121 | OS.flush(); 122 | } 123 | } 124 | 
--------------------------------------------------------------------------------
/pim/pim_trace.h:
--------------------------------------------------------------------------------
1 | #include <string>
2 | #include <ostream>
3 | #include <cstdint>
4 | 
5 | namespace pim {
6 | typedef struct strideInfo {
7 |   bool use_stride = false;
8 |   int num_first_elem = 0;
9 |   int stride = 0;
10 |   int num_after_elem = 0;
11 |   int num_gwrite = 1;
12 | } StrideInfo;
13 | std::string ToBinary(int n);
14 | std::string FillZero(std::string s, size_t n);
15 | 
16 | void GWrite(std::ostream& OS, char* buf, int r, StrideInfo sinfo=StrideInfo());
17 | void GAct(std::ostream& OS, char* buf, int k, int r, int j, int num_act);
18 | void Comp(std::ostream& OS, char* buf, int k, int r, int j, int h);
19 | 
20 | void ReadRes(std::ostream& OS);
21 | 
22 | void OutputNewtonTrace(std::ostream& OS, std::string kernel_name, int64_t row, int64_t col);
23 | void OutputNewtonTraceV2(std::ostream& OS, std::string kernel_name, int64_t row, int64_t col, int64_t stride, StrideInfo sinfo=StrideInfo());
24 | }
25 | 
--------------------------------------------------------------------------------
/pim/util.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | 
3 | import torch
4 | import numpy as np
5 | import onnx
6 | from google.protobuf.json_format import MessageToDict
7 | import torchvision.models as models
8 | 
9 | onnx_datatype = onnx.TensorProto.FLOAT16
10 | MODEL_LIST = [
11 |   # efficientnet
12 |   'efficientnet-v1-b0', 'efficientnet-v1-b4', 'efficientnet-v1-b5', 'efficientnet-v1-b6',
13 |   # mobilenet
14 |   'mobilenet-v2', 'mobilenet-v2-1.4', 'mobilenet-v3-small', 'mobilenet-v3-large',
15 |   # resnet
16 |   'resnet-18', 'resnet-34', 'resnet-50',
17 |   # resnext
18 |   'resnext-50',
19 |   # inception
20 |   'inception-v3',
21 |   # shufflenet
22 |   'shufflenet-v2-x0.5', 'shufflenet-v2-x1.0', 'shufflenet-v2-x2.0',
23 |   # mnasnet
24 |   'mnasnet-0.5', 'mnasnet-1.0', 'mnasnet-1.3',
25 |   # vgg
26 |   'vgg-16',
27 |   # regnet
28 |   'regnet_y_400mf', 'regnet_y_800mf', 'regnet_y_1_6gf', 'regnet_y_3_2gf', 'regnet_y_8gf', 'regnet_y_16gf', 'regnet_y_32gf', 'regnet_y_128gf', 'regnet_x_400mf', 'regnet_x_800mf', 'regnet_x_1_6gf', 'regnet_x_3_2gf', 'regnet_x_8gf', 'regnet_x_16gf', 'regnet_x_32gf', 'regnet_x_128gf',
29 |   # vision transformer
30 |   'vit-b-16', 'vit-l-16',
31 |   # convnext
32 |   'convnext-tiny', 'convnext-small', 'convnext-base', 'convnext-large',
33 |   # bert
34 |   'bert-large-1x64', 'bert-large-1x32', 'bert-large-1x3',
35 |   # test
36 |   'toy', 'memopt',
37 | ]
38 | 
39 | GVN = -1
40 | PAR_EXEC_ID = 0
41 | 
42 | def gvn(reset=False):
43 |   global GVN
44 | 
45 |   if reset:
46 |     GVN = -1
47 |     return None
48 | 
49 |   GVN += 1
50 |   return GVN
51 | 
52 | def to_numpy(tensor):
53 |   return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()
54 | 
55 | def par_exec_id(increment=False):
56 |   global PAR_EXEC_ID
57 |   if increment:
58 |     PAR_EXEC_ID += 1
59 |   return PAR_EXEC_ID
60 | 
61 | def activation_type(graph, node):
62 |   nodes = find_nodes_by_arg_name(graph, node.output[0])
63 |   nodes.remove(node)
64 | 
65 |   if len(nodes) == 1:
66 |     if nodes[0].op_type in ["HardSigmoid", "Sigmoid", "Relu", "Clip"]:
67 |       # TODO: tvm bug
68 |       if nodes[0].op_type == "HardSigmoid":
69 |         return "Sigmoid"
70 |       return nodes[0].op_type
71 |   elif len(nodes) == 2:
72 |     if nodes[0].op_type == "Sigmoid" and nodes[1].op_type == "Mul":
73 |       nodes_ = find_nodes_by_arg_name(graph, nodes[0].output[0])
74 |       for n in nodes:
75 | 
nodes_.remove(n) 76 | assert len(nodes_) == 0 77 | return "SiLU" 78 | elif nodes[0].op_type == "Mul" and nodes[1].op_type == "Sigmoid": 79 | nodes_ = find_nodes_by_arg_name(graph, nodes[1].output[0]) 80 | for n in nodes: 81 | nodes_.remove(n) 82 | assert len(nodes_) == 0 83 | return "SiLU" 84 | elif nodes[0].op_type == "HardSigmoid" and nodes[1].op_type == "Mul": 85 | nodes_ = find_nodes_by_arg_name(graph, nodes[0].output[0]) 86 | for n in nodes: 87 | nodes_.remove(n) 88 | assert len(nodes_) == 0 89 | # TODO: tvm bug 90 | # return "HardSiLU" 91 | return "SiLU" 92 | elif nodes[0].op_type == "Mul" and nodes[1].op_type == "HardSigmoid": 93 | nodes_ = find_nodes_by_arg_name(graph, nodes[1].output[0]) 94 | for n in nodes: 95 | nodes_.remove(n) 96 | assert len(nodes_) == 0 97 | # TODO: tvm bug 98 | # return "HardSiLU" 99 | return "SiLU" 100 | return "Identity" 101 | 102 | def is_silu(graph, nodes): 103 | if len(nodes) != 2: 104 | return False 105 | 106 | n1, n2 = nodes 107 | 108 | n1_inputs = find_nodes_by_arg_name(graph, n1.input[0]) 109 | n1_inputs.remove(n1) 110 | n2_inputs = find_nodes_by_arg_name(graph, n2.input[0]) 111 | n2_inputs.remove(n2) 112 | 113 | if n1.op_type == "Sigmoid": 114 | outputs = find_nodes_by_arg_name(graph, n1.output[0]) 115 | outputs.remove(n1) 116 | outputs.remove(n2) 117 | if len(outputs) > 0: 118 | return False 119 | return n2.op_type == "Mul" 120 | elif n1.op_type == "Mul": 121 | outputs = find_nodes_by_arg_name(graph, n2.output[0]) 122 | outputs.remove(n1) 123 | outputs.remove(n2) 124 | if len(outputs) > 0: 125 | return False 126 | return n2.op_type == "Sigmoid" 127 | else: 128 | return False 129 | 130 | def is_hardsilu(graph, nodes): 131 | if len(nodes) != 2: 132 | return False 133 | 134 | n1, n2 = nodes 135 | 136 | n1_inputs = find_nodes_by_arg_name(graph, n1.input[0]) 137 | n1_inputs.remove(n1) 138 | n2_inputs = find_nodes_by_arg_name(graph, n2.input[0]) 139 | n2_inputs.remove(n2) 140 | 141 | if n1.op_type == "HardSigmoid": 142 | outputs = find_nodes_by_arg_name(graph, n1.output[0]) 143 | outputs.remove(n1) 144 | outputs.remove(n2) 145 | if len(outputs) > 0: 146 | return False 147 | return n2.op_type == "Mul" 148 | elif n1.op_type == "Mul": 149 | outputs = find_nodes_by_arg_name(graph, n2.output[0]) 150 | outputs.remove(n1) 151 | outputs.remove(n2) 152 | if len(outputs) > 0: 153 | return False 154 | return n2.op_type == "HardSigmoid" 155 | else: 156 | return False 157 | 158 | def is_silu_like(graph, nodes): 159 | return is_silu(graph, nodes) or is_hardsilu(graph, nodes) 160 | 161 | def silu_like_type(graph, nodes): 162 | if is_silu(graph, nodes): 163 | return "Sigmoid" 164 | elif is_hardsilu(graph, nodes): 165 | return "HardSigmoid" 166 | else: 167 | raise Exception("Not supported type") 168 | 169 | def create_initializer_tensor( 170 | name: str, 171 | tensor_array: np.ndarray, 172 | data_type: onnx.TensorProto = onnx.TensorProto.FLOAT16 173 | ) -> onnx.TensorProto: 174 | # (TensorProto) 175 | initializer_tensor = onnx.helper.make_tensor( 176 | name=name, 177 | data_type=data_type, 178 | dims=tensor_array.shape, 179 | vals=tensor_array.flatten().tolist()) 180 | 181 | return initializer_tensor 182 | 183 | def get_arg_shape(graph, node, arg_name): 184 | dims = None 185 | for val in graph.value_info: 186 | val = MessageToDict(val) 187 | # print(val) 188 | if val['name'] == arg_name: 189 | dims = val['type']['tensorType']['shape']['dim'] 190 | dims = [int(e['dimValue']) for e in dims] 191 | return dims 192 | 193 | if graph.node[0] == node: 194 | assert 
len(graph.input) < 2 # single input 195 | m_dict = MessageToDict(graph.input[0]) 196 | dim_info = m_dict["type"]["tensorType"]["shape"]["dim"] # ugly but we have to live with this when using dict 197 | dims = [int(d.get("dimValue")) for d in dim_info] 198 | return dims 199 | 200 | # NOTE: check result nodes for uses 201 | def find_nodes_by_arg_name(graph, arg_name): 202 | r = [] 203 | for node in graph.node: 204 | if arg_name in node.input or arg_name in node.output: 205 | r.append(node) 206 | return r 207 | 208 | def find_initializer_by_arg_name(graph, arg_name): 209 | for initializer in graph.initializer: 210 | if arg_name == initializer.name: 211 | return initializer 212 | 213 | def find_value_info_by_arg_name(graph, arg_name): 214 | for value_info in graph.value_info: 215 | if arg_name == value_info.name: 216 | return value_info 217 | 218 | def find_node_index_by_name(graph, name): 219 | for i, node in enumerate(graph.node): 220 | if node.name == name: 221 | return i 222 | return None 223 | 224 | def find_attribute_by_name(node, name): 225 | for attr in node.attribute: 226 | if attr.name == name: 227 | return attr 228 | 229 | # assign node the unique name 230 | def preprocess(onnx_model): 231 | gvn(reset=True) 232 | for node in onnx_model.graph.node: 233 | # if not node.name: 234 | op_type = node.op_type 235 | if not op_type: 236 | op_type = "Node" 237 | node.name = f"{op_type}_{gvn()}" 238 | 239 | def supportedActivationNodes(graph, nodes): 240 | if len(nodes) == 1: 241 | if nodes[0].op_type in ["Clip", "Relu", "Sigmoid", "HardSigmoid"]: 242 | return True 243 | elif len(nodes) == 2: 244 | return is_silu_like(graph, nodes) 245 | return False 246 | 247 | class Net(torch.nn.Module): 248 | def __init__(self): 249 | super(Net, self).__init__() 250 | self.conv1 = torch.nn.Conv2d(3, 64, 1, padding=0, stride=1, bias=True) 251 | self.act1 = torch.nn.Hardtanh(0, 6) 252 | self.conv2 = torch.nn.Conv2d(64, 64, 3, padding=1, stride=1, groups=64, bias=True) 253 | self.act2 = torch.nn.Hardtanh(0, 6) 254 | self.conv3 = torch.nn.Conv2d(64, 64, 1, padding=0, stride=1, bias=True) 255 | self.act3 = torch.nn.Hardtanh(0, 6) 256 | self.gap = torch.nn.AvgPool2d(16) 257 | self.flatten = torch.nn.Flatten() 258 | self.linear = torch.nn.Linear(64, 10) 259 | 260 | def forward(self, x): 261 | x = self.conv1(x) 262 | x = self.act1(x) 263 | # x_ = torch.sigmoid(x) 264 | # x = torch.mul(x, x_) 265 | x = self.conv2(x) 266 | x = self.act2(x) 267 | # x_ = torch.sigmoid(x) 268 | # x = torch.mul(x, x_) 269 | x = self.conv3(x) 270 | x = self.act3(x) 271 | # x_ = torch.sigmoid(x) 272 | # x = torch.mul(x, x_) 273 | x = self.gap(x) 274 | x = self.flatten(x) 275 | x = self.linear(x) 276 | return x 277 | 278 | class NetMemOptTest(torch.nn.Module): 279 | def __init__(self): 280 | super(NetMemOptTest, self).__init__() 281 | self.conv1 = torch.nn.Conv2d(3, 64, 3, padding=1, stride=1, bias=True) 282 | self.act1 = torch.nn.Hardtanh(0, 6) 283 | self.conv2 = torch.nn.Conv2d(64, 64, 3, padding=1, stride=1, groups=1, bias=True) 284 | self.act2 = torch.nn.Hardtanh(0, 6) 285 | self.conv3 = torch.nn.Conv2d(64, 64, 3, padding=1, stride=1, bias=True) 286 | self.act3 = torch.nn.Hardtanh(0, 6) 287 | self.gap = torch.nn.AvgPool2d(16) 288 | self.flatten = torch.nn.Flatten() 289 | self.linear = torch.nn.Linear(64, 10) 290 | 291 | def forward(self, x): 292 | x = self.conv1(x) 293 | x = self.act1(x) 294 | # x_ = torch.sigmoid(x) 295 | # x = torch.mul(x, x_) 296 | x = self.conv2(x) 297 | x = self.act2(x) 298 | # x_ = torch.sigmoid(x) 299 | # x = 
torch.mul(x, x_) 300 | x = self.conv3(x) 301 | x = self.act3(x) 302 | # x_ = torch.sigmoid(x) 303 | # x = torch.mul(x, x_) 304 | x = self.gap(x) 305 | x = self.flatten(x) 306 | x = self.linear(x) 307 | return x 308 | 309 | def calc_pads(node): 310 | dilations=list(find_attribute_by_name(node, 'dilations').ints) 311 | kernel_shape=list(find_attribute_by_name(node, 'kernel_shape').ints) 312 | pads=list(find_attribute_by_name(node, 'pads').ints) 313 | strides=list(find_attribute_by_name(node, 'strides').ints) 314 | 315 | # TODO: currently only support dilation == 1 316 | for dilation in dilations: 317 | assert dilation == 1 318 | 319 | assert len(strides) == 2 and strides[0] == strides[1] 320 | assert pads[0] == pads[2] and pads[0] <= kernel_shape[0] // 2 321 | 322 | pad_start = (pads[0], pads[1], 0, pads[3]) 323 | pad_middle = (0, pads[1], 0, pads[3]) 324 | pad_end = (0, pads[1], pads[2], pads[3]) 325 | 326 | return pad_start, pad_middle, pad_end 327 | 328 | def calc_space(node, h): 329 | dilations=list(find_attribute_by_name(node, 'dilations').ints) 330 | kernel_shape=list(find_attribute_by_name(node, 'kernel_shape').ints) 331 | pads=list(find_attribute_by_name(node, 'pads').ints) 332 | strides=list(find_attribute_by_name(node, 'strides').ints) 333 | 334 | # TODO: currently only support dilation == 1 335 | for dilation in dilations: 336 | assert dilation == 1 337 | 338 | # TODO: currently only support stride <= 2 339 | for stride in strides: 340 | assert stride <= 2 341 | 342 | assert len(strides) == 2 and strides[0] == strides[1] 343 | assert pads[0] == pads[2] and pads[0] <= kernel_shape[0] // 2 344 | 345 | stride = strides[0] 346 | 347 | space = (kernel_shape[0] // 2) * 2 - pads[0] 348 | if stride == 1: 349 | space_start = (0, space) 350 | space_middle = (-space, space) 351 | space_end = (-space, 0) 352 | elif stride == 2: 353 | if h % 2 == 0: 354 | space_start = (0, 0) 355 | space_middle = (-space, 0) 356 | space_end = (-space, 0) 357 | else: 358 | space_start = (0, space) 359 | space_middle = (-space, space) 360 | space_end = (-space, 0) 361 | return space_start, space_middle, space_end 362 | 363 | def get_torch_model(name): 364 | model = None 365 | if name == "efficientnet-v1-b0": 366 | model = models.efficientnet_b0(pretrained=True) 367 | elif name == "efficientnet-v1-b4": 368 | model = models.efficientnet_b4(pretrained=True) 369 | elif name == "efficientnet-v1-b6": 370 | model = models.efficientnet_b6(pretrained=True) 371 | elif name == "mobilenet-v2": 372 | model = models.mobilenet_v2(pretrained=True) 373 | elif name == "mobilenet-v2-1.4": 374 | model = models.mobilenet_v2(pretrained=False, width_mult=1.4) 375 | elif name == "mobilenet-v3-small": 376 | model = models.mobilenet_v3_small(pretrained=True) 377 | elif name == "mobilenet-v3-large": 378 | model = models.mobilenet_v3_large(pretrained=True) 379 | elif name == "resnet-18": 380 | model = models.resnet18(pretrained=True) 381 | elif name == "resnet-34": 382 | model = models.resnet34(pretrained=True) 383 | elif name == "resnet-50": 384 | model = models.resnet50(pretrained=True) 385 | elif name == "resnext-50": 386 | model = models.resnext50_32x4d(pretrained=True) 387 | elif name == "inception-v3": 388 | model = models.inception_v3(pretrained=True) 389 | elif name == "shufflenet-v2-x0.5": 390 | model = models.shufflenet_v2_x0_5(pretrained=True) 391 | elif name == "shufflenet-v2-x1.0": 392 | model = models.shufflenet_v2_x1_0(pretrained=True) 393 | elif name == "shufflenet-v2-x2.0": 394 | model = 
models.shufflenet_v2_x2_0(pretrained=False) # pretrained model is not yet supported
395 |   elif name == "mnasnet-0.5":
396 |     model = models.mnasnet0_5(pretrained=True)
397 |   elif name == "mnasnet-1.0":
398 |     model = models.mnasnet1_0(pretrained=True)
399 |   elif name == "mnasnet-1.3":
400 |     model = models.mnasnet1_3(pretrained=False) # pretrained model is not yet supported
401 |   elif name == "vgg-16":
402 |     model = models.vgg16(pretrained=True)
403 |   elif name == "regnet_y_400mf":
404 |     model = models.regnet_y_400mf(pretrained=True)
405 |   elif name == "regnet_y_800mf":
406 |     model = models.regnet_y_800mf(pretrained=True)
407 |   elif name == "regnet_y_1_6gf":
408 |     model = models.regnet_y_1_6gf(pretrained=True)
409 |   elif name == "regnet_y_3_2gf":
410 |     model = models.regnet_y_3_2gf(pretrained=True)
411 |   elif name == "regnet_y_8gf":
412 |     model = models.regnet_y_8gf(pretrained=True)
413 |   elif name == "regnet_y_16gf":
414 |     model = models.regnet_y_16gf(pretrained=True)
415 |   elif name == "regnet_y_32gf":
416 |     model = models.regnet_y_32gf(pretrained=True)
417 |   elif name == "regnet_y_128gf":
418 |     model = models.regnet_y_128gf(pretrained=True)
419 |   elif name == "regnet_x_400mf":
420 |     model = models.regnet_x_400mf(pretrained=True)
421 |   elif name == "regnet_x_800mf":
422 |     model = models.regnet_x_800mf(pretrained=True)
423 |   elif name == "regnet_x_1_6gf":
424 |     model = models.regnet_x_1_6gf(pretrained=True)
425 |   elif name == "regnet_x_3_2gf":
426 |     model = models.regnet_x_3_2gf(pretrained=True)
427 |   elif name == "regnet_x_8gf":
428 |     model = models.regnet_x_8gf(pretrained=True)
429 |   elif name == "regnet_x_16gf":
430 |     model = models.regnet_x_16gf(pretrained=True)
431 |   elif name == "regnet_x_32gf":
432 |     model = models.regnet_x_32gf(pretrained=True)
433 |   elif name == "regnet_x_128gf":
434 |     model = models.regnet_x_128gf(pretrained=True)
435 |   elif name == "vit-b-16":
436 |     model = models.vit_b_16(pretrained=True)
437 |   elif name == "vit-l-16":
438 |     model = models.vit_l_16(pretrained=True)
439 |   elif name == "convnext-tiny":
440 |     model = models.convnext_tiny(pretrained=True)
441 |   elif name == "convnext-small":
442 |     model = models.convnext_small(pretrained=True)
443 |   elif name == "convnext-base":
444 |     model = models.convnext_base(pretrained=True)
445 |   elif name == "convnext-large":
446 |     model = models.convnext_large(pretrained=True)
447 |   elif name == "toy":
448 |     model = Net()
449 |   elif name == "memopt":
450 |     model = NetMemOptTest()
451 |   else:
452 |     raise Exception(f"Unsupported model: {name}")
453 |   return model
454 | 
455 | def get_random_input(name):
456 |   x = torch.randn(1, 3, 224, 224).cuda()
457 |   if name == "efficientnet-v1-b6":
458 |     x = torch.randn(1, 3, 528, 528).cuda()
459 |   elif name == "inception-v3":
460 |     x = torch.randn(1, 3, 299, 299).cuda()
461 |   elif name in ["toy", "memopt"]:
462 |     x = torch.randn(1, 3, 16, 16).cuda()
463 |   return x
464 | 
465 | def parse_kernel_number(l):
466 |   return int(l.split("-")[1].split(".")[0])
467 | 
468 | def get_kernel_start_and_end(trace_path, n_run=3):
469 |   start = None
470 | 
471 |   # start, end
472 |   runs = []
473 |   skip = True # skip first interval
474 |   with open(f"./{trace_path}/stats.csv") as f:
475 |     lines = f.readlines()
476 | 
477 |   for i, l in enumerate(lines):
478 |     if i == 0:
479 |       continue
480 | 
481 |     l = l.split(",")
482 | 
483 |     n = parse_kernel_number(l[0].strip())
484 |     name = l[1].strip()
485 | 
486 |     if name.find("forward_kernel_cuda_start") != -1:
487 |       # if skip:
488 |       #   skip = False
489 |       #   continue
490 | 
491 | 
start = n 492 | 493 | if start is not None and name.find("forward_kernel_cuda_end") != -1: 494 | runs.append((start, n)) 495 | 496 | intv = runs[0][1] - runs[0][0] 497 | for s, t in runs: 498 | assert intv == t - s 499 | 500 | assert len(runs) == n_run 501 | 502 | return (runs[2][0]+1, runs[2][1]-1) 503 | -------------------------------------------------------------------------------- /pimflow: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | from __future__ import absolute_import 3 | 4 | from pim.util import * 5 | from pim.transform import * 6 | 7 | import onnx 8 | import torch 9 | import argparse 10 | import os 11 | import csv 12 | import tvm 13 | import tvm.relay as relay 14 | from tvm.contrib import graph_executor 15 | from torch.utils.cpp_extension import load 16 | import pickle 17 | import csv 18 | import pandas as pd 19 | 20 | def parse_arguments(): 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument("-m", "--mode", help="mode", choices=["profile", "solve", "run", "stat", "transform", "build", "trace", "trace_opt", "trace_gpu_only", "run_opt", "run_gpu_only"], required=True) 23 | parser.add_argument("-t", "--transform", help="graph transformation", choices=["split", "pipeline"]) 24 | parser.add_argument("-n", "--network", help="target network", choices=MODEL_LIST, required=True) 25 | parser.add_argument("--gpu_only", action="store_true", help="execute only on GPU") 26 | parser.add_argument("--conv_only", action="store_true", help="execute only convolution layers") 27 | parser.add_argument("--layerwise", action="store_true", help="layerwise performance breakdown") 28 | parser.add_argument("--split_ratio", action="store_true", help="distribution of MD-DP splitting ratios") 29 | parser.add_argument("--trace", action="store_true", help="only trace") 30 | parser.add_argument("--accel_sim_gpu", choices=["SM75_RTX2060"], default="SM75_RTX2060") 31 | parser.add_argument("--accel_sim_kernel_launch_latency", type=int, default=5010) 32 | parser.add_argument("--accel_sim_n_channel", type=int, default=16) 33 | parser.add_argument("--policy", choices=["None", "Newton+", "Newton++", "MDDP", "Pipeline", "PIMFlow"], default="PIMFlow") 34 | parser.add_argument("--pipeline_stage", type=int, default=2) 35 | args = parser.parse_args() 36 | 37 | if args.mode == "profile" and (args.transform is None): 38 | parser.error("-m/--mode requires -t/--transform") 39 | 40 | return args 41 | 42 | def make_model(network): 43 | model = get_torch_model(network) 44 | model.cuda() 45 | model.half() 46 | model.eval() 47 | 48 | x = get_random_input(network) 49 | x = x.half() 50 | 51 | # Export the model 52 | torch.onnx.export(model, # model being run 53 | x, # model input (or a tuple for multiple inputs) 54 | f"{network}.onnx", # where to save the model (can be a file or file-like object) 55 | export_params=True, # store the trained parameter weights inside the model file 56 | opset_version=13, # the ONNX version to export the model to 57 | do_constant_folding=True, # whether to execute constant folding for optimization 58 | # training=TrainingMode.TRAINING, 59 | input_names = ['input'], # the model's input names 60 | output_names = ['output']) # the model's output names 61 | # dynamic_axes={'input' : {0 : 'batch_size'}, # variable length axes 62 | # 'output' : {0 : 'batch_size'}}) 63 | 64 | onnx_model = onnx.load(f"{network}.onnx") 65 | onnx.checker.check_model(onnx_model) 66 | 67 | # infer shapes & preprocess 68 | onnx_model = 
onnx.shape_inference.infer_shapes(onnx_model) 69 | preprocess(onnx_model) 70 | return onnx_model 71 | 72 | def profile(args): 73 | if args.transform == "split" and args.network in ["bert-large-1x64", "bert-large-1x32", "bert-large-1x3"]: 74 | os.system(f"cd layerwise && ./ALL_matmul.sh {args.network} {args.accel_sim_gpu} {args.accel_sim_kernel_launch_latency} {args.accel_sim_n_channel}") 75 | elif args.transform == "split": 76 | os.system(f"cd layerwise && ./profile.sh {args.network} {args.accel_sim_gpu} {args.accel_sim_kernel_launch_latency} {args.accel_sim_n_channel}") 77 | elif args.transform == "pipeline": 78 | os.system(f"cd pipeline && ./profile.sh {args.network} {args.accel_sim_gpu} {args.accel_sim_kernel_launch_latency} {args.accel_sim_n_channel} {args.pipeline_stage}") 79 | 80 | def solve(args): 81 | os.system(f"cd solve && ./solve.sh {args.network} {args.accel_sim_n_channel}") 82 | 83 | def stat_conv_only(args): 84 | os.system(f"cd solve && ./stat.sh {args.network} {args.accel_sim_n_channel} {args.pipeline_stage}") 85 | 86 | def stat_layerwise(args): 87 | newtonp = f"layerwise/newton_performance_{args.network}_{args.accel_sim_n_channel}_1_noopt.csv" 88 | mddp = f"layerwise/newton_performance_{args.network}_{args.accel_sim_n_channel}_4.csv" 89 | 90 | if not os.path.exists(newtonp) or not os.path.exists(mddp): 91 | print(f"Error: Run this first: ./pimflow -m=profile -t=split -n={args.network} --accel_sim_n_channel={args.accel_sim_n_channel}") 92 | return 93 | 94 | newtonp = pd.read_csv(newtonp) 95 | mddp = pd.read_csv(mddp) 96 | 97 | NEWTONP = list(filter(lambda row: int(row[8]) == 1, [list(row) for row in newtonp.values])) 98 | MDDP = list(filter(lambda row: int(row[8]) == 1, [list(row) for row in mddp.values])) 99 | 100 | n = len(NEWTONP) 101 | assert n == len(MDDP) 102 | 103 | RESULT = [] 104 | for i in range(n): 105 | RESULT.append([NEWTONP[i][16], MDDP[i][16], MDDP[i][17]]) 106 | 107 | with open(f"{args.network}_layerwise.csv", "w") as f: 108 | write = csv.writer(f) 109 | write.writerows(RESULT) 110 | 111 | def stat_split_ratio(args): 112 | networks = ["efficientnet-v1-b0", "mobilenet-v2", "mnasnet-1.0", "resnet-50", "vgg-16"] 113 | 114 | RESULT = [0] * 11 115 | RESULT_R = [0] * 11 116 | for network in networks: 117 | split = f"layerwise/max_performance_{network}_{args.accel_sim_n_channel}_4.csv" 118 | 119 | if not os.path.exists(split): 120 | print(f"Error: Run this first: ./pimflow -m=profile -t=split -n={network} --accel_sim_n_channel={args.accel_sim_n_channel}") 121 | continue 122 | 123 | split = pd.read_csv(split) 124 | SPLIT = list(filter(lambda row: int(row[8]) == 1, [list(row) for row in split.values])) 125 | 126 | for row in SPLIT: 127 | assert int(row[15]) % 10 == 0 128 | RESULT[int(row[15]) // 10] += 1 129 | 130 | tot = sum(RESULT) 131 | for i, r in enumerate(RESULT): 132 | RESULT_R[i] = round(r / tot * 100) 133 | 134 | print(RESULT) 135 | print(RESULT_R) 136 | 137 | def get_gpu_cycle(path): 138 | cycle = 0 139 | with open(path) as f: 140 | for line in f: 141 | if "gpu_tot_sim_cycle" in line: 142 | cycle = int(line.split("=")[1].strip()) 143 | return cycle 144 | 145 | def get_pim_cycle(path): 146 | cycle = 0 147 | # scale = 1.605882353 # HBM 148 | scale = 1.56 # GDDR6 149 | with open(path) as f: 150 | for line in f: 151 | if "Cycle" in line: 152 | cycle = scale * int(line.split(" ")[1].strip()) 153 | return cycle 154 | 155 | def stat(args, gpu_only=False): 156 | postfix = f"{args.policy}" 157 | if gpu_only: 158 | postfix = "org" 159 | other_cycle = 0 160 | par_cycle 
= 0 161 | other_cycle = get_gpu_cycle(f"traces-{args.network}-{args.accel_sim_n_channel}-{postfix}/sim.txt") 162 | i = 0 163 | while not gpu_only and True: 164 | found = False 165 | gpu_cycle = 0 166 | pim_cycle = 0 167 | gpu_file = f"traces-{args.network}-{args.accel_sim_n_channel}-{postfix}/sim.{i}.gpu.txt" 168 | pim_file = f"traces-{args.network}-{args.accel_sim_n_channel}-{postfix}/sim.{i}.pim.txt" 169 | if os.path.exists(gpu_file): 170 | gpu_cycle = get_gpu_cycle(gpu_file) 171 | found = True 172 | if os.path.exists(pim_file): 173 | pim_cycle = get_pim_cycle(pim_file) 174 | found = True 175 | cycle = max(gpu_cycle, pim_cycle) 176 | par_cycle += cycle 177 | i += 1 178 | if not found: 179 | break 180 | # print(other_cycle, par_cycle) 181 | return other_cycle + par_cycle 182 | 183 | def extract_profiled_trace(model: str): 184 | if not os.path.exists(model) and os.path.exists(f"./data/{model}.tar.gz"): 185 | os.system(f"tar -xzf ./data/{model}.tar.gz -C .") 186 | 187 | def transform_graph(args): 188 | split = {} 189 | pipeline = [] 190 | 191 | n_gwrite = 4 192 | postfix = "" 193 | if args.policy == "Newton+": 194 | n_gwrite = 1 195 | postfix = "_noopt" 196 | 197 | with open(f'./{args.network}/{args.policy}/{n_gwrite}/solve_{args.network}_{args.policy}_{n_gwrite}{postfix}.csv', newline='') as csvfile: 198 | reader = csv.reader(csvfile, delimiter=',') 199 | for row in reader: 200 | if row[1] == "split": 201 | split[row[0]] = int(row[2]) 202 | elif row[2] == "pipeline": 203 | pipeline.append({'nodes': row[:2], 'is_gpu_first': int(row[3]) != 1}) 204 | elif row[3] == "pipeline": 205 | pipeline.append({'nodes': row[:3], 'is_gpu_first': False}) 206 | else: 207 | raise Exception("Must NOT reach here!") 208 | 209 | if not os.path.exists(f"{args.network}.onnx"): 210 | onnx_model = make_model(args.network) 211 | else: 212 | onnx_model = onnx.load(f"{args.network}.onnx") 213 | onnx.checker.check_model(onnx_model) 214 | onnx_model = onnx.shape_inference.infer_shapes(onnx_model) 215 | 216 | node_map = {} 217 | onnx_model = InputSplit(-1, split, node_map=node_map).transform(onnx_model) 218 | for kv in pipeline: 219 | onnx_model = Pipeline(node_map=node_map).transform(onnx_model, kv['nodes'], stage=2, is_gpu_first=kv['is_gpu_first']) 220 | onnx_model = OffloadFC().transform(onnx_model) 221 | onnx.save(onnx_model, f"{args.network}_{args.accel_sim_n_channel}_{args.policy}_transformed_opt.onnx") 222 | 223 | return node_map 224 | 225 | def partition_graph(args): 226 | onnx_model = onnx.load(f"{args.network}_{args.accel_sim_n_channel}_{args.policy}_transformed_opt.onnx") 227 | graph = onnx_model.graph 228 | input_node = onnx_model.graph.node[0] 229 | shape_dict = {"input": get_arg_shape(graph, input_node, input_node.input[0])} 230 | mod, params = relay.frontend.from_onnx(onnx_model, shape_dict) 231 | 232 | # tvm/python/tvm/relay/op/strategy/cuda.py 233 | desired_layouts = { 234 | 'nn.conv2d': ['NHWC', 'OHWI'], 235 | 'nn.max_pool2d': ['NHWC'], 236 | 'nn.global_avg_pool2d': ['NHWC'], 237 | } 238 | seq = tvm.transform.Sequential([relay.transform.RemoveUnusedFunctions(), 239 | relay.transform.ConvertLayout(desired_layouts), 240 | relay.transform.FoldConstant(), 241 | # relay.transform.OptimizeMemory(), 242 | relay.transform.RemoveUnusedFunctions(), 243 | ]) 244 | with tvm.transform.PassContext(opt_level=3): 245 | mod = seq(mod) 246 | 247 | from tvm.relay.op.contrib.pim import partition_for_pim 248 | from tvm.contrib.pim import build_pim_kernels 249 | 250 | mod = partition_for_pim(mod) 251 | 252 | target = "cuda 
-libs=cudnn,cublas" 253 | with tvm.transform.PassContext(opt_level=2): 254 | lib = relay.build(mod, target, params=params) 255 | os.system("mkdir -p ./tmp") 256 | lib = build_pim_kernels(lib, "./tmp", f"compile-{args.network}-{args.accel_sim_n_channel}-{args.policy}.so") 257 | 258 | return lib 259 | 260 | def load_module(args): 261 | dev=tvm.cuda(0) 262 | module = None 263 | # if not args.policy == "None": 264 | # lib = tvm.runtime.load_module(f"compile-{args.network}-{args.accel_sim_n_channel}-{args.policy}.so") 265 | # module = graph_executor.GraphModule(lib["default"](dev)) 266 | with open(f"input.pkl","rb") as f: 267 | x = pickle.load(f) 268 | 269 | return module, x 270 | 271 | def check_input(network): 272 | if not os.path.exists("input.pkl"): 273 | x = get_random_input(network).half() 274 | with open(f"input.pkl","wb") as f: 275 | pickle.dump(x, f) 276 | 277 | def trace(args, mode): 278 | check_input(args.network) 279 | 280 | if mode == "run_opt": 281 | postfix = "" 282 | policy = args.policy 283 | elif mode == "run_gpu_only": 284 | postfix = "org" 285 | policy = "" 286 | else: 287 | raise Exception("Must NOT reach here!") 288 | 289 | dev = 0 290 | os.environ["TVM_USE_SIMULATOR"] = "" 291 | os.environ["CUDA_VISIBLE_DEVICES"] = f"{dev}" 292 | 293 | # generate trace for optimal solution 294 | os.environ["TRACES_PATH"] = f"traces-{args.network}-{args.accel_sim_n_channel}-{policy}{postfix}" 295 | os.environ["DYNAMIC_KERNEL_LIMIT_START"] = "1000000000" 296 | os.system(f"LD_PRELOAD=/root/PIMFlow_accel-sim-framework/util/tracer_nvbit/tracer_tool/tracer_tool.so ./pimflow -m={mode} --trace -n={args.network} --accel_sim_n_channel={args.accel_sim_n_channel} --policy={args.policy}") 297 | 298 | kernel_start, kernel_end = get_kernel_start_and_end(f"traces-{args.network}-{args.accel_sim_n_channel}-{policy}{postfix}") 299 | os.environ["DYNAMIC_KERNEL_LIMIT_START"] = f"{kernel_start}" 300 | os.environ["DYNAMIC_KERNEL_LIMIT_END"] = f"{kernel_end}" 301 | os.system(f"LD_PRELOAD=/root/PIMFlow_accel-sim-framework/util/tracer_nvbit/tracer_tool/tracer_tool.so ./pimflow -m={mode} --trace -n={args.network} --accel_sim_n_channel={args.accel_sim_n_channel} --policy={args.policy}") 302 | os.system(f"/root/PIMFlow_accel-sim-framework/util/tracer_nvbit/tracer_tool/traces-processing/post-traces-processing ./traces-{args.network}-{args.accel_sim_n_channel}-{policy}{postfix}/kernelslist") 303 | 304 | def run_opt(args, n=1): 305 | # module, x = load_module(network) 306 | 307 | # dev = tvm.cuda(0) 308 | # dtype = "float16" 309 | # module.set_input(**{"input" : tvm.nd.array(to_numpy(x).astype(dtype), device=dev)}) 310 | 311 | # marker = load(name="marker", sources = ["./marker/marker_cuda.cpp", "./marker/marker_cuda_kernel.cu"]) 312 | 313 | # for _ in range(n): 314 | # marker.forward(True) 315 | # module.run() 316 | # marker.forward(False) 317 | 318 | _, x = load_module(args) 319 | 320 | onnx_model = onnx.load(f"{args.network}_{args.accel_sim_n_channel}_{args.policy}_transformed_opt.onnx") 321 | graph = onnx_model.graph 322 | input_node = onnx_model.graph.node[0] 323 | shape_dict = {"input": get_arg_shape(graph, input_node, input_node.input[0])} 324 | mod, params = relay.frontend.from_onnx(onnx_model, shape_dict) 325 | 326 | # tvm/python/tvm/relay/op/strategy/cuda.py 327 | desired_layouts = { 328 | 'nn.conv2d': ['NHWC', 'OHWI'], 329 | 'nn.max_pool2d': ['NHWC'], 330 | 'nn.global_avg_pool2d': ['NHWC'], 331 | } 332 | seq = tvm.transform.Sequential([relay.transform.RemoveUnusedFunctions(), 333 | 
relay.transform.ConvertLayout(desired_layouts), 334 | relay.transform.FoldConstant(), 335 | # relay.transform.OptimizeMemory(), 336 | relay.transform.RemoveUnusedFunctions(), 337 | ]) 338 | with tvm.transform.PassContext(opt_level=3): 339 | mod = seq(mod) 340 | 341 | from tvm.relay.op.contrib.pim import partition_for_pim 342 | from tvm.contrib.pim import build_pim_kernels 343 | 344 | mod = partition_for_pim(mod) 345 | 346 | target = "cuda -libs=cudnn,cublas" 347 | with tvm.transform.PassContext(opt_level=2): 348 | lib = relay.build(mod, target, params=params) 349 | os.system("mkdir -p ./tmp") 350 | lib = build_pim_kernels(lib, "./tmp", f"compile-{args.network}-{args.accel_sim_n_channel}-{args.policy}.so") 351 | 352 | dev = tvm.cuda(0) 353 | module = graph_executor.GraphModule(lib["default"](dev)) 354 | dtype = "float16" 355 | module.set_input(**{"input" : tvm.nd.array(to_numpy(x).astype(dtype), device=dev)}) 356 | 357 | marker = load(name="marker", sources = ["./marker/marker_cuda.cpp", "./marker/marker_cuda_kernel.cu"]) 358 | 359 | for _ in range(n): 360 | marker.forward(True) 361 | module.run() 362 | marker.forward(False) 363 | 364 | def run_gpu_only(args, n=1): 365 | _, x = load_module(args) 366 | 367 | if not os.path.exists(f"{args.network}.onnx"): 368 | onnx_model = make_model(args.network) 369 | else: 370 | onnx_model = onnx.load(f"{args.network}.onnx") 371 | 372 | shape_dict = {"input": x.shape} 373 | mod, params = relay.frontend.from_onnx(onnx_model, shape_dict) 374 | 375 | # tvm/python/tvm/relay/op/strategy/cuda.py 376 | desired_layouts = { 377 | 'nn.conv2d': ['NHWC', 'OHWI'], 378 | 'nn.max_pool2d': ['NHWC'], 379 | 'nn.global_avg_pool2d': ['NHWC'], 380 | } 381 | seq = tvm.transform.Sequential([relay.transform.RemoveUnusedFunctions(), 382 | relay.transform.ConvertLayout(desired_layouts), 383 | relay.transform.FoldConstant(), 384 | # relay.transform.OptimizeMemory(), 385 | relay.transform.RemoveUnusedFunctions(), 386 | ]) 387 | with tvm.transform.PassContext(opt_level=3): 388 | mod = seq(mod) 389 | 390 | target = "cuda -libs=cudnn,cublas" 391 | with tvm.transform.PassContext(opt_level=2): 392 | lib = relay.build(mod, target, params=params) 393 | 394 | dev = tvm.cuda(0) 395 | module = graph_executor.GraphModule(lib["default"](dev)) 396 | dtype = "float16" 397 | module.set_input(**{"input" : tvm.nd.array(to_numpy(x).astype(dtype), device=dev)}) 398 | 399 | marker = load(name="marker", sources = ["./marker/marker_cuda.cpp", "./marker/marker_cuda_kernel.cu"]) 400 | 401 | for _ in range(n): 402 | marker.forward(True) 403 | module.run() 404 | marker.forward(False) 405 | 406 | def set_envs(args): 407 | if args.policy == "Newton+": 408 | os.environ["RAMULATOR_DISABLE_GWRITE_LATENCY_HIDING"] = "1" 409 | os.environ["PIMFLOW_POLICY"] = args.policy 410 | os.environ["PIMFLOW_N_CHANNEL"] = f"{args.accel_sim_n_channel}" 411 | 412 | if __name__ == '__main__': 413 | args = parse_arguments() 414 | set_envs(args) 415 | if args.network not in ["bert-large-1x64", "bert-large-1x32", "bert-large-1x3"]: 416 | make_model(args.network) 417 | 418 | if args.mode == "profile": 419 | profile(args) 420 | elif args.mode == "solve": 421 | solve(args) 422 | elif args.mode == "transform": 423 | extract_profiled_trace(args.network) 424 | node_map = transform_graph(args) 425 | with open(f"{args.network}_{args.accel_sim_n_channel}_{args.policy}_node_map.txt", "w") as f: 426 | for k, v in node_map.items(): 427 | f.write(f"{k},{','.join(v)}\n") 428 | elif args.mode == "build": 429 | _ = partition_graph(args) 430 | elif 
args.mode == "trace_opt": 431 | trace(args, "run_opt") 432 | elif args.mode == "trace_gpu_only": 433 | trace(args, "run_gpu_only") 434 | elif args.mode == "trace": 435 | os.system(f"./pimflow -m=trace_opt -n={args.network} --accel_sim_n_channel={args.accel_sim_n_channel} --policy={args.policy}") 436 | os.system(f"./pimflow -m=trace_gpu_only -n={args.network} --accel_sim_n_channel={args.accel_sim_n_channel} --policy={args.policy}") 437 | elif args.mode == "run_opt": 438 | if not args.trace: 439 | os.environ["TVM_USE_SIMULATOR"] = "1" 440 | os.environ["TVM_TRACES_PATH"] = f"traces-{args.network}-{args.accel_sim_n_channel}-{args.policy}/" 441 | os.environ["TVM_NETWORK"] = f"{args.network}" 442 | run_opt(args, 1) 443 | else: 444 | run_opt(args, 3) 445 | elif args.mode == "run_gpu_only": 446 | if not args.trace: 447 | os.environ["TVM_USE_SIMULATOR"] = "1" 448 | os.environ["TVM_TRACES_PATH"] = f"traces-{args.network}-{args.accel_sim_n_channel}-org/" 449 | os.environ["TVM_NETWORK"] = f"{args.network}" 450 | run_gpu_only(args, 1) 451 | else: 452 | run_gpu_only(args, 3) 453 | elif args.mode == "run": 454 | if not args.gpu_only: 455 | os.system(f"./pimflow -m=transform -n={args.network} --accel_sim_n_channel={args.accel_sim_n_channel} --policy={args.policy}") 456 | os.system(f"./pimflow -m=build -n={args.network} --accel_sim_n_channel={args.accel_sim_n_channel} --policy={args.policy}") 457 | 458 | if args.gpu_only: 459 | os.system(f"./pimflow -m=trace_gpu_only -n={args.network} --accel_sim_n_channel={args.accel_sim_n_channel} --policy={args.policy}") 460 | os.system(f"./pimflow -m=run_gpu_only -n={args.network} --accel_sim_n_channel={args.accel_sim_n_channel} --policy={args.policy}") 461 | else: 462 | os.system(f"./pimflow -m=trace_opt -n={args.network} --accel_sim_n_channel={args.accel_sim_n_channel} --policy={args.policy}") 463 | os.system(f"./pimflow -m=run_opt -n={args.network} --accel_sim_n_channel={args.accel_sim_n_channel} --policy={args.policy}") 464 | 465 | elif args.mode == "stat": 466 | if args.layerwise: 467 | stat_layerwise(args) 468 | elif args.split_ratio: 469 | stat_split_ratio(args) 470 | elif args.conv_only: 471 | stat_conv_only(args) 472 | else: 473 | gpu_cycle = stat(args, gpu_only=True) 474 | policy_cycle = stat(args) 475 | print(f"GPU CYCLE: {gpu_cycle}") 476 | print(f"{args.policy} CYCLE: {policy_cycle}") 477 | print(f"{args.policy} SPEEDUP: {round(gpu_cycle / policy_cycle, 3)}") 478 | else: 479 | raise Exception("Must NOT reach here!") 480 | -------------------------------------------------------------------------------- /pipeline/clean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MODEL=$1 3 | SPLIT=$2 4 | N_CHANNEL=$3 5 | 6 | mkdir -p result_simulate/$MODEL/${SPLIT}_${N_CHANNEL} 7 | mv trace*.txt result_simulate/$MODEL/${SPLIT}_${N_CHANNEL} 8 | mv *-matmul result_simulate/$MODEL/${SPLIT}_${N_CHANNEL} 9 | mv Conv_* result_simulate/$MODEL/${SPLIT}_${N_CHANNEL} 10 | mv Gemm_* result_simulate/$MODEL/${SPLIT}_${N_CHANNEL} 11 | mv accelwattch_power_report_*.log result_simulate/$MODEL/${SPLIT}_${N_CHANNEL} 12 | mv traces-* result_simulate/$MODEL/${SPLIT}_${N_CHANNEL} 13 | rm -r traces-* 14 | rm -r Conv_* 15 | rm -r tmp-* 16 | rm -r Gemm_* 17 | rm compile-*.so 18 | rm layer-*.onnx 19 | rm MatMul_*.onnx 20 | -------------------------------------------------------------------------------- /pipeline/inspect: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | parser = 
argparse.ArgumentParser() 4 | parser.add_argument("--path", help="trace path from accelsim_frontend/util/tracer_nvbit", required=True) 5 | parser.add_argument("--iteration", type=int, default=3, help="iteration") 6 | args = parser.parse_args() 7 | 8 | TRACE_PATH=args.path 9 | 10 | START=None 11 | 12 | def parse_kernel_number(l): 13 | return int(l.split("-")[1].split(".")[0]) 14 | 15 | # start, end 16 | r = [] 17 | skip = True # skip first interval 18 | with open(f"./{TRACE_PATH}/stats.csv") as f: 19 | lines = f.readlines() 20 | 21 | for i, l in enumerate(lines): 22 | if i == 0: 23 | continue 24 | 25 | l = l.split(",") 26 | 27 | n = parse_kernel_number(l[0].strip()) 28 | name = l[1].strip() 29 | 30 | if name.find("forward_kernel_cuda_start") != -1: 31 | # if skip: 32 | # skip = False 33 | # continue 34 | 35 | START = n 36 | 37 | if START is not None and name.find("forward_kernel_cuda_end") != -1: 38 | r.append((START, n)) 39 | 40 | 41 | intv = r[0][1] - r[0][0] 42 | for s, t in r: 43 | assert intv == t - s 44 | 45 | assert len(r) == args.iteration 46 | 47 | print(r[2][0]+1, r[2][1]-1) 48 | 49 | -------------------------------------------------------------------------------- /pipeline/layerwise.py: -------------------------------------------------------------------------------- 1 | from numpy import clip 2 | import torch 3 | import argparse 4 | import os 5 | import onnx 6 | import tvm 7 | import tvm.relay as relay 8 | from tvm.contrib import graph_executor 9 | from torch.utils.cpp_extension import load 10 | 11 | import argparse 12 | 13 | class Range(object): 14 | def __init__(self, start, end): 15 | self.start = start 16 | self.end = end 17 | def __eq__(self, other): 18 | return self.start <= other <= self.end 19 | 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument("--ic", type=int, required=True) 22 | parser.add_argument("--oc", type=int, required=True) 23 | parser.add_argument("--h", type=int, required=True) 24 | parser.add_argument("--w", type=int, required=True) 25 | parser.add_argument("--kh", type=int, required=True) 26 | parser.add_argument("--kw", type=int, required=True) 27 | parser.add_argument("--stride", type=int) 28 | parser.add_argument("--ph", type=int, required=True) 29 | parser.add_argument("--pw", type=int, required=True) 30 | parser.add_argument("--dilate", type=int, required=True) 31 | parser.add_argument("--g", type=int, required=True) 32 | parser.add_argument("--b", action="store_true", default=False) 33 | parser.add_argument("--dev", type=int, required=True) 34 | parser.add_argument("--activation", required=True) 35 | args = parser.parse_args() 36 | 37 | marker = load(name="marker", sources = ["/root/PIMFlow/marker/marker_cuda.cpp", "/root/PIMFlow/marker/marker_cuda_kernel.cu"]) 38 | 39 | class Net(torch.nn.Module): 40 | def __init__(self): 41 | super(Net, self).__init__() 42 | self.conv = torch.nn.Conv2d(args.ic, args.oc, (args.kh, args.kw), args.stride, (args.ph, args.pw), args.dilate, args.g, args.b) 43 | 44 | def forward(self, x): 45 | x = self.conv(x) 46 | return x 47 | class NetElemwiseAct(torch.nn.Module): 48 | def __init__(self, type): 49 | super(NetElemwiseAct, self).__init__() 50 | self.conv = torch.nn.Conv2d(args.ic, args.oc, (args.kh, args.kw), args.stride, (args.ph, args.pw), args.dilate, args.g, args.b) 51 | if type == "HardSigmoid": 52 | self.act = torch.nn.Hardsigmoid() 53 | elif type == "Sigmoid": 54 | self.act = torch.nn.Sigmoid() 55 | elif type == "Relu": 56 | self.act = torch.nn.ReLU() 57 | else: 58 | raise Exception(f"Unknown 
activation: {type}") 59 | 60 | def forward(self, x): 61 | x = self.conv(x) 62 | x = self.act(x) 63 | return x 64 | 65 | class NetSiLU(torch.nn.Module): 66 | def __init__(self): 67 | super(NetSiLU, self).__init__() 68 | self.conv = torch.nn.Conv2d(args.ic, args.oc, (args.kh, args.kw), args.stride, (args.ph, args.pw), args.dilate, args.g, args.b) 69 | 70 | def forward(self, x): 71 | x = self.conv(x) 72 | x2 = torch.sigmoid(x) 73 | x = torch.mul(x, x2) 74 | return x 75 | 76 | class NetHardSiLU(torch.nn.Module): 77 | def __init__(self): 78 | super(NetHardSiLU, self).__init__() 79 | self.conv = torch.nn.Conv2d(args.ic, args.oc, (args.kh, args.kw), args.stride, (args.ph, args.pw), args.dilate, args.g, args.b) 80 | self.hardsigmoid = torch.nn.Hardsigmoid() 81 | 82 | def forward(self, x): 83 | x = self.conv(x) 84 | x2 = self.hardsigmoid(x) 85 | x = torch.mul(x, x2) 86 | return x 87 | class NetClip(torch.nn.Module): 88 | def __init__(self, min, max): 89 | super(NetClip, self).__init__() 90 | self.conv = torch.nn.Conv2d(args.ic, args.oc, (args.kh, args.kw), args.stride, (args.ph, args.pw), args.dilate, args.g, args.b) 91 | self.min = min 92 | self.max = max 93 | 94 | def forward(self, x): 95 | x = self.conv(x) 96 | x = torch.clip(x, min=self.min, max=self.max) 97 | return x 98 | 99 | if args.activation == "SiLU": 100 | model = NetSiLU() 101 | elif args.activation == "HardSiLU": 102 | model = NetHardSiLU() 103 | elif args.activation in ["HardSigmoid", "Sigmoid", "Relu"]: 104 | model = NetElemwiseAct(type=args.activation) 105 | elif args.activation == "Clip": 106 | model = NetClip(min=0, max=6) 107 | elif args.activation == "Identity": 108 | model = Net() 109 | else: 110 | raise Exception("Not supported activation!") 111 | 112 | print(torch.cuda.device_count()) 113 | model.cuda() 114 | model.eval() 115 | model.half() 116 | x = torch.randn(1, args.ic, args.h, args.w).cuda().half() 117 | 118 | torch_out = model(x) 119 | 120 | # Export the model 121 | torch.onnx.export(model, # model being run 122 | x, # model input (or a tuple for multiple inputs) 123 | f"layer-{os.getpid()}.onnx", # where to save the model (can be a file or file-like object) 124 | export_params=True, # store the trained parameter weights inside the model file 125 | opset_version=11, # the ONNX version to export the model to 126 | do_constant_folding=True, # whether to execute constant folding for optimization 127 | # training=TrainingMode.TRAINING, 128 | input_names = ['input'], # the model's input names 129 | output_names = ['output']) # the model's output names 130 | # dynamic_axes={'input' : {0 : 'batch_size'}, # variable length axes 131 | # 'output' : {0 : 'batch_size'}}) 132 | 133 | onnx_model = onnx.load(f"layer-{os.getpid()}.onnx") 134 | onnx.checker.check_model(onnx_model) 135 | 136 | def to_numpy(tensor): 137 | return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy() 138 | 139 | shape_dict = {"input": x.shape} 140 | mod, params = relay.frontend.from_onnx(onnx_model, shape_dict) 141 | desired_layouts = { 142 | 'nn.conv2d': ['NHWC', 'OHWI'], 143 | 'nn.max_pool2d': ['NHWC'], 144 | 'nn.global_avg_pool2d': ['NHWC'], 145 | } 146 | seq = tvm.transform.Sequential([relay.transform.RemoveUnusedFunctions(), 147 | relay.transform.ConvertLayout(desired_layouts), 148 | relay.transform.FoldConstant()]) 149 | with tvm.transform.PassContext(opt_level=3): 150 | mod = seq(mod) 151 | 152 | # debug 153 | # print(mod) 154 | 155 | from tvm.relay.op.contrib.pim import partition_for_pim 156 | from tvm.contrib.pim import 
build_pim_kernels 157 | mod = partition_for_pim(mod) 158 | 159 | # debug 160 | # print(mod) 161 | 162 | target = "cuda -libs=cudnn,cublas" 163 | with tvm.transform.PassContext(opt_level=2): 164 | lib = relay.build(mod, target, params=params) 165 | os.system(f"mkdir -p tmp-{os.getpid()}") 166 | lib = build_pim_kernels(lib, f"./tmp-{os.getpid()}", f"compile-{os.getpid()}.so") 167 | dev = tvm.cuda(0) 168 | module = graph_executor.GraphModule(lib["default"](dev)) 169 | module.set_input(**{"input" : tvm.nd.array(to_numpy(x).astype("float16"), device=dev)}) 170 | 171 | for i in range(3): 172 | marker.forward(True) 173 | module.run() 174 | marker.forward(False) 175 | 176 | print("FINISH!!!") 177 | -------------------------------------------------------------------------------- /pipeline/pim_codegen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | NAME=$1 3 | OUT_CHANNELS=$2 4 | IN_CHANNELS=$3 5 | KH=$4 6 | KW=$5 7 | STRIDE=$6 8 | PH=$7 9 | PW=$8 10 | DILATE=$9 11 | GROUP=${10} 12 | BIAS=${11} 13 | IMAGE_HEIGHT=${12} 14 | IMAGE_WIDTH=${13} 15 | N_CHANNEL=${14} 16 | N_GWRITE=${15} 17 | 18 | ../pim/pim_codegen -oc $OUT_CHANNELS -ic $IN_CHANNELS -h $IMAGE_HEIGHT -w $IMAGE_WIDTH -kh $KH -kw $KW -ph $PH -pw $PW -stride $STRIDE -name PIM_trace_partition_${N_CHANNEL}_${N_GWRITE} -n_channel $N_CHANNEL -gw $N_GWRITE 19 | 20 | mkdir -p $NAME 21 | 22 | for i in ./PIM_trace_partition_${N_CHANNEL}_${N_GWRITE}-*.pim; do 23 | mv $i $NAME 24 | done 25 | -------------------------------------------------------------------------------- /pipeline/profile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MODEL=$1 3 | GPU=$2 4 | KERNEL_LAUNCH_LATENCY=$3 5 | N_CHANNEL=$4 6 | STAGE=$5 7 | 8 | for (( i = 1; i < 4; i = i + 1)) ; do 9 | python3 extract_layers.py --model=$MODEL --n_channel=$N_CHANNEL --stage=$STAGE 10 | python3 run --trace --gpgpusim_config=$GPU --model=$MODEL --pipeline=$i --kernel_launch_latency=$KERNEL_LAUNCH_LATENCY --n_channel=$N_CHANNEL 11 | python3 run --simulate --gpgpusim_config=$GPU --model=$MODEL --pipeline=$i --kernel_launch_latency=$KERNEL_LAUNCH_LATENCY --n_channel=$N_CHANNEL 12 | python3 run --pim_codegen --model=$MODEL --pipeline=$i --n_channel=$N_CHANNEL --n_gwrite=1 13 | python3 run --pim_codegen --model=$MODEL --pipeline=$i --n_channel=$N_CHANNEL --n_gwrite=2 14 | python3 run --pim_codegen --model=$MODEL --pipeline=$i --n_channel=$N_CHANNEL --n_gwrite=4 15 | python3 run --stat --gpgpusim_config=$GPU --model=$MODEL --pipeline=$i --kernel_launch_latency=$KERNEL_LAUNCH_LATENCY --n_channel=$N_CHANNEL --n_gwrite=1 16 | python3 run --stat --gpgpusim_config=$GPU --model=$MODEL --pipeline=$i --kernel_launch_latency=$KERNEL_LAUNCH_LATENCY --n_channel=$N_CHANNEL --n_gwrite=2 17 | python3 run --stat --gpgpusim_config=$GPU --model=$MODEL --pipeline=$i --kernel_launch_latency=$KERNEL_LAUNCH_LATENCY --n_channel=$N_CHANNEL --n_gwrite=4 18 | sh clean.sh $MODEL $i $N_CHANNEL 19 | done 20 | 21 | python3 to_full_layer.py --model $MODEL --n_gwrite=1 22 | python3 to_full_layer.py --model $MODEL --n_gwrite=1 --ramulator_disable_gwrite_latency_hiding 23 | python3 to_full_layer.py --model $MODEL --n_gwrite=2 24 | python3 to_full_layer.py --model $MODEL --n_gwrite=2 --ramulator_disable_gwrite_latency_hiding 25 | python3 to_full_layer.py --model $MODEL --n_gwrite=4 26 | python3 to_full_layer.py --model $MODEL --n_gwrite=4 --ramulator_disable_gwrite_latency_hiding 27 | 
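The script sweeps every pipeline split point through trace, simulate, PIM codegen, and stat phases (the latter two once per n_gwrite in {1, 2, 4}), then merges the per-split results with to_full_layer.py. A condensed Python sketch of the same sweep, assuming run and clean.sh behave as invoked above:

# Sketch: the per-split sweep performed by pipeline/profile.sh.
import subprocess

def profile(model, gpu, latency, n_channel, stage):
    for split in (1, 2, 3):  # pipeline split points
        base = [f"--model={model}", f"--pipeline={split}", f"--n_channel={n_channel}"]
        sim = [f"--gpgpusim_config={gpu}", f"--kernel_launch_latency={latency}"]
        subprocess.run(["python3", "extract_layers.py", f"--model={model}",
                        f"--n_channel={n_channel}", f"--stage={stage}"], check=True)
        subprocess.run(["python3", "run", "--trace"] + sim + base, check=True)
        subprocess.run(["python3", "run", "--simulate"] + sim + base, check=True)
        for n_gwrite in (1, 2, 4):
            subprocess.run(["python3", "run", "--pim_codegen"] + base +
                           [f"--n_gwrite={n_gwrite}"], check=True)
            subprocess.run(["python3", "run", "--stat"] + sim + base +
                           [f"--n_gwrite={n_gwrite}"], check=True)
        subprocess.run(["sh", "clean.sh", model, str(split), str(n_channel)], check=True)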
-------------------------------------------------------------------------------- /pipeline/run: -------------------------------------------------------------------------------- 1 | from subprocess import Popen 2 | import time 3 | import argparse 4 | import multiprocessing 5 | import os 6 | import numpy as np 7 | import csv 8 | import re 9 | import pandas as pd 10 | from pim.util import MODEL_LIST 11 | from torch.cuda import device_count 12 | parser = argparse.ArgumentParser() 13 | group = parser.add_mutually_exclusive_group(required=True) 14 | group.add_argument("--trace", action="store_true", help="create trace") 15 | group.add_argument("--simulate", action="store_true", help="simulate") 16 | group.add_argument("--pim_codegen", action="store_true", help="pim codegen") 17 | group.add_argument("--stat", action="store_true", help="record statistics") 18 | group.add_argument("--pim", action="store_true", help="pim_cycle") 19 | group.add_argument("--update_pim", action="store_true", help="update pim") 20 | parser.add_argument("--gpgpusim_config", help="gpgpusim config (e.g. SM75_RTX2060") 21 | parser.add_argument("--device_id", type=int, default=0) 22 | parser.add_argument("--pipeline", type=int, default=0) 23 | parser.add_argument("--kernel_launch_latency", type=int, default=5010) 24 | parser.add_argument("--n_channel", type=int, default=16) 25 | parser.add_argument("--n_gwrite", type=int, default=4) 26 | parser.add_argument("--model", help="model", choices=MODEL_LIST, required=True) 27 | args = parser.parse_args() 28 | 29 | if args.trace and (args.gpgpusim_config is None): 30 | parser.error("--trace requires --gpgpusim_config") 31 | if args.simulate and (args.gpgpusim_config is None): 32 | parser.error("--simulate requires --gpgpusim_config") 33 | 34 | NGPU=list(range(device_count())) 35 | NCPU=multiprocessing.cpu_count() 36 | 37 | CONFIG_ALL=[] 38 | CONFIG_GPU=[] 39 | CONFIG_PIM=[] 40 | 41 | filename = f"{args.model}_pipelined{args.pipeline}_{args.n_channel}.onnx_conv.csv" 42 | if not os.path.exists(filename): 43 | exit() 44 | 45 | with open(filename) as f: 46 | rdr = csv.reader(f) 47 | for line in rdr: 48 | CONFIG_ALL.append(line) 49 | if "pim" in line[0]: 50 | CONFIG_PIM.append(line) 51 | else: 52 | CONFIG_GPU.append(line) 53 | 54 | def get_device(n): 55 | return NGPU[n % len(NGPU)] 56 | 57 | def make_args(config, n_mem): 58 | script = "" 59 | if args.trace: 60 | script = "trace.sh" 61 | elif args.simulate: 62 | script = "sim.sh" 63 | elif args.stat: 64 | script = "stat.sh" 65 | 66 | dev = get_device(n_mem) 67 | print(config) 68 | a = f"export CUDA_VISIBLE_DEVICES={dev} && export TRACES_PATH=traces-{config[0]} && echo $CUDA_VISIBLE_DEVICES && ./{script} {config[0]} {config[5]} {config[2]} {config[6][1]} {config[6][3]} {config[8]} {config[7][1]} {config[7][3]} {config[10]} {config[9]} {config[11]} {config[3]} {config[4]} {args.gpgpusim_config} {config[12]} {dev} {args.n_channel}" 69 | print(a) 70 | return a 71 | 72 | def trace(configs): 73 | procs = [] 74 | n = 0 75 | r = 0 76 | for i in range(min(len(NGPU), len(configs))): 77 | procs.append(Popen(make_args(configs[i], i), shell=True, start_new_session=True)) 78 | r += 1 79 | time.sleep(3) 80 | while True: 81 | if n >= len(configs): 82 | break 83 | for i, p in enumerate(procs): 84 | if p is None: 85 | continue 86 | if p.poll() is not None: 87 | if r < len(configs): 88 | procs[i] = Popen(make_args(configs[r], i), shell=True, start_new_session=True) 89 | r += 1 90 | else: 91 | procs[i] = None 92 | n += 1 93 | time.sleep(3) 94 | time.sleep(3) 
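trace() above and simulate() below hand-roll the same bounded worker pool: launch at most one job per GPU (or per CPU core for simulation), poll every three seconds, and refill a slot as soon as its job exits. The pattern, condensed into a sketch:

# Sketch: the bounded subprocess pool that trace()/simulate() implement by hand.
import subprocess, time

def run_pool(cmds, width, poll_s=3):
    pending = list(cmds)
    live = []
    while pending or live:
        live = [p for p in live if p.poll() is None]   # reap finished jobs
        while pending and len(live) < width:
            live.append(subprocess.Popen(pending.pop(0), shell=True,
                                         start_new_session=True))
        time.sleep(poll_s)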
95 | 96 | def simulate(configs_gpu): 97 | procs = [] 98 | n = 0 99 | r = 0 100 | for i in range(min(NCPU, len(configs_gpu))): 101 | print(configs_gpu[i][0]) 102 | procs.append(Popen(make_args(configs_gpu[i], i), shell=True, start_new_session=True)) 103 | r += 1 104 | time.sleep(3) 105 | 106 | while True: 107 | if n >= len(configs_gpu): 108 | break 109 | 110 | for i, p in enumerate(procs): 111 | if p is None: 112 | continue 113 | 114 | if p.poll() is not None: 115 | if r < len(configs_gpu): 116 | print(configs_gpu[i][0]) 117 | procs[i] = Popen(make_args(configs_gpu[r], i), shell=True, start_new_session=True) 118 | else: 119 | procs[i] = None 120 | 121 | r += 1 122 | n += 1 123 | time.sleep(3) 124 | 125 | time.sleep(3) 126 | 127 | def statistics(configs_gpu, configs_pim, configs_all): 128 | if args.gpgpusim_config == "SM75_RTX2060": 129 | # scale=1.605882353 # HBM 130 | scale = 1.56 # GDDR6 131 | elif args.gpgpusim_config == "SM7_TITANV": 132 | scale=1.411764706 133 | else: 134 | assert False 135 | 136 | pim_cycles = {} 137 | for c in range(len(configs_pim)): 138 | pname = f"{configs_pim[c][0]}_{args.n_channel}/PIM_trace_partition_{args.n_channel}_{args.n_gwrite}-{32 - args.n_channel}.pim" 139 | cyc = os.popen(f"/root/PIMFlow_ramulator/ramulator /root/PIMFlow_ramulator/configs/GDDR6-config.cfg --mode=dram {pname} | grep Cycle").read() 140 | cyc = re.sub(r'[^0-9]', '', cyc) 141 | pim_cycles[configs_pim[c][0]] = int(cyc) * scale 142 | print(pim_cycles[configs_pim[c][0]]) 143 | 144 | pim_cycles_noopt = {} 145 | for c in range(len(configs_pim)): 146 | pname = f"{configs_pim[c][0]}_{args.n_channel}/PIM_trace_partition_{args.n_channel}_{args.n_gwrite}-{32 - args.n_channel}.pim" 147 | cyc = os.popen(f"RAMULATOR_DISABLE_GWRITE_LATENCY_HIDING=1 /root/PIMFlow_ramulator/ramulator /root/PIMFlow_ramulator/configs/GDDR6-config.cfg --mode=dram {pname} | grep Cycle").read() 148 | cyc = re.sub(r'[^0-9]', '', cyc) 149 | pim_cycles_noopt[configs_pim[c][0]] = int(cyc) * scale 150 | print(pim_cycles_noopt[configs_pim[c][0]]) 151 | 152 | gpu_cycles = {} 153 | for c in range(len(configs_gpu)): 154 | fname = f"traces-{configs_gpu[c][0]}.txt" 155 | with open(fname) as f: 156 | lines = f.readlines() 157 | 158 | # if start != 3 or end != 3: 159 | # print(f"SKIP: {fname}") 160 | # continue 161 | tot_runs = [] 162 | runs = [] 163 | energy = 0 164 | for i, l in enumerate(lines): 165 | if l.find("gpu_tot_sim_cycle") != -1: 166 | tot_runs.append(int(lines[i].split("=")[1])) 167 | if l.find("kernel_name") != -1 and l.find("forward_kernel_cuda") == -1: 168 | runs.append(int(lines[i+1].split("=")[1])) 169 | # assert len(runs) == 3 170 | gpu_cycles[configs_gpu[c][0]]=int(tot_runs[len(tot_runs)-1]) 171 | # print(runs) 172 | with open(f'{args.model}_pipeline{args.pipeline}_{args.n_channel}_{args.n_gwrite}.csv','w', newline='') as f: 173 | wr = csv.writer(f) 174 | wr.writerow(['kernel_name','I_c','O_c','H','W','kernel_size','pads','strides','group','dilations','bias','activation','GPU cycles','PIM cycles']) 175 | 176 | c = 0 177 | while True: 178 | if c >= len(configs_all): 179 | break 180 | 181 | if "pim" in configs_all[c][0]: 182 | if c + 1 >= len(configs_all): 183 | wr.writerow([configs_all[c][0],configs_all[c][2],configs_all[c][5],configs_all[c][3],configs_all[c][4],configs_all[c][6],configs_all[c][7],configs_all[c][8],configs_all[c][9],configs_all[c][10],configs_all[c][11],configs_all[c][12],0,pim_cycles.get(configs_all[c][0],0)]) 184 | c = c + 1 185 | continue 186 | if "pim" in configs_all[c+1][0]: 187 | 
wr.writerow([configs_all[c][0],configs_all[c][2],configs_all[c][5],configs_all[c][3],configs_all[c][4],configs_all[c][6],configs_all[c][7],configs_all[c][8],configs_all[c][9],configs_all[c][10],configs_all[c][11],configs_all[c][12],0,pim_cycles.get(configs_all[c][0],0)]) 188 | elif "added" in configs_all[c+1][0]: 189 | wr.writerow([configs_all[c][0],configs_all[c][2],configs_all[c][5],configs_all[c][3],configs_all[c][4],configs_all[c][6],configs_all[c][7],configs_all[c][8],configs_all[c][9],configs_all[c][10],configs_all[c][11],configs_all[c][12],gpu_cycles.get(configs_all[c+1][0],0),pim_cycles.get(configs_all[c][0],0)]) 190 | c = c + 1 191 | else: 192 | wr.writerow([configs_all[c][0],configs_all[c][2],configs_all[c][5],configs_all[c][3],configs_all[c][4],configs_all[c][6],configs_all[c][7],configs_all[c][8],configs_all[c][9],configs_all[c][10],configs_all[c][11],configs_all[c][12],0,pim_cycles.get(configs_all[c][0],0)]) 193 | elif "added" in configs_all[c][0]: 194 | if c + 1 >= len(configs_all): 195 | wr.writerow([configs_all[c][0],configs_all[c][2],configs_all[c][5],configs_all[c][3],configs_all[c][4],configs_all[c][6],configs_all[c][7],configs_all[c][8],configs_all[c][9],configs_all[c][10],configs_all[c][11],configs_all[c][12],gpu_cycles.get(configs_all[c][0],0),0]) 196 | c = c + 1 197 | continue 198 | if "pim" in configs_all[c+1][0]: 199 | wr.writerow([configs_all[c][0],configs_all[c][2],configs_all[c][5],configs_all[c][3],configs_all[c][4],configs_all[c][6],configs_all[c][7],configs_all[c][8],configs_all[c][9],configs_all[c][10],configs_all[c][11],configs_all[c][12],gpu_cycles.get(configs_all[c][0],0),pim_cycles.get(configs_all[c+1][0],0)]) 200 | c = c + 1 201 | elif "added" in configs_all[c+1][0]: 202 | wr.writerow([configs_all[c][0],configs_all[c][2],configs_all[c][5],configs_all[c][3],configs_all[c][4],configs_all[c][6],configs_all[c][7],configs_all[c][8],configs_all[c][9],configs_all[c][10],configs_all[c][11],configs_all[c][12],gpu_cycles.get(configs_all[c][0],0),0]) 203 | else: 204 | wr.writerow([configs_all[c][0],configs_all[c][2],configs_all[c][5],configs_all[c][3],configs_all[c][4],configs_all[c][6],configs_all[c][7],configs_all[c][8],configs_all[c][9],configs_all[c][10],configs_all[c][11],configs_all[c][12],gpu_cycles.get(configs_all[c][0],0),0]) 205 | else: 206 | wr.writerow([configs_all[c][0],configs_all[c][2],configs_all[c][5],configs_all[c][3],configs_all[c][4],configs_all[c][6],configs_all[c][7],configs_all[c][8],configs_all[c][9],configs_all[c][10],configs_all[c][11],configs_all[c][12],gpu_cycles.get(configs_all[c][0],0),0]) 207 | c = c + 1 208 | 209 | with open(f'{args.model}_pipeline{args.pipeline}_{args.n_channel}_{args.n_gwrite}_noopt.csv','w', newline='') as f: 210 | wr = csv.writer(f) 211 | wr.writerow(['kernel_name','I_c','O_c','H','W','kernel_size','pads','strides','group','dilations','bias','activation','GPU cycles','PIM cycles']) 212 | 213 | c = 0 214 | while True: 215 | if c >= len(configs_all): 216 | break 217 | 218 | if "pim" in configs_all[c][0]: 219 | if c + 1 >= len(configs_all): 220 | wr.writerow([configs_all[c][0],configs_all[c][2],configs_all[c][5],configs_all[c][3],configs_all[c][4],configs_all[c][6],configs_all[c][7],configs_all[c][8],configs_all[c][9],configs_all[c][10],configs_all[c][11],configs_all[c][12],0,pim_cycles_noopt.get(configs_all[c][0],0)]) 221 | c = c + 1 222 | continue 223 | if "pim" in configs_all[c+1][0]: 224 | 
wr.writerow([configs_all[c][0],configs_all[c][2],configs_all[c][5],configs_all[c][3],configs_all[c][4],configs_all[c][6],configs_all[c][7],configs_all[c][8],configs_all[c][9],configs_all[c][10],configs_all[c][11],configs_all[c][12],0,pim_cycles_noopt.get(configs_all[c][0],0)]) 225 | elif "added" in configs_all[c+1][0]: 226 | wr.writerow([configs_all[c][0],configs_all[c][2],configs_all[c][5],configs_all[c][3],configs_all[c][4],configs_all[c][6],configs_all[c][7],configs_all[c][8],configs_all[c][9],configs_all[c][10],configs_all[c][11],configs_all[c][12],gpu_cycles.get(configs_all[c+1][0],0),pim_cycles_noopt.get(configs_all[c][0],0)]) 227 | c = c + 1 228 | else: 229 | wr.writerow([configs_all[c][0],configs_all[c][2],configs_all[c][5],configs_all[c][3],configs_all[c][4],configs_all[c][6],configs_all[c][7],configs_all[c][8],configs_all[c][9],configs_all[c][10],configs_all[c][11],configs_all[c][12],0,pim_cycles_noopt.get(configs_all[c][0],0)]) 230 | elif "added" in configs_all[c][0]: 231 | if c + 1 >= len(configs_all): 232 | wr.writerow([configs_all[c][0],configs_all[c][2],configs_all[c][5],configs_all[c][3],configs_all[c][4],configs_all[c][6],configs_all[c][7],configs_all[c][8],configs_all[c][9],configs_all[c][10],configs_all[c][11],configs_all[c][12],gpu_cycles.get(configs_all[c][0],0),0]) 233 | c = c + 1 234 | continue 235 | if "pim" in configs_all[c+1][0]: 236 | wr.writerow([configs_all[c][0],configs_all[c][2],configs_all[c][5],configs_all[c][3],configs_all[c][4],configs_all[c][6],configs_all[c][7],configs_all[c][8],configs_all[c][9],configs_all[c][10],configs_all[c][11],configs_all[c][12],gpu_cycles.get(configs_all[c][0],0),pim_cycles_noopt.get(configs_all[c+1][0],0)]) 237 | c = c + 1 238 | elif "added" in configs_all[c+1][0]: 239 | wr.writerow([configs_all[c][0],configs_all[c][2],configs_all[c][5],configs_all[c][3],configs_all[c][4],configs_all[c][6],configs_all[c][7],configs_all[c][8],configs_all[c][9],configs_all[c][10],configs_all[c][11],configs_all[c][12],gpu_cycles.get(configs_all[c][0],0),0]) 240 | else: 241 | wr.writerow([configs_all[c][0],configs_all[c][2],configs_all[c][5],configs_all[c][3],configs_all[c][4],configs_all[c][6],configs_all[c][7],configs_all[c][8],configs_all[c][9],configs_all[c][10],configs_all[c][11],configs_all[c][12],gpu_cycles.get(configs_all[c][0],0),0]) 242 | else: 243 | wr.writerow([configs_all[c][0],configs_all[c][2],configs_all[c][5],configs_all[c][3],configs_all[c][4],configs_all[c][6],configs_all[c][7],configs_all[c][8],configs_all[c][9],configs_all[c][10],configs_all[c][11],configs_all[c][12],gpu_cycles.get(configs_all[c][0],0),0]) 244 | c = c + 1 245 | 246 | def update_pim(configs_pim): 247 | if args.gpgpusim_config == "SM75_RTX2060": 248 | # scale = 1.605882353 # HBM 249 | scale = 1.56 # GDDR6 250 | elif args.gpgpusim_config == "SM7_TITANV": 251 | scale = 1.411764706 252 | else: 253 | assert False 254 | 255 | pim_cycles = {} 256 | for c in range(len(configs_pim)): 257 | pname = f"{configs_pim[c][0]}_{args.n_channel}/PIM_trace_partition_{args.n_channel}_{args.n_gwrite}-{32 - args.n_channel}.pim" 258 | cyc = os.popen(f"/root/PIMFlow_ramulator/ramulator /root/PIMFlow_ramulator/configs/GDDR6-config.cfg --mode=dram {pname} | grep Cycle").read() 259 | cyc = re.sub(r'[^0-9]', '', cyc) 260 | pim_cycles[configs_pim[c][0]] = int(cyc) * scale 261 | 262 | pim_cycles_noopt = {} 263 | for c in range(len(configs_pim)): 264 | pname = f"{configs_pim[c][0]}_{args.n_channel}/PIM_trace_partition_{args.n_channel}_{args.n_gwrite}-{32 - args.n_channel}.pim" 265 | cyc = 
os.popen(f"RAMULATOR_DISABLE_GWRITE_LATENCY_HIDING=1 /root/PIMFlow_ramulator/ramulator /root/PIMFlow_ramulator/configs/GDDR6-config.cfg --mode=dram {pname} | grep Cycle").read() 266 | cyc = re.sub(r'[^0-9]', '', cyc) 267 | pim_cycles_noopt[configs_pim[c][0]] = int(cyc) * scale 268 | 269 | f = pd.read_csv(f'{args.model}_pipeline{args.pipeline}_{args.n_channel}_{args.n_gwrite}.csv', delimiter=',') 270 | header = list(f.columns) 271 | rows = [list(row) for row in f.values] 272 | 273 | g = pd.read_csv(f'{args.model}_pipelined{args.pipeline}_{args.n_channel}.onnx_conv.csv', delimiter=',') 274 | # header2 = list(g.columns) 275 | rows2 = [list(row) for row in g.values] 276 | 277 | with open(f'{args.model}_pipeline{args.pipeline}_{args.n_channel}_{args.n_gwrite}.csv', 'w', newline='') as h: 278 | writer = csv.writer(h) 279 | writer.writerow(header) 280 | for i, row in enumerate(rows): 281 | rows[i][13] = 0 282 | for i, row in enumerate(rows): 283 | if args.pipeline == 1: 284 | if "pim" in row[0]: 285 | rows[i][13] = pim_cycles[row[0]] 286 | elif args.pipeline == 2: 287 | if "pim" in row[0]: 288 | for j, row2 in enumerate(rows2): 289 | if row2[0] == row[0]: 290 | rows[i-1][13] = pim_cycles[rows2[j-1][0]] 291 | rows[i][13] = pim_cycles[rows2[j][0]] 292 | break 293 | elif args.pipeline == 3: 294 | if "added" in rows[i][0] and "pim" not in rows[i][0] and "pim" in rows[i+1][0]: 295 | for j, row2 in enumerate(rows2): 296 | if row2[0] == row[0]: 297 | rows[i-2][13] = pim_cycles[rows2[j-3][0]] 298 | rows[i-1][13] = pim_cycles[rows2[j-2][0]] 299 | rows[i][13] = pim_cycles[rows2[j+1][0]] 300 | rows[i+1][13] = pim_cycles[rows2[j+2][0]] 301 | else: 302 | assert False 303 | writer.writerows(rows) 304 | 305 | with open(f'{args.model}_pipeline{args.pipeline}_{args.n_channel}_{args.n_gwrite}_noopt.csv', 'w', newline='') as h: 306 | writer = csv.writer(h) 307 | writer.writerow(header) 308 | for i, row in enumerate(rows): 309 | rows[i][13] = 0 310 | for i, row in enumerate(rows): 311 | if args.pipeline == 1: 312 | if "pim" in row[0]: 313 | rows[i][13] = pim_cycles_noopt[row[0]] 314 | elif args.pipeline == 2: 315 | if "pim" in row[0]: 316 | for j, row2 in enumerate(rows2): 317 | if row2[0] == row[0]: 318 | rows[i-1][13] = pim_cycles_noopt[rows2[j-1][0]] 319 | rows[i][13] = pim_cycles_noopt[rows2[j][0]] 320 | break 321 | elif args.pipeline == 3: 322 | if "added" in rows[i][0] and "pim" not in rows[i][0] and "pim" in rows[i+1][0]: 323 | for j, row2 in enumerate(rows2): 324 | if row2[0] == row[0]: 325 | rows[i-2][13] = pim_cycles_noopt[rows2[j-3][0]] 326 | rows[i-1][13] = pim_cycles_noopt[rows2[j-2][0]] 327 | rows[i][13] = pim_cycles_noopt[rows2[j+1][0]] 328 | rows[i+1][13] = pim_cycles_noopt[rows2[j+2][0]] 329 | else: 330 | assert False 331 | writer.writerows(rows) 332 | 333 | if args.trace: 334 | trace(CONFIG_GPU) 335 | 336 | if args.simulate: 337 | simulate(CONFIG_GPU) 338 | 339 | if args.pim_codegen: 340 | for config in CONFIG_PIM: 341 | os.system(f'sh pim_codegen.sh {config[0]}_{args.n_channel} {config[5]} {config[2]} {config[6][1]} {config[6][3]} {config[8]} {config[7][1]} {config[7][3]} {config[10]} {config[9]} {config[11]} {config[3]} {config[4]} {args.n_channel} {args.n_gwrite}') 342 | 343 | if args.stat: 344 | statistics(CONFIG_GPU, CONFIG_PIM, CONFIG_ALL) 345 | 346 | if args.update_pim: 347 | for config in CONFIG_PIM: 348 | os.system(f'sh pim_codegen.sh {config[0]}_{args.n_channel} {config[5]} {config[2]} {config[6][1]} {config[6][3]} {config[8]} {config[7][1]} {config[7][3]} {config[10]} {config[9]} 
{config[11]} {config[3]} {config[4]} {args.n_channel} {args.n_gwrite}') 349 | update_pim(CONFIG_PIM) 350 | 351 | print("Finished!") 352 | -------------------------------------------------------------------------------- /pipeline/sim.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | NAME=$1 4 | OUT_CHANNELS=$2 5 | IN_CHANNELS=$3 6 | KH=$4 7 | KW=$5 8 | STRIDE=$6 9 | PH=$7 10 | PW=$8 11 | DILATE=$9 12 | GROUP=${10} 13 | BIAS=${11} 14 | IMAGE_HEIGHT=${12} 15 | IMAGE_WIDTH=${13} 16 | GPU=${14} 17 | ACTIVATION=${15} 18 | DEVICE_ID=${16} 19 | N_CHANNEL=${17} 20 | 21 | EXTRA_GPU_CONFIG_1="-gpgpu_n_mem $((32-$N_CHANNEL)) -gpgpu_deadlock_detect 0" 22 | EXTRA_GPU_CONFIG_2="-gpgpu_n_mem 32 -gpgpu_deadlock_detect 0" 23 | # TODO: add PIM_PATH 24 | 25 | BASE_PATH="/root/PIMFlow_accel-sim-framework" 26 | 27 | export CUDA_INSTALL_PATH=/usr/local/cuda 28 | source "$BASE_PATH/gpu-simulator/setup_environment.sh" 29 | timeout 21600 $BASE_PATH/gpu-simulator/bin/release/accel-sim.out -trace "traces-$NAME/kernelslist.g" -config "$BASE_PATH/gpu-simulator/configs/tested-cfgs/$GPU/trace.config" -config "$BASE_PATH/gpu-simulator/gpgpu-sim/configs/tested-cfgs/$GPU/gpgpusim.config" $EXTRA_GPU_CONFIG_1 | grep -E "kernel_name|gpu_sim_cycle|gpu_tot_sim_cycle" &> traces-$NAME.txt 30 | 31 | -------------------------------------------------------------------------------- /pipeline/to_full_layer.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import pandas as pd 3 | 4 | import argparse 5 | import os 6 | from pim.util import MODEL_LIST 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--model", help="model", choices=MODEL_LIST, required=True) 9 | parser.add_argument("--n_channel", type=int, default=16) 10 | parser.add_argument("--n_gwrite", type=int, default=4) 11 | parser.add_argument("--ramulator_disable_gwrite_latency_hiding", action="store_true") 12 | args = parser.parse_args() 13 | 14 | postfix = "" 15 | if args.ramulator_disable_gwrite_latency_hiding: 16 | postfix = "_noopt" 17 | 18 | def process(model): 19 | end_to_end = pd.read_csv(f'{model}_{args.n_channel}.onnx_conv.csv', delimiter=',',header=None) 20 | END_base = [list(row) for row in end_to_end.values] 21 | END_gpu = [list(row) for row in end_to_end.values] 22 | END_max = [list(row) for row in end_to_end.values] 23 | 24 | baseline = pd.read_csv(f'{model}_split100-baseline.csv', delimiter=',') 25 | gpu = pd.read_csv(f'{model}_split100_{args.n_channel}_{args.n_gwrite}{postfix}.csv', delimiter=',') 26 | head = ["kernel_name","N","I_c","H","W","O_c","kernel_size","pads","strides","group","dilations","bias","activation","GPU cycles","PIM cycles","TOTAL_cycle","RATIO","SPEED_UP"] 27 | head_split = ["kernel_name","N","I_c","H","W","O_c","kernel_size","pads","strides","group","dilations","bias","activation","node_name","GPU cycles","PIM cycles","TOTAL_cycle","RATIO","SPEED_UP"] 28 | newton = pd.read_csv(f'../layerwise/newton_performance_{model}_{args.n_channel}_{args.n_gwrite}{postfix}.csv', delimiter=',') 29 | 30 | # head.append("TOTAL_cycel") 31 | # head.append("RATIO") 32 | # head.append("SPEED_UP") 33 | # head.insert(1,'N') 34 | act = 0 35 | if 'activation' in baseline: 36 | act = 1 37 | 38 | 39 | BASE = [list(row) for row in baseline.values] 40 | GPU = [list(row) for row in gpu.values] 41 | NEWTON = [list(row) for row in newton.values] 42 | DIC_GPU_base={} 43 | DIC_GPU_only={} 44 | DIC_PIM_base={} 45 | DIC_node_name={} 46 | for i in 
range(len(BASE)): 47 | key = str(BASE[i][1]) + str(BASE[i][2]) + str(BASE[i][4]) + str(BASE[i][5]) + str(BASE[i][6][3]) + str(BASE[i][7]) + str(BASE[i][8]) 48 | DIC_node_name[key] = BASE[i][0] 49 | DIC_GPU_base[key] = BASE[i][11+act] 50 | DIC_GPU_only[key] = GPU[i][11+act] 51 | DIC_PIM_base[key] = NEWTON[i][12+act] 52 | 53 | DIC_node_name_max={} 54 | DIC_GPU_max={} 55 | DIC_PIM_max={} 56 | DIC_TOT_max={} 57 | DIC_RATIO_max={} 58 | DIC_SPEED_max={} 59 | max_ = pd.read_csv(f'max_performance_{model}_{args.n_channel}_{args.n_gwrite}{postfix}.csv', delimiter=',') 60 | MAX = [list(row) for row in max_.values] 61 | for i in range(len(MAX)): 62 | key = str(MAX[i][1]) + str(MAX[i][2]) + str(MAX[i][4]) + str(MAX[i][5]) + str(MAX[i][6][3]) + str(MAX[i][7]) + str(MAX[i][8]) 63 | DIC_node_name_max[key] = MAX[i][0] 64 | DIC_GPU_max[key] = MAX[i][11+act] 65 | DIC_PIM_max[key] = MAX[i][12+act] 66 | DIC_TOT_max[key] = MAX[i][13+act] 67 | DIC_RATIO_max[key] = MAX[i][14+act] 68 | DIC_SPEED_max[key] = MAX[i][15+act] 69 | 70 | for i in range(len(END_base)): 71 | key = str(END_base[i][2]) + str(END_base[i][5]) + str(END_base[i][4]) + str(END_base[i][6]) + str(END_base[i][7][3]) + str(END_base[i][8]) + str(END_base[i][9]) 72 | END_base[i].append(DIC_GPU_base.get(key,0)) 73 | END_base[i].append(DIC_PIM_base.get(key,0)) 74 | END_base[i].append(min(END_base[i][-2], END_base[i][-1])) # total 75 | if DIC_GPU_base.get(key,0) <= DIC_PIM_base.get(key,0): # ratio 76 | END_base[i].append(100) 77 | else: 78 | END_base[i].append(0) 79 | END_base[i].append(max(END_base[i][-4] / END_base[i][-3], 1)) # speedup 80 | 81 | for i in range(len(END_gpu)): 82 | key = str(END_gpu[i][2]) + str(END_gpu[i][5]) + str(END_gpu[i][4]) + str(END_gpu[i][6]) + str(END_gpu[i][7][3]) + str(END_gpu[i][8]) + str(END_gpu[i][9]) 83 | END_gpu[i].append(DIC_node_name.get(key, 0)) 84 | END_gpu[i].append(DIC_GPU_only.get(key,0)) 85 | END_gpu[i].append(DIC_PIM_base.get(key,0)) 86 | END_gpu[i].append(min(END_gpu[i][-2], END_gpu[i][-1])) # total 87 | if DIC_GPU_only.get(key,0) <= DIC_PIM_base.get(key,0): # ratio 88 | END_gpu[i].append(100) 89 | else: 90 | END_gpu[i].append(0) 91 | END_gpu[i].append(max(END_gpu[i][-4] / END_gpu[i][-3], 1)) # speedup 92 | 93 | for i in range(len(END_max)): 94 | key = str(END_max[i][2]) + str(END_max[i][5]) + str(END_max[i][4]) + str(END_max[i][6]) + str(END_max[i][7][3]) + str(END_max[i][8]) + str(END_max[i][9]) 95 | END_max[i].append(DIC_node_name_max.get(key, 0)) 96 | END_max[i].append(DIC_GPU_max.get(key, 0)) 97 | END_max[i].append(DIC_PIM_max.get(key, 0)) 98 | END_max[i].append(DIC_TOT_max.get(key, 0)) 99 | END_max[i].append(DIC_RATIO_max.get(key, 0)) 100 | END_max[i].append(DIC_SPEED_max.get(key, 0)) 101 | 102 | with open(f'max_performance_end_to_end_{model}_{args.n_channel}_{args.n_gwrite}{postfix}.csv', 'w',newline='') as f: 103 | write = csv.writer(f) 104 | write.writerow(head_split) 105 | write.writerows(END_max) 106 | 107 | with open(f'baseline_end_to_end_{model}_{args.n_channel}_{args.n_gwrite}{postfix}.csv', 'w',newline='') as f: 108 | write = csv.writer(f) 109 | write.writerow(head) 110 | write.writerows(END_base) 111 | 112 | with open(f'gpu_end_to_end_{model}_{args.n_channel}_{args.n_gwrite}{postfix}.csv', 'w',newline='') as f: 113 | write = csv.writer(f) 114 | write.writerow(head_split) 115 | write.writerows(END_gpu) 116 | 117 | os.system(f"python3 /root/PIMFlow/layerwise/inspect_shape.py --model={args.model} --split_ratio=100 --full --n_channel={args.n_channel}") 118 | os.system(f"cp 
../layerwise/max_performance_{args.model}_{args.n_channel}_{args.n_gwrite}{postfix}.csv ./") 119 | os.system(f"cp ../layerwise/{args.model}_split100-baseline.csv ./") 120 | os.system(f"cp ../layerwise/{args.model}_split100_{args.n_channel}_{args.n_gwrite}{postfix}.csv ./") 121 | # os.system(f"cp ../layerwise/{args.model}.onnx_conv.csv ./") 122 | process(args.model) 123 | -------------------------------------------------------------------------------- /pipeline/trace.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | NAME=$1 4 | OUT_CHANNELS=$2 5 | IN_CHANNELS=$3 6 | KH=$4 7 | KW=$5 8 | STRIDE=$6 9 | PH=$7 10 | PW=$8 11 | DILATE=$9 12 | GROUP=${10} 13 | BIAS=${11} 14 | IMAGE_HEIGHT=${12} 15 | IMAGE_WIDTH=${13} 16 | GPU=${14} 17 | ACTIVATION=${15} 18 | DEVICE_ID=${16} 19 | N_CHANNEL=${17} 20 | 21 | # assume docker is setup 22 | export TVM_HOME=/root/tvm 23 | export PYTHONPATH=$TVM_HOME/python:${PYTHONPATH} 24 | 25 | export DYNAMIC_KERNEL_LIMIT_START=1000000000 26 | LD_PRELOAD=/root/PIMFlow_accel-sim-framework/util/tracer_nvbit/tracer_tool/tracer_tool.so python3 /root/PIMFlow/pipeline/layerwise.py --oc=$OUT_CHANNELS --ic=$IN_CHANNELS --kh=$KH --kw=$KW --stride=$STRIDE --ph=$PH --pw=$PW --dilate=$DILATE --g=$GROUP --b --h=$IMAGE_HEIGHT --w=$IMAGE_WIDTH --dev=$CUDA_VISIBLE_DEVICES --activation=$ACTIVATION 27 | 28 | START=$(python3 inspect --path=traces-$NAME | sed -r 's/([0-9]*)\s([0-9]*)/\1/g') 29 | END=$(python3 inspect --path=traces-$NAME | sed -r 's/([0-9]*)\s([0-9]*)/\2/g') 30 | export DYNAMIC_KERNEL_LIMIT_START=$START 31 | export DYNAMIC_KERNEL_LIMIT_END=$END 32 | 33 | LD_PRELOAD=/root/PIMFlow_accel-sim-framework/util/tracer_nvbit/tracer_tool/tracer_tool.so python3 /root/PIMFlow/pipeline/layerwise.py --oc=$OUT_CHANNELS --ic=$IN_CHANNELS --kh=$KH --kw=$KW --stride=$STRIDE --ph=$PH --pw=$PW --dilate=$DILATE --g=$GROUP --b --h=$IMAGE_HEIGHT --w=$IMAGE_WIDTH --dev=$CUDA_VISIBLE_DEVICES --activation=$ACTIVATION 34 | 35 | /root/PIMFlow_accel-sim-framework/util/tracer_nvbit/tracer_tool/traces-processing/post-traces-processing ./traces-$NAME/kernelslist 36 | -------------------------------------------------------------------------------- /run.example.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | function run() { 4 | NET=$1 5 | echo "START!" 6 | ./pimflow -m=profile -t=split -n=$NET >split-$NET.log 2>split-$NET.err 7 | ./pimflow -m=profile -t=pipeline -n=$NET >pipeline-$NET.log 2>pipeline-$NET.err 8 | ./pimflow -m=stat --conv_only -n=$NET 9 | ./pimflow -m=solve -n=$NET >solve-$NET.log 2>solve-$NET.err 10 | ./pimflow -m=run --gpu_only --policy=None -n=$NET >run_gpu_only-$NET.log 2>run_gpu_only-$NET.err 11 | ./pimflow -m=run --policy=Newton+ -n=$NET >run-$NET-Newton+.log 2>run-$NET-Newton+.err 12 | ./pimflow -m=run --policy=Newton++ -n=$NET >run-$NET-Newton++.log 2>run-$NET-Newton++.err 13 | ./pimflow -m=run --policy=MDDP -n=$NET >run-$NET-MDDP.log 2>run-$NET-MDDP.err 14 | ./pimflow -m=run --policy=Pipeline -n=$NET >run-$NET-Pipeline.log 2>run-$NET-Pipeline.err 15 | ./pimflow -m=run --policy=PIMFlow -n=$NET >run-$NET-PIMFlow.log 2>run-$NET-PIMFlow.err 16 | ./pimflow -m=stat -n=$NET 17 | echo "END!" 
18 | } 19 | 20 | run efficientnet-v1-b0 21 | run mobilenet-v2 22 | run mnasnet-1.0 23 | run resnet-50 24 | run vgg-16 25 | 26 | # run efficientnet-v1-b4 27 | # run efficientnet-v1-b6 28 | # run mobilenet-v2-1.4 29 | # run mnasnet-1.3 30 | -------------------------------------------------------------------------------- /run_matmul.example.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | NET=bert-large-1x64 4 | echo "START!" 5 | ./pimflow -m=profile -t=split -n=$NET >split-$NET.log 2>split-$NET.err 6 | # ./pimflow -m=stat --matmul_only -n=$NET 7 | echo "END!" 8 | 9 | NET=bert-large-1x32 10 | echo "START!" 11 | ./pimflow -m=profile -t=split -n=$NET >split-$NET.log 2>split-$NET.err 12 | # ./pimflow -m=stat --matmul_only -n=$NET 13 | echo "END!" 14 | 15 | NET=bert-large-1x3 16 | echo "START!" 17 | ./pimflow -m=profile -t=split -n=$NET >split-$NET.log 2>split-$NET.err 18 | # ./pimflow -m=stat --matmul_only -n=$NET 19 | echo "END!" 20 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | setup(name='pim', version='0.1', packages=find_packages(), package_dir={"pim": "pim"}) 3 | -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | (cd pim && make clean && make) 3 | -------------------------------------------------------------------------------- /solve/solve.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import math 3 | import os 4 | import csv 5 | import argparse 6 | from pim.util import MODEL_LIST 7 | class Range(object): 8 | def __init__(self, start, end): 9 | self.start = start 10 | self.end = end 11 | def __eq__(self, other): 12 | return self.start <= other <= self.end 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument("--model", help="model", choices=MODEL_LIST, required=True) 16 | parser.add_argument("--pipeline", choices=["none", "1", "2", "3", "all"], required=True) 17 | parser.add_argument("--n_channel", type=int, default=16) 18 | parser.add_argument("--n_gwrite", type=int, default=4) 19 | parser.add_argument("--ramulator_disable_gwrite_latency_hiding", action="store_true") 20 | parser.add_argument("--policy", choices=["Newton+", "Newton++", "Pipeline", "MDDP", "PIMFlow"], required=True) 21 | args = parser.parse_args() 22 | 23 | postfix = "" 24 | if args.ramulator_disable_gwrite_latency_hiding: 25 | postfix = "_noopt" 26 | 27 | baseline = pd.read_csv(f'../pipeline/baseline_end_to_end_{args.model}_{args.n_channel}_{args.n_gwrite}{postfix}.csv', delimiter=',') 28 | gpu = pd.read_csv(f'../pipeline/gpu_end_to_end_{args.model}_{args.n_channel}_{args.n_gwrite}{postfix}.csv', delimiter=',') 29 | split = pd.read_csv(f'../pipeline/max_performance_end_to_end_{args.model}_{args.n_channel}_{args.n_gwrite}{postfix}.csv', delimiter=',') 30 | 31 | pipeline1 = None 32 | pipeline2 = None 33 | pipeline3 = None 34 | if os.path.exists(f'../pipeline/{args.model}_pipeline1_{args.n_channel}_{args.n_gwrite}{postfix}.csv'): 35 | pipeline1 = pd.read_csv(f'../pipeline/{args.model}_pipeline1_{args.n_channel}_{args.n_gwrite}{postfix}.csv', delimiter=',') 36 | if os.path.exists(f'../pipeline/{args.model}_pipeline2_{args.n_channel}_{args.n_gwrite}{postfix}.csv'): 37 | 
pipeline2 = pd.read_csv(f'../pipeline/{args.model}_pipeline2_{args.n_channel}_{args.n_gwrite}{postfix}.csv', delimiter=',') 38 | if os.path.exists(f'../pipeline/{args.model}_pipeline3_{args.n_channel}_{args.n_gwrite}{postfix}.csv'): 39 | pipeline3 = pd.read_csv(f'../pipeline/{args.model}_pipeline3_{args.n_channel}_{args.n_gwrite}{postfix}.csv', delimiter=',') 40 | 41 | 42 | pipeline1_onnx = None 43 | pipeline2_onnx = None 44 | pipeline3_onnx = None 45 | if os.path.exists(f'../pipeline/{args.model}_pipelined1_{args.n_channel}.onnx_conv.csv'): 46 | pipeline1_onnx = pd.read_csv(f'../pipeline/{args.model}_pipelined1_{args.n_channel}.onnx_conv.csv', delimiter=',',header=None) 47 | if os.path.exists(f'../pipeline/{args.model}_pipelined2_{args.n_channel}.onnx_conv.csv'): 48 | pipeline2_onnx = pd.read_csv(f'../pipeline/{args.model}_pipelined2_{args.n_channel}.onnx_conv.csv', delimiter=',',header=None) 49 | if os.path.exists(f'../pipeline/{args.model}_pipelined3_{args.n_channel}.onnx_conv.csv'): 50 | pipeline3_onnx = pd.read_csv(f'../pipeline/{args.model}_pipelined3_{args.n_channel}.onnx_conv.csv', delimiter=',',header=None) 51 | 52 | 53 | assert len(list(baseline.values)) == len(list(split.values)) 54 | 55 | N = len(list(baseline.values)) 56 | 57 | dp_b = [[float("infinity") for _ in range(N+1)] for _ in range(N+1)] 58 | dp_s = [[float("infinity") for _ in range(N+1)] for _ in range(N+1)] 59 | dp_ws = [[-1 for _ in range(N+1)] for _ in range(N+1)] 60 | pipeline_cycles = [[None for _ in range(N+1)] for _ in range(N+1)] 61 | trace_name = [[None for _ in range(N+1)] for _ in range(N+1)] 62 | pipeline_type = [[None for _ in range(N+1)] for _ in range(N+1)] 63 | optimal_name = [] 64 | 65 | baseline_cycle = 0 66 | newton_cycle = 0 67 | for i, row in enumerate(baseline.values): 68 | row = list(row) 69 | baseline_cycle += float(row[-5]) 70 | 71 | for i, row in enumerate(gpu.values): 72 | row = list(row) 73 | cycle = min(float(row[-5]), float(row[-4])) 74 | dp_b[i+1][1] = cycle 75 | if args.policy == "Newton++" or args.policy == "Pipeline": 76 | dp_s[i+1][1] = cycle 77 | dp_ws[i+1][1] = cycle 78 | optimal_name.append([row[0],"split",row[-2],row[-6]]) 79 | # print(optimal_name) 80 | elif args.policy == "Newton+": 81 | dp_s[i+1][1] = cycle 82 | dp_ws[i+1][1] = cycle 83 | optimal_name.append([row[0],"split",row[-2],row[-6]]) 84 | newton_cycle += cycle 85 | 86 | split_cycle = 0 87 | for i, row in enumerate(split.values): 88 | row = list(row) 89 | cycle = float(row[-3]) 90 | if math.isclose(float(row[-1]), 0): 91 | cycle = dp_b[i+1][1] 92 | if args.policy == "MDDP" or args.policy == "PIMFlow": 93 | dp_s[i+1][1] = cycle 94 | dp_ws[i+1][1] = cycle 95 | optimal_name.append([row[0],"split",row[-2],row[-6]]) 96 | split_cycle += cycle 97 | 98 | 99 | pipelines = set() 100 | worst_pipelines = set() 101 | valids = set() 102 | 103 | # table for storing minimum runtime from jth node for 'i' number of nodes 104 | if args.policy == "Pipeline" or args.policy =="PIMFlow": 105 | idx = 0 106 | idx_v = 0 107 | while True: 108 | if args.pipeline not in ["1", "all"] or pipeline1 is None: 109 | break 110 | cycle = 0 111 | rows = list(pipeline1.values) 112 | row = list(rows[idx]) 113 | if "pim" in row[0]: 114 | # cycle += float(rows[idx][-1]) 115 | # cycle += max(float(rows[idx+1][-1]), float(rows[idx+1][-2])) 116 | # cycle += float(rows[idx+2][-2]) 117 | cycle += float(rows[idx+1][-1]) 118 | cycle += max(float(rows[idx+2][-2]), float(rows[idx][-1])) 119 | cycle += float(rows[idx+1][-2]) 120 | dp_b[idx_v+1][2] = cycle 121 | 
dp_s[idx_v+1][2] = cycle 122 | dp_ws[idx_v+1][2] = cycle 123 | pipeline_cycles[idx_v+1][2] = ([ 124 | float(rows[idx+1][-1]), 125 | max(float(rows[idx+2][-2]), float(rows[idx][-1])), 126 | float(rows[idx+1][-2])], 1) 127 | trace_name[idx_v+1][2] = str(rows[idx+2][0]) 128 | pipeline_type[idx_v+1][2] = 1 129 | valids.add((idx_v+1, 2)) 130 | idx += 3 131 | idx_v += 2 132 | else: 133 | idx += 1 134 | idx_v += 1 135 | 136 | if idx_v >= N: 137 | break 138 | 139 | # table for storing minimum runtime from jth node for 'i' number of nodes 140 | idx = 0 141 | idx_v = 0 142 | while True: 143 | if args.pipeline not in ["2", "all"] or pipeline2 is None: 144 | break 145 | cycle = 0 146 | rows = list(pipeline2.values) 147 | row = list(rows[idx]) 148 | if "added" in row[0]: 149 | # cycle += float(rows[idx][-2]) 150 | # cycle += max(float(rows[idx+1][-1]), float(rows[idx+1][-2])) 151 | # cycle += float(rows[idx+2][-1]) 152 | cycle += float(rows[idx+1][-2]) 153 | cycle += max(float(rows[idx][-2]), float(rows[idx+2][-1])) 154 | cycle += float(rows[idx+1][-1]) 155 | dp_b[idx_v+1][2] = cycle 156 | dp_s[idx_v+1][2] = cycle 157 | dp_ws[idx_v+1][2] = cycle 158 | pipeline_cycles[idx_v+1][2] = ([ 159 | float(rows[idx+1][-2]), 160 | max(float(rows[idx][-2]), float(rows[idx+2][-1])), 161 | float(rows[idx+1][-1])], 2) 162 | trace_name[idx_v+1][2] = str(rows[idx][0]) 163 | pipeline_type[idx_v+1][2] = 2 164 | valids.add((idx_v+1, 2)) 165 | idx += 3 166 | idx_v += 2 167 | else: 168 | idx += 1 169 | idx_v += 1 170 | 171 | if idx_v >= N: 172 | break 173 | 174 | # table for storing minimum runtime from jth node for 'i' number of nodes 175 | idx = 0 176 | idx_v = 0 177 | while True: 178 | if args.pipeline not in ["3", "all"] or pipeline3 is None: 179 | break 180 | cycle = 0 181 | rows = list(pipeline3.values) 182 | row = list(rows[idx]) 183 | if "pim" in row[0]: 184 | # cycle += float(rows[idx][-1]) 185 | # cycle += max(float(rows[idx+1][-1]), float(rows[idx+1][-2])) 186 | # cycle += max(float(rows[idx+2][-1]), float(rows[idx+2][-2])) 187 | # cycle += float(rows[idx+3][-1]) 188 | cycle += float(rows[idx+1][-1]) 189 | cycle += max(float(rows[idx+2][-2]), float(rows[idx][-1])) 190 | cycle += max(float(rows[idx+1][-2]), float(rows[idx+3][-1])) 191 | cycle += float(rows[idx+2][-1]) 192 | dp_b[idx_v+1][3] = cycle 193 | dp_s[idx_v+1][3] = cycle 194 | dp_ws[idx_v+1][3] = cycle 195 | pipeline_cycles[idx_v+1][3] = ([ 196 | float(rows[idx+1][-1]), 197 | max(float(rows[idx+2][-2]), float(rows[idx][-1])), 198 | max(float(rows[idx+1][-2]), float(rows[idx+3][-1])), 199 | float(rows[idx+2][-1])], 3) 200 | trace_name[idx_v+1][3] = str(rows[idx+2][0]) 201 | pipeline_type[idx_v+1][3] = 3 202 | valids.add((idx_v+1, 3)) 203 | idx += 4 204 | idx_v += 3 205 | else: 206 | idx += 1 207 | idx_v += 1 208 | 209 | if idx_v >= N: 210 | break 211 | 212 | 213 | # solve 214 | eaten = 0 215 | for l in range(1, N+1): 216 | for i in range(1, N+1): 217 | for k in range(1, l): 218 | if i + k > N: 219 | continue 220 | if l == 2 or l == 3: 221 | if dp_s[i][l] < dp_s[i][k] + dp_s[i+k][l-k] - 10: 222 | if (i, k) in pipelines: 223 | pipelines.remove((i, k)) 224 | eaten += 1 225 | if (i+k, l-k) in pipelines: 226 | pipelines.remove((i+k, l-k)) 227 | eaten += 1 228 | if (i, l) in valids: 229 | pipelines.add((i, l)) 230 | if dp_ws[i][l] > dp_ws[i][k] + dp_ws[i+k][l-k] + 10: 231 | if (i, k) in worst_pipelines: 232 | worst_pipelines.remove((i, k)) 233 | eaten += 1 234 | if (i+k, l-k) in worst_pipelines: 235 | worst_pipelines.remove((i+k, l-k)) 236 | eaten += 1 237 | if 
(i, l) in valids: 238 | worst_pipelines.add((i, l)) 239 | dp_b[i][l] = min(dp_b[i][l], dp_b[i][k] + dp_b[i+k][l-k]) 240 | dp_s[i][l] = min(dp_s[i][l], dp_s[i][k] + dp_s[i+k][l-k]) 241 | dp_ws[i][l] = max(dp_ws[i][l], dp_ws[i][k] + dp_ws[i+k][l-k]) 242 | 243 | 244 | 245 | def pipeline_type_to(t): 246 | if t == 2: 247 | return "g" 248 | else: 249 | return "p" 250 | 251 | # removes = [] 252 | # for p in pipelines: 253 | # i, l = p 254 | # b = [dp_s[i+j][1] for j in range(l)] 255 | # if abs(1 - dp_s[i][l] / sum(b)) < 0.05: 256 | # removes.append(p) 257 | # for r in removes: 258 | # pipelines.remove(r) 259 | 260 | # removes = [] 261 | # for p in worst_pipelines: 262 | # i, l = p 263 | # b = [dp_s[i+j][1] for j in range(l)] 264 | # if abs(1 - dp_ws[i][l] / sum(b)) < 0.30: 265 | # removes.append(p) 266 | # for r in removes: 267 | # worst_pipelines.remove(r) 268 | 269 | for p in pipelines: 270 | i, l = p 271 | b = [dp_s[i+j][1] for j in range(l)] 272 | 273 | 274 | for p in worst_pipelines: 275 | i, l = p 276 | b = [dp_ws[i+j][1] for j in range(l)] 277 | 278 | 279 | for p in pipelines: 280 | i, l = p 281 | b = [dp_s[i+j][1] for j in range(l)] 282 | if pipeline_type[i][l] == 2: 283 | optimal_name[i-1] = [optimal_name[i-1][0],"pipeline",pipeline_type[i][l],trace_name[i][l]] 284 | else: 285 | optimal_name[i] = [optimal_name[i][0],"pipeline",pipeline_type[i][l],trace_name[i][l]] 286 | 287 | 288 | for p in worst_pipelines: 289 | i, l = p 290 | b = [dp_ws[i+j][1] for j in range(l)] 291 | 292 | # print(f"=== N_CHANNEL: {args.n_channel}, N_GWRITE: {args.n_gwrite}, ramulator_disable_gwrite_latency_hiding: {args.ramulator_disable_gwrite_latency_hiding} ===") 293 | # print(f"newton++ (vs baseline): {round(baseline_cycle / newton_cycle, 3)} ({newton_cycle - baseline_cycle})") 294 | # print(f"pipeline (vs baseline): {round(baseline_cycle / dp_b[1][N], 3)} ({dp_b[1][N] - baseline_cycle})") 295 | # print(f"split (vs baseline): {round(baseline_cycle / split_cycle, 3)} ({split_cycle - baseline_cycle})") 296 | # print(f"all (vs baseline): {round(baseline_cycle / dp_s[1][N], 3)} ({dp_s[1][N] - baseline_cycle})") 297 | # print("====================\n") 298 | 299 | # final 300 | 301 | print(optimal_name) 302 | 303 | os.system(f'mkdir -p /root/PIMFlow/{args.model}/{args.policy}/{args.n_gwrite}') 304 | OPTIMAL=[] 305 | 306 | 307 | for i, k in enumerate(optimal_name): 308 | if optimal_name[i][3] != "pim": 309 | if k[2] == 1: 310 | optimal_name[i-1][1] = "pipeline" 311 | optimal_name[i-1][2] = 1 312 | optimal_name[i-1][3] = "pim" 313 | elif k[2] == 2: 314 | optimal_name[i+1][1] = "pipeline" 315 | optimal_name[i+1][2] = 2 316 | optimal_name[i+1][3] = "pim" 317 | elif k[2] == 3: 318 | optimal_name[i-1][1] = "pipeline" 319 | optimal_name[i-1][2] = 3 320 | optimal_name[i-1][3] = "pim" 321 | optimal_name[i+1][1] = "pipeline" 322 | optimal_name[i+1][2] = 3 323 | optimal_name[i+1][3] = "pim" 324 | 325 | 326 | if pipeline1_onnx is not None: 327 | pipeline1_onnx=list(pipeline1_onnx.values) 328 | else: 329 | pipeline1_onnx = [] 330 | if pipeline2_onnx is not None: 331 | pipeline2_onnx=list(pipeline2_onnx.values) 332 | else: 333 | pipeline2_onnx = [] 334 | if pipeline3_onnx is not None: 335 | pipeline3_onnx=list(pipeline3_onnx.values) 336 | else: 337 | pipeline3_onnx = [] 338 | for i, k in enumerate(optimal_name): 339 | if k[1] == "Newton+" or k[1] == "Newton++": 340 | # print(k) 341 | if k[2] != 0: 342 | os.system(f'cp -r /root/PIMFlow/layerwise/result_simulate/{args.model}/{k[2]}_16/traces-{k[3]} 
/root/PIMFlow/{args.model}/{args.policy}/{args.n_gwrite}/trace-{k[0]}') 343 | optimal_name[i][3] = f'trace-{k[0]}' 344 | OPTIMAL.append(k) 345 | if k[1] == "split": 346 | # print(k) 347 | if k[2] != 0: 348 | os.system(f'cp -r /root/PIMFlow/layerwise/result_simulate/{args.model}/{k[2]}_16/traces-{k[3]} /root/PIMFlow/{args.model}/{args.policy}/{args.n_gwrite}/trace-{k[0]}') 349 | optimal_name[i][3] = f'trace-{k[0]}' 350 | OPTIMAL.append(k) 351 | elif k[1] == "pipeline" and k[3] != "pim": 352 | if k[2] == 1: 353 | for j, row in enumerate(pipeline1_onnx): 354 | if row[0]== k[3]: 355 | optimal_name[i].append(pipeline1_onnx[j-1][0]) 356 | os.system(f'cp -r /root/PIMFlow/pipeline/result_simulate/{args.model}/{k[2]}_16/traces-{k[3]} /root/PIMFlow/{args.model}/{args.policy}/{args.n_gwrite}/trace-{k[0]}_2') 357 | os.system(f'cp -r /root/PIMFlow/pipeline/result_simulate/{args.model}/{k[2]}_16/traces-{k[4]} /root/PIMFlow/{args.model}/{args.policy}/{args.n_gwrite}/trace-{k[0]}_1') 358 | optimal_name[i][3] = f'trace-{k[0]}_1' 359 | optimal_name[i][4] = f'trace-{k[0]}_2' 360 | optimal_name[i].insert(0,optimal_name[i-1][0]) 361 | elif k[2] == 2: 362 | for j, row in enumerate(pipeline2_onnx): 363 | if row[0]== k[3]: 364 | optimal_name[i].append(pipeline2_onnx[j+1][0]) 365 | os.system(f'cp -r /root/PIMFlow/pipeline/result_simulate/{args.model}/{k[2]}_16/traces-{k[3]} /root/PIMFlow/{args.model}/{args.policy}/{args.n_gwrite}/trace-{k[0]}_1') 366 | os.system(f'cp -r /root/PIMFlow/pipeline/result_simulate/{args.model}/{k[2]}_16/traces-{k[4]} /root/PIMFlow/{args.model}/{args.policy}/{args.n_gwrite}/trace-{k[0]}_2') 367 | optimal_name[i][3] = f'trace-{k[0]}_1' 368 | optimal_name[i][4] = f'trace-{k[0]}_2' 369 | optimal_name[i].insert(1,optimal_name[i+1][0]) 370 | elif k[2] == 3: 371 | for j, row in enumerate(pipeline3_onnx): 372 | if row[0]== k[3]: 373 | optimal_name[i].append(pipeline3_onnx[j-1][0]) 374 | os.system(f'cp -r /root/PIMFlow/pipeline/result_simulate/{args.model}/{k[2]}_16/traces-{k[3]} /root/PIMFlow/{args.model}/{args.policy}/{args.n_gwrite}/trace-{k[0]}_2') 375 | os.system(f'cp -r /root/PIMFlow/pipeline/result_simulate/{args.model}/{k[2]}_16/traces-{k[4]} /root/PIMFlow/{args.model}/{args.policy}/{args.n_gwrite}/trace-{k[0]}_1') 376 | optimal_name[i][3] = f'trace-{k[0]}_1' 377 | optimal_name[i][4] = f'trace-{k[0]}_2' 378 | optimal_name[i].insert(0,optimal_name[i-1][0]) 379 | optimal_name[i].insert(2,optimal_name[i+1][0]) 380 | OPTIMAL.append(k) 381 | 382 | 383 | with open(f'solve_{args.model}_{args.policy}_{args.n_gwrite}{postfix}.csv', 'w',newline='') as f: 384 | write = csv.writer(f) 385 | write.writerows(OPTIMAL) 386 | os.system(f'mv solve_{args.model}_{args.policy}_{args.n_gwrite}{postfix}.csv /root/PIMFlow/{args.model}/{args.policy}/{args.n_gwrite}/') 387 | for i in OPTIMAL: 388 | print(i) 389 | -------------------------------------------------------------------------------- /solve/solve.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MODEL=$1 3 | N_CHANNEL=$2 4 | 5 | python3 solve.py --model=$MODEL --pipeline=none --n_channel=$N_CHANNEL --n_gwrite=1 --ramulator_disable_gwrite_latency_hiding --policy=Newton+ 6 | python3 solve.py --model=$MODEL --pipeline=none --n_channel=$N_CHANNEL --n_gwrite=2 --ramulator_disable_gwrite_latency_hiding --policy=Newton+ 7 | python3 solve.py --model=$MODEL --pipeline=none --n_channel=$N_CHANNEL --n_gwrite=4 --ramulator_disable_gwrite_latency_hiding --policy=Newton+ 8 | 9 | python3 solve.py --model=$MODEL 
--pipeline=none --n_channel=$N_CHANNEL --n_gwrite=1 --policy=Newton++ 10 | python3 solve.py --model=$MODEL --pipeline=none --n_channel=$N_CHANNEL --n_gwrite=2 --policy=Newton++ 11 | python3 solve.py --model=$MODEL --pipeline=none --n_channel=$N_CHANNEL --n_gwrite=4 --policy=Newton++ 12 | 13 | python3 solve.py --model=$MODEL --pipeline=all --n_channel=$N_CHANNEL --n_gwrite=1 --policy=Pipeline 14 | python3 solve.py --model=$MODEL --pipeline=all --n_channel=$N_CHANNEL --n_gwrite=2 --policy=Pipeline 15 | python3 solve.py --model=$MODEL --pipeline=all --n_channel=$N_CHANNEL --n_gwrite=4 --policy=Pipeline 16 | 17 | python3 solve.py --model=$MODEL --pipeline=none --n_channel=$N_CHANNEL --n_gwrite=1 --policy=MDDP 18 | python3 solve.py --model=$MODEL --pipeline=none --n_channel=$N_CHANNEL --n_gwrite=2 --policy=MDDP 19 | python3 solve.py --model=$MODEL --pipeline=none --n_channel=$N_CHANNEL --n_gwrite=4 --policy=MDDP 20 | 21 | python3 solve.py --model=$MODEL --pipeline=all --n_channel=$N_CHANNEL --n_gwrite=1 --policy=PIMFlow 22 | python3 solve.py --model=$MODEL --pipeline=all --n_channel=$N_CHANNEL --n_gwrite=2 --policy=PIMFlow 23 | python3 solve.py --model=$MODEL --pipeline=all --n_channel=$N_CHANNEL --n_gwrite=4 --policy=PIMFlow 24 | -------------------------------------------------------------------------------- /solve/stat.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MODEL=$1 3 | N_CHANNEL=$2 4 | STAGE=$3 5 | 6 | python3 stat.py --model=$MODEL --pipeline=all --n_channel=$N_CHANNEL --n_gwrite=1 --ramulator_disable_gwrite_latency_hiding --stage=$STAGE 7 | python3 stat.py --model=$MODEL --pipeline=all --n_channel=$N_CHANNEL --n_gwrite=1 --stage=$STAGE 8 | python3 stat.py --model=$MODEL --pipeline=all --n_channel=$N_CHANNEL --n_gwrite=2 --ramulator_disable_gwrite_latency_hiding --stage=$STAGE 9 | python3 stat.py --model=$MODEL --pipeline=all --n_channel=$N_CHANNEL --n_gwrite=2 --stage=$STAGE 10 | python3 stat.py --model=$MODEL --pipeline=all --n_channel=$N_CHANNEL --n_gwrite=4 --ramulator_disable_gwrite_latency_hiding --stage=$STAGE 11 | python3 stat.py --model=$MODEL --pipeline=all --n_channel=$N_CHANNEL --n_gwrite=4 --stage=$STAGE 12 | --------------------------------------------------------------------------------