├── .gitignore ├── .gitmodules ├── .readthedocs.yaml ├── CITATION.bib ├── LICENSE ├── Makefile ├── README.md ├── bin ├── compile ├── compile-release ├── gvprof ├── gvprof-debug ├── gvprof_overhead ├── install └── install-debug ├── docs ├── .gitignore ├── Makefile ├── barracuda.md ├── castro.md ├── conf.py ├── darknet.md ├── deepwave.md ├── faq.md ├── index.rst ├── install.md ├── lammps.md ├── manual.md ├── namd.md ├── preface.md ├── pytorch.md ├── qmcpack.md ├── requirements.txt ├── roadmap.md ├── rodinia.md ├── unit_tests.md └── workflow.md ├── include ├── gpu-patch.h ├── gpu-queue.h └── utils.h ├── python ├── .gitignore ├── __init__.py ├── bench.py ├── filter_time.py ├── gviewer.py ├── overhead.sh ├── test.py ├── test_cases.py ├── tests │ ├── .gitignore │ ├── __init__.py │ ├── data_flow_test.py │ ├── instruction_test.py │ ├── redundancy_test.py │ └── value_pattern_test.py └── utils.py ├── requirements.txt └── src ├── gpu-analysis.cu ├── gpu-patch-address.cu ├── gpu-patch-aux.cu ├── gpu-patch-torch-aux.cu └── gpu-patch.cu /.gitignore: -------------------------------------------------------------------------------- 1 | *.fatbin 2 | *.cubin 3 | gvprof/* 4 | 5 | .vscode/ 6 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "samples"] 2 | path = samples 3 | url = https://github.com/FindHao/hpctoolkit-gpu-sanitizer-samples.git 4 | [submodule "jquery.graphviz.svg"] 5 | path = jquery.graphviz.svg 6 | url = https://github.com/Jokeren/jquery.graphviz.svg.git 7 | [submodule "redshow"] 8 | path = redshow 9 | url = https://github.com/Lin-Mao/redshow.git 10 | [submodule "hpctoolkit"] 11 | path = hpctoolkit 12 | url = https://github.com/Lin-Mao/hpctoolkit.git 13 | [submodule "torch-monitor"] 14 | path = torch-monitor 15 | url = https://github.com/Lin-Mao/torch-monitor.git 16 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | build: 9 | os: ubuntu-22.04 10 | tools: 11 | python: "3.12" 12 | 13 | # Build documentation in the docs/ directory with Sphinx 14 | sphinx: 15 | configuration: docs/conf.py 16 | 17 | python: 18 | install: 19 | - requirements: docs/requirements.txt 20 | 21 | # Optionally build your docs in additional formats such as PDF 22 | formats: 23 | - pdf 24 | -------------------------------------------------------------------------------- /CITATION.bib: -------------------------------------------------------------------------------- 1 | @inproceedings{lin2023drgpum, 2 | title={DrGPUM: Guiding Memory Optimization for GPU-Accelerated Applications}, 3 | author={Lin, Mao and Zhou, Keren and Su, Pengfei}, 4 | booktitle={Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 3}, 5 | pages={164--178}, 6 | year={2023}, 7 | isbn={9781450399180}, 8 | publisher={Association for Computing Machinery}, 9 | address={New York, NY, USA}, 10 | url={https://doi.org/10.1145/3582016.3582044}, 11 | doi={10.1145/3582016.3582044}, 12 | keywords={GPU profilers, Memory management, CUDA, GPUs}, 13 | location={Vancouver, BC, Canada}, 14 | series={ASPLOS 2023} 15 | } 16 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2020, University of California, Merced All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its 16 | contributors may be used to endorse or promote products derived from 17 | this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Modified from https://github.com/Jokeren/compute-sanitizer-samples/tree/master/MemoryTracker 2 | PROJECT ?= gpu-patch.fatbin gpu-patch-address.fatbin gpu-patch-aux.fatbin gpu-patch-torch-aux.fatbin 3 | PROJECT_ANALYSIS ?= gpu-analysis.fatbin 4 | 5 | # Location of the CUDA Toolkit 6 | CUDA_PATH ?= /usr/local/cuda 7 | SANITIZER_PATH ?= $(CUDA_PATH)/compute-sanitizer 8 | CUPTI_PATH ?= $(CUDA_PATH) 9 | 10 | NVCC := $(CUDA_PATH)/bin/nvcc 11 | 12 | INCLUDE_DIRS := -I$(CUDA_PATH)/include -I$(SANITIZER_PATH)/include -I$(CUPTI_PATH)/include -Iinclude 13 | SRC_DIR := src 14 | CXXFLAGS := $(INCLUDE_DIRS) -O3 --fatbin 15 | 16 | ARCHS := 50 60 70 72 75 80 86 17 | 18 | # Generate SASS code for each SM architecture in $(ARCHS) 19 | $(foreach sm,$(ARCHS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 20 | 21 | # Generate PTX code from the highest SM architecture in $(ARCHS) to guarantee forward-compatibility 22 | HIGHEST_SM := $(lastword $(sort $(ARCHS))) 23 | GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) 24 | 25 | all: $(PROJECT) $(PROJECT_ANALYSIS) 26 | 27 | ifdef PREFIX 28 | install: all 29 | endif 30 | 31 | $(PROJECT): %.fatbin : $(SRC_DIR)/%.cu 32 | $(NVCC) $(CXXFLAGS) $(GENCODE_FLAGS) --compile-as-tools-patch -o $@ -c $< 33 | 34 | $(PROJECT_ANALYSIS): %.fatbin : $(SRC_DIR)/%.cu 35 | $(NVCC) $(CXXFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 36 | 37 | ifdef PREFIX 38 | install: $(PROJECT) $(PROJECT_ANALYSIS) 39 | mkdir -p $(PREFIX)/lib 40 | mkdir -p $(PREFIX)/include 41 | mkdir -p $(PREFIX)/bin 42 | cp -rf $(PROJECT) $(PROJECT_ANALYSIS) $(PREFIX)/lib 43 | cp -rf include $(PREFIX) 44 | endif 45 | 46 | clean: 47 | rm -f $(PROJECT) $(PROJECT_ANALYSIS) 48 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DrGPUM 2 | 3 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.7588406.svg)](https://doi.org/10.5281/zenodo.7588406) 4 | [![CodeFactor](https://www.codefactor.io/repository/github/lin-mao/drgpum/badge)](https://www.codefactor.io/repository/github/lin-mao/drgpum) 5 | [![Documentation Status](https://readthedocs.org/projects/drgpum/badge/?version=latest)](https://drgpum.readthedocs.io/en/latest/?badge=latest) 6 | 7 | 8 | DrGPUM is a memory profiler for NVIDIA GPUs that explores memory inefficiencies in GPU-accelerated applications. 
9 | 10 | ## Quick Start 11 | 12 | ```bash 13 | git clone --recursive https://github.com/Lin-Mao/DrGPUM.git && cd DrGPUM 14 | 15 | git submodule update --init --recursive 16 | 17 | # Specify PyTorch dir 18 | export PYTORCH_DIR=path_to_pytorch/torch 19 | 20 | # Install DrGPUM 21 | ./bin/install 22 | 23 | # Setup environment variables 24 | export DrGPUM_PATH=$(pwd)/gvprof 25 | export PATH=${DrGPUM_PATH}/bin:$PATH 26 | export PATH=${DrGPUM_PATH}/hpctoolkit/bin:$PATH 27 | export PATH=${DrGPUM_PATH}/redshow/bin:$PATH 28 | 29 | # Test a sample 30 | cd samples/vectorAdd.f32 31 | make 32 | gvprof -v -e memory_liveness ./vectorAdd 33 | ``` 34 | 35 | ## Documentation 36 | 37 | - [Installation Guide](https://drgpum.readthedocs.io/en/latest/install.html) 38 | - [User's Guide](https://drgpum.readthedocs.io/en/latest/manual.html) 39 | - [Developer's Guide](https://drgpum.readthedocs.io/en/latest/workflow.html) 40 | 41 | ## Papers 42 | 43 | - Mao Lin, Keren Zhou, and Pengfei Su. 2023. [DrGPUM: Guiding Memory Optimization for GPU-accelerated Applications](https://doi.org/10.1145/3582016.3582044). In Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 3 (ASPLOS ’23), March 25–29, 2023, Vancouver, BC, Canada. ACM, New York, NY, USA, 15 pages. 44 | -------------------------------------------------------------------------------- /bin/compile: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SOURCE_DIR=$(pwd) 4 | DIR="" 5 | CUDA_PATH=/usr/local/cuda/ 6 | SANITIZER_PATH=$CUDA_PATH/compute-sanitizer 7 | 8 | if [ $# -eq 0 ]; then 9 | DIR=$(pwd)/gvprof 10 | else 11 | if [ $# -eq 1 ]; then 12 | DIR=$1 13 | else 14 | if [ $# -eq 2 ]; then 15 | DIR=$1 16 | CUDA_PATH=$2 17 | SANITIZER_PATH=$2/compute-sanitizer 18 | else 19 | if [ $# -eq 3 ]; then 20 | DIR=$1 21 | CUDA_PATH=$2 22 | SANITIZER_PATH=$3 23 | fi 24 | fi 25 | fi 26 | fi 27 | 28 | 29 | if [ -z "$DIR" ] 30 | then 31 | echo "Wrong paths" 32 | echo "./install " 33 | exit 34 | fi 35 | 36 | echo $DIR 37 | echo $CUDA_PATH 38 | echo $SANITIZER_PATH 39 | 40 | if [ ! 
-d $DIR ] 41 | then 42 | mkdir $DIR 43 | fi 44 | 45 | cd $DIR 46 | # Install spack 47 | # git clone https://github.com/spack/spack.git 48 | export SPACK_ROOT=$(pwd)/spack 49 | export PATH=${SPACK_ROOT}/bin:${PATH} 50 | source ${SPACK_ROOT}/share/spack/setup-env.sh 51 | 52 | # Install hpctoolkit dependencies 53 | # spack install --only dependencies hpctoolkit ^dyninst@master ^binutils@2.34+libiberty~nls 54 | # spack install libmonitor@master+dlopen+hpctoolkit 55 | # spack install libunwind 56 | 57 | # Fix bug 58 | # spack install mbedtls gotcha 59 | 60 | # Python version for torch monitor 61 | PY_VERSION=3.8 62 | # spack install python@$PY_VERSION 63 | 64 | # Install gpu-patch 65 | cd $SOURCE_DIR 66 | make PREFIX=$DIR/gpu-patch SANITIZER_PATH=$SANITIZER_PATH CUDA_PATH=$CUDA_PATH install 67 | 68 | # Find spack and boost dir 69 | B=$(spack find --path boost | tail -n 1 | cut -d ' ' -f 3) 70 | S=${B%/*} 71 | UNWIND=$(spack find --path libunwind | tail -n 1 | cut -d ' ' -f 3) 72 | 73 | PY_DEV=$(spack find --path python@$PY_VERSION | tail -n 1 | cut -d ' ' -f 3) 74 | 75 | # Install torch monitor 76 | cd $SOURCE_DIR 77 | cd torch-monitor 78 | make PREFIX=$DIR/torch-monitor PYTHON_INCLUDE_DIR=$PY_DEV/include/python$PY_VERSION \ 79 | PYTHON_LIB_DIR=$PY_DEV/lib/python$PY_VERSION PYTHON_VERSION=$PY_VERSION \ 80 | TORCH_DIR=$PYTORCH_DIR DEBUG=1 install 81 | 82 | # Install redshow 83 | cd $SOURCE_DIR 84 | cd redshow 85 | make clean 86 | make PREFIX=$DIR/redshow BOOST_DIR=$B LIBUNWIND_DIR=$UNWIND GPU_PATCH_DIR=$DIR/gpu-patch/ \ 87 | TORCH_MONITOR_DIR=$DIR/torch-monitor DEBUG=1 install 88 | 89 | # install hpctoolkit 90 | cd $SOURCE_DIR 91 | cd hpctoolkit 92 | # mkdir build 93 | cd build 94 | ../configure --prefix=$DIR/hpctoolkit --with-cuda=$CUDA_PATH \ 95 | --with-sanitizer=$SANITIZER_PATH --with-gpu-patch=$DIR/gpu-patch \ 96 | --with-redshow=$DIR/redshow --with-spack=$S --enable-develop 97 | make install -j 98 | 99 | cd $SOURCE_DIR 100 | # mkdir $DIR/bin 101 | # mkdir $DIR/python 102 | cp ./bin/gvprof $DIR/bin/ 103 | cp -r ./python $DIR 104 | echo "python $DIR/python/gviewer.py "'${@:1}' > $DIR/bin/gviewer 105 | chmod +x $DIR/bin/gviewer 106 | 107 | echo "Install in $DIR/bin/gvprof" 108 | -------------------------------------------------------------------------------- /bin/compile-release: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SOURCE_DIR=$(pwd) 4 | DIR="" 5 | CUDA_PATH=/usr/local/cuda/ 6 | SANITIZER_PATH=$CUDA_PATH/compute-sanitizer 7 | 8 | if [ $# -eq 0 ]; then 9 | DIR=$(pwd)/gvprof 10 | else 11 | if [ $# -eq 1 ]; then 12 | DIR=$1 13 | else 14 | if [ $# -eq 2 ]; then 15 | DIR=$1 16 | CUDA_PATH=$2 17 | SANITIZER_PATH=$2/compute-sanitizer 18 | else 19 | if [ $# -eq 3 ]; then 20 | DIR=$1 21 | CUDA_PATH=$2 22 | SANITIZER_PATH=$3 23 | fi 24 | fi 25 | fi 26 | fi 27 | 28 | 29 | if [ -z "$DIR" ] 30 | then 31 | echo "Wrong paths" 32 | echo "./install " 33 | exit 34 | fi 35 | 36 | echo $DIR 37 | echo $CUDA_PATH 38 | echo $SANITIZER_PATH 39 | 40 | if [ ! 
-d $DIR ] 41 | then 42 | mkdir $DIR 43 | fi 44 | 45 | cd $DIR 46 | # Install spack 47 | # git clone https://github.com/spack/spack.git 48 | export SPACK_ROOT=$(pwd)/spack 49 | export PATH=${SPACK_ROOT}/bin:${PATH} 50 | source ${SPACK_ROOT}/share/spack/setup-env.sh 51 | 52 | # Install hpctoolkit dependencies 53 | # spack install --only dependencies hpctoolkit ^dyninst@master ^binutils@2.34+libiberty~nls 54 | # spack install libmonitor@master+dlopen+hpctoolkit 55 | # spack install libunwind 56 | 57 | # Fix bug 58 | # spack install mbedtls gotcha 59 | 60 | # Python version for torch monitor 61 | PY_VERSION=3.8 62 | # spack install python@$PY_VERSION 63 | 64 | # Install gpu-patch 65 | cd $SOURCE_DIR 66 | make PREFIX=$DIR/gpu-patch SANITIZER_PATH=$SANITIZER_PATH CUDA_PATH=$CUDA_PATH install 67 | 68 | # Find spack and boost dir 69 | B=$(spack find --path boost | tail -n 1 | cut -d ' ' -f 3) 70 | S=${B%/*} 71 | UNWIND=$(spack find --path libunwind | tail -n 1 | cut -d ' ' -f 3) 72 | 73 | PY_DEV=$(spack find --path python@$PY_VERSION | tail -n 1 | cut -d ' ' -f 3) 74 | 75 | # Install torch monitor 76 | cd $SOURCE_DIR 77 | cd torch-monitor 78 | make clean 79 | make PREFIX=$DIR/torch-monitor PYTHON_INCLUDE_DIR=$PY_DEV/include/python$PY_VERSION \ 80 | PYTHON_LIB_DIR=$PY_DEV/lib/python$PY_VERSION PYTHON_VERSION=$PY_VERSION \ 81 | TORCH_DIR=$PYTORCH_DIR install 82 | 83 | # Install redshow 84 | cd $SOURCE_DIR 85 | cd redshow 86 | make clean 87 | make PREFIX=$DIR/redshow BOOST_DIR=$B LIBUNWIND_DIR=$UNWIND GPU_PATCH_DIR=$DIR/gpu-patch/ \ 88 | TORCH_MONITOR_DIR=$DIR/torch-monitor install 89 | 90 | # install hpctoolkit 91 | cd $SOURCE_DIR 92 | cd hpctoolkit 93 | rm -rf build 94 | mkdir build 95 | cd build 96 | ../configure --prefix=$DIR/hpctoolkit --with-cuda=$CUDA_PATH \ 97 | --with-sanitizer=$SANITIZER_PATH --with-gpu-patch=$DIR/gpu-patch \ 98 | --with-redshow=$DIR/redshow --with-spack=$S 99 | make install -j 100 | 101 | cd $SOURCE_DIR 102 | # mkdir $DIR/bin 103 | # mkdir $DIR/python 104 | cp ./bin/gvprof $DIR/bin/ 105 | cp -r ./python $DIR 106 | echo "python $DIR/python/gviewer.py "'${@:1}' > $DIR/bin/gviewer 107 | chmod +x $DIR/bin/gviewer 108 | 109 | echo "Install in $DIR/bin/gvprof" 110 | -------------------------------------------------------------------------------- /bin/gvprof: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | usage() 4 | { 5 | cat < 11 | redundancy 12 | data_flow 13 | value_pattern 14 | -j 15 | multi-threading analysis for binaries 16 | -ck 17 | control knob values 18 | -cfg gpu-cfg 19 | enable fine-grained gpu instruction analysis 20 | -s 21 | set block sampling frequency 22 | -l 23 | pass launcher command for execution. 
(e.g., "mpirun -np 1") 24 | -v verbose 25 | redirect output to gvprof.log 26 | EOF 27 | exit 0 28 | } 29 | 30 | while test "x$1" != x 31 | do 32 | arg="$1" ; shift 33 | case "$arg" in 34 | -v) 35 | export GVPROF_VERBOSE=1 36 | ;; 37 | -j) 38 | export GVPROF_THREADS=$1 39 | shift 40 | ;; 41 | -e) 42 | export GVPROF_EVENT=$1 43 | shift 44 | ;; 45 | -cfg) 46 | export GVPROF_CFG=1 47 | ;; 48 | -l) 49 | export GVPROF_LAUNCHER="$1" 50 | shift 51 | ;; 52 | -ck) 53 | export GVPROF_CONTROL_KNOBS="$GVPROF_CONTROL_KNOBS -ck $1" 54 | shift 55 | ;; 56 | -s) 57 | export GVPROF_SAMPLING_FREQUENCY="@$1" 58 | shift 59 | ;; 60 | -h) 61 | usage 62 | exit 63 | ;; 64 | * ) 65 | set -- "$arg" "$@" 66 | break 67 | ;; 68 | esac 69 | done 70 | 71 | GVPROF_EXEC=$1 72 | GVPROF_ARGS="${*:2}" 73 | 74 | if [ -z "$GVPROF_EXEC" ] 75 | then 76 | echo "Empty executable" 77 | exit 78 | fi 79 | 80 | if [ -z "$GVPROF_EVENT" ] 81 | then 82 | echo "Empty event" 83 | exit 84 | fi 85 | 86 | if [ -z "$GVPROF_THREADS" ] 87 | then 88 | export GVPROF_THREADS=1 89 | fi 90 | 91 | if [ ! -z "$GVPROF_VERBOSE" ] 92 | then 93 | export GVPROF_REDIRECT=./gvprof.log 94 | else 95 | export GVPROF_REDIRECT=/dev/null 96 | fi 97 | 98 | ##Test 99 | #echo $GVPROF_EXEC 100 | #echo $GVPROF_ARGS 101 | #echo $GVPROF_THREADS 102 | #echo $GVPROF_EVENT 103 | #echo $GVPROF_CONTROL_KNOBS 104 | #echo $GVPROF_LAUNCHER 105 | 106 | MEASUREMENTS=gvprof-measurements 107 | DATABASE=gvprof-database 108 | echo "Make sure $MEASUREMENTS and $DATABASE is clean" 109 | rm -rf $MEASUREMENTS 110 | rm -rf $DATABASE 111 | 112 | echo "First pass: dump and analyze CPU and GPU binaries" 113 | 114 | $GVPROF_LAUNCHER hpcrun -e gpu=nvidia -o $MEASUREMENTS $GVPROF_EXEC $GVPROF_ARGS &> $GVPROF_REDIRECT 115 | rm -rf $MEASUREMENTS/*.hpcrun 116 | 117 | if [ ! -z "$GVPROF_CFG" ] 118 | then 119 | hpcstruct --gpucfg yes -j $GVPROF_THREADS $MEASUREMENTS &>> $GVPROF_REDIRECT 120 | else 121 | hpcstruct -j $GVPROF_THREADS $MEASUREMENTS &>> $GVPROF_REDIRECT 122 | fi 123 | 124 | BASENAME=./$(basename $GVPROF_EXEC) 125 | hpcstruct $GVPROF_EXEC -o $BASENAME".hpcstruct" &>> $GVPROF_REDIRECT 126 | 127 | echo "Second pass: profiling" 128 | 129 | $GVPROF_LAUNCHER hpcrun -e gpu=nvidia,$GVPROF_EVENT$GVPROF_SAMPLING_FREQUENCY -o $MEASUREMENTS $GVPROF_CONTROL_KNOBS $GVPROF_EXEC $GVPROF_ARGS &>> $GVPROF_REDIRECT 130 | hpcprof -S $BASENAME".hpcstruct" -o $DATABASE $MEASUREMENTS &>> $GVPROF_REDIRECT 131 | 132 | echo "Done..." 133 | -------------------------------------------------------------------------------- /bin/gvprof-debug: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | usage() 4 | { 5 | cat < 11 | redundancy 12 | data_flow 13 | value_pattern 14 | -j 15 | multi-threading analysis for binaries 16 | -ck 17 | control knob values 18 | -cfg gpu-cfg 19 | enable fine-grained gpu instruction analysis 20 | -s 21 | set block sampling frequency 22 | -l 23 | pass launcher command for execution. 
(e.g., "mpirun -np 1") 24 | -v verbose 25 | redirect output to gvprof.log 26 | EOF 27 | exit 0 28 | } 29 | 30 | while test "x$1" != x 31 | do 32 | arg="$1" ; shift 33 | case "$arg" in 34 | -v) 35 | export GVPROF_VERBOSE=1 36 | ;; 37 | -j) 38 | export GVPROF_THREADS=$1 39 | shift 40 | ;; 41 | -e) 42 | export GVPROF_EVENT=$1 43 | shift 44 | ;; 45 | -cfg) 46 | export GVPROF_CFG=1 47 | ;; 48 | -l) 49 | export GVPROF_LAUNCHER="$1" 50 | shift 51 | ;; 52 | -ck) 53 | export GVPROF_CONTROL_KNOBS="$GVPROF_CONTROL_KNOBS -ck $1" 54 | shift 55 | ;; 56 | -s) 57 | export GVPROF_SAMPLING_FREQUENCY="@$1" 58 | shift 59 | ;; 60 | -h) 61 | usage 62 | exit 63 | ;; 64 | * ) 65 | set -- "$arg" "$@" 66 | break 67 | ;; 68 | esac 69 | done 70 | 71 | GVPROF_EXEC=$1 72 | GVPROF_ARGS="${*:2}" 73 | 74 | if [ -z "$GVPROF_EXEC" ] 75 | then 76 | echo "Empty executable" 77 | exit 78 | fi 79 | 80 | if [ -z "$GVPROF_EVENT" ] 81 | then 82 | echo "Empty event" 83 | exit 84 | fi 85 | 86 | if [ -z "$GVPROF_THREADS" ] 87 | then 88 | export GVPROF_THREADS=1 89 | fi 90 | 91 | if [ ! -z "$GVPROF_VERBOSE" ] 92 | then 93 | export GVPROF_REDIRECT=./gvprof.log 94 | else 95 | export GVPROF_REDIRECT=/dev/null 96 | fi 97 | 98 | ##Test 99 | #echo $GVPROF_EXEC 100 | #echo $GVPROF_ARGS 101 | #echo $GVPROF_THREADS 102 | #echo $GVPROF_EVENT 103 | #echo $GVPROF_CONTROL_KNOBS 104 | #echo $GVPROF_LAUNCHER 105 | 106 | MEASUREMENTS=gvprof-measurements 107 | DATABASE=gvprof-database 108 | echo "Make sure $MEASUREMENTS and $DATABASE is clean" 109 | rm -rf $MEASUREMENTS 110 | rm -rf $DATABASE 111 | 112 | echo "First pass: dump and analyze CPU and GPU binaries" 113 | 114 | $GVPROF_LAUNCHER hpcrun -e gpu=nvidia -o $MEASUREMENTS $GVPROF_EXEC $GVPROF_ARGS &> $GVPROF_REDIRECT 115 | rm -rf $MEASUREMENTS/*.hpcrun 116 | 117 | if [ ! -z "$GVPROF_CFG" ] 118 | then 119 | hpcstruct --gpucfg yes -j $GVPROF_THREADS $MEASUREMENTS &>> $GVPROF_REDIRECT 120 | else 121 | hpcstruct -j $GVPROF_THREADS $MEASUREMENTS &>> $GVPROF_REDIRECT 122 | fi 123 | 124 | BASENAME=./$(basename $GVPROF_EXEC) 125 | hpcstruct $GVPROF_EXEC -o $BASENAME".hpcstruct" &>> $GVPROF_REDIRECT 126 | 127 | echo "Second pass: profiling" 128 | 129 | # $GVPROF_LAUNCHER hpcrun -e gpu=nvidia,$GVPROF_EVENT$GVPROF_SAMPLING_FREQUENCY -o $MEASUREMENTS $GVPROF_CONTROL_KNOBS $GVPROF_EXEC $GVPROF_ARGS &>> $GVPROF_REDIRECT 130 | # hpcprof -S $BASENAME".hpcstruct" -o $DATABASE $MEASUREMENTS &>> $GVPROF_REDIRECT 131 | 132 | # debug mode 133 | $GVPROF_LAUNCHER hpcrun -d -e gpu=nvidia,$GVPROF_EVENT$GVPROF_SAMPLING_FREQUENCY -o $MEASUREMENTS $GVPROF_CONTROL_KNOBS $GVPROF_EXEC $GVPROF_ARGS &>> $GVPROF_REDIRECT 134 | hpcprof -S $BASENAME".hpcstruct" -o $DATABASE $MEASUREMENTS &>> $GVPROF_REDIRECT 135 | 136 | echo "Done..." 137 | -------------------------------------------------------------------------------- /bin/gvprof_overhead: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | usage() { 4 | cat < 10 | redundancy 11 | data_flow 12 | value_pattern 13 | -j 14 | multi-threading analysis for binaries 15 | -ck 16 | control knob values 17 | -cfg gpu-cfg 18 | enable fine-grained gpu instruction analysis 19 | -s 20 | set block sampling frequency 21 | -l 22 | pass launcher command for execution. 
(e.g., "mpirun -np 1") 23 | -v verbose 24 | redirect output to gvprof.log 25 | EOF 26 | exit 0 27 | } 28 | 29 | while test "x$1" != x; do 30 | arg="$1" 31 | shift 32 | case "$arg" in 33 | -v) 34 | export GVPROF_VERBOSE=1 35 | ;; 36 | -j) 37 | export GVPROF_THREADS=$1 38 | shift 39 | ;; 40 | -e) 41 | export GVPROF_EVENT=$1 42 | shift 43 | ;; 44 | -cfg) 45 | export GVPROF_CFG=1 46 | ;; 47 | -l) 48 | export GVPROF_LAUNCHER="$1" 49 | shift 50 | ;; 51 | -ck) 52 | export GVPROF_CONTROL_KNOBS="$GVPROF_CONTROL_KNOBS -ck $1" 53 | shift 54 | ;; 55 | -s) 56 | export GVPROF_SAMPLING_FREQUENCY="@$1" 57 | shift 58 | ;; 59 | -i) 60 | export GVPROF_ITERATIONS="$1" 61 | shift 62 | ;; 63 | -h) 64 | usage 65 | exit 66 | ;; 67 | *) 68 | set -- "$arg" "$@" 69 | break 70 | ;; 71 | esac 72 | done 73 | 74 | GVPROF_EXEC=$1 75 | GVPROF_ARGS="${*:2}" 76 | 77 | if [ -z "$GVPROF_EXEC" ]; then 78 | echo "Empty executable" 79 | exit 80 | fi 81 | 82 | if [ -z "$GVPROF_EVENT" ]; then 83 | echo "Empty event" 84 | exit 85 | fi 86 | 87 | if [ -z "$GVPROF_THREADS" ]; then 88 | export GVPROF_THREADS=1 89 | fi 90 | 91 | if [ ! -z "$GVPROF_VERBOSE" ]; then 92 | export GVPROF_REDIRECT=./gvprof.log 93 | else 94 | export GVPROF_REDIRECT=/dev/null 95 | fi 96 | 97 | ##Test 98 | #echo $GVPROF_EXEC 99 | #echo $GVPROF_ARGS 100 | #echo $GVPROF_THREADS 101 | #echo $GVPROF_EVENT 102 | #echo $GVPROF_CONTROL_KNOBS 103 | #echo $GVPROF_LAUNCHER 104 | 105 | MEASUREMENTS=gvprof-measurements 106 | DATABASE=gvprof-database 107 | echo "Make sure $MEASUREMENTS and $DATABASE is clean" 108 | rm -rf $MEASUREMENTS 109 | rm -rf $DATABASE 110 | 111 | echo "First pass: dump and analyze CPU and GPU binaries" 112 | 113 | $GVPROF_LAUNCHER hpcrun -e gpu=nvidia -o $MEASUREMENTS $GVPROF_EXEC $GVPROF_ARGS &>$GVPROF_REDIRECT 114 | rm -rf $MEASUREMENTS/*.hpcrun 115 | 116 | if [ ! -z "$GVPROF_CFG" ]; then 117 | hpcstruct --gpucfg yes -j $GVPROF_THREADS $MEASUREMENTS &>>$GVPROF_REDIRECT 118 | else 119 | hpcstruct -j $GVPROF_THREADS $MEASUREMENTS &>>$GVPROF_REDIRECT 120 | fi 121 | 122 | BASENAME=./$(basename $GVPROF_EXEC) 123 | hpcstruct $GVPROF_EXEC -o $BASENAME".hpcstruct" &>>$GVPROF_REDIRECT 124 | 125 | echo "Second pass: profiling" 126 | 127 | if [ -z "$GVPROF_ITERATIONS" ]; then 128 | export GVPROF_ITERATIONS=1 129 | fi 130 | echo "ITERATIONS $GVPROF_ITERATIONS" 131 | START=1 132 | for (( i=$START; i<=${GVPROF_ITERATIONS}; i++ ));do 133 | { time $GVPROF_LAUNCHER hpcrun -e gpu=nvidia,$GVPROF_EVENT$GVPROF_SAMPLING_FREQUENCY -o $MEASUREMENTS $GVPROF_CONTROL_KNOBS $GVPROF_EXEC $GVPROF_ARGS &>>$GVPROF_REDIRECT 134 | hpcprof -S $BASENAME".hpcstruct" -o $DATABASE $MEASUREMENTS &>>$GVPROF_REDIRECT ; } 2>>time_$GVPROF_EVENT.txt 135 | done 136 | 137 | echo "Done..." 
138 | -------------------------------------------------------------------------------- /bin/install: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SOURCE_DIR=$(pwd) 4 | DIR="" 5 | CUDA_PATH=/usr/local/cuda/ 6 | SANITIZER_PATH=$CUDA_PATH/compute-sanitizer 7 | 8 | if [ $# -eq 0 ]; then 9 | DIR=$(pwd)/gvprof 10 | else 11 | if [ $# -eq 1 ]; then 12 | DIR=$1 13 | else 14 | if [ $# -eq 2 ]; then 15 | DIR=$1 16 | CUDA_PATH=$2 17 | SANITIZER_PATH=$2/compute-sanitizer 18 | else 19 | if [ $# -eq 3 ]; then 20 | DIR=$1 21 | CUDA_PATH=$2 22 | SANITIZER_PATH=$3 23 | fi 24 | fi 25 | fi 26 | fi 27 | 28 | 29 | if [ -z "$DIR" ] 30 | then 31 | echo "Wrong paths" 32 | echo "./install " 33 | exit 34 | fi 35 | 36 | echo $DIR 37 | echo $CUDA_PATH 38 | echo $SANITIZER_PATH 39 | 40 | if [ ! -d $DIR ] 41 | then 42 | mkdir $DIR 43 | fi 44 | 45 | cd $DIR 46 | # Install spack 47 | git clone https://github.com/Lin-Mao/spack.git 48 | export SPACK_ROOT=$(pwd)/spack 49 | export PATH=${SPACK_ROOT}/bin:${PATH} 50 | source ${SPACK_ROOT}/share/spack/setup-env.sh 51 | 52 | # Install hpctoolkit dependencies 53 | spack install --only dependencies hpctoolkit ^dyninst@12.0.1 ^binutils@2.34+libiberty~nls 54 | spack install libmonitor@master+dlopen+hpctoolkit 55 | spack install libunwind 56 | 57 | # Fix bug 58 | spack install mbedtls gotcha 59 | 60 | # Python version for torch monitor 61 | PY_VERSION=3.8 62 | spack install python@$PY_VERSION 63 | 64 | # Install gpu-patch 65 | cd $SOURCE_DIR 66 | make PREFIX=$DIR/gpu-patch SANITIZER_PATH=$SANITIZER_PATH CUDA_PATH=$CUDA_PATH install 67 | 68 | # Find spack and boost dir 69 | B=$(spack find --path boost | tail -n 1 | cut -d ' ' -f 3) 70 | S=${B%/*} 71 | UNWIND=$(spack find --path libunwind | tail -n 1 | cut -d ' ' -f 3) 72 | 73 | PY_DEV=$(spack find --path python@$PY_VERSION | tail -n 1 | cut -d ' ' -f 3) 74 | 75 | # Install torch monitor 76 | cd $SOURCE_DIR 77 | cd torch-monitor 78 | make PREFIX=$DIR/torch-monitor PYTHON_INCLUDE_DIR=$PY_DEV/include/python$PY_VERSION \ 79 | PYTHON_LIB_DIR=$PY_DEV/lib/python$PY_VERSION PYTHON_VERSION=$PY_VERSION \ 80 | TORCH_DIR=$PYTORCH_DIR install 81 | 82 | # Install redshow 83 | cd $SOURCE_DIR 84 | cd redshow 85 | make PREFIX=$DIR/redshow BOOST_DIR=$B LIBUNWIND_DIR=$UNWIND GPU_PATCH_DIR=$DIR/gpu-patch/ \ 86 | TORCH_MONITOR_DIR=$DIR/torch-monitor install 87 | 88 | # install hpctoolkit 89 | cd $SOURCE_DIR 90 | cd hpctoolkit 91 | mkdir build 92 | cd build 93 | ../configure --prefix=$DIR/hpctoolkit --with-cuda=$CUDA_PATH \ 94 | --with-sanitizer=$SANITIZER_PATH --with-gpu-patch=$DIR/gpu-patch \ 95 | --with-redshow=$DIR/redshow --with-spack=$S 96 | make install -j16 97 | 98 | cd $SOURCE_DIR 99 | mkdir $DIR/bin 100 | mkdir $DIR/python 101 | cp ./bin/gvprof $DIR/bin/ 102 | cp -r ./python $DIR 103 | echo "python $DIR/python/gviewer.py "'${@:1}' > $DIR/bin/gviewer 104 | chmod +x $DIR/bin/gviewer 105 | 106 | echo "Install in $DIR/bin/gvprof" 107 | -------------------------------------------------------------------------------- /bin/install-debug: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SOURCE_DIR=$(pwd) 4 | DIR="" 5 | CUDA_PATH=/usr/local/cuda/ 6 | SANITIZER_PATH=$CUDA_PATH/compute-sanitizer 7 | 8 | if [ $# -eq 0 ]; then 9 | DIR=$(pwd)/gvprof 10 | else 11 | if [ $# -eq 1 ]; then 12 | DIR=$1 13 | else 14 | if [ $# -eq 2 ]; then 15 | DIR=$1 16 | CUDA_PATH=$2 17 | SANITIZER_PATH=$2/compute-sanitizer 18 | else 19 | if [ $# -eq 3 ]; then 20 | 
DIR=$1 21 | CUDA_PATH=$2 22 | SANITIZER_PATH=$3 23 | fi 24 | fi 25 | fi 26 | fi 27 | 28 | 29 | if [ -z "$DIR" ] 30 | then 31 | echo "Wrong paths" 32 | echo "./install " 33 | exit 34 | fi 35 | 36 | echo $DIR 37 | echo $CUDA_PATH 38 | echo $SANITIZER_PATH 39 | 40 | if [ ! -d $DIR ] 41 | then 42 | mkdir $DIR 43 | fi 44 | 45 | cd $DIR 46 | # Install spack 47 | git clone https://github.com/Lin-Mao/spack.git 48 | export SPACK_ROOT=$(pwd)/spack 49 | export PATH=${SPACK_ROOT}/bin:${PATH} 50 | source ${SPACK_ROOT}/share/spack/setup-env.sh 51 | 52 | # Install hpctoolkit dependencies 53 | spack install --only dependencies hpctoolkit ^dyninst@master ^binutils@2.34+libiberty~nls 54 | spack install libmonitor@master+dlopen+hpctoolkit 55 | spack install libunwind 56 | 57 | # Fix bug 58 | spack install mbedtls gotcha 59 | 60 | # Python version for torch monitor 61 | PY_VERSION=3.8 62 | spack install python@$PY_VERSION 63 | 64 | # Install gpu-patch 65 | cd $SOURCE_DIR 66 | make PREFIX=$DIR/gpu-patch SANITIZER_PATH=$SANITIZER_PATH CUDA_PATH=$CUDA_PATH install 67 | 68 | # Find spack and boost dir 69 | B=$(spack find --path boost | tail -n 1 | cut -d ' ' -f 3) 70 | S=${B%/*} 71 | UNWIND=$(spack find --path libunwind | tail -n 1 | cut -d ' ' -f 3) 72 | 73 | PY_DEV=$(spack find --path python@$PY_VERSION | tail -n 1 | cut -d ' ' -f 3) 74 | 75 | # Install torch monitor 76 | cd $SOURCE_DIR 77 | cd torch-monitor 78 | make PREFIX=$DIR/torch-monitor PYTHON_INCLUDE_DIR=$PY_DEV/include/python$PY_VERSION \ 79 | PYTHON_LIB_DIR=$PY_DEV/lib/python$PY_VERSION PYTHON_VERSION=$PY_VERSION \ 80 | TORCH_DIR=$PYTORCH_DIR DEBUG=1 install 81 | 82 | # Install redshow 83 | cd $SOURCE_DIR 84 | cd redshow 85 | make PREFIX=$DIR/redshow BOOST_DIR=$B LIBUNWIND_DIR=$UNWIND GPU_PATCH_DIR=$DIR/gpu-patch/ \ 86 | TORCH_MONITOR_DIR=$DIR/torch-monitor DEBUG=1 install 87 | 88 | # install hpctoolkit 89 | cd $SOURCE_DIR 90 | cd hpctoolkit 91 | mkdir build 92 | cd build 93 | ../configure --prefix=$DIR/hpctoolkit --with-cuda=$CUDA_PATH \ 94 | --with-sanitizer=$SANITIZER_PATH --with-gpu-patch=$DIR/gpu-patch \ 95 | --with-redshow=$DIR/redshow --with-spack=$S --enable-develop 96 | make install -j16 97 | 98 | cd $SOURCE_DIR 99 | mkdir $DIR/bin 100 | mkdir $DIR/python 101 | cp ./bin/gvprof $DIR/bin/ 102 | cp -r ./python $DIR 103 | echo "python $DIR/python/gviewer.py "'${@:1}' > $DIR/bin/gviewer 104 | chmod +x $DIR/bin/gviewer 105 | 106 | echo "Install in $DIR/bin/gvprof" 107 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | _build/* 2 | _static/* 3 | _templates/* 4 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/barracuda.md: -------------------------------------------------------------------------------- 1 | # BarraCUDA 2 | 3 | ## Introduction 4 | 5 | BarraCUDA is GPU-accelerated sequence mapping software. BarraCUDA's code and sample data are open source and available at [sourceforge](http://seqbarracuda.sourceforge.net/). BarraCUDA's [FAQ page](http://seqbarracuda.sourceforge.net/faqs.html) provides useful instructions for installing and running benchmarks. 6 | 7 | We study BarraCUDA *0.7.107h*, using the `Saccharomyces_cerevisiae.SGD1.01.50.dna_rm.toplevel.fa` sample data. 8 | 9 | ## Profiling 10 | 11 | The input we used runs for a short time, so we can profile it directly using the `gvprof` script. 12 | 13 | ```bash 14 | # prepare 15 | ./bin/barracuda index sample_data/Saccharomyces_cerevisiae.SGD1.01.50.dna_rm.toplevel.fa 16 | 17 | # data_flow 18 | gvprof -e data_flow ./bin/barracuda aln sample_data/Saccharomyces_cerevisiae.SGD1.01.50.dna_rm.toplevel.fa sample_data/sample_reads.fastq > quicktest.sai 19 | 20 | # value_pattern 21 | gvprof -e value_pattern -cfg ./bin/barracuda aln sample_data/Saccharomyces_cerevisiae.SGD1.01.50.dna_rm.toplevel.fa sample_data/sample_reads.fastq > quicktest.sai 22 | ``` 23 | 24 | ## Optimizations 25 | 26 | - *data_flow* - *redundant values* 27 | 28 | `barracuda.cu: 398`. In this function, the CUDA memory APIs called after *Line 440* are unnecessary when `number_of_sequences=0`. 29 | In that case, zero bytes are transferred between CPUs and GPUs, so the arrays retain the same values, yet the calls still incur API invocation cost. 30 | 31 | - *value_pattern* - *dense values* 32 | 33 | `cuda2.cuh: 865`. This line copies all the elements from a local array to a global array, regardless of their values. While the CPU's `memcpy` is fast for contiguous copies, the GPU's is not. We observe that this copy operation involves many zeros. Therefore, we can create a `hits` array to record which positions have been updated, then copy only the values at these positions. -------------------------------------------------------------------------------- /docs/castro.md: -------------------------------------------------------------------------------- 1 | # Castro 2 | 3 | ## Introduction 4 | 5 | Castro is an astrophysical radiation hydrodynamics simulation code based on the AMReX framework. 6 | 7 | We study Castro version `5e0a1b9cbc259f4dd17f5453ba59808b4da5c3ab`, 8 | and profile Castro's `Exec/hydro_tests/Sedov` example using its `inputs.2d.cyl_in_cartcoords` input. 9 | 10 | To compile Castro, we set the following variables in `GNUmakefile`: 11 | 12 | ```bash 13 | USE_CUDA=TRUE 14 | CUDA_ARCH=TRUE 15 | DIM=2 16 | USE_MPI=FALSE 17 | ``` 18 | 19 | ## Profiling 20 | 21 | For a small-scale run, we set `max_step=20` in `inputs.2d.cyl_in_cartcoords`. 22 | To generate the data flow graph for Castro, along with redundancy metrics, we can use the `gvprof` script directly. 23 | For other fine-grained metrics, we can use `gvprof` if GPU control flow graphs are not required. Otherwise, we recommend using HPCToolkit to perform step-by-step profiling. 24 | 25 | ## Optimization 26 | 27 | - *data_flow* - *redundant values* 28 | 29 | [`AMReX_Interp_2D_C.H: 344`](https://github.com/AMReX-Codes/amrex/blob/b7ddf2d2677fce63a567612978e01ced288dbda2/Src/AmrCore/AMReX_Interp_2D_C.H#L344). 
When Castro invokes `cellconslin_slopes_mmlim`, which is an internal function provided by AMReX, it performs `slope(i, j, n) *= a` for each output. 30 | With the `inputs.2d.cyl_in_cartcoords` input, somehow *a* is mostly 1.0. 31 | Thus, we can save one load and one store for each output if we perform `slope(i, j, n) *= a` only when *a* is not 1.0. 32 | Though this optimization does not achieve a significant speedup, it is worth exploring whether it also benefits other applications that use AMReX. -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'DrGPUM' 21 | copyright = '2023, University of California, Merced' 22 | author = 'Mao Lin' 23 | 24 | 25 | # -- General configuration --------------------------------------------------- 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be 28 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 29 | # ones. 30 | extensions = [ 31 | 'recommonmark' 32 | ] 33 | 34 | # Add any paths that contain templates here, relative to this directory. 35 | templates_path = ['_templates'] 36 | 37 | # List of patterns, relative to source directory, that match files and 38 | # directories to ignore when looking for source files. 39 | # This pattern also affects html_static_path and html_extra_path. 40 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 41 | 42 | 43 | # -- Options for HTML output ------------------------------------------------- 44 | 45 | # The theme to use for HTML and HTML Help pages. See the documentation for 46 | # a list of builtin themes. 47 | # 48 | html_theme = 'sphinx_rtd_theme' 49 | 50 | # Add any paths that contain custom static files (such as style sheets) here, 51 | # relative to this directory. They are copied after the builtin static files, 52 | # so a file named "default.css" will overwrite the builtin "default.css". 53 | html_static_path = [] 54 | 55 | source_suffix = { 56 | '.rst': 'restructuredtext', 57 | '.txt': 'markdown', 58 | '.md': 'markdown', 59 | } 60 | 61 | html_theme = "sphinx_rtd_theme" 62 | html_theme_path = ["_themes", ] 63 | -------------------------------------------------------------------------------- /docs/darknet.md: -------------------------------------------------------------------------------- 1 | # Darknet 2 | 3 | ## Introduction 4 | 5 | [Darknet](https://github.com/AlexeyAB/darknet) is an open source neural network framework written in C and CUDA. It is fast, easy to install, and supports CPU and GPU computation. 
6 | 7 | We check out Darknet version `312fd2e99a765949e468e18277d41f7992f08860`, study the `yolov4.cfg` and `yolov4-tiny.cfg` networks, and test an image `dog.jpg`. 8 | 9 | To compile darknet, we set the following knobs in the Makefile: 10 | 11 | ```bash 12 | GPU=1 13 | # append -lineinfo to the start of ARCH 14 | ARCH=-lineinfo ... 15 | # append -g to the start of CFLAGS 16 | CFLAGS=-g ... 17 | ``` 18 | 19 | ## Profiling 20 | 21 | For the data flow analysis, one can use gvprof to profile darknet directly. `-ck HPCRUN_SANITIZER_READ_TRACE_IGNORE=1` yields a significant speedup. 22 | 23 | For the value pattern analysis, we recommend using a whitelist to specify interesting GPU kernels and turning on block sampling and kernel sampling. 24 | In addition, if control-flow-graph-based analysis is desired, we don't recommend using `gvprof -cfg` directly because Darknet uses cuBLAS and cuDNN, which trigger hundreds of large binary loads at runtime. 25 | In fact, darknet's data type is almost uniform across all kernels, so one can gain insights even without `-cfg`. 26 | 27 | We can profile the fine-grained patterns of darknet using 28 | 29 | ```bash 30 | gvprof -e value_pattern@10 -ck HPCRUN_SANITIZER_WHITELIST=./whitelist -ck HPCRUN_SANITIZER_KERNEL_SAMPLING_FREQUENCY=20 <app> <app args> 31 | ``` 32 | 33 | In the `whitelist` file, we specify the following three kernels: 34 | 35 | ``` 36 | _Z15add_bias_kernelPfS_iiii 37 | _Z21im2col_gpu_kernel_extiPKfiiiiiiiiiiiiPf 38 | _Z26activate_array_mish_kernelPfiS_S_ 39 | ``` 40 | 41 | Other than a few kernels with dense value patterns when approximation is used, we didn't find other interesting patterns. 42 | 43 | **You may want to look up the real kernel names with `gvprof -v` or `readelf -s`, since compilers may generate different names.** 44 | 45 | ## Optimization 46 | 47 | - *data_flow* - *redundant values* 48 | 49 | `upsampling_layer.c: 91` and `convolution_kernels.cu: 559`. In the generated data flow graph, we found that the nodes annotated with the `fill_ongpu` kernel always have redundant accesses. 50 | Because we run the inference mode only, the arrays are initialized with zeros and then filled with zeros again. 51 | To optimize this, we can set up a flag for each array to indicate whether it is "clean"; a "clean" array shouldn't be filled with zeros again. -------------------------------------------------------------------------------- /docs/deepwave.md: -------------------------------------------------------------------------------- 1 | # Deepwave 2 | 3 | ## Introduction 4 | 5 | [Deepwave](https://github.com/ar4/deepwave) is wave propagation software implemented on top of PyTorch. 6 | 7 | We study deepwave version `1154692258da342accd21df02f7fa9ddd008f75f`. The input for deepwave is included in DrGPUM's samples. 8 | 9 | We first add `-lineinfo -g` to the `_make_cuda_extension` function in `setup.py`, and then add `-g` to the `_make_cpp_extension` function. Next we use `pip install .` to install deepwave. 10 | 11 | **Note that this pip is supposed to be the pip installed by conda, as we use conda across all the Python samples.** 12 | 13 | To run the deepwave example in DrGPUM, we need to install matplotlib via `conda install matplotlib`. 14 | 15 | ## Profiling 16 | 17 | Currently, using gvprof to profile Python applications is intricate. We use HPCToolkit to profile and analyze deepwave separately. Please refer to the [FAQ](https://gvprof.readthedocs.io/en/latest/faq.html) page for the complete guide. 18 | 19 | With the default configuration, this example takes a relatively long time. 
20 | We can change `num_epochs` to 1 and let it break after finishing the first batch. 21 | This deepwave application introduces higher overhead (150-200x) than other applications (~20x) because its kernels access millions of memory addresses with lots of gaps. 22 | As a result, we are not able to merge all of the memory access ranges on the GPU. 23 | We then spend a long time both copying memory addresses from the GPU to the host and updating host memory. 24 | 25 | For value pattern profiling, we monitor the most expensive `propagate` kernel using the following options. 26 | 27 | ```bash 28 | LD_LIBRARY_PATH=/path/to/python/install/lib/python/site-packages/torch:$LD_LIBRARY_PATH hpcrun -e gpu=nvidia,value_pattern@10000 -ck HPCRUN_SANITIZER_WHITELIST=./whitelist -ck HPCRUN_SANITIZER_KERNEL_SAMPLING_FREQUENCY=100000 python ./Deepwave_SEAM_example1.py 29 | ``` 30 | 31 | For data flow profiling, we turn on these knobs to accelerate the profiling process. 32 | 33 | ```bash 34 | LD_LIBRARY_PATH=/path/to/python/install/lib/python/site-packages/torch:$LD_LIBRARY_PATH hpcrun -e gpu=nvidia,data_flow -ck HPCRUN_SANITIZER_READ_TRACE_IGNORE=1 -ck HPCRUN_SANITIZER_DATA_FLOW_HASH=0 -ck HPCRUN_SANITIZER_GPU_ANALYSIS_BLOCKS=1 -ck HPCRUN_SANITIZER_GPU_PATCH_RECORD_NUM=131072 python ./Deepwave_SEAM_example1.py 35 | 36 | # this gives you additional speedup 37 | # export OMP_NUM_THREADS=16 38 | ``` 39 | 40 | More information about accelerating data flow and value pattern profiling can be found on the [FAQ](https://gvprof.readthedocs.io/en/latest/faq.html) page. 41 | 42 | ## Optimization 43 | 44 | Please refer to the `replication_pad3d` issue in [PyTorch](https://gvprof.readthedocs.io/en/latest/faq.html). -------------------------------------------------------------------------------- /docs/faq.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | 3 | ## Profile Python applications 4 | 5 | Please first refer to the MANUAL page for step-by-step profiling using HPCToolkit. 6 | 7 | In addition to the basic commands there, we also have to pay attention to a few minor issues. 8 | 9 | In the measurement stage, `LD_LIBRARY_PATH=/path/to/python/library/:$LD_LIBRARY_PATH` may be needed as a prefix before *hpcrun*. We have a detailed example for [profiling PyTorch](https://gvprof.readthedocs.io/en/latest/deepwave.html). 10 | 11 | Then, after getting the measurement data and GPU binaries, we analyze the CPU binaries to get the necessary line information. 12 | For GPU binaries, we use *hpcstruct --gpucfg no* on the measurement directory as suggested by the manual. 13 | For CPU binaries, the *python* binary does not contain all the program structure we need to understand program contexts. 14 | Instead, we have to analyze the binaries loaded dynamically at runtime. 15 | A Python application may load hundreds of libraries at runtime but not use all of them. 16 | Therefore, in order to run hpcstruct on a minimum set of binaries but still extract enough information to understand program contexts, we adopt a *test-and-analyze* strategy. 17 | Using this strategy, we first try hpcprof to correlate performance data with line maps; if hpcprof hangs because of a large line map in some binary, we kill hpcprof and run hpcstruct on that binary, whose binary analysis is fine-grained and faster than hpcprof's. 18 | 19 | When hpcprof begins analyzing a binary, it prints a message like the one below. 
In such a case, we can kill hpcprof, remove the temporary database, and use `hpcstruct` to analyze `libtorch_python.so`. 21 | 22 | ```bash 23 | msg: Begin analyzing : /path/to/python/lib/python3.8/site-packages/torch/lib/libtorch_python.so 24 | ``` 25 | 26 | ## Accelerate data flow profiling 27 | 28 | The following three knobs are helpful for accelerating the profiling of applications with many kernels. With all the options turned on, the expected end-to-end overhead of DrGPUM is approximately 20x, while the overhead could be over 1200x without these knobs. 29 | 30 | ```bash 31 | HPCRUN_SANITIZER_READ_TRACE_IGNORE=<0|1> 32 | HPCRUN_SANITIZER_DATA_FLOW_HASH=<0|1> 33 | HPCRUN_SANITIZER_GPU_ANALYSIS_BLOCKS=<blocks> 34 | ``` 35 | 36 | Note that these knobs can disable the generation of some information. 37 | 38 | When GPU analysis is enabled, one can enlarge the GPU-side buffer by adjusting the number of records to further reduce overhead. 39 | 40 | ```bash 41 | HPCRUN_SANITIZER_GPU_PATCH_RECORD_NUM=<records> 42 | ``` 43 | 44 | ## Accelerate value pattern profiling 45 | 46 | The following knobs are helpful for profiling the value patterns of specific kernels, focusing on just a few kernel instances. 47 | 48 | Besides, one can also apply `@N` to activate block sampling, which profiles a random GPU block out of every *N* blocks. 49 | 50 | ```bash 51 | HPCRUN_SANITIZER_KERNEL_SAMPLING_FREQUENCY=<frequency> 52 | HPCRUN_SANITIZER_WHITELIST=<file> 53 | HPCRUN_SANITIZER_BLACKLIST=<file> 54 | ``` -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. DrGPUM documentation master file, created by 2 | sphinx-quickstart on Sun Mar 21 02:21:40 2021. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | DrGPUM: A Memory Profiler for GPUs 7 | ================================== 8 | 9 | DrGPUM is an advanced memory profiler that locates memory inefficiencies in GPU-accelerated applications. 10 | DrGPUM's code is available on `Github <https://github.com/Lin-Mao/DrGPUM>`_. 11 | 12 | .. toctree:: 13 | :maxdepth: 3 14 | :caption: DrGPUM Basics 15 | 16 | preface 17 | install 18 | manual 19 | faq 20 | 21 | .. toctree:: 22 | :maxdepth: 1 23 | :caption: DrGPUM Development 24 | 25 | workflow 26 | roadmap 27 | 28 | .. toctree:: 29 | :maxdepth: 1 30 | :caption: Components 31 | 32 | hpctoolkit 33 | redshow 34 | gviewer 35 | 36 | .. toctree:: 37 | :maxdepth: 3 38 | :caption: DrGPUM Samples 39 | 40 | unit_tests 41 | rodinia 42 | qmcpack 43 | castro 44 | deepwave 45 | darknet 46 | pytorch 47 | namd 48 | barracuda 49 | 50 | Indices and Tables 51 | ================== 52 | 53 | * :ref:`genindex` 54 | * :ref:`search` 55 | -------------------------------------------------------------------------------- /docs/install.md: -------------------------------------------------------------------------------- 1 | # Install 2 | 3 | The documentation includes detailed instructions for every package required by DrGPUM. One can use `./bin/install` to install all these packages at once. 4 | 5 | The install script accepts three arguments in order: 6 | 7 | ```bash 8 | # Specify PyTorch dir 9 | export PYTORCH_DIR=/path_to_pytorch/torch 10 | 11 | ./bin/install <install_dir> <cuda_path> <sanitizer_path> 12 | # default values 13 | # <install_dir>=`pwd`/gvprof 14 | # <cuda_path>=/usr/local/cuda 15 | # <sanitizer_path>=<cuda_path>/compute-sanitizer 16 | ``` 17 | 18 | Before you install, make sure all the CUDA-related paths (e.g., `LD_LIBRARY_PATH`) are set up. 
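For reference, a minimal environment setup before running `./bin/install` might look like the following sketch (the CUDA location here is an assumption; adjust the paths to your system):

```bash
# Hypothetical CUDA install location -- point these at your actual toolkit
export CUDA_HOME=/usr/local/cuda
export PATH=$CUDA_HOME/bin:$PATH
export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH
```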
19 | 20 | ## GPU Patch 21 | 22 | If the CUDA toolkit is installed somewhere else, you need to change the value of `SANITIZER_PATH`. 23 | 24 | ```bash 25 | git clone --recursive https://github.com/Lin-Mao/DrGPUM.git 26 | cd DrGPUM 27 | make PREFIX=/path/to/gpu-patch/installation SANITIZER_PATH=/usr/local/cuda/compute-sanitizer/ install 28 | ``` 29 | ## Dependencies 30 | 31 | - spack 32 | 33 | ```bash 34 | git clone https://github.com/spack/spack.git 35 | export SPACK_ROOT=/path/to/spack 36 | source ${SPACK_ROOT}/share/spack/setup-env.sh 37 | ``` 38 | - required packages 39 | 40 | ```bash 41 | # Install hpctoolkit dependencies 42 | spack install --only dependencies hpctoolkit ^dyninst@master ^binutils@2.34+libiberty~nls 43 | spack install libmonitor@master+dlopen+hpctoolkit 44 | spack install libunwind 45 | 46 | spack install mbedtls gotcha 47 | 48 | # Python version for torch monitor 49 | PY_VERSION=3.8 50 | spack install python@$PY_VERSION 51 | ``` 52 | 53 | ## Redshow 54 | 55 | ```bash 56 | cd redshow 57 | # Tip: get the boost library path with 'spack find --path' and append include to that path 58 | make install -j8 PREFIX=/path/to/redshow/installation BOOST_DIR=/path/to/boost/installation GPU_PATCH_DIR=/path/to/gpu-patch/installation 59 | # Useful options: 60 | # make DEBUG=1 61 | # make OPENMP=1 62 | ``` 63 | 64 | ## HPCToolkit 65 | 66 | - profiling substrates 67 | 68 | ```bash 69 | cd /path/to/hpctoolkit 70 | mkdir build && cd build 71 | # Tip: check the spack libraries' root with 'spack find --path'. 72 | # For example: --with-spack=/home/username/spack/opt/spack/linux-ubuntu18.04-zen/gcc-7.4.0/ 73 | ../configure --prefix=/path/to/hpctoolkit/installation --with-cuda=/usr/local/cuda-11.0 --with-sanitizer=/path/to/sanitizer --with-gpu-patch=/path/to/gpu-patch/installation --with-redshow=/path/to/redshow/installation --with-spack=/path/to/spack/libraries/root 74 | make install -j8 75 | ``` 76 | 77 | - hpcviewer (optional) 78 | 79 | [http://hpctoolkit.org/download/hpcviewer/](http://hpctoolkit.org/download/hpcviewer/) 80 | 81 | ## Setup and Test 82 | 83 | Add the following lines to your `.bashrc` file and source it. 84 | 85 | ```bash 86 | export PATH=/path/to/hpctoolkit/install/bin/:$PATH 87 | export PATH=/path/to/DrGPUM/install/bin/:$PATH 88 | export PATH=/path/to/redshow/install/bin/:$PATH 89 | ``` 90 | 91 | Test whether gvprof works. 92 | 93 | ```bash 94 | cd ./samples/vectorAdd.f32 95 | make 96 | gvprof -v -e redundancy ./vectorAdd 97 | hpcviewer gvprof-database 98 | ``` -------------------------------------------------------------------------------- /docs/lammps.md: -------------------------------------------------------------------------------- 1 | # Lammps 2 | 3 | ## Introduction 4 | 5 | We check out Lammps version `69d41dc16cd3272da8e768414d972b32a36803c1` and test the input `lammps/bench/in.lj`. 6 | 7 | To compile lammps, we first edit `lammps/lib/kokkos/bin/nvcc_wrapper:37` to append `-lineinfo` to `cuda_args`. Then we create a build directory under lammps and use the following commands to compile it. 
8 | 9 | ``` 10 | cmake -DPKG_KOKKOS=ON -D Kokkos_ENABLE_CUDA=yes -D Kokkos_ENABLE_OPENMP=yes -D CMAKE_CXX_COMPILER=`pwd`/../lib/kokkos/bin/nvcc_wrapper ../cmake 11 | ../build/lmp -k on g 1 -sf kk -in in.lj 12 | ``` -------------------------------------------------------------------------------- /docs/manual.md: -------------------------------------------------------------------------------- 1 | # Manual 2 | 3 | ## Compile with Line Information 4 | 5 | DrGPUM relies on debug information in binaries to attribute fine-grained value metrics to individual lines, loops, and functions. 6 | 7 | For GPU binaries, we recommend using `-O3 -lineinfo`. 8 | 9 | For CPU binaries, we recommend using `-O3 -g`. 10 | 11 | For software compiled with the CMake build system, we can usually edit `CMAKE_C_FLAGS` and `CMAKE_CXX_FLAGS` to add line info flags. Additionally, CUDA line info can be added through `CMAKE_CUDA_FLAGS`. 12 | 13 | ## Profile Using DrGPUM 14 | 15 | The `gvprof` script automates a series of profiling and analysis processes, but supports only basic profiling features. For detailed profiling control, please refer to the next section. 16 | 17 | ```bash 18 | gvprof -h 19 | # Currently we offer three modes 20 | # gvprof -v is your friend for debugging 21 | gvprof -e <event> <app> <app args> 22 | ``` 23 | 24 | ## Profile Using HPCToolkit 25 | 26 | Using HPCToolkit to profile applications enables fine-grained control knobs, selective analysis of GPU/CPU binaries, and compatibility with various launchers (e.g., jsrun). 27 | We invoke `hpcrun` to profile an application twice using the same input. 28 | In the first pass, we dump the cubins loaded at runtime and profile each kernel's running time. Then we invoke `hpcstruct` to analyze program structure and instruction dependencies. 29 | In the second pass, we instrument the cubins and invoke the `redshow` redundancy analysis library to analyze the measurement data. 30 | 31 | 32 | ### First pass 33 | 34 | ```bash 35 | hpcrun -e gpu=nvidia <app> <app args> 36 | hpcstruct <app> 37 | # if '--gpucfg yes', hpcstruct will analyze the control flow graph of each GPU function and perform backward slicing, which is costly for large GPU binaries. 38 | hpcstruct --gpucfg no hpctoolkit-<app>-measurements 39 | # One can also use hpcstruct on selected GPU binaries only 40 | hpcstruct --gpucfg no <gpu binary> 41 | ``` 42 | 43 | ### Second pass 44 | 45 | ```bash 46 | # Before profiling, we remove all profile data dumped in the first pass 47 | rm -rf hpctoolkit-<app>-measurements/*.hpcrun 48 | 49 | hpcrun -e gpu=nvidia,<event> -ck <knob> -ck <knob> ... <app> <app args> 50 | hpcprof -S <app>.hpcstruct hpctoolkit-<app>-measurements 51 | # If only some binaries are analyzed using hpcstruct, 52 | # one has to supply the corresponding binaries' structure files 53 | hpcprof -S <binary1>.hpcstruct -S <binary2>.hpcstruct hpctoolkit-<app>-measurements 54 | ``` 55 | 56 | ### HPCToolkit separate pass 57 | 58 | Large-scale applications such as Castro heavily use lambda functions and template functions for GPU kernels. Therefore, tools like `nsys` and `ncu` cannot efficiently correlate each kernel's execution time with its name. Even though NVTX can provide some information to locate kernels, it is still not straightforward to map metrics back to source lines. Instead, we recommend using HPCToolkit, which provides an integrated calling context spanning CPUs and GPUs, to look up the calling context and running time of each kernel. The following commands can be used. 
59 | 60 | ```bash 61 | hpcrun -e gpu=nvidia,pc <app> <app args> 62 | hpcstruct <app> 63 | hpcstruct --gpucfg no hpctoolkit-<app>-measurements 64 | hpcprof -S <app>.hpcstruct hpctoolkit-<app>-measurements 65 | hpcviewer hpctoolkit-<app>-database 66 | ``` 67 | 68 | ## Control Knobs 69 | 70 | The following fine-grained options can be passed to either gvprof or hpcrun by joining the option name and option value as `-ck <option>=<value>`. 71 | 72 | ```bash 73 | HPCRUN_SANITIZER_GPU_PATCH_RECORD_NUM=<records> 74 | HPCRUN_SANITIZER_BUFFER_POOL_SIZE=<size> 75 | HPCRUN_SANITIZER_APPROX_LEVEL=<level> 76 | HPCRUN_SANITIZER_PC_VIEWS=<views> 77 | HPCRUN_SANITIZER_MEM_VIEWS=<views> 78 | HPCRUN_SANITIZER_DEFAULT_TYPE=<type> 79 | HPCRUN_SANITIZER_KERNEL_SAMPLING_FREQUENCY=<frequency> 80 | HPCRUN_SANITIZER_WHITELIST=<file> 81 | HPCRUN_SANITIZER_BLACKLIST=<file> 82 | HPCRUN_SANITIZER_READ_TRACE_IGNORE=<0|1> 83 | HPCRUN_SANITIZER_DATA_FLOW_HASH=<0|1> 84 | HPCRUN_SANITIZER_LIVENESS_ONGPU=<0|1> 85 | HPCRUN_SANITIZER_TORCH_ANALYSIS=<0|1> 86 | HPCRUN_SANITIZER_TORCH_ANALYSIS_ONGPU=<0|1> 87 | HPCRUN_SANITIZER_GPU_ANALYSIS_BLOCKS=<blocks> 88 | ``` 89 | 90 | ### Calling context view 91 | 92 | Only the CPU calling context is available now. 93 | The GPU calling context is under development. 94 | 95 | ```bash 96 | hpcviewer <database> 97 | ``` 98 | 99 | ### Data flow view 100 | 101 | ```bash 102 | gviewer -f <measurements>/data_flow.dot.context -cf file -p 103 | # gviewer -h for detailed options 104 | ``` 105 | The generated .svg can be visualized directly. To enable interactive control, we can rename the file to `demo.svg` and move it to `jquery.graphviz.svg`. After launching a server locally, we can visualize the graph, zoom in on important parts, and track each node's data flows. 106 | 107 | ### Fine-grained pattern views 108 | 109 | ```bash 110 | # value pattern 111 | less <measurements>/value_pattern_t.csv 112 | 113 | # redundancy 114 | less <measurements>/temporal_read_t.csv 115 | less <measurements>/temporal_write_t.csv 116 | less <measurements>/spatial_read_t.csv 117 | less <measurements>/spatial_write_t.csv 118 | ``` 119 | 120 | ## Example 121 | 122 | 123 | -------------------------------------------------------------------------------- /docs/namd.md: -------------------------------------------------------------------------------- 1 | # NAMD 2 | 3 | ## Introduction 4 | 5 | [NAMD](https://www.ks.uiuc.edu/Research/namd/) is a parallel molecular dynamics code designed for high-performance simulation of large biomolecular systems. NAMD uses the popular molecular graphics program VMD for simulation setup and trajectory analysis. 6 | 7 | We download the NAMD source code from its [official website](https://www.ks.uiuc.edu/Development/Download/download.cgi). We use NAMD version `4a41c6087f69c4cfe3edfdb19c6a5780ac20f5f1` and study the `alanin` input. 8 | 9 | The following flags are set up in `Make.config`: 10 | 11 | ``` 12 | CUDAGENCODE = -arch <arch> -g -lineinfo 13 | CXX_OPTS = -g -O3 14 | ``` 15 | 16 | ## Profiling 17 | 18 | For data flow profiling, we use the normal gvprof script with the `-ck HPCRUN_SANITIZER_READ_TRACE_IGNORE=1` option. 19 | 20 | For value pattern profiling, we monitor the most costly `nonbondedForceKernel` kernel of namd. Note that because this function accesses many arrays with different value types, we need the GPU control flow graph and backward slicing to derive the type of each array. 21 | For your reference, we use the command 22 | ``` 23 | gvprof -cfg -j 16 -e value_pattern -ck HPCRUN_SANITIZER_WHITELIST=./whitelist -ck HPCRUN_SANITIZER_KERNEL_SAMPLING_FREQUENCY=10 <app> <app args> 24 | ``` 25 | The CFG analysis phase could take up to an hour, consuming about **100GB** of main memory. 
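Since the whitelist matches full mangled symbol names, one way to recover them is the sketch below (it assumes the kernel symbol is visible in your NAMD binary; the binary name here is an example):

```bash
# List the symbols and search for the kernel of interest; substitute your
# actual NAMD binary, or a cubin dumped into the measurements directory.
readelf -sW ./namd2 | grep nonbondedForceKernel
```

Alternatively, `gvprof -v` logs the kernel names observed at runtime.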
29 | ## Optimization 30 | 31 | - *data_flow* - *redundant values* 32 | 33 | We find that the *submitHalf* kernels are repeatedly invoked, forming an interesting pattern in the data flow diagram. 34 | Looking carefully into the code, we find that the redundancy is introduced on purpose. 35 | The authors of NAMD pay close attention to its performance. They allocate some variables on the device to accumulate global sums and transfer these values back to the host only in the last block of the kernel. Moreover, at the end of these kernels, they reset the values to zero to make sure the buffers are clean the next time. 36 | 37 | You may wonder why they are doing this. There are two reasons: 38 | 39 | 1. If the variables are not cleared on the device, we have to reset them using either `cudaMemsetAsync` or implicit device-host communication, which triggers extra cost. In contrast, setting the variables directly in a GPU kernel can hide this latency by overlapping memory latency with computation, without an additional API invocation. 40 | 41 | 2. If the host variables were accessed every time, these kernels would be slowed down significantly. 42 | 43 | - *value_pattern* - *type overuse* 44 | 45 | `CudaComputeNonbondedKernel.cu: 579`. By profiling the value patterns of the `CudaComputeNonbondedKernel` kernel, we find that this array's type is overused. We can use `uint8_t` to replace the original `int` data type. -------------------------------------------------------------------------------- /docs/preface.md: -------------------------------------------------------------------------------- 1 | # Preface 2 | 3 | DrGPUM is a memory profiling tool for applications running on GPU clusters, with advanced features for value-based profiling and analysis. 4 | 5 | The following diagram describes how the components communicate with each other. 6 | 7 | ``` 8 | 9 | ------------- --------------------- ------------------------ ************************************* 10 | | GPU Patch | <-> | Profiling Runtime | <-> | Measurement Analysis | -> ** Program Analyzer and Aggregator ** -> Performance Reports 11 | ------------- --------------------- ------------------------ ************************************* 12 | | /|\ 13 | |--------------------------------------------------------------------| 14 | 15 | ``` 16 | 17 | ## HPCToolkit (Profiling Runtime) 18 | 19 | [*HPCToolkit*](http://hpctoolkit.org/) is a powerful profiling tool that measures application performance on the world's largest supercomputers. 20 | DrGPUM customizes HPCToolkit and uses it as the default profiling runtime. 21 | Currently, we are developing on top of HPCToolkit's [*sanitizer*](https://github.com/Lin-Mao/hpctoolkit) version. 22 | 23 | ## Redshow 24 | 25 | [*Redshow*](https://github.com/Lin-Mao/redshow) is a postmortem metrics analysis substrate. 26 | It receives data from the profiling runtime, performs the analyses enabled by the user, and stores the analysis results on disk. 27 | In addition, redshow maintains information about the data objects allocated at runtime. 28 | Redshow also contains binary analysis modules that map virtual addresses to function indices and symbol names and analyze GPU instruction characteristics. 29 | 30 | ## GPU Patch 31 | 32 | *GPU Patch* includes several implementations of instrumentation callbacks and a GPU-CPU communication system. 33 | It can collect GPU memory metrics, block enter/exit records, and GPU call/ret records (under development). 34 | The collected data are stored in a GPU buffer. 35 | The profiling runtime observes a signal once the GPU buffer is full and copies the data from the GPU to the CPU. 36 |
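This handshake is easiest to see from the `full` flag in `gpu_patch_buffer_t` (defined in `include/gpu-patch.h` and driven from `include/gpu-queue.h`, both shown later in this document). Below is a simplified host-side sketch of the protocol; it is an illustration only, not the actual HPCToolkit runtime code, and `copy_records_to_cpu` is a hypothetical helper.

```cpp
// Simplified sketch of the CPU side of the flush handshake (illustration only).
// GPU warps set buffer->full = 1 when the record buffer fills up (see
// gpu_queue_get in include/gpu-queue.h) and spin until the host clears it.
void monitor_gpu_buffer(volatile gpu_patch_buffer_t *buffer) {
  // num_threads == 0 marks kernel completion (see include/gpu-patch.h).
  while (buffer->num_threads != 0) {
    if (buffer->full == 1) {
      copy_records_to_cpu(buffer);  // hypothetical helper: drain the records
      buffer->full = 0;             // release the spinning GPU warps
    }
  }
}
```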
37 | ## Program Analyzer and Aggregator 38 | 39 | Some high-level performance metrics are output directly to the performance reports. 40 | Low-level detailed performance metrics are associated with individual functions and lines. 41 | Therefore, we analyze program structure to attribute these metrics. 42 | Moreover, when analyzing applications running on multiple nodes, we can aggregate the performance data to compute overall metrics that represent the entire execution. 43 | -------------------------------------------------------------------------------- /docs/pytorch.md: -------------------------------------------------------------------------------- 1 | # PyTorch 2 | 3 | ## Introduction 4 | 5 | [PyTorch](https://pytorch.org/) is a popular machine learning framework. 6 | 7 | We use PyTorch version `f5788898a928cb2489926c1a5418c94c598c361b`. We study the `resnet50`, `bert`, and `deepwave` models. 8 | 9 | We apply the following commands to compile PyTorch from source. 10 | 11 | ```bash 12 | spack install miniconda3 13 | 14 | conda install numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing_extensions future six requests dataclasses 15 | 16 | conda install -c pytorch magma-cuda110 17 | 18 | export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} 19 | export USE_CUDA=1 20 | export REL_WITH_DEB_INFO=1 21 | export MAX_JOBS=16 22 | export USE_NINJA=OFF 23 | python setup.py install 24 | ``` 25 | 26 | - *resnet* 27 | 28 | We get the `resnet` example from the [pytorch benchmark](https://github.com/pytorch/benchmark/tree/master/torchbenchmark/models/resnet50) repo. 29 | 30 | To ease the installation, we provide `1-spatial-convolution-model.py` and `1-spatial-convolution-unit.py` to check layer-wise and end-to-end performance. 31 | 32 | - *deepwave* 33 | 34 | We provide the instructions for installing deepwave here. 35 | 36 | To ease checking the problematic kernel, we provide the `2-replication-pad3d.py` script, which only has a single `ReplicationPad3d` kernel. 37 | 38 | - *bert* 39 | 40 | We get the `bert` example from the [pytorch benchmark](https://github.com/pytorch/benchmark/tree/master/torchbenchmark/models/resnet50). 41 | 42 | To ease checking the problematic kernel, we provide the `3-embedding-unit.py` script, which only has a single `Embedding` kernel (a minimal sketch appears at the end of this page). 43 | 44 | ## Profiling 45 | 46 | Profiling a Python application requires extra steps compared with a normal application. We have a general guide for profiling such applications on the [FAQ](https://gvprof.readthedocs.io/en/latest/faq.html) page. 47 | 48 | An example profiling command is attached below for reference: 49 | 50 | ```bash 51 | LD_LIBRARY_PATH=/path/to/python/install/lib/python/site-packages/torch:$LD_LIBRARY_PATH hpcrun -e gpu=nvidia,data_flow -ck HPCRUN_SANITIZER_READ_TRACE_IGNORE=1 -ck HPCRUN_SANITIZER_DATA_FLOW_HASH=0 -ck HPCRUN_SANITIZER_GPU_ANALYSIS_BLOCKS=1 -ck HPCRUN_SANITIZER_GPU_PATCH_RECORD_NUM=131072 python ./<script>.py 52 | ``` 53 | 54 | ## Optimization 55 | 56 | We don't provide an automated performance testing suite for PyTorch in DrGPUM because recompiling PyTorch for just small code changes still takes a long time and is painful on low-end servers. 57 | 58 | - *data_flow* - *redundant values* 59 | 60 | Please refer to this [issue](https://github.com/pytorch/pytorch/issues/48539) 61 | 62 | - *data_flow* - *redundant values* - *value_pattern* - *redundant zeros* 63 | 64 | Please refer to these two issues: [issue1](https://github.com/pytorch/pytorch/issues/48889) and [issue2](https://github.com/pytorch/pytorch/issues/49663)
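For reference, a unit script in the spirit of `3-embedding-unit.py` can be as small as the sketch below (a hypothetical reconstruction; the real script's vocabulary size, shapes, and iteration count may differ):

```python
import torch

# Hypothetical reconstruction of an Embedding-only unit test.
embedding = torch.nn.Embedding(num_embeddings=30522, embedding_dim=768).cuda()
indices = torch.randint(0, 30522, (8, 128), device='cuda')

for _ in range(10):
    out = embedding(indices)  # each call launches the single Embedding kernel
torch.cuda.synchronize()
```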
-------------------------------------------------------------------------------- /docs/qmcpack.md: -------------------------------------------------------------------------------- 1 | # QMCPACK 2 | 3 | ## Introduction 4 | 5 | [QMCPACK](https://github.com/QMCPACK/qmcpack) is an open-source, production-level many-body ab initio Quantum Monte Carlo code for computing the electronic structure of atoms, molecules, and solids. 6 | 7 | We study QMCPACK version `474062068a9f6348dbf7d55be7d1bd375c24f1fe`. 8 | 9 | A number of packages are required to compile QMCPACK, including clang, OpenMP (offloading), HDF5, FFTW, and Boost. These packages can be installed directly via Spack. 10 | 11 | To compile QMCPACK, we pass the following variables to cmake: 12 | 13 | ```bash 14 | CMAKE_C_COMPILER=mpicc 15 | CMAKE_CXX_COMPILER=mpicxx 16 | ENABLE_OFFLOAD=ON 17 | USE_OBJECT_TARGET=ON 18 | OFFLOAD_ARCH=<arch> 19 | ENABLE_CUDA=1 20 | CUDA_ARCH=<arch> 21 | CUDA_HOST_COMPILER=`which gcc` 22 | QMC_DATA=<path> 23 | ENABLE_TIMERS=1 24 | ``` 25 | 26 | The following environment variables are also required: 27 | 28 | ```bash 29 | export OMPI_CC=clang 30 | export OMPI_CXX=clang++ 31 | ``` 32 | 33 | ## Profiling 34 | 35 | First follow the instructions in `tests/performance/NiO/README` to enable and run the NiO tests. The configuration file used is `NiO-fcc-S1-dmc.xml` under the `batched_driver` folder. 36 | 37 | At runtime, we use four worker threads (`export OMP_NUM_THREADS=4`). For a small-scale run, one can adjust control variables such as `warmupSteps` to reduce the execution time. 38 | 39 | The data flow pattern can be profiled directly using gvprof. For the value pattern mode, one has to find the names of the functions of interest and use gvprof's whitelist to focus on them. 40 | 41 | ## Optimization 42 | 43 | - *data_flow* - *redundant values* 44 | 45 | [`MatrixDelayedUpdateCUDA.h: 627`](https://github.com/QMCPACK/qmcpack/blob/5c4776b747fefef0146765379461c6593108cf11/src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h#L627). This line often copies the same base pointers to the arrays on the GPU. Though this is not a performance bottleneck for the current workload, it might deserve attention once the number of arrays increases. -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | recommonmark 2 | sphinx-rtd-theme -------------------------------------------------------------------------------- /docs/roadmap.md: -------------------------------------------------------------------------------- 1 | # Roadmap 2 | 3 | This document describes upcoming features and release plans for DrGPUM. Since DrGPUM is a growing project, it has many components that need fixes and enhancements. Suggestions and feature requests are welcome. Users can post questions on GitHub's [discussion forum](https://github.com/Lin-Mao/DrGPUM/issues). 4 | 5 | ## Release v2.2 6 | 7 | We plan to release *v2.2* around Fall 2021; this release will focus on enhancing the stability and compatibility of DrGPUM.
Also, a few new features, such as customized memory allocator support and more accessible function filters, are planned to be integrated. 8 | 9 | 10 | - Features 11 | 12 | - NVTX 13 | 14 | Register CUPTI's NVTX callback to monitor customized memory allocators. 15 | 16 | - CUDA Memory Pool 17 | 18 | Support memory pool allocators in CUDA 11.2 19 | 20 | - Bug Fixes 21 | 22 | - Function Filters 23 | 24 | Support substring match in whitelist and blacklist 25 | 26 | - Value Pattern Output 27 | 28 | Sort output arrays based on their access counts and fix incorrect numbers 29 | 30 | - Deployment and Test 31 | 32 | - CMake 33 | 34 | Add CMake configurations to DrGPUM in addition to the Makefile 35 | 36 | - Unittest 37 | 38 | Adopt the Python unittest package 39 | 40 | - Test configurations 41 | 42 | Adopt yaml files to configure test cases 43 | 44 | ## Pending Issues 45 | 46 | We haven't decided when to solve the following issues. 47 | 48 | - GViewer Website 49 | 50 | Launch a website to visualize data flow graphs. 51 | 52 | - Fine grain pattern and data flow integration 53 | 54 | Use the website described above to show both fine-grained patterns and data flow. 55 | 56 | - HPCToolkit Merge 57 | 58 | Merge the latest HPCToolkit master into DrGPUM. 59 | -------------------------------------------------------------------------------- /docs/rodinia.md: -------------------------------------------------------------------------------- 1 | # Rodinia GPU Benchmark 2 | 3 | ## backprop 4 | 5 | - vp-opt1: *value_pattern* - *redundant zeros* 6 | 7 | [`backprop_cuda_kernel.cu: 81`](https://github.com/FindHao/DrGPUM-samples/blob/110a7cdb0d57f5902941deb59899e6266f79844e/backprop/backprop_cuda_kernel.cu#L81). The *delta* array has many zeros. We can check each entry on the GPU side to execute a special branch that avoids the computation. 8 | 9 | - vp-opt2: *data_flow* - *duplicate values* 10 | 11 | [`backprop_cuda.cu: 180`](https://github.com/FindHao/DrGPUM-samples/blob/110a7cdb0d57f5902941deb59899e6266f79844e/backprop/backprop_cuda.cu#L180). *net->input_units* is copied to the GPU at *Line 118* and copied back at *Line 188*. Meanwhile, neither the GPU data nor the CPU data is changed. As a result, the copy at *Line 188* can be eliminated safely. 12 | 13 | ## bfs 14 | 15 | - vp-opt1: *value_pattern* - *type overuse* 16 | 17 | [`kernel.cu: 22`](https://github.com/FindHao/DrGPUM-samples/blob/110a7cdb0d57f5902941deb59899e6266f79844e/bfs/kernel.cu#L22). The *g_cost* array's values are within the range of `[-127, 128)`. We can specify this array's type as `int8_t` instead of `int` to reduce both the kernel execution time and the memory copy time. 18 | 19 | - vp-opt2: *value_pattern* - *dense values* 20 | 21 | [`bfs.cu: 107-109`](https://github.com/FindHao/DrGPUM-samples/blob/110a7cdb0d57f5902941deb59899e6266f79844e/bfs/bfs.cu#L107). Accesses to these arrays show a dense value pattern where zeros are read most of the time. We can replace the CPU-to-GPU memory copies of all zeros with memset, which is much faster, to reduce the memory copy time. 22 | 23 | ## cfd 24 | 25 | - vp-opt1: *value_pattern* - *dense values* 26 | 27 | [`euler3d.cu: 173`](https://github.com/FindHao/DrGPUM-samples/blob/110a7cdb0d57f5902941deb59899e6266f79844e/cfd/euler3d.cu#L173). The *cuda_initialize_variables* function writes values in a dense pattern. We can *hash* the accessing index of this array to limit memory accesses to a certain range and increase cache locality. Since this array is changed in the second iteration, this optimization only applies to the first iteration. 28 | 29 | - vp-opt2: *data_flow* - *redundant values* 30 | 31 | [`euler3d.cu: 570`](https://github.com/FindHao/DrGPUM-samples/blob/110a7cdb0d57f5902941deb59899e6266f79844e/cfd/euler3d.cu#L570). The *old_variables* array is originally initialized at *Line 551* with the same values as *variables* but copied again at *Line 570*. We can safely eliminate the second copy, which is redundant in the first iteration; a sketch follows below. 32 |
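A minimal sketch of the fix, with assumed names for the surrounding loop variables (illustrative only; the real code performs the device-to-device copy through a helper):

```cpp
// Skip the copy in the first iteration, where old_variables already
// holds the same values from its initialization at Line 551.
// ('iteration', 'size', and the buffer names are assumptions.)
if (iteration > 0) {
  cudaMemcpy(old_variables, variables, size, cudaMemcpyDeviceToDevice);
}
```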
33 | ## hotspot 34 | 35 | - vp-opt: *value_pattern* - *approximate* - *single value* 36 | 37 | [`hotspot.cu: 164`](https://github.com/FindHao/DrGPUM-samples/blob/110a7cdb0d57f5902941deb59899e6266f79844e/hotspot/hotspot.cu#L164). The *temp_src* array contains many very close floating-point numbers. 38 | Using the approximate mode, gvprof determines that the values in this array are approximately the same under a certain approximation level. 39 | Therefore, we can read just some neighbor points at *Line 195* and still get similar final results. 40 | 41 | ## hotspot3D 42 | 43 | - vp-opt: *value_pattern* - *approximate* - *single value* 44 | 45 | [`opt1.cu: 29`](https://github.com/FindHao/DrGPUM-samples/blob/110a7cdb0d57f5902941deb59899e6266f79844e/hotspot3D/opt1.cu#L29). Like the *hotspot* example, the *tIn* array contains many very close floating-point numbers, and gvprof determines that all the values in this array are approximately the same under a certain approximation level. In contrast to the *hotspot* example, which selectively chooses neighbors, we use loop perforation to compute half of the loop iterations and get similar results. 46 | 47 | ## huffman 48 | 49 | - vp-opt: *value_pattern* - *dense values* 50 | 51 | [`hist.cu: 51`](https://github.com/FindHao/DrGPUM-samples/blob/110a7cdb0d57f5902941deb59899e6266f79844e/huffman/hist.cu#L51). DrGPUM reports dense values for the *histo* array in both the write and read modes. Because the most frequently updated value is zero, we can perform the atomicAdd conditionally to reduce atomic operations. 52 | 53 | ## lavaMD 54 | 55 | - vp-opt: *value_pattern* - *type overuse* 56 | 57 | [`kernel_gpu_cuda.cu: 84`](https://github.com/FindHao/DrGPUM-samples/blob/110a7cdb0d57f5902941deb59899e6266f79844e/lavaMD/kernel/kernel_gpu_cuda.cu#L84). The *rA* array contains only a few distinct numbers. By checking its initialization on the CPU side, we note that there are only ten fixed values within 0.1 to 1.0. We can store these values using `uint8_t` instead of `double`, saving *8x* space. These values are then decoded on the GPU side. In this way, we trade compute time for memory copy time. 58 | 59 | ## pathfinder 60 | 61 | - vp-opt: *value_pattern* - *type overuse* 62 | 63 | [`pathfinder.cu: 144`](https://github.com/FindHao/DrGPUM-samples/blob/110a7cdb0d57f5902941deb59899e6266f79844e/pathfinder/pathfinder.cu#L144). The *gpuWall* array's values for this input will be within `[0, 255]`, so we can use `uint8_t` to replace `int` to reduce global memory traffic. 64 | 65 | ## srad 66 | 67 | - vp-opt1: *value_pattern* - *single value* 68 | 69 | [`srad_kernel.cu: 79`](https://github.com/FindHao/DrGPUM-samples/blob/110a7cdb0d57f5902941deb59899e6266f79844e/srad_v1/srad_kernel.cu#L79). *d_c_loc* is always one for this output. We can memset all the values of *d_c* to 1 beforehand to eliminate all the stores of 1s. 70 | 71 | - vp-opt2: *value_pattern* - *structured* 72 | 73 | [`srad_kernel.cu: 38`](https://github.com/DrGPUM/DrGPUM-samples/blob/a8c23e3aba/srad_v1/srad_kernel.cu#L38). *d_iN*, *d_iS*, *d_jW*, *d_jE* are used to indicate the adjacent nodes' coordinates, which have structured patterns. We removed these four arrays and replaced them with the corresponding calculations, as sketched below.
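The gist of the transformation, as a sketch with assumed local names (the boundary clamping mirrors how the original index arrays are initialized):

```cpp
// Before: neighbor coordinates are loaded from index arrays.
//   north = input[d_iN[row] * cols + col];
// After: compute the structured indices directly and drop the arrays.
int iN = (row == 0) ? 0 : row - 1;                // clamped north row
int iS = (row == rows - 1) ? rows - 1 : row + 1;  // clamped south row
float north = input[iN * cols + col];
float south = input[iS * cols + col];
```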
74 | 75 | ## streamcluster 76 | 77 | - vp-opt: *data_flow* - *redundant values* 78 | 79 | [`streamcluster_cuda.cu:221`](https://github.com/FindHao/DrGPUM-samples/blob/110a7cdb0d57f5902941deb59899e6266f79844e/streamcluster/streamcluster_cuda.cu#L221). The arrays *center_table_d*, *switch_membership_d*, and *p* are not changed in every iteration. Therefore, we can use flags on the CPU to detect whether these arrays will change and copy their values only when they do. 80 | -------------------------------------------------------------------------------- /docs/unit_tests.md: -------------------------------------------------------------------------------- 1 | # Unit Tests 2 | 3 | ## interval_merge 4 | 5 | This example is a carbon copy of DrGPUM's interval analysis GPU module. 6 | 7 | ## op_graph_simple 8 | 9 | This example has a few redundant and duplicate memory access patterns, and is used to test the basic functions of DrGPUM's data_flow mode. 10 | 11 | ## op_pattern_simple 12 | 13 | This example has kernels with various fine-grained memory access patterns, and is used to test the basic functions of DrGPUM's value_pattern mode. 14 | 15 | ## stress 16 | 17 | A multi-context, multi-stream proxy app to test DrGPUM's stability. 18 | 19 | ## vectorAdd 20 | 21 | A set of test cases for redshow's instruction parser. 22 | -------------------------------------------------------------------------------- /docs/workflow.md: -------------------------------------------------------------------------------- 1 | # Workflow 2 | 3 | ## Use GPU Patch 4 | 5 | GPU Patch is built upon the [Compute Sanitizer API](https://docs.nvidia.com/cuda/sanitizer-docs/index.html). 6 | As we are working closely with NVIDIA on this API, we will update GPU Patch to use new features as soon as a new release is available. 7 | You can find a complete usage example of the Sanitizer API in [`sanitizer-api.c`](https://github.com/HPCToolkit/hpctoolkit/blob/sanitizer/src/tool/hpcrun/gpu/nvidia/sanitizer-api.c). 8 | Some simple samples can be found in this [repository](https://github.com/NVIDIA/compute-sanitizer-samples). 9 | 10 | ## Use RedShow with HPCToolkit 11 | 12 | Please refer to the redshow [header file](https://github.com/Lin-Mao/redshow/blob/main/include/redshow.h) for the complete set of interfaces. 13 | 14 | If a new mode is added to DrGPUM, one should configure it through the following redshow functions and sanitizer variables in HPCToolkit. 15 | 16 | ``` 17 | redshow_analysis_enable 18 | redshow_output_dir_config 19 | 20 | sanitizer_gpu_patch_type 21 | sanitizer_gpu_patch_record_size 22 | sanitizer_gpu_analysis_type 23 | sanitizer_gpu_analysis_record_size 24 | sanitizer_analysis_async 25 | ``` 26 | 27 | Currently, using a runtime other than HPCToolkit with redshow is intricate; we will update the doc once we've gone through the whole process. 28 | 29 | ## DrGPUM Tests 30 | 31 | DrGPUM has end-to-end tests for each analysis mode plus a unit test for instruction analysis. Therefore, if a new analysis mode is added, we expect the developer to add a test in Python to verify its correctness. 32 | 33 | For each analysis mode, the developer should write at least one simple case that covers most situations and collect results from the samples. 34 | 35 | We are in the process of completing the testing framework.
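As a starting point, a new mode's test can mirror the existing ones, such as `python/tests/data_flow_test.py` shown later in this document. The skeleton below is a hypothetical sketch (`new_mode` and `new_mode.csv` are made-up names):

```python
from collections import namedtuple

from test_cases import Test
from utils import pipe_read


class NewModeTest(Test):
    # Hypothetical end-to-end test skeleton for a new analysis mode.
    Config = namedtuple('Config', ['files'])

    def __init__(self, arch):
        super().__init__('NewModeTest', arch)

    def setup(self, choices):
        for choice in choices:
            if choice == 'op_pattern_simple':
                self._configs[choice] = NewModeTest.Config(files=['new_mode.csv'])

    def _run_impl(self, case_name, version):
        if case_name not in self._configs:
            return
        command = Test.cases[case_name].command
        options = Test.cases[case_name].options
        pipe_read(['gvprof', '-e', 'new_mode', command] + options)
        # Compare each file under gvprof-database/ against the expected results.
```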
36 | 37 | To run DrGPUM test, we use the following command at DrGPUM's root directory. The instruction test could fail due to the default data type used, which is acceptable. 38 | 39 | ```bash 40 | python python/test.py -m all -a 41 | ``` 42 | -------------------------------------------------------------------------------- /include/gpu-patch.h: -------------------------------------------------------------------------------- 1 | #ifndef HPCTOOLKIT_GPU_PATCH_GPU_PATCH_H 2 | #define HPCTOOLKIT_GPU_PATCH_GPU_PATCH_H 3 | 4 | #include 5 | #include 6 | 7 | #define GPU_PATCH_MAX_ACCESS_SIZE (16) 8 | #define GPU_PATCH_WARP_SIZE (32) 9 | #define GPU_PATCH_ANALYSIS_THREADS (1024) 10 | #define GPU_PATCH_ANALYSIS_ITEMS (4) 11 | #define GPU_PATCH_ADDRESS_DICT_SIZE (1024) 12 | 13 | enum GPUPatchFlags { 14 | GPU_PATCH_NONE = 0, 15 | GPU_PATCH_READ = 0x1, 16 | GPU_PATCH_WRITE = 0x2, 17 | GPU_PATCH_ATOMSYS = 0x4, 18 | GPU_PATCH_LOCAL = 0x8, 19 | GPU_PATCH_SHARED = 0x10, 20 | GPU_PATCH_BLOCK_ENTER_FLAG = 0x20, 21 | GPU_PATCH_BLOCK_EXIT_FLAG = 0x40, 22 | GPU_PATCH_ANALYSIS = 0x80 23 | }; 24 | 25 | enum GPUPatchType { 26 | GPU_PATCH_TYPE_DEFAULT = 0, 27 | GPU_PATCH_TYPE_ADDRESS_PATCH = 1, 28 | GPU_PATCH_TYPE_ADDRESS_ANALYSIS = 2, 29 | GPU_PATCH_TYPE_COUNT = 3 30 | }; 31 | 32 | // Complete record 33 | typedef struct gpu_patch_record { 34 | uint64_t pc; 35 | uint32_t size; 36 | uint32_t active; 37 | uint32_t flat_thread_id; 38 | uint32_t flat_block_id; 39 | uint32_t flags; 40 | uint64_t address[GPU_PATCH_WARP_SIZE]; 41 | uint8_t value[GPU_PATCH_WARP_SIZE][GPU_PATCH_MAX_ACCESS_SIZE]; // STS.128->16 bytes 42 | } gpu_patch_record_t; 43 | 44 | // Address only 45 | typedef struct gpu_patch_record_address { 46 | uint32_t flags; 47 | uint32_t active; 48 | uint32_t size; 49 | uint64_t address[GPU_PATCH_WARP_SIZE]; 50 | } gpu_patch_record_address_t; 51 | 52 | // Address only, gpu analysis 53 | typedef struct gpu_patch_analysis_address { 54 | uint64_t start; 55 | uint64_t end; 56 | } gpu_patch_analysis_address_t; 57 | 58 | // Auxiliary data 59 | typedef struct gpu_patch_aux_address_dict { 60 | uint32_t size; 61 | gpu_patch_analysis_address_t start_end[GPU_PATCH_ADDRESS_DICT_SIZE]; 62 | uint8_t hit[GPU_PATCH_ADDRESS_DICT_SIZE]; 63 | uint8_t read[GPU_PATCH_ADDRESS_DICT_SIZE]; 64 | uint8_t write[GPU_PATCH_ADDRESS_DICT_SIZE]; 65 | } gpu_patch_aux_address_dict_t; 66 | 67 | typedef struct gpu_patch_buffer { 68 | volatile uint32_t full; 69 | volatile uint32_t analysis; 70 | volatile uint32_t head_index; 71 | volatile uint32_t tail_index; 72 | uint32_t size; 73 | uint32_t num_threads; // If num_threads == 0, the kernel is finished 74 | uint32_t block_sampling_offset; 75 | uint32_t block_sampling_frequency; 76 | uint32_t type; 77 | uint32_t flags; // read or write or both 78 | void *records; 79 | void *aux; 80 | void *torch_aux; 81 | } gpu_patch_buffer_t; 82 | 83 | #endif 84 | -------------------------------------------------------------------------------- /include/gpu-queue.h: -------------------------------------------------------------------------------- 1 | #ifndef HPCTOOLKIT_GPU_PATCH_GPU_QUEUE_H 2 | #define HPCTOOLKIT_GPU_PATCH_GPU_QUEUE_H 3 | 4 | #include 5 | 6 | #include "gpu-patch.h" 7 | 8 | /* 9 | * Get a gpu record 10 | */ 11 | extern "C" __device__ uint32_t gpu_queue_get(gpu_patch_buffer_t *buffer, uint32_t analysis = 0) { 12 | uint32_t size = buffer->size; 13 | uint32_t tail_index = 0; 14 | while (tail_index == 0) { 15 | tail_index = atomicAdd((uint32_t *)&buffer->tail_index, 1) + 1; 16 | // Write on tail_index - 1 17 
| if (tail_index - 1 >= size) { 18 | // First warp that found the buffer is full 19 | if (tail_index - 1 == size) { 20 | // Wait for previous warps to finish writing 21 | while (buffer->head_index < size); 22 | if (analysis == 1) { 23 | // Sync with GPU 24 | __threadfence(); 25 | buffer->analysis = 1; 26 | __threadfence(); 27 | while (buffer->analysis == 1); 28 | } else { 29 | // Sync with CPU 30 | __threadfence_system(); 31 | buffer->full = 1; 32 | __threadfence_system(); 33 | while (buffer->full == 1); 34 | } 35 | __threadfence(); 36 | buffer->head_index = 0; 37 | __threadfence(); 38 | buffer->tail_index = 0; 39 | } else { 40 | // Other warps 41 | while (buffer->tail_index >= size); 42 | } 43 | tail_index = 0; 44 | } 45 | } 46 | 47 | return tail_index - 1; 48 | } 49 | 50 | /* 51 | * Finish writing gpu records 52 | */ 53 | extern "C" __device__ void gpu_queue_push(gpu_patch_buffer_t *buffer) { 54 | // Make sure records are visible 55 | atomicAdd((uint32_t *)&buffer->head_index, 1); 56 | } 57 | 58 | #endif 59 | -------------------------------------------------------------------------------- /include/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef HPCTOOLKIT_GPU_PATCH_UTILITIES_H 2 | #define HPCTOOLKIT_GPU_PATCH_UTILITIES_H 3 | 4 | #include <stdint.h> 5 | 6 | /* 7 | * Utility functions 8 | */ 9 | __device__ __forceinline__ uint32_t get_flat_block_id() { 10 | return blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y; 11 | } 12 | 13 | __device__ __forceinline__ uint32_t get_flat_thread_id() { 14 | return threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; 15 | } 16 | 17 | __device__ __forceinline__ uint64_t get_unique_thread_id() { 18 | return get_flat_block_id() * blockDim.x * blockDim.y * blockDim.z + get_flat_thread_id(); 19 | } 20 | 21 | __device__ __forceinline__ uint64_t get_grid_num_threads() { 22 | return gridDim.x * gridDim.y * gridDim.z * blockDim.x * blockDim.y * blockDim.z; 23 | } 24 | 25 | __device__ __forceinline__ uint64_t get_block_num_threads() { 26 | return blockDim.x * blockDim.y * blockDim.z; 27 | } 28 | 29 | __device__ __forceinline__ uint32_t get_laneid() { 30 | uint32_t laneid = 0; 31 | asm volatile("mov.u32 %0, %%laneid;" : "=r"(laneid)); 32 | return laneid; 33 | } 34 | 35 | __device__ __forceinline__ bool sample_callback(uint32_t frequency, uint32_t offset) { 36 | if (frequency != 0) { 37 | // 1 : Sample all blocks 38 | // >1 : Sample a portion of blocks 39 | return get_flat_block_id() % frequency == offset; 40 | } 41 | // Skip all blocks 42 | return false; 43 | } 44 | 45 | __device__ __forceinline__ bool is_locked(uint32_t *lock, uint32_t id) { 46 | uint32_t old = *lock; 47 | // Read the newest value 48 | __threadfence(); 49 | return old == id; 50 | } 51 | 52 | __device__ __forceinline__ void read_shared_memory(uint32_t size, uint32_t ptr, uint8_t *buf) { 53 | for (uint32_t i = 0; i < size; ++i) { 54 | uint32_t ret = 0; 55 | asm volatile("ld.shared.b8 %0,[%1];" : "=r"(ret) : "r"(ptr + i) : "memory"); 56 | buf[i] = ret; 57 | } 58 | } 59 | 60 | __device__ __forceinline__ void read_global_memory(uint32_t size, uint64_t ptr, uint8_t *buf) { 61 | for (uint32_t i = 0; i < size; ++i) { 62 | uint32_t ret = 0; 63 | asm volatile("ld.b8 %0,[%1];" : "=r"(ret) : "l"(ptr + i) : "memory"); 64 | buf[i] = ret; 65 | } 66 | } 67 | 68 | __device__ __forceinline__ void read_local_memory(uint32_t size, uint32_t ptr, uint8_t *buf) { 69 | for (uint32_t i = 0; i < size; ++i) { 70 | uint32_t ret = 0; 71 | asm
volatile("ld.local.b8 %0,[%1];" : "=r"(ret) : "r"(ptr + i) : "memory"); 72 | buf[i] = ret; 73 | } 74 | } 75 | 76 | template 77 | __device__ __forceinline__ T shfl(T v, uint32_t srcline, uint32_t mask = 0xFFFFFFFF) { 78 | T ret; 79 | #if (__CUDA_ARCH__ >= 300) 80 | #if (__CUDACC_VER_MAJOR__ >= 9) 81 | ret = __shfl_sync(mask, v, srcline); 82 | #else 83 | ret = __shfl(v, srcline); 84 | #endif 85 | #endif 86 | return ret; 87 | } 88 | 89 | template 90 | __device__ __forceinline__ T shfl_up(T v, uint32_t delta, uint32_t width = GPU_PATCH_WARP_SIZE, 91 | uint32_t mask = 0xFFFFFFFF) { 92 | T ret; 93 | #if (__CUDA_ARCH__ >= 300) 94 | #if (__CUDACC_VER_MAJOR__ >= 9) 95 | ret = __shfl_up_sync(mask, v, delta, width); 96 | #else 97 | ret = __shfl_up(v, delta, width); 98 | #endif 99 | #endif 100 | return ret; 101 | } 102 | 103 | template 104 | __device__ __forceinline__ T shfl_xor(T v, uint32_t lane_mask, uint32_t mask = 0xFFFFFFFF) { 105 | T ret; 106 | #if (__CUDA_ARCH__ >= 300) 107 | #if (__CUDACC_VER_MAJOR__ >= 9) 108 | ret = __shfl_xor_sync(mask, v, lane_mask); 109 | #else 110 | ret = __shfl_xor(v, lane_mask); 111 | #endif 112 | #endif 113 | return ret; 114 | } 115 | 116 | __device__ __forceinline__ uint32_t ballot(int32_t predicate, uint32_t mask = 0xFFFFFFFF) { 117 | uint32_t ret; 118 | #if (__CUDA_ARCH__ >= 300) 119 | #if (__CUDACC_VER_MAJOR__ >= 9) 120 | ret = __ballot_sync(mask, predicate); 121 | #else 122 | ret = __ballot(predicate); 123 | #endif 124 | #endif 125 | return ret; 126 | } 127 | 128 | __device__ __forceinline__ uint32_t bfe(uint32_t source, uint32_t bit_index) { 129 | uint32_t bit; 130 | asm volatile("bfe.u32 %0, %1, %2, %3;" 131 | : "=r"(bit) 132 | : "r"((uint32_t)source), "r"(bit_index), "r"(1)); 133 | return bit; 134 | } 135 | 136 | __device__ __forceinline__ uint32_t brev(uint32_t source) { 137 | uint32_t dest; 138 | asm volatile("brev.b32 %0, %1;" : "=r"(dest) : "r"(source)); 139 | return dest; 140 | } 141 | 142 | __device__ __forceinline__ uint32_t bfind(uint32_t source) { 143 | uint32_t bit_index; 144 | asm volatile("bfind.u32 %0, %1;" : "=r"(bit_index) : "r"((uint32_t)source)); 145 | return bit_index; 146 | } 147 | 148 | __device__ __forceinline__ uint32_t fns(uint32_t source, uint32_t base_index) { 149 | uint32_t bit_index; 150 | asm volatile("fns.b32 %0, %1, %2, %3;" : "=r"(bit_index) : "r"(source), "r"(base_index), "r"(1)); 151 | return bit_index; 152 | } 153 | 154 | template 155 | __device__ __forceinline__ T comparator(T x, uint32_t lane_mask, bool dir, 156 | uint32_t mask = 0xFFFFFFFF) { 157 | T y = shfl_xor(x, lane_mask, mask); 158 | return x < y == dir ? 
y : x; 159 | } 160 | 161 | template 162 | __device__ __forceinline__ T warp_sort(T x, uint32_t laneid) { 163 | x = comparator(x, 1, bfe(laneid, 1) ^ bfe(laneid, 0)); // A, sorted sequences of length 2 164 | x = comparator(x, 2, bfe(laneid, 2) ^ bfe(laneid, 1)); // B 165 | x = comparator(x, 1, bfe(laneid, 2) ^ bfe(laneid, 0)); // C, sorted sequences of length 4 166 | x = comparator(x, 4, bfe(laneid, 3) ^ bfe(laneid, 2)); // D 167 | x = comparator(x, 2, bfe(laneid, 3) ^ bfe(laneid, 1)); // E 168 | x = comparator(x, 1, bfe(laneid, 3) ^ bfe(laneid, 0)); // F, sorted sequences of length 8 169 | x = comparator(x, 8, bfe(laneid, 4) ^ bfe(laneid, 3)); // G 170 | x = comparator(x, 4, bfe(laneid, 4) ^ bfe(laneid, 2)); // H 171 | x = comparator(x, 2, bfe(laneid, 4) ^ bfe(laneid, 1)); // I 172 | x = comparator(x, 1, bfe(laneid, 4) ^ bfe(laneid, 0)); // J, sorted sequences of length 16 173 | x = comparator(x, 16, bfe(laneid, 4)); // K 174 | x = comparator(x, 8, bfe(laneid, 3)); // L 175 | x = comparator(x, 4, bfe(laneid, 2)); // M 176 | x = comparator(x, 2, bfe(laneid, 1)); // N 177 | x = comparator(x, 1, bfe(laneid, 0)); // O, sorted sequences of length 32 178 | 179 | return x; 180 | } 181 | 182 | template 183 | __device__ __forceinline__ T atomic_load(const T *addr) { 184 | const volatile T *vaddr = addr; // volatile to bypass cache 185 | __threadfence(); // for seq_cst loads. Remove for acquire semantics. 186 | const T value = *vaddr; 187 | // fence to ensure that dependent reads are correctly ordered 188 | __threadfence(); 189 | return value; 190 | } 191 | 192 | template 193 | __device__ __forceinline__ void atomic_store(T *addr, T value) { 194 | volatile T *vaddr = addr; // volatile to bypass cache 195 | // fence to ensure that previous non-atomic stores are visible to other threads 196 | __threadfence(); 197 | *vaddr = value; 198 | } 199 | 200 | template 201 | __device__ __forceinline__ void atomic_store_system(T *addr, T value) { 202 | volatile T *vaddr = addr; // volatile to bypass cache 203 | // fence to ensure that previous non-atomic stores are visible to other threads 204 | __threadfence_system(); 205 | *vaddr = value; 206 | } 207 | 208 | template 209 | __device__ __forceinline__ uint32_t map_upper_bound(T *map, T value, uint32_t len, C cmp) { 210 | uint32_t low = 0; 211 | uint32_t high = len; 212 | uint32_t mid = 0; 213 | while (low < high) { 214 | mid = (high - low) / 2 + low; 215 | if (cmp(map[mid], value)) { 216 | low = mid + 1; 217 | } else { 218 | high = mid; 219 | } 220 | } 221 | return low; 222 | } 223 | 224 | template 225 | __device__ __forceinline__ uint32_t map_prev(T *map, T value, uint32_t len, C cmp) { 226 | uint32_t pos = map_upper_bound(map, value, len, cmp); 227 | if (pos != 0) { 228 | --pos; 229 | } else { 230 | pos = len; 231 | } 232 | return pos; 233 | } 234 | 235 | #endif 236 | -------------------------------------------------------------------------------- /python/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/* 2 | -------------------------------------------------------------------------------- /python/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lin-Mao/DrGPUM/c6ffb1665df35905bfa4b0d93ac75eca00e451ac/python/__init__.py -------------------------------------------------------------------------------- /python/bench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 
import time 3 | 4 | from collections import namedtuple 5 | 6 | from test_cases import Test 7 | from utils import pipe_read, nsys_profile 8 | 9 | 10 | class Benchmark(Test): 11 | # (kernel_name, is_template) 12 | Config = namedtuple('Config', ['kernels']) 13 | 14 | def __init__(self, arch, version): 15 | super().__init__('Benchmark', arch, version) 16 | self._kernel_time = dict() 17 | self._gpu_kernel_time = dict() 18 | self._gpu_mem_time = dict() 19 | self._time = dict() 20 | 21 | def setup(self, choices): 22 | for choice in choices: 23 | if choice == 'backprop': 24 | self._configs[choice] = Benchmark.Config( 25 | kernels=[('bpnn_adjust_weights_cuda', False)]) 26 | elif choice == 'bfs': 27 | self._configs[choice] = Benchmark.Config( 28 | kernels=[('Kernel', False)]) 29 | elif choice == 'cfd': 30 | self._configs[choice] = Benchmark.Config(kernels=[('cuda_compute_flux', True), 31 | ('cuda_time_step', True), 32 | ('cuda_compute_step_factor', True)]) 33 | elif choice == 'hotspot': 34 | self._configs[choice] = Benchmark.Config( 35 | kernels=[('calculate_temp', False)]) 36 | elif choice == 'hotspot3D': 37 | self._configs[choice] = Benchmark.Config( 38 | kernels=[('hotspotOpt1', False)]) 39 | elif choice == 'huffman': 40 | self._configs[choice] = Benchmark.Config( 41 | kernels=[('histo_kernel', False)]) 42 | elif choice == 'lavaMD': 43 | self._configs[choice] = Benchmark.Config( 44 | kernels=[('kernel_gpu_cuda', False)]) 45 | elif choice == 'pathfinder': 46 | self._configs[choice] = Benchmark.Config( 47 | kernels=[('dynproc_kernel', False)]) 48 | elif choice == 'srad': 49 | self._configs[choice] = Benchmark.Config( 50 | kernels=[('srad', False), ('srad2', False)]) 51 | elif choice == 'streamcluster': 52 | self._configs[choice] = Benchmark.Config(kernels=[]) 53 | 54 | def _run_impl(self, case_name, version): 55 | version_name = 'origin' if version is None else version 56 | 57 | def _init_time_dict(time_dict): 58 | if case_name not in time_dict: 59 | time_dict[case_name] = dict() 60 | 61 | if version_name not in time_dict[case_name]: 62 | time_dict[case_name][version_name] = 0.0 63 | 64 | _init_time_dict(self._kernel_time) 65 | _init_time_dict(self._gpu_kernel_time) 66 | _init_time_dict(self._gpu_mem_time) 67 | _init_time_dict(self._time) 68 | 69 | command = Test.cases[case_name].command 70 | options = Test.cases[case_name].options 71 | 72 | time_start = time.time() 73 | pipe_read([command] + options) 74 | time_end = time.time() 75 | elapse = time_end - time_start 76 | 77 | self._time[case_name][version_name] += elapse 78 | 79 | print('{}/{}: {}s'.format(case_name, version_name, elapse)) 80 | 81 | kernel_times, gpu_kernel_time, gpu_mem_time = nsys_profile( 82 | [command] + options, self._configs[case_name].kernels) 83 | 84 | self._gpu_kernel_time[case_name][version_name] += gpu_kernel_time / (1e9) 85 | self._gpu_mem_time[case_name][version_name] += gpu_mem_time / (1e9) 86 | 87 | for kernel, kernel_time in kernel_times.items(): 88 | self._kernel_time[case_name][version_name] += kernel_time / (1e9) 89 | print('{}/{}/{}: {}s'.format(case_name, 90 | version_name, kernel, kernel_time / (1e9))) 91 | print('{}/{}/gpu_kernel_time: {}s'.format(case_name, 92 | version_name, gpu_kernel_time / (1e9))) 93 | print('{}/{}/gpu_mem_time: {}s'.format(case_name, 94 | version_name, gpu_mem_time / (1e9))) 95 | 96 | def report(self): 97 | def _report_speedup(time_dict, dict_name): 98 | for case_name, times in time_dict.items(): 99 | for version_name, version_time in times.items(): 100 | if version_name == 'origin': 101 
| continue 102 | elif version_time != 0.0: 103 | sp = time_dict[case_name]['origin'] / version_time 104 | print('{}/{}/{}: {}x'.format(case_name, version_name, dict_name, sp)) 105 | 106 | _report_speedup(self._time, 'time') 107 | _report_speedup(self._kernel_time, 'kernel_time') 108 | _report_speedup(self._gpu_kernel_time, 'gpu_kernel_time') 109 | _report_speedup(self._gpu_mem_time, 'gpu_mem_time') 110 | 111 | 112 | parser = argparse.ArgumentParser( 113 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 114 | parser.add_argument('-c', '--case', help='case name') 115 | parser.add_argument('-v', '--version', default='all', help='benchmark version') 116 | parser.add_argument('-i', '--iterations', type=int, default=1) 117 | parser.add_argument('-a', '--arch', choices=['sm_70', 'sm_72', 118 | 'sm_75', 'sm_80', 'sm_86'], default='sm_70', help='gpu arch name') 119 | args = parser.parse_args() 120 | 121 | if args.case is None: 122 | choice = ['backprop', 'bfs', 'cfd', 'hotspot', 'hotspot3D', 'huffman', 'lavaMD', 'pathfinder', 'srad', 'streamcluster'] 123 | else: 124 | choice = [args.case] 125 | 126 | benchmark = Benchmark(args.arch, args.version) 127 | benchmark.setup(choice) 128 | benchmark.run(args.iterations) 129 | benchmark.report() 130 | -------------------------------------------------------------------------------- /python/filter_time.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import re 3 | 4 | iteration_num = 5 5 | 6 | 7 | def filter_time(file_path): 8 | with open(file_path, 'r') as fin: 9 | content = fin.read() 10 | reg = re.compile('real\t(\d+)m(.+?)s') 11 | results = reg.findall(content) 12 | if not results: 13 | print("empty") 14 | exit(1) 15 | 16 | ss = [] 17 | for x in results: 18 | minute = int(x[0]) 19 | second = float(x[1]) 20 | second_all = 60 * minute + second 21 | ss.append(second_all) 22 | 23 | return np.mean(ss) 24 | 25 | 26 | 27 | def work(): 28 | original_time_file = 'time.txt' 29 | data_flow_time_file = 'time_data_flow.txt' 30 | value_pattern_time_file = 'time_value_pattern.txt' 31 | original_time = filter_time(original_time_file) 32 | data_flow_time = filter_time(data_flow_time_file) 33 | value_pattern_time = filter_time(value_pattern_time_file) 34 | overhead = data_flow_time / original_time + value_pattern_time / original_time 35 | print("%.2f" % overhead) 36 | 37 | work() 38 | -------------------------------------------------------------------------------- /python/gviewer.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | import pygraphviz as pgv 4 | 5 | RED_LEVEL_0 = 0.33 6 | RED_LEVEL_1 = 0.66 7 | RED_LEVEL_2 = 0.99 8 | RED_LEVEL_3 = 1.0 9 | MAX_NODE_WIDTH = 3.0 10 | MAX_EDGE_WIDTH = 5.0 11 | 12 | 13 | class Graph: 14 | def __init__(self): 15 | self._nodes = dict() 16 | self._edges = dict() 17 | 18 | def read_agraph(self, agraph): 19 | for node in agraph.nodes(): 20 | attrs = dict() 21 | for key, value in node.attr.items(): 22 | attrs[key] = value 23 | self._nodes[node] = attrs 24 | 25 | for edge in agraph.edges(): 26 | e = (edge[0], edge[1], edge.attr['memory_node_id'], 27 | edge.attr['edge_type']) 28 | attrs = dict() 29 | for key, value in edge.attr.items(): 30 | attrs[key] = value 31 | self._edges[e] = attrs 32 | 33 | def new_agraph(self): 34 | agraph = pgv.AGraph(strict=False, directed=True) 35 | for node, attrs in self._nodes.items(): 36 | agraph.add_node(node, **attrs) 37 | for edge, attrs in self._edges.items(): 38 
| agraph.add_edge(edge[0], edge[1], **attrs) 39 | return agraph 40 | 41 | def add_edge(self, src, dst, memory_node_id, edge_type, attrs): 42 | self._edges[(src, dst, memory_node_id, edge_type)] = attrs 43 | 44 | def delete_edge(self, src, dst, memory_node_id, edge_type): 45 | self._edges.pop((src, dst, memory_node_id, edge_type), None) 46 | 47 | def delete_node(self, node): 48 | self._nodes.pop(node, None) 49 | edge_delete = [] 50 | for edge, _ in self._edges.items(): 51 | if edge[0] == node or edge[1] == node: 52 | edge_delete.append(edge) 53 | 54 | for edge in edge_delete: 55 | self._edges.pop(edge, None) 56 | 57 | def nodes(self): 58 | return self._nodes 59 | 60 | def edges(self): 61 | return self._edges 62 | 63 | 64 | def format_graph(args): 65 | def format_context(context, choice, known, leaf): 66 | ret = '' 67 | if choice == 'none': 68 | return ret 69 | frames = context.split('#') 70 | for frame in frames[::-1]: 71 | if frame == '' or frame == '\n': 72 | continue 73 | line, func = frame.split('\t') 74 | if known is True and (line.find('Unknown') != -1 or line.find('<unknown>') != -1): 75 | continue 76 | if choice == 'path': 77 | func = '' 78 | elif choice == 'file': 79 | last_slash = line.rfind('/') 80 | if last_slash != -1: 81 | line = line[last_slash+1:] 82 | elif choice == 'func': 83 | line = '' 84 | ret = line + ' ' + func + '\l' + ret 85 | if leaf is True: 86 | break 87 | 88 | # escape characters 89 | ret = ret.replace('<', '\<') 90 | ret = ret.replace('>', '\>') 91 | return ret 92 | 93 | file_path = args.file 94 | # clean up buggy line endings 95 | new_lines = [] 96 | with open(file_path, 'r') as fin: 97 | lines = fin.readlines() 98 | for line in lines: 99 | if line.endswith("'\\"): 100 | line = line[:-2] 101 | elif line.endswith("'\\\n"): 102 | line = line.replace("'\\\n", '\n') 103 | new_lines.append(line) 104 | with open(file_path, 'w') as fout: 105 | for line in new_lines: 106 | fout.write(line) 107 | 108 | 109 | agraph = pgv.AGraph(file_path, strict=False) 110 | 111 | G = Graph() 112 | G.read_agraph(agraph) 113 | 114 | for node, attrs in G.nodes().items(): 115 | for key, attr in attrs.items(): 116 | if key == 'context': 117 | value = format_context( 118 | attr, args.context_filter, args.known, args.leaf) 119 | attrs['context'] = value 120 | 121 | return G 122 | 123 | 124 | def prune_graph(G, node_threshold=0.0, edge_threshold=0.0, keep_redundancy=False): 125 | # 1. prune no edge nodes 126 | nodes_with_edges = dict() 127 | for node in G.nodes(): 128 | nodes_with_edges[node] = False 129 | 130 | for edge in G.edges(): 131 | nodes_with_edges[edge[0]] = True 132 | nodes_with_edges[edge[1]] = True 133 | 134 | for k, v in nodes_with_edges.items(): 135 | if v is False: 136 | # XXX(Keren): pay attention to complexity O(NE) 137 | G.delete_node(k) 138 | 139 | # 2. prune no context nodes 140 | nodes_without_context = dict() 141 | for node, attrs in G.nodes().items(): 142 | if 'context' not in attrs or attrs['context'] == '': 143 | nodes_without_context[node] = True 144 | 145 | for node in nodes_without_context: 146 | G.delete_node(node) 147 | 148 | # 3.
prune low importance nodes and edges 149 | node_total_count = 0 150 | for node, attrs in G.nodes().items(): 151 | if attrs['count'] is not None: 152 | node_total_count += float(attrs['count']) 153 | edge_total_count = 0 154 | for edge, attrs in G.edges().items(): 155 | if attrs['count'] is not None: 156 | edge_total_count += float(attrs['count']) 157 | 158 | delete_edges = [] 159 | node_reserve = dict() 160 | for edge, attrs in G.edges().items(): 161 | if attrs['count'] is not None: 162 | importance = float(attrs['count']) / edge_total_count 163 | if importance >= edge_threshold: 164 | node_reserve[edge[0]] = True 165 | node_reserve[edge[1]] = True 166 | elif keep_redundancy is True and float(attrs['redundancy']) >= RED_LEVEL_2: 167 | node_reserve[edge[0]] = True 168 | node_reserve[edge[1]] = True 169 | else: 170 | delete_edges.append(edge) 171 | delete_nodes = [] 172 | for node, attrs in G.nodes().items(): 173 | if attrs['count'] is not None: 174 | importance = float(attrs['count']) / node_total_count 175 | if importance < node_threshold: 176 | delete_nodes.append(node) 177 | 178 | for edge in delete_edges: 179 | G.delete_edge(edge[0], edge[1], edge[2], edge[3]) 180 | for node in delete_nodes: 181 | if node not in node_reserve: 182 | G.delete_node(node) 183 | 184 | return G 185 | 186 | 187 | def combine_graph(G): 188 | # Combine read write edges 189 | rw_edges = dict() 190 | for edge, attrs in G.edges().items(): 191 | edge_key = (edge[0], edge[1], edge[2]) 192 | if edge_key in rw_edges: 193 | rw_edge = rw_edges[edge_key][1] 194 | rw_edge['redundancy'] = max( 195 | float(rw_edge['redundancy']), float(attrs['redundancy'])) 196 | rw_edge['overwrite'] = max( 197 | float(rw_edge['overwrite']), float(attrs['overwrite'])) 198 | rw_edge['count'] = max(int(rw_edge['count']), int(attrs['count'])) 199 | rw_edges[edge_key] = (True, rw_edge) 200 | else: 201 | rw_edges[edge_key] = (False, attrs) 202 | 203 | for edge_key, attrs in rw_edges.items(): 204 | if attrs[0]: 205 | G.delete_edge(edge_key[0], edge_key[1], edge_key[2], 'READ') 206 | G.delete_edge(edge_key[0], edge_key[1], edge_key[2], 'WRITE') 207 | attrs[1]['edge_type'] = 'READ & WRITE' 208 | G.add_edge(edge_key[0], edge_key[1], 209 | edge_key[2], 'READ & WRITE', attrs[1]) 210 | 211 | return G 212 | 213 | 214 | def create_plain_graph(G): 215 | for node in G.nodes(): 216 | name = node.get_name() 217 | label = '{' 218 | label += ' ' + name + '|' 219 | for key, value in node.attr.items(): 220 | label += '{<' + key + '> ' + key.upper() + '|' + value + '}|' 221 | label = label[:-1] 222 | label += '}' 223 | node.attr['shape'] = 'record' 224 | node.attr['label'] = label 225 | 226 | for edge in G.edges(): 227 | label = '' 228 | if edge.attr['edge_type'] == 'READ': 229 | label = 'EDGE_TYPE: READ\nMEMORY_NODE_ID: ' + \ 230 | str(edge.attr['memory_node_id']) 231 | else: 232 | for key, value in edge.attr.items(): 233 | label += key.upper() + ': ' + value + '\n' 234 | edge.attr['label'] = label 235 | 236 | return G 237 | 238 | 239 | def create_pretty_graph(G): 240 | def color_edge_redundancy(G): 241 | for edge in G.edges(): 242 | if float(edge.attr['redundancy']) <= RED_LEVEL_0: 243 | edge.attr['color'] = '#cddc39' 244 | edge.attr['fillcolor'] = '#cddc39' 245 | elif float(edge.attr['redundancy']) <= RED_LEVEL_1: 246 | edge.attr['color'] = '#fffa55' 247 | edge.attr['fillcolor'] = '#fffa55' 248 | elif float(edge.attr['redundancy']) <= RED_LEVEL_2: 249 | edge.attr['color'] = '#fdcc3a' 250 | edge.attr['fillcolor'] = '#fdcc3a' 251 | else: 252 | edge.attr['color'] 
= '#f91100' 253 | edge.attr['fillcolor'] = '#f91100' 254 | return G 255 | 256 | def apportion_edge_width(G): 257 | edges = G.edges() 258 | max_edge = max(edges, key=lambda edge: float( 259 | edge.attr['overwrite']) * float(edge.attr['count'])) 260 | max_weight = float(max_edge.attr['overwrite']) * \ 261 | float(max_edge.attr['count']) 262 | 263 | for edge in edges: 264 | width = float(edge.attr['overwrite']) * \ 265 | float(edge.attr['count']) / max_weight * MAX_EDGE_WIDTH 266 | if width < 1.0: 267 | edge.attr['penwidth'] = 1.0 268 | else: 269 | edge.attr['penwidth'] = width 270 | 271 | return G 272 | 273 | def apportion_node_width(G): 274 | nodes = G.nodes() 275 | max_node = max(nodes, key=lambda node: float(node.attr['count'])) 276 | max_weight = float(max_node.attr['count']) 277 | 278 | for node in nodes: 279 | width = float(node.attr['count']) / max_weight * MAX_NODE_WIDTH 280 | if width < 1.0: 281 | node.attr['width'] = 0.6 282 | else: 283 | node.attr['width'] = width 284 | 285 | return G 286 | 287 | def label_node_duplicate(node): 288 | dup = node.attr['duplicate'] 289 | label = '' 290 | 291 | if dup is None: 292 | return label 293 | 294 | dup_entries = dup.split(';') 295 | from_node = node.get_name() 296 | 297 | for dup_entry in dup_entries: 298 | if len(dup_entry) > 0: 299 | dup_node = dup_entry.split(',')[0] 300 | label += dup_node + ' ' 301 | return 'DUPLICATE: ' + label 302 | 303 | #G.graph_attr['bgcolor'] = '#2e3e56' 304 | G.graph_attr['pad'] = '0.5' 305 | 306 | for node in G.nodes(): 307 | if node.attr['node_type'] == 'MEMORY': 308 | node.attr['shape'] = 'box' 309 | elif node.attr['node_type'] == 'KERNEL': 310 | node.attr['shape'] = 'ellipse' 311 | elif node.attr['node_type'] == 'MEMCPY' or node.attr['node_type'] == 'MEMSET': 312 | node.attr['shape'] = 'circle' 313 | else: 314 | node.attr['shape'] = 'box' 315 | node.attr['label'] = node.attr['node_type'] 316 | node.attr['style'] = 'filled' 317 | node.attr['penwidth'] = '0' 318 | tooltip = '' 319 | tooltip += 'TYPE: ' + node.attr['node_type'] + '\l' 320 | tooltip += 'COUNT: ' + node.attr['count'] + '\l' 321 | duplicate = label_node_duplicate(node) 322 | if duplicate != '': 323 | tooltip += duplicate + '\l' 324 | tooltip += 'CONTEXT: \l' + node.attr['context'] 325 | tooltip = tooltip.replace('\l', ' ') 326 | node.attr['tooltip'] = tooltip 327 | 328 | for edge in G.edges(): 329 | tooltip = 'MEMORY_NODE_ID: ' + edge.attr['memory_node_id'] + '\l' 330 | tooltip += 'TYPE: ' + edge.attr['edge_type'] + '\l' 331 | tooltip += 'REDUNDANCY: ' + str(edge.attr['redundancy']) + '\l' 332 | tooltip += 'OVERWRITE: ' + str(edge.attr['overwrite']) + '\l' 333 | tooltip += 'BYTES: ' + str(edge.attr['count']) + '\l' 334 | tooltip = tooltip.replace('\l', ' ') 335 | edge.attr['tooltip'] = tooltip 336 | edge.attr['fontname'] = 'helvetica Neue Ultra Light' 337 | 338 | G = apportion_node_width(G) 339 | 340 | G = color_edge_redundancy(G) 341 | 342 | G = apportion_edge_width(G) 343 | 344 | return G 345 | 346 | 347 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 348 | parser.add_argument('-f', '--file', help='file name') 349 | parser.add_argument('-cf', '--context-filter', choices=[ 350 | 'path', 'file', 'func', 'all', 'none'], default='all', help='show part of the calling context') 351 | parser.add_argument('-k', '--known', action='store_true', default=False, 352 | help='show only known function') 353 | parser.add_argument('-l', '--leaf', action='store_true', default=False, 354 | help='show only leaf function') 355 |
parser.add_argument('-of', '--output-format', 356 | choices=['svg', 'png', 'pdf'], default='svg', help='output format') 357 | parser.add_argument('-pn', '--prune-node', default=0.0, 358 | help='prune node lower bound') 359 | parser.add_argument('-pe', '--prune-edge', default=0.0, 360 | help='prune edge lower bound') 361 | parser.add_argument('-kr', '--keep-redundancy', action='store_true', default=False, 362 | help='keep all high redundancy edges') 363 | parser.add_argument( 364 | '-ly', '--layout', choices=['dot', 'neato', 'circo'], default='dot', help='svg layout') 365 | parser.add_argument('-pr', '--pretty', action='store_true', default=False, 366 | help='tune output graph') 367 | parser.add_argument('-v', '--verbose', action='store_true', help='print log') 368 | args = parser.parse_args() 369 | 370 | if args.verbose: 371 | print('Format graph...') 372 | G = format_graph(args) 373 | 374 | if float(args.prune_node) > 0.0 or float(args.prune_edge) > 0.0: 375 | if args.verbose: 376 | print('Prune graph: {} nodes and {} edges...'.format( 377 | len(G.nodes()), len(G.edges()))) 378 | G = prune_graph(G, float(args.prune_node), float(args.prune_edge), args.keep_redundancy) 379 | 380 | if args.verbose: 381 | print('Refine graph...') 382 | if args.pretty: 383 | G = combine_graph(G) 384 | agraph = create_pretty_graph(G.new_agraph()) 385 | else: 386 | agraph = create_plain_graph(G.new_agraph()) 387 | 388 | if args.verbose: 389 | print('Organize graph: {} nodes and {} edges...'.format( 390 | len(agraph.nodes()), len(agraph.edges()))) 391 | agraph.layout(prog=args.layout) 392 | 393 | if args.verbose: 394 | print('Output graph...') 395 | #G.write(args.file + '.dot') 396 | agraph.draw(args.file + '.' + args.output_format) 397 | -------------------------------------------------------------------------------- /python/overhead.sh: -------------------------------------------------------------------------------- 1 | GVPROF_source_path=/root/GVProf 2 | rodinia_path=/root/GVProf/samples 3 | target_log_file=/root/GVProf/overhead.txt 4 | iteration_num=1 5 | 6 | cd $rodinia_path 7 | 8 | run() { 9 | cur_path=$1 10 | block_sampling=$2 11 | kernel_sampling=$3 12 | EXEC_AND_PARAMS=${@:4} 13 | echo ${EXEC_AND_PARAMS} 14 | cd ${cur_path} 15 | echo ${cur_path} >> ${target_log_file} 16 | rm -rf time*.txt gvprof* 17 | for i in $(seq 1 ${iteration_num}); do 18 | { time $EXEC_AND_PARAMS; } 2>>time.txt 19 | done 20 | 21 | gvprof_overhead -i ${iteration_num} -v -e data_flow -ck HPCRUN_SANITIZER_READ_TRACE_IGNORE=1 -ck HPCRUN_SANITIZER_GPU_PATCH_RECORD_NUM=131072 -ck HPCRUN_SANITIZER_GPU_ANALYSIS_BLOCKS=1 $EXEC_AND_PARAMS 22 | 23 | rm -rf gvprof* 24 | 25 | gvprof_overhead -i ${iteration_num} -v -e value_pattern -s ${block_sampling} -ck HPCRUN_SANITIZER_KERNEL_SAMPLING_FREQUENCY=${kernel_sampling} -ck HPCRUN_SANITIZER_GPU_PATCH_RECORD_NUM=131072 -ck HPCRUN_SANITIZER_WHITELIST=w.txt $EXEC_AND_PARAMS 26 | 27 | /usr/bin/python3 ${GVPROF_source_path}/python/filter_time.py >> ${target_log_file} 28 | } 29 | 30 | run $rodinia_path/bfs 20 20 ./bfs ../data/graph1MW_6.txt 31 | run $rodinia_path/backprop 20 20 ./backprop 65536 32 | run $rodinia_path/srad_v1 20 20 ./srad 1 0.5 502 458 33 | run $rodinia_path/hotspot 20 20 ./hotspot 512 2 2 ../data/temp_512 ../data/power_512 output.out 34 | run $rodinia_path/pathfinder 20 20 ./pathfinder 100000 100 20 35 | run $rodinia_path/cfd 20 20 ./euler3d ../data/fvcorr.domn.097K 36 | run $rodinia_path/huffman 20 20 ./pavle ../data/test1024_H2.206587175259.in 37 | run $rodinia_path/lavaMD 100 100 ./lavaMD
-boxes1d 10 38 | run $rodinia_path/hotspot3D 20 20 ./3D 512 8 100 ../data/power_512x8 ../data/temp_512x8 output.out 39 | run $rodinia_path/streamcluster 100 100 ./sc_gpu 10 20 256 65536 65536 1000 none output.txt 1 40 | 41 | # # real applications 42 | # #darknet 43 | # run /root/gpuapps/darknet ./darknet detector test ./cfg/coco.data ./cfg/yolov4.cfg ./yolov4.weights data/dog.jpg -i 0 -thresh 0.25 44 | # # castro 45 | # run /root/gpuapps/Castro/Exec/hydro_tests/Sedov ./Castro2d.gnu.CUDA.ex inputs.2d.cyl_in_cartcoords 46 | # # barracuda 47 | # run /root/gpuapps/barracuda ./bin/barracuda aln sample_data/Saccharomyces_cerevisiae.SGD1.01.50.dna_rm.toplevel.fa sample_data/sample_reads.fastq >quicktest.sai 48 | # # pytorch 49 | # eval "$(/root/anaconda3/bin/conda shell.bash hook)" 50 | # conda activate pytorch 51 | 52 | # cd /root/gpuapps/pytorch_vp/pytorch 53 | # rm -rf gvprof* 54 | # rm w.txt 55 | # ln -s w_resnet.txt w.txt 56 | # run /root/gpuapps/pytorch_vp/pytorch python3 1-resnet50-unit.py 57 | # rm w.txt 58 | # ln -s w_deepwave.txt w.txt 59 | # run /root/gpuapps/pytorch_vp/pytorch python3 2-deepwave-unit.py 60 | # rm w.txt 61 | # ln -s w_bert.txt w.txt 62 | # run /root/gpuapps/pytorch_vp/pytorch python3 3-bert-unit.py 63 | 64 | # conda deactivate 65 | 66 | # # namd 67 | # run /root/gpuapps/NAMD/Linux-x86_64-g++ ./namd3 ../src/alanin 68 | 69 | # # qmcpack 70 | 71 | # #lammps 72 | # run /root/gpuapps/lammps/bench ../build/lmp -k on g 1 -sf kk -in in.lj 73 | -------------------------------------------------------------------------------- /python/test.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from tests.data_flow_test import DataFlowTest 4 | from tests.redundancy_test import RedundancyTest 5 | from tests.value_pattern_test import ValuePatternTest 6 | from tests.instruction_test import InstructionTest 7 | from test_cases import Test 8 | 9 | parser = argparse.ArgumentParser( 10 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 11 | parser.add_argument('-c', '--case', help='case name') 12 | parser.add_argument('-m', '--mode', choices=['data_flow', 'redundancy', 13 | 'value_pattern', 'instruction', 'all'], default='all', help='mode name') 14 | parser.add_argument('-a', '--arch', choices=['sm_70', 'sm_72', 15 | 'sm_75', 'sm_80', 'sm_85'], default='sm_70', help='gpu arch name') 16 | args = parser.parse_args() 17 | 18 | tests = [] 19 | 20 | if args.mode == 'data_flow' or args.mode == 'all': 21 | tests.append(DataFlowTest(args.arch)) 22 | 23 | if args.mode == 'redundancy' or args.mode == 'all': 24 | tests.append(RedundancyTest(args.arch)) 25 | 26 | if args.mode == 'value_pattern' or args.mode == 'all': 27 | tests.append(ValuePatternTest(args.arch)) 28 | 29 | if args.mode == 'instruction' or args.mode == 'all': 30 | tests.append(InstructionTest(args.arch)) 31 | 32 | for test in tests: 33 | print("{}...".format(test.name())) 34 | if args.case is None: 35 | # Test all cases 36 | choice = Test.cases.keys() 37 | else: 38 | choice = [args.case] 39 | test.setup(choice) 40 | test.run() 41 | -------------------------------------------------------------------------------- /python/test_cases.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import os 3 | 4 | from utils import cleanup 5 | 6 | 7 | class Test(object): 8 | Case = namedtuple('Case', ['path', 'versions', 9 | 'command', 'options', 'cleanup']) 10 | cases = dict() 11 | 12 | # unit test cases 13 | 
cases['vectorAdd.f128'] = Case( 14 | path='samples/vectorAdd.f128', versions=[], command='./vectorAdd', options=[], cleanup=True) 15 | cases['op_graph_simple'] = Case( 16 | path='samples/op_graph_simple', versions=[], command='./main', options=[], cleanup=True) 17 | cases['op_pattern_simple'] = Case( 18 | path='samples/op_pattern_simple', versions=[], command='./main', options=[], cleanup=True) 19 | cases['stress'] = Case(path='samples/stress', versions=[], 20 | command='./stress', options=[], cleanup=True) 21 | 22 | # sample test cases 23 | cases['bfs'] = Case(path='samples/bfs', command='./bfs', versions=['vp-opt1', 24 | 'vp-opt2', 'vp-opt'], options=['../data/graph1MW_6.txt'], cleanup=True) 25 | cases['backprop'] = Case(path='samples/backprop', command='./backprop', versions=[ 26 | 'vp-opt1', 'vp-opt2', 'vp-opt'], options=['65536'], cleanup=True) 27 | cases['cfd'] = Case(path='samples/cfd', command='./euler3d', versions=['vp-opt1', 28 | 'vp-opt2', 'vp-opt'], options=['../data/fvcorr.domn.097K'], cleanup=True) 29 | cases['hotspot'] = Case(path='samples/hotspot', command='./hotspot', versions=['vp-opt'], options=[ 30 | '512', '2', '2', '../data/temp_512', '../data/power_512', 'output.out'], cleanup=True) 31 | cases['hotspot3D'] = Case(path='samples/hotspot3D', command='./3D', versions=['vp-opt'], options=[ 32 | '512', '8', '100', '../data/power_512x8', '../data/temp_512x8', 'output.out'], cleanup=True) 33 | cases['huffman'] = Case(path='samples/huffman', command='./pavle', versions=[ 34 | 'vp-opt'], options=['../data/test1024_H2.206587175259.in'], cleanup=True) 35 | cases['lavaMD'] = Case(path='samples/lavaMD', command='./lavaMD', 36 | versions=['vp-opt'], options=['-boxes1d', '10'], cleanup=True) 37 | cases['particlefilter'] = Case(path='samples/particlefilter', command='./particlefilter_float', versions=[ 38 | 'vp-opt'], options=['-x', '128', '-y', '128', '-z', '10', '-np', '1000'], cleanup=True) 39 | cases['pathfinder'] = Case(path='samples/pathfinder', command='./pathfinder', 40 | versions=['vp-opt'], options=['100000', '100', '20'], cleanup=True) 41 | cases['srad'] = Case(path='samples/srad_v1', command='./srad', versions=['vp-opt1', 42 | 'vp-opt2', 'vp-opt'], options=['10', '0.5', '502', '458'], cleanup=True) 43 | cases['streamcluster'] = Case(path='samples/streamcluster', command='./sc_gpu', versions=['vp-opt'], options=[ 44 | '10', '20', '256', '65536', '65536', '1000', 'none', 'output.txt', '1'], cleanup=True) 45 | 46 | # application cases 47 | cases['barracuda'] = Case(path='samples/barracuda', command='./barracuda', versions=['vp-opt'], 48 | options=['aln', 'sample_data/Saccharomyces_cerevisiae.SGD1.01.50.dna_rm.toplevel.fa', 49 | 'sample_data/sample_reads.fastq', '>', 'quicktest.sai'], cleanup=False) 50 | 51 | cases['castro'] = Case(path='samples/Castro/Exec/hydro_tests/Sedov', command='Castro2d.gnu.CUDA.ex', versions=['vp-opt'], 52 | options=['./inputs.2d.cyl_in_cartcoords'], cleanup=False) 53 | 54 | cases['darknet'] = Case(path='samples/darknet', command='./darknet', versions=['vp-opt'], 55 | options=['detector', 'test', './cfg/coco.data', './cfg/yolov4.cfg', 56 | './yolov4.weights', 'data/dog.jpg', '-i', '0', '-thresh', '0.25'], cleanup=False) 57 | 58 | cases['deepwave'] = Case(path='samples/deepwave', command='./Deepwave_SEAM_example1.py', versions=['vp-opt'], 59 | options=[], cleanup=False) 60 | 61 | cases['namd'] = Case(path='samples/NAMD/Linux-x86_64-g++', command='./namd3', 62 | versions=['vp-opt'], options=['../alain'], cleanup=False) 63 | 64 | cases['qmcpack'] = 
Case(path='samples/qmcpack/workspace/NiO/dmc-a4-e48-batched_driver-DU8', 65 | command='../../../build/bin/qmcpack', versions=['vp-opt'], options=['./NiO-fcc-S1-dmc.xml'], cleanup=False) 66 | 67 | def __init__(self, name, arch, version=None): 68 | self._name = name 69 | self._arch = arch 70 | self._version = version 71 | self._configs = dict() 72 | 73 | def name(self): 74 | return self._name 75 | 76 | def setup(self, choices): 77 | pass 78 | 79 | def _run_impl(self, case_name, version): 80 | pass 81 | 82 | def run(self, iterations=1): 83 | cwd = os.getcwd() 84 | 85 | for i in range(iterations): 86 | for case_name, case in Test.cases.items(): 87 | if case_name not in self._configs: 88 | continue 89 | 90 | os.chdir(case.path) 91 | if i == 0 and case.cleanup: 92 | cleanup(self._arch) 93 | 94 | self._run_impl(case_name, None) 95 | 96 | os.chdir(cwd) 97 | 98 | if self._version is None: 99 | continue 100 | 101 | for version in case.versions: 102 | if version == self._version or self._version == 'all': 103 | os.chdir(case.path + '-' + version) 104 | if i == 0 and case.cleanup: 105 | cleanup(self._arch) 106 | 107 | self._run_impl(case_name, version) 108 | 109 | os.chdir(cwd) 110 | -------------------------------------------------------------------------------- /python/tests/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/* 2 | -------------------------------------------------------------------------------- /python/tests/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | sys.path.append("../") 4 | -------------------------------------------------------------------------------- /python/tests/data_flow_test.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import subprocess 3 | import os 4 | import sys 5 | 6 | import pygraphviz as pgv 7 | 8 | from test_cases import Test 9 | from utils import pipe_read 10 | 11 | 12 | class DataFlowTest(Test): 13 | Config = namedtuple('Config', ['files', 'nodes', 'edges']) 14 | 15 | def __init__(self, arch): 16 | super().__init__('DataFlowTest', arch) 17 | 18 | def setup(self, choices): 19 | for choice in choices: 20 | if choice == 'op_graph_simple': 21 | self._configs[choice] = DataFlowTest.Config(files=['data_flow.dot'], 22 | nodes=[17], 23 | edges=[20]) 24 | elif choice == 'bfs': 25 | self._configs[choice] = DataFlowTest.Config(files=['data_flow.dot'], 26 | nodes=[23], 27 | edges=[41]) 28 | 29 | def _run_impl(self, case_name, version): 30 | if case_name not in self._configs: 31 | return 32 | 33 | command = Test.cases[case_name].command 34 | options = Test.cases[case_name].options 35 | path = Test.cases[case_name].path 36 | 37 | pipe_read(['gvprof', '-cfg', '-e', 'data_flow', 38 | command] + options) 39 | 40 | files = self._configs[case_name].files 41 | nodes = self._configs[case_name].nodes 42 | edges = self._configs[case_name].edges 43 | 44 | # Just count the number of nodes and edges, 45 | # redundancy and overwrite is difficult for autotest 46 | for i, f in enumerate(files): 47 | f = 'gvprof-database/' + f 48 | agraph = pgv.AGraph(f, strict=False) 49 | correct = True 50 | if len(agraph.nodes()) != nodes[i]: 51 | print('Error {} nodes (true: {} vs test: {})'.format( 52 | path, nodes[i], len(agraph.nodes()))) 53 | correct = False 54 | if len(agraph.edges()) != edges[i]: 55 | print('Error {} edges (true: {} vs test: {})'.format( 56 | path, edges[i], 
len(agraph.edges()))) 57 | correct = False 58 | if correct is True: 59 | print('Pass ' + path) 60 | -------------------------------------------------------------------------------- /python/tests/instruction_test.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import os 3 | import sys 4 | 5 | from test_cases import Test 6 | from utils import pipe_read 7 | 8 | 9 | class InstructionTest(Test): 10 | Config = namedtuple('Config', ['insts']) 11 | 12 | def __init__(self, arch): 13 | super().__init__('InstructionTest', arch) 14 | 15 | def setup(self, choices): 16 | for choice in choices: 17 | if choice == 'op_pattern_simple': 18 | self._configs[choice] = InstructionTest.Config(insts={ 19 | 'sm_70': 20 | ['FUNC: 18, PC: 0xd0, ACCESS_KIND: INTEGER,v:32,u:32', 21 | 'FUNC: 19, PC: 0xc0, ACCESS_KIND: INTEGER,v:32,u:32', 22 | 'FUNC: 20, PC: 0xf0, ACCESS_KIND: UNKNOWN,v:32,u:32', 23 | 'FUNC: 21, PC: 0x250, ACCESS_KIND: FLOAT,v:64,u:64', 24 | 'FUNC: 22, PC: 0xe0, ACCESS_KIND: UNKNOWN,v:64,u:64', 25 | 'FUNC: 23, PC: 0xe0, ACCESS_KIND: FLOAT,v:64,u:64', 26 | 'FUNC: 23, PC: 0x100, ACCESS_KIND: FLOAT,v:64,u:64'], 27 | 'sm_75': 28 | ['FUNC: 17, PC: 0xb0, ACCESS_KIND: INTEGER,v:32,u:32', 29 | 'FUNC: 18, PC: 0x20, ACCESS_KIND: INTEGER,v:64,u:32', 30 | 'FUNC: 18, PC: 0xe0, ACCESS_KIND: UNKNOWN,v:32,u:32', 31 | 'FUNC: 19, PC: 0x20, ACCESS_KIND: INTEGER,v:64,u:32', 32 | 'FUNC: 19, PC: 0x240, ACCESS_KIND: FLOAT,v:64,u:64', 33 | 'FUNC: 20, PC: 0x20, ACCESS_KIND: INTEGER,v:64,u:32', 34 | 'FUNC: 20, PC: 0xd0, ACCESS_KIND: UNKNOWN,v:64,u:64', 35 | 'FUNC: 21, PC: 0x20, ACCESS_KIND: INTEGER,v:64,u:32', 36 | 'FUNC: 21, PC: 0xd0, ACCESS_KIND: FLOAT,v:64,u:64', 37 | 'FUNC: 21, PC: 0xf0, ACCESS_KIND: FLOAT,v:64,u:64'], 38 | 'sm_80': 39 | ['FUNC: 17, PC: 0xa0, ACCESS_KIND: INTEGER,v:64,u:32', 40 | 'FUNC: 17, PC: 0xc0, ACCESS_KIND: INTEGER,v:32,u:32', 41 | 'FUNC: 18, PC: 0x20, ACCESS_KIND: INTEGER,v:64,u:32', 42 | 'FUNC: 18, PC: 0xc0, ACCESS_KIND: INTEGER,v:64,u:32', 43 | 'FUNC: 18, PC: 0xe0, ACCESS_KIND: UNKNOWN,v:32,u:32', 44 | 'FUNC: 19, PC: 0x20, ACCESS_KIND: INTEGER,v:64,u:32', 45 | 'FUNC: 19, PC: 0xe0, ACCESS_KIND: INTEGER,v:64,u:32', 46 | 'FUNC: 19, PC: 0x230, ACCESS_KIND: FLOAT,v:64,u:64', 47 | 'FUNC: 20, PC: 0x20, ACCESS_KIND: INTEGER,v:64,u:32', 48 | 'FUNC: 20, PC: 0xb0, ACCESS_KIND: INTEGER,v:64,u:32', 49 | 'FUNC: 20, PC: 0xd0, ACCESS_KIND: UNKNOWN,v:64,u:32', 50 | 'FUNC: 21, PC: 0x20, ACCESS_KIND: INTEGER,v:64,u:32', 51 | 'FUNC: 21, PC: 0xb0, ACCESS_KIND: INTEGER,v:64,u:32', 52 | 'FUNC: 21, PC: 0xd0, ACCESS_KIND: FLOAT,v:64,u:64', 53 | 'FUNC: 21, PC: 0xf0, ACCESS_KIND: FLOAT,v:64,u:64'] 54 | }) 55 | elif choice == 'bfs': 56 | self._configs[choice] = InstructionTest.Config(insts={ 57 | 'sm_70': 58 | ['FUNC: 10, PC: 0xa0, ACCESS_KIND: INTEGER,v:8,u:8', 59 | 'FUNC: 10, PC: 0x170, ACCESS_KIND: INTEGER,v:8,u:8', 60 | 'FUNC: 10, PC: 0x180, ACCESS_KIND: INTEGER,v:8,u:8', 61 | 'FUNC: 10, PC: 0x190, ACCESS_KIND: INTEGER,v:8,u:8', 62 | 'FUNC: 10, PC: 0x1a0, ACCESS_KIND: UNKNOWN,v:8,u:8', 63 | 'FUNC: 11, PC: 0x90, ACCESS_KIND: INTEGER,v:8,u:8', 64 | 'FUNC: 11, PC: 0xd0, ACCESS_KIND: UNKNOWN,v:8,u:8', 65 | 'FUNC: 11, PC: 0xf0, ACCESS_KIND: INTEGER,v:32,u:32', 66 | 'FUNC: 11, PC: 0x120, ACCESS_KIND: INTEGER,v:32,u:32', 67 | 'FUNC: 11, PC: 0x1b0, ACCESS_KIND: INTEGER,v:32,u:32', 68 | 'FUNC: 11, PC: 0x1f0, ACCESS_KIND: INTEGER,v:8,u:8', 69 | 'FUNC: 11, PC: 0x210, ACCESS_KIND: INTEGER,v:32,u:32', 70 | 'FUNC: 11, PC: 0x290, ACCESS_KIND: 
INTEGER,v:32,u:32', 71 | 'FUNC: 11, PC: 0x2a0, ACCESS_KIND: INTEGER,v:8,u:8', 72 | 'FUNC: 11, PC: 0x2b0, ACCESS_KIND: INTEGER,v:32,u:32', 73 | 'FUNC: 11, PC: 0x2c0, ACCESS_KIND: INTEGER,v:32,u:32'], 74 | 'sm_75': 75 | ['FUNC: 10, PC: 0x70, ACCESS_KIND: INTEGER,v:64,u:64', 76 | 'FUNC: 10, PC: 0x80, ACCESS_KIND: INTEGER,v:8,u:8', 77 | 'FUNC: 10, PC: 0xc0, ACCESS_KIND: INTEGER,v:64,u:64', 78 | 'FUNC: 10, PC: 0xd0, ACCESS_KIND: INTEGER,v:64,u:64', 79 | 'FUNC: 10, PC: 0x100, ACCESS_KIND: UNKNOWN,v:8,u:8', 80 | 'FUNC: 10, PC: 0x110, ACCESS_KIND: INTEGER,v:64,u:64', 81 | 'FUNC: 10, PC: 0x120, ACCESS_KIND: UNKNOWN,v:8,u:8', 82 | 'FUNC: 10, PC: 0x130, ACCESS_KIND: INTEGER,v:64,u:64', 83 | 'FUNC: 10, PC: 0x140, ACCESS_KIND: UNKNOWN,v:8,u:8', 84 | 'FUNC: 10, PC: 0x150, ACCESS_KIND: UNKNOWN,v:8,u:8', 85 | 'FUNC: 11, PC: 0x70, ACCESS_KIND: INTEGER,v:64,u:64', 86 | 'FUNC: 11, PC: 0x80, ACCESS_KIND: INTEGER,v:8,u:8', 87 | 'FUNC: 11, PC: 0xd0, ACCESS_KIND: INTEGER,v:64,u:64', 88 | 'FUNC: 11, PC: 0x100, ACCESS_KIND: UNKNOWN,v:8,u:8', 89 | 'FUNC: 11, PC: 0x110, ACCESS_KIND: INTEGER,v:32,u:32', 90 | 'FUNC: 11, PC: 0x140, ACCESS_KIND: INTEGER,v:32,u:32', 91 | 'FUNC: 11, PC: 0x1c0, ACCESS_KIND: INTEGER,v:32,u:32', 92 | 'FUNC: 11, PC: 0x1d0, ACCESS_KIND: INTEGER,v:64,u:64', 93 | 'FUNC: 11, PC: 0x1f0, ACCESS_KIND: INTEGER,v:8,u:8', 94 | 'FUNC: 11, PC: 0x210, ACCESS_KIND: INTEGER,v:32,u:32', 95 | 'FUNC: 11, PC: 0x240, ACCESS_KIND: INTEGER,v:64,u:64', 96 | 'FUNC: 11, PC: 0x280, ACCESS_KIND: INTEGER,v:32,u:32', 97 | 'FUNC: 11, PC: 0x290, ACCESS_KIND: UNKNOWN,v:8,u:8', 98 | 'FUNC: 11, PC: 0x2a0, ACCESS_KIND: INTEGER,v:32,u:32', 99 | 'FUNC: 11, PC: 0x2b0, ACCESS_KIND: INTEGER,v:32,u:32'], 100 | 'sm_80': 101 | ['FUNC: 10, PC: 0x70, ACCESS_KIND: INTEGER,v:64,u:32', 102 | 'FUNC: 10, PC: 0xa0, ACCESS_KIND: INTEGER,v:8,u:8', 103 | 'FUNC: 10, PC: 0x150, ACCESS_KIND: INTEGER,v:8,u:8', 104 | 'FUNC: 10, PC: 0x180, ACCESS_KIND: INTEGER,v:8,u:8', 105 | 'FUNC: 10, PC: 0x190, ACCESS_KIND: INTEGER,v:8,u:8', 106 | 'FUNC: 10, PC: 0x1a0, ACCESS_KIND: UNKNOWN,v:8,u:8', 107 | 'FUNC: 11, PC: 0x70, ACCESS_KIND: INTEGER,v:64,u:32', 108 | 'FUNC: 11, PC: 0x90, ACCESS_KIND: INTEGER,v:8,u:8', 109 | 'FUNC: 11, PC: 0xd0, ACCESS_KIND: UNKNOWN,v:8,u:8', 110 | 'FUNC: 11, PC: 0xf0, ACCESS_KIND: INTEGER,v:32,u:32', 111 | 'FUNC: 11, PC: 0x120, ACCESS_KIND: INTEGER,v:32,u:32', 112 | 'FUNC: 11, PC: 0x1a0, ACCESS_KIND: INTEGER,v:32,u:32', 113 | 'FUNC: 11, PC: 0x1e0, ACCESS_KIND: INTEGER,v:8,u:8', 114 | 'FUNC: 11, PC: 0x200, ACCESS_KIND: INTEGER,v:32,u:32', 115 | 'FUNC: 11, PC: 0x280, ACCESS_KIND: INTEGER,v:32,u:32', 116 | 'FUNC: 11, PC: 0x290, ACCESS_KIND: INTEGER,v:8,u:8', 117 | 'FUNC: 11, PC: 0x2a0, ACCESS_KIND: INTEGER,v:32,u:32', 118 | 'FUNC: 11, PC: 0x2b0, ACCESS_KIND: INTEGER,v:32,u:32'] 119 | }) 120 | 121 | def _run_impl(self, case_name, version): 122 | command = Test.cases[case_name].command 123 | options = Test.cases[case_name].options 124 | path = Test.cases[case_name].path 125 | 126 | pipe_read(['gvprof', '-cfg', '-e', 'data_flow', command] + options) 127 | 128 | files = os.listdir('./gvprof-measurements/structs/nvidia/') 129 | 130 | insts = self._configs[case_name].insts 131 | 132 | for f in files: 133 | if f.find('.inst') != -1: 134 | bufs = pipe_read( 135 | ['redshow_parser', './gvprof-measurements/structs/nvidia/' + f]).decode('utf-8').splitlines() 136 | 137 | correct = True 138 | for n, buf in enumerate(bufs): 139 | if buf != insts[self._arch][n]: 140 | print('Error {} line {} (true: {} vs test: {})'.format( 141 | path, n, 
insts[self._arch][n], buf)) 142 | correct = False 143 | if correct is True: 144 | print('Pass ' + path + ' ' + f) 145 | -------------------------------------------------------------------------------- /python/tests/redundancy_test.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import subprocess 3 | import os 4 | import sys 5 | 6 | from test_cases import Test 7 | from utils import pipe_read 8 | 9 | 10 | class RedundancyTest(Test): 11 | Config = namedtuple('Config', ['spatial_read_files', 'spatial_read_reds', 'spatial_write_files', 'spatial_write_reds', 12 | 'temporal_read_files', 'temporal_read_reds', 'temporal_write_files', 'temporal_write_reds', 'total', 'sampling', 'tolerate']) 13 | 14 | def __init__(self, arch): 15 | super().__init__('RedundancyTest', arch) 16 | 17 | def setup(self, choices): 18 | for choice in choices: 19 | if choice == 'vectorAdd.f128': 20 | self._configs[choice] = RedundancyTest.Config( 21 | spatial_read_files=['spatial_read_t0.csv'], 22 | spatial_read_reds=[3], 23 | spatial_write_files=['spatial_write_t0.csv'], 24 | spatial_write_reds=[1], 25 | temporal_read_files=['temporal_read_t0.csv'], 26 | temporal_read_reds=[0], 27 | temporal_write_files=['temporal_write_t0.csv'], 28 | temporal_write_reds=[0], 29 | total=[12], 30 | sampling=0, 31 | tolerate=0.0) 32 | elif choice == 'bfs': 33 | self._configs[choice] = RedundancyTest.Config( 34 | spatial_read_files=['spatial_read_t0.csv'], 35 | spatial_read_reds=[27707987], 36 | spatial_write_files=['spatial_write_t0.csv'], 37 | spatial_write_reds=[7997516], 38 | temporal_read_files=['temporal_read_t0.csv'], 39 | temporal_read_reds=[5603846], 40 | temporal_write_files=['temporal_write_t0.csv'], 41 | temporal_write_reds=[0], 42 | total=[52653451], 43 | sampling=0, 44 | tolerate=0.02) 45 | elif choice == 'backprop': 46 | self._configs[choice] = [ 47 | RedundancyTest.Config( 48 | spatial_read_files=['spatial_read_t0.csv'], 49 | spatial_read_reds=[4194507], 50 | spatial_write_files=['spatial_write_t0.csv'], 51 | spatial_write_reds=[1048623], 52 | temporal_read_files=['temporal_read_t0.csv'], 53 | temporal_read_reds=[3149872], 54 | temporal_write_files=['temporal_write_t0.csv'], 55 | temporal_write_reds=[0], 56 | total=[19988592], 57 | sampling=0, 58 | tolerate=0.01), 59 | RedundancyTest.Config( 60 | spatial_read_files=['spatial_read_t0.csv'], 61 | spatial_read_reds=[84039], 62 | spatial_write_files=['spatial_write_t0.csv'], 63 | spatial_write_reds=[21009], 64 | temporal_read_files=['temporal_read_t0.csv'], 65 | temporal_read_reds=[63058], 66 | temporal_write_files=['temporal_write_t0.csv'], 67 | temporal_write_reds=[0], 68 | total=[400160], 69 | sampling=50, 70 | tolerate=0.05)] 71 | 72 | def _run_impl(self, case_name, version): 73 | runs = self._configs[case_name] 74 | if not isinstance(runs, list): 75 | runs = [runs] 76 | 77 | command = Test.cases[case_name].command 78 | options = Test.cases[case_name].options 79 | path = Test.cases[case_name].path 80 | 81 | for run in runs: 82 | sampling = '' 83 | if run.sampling != 0: 84 | sampling = 'sampling' 85 | pipe_read(['gvprof', '-cfg', '-e', 'redundancy@' + 86 | str(run.sampling), command] + options) 87 | else: 88 | pipe_read(['gvprof', '-cfg', '-e', 'redundancy', 89 | command] + options) 90 | 91 | def redundancy_compare(red_files, true_reds): 92 | for i, red_file in enumerate(red_files): 93 | red_file = 'gvprof-database/' + red_file 94 | res = pipe_read(['tail', '-n', '1', red_file]).decode() 95 | red = 
float(res.split(',')[0]) 96 | true_red = float(true_reds[i]) 97 | epsilon = red if true_red == 0.0 else abs( 98 | red - true_red) / true_red 99 | if epsilon > run.tolerate: 100 | print('Error {} {}: (true: {} vs test: {})'.format( 101 | path, red_file, true_red, red)) 102 | else: 103 | print('Pass ' + path + ' ' + red_file + ' ' + sampling) 104 | 105 | redundancy_compare(run.spatial_read_files, run.spatial_read_reds) 106 | redundancy_compare(run.spatial_write_files, run.spatial_write_reds) 107 | redundancy_compare(run.temporal_read_files, run.temporal_read_reds) 108 | redundancy_compare(run.temporal_write_files, 109 | run.temporal_write_reds) 110 | -------------------------------------------------------------------------------- /python/tests/value_pattern_test.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import subprocess 3 | import os 4 | import sys 5 | 6 | from test_cases import Test 7 | from utils import pipe_read 8 | 9 | 10 | class ValuePatternTest(Test): 11 | Config = namedtuple('Config', ['files', 'op_counts', 'kernel_patterns']) 12 | 13 | def __init__(self, arch): 14 | super().__init__('ValuePatternTest', arch) 15 | 16 | def setup(self, choices): 17 | for choice in choices: 18 | if choice == 'op_pattern_simple': 19 | self._configs[choice] = ValuePatternTest.Config( 20 | files=['value_pattern_t0.csv'], 21 | op_counts=[[[500, 500, 1, 500], [500, 500, 1, 500]], 22 | [[250, 250, 1, 250], [250, 250, 1, 250]], 23 | [[500, 500, 500, 500], [500, 500, 1, 500]], 24 | [[500, 500, 1, 500], [500, 500, 1, 500]], 25 | [[500, 500, 500, 500]], 26 | [[1000, 1000, 20, 1000]]], 27 | kernel_patterns=[['Redundant Zeros', 'Single Value'], 28 | ['Redundant Zeros', 'Redundant Zeros'], 29 | ['Type Overuse', 'Single Value'], 30 | ['Redundant Zeros', 'Single Value'], 31 | ['Structured'], ['Dense Value']]) 32 | elif choice == 'bfs': 33 | self._configs[choice] = ValuePatternTest.Config( 34 | files=['value_pattern_t0.csv'], 35 | op_counts=[[[5861406, 2000000, 1000014, 5861406], 36 | [5999970, 5999970, 1000000, 5999970], 37 | [12000000, 0, 0, 0], 38 | [5999970, 32710, 2, 119381], 39 | [1930703, 633664, 11, 1930703], 40 | [1000000, 1000000, 1, 1000000], 41 | [0, 0, 0, 0], 42 | [1000000, 1000000, 1, 1000000], 43 | [1930703, 999999, 1, 1930703], 44 | [1930703, 999999, 11, 1930703]], 45 | [[12000000, 1, 1, 12], 46 | [999999, 999999, 1, 999999], 47 | [999999, 999999, 1, 999999], 48 | [0, 0, 0, 0], 49 | [999999, 999999, 1, 999999], 50 | [999999, 999999, 1, 999999], 51 | [999999, 1, 1, 999999]]], 52 | kernel_patterns=[['No Pattern', 'No Pattern', 'No Pattern', 53 | 'No Pattern', 'Dense Value', 'Inappropriate', 54 | 'Dense Value', 'Redundant Zeros', 'Single Value', 55 | 'Dense Value'], 56 | ['No Pattern', 'Single Value', 'Inappropriate', 57 | 'Dense Value', 'Redundant Zeros', 'Single Value', 58 | 'Single Value']] 59 | ) 60 | 61 | def _run_impl(self, case_name, version): 62 | def check(op_counts, kernel_patterns, buf: str): 63 | lines = buf.splitlines() 64 | order = -1 65 | count = -1 66 | pattern = -1 67 | find_pattern = False 68 | for n, line in enumerate(lines): 69 | count_line = False 70 | pattern_line = False 71 | dist_line = False 72 | if line.find('kernel id') != -1: 73 | order += 1 74 | pattern = -1 75 | elif line.find('array id:') != -1: 76 | count = -1 77 | pattern += 1 78 | find_pattern = False 79 | elif line.find('count:') != -1: 80 | count += 1 81 | count_line = True 82 | elif line.find(' * ') != -1: 83 | pattern_line = True 84 
| elif line.find('TOP') != -1:
85 | dist_line = True
86 | if count_line is True:
87 | v = int(line.split(':')[1])
88 | if op_counts[order][pattern][count] != v:
89 | return False, ' line {} count error: (true: {} vs test: {})'.format(n, op_counts[order][pattern][count], v)
90 | elif pattern_line is True:
91 | if line.find(kernel_patterns[order][pattern]) != -1:  # find() returns -1 when absent; the bare truthiness test was inverted
92 | find_pattern = True
93 | elif dist_line is True:
94 | if find_pattern is False:
95 | return False, ' line {} pattern error: (true: {})'.format(n, kernel_patterns[order][pattern])
96 | return True, ''
97 |
98 | command = Test.cases[case_name].command
99 | options = Test.cases[case_name].options
100 | path = Test.cases[case_name].path
101 |
102 | pipe_read(['gvprof', '-cfg', '-e', 'value_pattern', command] + options)
103 |
104 | files = self._configs[case_name].files
105 | op_counts = self._configs[case_name].op_counts
106 | kernel_patterns = self._configs[case_name].kernel_patterns
107 |
108 | for f in files:
109 | buf = pipe_read(
110 | ['cat', 'gvprof-database/' + f]).decode('utf-8')
111 | res, msg = check(op_counts, kernel_patterns, buf)
112 | if res is False:
113 | print('Error ' + path + ' ' + msg)
114 | else:
115 | print('Pass ' + path + ' ' + f)
116 |
--------------------------------------------------------------------------------
/python/utils.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | import csv
3 |
4 |
5 | def pipe_read(command, debug=False):
6 | process = subprocess.Popen(command,
7 | stdout=subprocess.PIPE,
8 | stderr=subprocess.PIPE)
9 | stdout, stderr = process.communicate()
10 | if debug is True:
11 | print(stdout)
12 | print(stderr)
13 | return stdout
14 |
15 |
16 | def cleanup(arch):
17 | pipe_read(['make', 'clean'])
18 | if arch is not None:
19 | pipe_read(['make', 'GPU_ARCH=-arch {}'.format(arch)])  # format(), not join(): substitute the arch into the make variable
20 | else:
21 | pipe_read(['make'])
22 |
23 |
24 | def nsys_profile(command, kernels):
25 | pipe_read(['nsys', 'profile', '-f', 'true', '-o', 'tmp'] + command)
26 | pipe_read(['nsys', 'stats', '--report', 'gpukernsum', '--report', 'gpumemtimesum',
27 | '--format', 'csv', '-o', 'tmp', '--force-overwrite', './tmp.qdrep'])
28 |
29 | kernel_times = dict()
30 |
31 | gpu_kernel_time = 0.0
32 |
33 | with open('tmp_gpukernsum.csv', newline='') as csvfile:
34 | spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
35 |
36 | first_row = True
37 | for row in spamreader:
38 | if first_row is True:
39 | first_row = False
40 | continue
41 |
42 | time = row[1]
43 | kernel_args_name = row[6].replace('"', '').replace('void ', '')
44 | gpu_kernel_time += float(time)
45 |
46 | for kernel_name, template in kernels:
47 | if template is True:
48 | match_kernel_name = kernel_name
49 | else:
50 | match_kernel_name = kernel_name + '('
51 | if kernel_args_name.startswith(match_kernel_name) is True:
52 | kernel_times[kernel_name] = float(time)
53 | break
54 |
55 | gpu_mem_time = 0.0
56 |
57 | with open('tmp_gpumemtimesum.csv', newline='') as csvfile:
58 | spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
59 |
60 | first_row = True
61 | for row in spamreader:
62 | if first_row is True:
63 | first_row = False
64 | else:
65 | gpu_mem_time += float(row[1])
66 |
67 | return kernel_times, gpu_kernel_time, gpu_mem_time
68 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pygraphviz
2 | numpy
3 | argparse
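Taken together, the helpers above are enough to drive a profiling run by hand. A minimal sketch, assuming gvprof is on PATH and a built ./vectorAdd binary in the current directory (the binary name is only illustrative):

    from utils import pipe_read, nsys_profile

    # Profile once under GVProf, exactly as the tests above do;
    # pipe_read returns the captured stdout as bytes.
    out = pipe_read(['gvprof', '-cfg', '-e', 'data_flow', './vectorAdd']).decode('utf-8')

    # Time the same binary with Nsight Systems. The boolean in each
    # (name, template) pair tells nsys_profile whether to match the kernel
    # name as a template prefix (True) or literally as 'name(' (False).
    kernel_times, gpu_kernel_time, gpu_mem_time = nsys_profile(
        ['./vectorAdd'], [('vectorAdd', False)])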
--------------------------------------------------------------------------------
/src/gpu-analysis.cu:
--------------------------------------------------------------------------------
1 | #include "gpu-patch.h"
2 | #include "gpu-queue.h"
3 | #include "utils.h"
4 |
5 | #include <cub/cub.cuh>
6 |
7 | #define GPU_ANALYSIS_DEBUG 0
8 |
9 | #if GPU_ANALYSIS_DEBUG
10 | #define PRINT(...) \
11 | if (threadIdx.x == 0 && blockIdx.x == 0) { \
12 | printf(__VA_ARGS__); \
13 | }
14 | #define PRINT_ALL(...) \
15 | printf(__VA_ARGS__)
16 | #define PRINT_RECORDS(buffer) \
17 | __syncthreads(); \
18 | if (threadIdx.x == 0) { \
19 | gpu_patch_analysis_address_t *records = (gpu_patch_analysis_address_t *)buffer->records; \
20 | for (uint32_t i = 0; i < buffer->head_index; ++i) { \
21 | printf("gpu analysis-> merged <%p, %p> (%p)\n", records[i].start, records[i].end, records[i].end - records[i].start); \
22 | } \
23 | } \
24 | __syncthreads();
25 | #else
26 | #define PRINT(...)
27 | #define PRINT_ALL(...)
28 | #define PRINT_RECORDS(buffer)
29 | #endif
30 |
31 | #define MAX_U64 (0xFFFFFFFFFFFFFFFF)
32 | #define MAX_U32 (0xFFFFFFFF)
33 |
34 | static
35 | __device__
36 | void
37 | interval_compact
38 | (
39 | gpu_patch_buffer_t *patch_buffer,
40 | gpu_patch_buffer_t *read_buffer,
41 | gpu_patch_buffer_t *write_buffer
42 | )
43 | {
44 | auto warp_index = blockDim.x / GPU_PATCH_WARP_SIZE * blockIdx.x + threadIdx.x / GPU_PATCH_WARP_SIZE;
45 | auto num_warps = blockDim.x / GPU_PATCH_WARP_SIZE;
46 | auto laneid = get_laneid();
47 | gpu_patch_record_address_t *records = (gpu_patch_record_address_t *)patch_buffer->records;
48 | gpu_patch_analysis_address_t *read_records = (gpu_patch_analysis_address_t *)read_buffer->records;
49 | gpu_patch_analysis_address_t *write_records = (gpu_patch_analysis_address_t *)write_buffer->records;
50 |
51 | PRINT("gpu analysis->full: %u, analysis: %u, head_index: %u, tail_index: %u, size: %u, num_threads: %u",
52 | patch_buffer->full, patch_buffer->analysis, patch_buffer->head_index, patch_buffer->tail_index,
53 | patch_buffer->size, patch_buffer->num_threads)
54 |
55 | for (auto iter = warp_index; iter < patch_buffer->head_index; iter += num_warps) {
56 | gpu_patch_record_address_t *record = records + iter;
57 | uint64_t address_start = record->address[laneid];
58 | if (((0x1u << laneid) & record->active) == 0) {
59 | // Inactive lanes: their address_start does not matter
60 | address_start = 0;
61 | }
62 |
63 | // Sort addresses and check if they are contiguous
64 | address_start = warp_sort(address_start, laneid);
65 |
66 | // First non-zero lane
67 | uint32_t b = ballot((int32_t)(address_start != 0));
68 | uint32_t first_laneid = __ffs(b) - 1;
69 | uint64_t interval_start = 0;
70 | interval_start = shfl_up(address_start, 1);
71 |
72 | PRINT_ALL("gpu_analysis <%d, %d>->active: %x, interval_start: %p, address_start: %p\n",
73 | blockIdx.x, threadIdx.x, record->active, interval_start, address_start);
74 |
75 | int32_t interval_start_point = 0;
76 | if (first_laneid == laneid || (address_start != 0 && (interval_start + record->size < address_start))) {
77 | interval_start_point = 1;
78 | }
79 |
80 | // In the worst case, a for loop takes 31 * 3 steps (shift + compare + loop) to find
81 | // the right end. The following procedure finds the end with ~10 instructions.
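// A toy example of the start-point test above, assuming 5 active lanes and 4-byte accesses:
//   lane:                    0      1      2      3      4
//   address_start (sorted):  0x100  0x104  0x108  0x200  0x204
// Lane 0 is the first non-zero lane, and lane 3 sees interval_start + size = 0x10c < 0x200,
// so only lanes 0 and 3 set interval_start_point: the warp contributes exactly the two
// intervals [0x100, 0x10c) and [0x200, 0x208).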
82 | // Find the end position
83 | // 00100010b
84 | // 76543210
85 | // x
86 | // laneid = 1
87 | b = ballot(interval_start_point);
88 |
89 | PRINT_ALL("gpu_analysis <%d, %d>->ballot: %x, interval_start_point: %d, address_start: %p\n",
90 | blockIdx.x, threadIdx.x, b, interval_start_point, address_start);
91 |
92 | // 00100010b
93 | // b_rev
94 | // 01000100b
95 | // 76543210
96 | // x
97 | // laneid_rev = 8 - 1 - 1 = 6
98 | uint32_t b_rev = brev(b);
99 | uint32_t laneid_rev = GPU_PATCH_WARP_SIZE - laneid - 1;
100 | uint32_t laneid_rev_mask = (1 << laneid_rev) - 1;
101 |
102 | PRINT_ALL("gpu_analysis <%d, %d>->b_rev: %x, laneid_rev: %x, laneid_rev_mask: %x\n",
103 | blockIdx.x, threadIdx.x, b_rev, laneid_rev, laneid_rev_mask);
104 |
105 | // 00000100b
106 | // 76543210
107 | // x
108 | // p_rev = 2
109 | // p = 8 - 2 - 1 = 5
110 | uint32_t p = bfind(laneid_rev_mask & b_rev);
111 | if (p != MAX_U32) {
112 | // Get the end of the interval
113 | // max(p) = 30
114 | p = GPU_PATCH_WARP_SIZE - p - 1 - 1;
115 | } else {
116 | // Get last
117 | p = GPU_PATCH_WARP_SIZE - 1;
118 | }
119 | uint64_t address_end = address_start + record->size;
120 | address_end = shfl(address_end, p);
121 |
122 | PRINT_ALL("gpu_analysis <%d, %d>->p: %d, address_start: %p, address_end: %p\n",
123 | blockIdx.x, threadIdx.x, p, address_start, address_end);
124 |
125 | if (interval_start_point == 1) {
126 | gpu_patch_analysis_address_t *address_record = NULL;
127 |
128 | if (record->flags & GPU_PATCH_READ) {
129 | address_record = read_records + gpu_queue_get(read_buffer);
130 | address_record->start = address_start;
131 | address_record->end = address_end;
132 |
133 | PRINT_ALL("gpu_analysis <%d, %d>->push address_start: %p, address_end: %p\n",
134 | blockIdx.x, threadIdx.x, address_start, address_end);
135 | gpu_queue_push(read_buffer);
136 | }
137 |
138 | if (record->flags & GPU_PATCH_WRITE) {
139 | address_record = write_records + gpu_queue_get(write_buffer);
140 | address_record->start = address_start;
141 | address_record->end = address_end;
142 |
143 | PRINT_ALL("gpu_analysis <%d, %d>->push address_start: %p, address_end: %p\n",
144 | blockIdx.x, threadIdx.x, address_start, address_end);
145 | gpu_queue_push(write_buffer);
146 | }
147 | }
148 | }
149 | }
150 |
151 |
152 | template <int THREADS, int ITEMS>
153 | static
154 | __device__
155 | int
156 | interval_merge_impl
157 | (
158 | uint64_t *d_in,
159 | uint64_t *d_out,
160 | uint32_t valid_items
161 | )
162 | {
163 | // Specialize BlockLoad type for our thread block (uses warp-striped loads for coalescing, then transposes in shared memory to a blocked arrangement)
164 | typedef cub::BlockLoad<uint64_t, THREADS, ITEMS, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoadT;
165 | // Specialize BlockStore type for our thread block (transposes from a blocked arrangement in shared memory, then uses warp-striped stores for coalescing)
166 | typedef cub::BlockStore<uint64_t, THREADS, ITEMS, cub::BLOCK_STORE_WARP_TRANSPOSE> BlockStoreT;
167 | // Specialize BlockRadixSort type for our thread block
168 | typedef cub::BlockRadixSort<uint64_t, THREADS, ITEMS, int> BlockRadixSortT;
169 | // Specialize BlockScan type for our thread block
170 | typedef cub::BlockScan<int, THREADS> BlockScanT;
171 | // Specialize BlockDiscontinuity for a 1D block of THREADS threads on type int
172 | typedef cub::BlockDiscontinuity<int, THREADS> BlockDiscontinuity;
173 | // Shared memory
174 | __shared__ union TempStorage
175 | {
176 | typename BlockLoadT::TempStorage load;
177 | typename BlockStoreT::TempStorage store;
178 | typename BlockRadixSortT::TempStorage sort;
179 | typename BlockScanT::TempStorage scan;
180 | typename BlockDiscontinuity::TempStorage disc;
181 | } temp_storage;
182 |
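// Note on the TempStorage union above: the CUB primitives run in distinct phases that are
// separated by __syncthreads() below, so their temporary storage can safely alias the same
// shared memory; a struct instead of a union would multiply the shared-memory footprint
// and reduce occupancy.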
183 | // Per-thread tile items
184 | uint64_t items[ITEMS];
185 | int interval_start_point[ITEMS];
186 | int interval_end_point[ITEMS];
187 | int interval_start_index[ITEMS];
188 | int interval_end_index[ITEMS];
189 |
190 | // Load items into a blocked arrangement
191 | BlockLoadT(temp_storage.load).Load(d_in, items, valid_items, MAX_U64);
192 | __syncthreads();
193 |
194 | for (uint32_t i = 0; i < ITEMS / 2; ++i) {
195 | if (items[i * 2] != MAX_U64) {
196 | items[i * 2] = items[i * 2] << 1;
197 | }
198 | if (items[i * 2 + 1] != MAX_U64) {
199 | items[i * 2 + 1] = (items[i * 2 + 1] << 1) + 1;
200 | }
201 | }
202 |
203 | for (uint32_t i = 0; i < ITEMS / 2; ++i) {
204 | if (items[i * 2] != MAX_U64) {
205 | interval_start_point[i * 2] = 1;
206 | } else {
207 | interval_start_point[i * 2] = 0;
208 | }
209 | if (items[i * 2 + 1] != MAX_U64) {
210 | interval_start_point[i * 2 + 1] = -1;
211 | } else {
212 | interval_start_point[i * 2 + 1] = 0;
213 | }
214 | interval_end_point[i * 2] = 0;
215 | interval_end_point[i * 2 + 1] = 0;
216 | interval_start_index[i * 2] = 0;
217 | interval_start_index[i * 2 + 1] = 0;
218 | interval_end_index[i * 2] = 0;
219 | interval_end_index[i * 2 + 1] = 0;
220 | }
221 |
222 | // Sort keys
223 | BlockRadixSortT(temp_storage.sort).Sort(items, interval_start_point);
224 | __syncthreads();
225 |
226 | // Get end marks
227 | BlockScanT(temp_storage.scan).InclusiveSum(interval_start_point, interval_start_point);
228 | __syncthreads();
229 |
230 | for (uint32_t i = 0; i < ITEMS; ++i) {
231 | if (items[i] != MAX_U64 && interval_start_point[i] == 0) {
232 | interval_end_point[i] = 1;
233 | }
234 | }
235 |
236 | // Get start marks
237 | // XXX(Keren): this interface has a different input and output order.
238 | BlockDiscontinuity(temp_storage.disc).FlagHeads(interval_start_point, interval_end_point, cub::Inequality());
239 | __syncthreads();
240 |
241 | for (uint32_t i = 0; i < ITEMS; ++i) {
242 | if (items[i] != MAX_U64 && interval_start_point[i] == 1 && interval_end_point[i] != 1) {
243 | interval_start_point[i] = 1;
244 | } else {
245 | interval_start_point[i] = 0;
246 | }
247 | }
248 |
249 | // Get interval start index
250 | int aggregate = 0;
251 | BlockScanT(temp_storage.scan).InclusiveSum(interval_start_point, interval_start_index, aggregate);
252 | __syncthreads();
253 |
254 | // Get interval end index
255 | BlockScanT(temp_storage.scan).InclusiveSum(interval_end_point, interval_end_index);
256 | __syncthreads();
257 |
258 | // Put indices in the corresponding slots
259 | for (uint32_t i = 0; i < ITEMS; ++i) {
260 | if (interval_start_point[i] == 1) {
261 | d_out[(interval_start_index[i] - 1) * 2] = (items[i] >> 1);
262 | }
263 | if (interval_end_point[i] == 1) {
264 | d_out[(interval_end_index[i] - 1) * 2 + 1] = (items[i] - 1) >> 1;
265 | }
266 | }
267 |
268 | return aggregate;
269 | }
270 |
271 |
272 | template <int THREADS, int ITEMS>
273 | static
274 | __device__
275 | void
276 | interval_merge
277 | (
278 | gpu_patch_buffer_t *buffer
279 | )
280 | {
281 | uint32_t cur_index = 0;
282 | uint32_t items = 0;
283 | uint32_t tile_size = THREADS * ITEMS;
284 | uint64_t *records = (uint64_t *)buffer->records;
285 | for (; cur_index + (tile_size / 2) <= buffer->head_index; cur_index += (tile_size / 2)) {
286 | items += interval_merge_impl<THREADS, ITEMS>(records + cur_index * 2, records + items * 2, tile_size);
287 | PRINT("gpu analysis-> head_index %u, cur_index %u, tile_size %u, items %u\n", buffer->head_index, cur_index, tile_size, items);
288 | __syncthreads();
289 | }
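// Worked example (assumed values) for one tile: the intervals [0x100, 0x10c) and
// [0x104, 0x110) arrive as the keys 0x100<<1, 0x10c<<1|1, 0x104<<1, 0x110<<1|1. After the
// radix sort, the inclusive sum of the +1/-1 markers reads 1, 2, 1, 0: it only returns to
// zero at the key encoding 0x110, so interval_merge_impl emits the single merged interval
// [0x100, 0x110) and returns aggregate = 1.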
290 | // Remainder
291 | if (cur_index < buffer->head_index) {
292 | items += interval_merge_impl<THREADS, ITEMS>(records + cur_index * 2, records + items * 2, ((buffer->head_index - cur_index) * 2));
293 | PRINT("gpu analysis-> head_index %u, cur_index %u, tile_size %u, items %u\n", buffer->head_index, cur_index, (buffer->head_index - cur_index) * 2, items);
294 | __syncthreads();
295 | }
296 |
297 | // Second pass
298 | // Fake shuffle
299 | if (items < buffer->head_index) {
300 | cur_index = 0;
301 | items = 0;
302 | for (; cur_index + (tile_size / 2) <= buffer->head_index; cur_index += (tile_size / 2)) {
303 | items += interval_merge_impl<THREADS, ITEMS>(records + cur_index * 2, records + items * 2, tile_size);
304 | PRINT("gpu analysis-> head_index %u, cur_index %u, tile_size %u, items %u\n", buffer->head_index, cur_index, tile_size, items);
305 | __syncthreads();
306 | }
307 | // Remainder
308 | if (cur_index < buffer->head_index) {
309 | items += interval_merge_impl<THREADS, ITEMS>(records + cur_index * 2, records + items * 2, ((buffer->head_index - cur_index) * 2));
310 | PRINT("gpu analysis-> head_index %u, cur_index %u, tile_size %u, items %u\n", buffer->head_index, cur_index, (buffer->head_index - cur_index) * 2, items);
311 | __syncthreads();
312 | }
313 | }
314 |
315 | if (threadIdx.x == 0) {
316 | buffer->head_index = items;
317 | buffer->tail_index = items;
318 | }
319 | }
320 |
321 |
322 | // TODO(Keren): multiple buffers, no need to wait
323 | extern "C"
324 | __launch_bounds__(GPU_PATCH_ANALYSIS_THREADS, 1)
325 | __global__
326 | void
327 | gpu_analysis_interval_merge
328 | (
329 | gpu_patch_buffer_t *buffer,
330 | gpu_patch_buffer_t *read_buffer,
331 | gpu_patch_buffer_t *write_buffer
332 | )
333 | {
334 | // Continue processing until the CPU notifies that analysis is done
335 | while (true) {
336 | // Wait until the patch side notifies that the buffer is full, i.e., analysis can begin.
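// The handshake below: the instrumentation side sets buffer->analysis once its record
// buffer fills and decrements buffer->num_threads as blocks exit; this kernel spins until
// either changes, runs one round of compaction and merging, clears buffer->analysis so the
// producer can refill, and performs a final drain once num_threads reaches zero.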
337 | // Block sampling is not allowed
338 | while (buffer->analysis == 0 && atomic_load(&buffer->num_threads) != 0);
339 |
340 | if (atomic_load(&buffer->num_threads) == 0) {
341 | // buffer->analysis must be 0
342 | break;
343 | }
344 |
345 | // Compact addresses from contiguous thread accesses within each warp
346 | interval_compact(buffer, read_buffer, write_buffer);
347 |
348 | // Compact is done
349 | __syncthreads();
350 |
351 | if (threadIdx.x == 0) {
352 | buffer->analysis = 0;
353 | }
354 |
355 | // Merge read buffer
356 | if (read_buffer->head_index != 0) {
357 | interval_merge<GPU_PATCH_ANALYSIS_THREADS, GPU_PATCH_ANALYSIS_ITEMS>(read_buffer); // template arguments assumed to match the constants in gpu-patch.h
358 |
359 | PRINT("gpu analysis-> read buffer\n")
360 | PRINT_RECORDS(read_buffer)
361 | }
362 |
363 | // Merge write buffer
364 | if (write_buffer->head_index != 0) {
365 | interval_merge<GPU_PATCH_ANALYSIS_THREADS, GPU_PATCH_ANALYSIS_ITEMS>(write_buffer);
366 |
367 | PRINT("gpu analysis-> write buffer\n")
368 | PRINT_RECORDS(write_buffer)
369 | }
370 |
371 | __syncthreads();
372 | }
373 |
374 | // Last analysis
375 | interval_compact(buffer, read_buffer, write_buffer);
376 |
377 | // Compact is done
378 | __syncthreads();
379 |
380 | // Merge read buffer
381 | if (read_buffer->head_index != 0) {
382 | interval_merge<GPU_PATCH_ANALYSIS_THREADS, GPU_PATCH_ANALYSIS_ITEMS>(read_buffer);
383 |
384 | PRINT("gpu analysis-> read buffer\n")
385 | PRINT_RECORDS(read_buffer)
386 | }
387 |
388 | // Merge write buffer
389 | if (write_buffer->head_index != 0) {
390 | interval_merge<GPU_PATCH_ANALYSIS_THREADS, GPU_PATCH_ANALYSIS_ITEMS>(write_buffer);
391 |
392 | PRINT("gpu analysis-> write buffer\n")
393 | PRINT_RECORDS(write_buffer)
394 | }
395 |
396 | __syncthreads();
397 |
398 | if (threadIdx.x == 0) {
399 | atomic_store_system(&read_buffer->num_threads, (uint32_t)0);
400 | }
401 | }
402 |
--------------------------------------------------------------------------------
/src/gpu-patch-address.cu:
--------------------------------------------------------------------------------
1 | #include "gpu-patch.h"
2 | #include "gpu-queue.h"
3 | #include "utils.h"
4 |
5 | #include <sanitizer_patching.h>
6 |
7 | struct gpu_patch_analysis_address_comparator {
8 | __device__
9 | bool operator()(gpu_patch_analysis_address &l, gpu_patch_analysis_address &r) {
10 | return l.start <= r.start;
11 | }
12 | };
13 |
14 | /*
15 |  * Monitor each shared and global memory access.
16 |  */
17 | static
18 | __device__ __forceinline__
19 | SanitizerPatchResult
20 | memory_access_callback
21 | (
22 | void *user_data,
23 | uint64_t pc,
24 | void *address,
25 | uint32_t size,
26 | uint32_t flags,
27 | const void *new_value
28 | )
29 | {
30 | gpu_patch_buffer_t *buffer = (gpu_patch_buffer_t *)user_data;
31 |
32 | if (!sample_callback(buffer->block_sampling_frequency, buffer->block_sampling_offset)) {
33 | return SANITIZER_PATCH_SUCCESS;
34 | }
35 |
36 | // 1.
Init values 37 | uint32_t active_mask = __activemask(); 38 | uint32_t laneid = get_laneid(); 39 | uint32_t first_laneid = __ffs(active_mask) - 1; 40 | 41 | uint32_t keep = 1; 42 | if (buffer->aux != NULL && (flags & GPU_PATCH_READ) != 0 && 43 | (flags & (GPU_PATCH_WRITE | GPU_PATCH_SHARED | GPU_PATCH_LOCAL)) == 0) { 44 | // Read address can be filtered 45 | gpu_patch_aux_address_dict *address_dict = (gpu_patch_aux_address_dict *)buffer->aux; 46 | gpu_patch_analysis_address_t *start_end = address_dict->start_end; 47 | gpu_patch_analysis_address_t addr = { (uint64_t)address, 0 }; 48 | uint32_t pos = map_prev(start_end, addr, address_dict->size, gpu_patch_analysis_address_comparator()); 49 | 50 | if (pos != address_dict->size) { 51 | // Find an existing entry 52 | if (atomic_load(address_dict->hit + pos) == 0) { 53 | // Update 54 | atomic_store(address_dict->hit + pos, (uint8_t)1); 55 | } else { 56 | // Filter out 57 | keep = 0; 58 | } 59 | } 60 | } 61 | 62 | __syncwarp(active_mask); 63 | 64 | uint32_t all_keep = 0; 65 | all_keep = ballot((uint32_t)keep, active_mask); 66 | if (all_keep == 0) { 67 | // Fast path 68 | return SANITIZER_PATCH_SUCCESS; 69 | } 70 | 71 | gpu_patch_record_address_t *record = NULL; 72 | if (laneid == first_laneid) { 73 | // 3. Get a record 74 | gpu_patch_record_address_t *records = (gpu_patch_record_address_t *)buffer->records; 75 | record = records + gpu_queue_get(buffer, (buffer->flags & GPU_PATCH_ANALYSIS) != 0); 76 | 77 | // 4. Assign basic values 78 | record->flags = flags; 79 | record->size = size; 80 | record->active = all_keep & active_mask; 81 | } 82 | 83 | __syncwarp(active_mask); 84 | 85 | uint64_t r = (uint64_t)record; 86 | record = (gpu_patch_record_address_t *)shfl(r, first_laneid, active_mask); 87 | 88 | if (record != NULL && keep == 1) { 89 | record->address[laneid] = (uint64_t)address; 90 | } 91 | 92 | __syncwarp(active_mask); 93 | 94 | if (laneid == first_laneid) { 95 | // 5. 
Push a record 96 | gpu_queue_push(buffer); 97 | } 98 | 99 | return SANITIZER_PATCH_SUCCESS; 100 | } 101 | 102 | 103 | extern "C" 104 | __device__ __noinline__ 105 | SanitizerPatchResult 106 | sanitizer_global_memory_access_callback 107 | ( 108 | void *user_data, 109 | uint64_t pc, 110 | void *address, 111 | uint32_t size, 112 | uint32_t flags, 113 | const void *new_value 114 | ) 115 | { 116 | return memory_access_callback(user_data, pc, address, size, flags, new_value); 117 | } 118 | 119 | 120 | extern "C" 121 | __device__ __noinline__ 122 | SanitizerPatchResult 123 | sanitizer_shared_memory_access_callback 124 | ( 125 | void *user_data, 126 | uint64_t pc, 127 | void *address, 128 | uint32_t size, 129 | uint32_t flags, 130 | const void *new_value 131 | ) 132 | { 133 | return memory_access_callback(user_data, pc, address, size, flags | GPU_PATCH_SHARED, new_value); 134 | } 135 | 136 | 137 | extern "C" 138 | __device__ __noinline__ 139 | SanitizerPatchResult 140 | sanitizer_local_memory_access_callback 141 | ( 142 | void *user_data, 143 | uint64_t pc, 144 | void *address, 145 | uint32_t size, 146 | uint32_t flags, 147 | const void *new_value 148 | ) 149 | { 150 | return memory_access_callback(user_data, pc, address, size, flags | GPU_PATCH_LOCAL, new_value); 151 | } 152 | 153 | 154 | /* 155 | * Lock the corresponding hash entry for a block 156 | */ 157 | extern "C" 158 | __device__ __noinline__ 159 | SanitizerPatchResult 160 | sanitizer_block_exit_callback 161 | ( 162 | void *user_data, 163 | uint64_t pc 164 | ) 165 | { 166 | gpu_patch_buffer_t* buffer = (gpu_patch_buffer_t *)user_data; 167 | 168 | if (!sample_callback(buffer->block_sampling_frequency, buffer->block_sampling_offset)) { 169 | return SANITIZER_PATCH_SUCCESS; 170 | } 171 | 172 | uint32_t active_mask = __activemask(); 173 | uint32_t laneid = get_laneid(); 174 | uint32_t first_laneid = __ffs(active_mask) - 1; 175 | int32_t pop_count = __popc(active_mask); 176 | 177 | if (laneid == first_laneid) { 178 | // Finish a bunch of threads 179 | atomicAdd(&buffer->num_threads, -pop_count); 180 | } 181 | 182 | return SANITIZER_PATCH_SUCCESS; 183 | } -------------------------------------------------------------------------------- /src/gpu-patch-aux.cu: -------------------------------------------------------------------------------- 1 | #include "gpu-patch.h" 2 | #include "gpu-queue.h" 3 | #include "utils.h" 4 | 5 | #include 6 | 7 | struct gpu_patch_analysis_address_comparator { 8 | __device__ 9 | bool operator()(gpu_patch_analysis_address &l, gpu_patch_analysis_address &r) { 10 | return l.start <= r.start; 11 | } 12 | }; 13 | 14 | /* 15 | * Monitor each shared and global memory access. 16 | */ 17 | static 18 | __device__ __forceinline__ 19 | SanitizerPatchResult 20 | memory_access_callback 21 | ( 22 | void *user_data, 23 | uint64_t pc, 24 | void *address, 25 | uint32_t size, 26 | uint32_t flags, 27 | const void *new_value 28 | ) 29 | { 30 | gpu_patch_buffer_t *buffer = (gpu_patch_buffer_t *)user_data; 31 | 32 | // 1. 
Init values 33 | uint32_t active_mask = __activemask(); 34 | uint32_t laneid = get_laneid(); 35 | uint32_t first_laneid = __ffs(active_mask) - 1; 36 | 37 | uint32_t keep = 1; 38 | if (buffer->aux != NULL && (flags & (GPU_PATCH_SHARED | GPU_PATCH_LOCAL)) == 0) { 39 | // Read address can be filtered 40 | gpu_patch_aux_address_dict *address_dict = (gpu_patch_aux_address_dict *)buffer->aux; 41 | gpu_patch_analysis_address_t *start_end = address_dict->start_end; 42 | gpu_patch_analysis_address_t addr = { (uint64_t)address, 0 }; 43 | uint32_t pos = map_prev(start_end, addr, address_dict->size, gpu_patch_analysis_address_comparator()); 44 | 45 | if (pos != address_dict->size) { 46 | // Find an existing entry 47 | if (atomic_load(address_dict->hit + pos) == 0) 48 | { 49 | // Update 50 | atomic_store(address_dict->hit + pos, (uint8_t)1); 51 | } else { 52 | // Filter out 53 | keep = 0; 54 | } 55 | if (atomic_load(address_dict->read + pos) == 0 && static_cast(flags) == GPU_PATCH_READ) { 56 | atomic_store(address_dict->read + pos, (uint8_t)1); 57 | } 58 | if (atomic_load(address_dict->write + pos) == 0 && static_cast(flags) == GPU_PATCH_WRITE) { 59 | atomic_store(address_dict->write + pos, (uint8_t)1); 60 | } 61 | } 62 | } 63 | 64 | __syncwarp(active_mask); 65 | 66 | uint32_t all_keep = 0; 67 | all_keep = ballot((uint32_t)keep, active_mask); 68 | if (all_keep == 0) { 69 | // Fast path 70 | return SANITIZER_PATCH_SUCCESS; 71 | } 72 | 73 | __syncwarp(active_mask); 74 | 75 | if (laneid == first_laneid) { 76 | // 5. Push a record 77 | gpu_queue_push(buffer); 78 | } 79 | 80 | return SANITIZER_PATCH_SUCCESS; 81 | } 82 | 83 | 84 | extern "C" 85 | __device__ __noinline__ 86 | SanitizerPatchResult 87 | sanitizer_global_memory_access_callback 88 | ( 89 | void *user_data, 90 | uint64_t pc, 91 | void *address, 92 | uint32_t size, 93 | uint32_t flags, 94 | const void *new_value 95 | ) 96 | { 97 | return memory_access_callback(user_data, pc, address, size, flags, new_value); 98 | } 99 | 100 | 101 | extern "C" 102 | __device__ __noinline__ 103 | SanitizerPatchResult 104 | sanitizer_shared_memory_access_callback 105 | ( 106 | void *user_data, 107 | uint64_t pc, 108 | void *address, 109 | uint32_t size, 110 | uint32_t flags, 111 | const void *new_value 112 | ) 113 | { 114 | return memory_access_callback(user_data, pc, address, size, flags | GPU_PATCH_SHARED, new_value); 115 | } 116 | 117 | 118 | extern "C" 119 | __device__ __noinline__ 120 | SanitizerPatchResult 121 | sanitizer_local_memory_access_callback 122 | ( 123 | void *user_data, 124 | uint64_t pc, 125 | void *address, 126 | uint32_t size, 127 | uint32_t flags, 128 | const void *new_value 129 | ) 130 | { 131 | return memory_access_callback(user_data, pc, address, size, flags | GPU_PATCH_LOCAL, new_value); 132 | } 133 | 134 | 135 | /* 136 | * Lock the corresponding hash entry for a block 137 | */ 138 | extern "C" 139 | __device__ __noinline__ 140 | SanitizerPatchResult 141 | sanitizer_block_exit_callback 142 | ( 143 | void *user_data, 144 | uint64_t pc 145 | ) 146 | { 147 | gpu_patch_buffer_t* buffer = (gpu_patch_buffer_t *)user_data; 148 | 149 | if (!sample_callback(buffer->block_sampling_frequency, buffer->block_sampling_offset)) { 150 | return SANITIZER_PATCH_SUCCESS; 151 | } 152 | 153 | uint32_t active_mask = __activemask(); 154 | uint32_t laneid = get_laneid(); 155 | uint32_t first_laneid = __ffs(active_mask) - 1; 156 | int32_t pop_count = __popc(active_mask); 157 | 158 | if (laneid == first_laneid) { 159 | // Finish a bunch of threads 160 | 
atomicAdd(&buffer->num_threads, -pop_count); 161 | } 162 | 163 | return SANITIZER_PATCH_SUCCESS; 164 | } -------------------------------------------------------------------------------- /src/gpu-patch-torch-aux.cu: -------------------------------------------------------------------------------- 1 | #include "gpu-patch.h" 2 | #include "gpu-queue.h" 3 | #include "utils.h" 4 | 5 | #include 6 | 7 | struct gpu_patch_analysis_address_comparator { 8 | __device__ 9 | bool operator()(gpu_patch_analysis_address &l, gpu_patch_analysis_address &r) { 10 | return l.start <= r.start; 11 | } 12 | }; 13 | 14 | /* 15 | * Monitor each shared and global memory access. 16 | */ 17 | static 18 | __device__ __forceinline__ 19 | SanitizerPatchResult 20 | memory_access_callback 21 | ( 22 | void *user_data, 23 | uint64_t pc, 24 | void *address, 25 | uint32_t size, 26 | uint32_t flags, 27 | const void *new_value 28 | ) 29 | { 30 | gpu_patch_buffer_t *buffer = (gpu_patch_buffer_t *)user_data; 31 | 32 | // 1. Init values 33 | uint32_t active_mask = __activemask(); 34 | uint32_t laneid = get_laneid(); 35 | uint32_t first_laneid = __ffs(active_mask) - 1; 36 | 37 | uint32_t keep = 1; 38 | if (buffer->aux != NULL && buffer->torch_aux != NULL && (flags & (GPU_PATCH_SHARED | GPU_PATCH_LOCAL)) == 0) { 39 | 40 | gpu_patch_aux_address_dict *address_dict = (gpu_patch_aux_address_dict *)buffer->aux; 41 | gpu_patch_analysis_address_t *start_end = address_dict->start_end; 42 | gpu_patch_analysis_address_t addr = { (uint64_t)address, 0 }; 43 | uint32_t pos = map_prev(start_end, addr, address_dict->size, gpu_patch_analysis_address_comparator()); 44 | 45 | if (pos != address_dict->size) { 46 | // Find an existing entry 47 | if (atomic_load(address_dict->hit + pos) == 0) { 48 | // Update 49 | atomic_store(address_dict->hit + pos, (uint8_t)1); 50 | } else { 51 | // Filter out 52 | keep = 0; 53 | } 54 | } 55 | 56 | gpu_patch_aux_address_dict *torch_address_dict = (gpu_patch_aux_address_dict *)buffer->torch_aux; 57 | gpu_patch_analysis_address_t *torch_start_end = torch_address_dict->start_end; 58 | uint32_t torch_pos = map_prev(torch_start_end, addr, torch_address_dict->size, gpu_patch_analysis_address_comparator()); 59 | 60 | if (torch_pos != torch_address_dict->size) { 61 | // Find an existing entry 62 | if (atomic_load(torch_address_dict->hit + torch_pos) == 0) { 63 | // Update 64 | atomic_store(torch_address_dict->hit + torch_pos, (uint8_t)1); 65 | } else { 66 | // Filter out 67 | keep = 0; 68 | } 69 | } 70 | } 71 | 72 | __syncwarp(active_mask); 73 | 74 | uint32_t all_keep = 0; 75 | all_keep = ballot((uint32_t)keep, active_mask); 76 | if (all_keep == 0) { 77 | // Fast path 78 | return SANITIZER_PATCH_SUCCESS; 79 | } 80 | 81 | __syncwarp(active_mask); 82 | 83 | if (laneid == first_laneid) { 84 | // 5. 
Push a record 85 | gpu_queue_push(buffer); 86 | } 87 | 88 | return SANITIZER_PATCH_SUCCESS; 89 | } 90 | 91 | 92 | extern "C" 93 | __device__ __noinline__ 94 | SanitizerPatchResult 95 | sanitizer_global_memory_access_callback 96 | ( 97 | void *user_data, 98 | uint64_t pc, 99 | void *address, 100 | uint32_t size, 101 | uint32_t flags, 102 | const void *new_value 103 | ) 104 | { 105 | return memory_access_callback(user_data, pc, address, size, flags, new_value); 106 | } 107 | 108 | 109 | extern "C" 110 | __device__ __noinline__ 111 | SanitizerPatchResult 112 | sanitizer_shared_memory_access_callback 113 | ( 114 | void *user_data, 115 | uint64_t pc, 116 | void *address, 117 | uint32_t size, 118 | uint32_t flags, 119 | const void *new_value 120 | ) 121 | { 122 | return memory_access_callback(user_data, pc, address, size, flags | GPU_PATCH_SHARED, new_value); 123 | } 124 | 125 | 126 | extern "C" 127 | __device__ __noinline__ 128 | SanitizerPatchResult 129 | sanitizer_local_memory_access_callback 130 | ( 131 | void *user_data, 132 | uint64_t pc, 133 | void *address, 134 | uint32_t size, 135 | uint32_t flags, 136 | const void *new_value 137 | ) 138 | { 139 | return memory_access_callback(user_data, pc, address, size, flags | GPU_PATCH_LOCAL, new_value); 140 | } 141 | 142 | 143 | /* 144 | * Lock the corresponding hash entry for a block 145 | */ 146 | extern "C" 147 | __device__ __noinline__ 148 | SanitizerPatchResult 149 | sanitizer_block_exit_callback 150 | ( 151 | void *user_data, 152 | uint64_t pc 153 | ) 154 | { 155 | gpu_patch_buffer_t* buffer = (gpu_patch_buffer_t *)user_data; 156 | 157 | if (!sample_callback(buffer->block_sampling_frequency, buffer->block_sampling_offset)) { 158 | return SANITIZER_PATCH_SUCCESS; 159 | } 160 | 161 | uint32_t active_mask = __activemask(); 162 | uint32_t laneid = get_laneid(); 163 | uint32_t first_laneid = __ffs(active_mask) - 1; 164 | int32_t pop_count = __popc(active_mask); 165 | 166 | if (laneid == first_laneid) { 167 | // Finish a bunch of threads 168 | atomicAdd(&buffer->num_threads, -pop_count); 169 | } 170 | 171 | return SANITIZER_PATCH_SUCCESS; 172 | } -------------------------------------------------------------------------------- /src/gpu-patch.cu: -------------------------------------------------------------------------------- 1 | #include "gpu-patch.h" 2 | #include "gpu-queue.h" 3 | #include "utils.h" 4 | 5 | #include 6 | 7 | /* 8 | * Monitor each shared and global memory access. 9 | */ 10 | static 11 | __device__ __forceinline__ 12 | SanitizerPatchResult 13 | memory_access_callback 14 | ( 15 | void *user_data, 16 | uint64_t pc, 17 | void *address, 18 | uint32_t size, 19 | uint32_t flags, 20 | const void *new_value 21 | ) 22 | { 23 | gpu_patch_buffer_t *buffer = (gpu_patch_buffer_t *)user_data; 24 | 25 | if (!sample_callback(buffer->block_sampling_frequency, buffer->block_sampling_offset)) { 26 | return SANITIZER_PATCH_SUCCESS; 27 | } 28 | 29 | // 1. Init values 30 | uint32_t active_mask = __activemask(); 31 | uint32_t laneid = get_laneid(); 32 | uint32_t first_laneid = __ffs(active_mask) - 1; 33 | 34 | // 2. 
Read memory values 35 | uint8_t buf[GPU_PATCH_MAX_ACCESS_SIZE]; 36 | if (new_value == NULL) { 37 | // Read operation, old value can be on local memory, shared memory, or global memory 38 | if (flags & GPU_PATCH_SHARED) { 39 | read_shared_memory(size, (uint32_t)address, buf); 40 | } else if (flags & GPU_PATCH_LOCAL) { 41 | read_local_memory(size, (uint32_t)address, buf); 42 | } else if (flags != SANITIZER_MEMORY_DEVICE_FLAG_FORCE_INT) { 43 | read_global_memory(size, (uint64_t)address, buf); 44 | } 45 | } else { 46 | // Write operation, new value is on global memory 47 | read_global_memory(size, (uint64_t)new_value, buf); 48 | } 49 | 50 | gpu_patch_record_t *record = NULL; 51 | if (laneid == first_laneid) { 52 | // 3. Get a record 53 | gpu_patch_record_t *records = (gpu_patch_record_t *)buffer->records; 54 | record = records + gpu_queue_get(buffer, (buffer->flags & GPU_PATCH_ANALYSIS) != 0); 55 | 56 | // 4. Assign basic values 57 | record->flags = flags; 58 | record->active = active_mask; 59 | record->pc = pc; 60 | record->size = size; 61 | record->flat_thread_id = get_flat_thread_id(); 62 | record->flat_block_id = get_flat_block_id(); 63 | } 64 | 65 | __syncwarp(active_mask); 66 | 67 | uint64_t r = (uint64_t)record; 68 | record = (gpu_patch_record_t *)shfl(r, first_laneid, active_mask); 69 | 70 | if (record != NULL) { 71 | record->address[laneid] = (uint64_t)address; 72 | for (uint32_t i = 0; i < size; ++i) { 73 | record->value[laneid][i] = buf[i]; 74 | } 75 | } 76 | 77 | __syncwarp(active_mask); 78 | 79 | if (laneid == first_laneid) { 80 | // 5. Push a record 81 | gpu_queue_push(buffer); 82 | } 83 | 84 | return SANITIZER_PATCH_SUCCESS; 85 | } 86 | 87 | 88 | extern "C" 89 | __device__ __noinline__ 90 | SanitizerPatchResult 91 | sanitizer_global_memory_access_callback 92 | ( 93 | void *user_data, 94 | uint64_t pc, 95 | void *address, 96 | uint32_t size, 97 | uint32_t flags, 98 | const void *new_value 99 | ) 100 | { 101 | return memory_access_callback(user_data, pc, address, size, flags, new_value); 102 | } 103 | 104 | 105 | extern "C" 106 | __device__ __noinline__ 107 | SanitizerPatchResult 108 | sanitizer_shared_memory_access_callback 109 | ( 110 | void *user_data, 111 | uint64_t pc, 112 | void *address, 113 | uint32_t size, 114 | uint32_t flags, 115 | const void *new_value 116 | ) 117 | { 118 | return memory_access_callback(user_data, pc, address, size, flags | GPU_PATCH_SHARED, new_value); 119 | } 120 | 121 | 122 | extern "C" 123 | __device__ __noinline__ 124 | SanitizerPatchResult 125 | sanitizer_local_memory_access_callback 126 | ( 127 | void *user_data, 128 | uint64_t pc, 129 | void *address, 130 | uint32_t size, 131 | uint32_t flags, 132 | const void *new_value 133 | ) 134 | { 135 | return memory_access_callback(user_data, pc, address, size, flags | GPU_PATCH_LOCAL, new_value); 136 | } 137 | 138 | 139 | /* 140 | * Lock the corresponding hash entry for a block 141 | */ 142 | extern "C" 143 | __device__ __noinline__ 144 | SanitizerPatchResult 145 | sanitizer_block_exit_callback 146 | ( 147 | void *user_data, 148 | uint64_t pc 149 | ) 150 | { 151 | gpu_patch_buffer_t* buffer = (gpu_patch_buffer_t *)user_data; 152 | 153 | if (!sample_callback(buffer->block_sampling_frequency, buffer->block_sampling_offset)) { 154 | return SANITIZER_PATCH_SUCCESS; 155 | } 156 | 157 | uint32_t active_mask = __activemask(); 158 | uint32_t laneid = get_laneid(); 159 | uint32_t first_laneid = __ffs(active_mask) - 1; 160 | int32_t pop_count = __popc(active_mask); 161 | 162 | if (laneid == first_laneid) { 163 | 
gpu_patch_record_t *records = (gpu_patch_record_t *)buffer->records; 164 | gpu_patch_record_t *record = records + gpu_queue_get(buffer, (buffer->flags & GPU_PATCH_ANALYSIS) != 0); 165 | 166 | record->pc = pc; 167 | record->flags = GPU_PATCH_BLOCK_EXIT_FLAG; 168 | record->flat_block_id = get_flat_block_id(); 169 | record->flat_thread_id = get_flat_thread_id(); 170 | record->active = active_mask; 171 | 172 | gpu_queue_push(buffer); 173 | 174 | // Finish a bunch of threads 175 | atomicAdd(&(buffer->num_threads), -pop_count); 176 | } 177 | 178 | return SANITIZER_PATCH_SUCCESS; 179 | } 180 | 181 | 182 | /* 183 | * Sample the corresponding blocks 184 | */ 185 | extern "C" 186 | __device__ __noinline__ 187 | SanitizerPatchResult 188 | sanitizer_block_enter_callback 189 | ( 190 | void *user_data, 191 | uint64_t pc 192 | ) 193 | { 194 | gpu_patch_buffer_t* buffer = (gpu_patch_buffer_t *)user_data; 195 | 196 | if (!sample_callback(buffer->block_sampling_frequency, buffer->block_sampling_offset)) { 197 | return SANITIZER_PATCH_SUCCESS; 198 | } 199 | 200 | uint32_t active_mask = __activemask(); 201 | uint32_t laneid = get_laneid(); 202 | uint32_t first_laneid = __ffs(active_mask) - 1; 203 | 204 | if (laneid == first_laneid) { 205 | // Mark block begin 206 | gpu_patch_record_t *records = (gpu_patch_record_t *)buffer->records; 207 | gpu_patch_record_t *record = records + gpu_queue_get(buffer, (buffer->flags & GPU_PATCH_ANALYSIS) != 0); 208 | 209 | record->pc = pc; 210 | record->flags = GPU_PATCH_BLOCK_ENTER_FLAG; 211 | record->flat_block_id = get_flat_block_id(); 212 | record->flat_thread_id = get_flat_thread_id(); 213 | record->active = active_mask; 214 | 215 | gpu_queue_push(buffer); 216 | } 217 | 218 | return SANITIZER_PATCH_SUCCESS; 219 | } 220 | --------------------------------------------------------------------------------
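All of the patch files above gate their work on sample_callback, which lives in include/utils.h (not shown in this dump). A minimal sketch of the block-sampling idea, assuming the frequency/offset fields used above; the real helper may differ in details:

    static __device__ __forceinline__
    bool
    sample_callback
    (
      uint32_t block_sampling_frequency,
      uint32_t block_sampling_offset
    )
    {
      // Sampling disabled: monitor every block
      if (block_sampling_frequency == 0) {
        return true;
      }
      // Otherwise keep roughly one block out of every `block_sampling_frequency`,
      // shifted by `block_sampling_offset` so repeated runs can rotate coverage
      // across different blocks
      return get_flat_block_id() % block_sampling_frequency == block_sampling_offset;
    }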