├── .gitignore ├── .gitmodules ├── .readthedocs.yaml ├── CITATION.bib ├── LICENSE ├── Makefile ├── README.md ├── bin ├── compile ├── compile-release ├── gvprof ├── gvprof-debug ├── gvprof_overhead ├── install └── install-debug ├── docs ├── .gitignore ├── Makefile ├── barracuda.md ├── castro.md ├── conf.py ├── darknet.md ├── deepwave.md ├── faq.md ├── index.rst ├── install.md ├── lammps.md ├── manual.md ├── namd.md ├── preface.md ├── pytorch.md ├── qmcpack.md ├── requirements.txt ├── roadmap.md ├── rodinia.md ├── unit_tests.md └── workflow.md ├── include ├── gpu-patch.h ├── gpu-queue.h └── utils.h ├── python ├── .gitignore ├── __init__.py ├── bench.py ├── filter_time.py ├── gviewer.py ├── overhead.sh ├── test.py ├── test_cases.py ├── tests │ ├── .gitignore │ ├── __init__.py │ ├── data_flow_test.py │ ├── instruction_test.py │ ├── redundancy_test.py │ └── value_pattern_test.py └── utils.py ├── requirements.txt └── src ├── gpu-analysis.cu ├── gpu-patch-address.cu ├── gpu-patch-aux.cu ├── gpu-patch-torch-aux.cu └── gpu-patch.cu /.gitignore: -------------------------------------------------------------------------------- 1 | *.fatbin 2 | *.cubin 3 | gvprof/* 4 | 5 | .vscode/ 6 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "samples"] 2 | path = samples 3 | url = https://github.com/FindHao/hpctoolkit-gpu-sanitizer-samples.git 4 | [submodule "jquery.graphviz.svg"] 5 | path = jquery.graphviz.svg 6 | url = https://github.com/Jokeren/jquery.graphviz.svg.git 7 | [submodule "redshow"] 8 | path = redshow 9 | url = https://github.com/Lin-Mao/redshow.git 10 | [submodule "hpctoolkit"] 11 | path = hpctoolkit 12 | url = https://github.com/Lin-Mao/hpctoolkit.git 13 | [submodule "torch-monitor"] 14 | path = torch-monitor 15 | url = https://github.com/Lin-Mao/torch-monitor.git 16 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | build: 9 | os: ubuntu-22.04 10 | tools: 11 | python: "3.12" 12 | 13 | # Build documentation in the docs/ directory with Sphinx 14 | sphinx: 15 | configuration: docs/conf.py 16 | 17 | python: 18 | install: 19 | - requirements: docs/requirements.txt 20 | 21 | # Optionally build your docs in additional formats such as PDF 22 | formats: 23 | - pdf 24 | -------------------------------------------------------------------------------- /CITATION.bib: -------------------------------------------------------------------------------- 1 | @inproceedings{lin2023drgpum, 2 | title={DrGPUM: Guiding Memory Optimization for GPU-Accelerated Applications}, 3 | author={Lin, Mao and Zhou, Keren and Su, Pengfei}, 4 | booktitle={Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 3}, 5 | pages={164--178}, 6 | year={2023}, 7 | isbn={9781450399180}, 8 | publisher={Association for Computing Machinery}, 9 | address={New York, NY, USA}, 10 | url={https://doi.org/10.1145/3582016.3582044}, 11 | doi={10.1145/3582016.3582044}, 12 | keywords={GPU profilers, Memory management, CUDA, GPUs}, 13 | location={Vancouver, BC, Canada}, 14 | series={ASPLOS 2023} 15 | } 16 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2020, University of California, Merced All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its 16 | contributors may be used to endorse or promote products derived from 17 | this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Modified from https://github.com/Jokeren/compute-sanitizer-samples/tree/master/MemoryTracker 2 | PROJECT ?= gpu-patch.fatbin gpu-patch-address.fatbin gpu-patch-aux.fatbin gpu-patch-torch-aux.fatbin 3 | PROJECT_ANALYSIS ?= gpu-analysis.fatbin 4 | 5 | # Location of the CUDA Toolkit 6 | CUDA_PATH ?= /usr/local/cuda 7 | SANITIZER_PATH ?= $(CUDA_PATH)/compute-sanitizer 8 | CUPTI_PATH ?= $(CUDA_PATH) 9 | 10 | NVCC := $(CUDA_PATH)/bin/nvcc 11 | 12 | INCLUDE_DIRS := -I$(CUDA_PATH)/include -I$(SANITIZER_PATH)/include -I$(CUPTI_PATH)/include -Iinclude 13 | SRC_DIR := src 14 | CXXFLAGS := $(INCLUDE_DIRS) -O3 --fatbin 15 | 16 | ARCHS := 50 60 70 72 75 80 86 17 | 18 | # Generate SASS code for each SM architecture in $(ARCHS) 19 | $(foreach sm,$(ARCHS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 20 | 21 | # Generate PTX code from the highest SM architecture in $(ARCHS) to guarantee forward-compatibility 22 | HIGHEST_SM := $(lastword $(sort $(ARCHS))) 23 | GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) 24 | 25 | all: $(PROJECT) $(PROJECT_ANALYSIS) 26 | 27 | ifdef PREFIX 28 | install: all 29 | endif 30 | 31 | $(PROJECT): %.fatbin : $(SRC_DIR)/%.cu 32 | $(NVCC) $(CXXFLAGS) $(GENCODE_FLAGS) --compile-as-tools-patch -o $@ -c $< 33 | 34 | $(PROJECT_ANALYSIS): %.fatbin : $(SRC_DIR)/%.cu 35 | $(NVCC) $(CXXFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 36 | 37 | ifdef PREFIX 38 | install: $(PROJECT) $(PROJECT_ANALYSIS) 39 | mkdir -p $(PREFIX)/lib 40 | mkdir -p $(PREFIX)/include 41 | mkdir -p $(PREFIX)/bin 42 | cp -rf $(PROJECT) $(PROJECT_ANALYSIS) $(PREFIX)/lib 43 | cp -rf include $(PREFIX) 44 | endif 45 | 46 | clean: 47 | rm -f $(PROJECT) $(PROJECT_ANALYSIS) 48 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DrGPUM 2 | 3 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.7588406.svg)](https://doi.org/10.5281/zenodo.7588406) 4 | [![CodeFactor](https://www.codefactor.io/repository/github/lin-mao/drgpum/badge)](https://www.codefactor.io/repository/github/lin-mao/drgpum) 5 | [![Documentation Status](https://readthedocs.org/projects/drgpum/badge/?version=latest)](https://drgpum.readthedocs.io/en/latest/?badge=latest) 6 | 7 | 8 | DrGPUM is a memory profiler for NVIDIA GPUs that explores memory inefficiencies in GPU-accelerated applications. 
9 | 10 | ## Quick Start 11 | 12 | ```bash 13 | git clone --recursive https://github.com/Lin-Mao/DrGPUM.git && cd DrGPUM 14 | 15 | git submodule update --init --recursive 16 | 17 | # Specify PyTorch dir 18 | export PYTORCH_DIR=path_to_pytorch/torch 19 | 20 | # Install DrGPUM 21 | ./bin/install 22 | 23 | # Setup environment variables 24 | export DrGPUM_PATH=$(pwd)/gvprof 25 | export PATH=${DrGPUM_PATH}/bin:$PATH 26 | export PATH=${DrGPUM_PATH}/hpctoolkit/bin:$PATH 27 | export PATH=${DrGPUM_PATH}/redshow/bin:$PATH 28 | 29 | # Test a sample 30 | cd samples/vectorAdd.f32 31 | make 32 | gvprof -v -e memory_liveness ./vectorAdd 33 | ``` 34 | 35 | ## Documentation 36 | 37 | - [Installation Guide](https://drgpum.readthedocs.io/en/latest/install.html) 38 | - [User's Guide](https://drgpum.readthedocs.io/en/latest/manual.html) 39 | - [Developer's Guide](https://drgpum.readthedocs.io/en/latest/workflow.html) 40 | 41 | ## Papers 42 | 43 | - Mao Lin, Keren Zhou, and Pengfei Su. 2023. [DrGPUM: Guiding Memory Optimization for GPU-accelerated Applications](https://doi.org/10.1145/3582016.3582044). In Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 3 (ASPLOS ’23), March 25–29, 2023, Vancouver, BC, Canada. ACM, New York, NY, USA, 15 pages. 44 | -------------------------------------------------------------------------------- /bin/compile: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SOURCE_DIR=$(pwd) 4 | DIR="" 5 | CUDA_PATH=/usr/local/cuda/ 6 | SANITIZER_PATH=$CUDA_PATH/compute-sanitizer 7 | 8 | if [ $# -eq 0 ]; then 9 | DIR=$(pwd)/gvprof 10 | else 11 | if [ $# -eq 1 ]; then 12 | DIR=$1 13 | else 14 | if [ $# -eq 2 ]; then 15 | DIR=$1 16 | CUDA_PATH=$2 17 | SANITIZER_PATH=$2/compute-sanitizer 18 | else 19 | if [ $# -eq 3 ]; then 20 | DIR=$1 21 | CUDA_PATH=$2 22 | SANITIZER_PATH=$3 23 | fi 24 | fi 25 | fi 26 | fi 27 | 28 | 29 | if [ -z "$DIR" ] 30 | then 31 | echo "Wrong paths" 32 | echo "./install " 33 | exit 34 | fi 35 | 36 | echo $DIR 37 | echo $CUDA_PATH 38 | echo $SANITIZER_PATH 39 | 40 | if [ ! 
-d $DIR ] 41 | then 42 | mkdir $DIR 43 | fi 44 | 45 | cd $DIR 46 | # Install spack 47 | # git clone https://github.com/spack/spack.git 48 | export SPACK_ROOT=$(pwd)/spack 49 | export PATH=${SPACK_ROOT}/bin:${PATH} 50 | source ${SPACK_ROOT}/share/spack/setup-env.sh 51 | 52 | # Install hpctoolkit dependencies 53 | # spack install --only dependencies hpctoolkit ^dyninst@master ^binutils@2.34+libiberty~nls 54 | # spack install libmonitor@master+dlopen+hpctoolkit 55 | # spack install libunwind 56 | 57 | # Fix bug 58 | # spack install mbedtls gotcha 59 | 60 | # Python version for torch monitor 61 | PY_VERSION=3.8 62 | # spack install python@$PY_VERSION 63 | 64 | # Install gpu-patch 65 | cd $SOURCE_DIR 66 | make PREFIX=$DIR/gpu-patch SANITIZER_PATH=$SANITIZER_PATH CUDA_PATH=$CUDA_PATH install 67 | 68 | # Find spack and boost dir 69 | B=$(spack find --path boost | tail -n 1 | cut -d ' ' -f 3) 70 | S=${B%/*} 71 | UNWIND=$(spack find --path libunwind | tail -n 1 | cut -d ' ' -f 3) 72 | 73 | PY_DEV=$(spack find --path python@$PY_VERSION | tail -n 1 | cut -d ' ' -f 3) 74 | 75 | # Install torch monitor 76 | cd $SOURCE_DIR 77 | cd torch-monitor 78 | make PREFIX=$DIR/torch-monitor PYTHON_INCLUDE_DIR=$PY_DEV/include/python$PY_VERSION \ 79 | PYTHON_LIB_DIR=$PY_DEV/lib/python$PY_VERSION PYTHON_VERSION=$PY_VERSION \ 80 | TORCH_DIR=$PYTORCH_DIR DEBUG=1 install 81 | 82 | # Install redshow 83 | cd $SOURCE_DIR 84 | cd redshow 85 | make clean 86 | make PREFIX=$DIR/redshow BOOST_DIR=$B LIBUNWIND_DIR=$UNWIND GPU_PATCH_DIR=$DIR/gpu-patch/ \ 87 | TORCH_MONITOR_DIR=$DIR/torch-monitor DEBUG=1 install 88 | 89 | # install hpctoolkit 90 | cd $SOURCE_DIR 91 | cd hpctoolkit 92 | # mkdir build 93 | cd build 94 | ../configure --prefix=$DIR/hpctoolkit --with-cuda=$CUDA_PATH \ 95 | --with-sanitizer=$SANITIZER_PATH --with-gpu-patch=$DIR/gpu-patch \ 96 | --with-redshow=$DIR/redshow --with-spack=$S --enable-develop 97 | make install -j 98 | 99 | cd $SOURCE_DIR 100 | # mkdir $DIR/bin 101 | # mkdir $DIR/python 102 | cp ./bin/gvprof $DIR/bin/ 103 | cp -r ./python $DIR 104 | echo "python $DIR/python/gviewer.py "'${@:1}' > $DIR/bin/gviewer 105 | chmod +x $DIR/bin/gviewer 106 | 107 | echo "Install in $DIR/bin/gvprof" 108 | -------------------------------------------------------------------------------- /bin/compile-release: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SOURCE_DIR=$(pwd) 4 | DIR="" 5 | CUDA_PATH=/usr/local/cuda/ 6 | SANITIZER_PATH=$CUDA_PATH/compute-sanitizer 7 | 8 | if [ $# -eq 0 ]; then 9 | DIR=$(pwd)/gvprof 10 | else 11 | if [ $# -eq 1 ]; then 12 | DIR=$1 13 | else 14 | if [ $# -eq 2 ]; then 15 | DIR=$1 16 | CUDA_PATH=$2 17 | SANITIZER_PATH=$2/compute-sanitizer 18 | else 19 | if [ $# -eq 3 ]; then 20 | DIR=$1 21 | CUDA_PATH=$2 22 | SANITIZER_PATH=$3 23 | fi 24 | fi 25 | fi 26 | fi 27 | 28 | 29 | if [ -z "$DIR" ] 30 | then 31 | echo "Wrong paths" 32 | echo "./install " 33 | exit 34 | fi 35 | 36 | echo $DIR 37 | echo $CUDA_PATH 38 | echo $SANITIZER_PATH 39 | 40 | if [ ! 
-d $DIR ] 41 | then 42 | mkdir $DIR 43 | fi 44 | 45 | cd $DIR 46 | # Install spack 47 | # git clone https://github.com/spack/spack.git 48 | export SPACK_ROOT=$(pwd)/spack 49 | export PATH=${SPACK_ROOT}/bin:${PATH} 50 | source ${SPACK_ROOT}/share/spack/setup-env.sh 51 | 52 | # Install hpctoolkit dependencies 53 | # spack install --only dependencies hpctoolkit ^dyninst@master ^binutils@2.34+libiberty~nls 54 | # spack install libmonitor@master+dlopen+hpctoolkit 55 | # spack install libunwind 56 | 57 | # Fix bug 58 | # spack install mbedtls gotcha 59 | 60 | # Python version for torch monitor 61 | PY_VERSION=3.8 62 | # spack install python@$PY_VERSION 63 | 64 | # Install gpu-patch 65 | cd $SOURCE_DIR 66 | make PREFIX=$DIR/gpu-patch SANITIZER_PATH=$SANITIZER_PATH CUDA_PATH=$CUDA_PATH install 67 | 68 | # Find spack and boost dir 69 | B=$(spack find --path boost | tail -n 1 | cut -d ' ' -f 3) 70 | S=${B%/*} 71 | UNWIND=$(spack find --path libunwind | tail -n 1 | cut -d ' ' -f 3) 72 | 73 | PY_DEV=$(spack find --path python@$PY_VERSION | tail -n 1 | cut -d ' ' -f 3) 74 | 75 | # Install torch monitor 76 | cd $SOURCE_DIR 77 | cd torch-monitor 78 | make clean 79 | make PREFIX=$DIR/torch-monitor PYTHON_INCLUDE_DIR=$PY_DEV/include/python$PY_VERSION \ 80 | PYTHON_LIB_DIR=$PY_DEV/lib/python$PY_VERSION PYTHON_VERSION=$PY_VERSION \ 81 | TORCH_DIR=$PYTORCH_DIR install 82 | 83 | # Install redshow 84 | cd $SOURCE_DIR 85 | cd redshow 86 | make clean 87 | make PREFIX=$DIR/redshow BOOST_DIR=$B LIBUNWIND_DIR=$UNWIND GPU_PATCH_DIR=$DIR/gpu-patch/ \ 88 | TORCH_MONITOR_DIR=$DIR/torch-monitor install 89 | 90 | # install hpctoolkit 91 | cd $SOURCE_DIR 92 | cd hpctoolkit 93 | rm -rf build 94 | mkdir build 95 | cd build 96 | ../configure --prefix=$DIR/hpctoolkit --with-cuda=$CUDA_PATH \ 97 | --with-sanitizer=$SANITIZER_PATH --with-gpu-patch=$DIR/gpu-patch \ 98 | --with-redshow=$DIR/redshow --with-spack=$S 99 | make install -j 100 | 101 | cd $SOURCE_DIR 102 | # mkdir $DIR/bin 103 | # mkdir $DIR/python 104 | cp ./bin/gvprof $DIR/bin/ 105 | cp -r ./python $DIR 106 | echo "python $DIR/python/gviewer.py "'${@:1}' > $DIR/bin/gviewer 107 | chmod +x $DIR/bin/gviewer 108 | 109 | echo "Install in $DIR/bin/gvprof" 110 | -------------------------------------------------------------------------------- /bin/gvprof: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | usage() 4 | { 5 | cat < 11 | redundancy 12 | data_flow 13 | value_pattern 14 | -j 15 | multi-threading analysis for binaries 16 | -ck 17 | control knob values 18 | -cfg gpu-cfg 19 | enable fine-grained gpu instruction analysis 20 | -s 21 | set block sampling frequency 22 | -l 23 | pass launcher command for execution. 
(e.g., "mpirun -np 1") 24 | -v verbose 25 | redirect output to gvprof.log 26 | EOF 27 | exit 0 28 | } 29 | 30 | while test "x$1" != x 31 | do 32 | arg="$1" ; shift 33 | case "$arg" in 34 | -v) 35 | export GVPROF_VERBOSE=1 36 | ;; 37 | -j) 38 | export GVPROF_THREADS=$1 39 | shift 40 | ;; 41 | -e) 42 | export GVPROF_EVENT=$1 43 | shift 44 | ;; 45 | -cfg) 46 | export GVPROF_CFG=1 47 | ;; 48 | -l) 49 | export GVPROF_LAUNCHER="$1" 50 | shift 51 | ;; 52 | -ck) 53 | export GVPROF_CONTROL_KNOBS="$GVPROF_CONTROL_KNOBS -ck $1" 54 | shift 55 | ;; 56 | -s) 57 | export GVPROF_SAMPLING_FREQUENCY="@$1" 58 | shift 59 | ;; 60 | -h) 61 | usage 62 | exit 63 | ;; 64 | * ) 65 | set -- "$arg" "$@" 66 | break 67 | ;; 68 | esac 69 | done 70 | 71 | GVPROF_EXEC=$1 72 | GVPROF_ARGS="${*:2}" 73 | 74 | if [ -z "$GVPROF_EXEC" ] 75 | then 76 | echo "Empty executable" 77 | exit 78 | fi 79 | 80 | if [ -z "$GVPROF_EVENT" ] 81 | then 82 | echo "Empty event" 83 | exit 84 | fi 85 | 86 | if [ -z "$GVPROF_THREADS" ] 87 | then 88 | export GVPROF_THREADS=1 89 | fi 90 | 91 | if [ ! -z "$GVPROF_VERBOSE" ] 92 | then 93 | export GVPROF_REDIRECT=./gvprof.log 94 | else 95 | export GVPROF_REDIRECT=/dev/null 96 | fi 97 | 98 | ##Test 99 | #echo $GVPROF_EXEC 100 | #echo $GVPROF_ARGS 101 | #echo $GVPROF_THREADS 102 | #echo $GVPROF_EVENT 103 | #echo $GVPROF_CONTROL_KNOBS 104 | #echo $GVPROF_LAUNCHER 105 | 106 | MEASUREMENTS=gvprof-measurements 107 | DATABASE=gvprof-database 108 | echo "Make sure $MEASUREMENTS and $DATABASE is clean" 109 | rm -rf $MEASUREMENTS 110 | rm -rf $DATABASE 111 | 112 | echo "First pass: dump and analyze CPU and GPU binaries" 113 | 114 | $GVPROF_LAUNCHER hpcrun -e gpu=nvidia -o $MEASUREMENTS $GVPROF_EXEC $GVPROF_ARGS &> $GVPROF_REDIRECT 115 | rm -rf $MEASUREMENTS/*.hpcrun 116 | 117 | if [ ! -z "$GVPROF_CFG" ] 118 | then 119 | hpcstruct --gpucfg yes -j $GVPROF_THREADS $MEASUREMENTS &>> $GVPROF_REDIRECT 120 | else 121 | hpcstruct -j $GVPROF_THREADS $MEASUREMENTS &>> $GVPROF_REDIRECT 122 | fi 123 | 124 | BASENAME=./$(basename $GVPROF_EXEC) 125 | hpcstruct $GVPROF_EXEC -o $BASENAME".hpcstruct" &>> $GVPROF_REDIRECT 126 | 127 | echo "Second pass: profiling" 128 | 129 | $GVPROF_LAUNCHER hpcrun -e gpu=nvidia,$GVPROF_EVENT$GVPROF_SAMPLING_FREQUENCY -o $MEASUREMENTS $GVPROF_CONTROL_KNOBS $GVPROF_EXEC $GVPROF_ARGS &>> $GVPROF_REDIRECT 130 | hpcprof -S $BASENAME".hpcstruct" -o $DATABASE $MEASUREMENTS &>> $GVPROF_REDIRECT 131 | 132 | echo "Done..." 133 | -------------------------------------------------------------------------------- /bin/gvprof-debug: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | usage() 4 | { 5 | cat < 11 | redundancy 12 | data_flow 13 | value_pattern 14 | -j 15 | multi-threading analysis for binaries 16 | -ck 17 | control knob values 18 | -cfg gpu-cfg 19 | enable fine-grained gpu instruction analysis 20 | -s 21 | set block sampling frequency 22 | -l 23 | pass launcher command for execution. 
(e.g., "mpirun -np 1") 24 | -v verbose 25 | redirect output to gvprof.log 26 | EOF 27 | exit 0 28 | } 29 | 30 | while test "x$1" != x 31 | do 32 | arg="$1" ; shift 33 | case "$arg" in 34 | -v) 35 | export GVPROF_VERBOSE=1 36 | ;; 37 | -j) 38 | export GVPROF_THREADS=$1 39 | shift 40 | ;; 41 | -e) 42 | export GVPROF_EVENT=$1 43 | shift 44 | ;; 45 | -cfg) 46 | export GVPROF_CFG=1 47 | ;; 48 | -l) 49 | export GVPROF_LAUNCHER="$1" 50 | shift 51 | ;; 52 | -ck) 53 | export GVPROF_CONTROL_KNOBS="$GVPROF_CONTROL_KNOBS -ck $1" 54 | shift 55 | ;; 56 | -s) 57 | export GVPROF_SAMPLING_FREQUENCY="@$1" 58 | shift 59 | ;; 60 | -h) 61 | usage 62 | exit 63 | ;; 64 | * ) 65 | set -- "$arg" "$@" 66 | break 67 | ;; 68 | esac 69 | done 70 | 71 | GVPROF_EXEC=$1 72 | GVPROF_ARGS="${*:2}" 73 | 74 | if [ -z "$GVPROF_EXEC" ] 75 | then 76 | echo "Empty executable" 77 | exit 78 | fi 79 | 80 | if [ -z "$GVPROF_EVENT" ] 81 | then 82 | echo "Empty event" 83 | exit 84 | fi 85 | 86 | if [ -z "$GVPROF_THREADS" ] 87 | then 88 | export GVPROF_THREADS=1 89 | fi 90 | 91 | if [ ! -z "$GVPROF_VERBOSE" ] 92 | then 93 | export GVPROF_REDIRECT=./gvprof.log 94 | else 95 | export GVPROF_REDIRECT=/dev/null 96 | fi 97 | 98 | ##Test 99 | #echo $GVPROF_EXEC 100 | #echo $GVPROF_ARGS 101 | #echo $GVPROF_THREADS 102 | #echo $GVPROF_EVENT 103 | #echo $GVPROF_CONTROL_KNOBS 104 | #echo $GVPROF_LAUNCHER 105 | 106 | MEASUREMENTS=gvprof-measurements 107 | DATABASE=gvprof-database 108 | echo "Make sure $MEASUREMENTS and $DATABASE is clean" 109 | rm -rf $MEASUREMENTS 110 | rm -rf $DATABASE 111 | 112 | echo "First pass: dump and analyze CPU and GPU binaries" 113 | 114 | $GVPROF_LAUNCHER hpcrun -e gpu=nvidia -o $MEASUREMENTS $GVPROF_EXEC $GVPROF_ARGS &> $GVPROF_REDIRECT 115 | rm -rf $MEASUREMENTS/*.hpcrun 116 | 117 | if [ ! -z "$GVPROF_CFG" ] 118 | then 119 | hpcstruct --gpucfg yes -j $GVPROF_THREADS $MEASUREMENTS &>> $GVPROF_REDIRECT 120 | else 121 | hpcstruct -j $GVPROF_THREADS $MEASUREMENTS &>> $GVPROF_REDIRECT 122 | fi 123 | 124 | BASENAME=./$(basename $GVPROF_EXEC) 125 | hpcstruct $GVPROF_EXEC -o $BASENAME".hpcstruct" &>> $GVPROF_REDIRECT 126 | 127 | echo "Second pass: profiling" 128 | 129 | # $GVPROF_LAUNCHER hpcrun -e gpu=nvidia,$GVPROF_EVENT$GVPROF_SAMPLING_FREQUENCY -o $MEASUREMENTS $GVPROF_CONTROL_KNOBS $GVPROF_EXEC $GVPROF_ARGS &>> $GVPROF_REDIRECT 130 | # hpcprof -S $BASENAME".hpcstruct" -o $DATABASE $MEASUREMENTS &>> $GVPROF_REDIRECT 131 | 132 | # debug mode 133 | $GVPROF_LAUNCHER hpcrun -d -e gpu=nvidia,$GVPROF_EVENT$GVPROF_SAMPLING_FREQUENCY -o $MEASUREMENTS $GVPROF_CONTROL_KNOBS $GVPROF_EXEC $GVPROF_ARGS &>> $GVPROF_REDIRECT 134 | hpcprof -S $BASENAME".hpcstruct" -o $DATABASE $MEASUREMENTS &>> $GVPROF_REDIRECT 135 | 136 | echo "Done..." 137 | -------------------------------------------------------------------------------- /bin/gvprof_overhead: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | usage() { 4 | cat < 10 | redundancy 11 | data_flow 12 | value_pattern 13 | -j 14 | multi-threading analysis for binaries 15 | -ck 16 | control knob values 17 | -cfg gpu-cfg 18 | enable fine-grained gpu instruction analysis 19 | -s 20 | set block sampling frequency 21 | -l 22 | pass launcher command for execution. 
(e.g., "mpirun -np 1") 23 | -v verbose 24 | redirect output to gvprof.log 25 | EOF 26 | exit 0 27 | } 28 | 29 | while test "x$1" != x; do 30 | arg="$1" 31 | shift 32 | case "$arg" in 33 | -v) 34 | export GVPROF_VERBOSE=1 35 | ;; 36 | -j) 37 | export GVPROF_THREADS=$1 38 | shift 39 | ;; 40 | -e) 41 | export GVPROF_EVENT=$1 42 | shift 43 | ;; 44 | -cfg) 45 | export GVPROF_CFG=1 46 | ;; 47 | -l) 48 | export GVPROF_LAUNCHER="$1" 49 | shift 50 | ;; 51 | -ck) 52 | export GVPROF_CONTROL_KNOBS="$GVPROF_CONTROL_KNOBS -ck $1" 53 | shift 54 | ;; 55 | -s) 56 | export GVPROF_SAMPLING_FREQUENCY="@$1" 57 | shift 58 | ;; 59 | -i) 60 | export GVPROF_ITERATIONS="$1" 61 | shift 62 | ;; 63 | -h) 64 | usage 65 | exit 66 | ;; 67 | *) 68 | set -- "$arg" "$@" 69 | break 70 | ;; 71 | esac 72 | done 73 | 74 | GVPROF_EXEC=$1 75 | GVPROF_ARGS="${*:2}" 76 | 77 | if [ -z "$GVPROF_EXEC" ]; then 78 | echo "Empty executable" 79 | exit 80 | fi 81 | 82 | if [ -z "$GVPROF_EVENT" ]; then 83 | echo "Empty event" 84 | exit 85 | fi 86 | 87 | if [ -z "$GVPROF_THREADS" ]; then 88 | export GVPROF_THREADS=1 89 | fi 90 | 91 | if [ ! -z "$GVPROF_VERBOSE" ]; then 92 | export GVPROF_REDIRECT=./gvprof.log 93 | else 94 | export GVPROF_REDIRECT=/dev/null 95 | fi 96 | 97 | ##Test 98 | #echo $GVPROF_EXEC 99 | #echo $GVPROF_ARGS 100 | #echo $GVPROF_THREADS 101 | #echo $GVPROF_EVENT 102 | #echo $GVPROF_CONTROL_KNOBS 103 | #echo $GVPROF_LAUNCHER 104 | 105 | MEASUREMENTS=gvprof-measurements 106 | DATABASE=gvprof-database 107 | echo "Make sure $MEASUREMENTS and $DATABASE is clean" 108 | rm -rf $MEASUREMENTS 109 | rm -rf $DATABASE 110 | 111 | echo "First pass: dump and analyze CPU and GPU binaries" 112 | 113 | $GVPROF_LAUNCHER hpcrun -e gpu=nvidia -o $MEASUREMENTS $GVPROF_EXEC $GVPROF_ARGS &>$GVPROF_REDIRECT 114 | rm -rf $MEASUREMENTS/*.hpcrun 115 | 116 | if [ ! -z "$GVPROF_CFG" ]; then 117 | hpcstruct --gpucfg yes -j $GVPROF_THREADS $MEASUREMENTS &>>$GVPROF_REDIRECT 118 | else 119 | hpcstruct -j $GVPROF_THREADS $MEASUREMENTS &>>$GVPROF_REDIRECT 120 | fi 121 | 122 | BASENAME=./$(basename $GVPROF_EXEC) 123 | hpcstruct $GVPROF_EXEC -o $BASENAME".hpcstruct" &>>$GVPROF_REDIRECT 124 | 125 | echo "Second pass: profiling" 126 | 127 | if [ -z "$GVPROF_ITERATIONS" ]; then 128 | export GVPROF_ITERATIONS=1 129 | fi 130 | echo "ITERATIONS $GVPROF_ITERATIONS" 131 | START=1 132 | for (( i=$START; i<=${GVPROF_ITERATIONS}; i++ ));do 133 | { time $GVPROF_LAUNCHER hpcrun -e gpu=nvidia,$GVPROF_EVENT$GVPROF_SAMPLING_FREQUENCY -o $MEASUREMENTS $GVPROF_CONTROL_KNOBS $GVPROF_EXEC $GVPROF_ARGS &>>$GVPROF_REDIRECT 134 | hpcprof -S $BASENAME".hpcstruct" -o $DATABASE $MEASUREMENTS &>>$GVPROF_REDIRECT ; } 2>>time_$GVPROF_EVENT.txt 135 | done 136 | 137 | echo "Done..." 
138 | -------------------------------------------------------------------------------- /bin/install: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SOURCE_DIR=$(pwd) 4 | DIR="" 5 | CUDA_PATH=/usr/local/cuda/ 6 | SANITIZER_PATH=$CUDA_PATH/compute-sanitizer 7 | 8 | if [ $# -eq 0 ]; then 9 | DIR=$(pwd)/gvprof 10 | else 11 | if [ $# -eq 1 ]; then 12 | DIR=$1 13 | else 14 | if [ $# -eq 2 ]; then 15 | DIR=$1 16 | CUDA_PATH=$2 17 | SANITIZER_PATH=$2/compute-sanitizer 18 | else 19 | if [ $# -eq 3 ]; then 20 | DIR=$1 21 | CUDA_PATH=$2 22 | SANITIZER_PATH=$3 23 | fi 24 | fi 25 | fi 26 | fi 27 | 28 | 29 | if [ -z "$DIR" ] 30 | then 31 | echo "Wrong paths" 32 | echo "./install " 33 | exit 34 | fi 35 | 36 | echo $DIR 37 | echo $CUDA_PATH 38 | echo $SANITIZER_PATH 39 | 40 | if [ ! -d $DIR ] 41 | then 42 | mkdir $DIR 43 | fi 44 | 45 | cd $DIR 46 | # Install spack 47 | git clone https://github.com/Lin-Mao/spack.git 48 | export SPACK_ROOT=$(pwd)/spack 49 | export PATH=${SPACK_ROOT}/bin:${PATH} 50 | source ${SPACK_ROOT}/share/spack/setup-env.sh 51 | 52 | # Install hpctoolkit dependencies 53 | spack install --only dependencies hpctoolkit ^dyninst@12.0.1 ^binutils@2.34+libiberty~nls 54 | spack install libmonitor@master+dlopen+hpctoolkit 55 | spack install libunwind 56 | 57 | # Fix bug 58 | spack install mbedtls gotcha 59 | 60 | # Python version for torch monitor 61 | PY_VERSION=3.8 62 | spack install python@$PY_VERSION 63 | 64 | # Install gpu-patch 65 | cd $SOURCE_DIR 66 | make PREFIX=$DIR/gpu-patch SANITIZER_PATH=$SANITIZER_PATH CUDA_PATH=$CUDA_PATH install 67 | 68 | # Find spack and boost dir 69 | B=$(spack find --path boost | tail -n 1 | cut -d ' ' -f 3) 70 | S=${B%/*} 71 | UNWIND=$(spack find --path libunwind | tail -n 1 | cut -d ' ' -f 3) 72 | 73 | PY_DEV=$(spack find --path python@$PY_VERSION | tail -n 1 | cut -d ' ' -f 3) 74 | 75 | # Install torch monitor 76 | cd $SOURCE_DIR 77 | cd torch-monitor 78 | make PREFIX=$DIR/torch-monitor PYTHON_INCLUDE_DIR=$PY_DEV/include/python$PY_VERSION \ 79 | PYTHON_LIB_DIR=$PY_DEV/lib/python$PY_VERSION PYTHON_VERSION=$PY_VERSION \ 80 | TORCH_DIR=$PYTORCH_DIR install 81 | 82 | # Install redshow 83 | cd $SOURCE_DIR 84 | cd redshow 85 | make PREFIX=$DIR/redshow BOOST_DIR=$B LIBUNWIND_DIR=$UNWIND GPU_PATCH_DIR=$DIR/gpu-patch/ \ 86 | TORCH_MONITOR_DIR=$DIR/torch-monitor install 87 | 88 | # install hpctoolkit 89 | cd $SOURCE_DIR 90 | cd hpctoolkit 91 | mkdir build 92 | cd build 93 | ../configure --prefix=$DIR/hpctoolkit --with-cuda=$CUDA_PATH \ 94 | --with-sanitizer=$SANITIZER_PATH --with-gpu-patch=$DIR/gpu-patch \ 95 | --with-redshow=$DIR/redshow --with-spack=$S 96 | make install -j16 97 | 98 | cd $SOURCE_DIR 99 | mkdir $DIR/bin 100 | mkdir $DIR/python 101 | cp ./bin/gvprof $DIR/bin/ 102 | cp -r ./python $DIR 103 | echo "python $DIR/python/gviewer.py "'${@:1}' > $DIR/bin/gviewer 104 | chmod +x $DIR/bin/gviewer 105 | 106 | echo "Install in $DIR/bin/gvprof" 107 | -------------------------------------------------------------------------------- /bin/install-debug: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SOURCE_DIR=$(pwd) 4 | DIR="" 5 | CUDA_PATH=/usr/local/cuda/ 6 | SANITIZER_PATH=$CUDA_PATH/compute-sanitizer 7 | 8 | if [ $# -eq 0 ]; then 9 | DIR=$(pwd)/gvprof 10 | else 11 | if [ $# -eq 1 ]; then 12 | DIR=$1 13 | else 14 | if [ $# -eq 2 ]; then 15 | DIR=$1 16 | CUDA_PATH=$2 17 | SANITIZER_PATH=$2/compute-sanitizer 18 | else 19 | if [ $# -eq 3 ]; then 20 | 
DIR=$1 21 | CUDA_PATH=$2 22 | SANITIZER_PATH=$3 23 | fi 24 | fi 25 | fi 26 | fi 27 | 28 | 29 | if [ -z "$DIR" ] 30 | then 31 | echo "Wrong paths" 32 | echo "./install " 33 | exit 34 | fi 35 | 36 | echo $DIR 37 | echo $CUDA_PATH 38 | echo $SANITIZER_PATH 39 | 40 | if [ ! -d $DIR ] 41 | then 42 | mkdir $DIR 43 | fi 44 | 45 | cd $DIR 46 | # Install spack 47 | git clone https://github.com/Lin-Mao/spack.git 48 | export SPACK_ROOT=$(pwd)/spack 49 | export PATH=${SPACK_ROOT}/bin:${PATH} 50 | source ${SPACK_ROOT}/share/spack/setup-env.sh 51 | 52 | # Install hpctoolkit dependencies 53 | spack install --only dependencies hpctoolkit ^dyninst@master ^binutils@2.34+libiberty~nls 54 | spack install libmonitor@master+dlopen+hpctoolkit 55 | spack install libunwind 56 | 57 | # Fix bug 58 | spack install mbedtls gotcha 59 | 60 | # Python version for torch monitor 61 | PY_VERSION=3.8 62 | spack install python@$PY_VERSION 63 | 64 | # Install gpu-patch 65 | cd $SOURCE_DIR 66 | make PREFIX=$DIR/gpu-patch SANITIZER_PATH=$SANITIZER_PATH CUDA_PATH=$CUDA_PATH install 67 | 68 | # Find spack and boost dir 69 | B=$(spack find --path boost | tail -n 1 | cut -d ' ' -f 3) 70 | S=${B%/*} 71 | UNWIND=$(spack find --path libunwind | tail -n 1 | cut -d ' ' -f 3) 72 | 73 | PY_DEV=$(spack find --path python@$PY_VERSION | tail -n 1 | cut -d ' ' -f 3) 74 | 75 | # Install torch monitor 76 | cd $SOURCE_DIR 77 | cd torch-monitor 78 | make PREFIX=$DIR/torch-monitor PYTHON_INCLUDE_DIR=$PY_DEV/include/python$PY_VERSION \ 79 | PYTHON_LIB_DIR=$PY_DEV/lib/python$PY_VERSION PYTHON_VERSION=$PY_VERSION \ 80 | TORCH_DIR=$PYTORCH_DIR DEBUG=1 install 81 | 82 | # Install redshow 83 | cd $SOURCE_DIR 84 | cd redshow 85 | make PREFIX=$DIR/redshow BOOST_DIR=$B LIBUNWIND_DIR=$UNWIND GPU_PATCH_DIR=$DIR/gpu-patch/ \ 86 | TORCH_MONITOR_DIR=$DIR/torch-monitor DEBUG=1 install 87 | 88 | # install hpctoolkit 89 | cd $SOURCE_DIR 90 | cd hpctoolkit 91 | mkdir build 92 | cd build 93 | ../configure --prefix=$DIR/hpctoolkit --with-cuda=$CUDA_PATH \ 94 | --with-sanitizer=$SANITIZER_PATH --with-gpu-patch=$DIR/gpu-patch \ 95 | --with-redshow=$DIR/redshow --with-spack=$S --enable-develop 96 | make install -j16 97 | 98 | cd $SOURCE_DIR 99 | mkdir $DIR/bin 100 | mkdir $DIR/python 101 | cp ./bin/gvprof $DIR/bin/ 102 | cp -r ./python $DIR 103 | echo "python $DIR/python/gviewer.py "'${@:1}' > $DIR/bin/gviewer 104 | chmod +x $DIR/bin/gviewer 105 | 106 | echo "Install in $DIR/bin/gvprof" 107 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | _build/* 2 | _static/* 3 | _templates/* 4 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/barracuda.md: -------------------------------------------------------------------------------- 1 | # BarraCUDA 2 | 3 | ## Introduction 4 | 5 | BarraCUDA is GPU-accelerated sequence mapping software. BarraCUDA's code and sample data are open source and available at [sourceforge](http://seqbarracuda.sourceforge.net/). BarraCUDA's [FAQ page](http://seqbarracuda.sourceforge.net/faqs.html) provides useful instructions for installing and running benchmarks. 6 | 7 | We study BarraCUDA *0.7.107h*, using the `Saccharomyces_cerevisiae.SGD1.01.50.dna_rm.toplevel.fa` sample data. 8 | 9 | ## Profiling 10 | 11 | The input we used runs for a short time, so we can profile it directly using the `gvprof` script. 12 | 13 | ```bash 14 | # prepare 15 | ./bin/barracuda index sample_data/Saccharomyces_cerevisiae.SGD1.01.50.dna_rm.toplevel.fa 16 | 17 | # data_flow 18 | gvprof -e data_flow ./bin/barracuda aln sample_data/Saccharomyces_cerevisiae.SGD1.01.50.dna_rm.toplevel.fa sample_data/sample_reads.fastq > quicktest.sai 19 | 20 | # value_pattern 21 | gvprof -e value_pattern -cfg ./bin/barracuda aln sample_data/Saccharomyces_cerevisiae.SGD1.01.50.dna_rm.toplevel.fa sample_data/sample_reads.fastq > quicktest.sai 22 | ``` 23 | 24 | ## Optimizations 25 | 26 | - *data_flow* - *redundant values* 27 | 28 | `barracuda.cu: 398`. In this function, the CUDA memory APIs called after *Line 440* are unnecessary when `number_of_sequences=0`. 29 | In that case, zero bytes are transferred between CPUs and GPUs, so the arrays retain the same values, yet the calls still incur API invocation cost. 30 | 31 | - *value_pattern* - *dense values* 32 | 33 | `cuda2.cuh: 865`. This line copies all the elements from a local array to a global array, regardless of their values. While the CPU's `memcpy` is fast for contiguous copies, the GPU's is not. We observe that this copy operation involves many zeros. Therefore, we can create a `hits` array to record which positions have been updated, then copy only the values at these positions. -------------------------------------------------------------------------------- /docs/castro.md: -------------------------------------------------------------------------------- 1 | # Castro 2 | 3 | ## Introduction 4 | 5 | Castro is an astrophysical radiation hydrodynamics simulation code based on the AMReX framework. 6 | 7 | We study Castro version `5e0a1b9cbc259f4dd17f5453ba59808b4da5c3ab`, 8 | and profile Castro's `Exec/hydro_tests/Sedov` example using its `inputs.2d.cyl_in_cartcoords` input. 9 | 10 | To compile Castro, we set the following variables in `GNUmakefile`: 11 | 12 | ```bash 13 | USE_CUDA=TRUE 14 | CUDA_ARCH=TRUE 15 | DIM=2 16 | USE_MPI=FALSE 17 | ``` 18 | 19 | ## Profiling 20 | 21 | For a small-scale run, we set `max_step=20` in `inputs.2d.cyl_in_cartcoords`. 22 | To generate the data flow graph for Castro, along with redundancy metrics, we can use the `gvprof` script directly. 23 | For other fine-grained metrics, we can use `gvprof` if GPU control flow graphs are not required. Otherwise, we recommend using HPCToolkit to perform step-by-step profiling. 24 | 25 | ## Optimization 26 | 27 | - *data_flow* - *redundant values* 28 | 29 | [`AMReX_Interp_2D_C.H: 344`](https://github.com/AMReX-Codes/amrex/blob/b7ddf2d2677fce63a567612978e01ced288dbda2/Src/AmrCore/AMReX_Interp_2D_C.H#L344). 
When Castro invokes `cellconslin_slopes_mmlim`, which is an internal function provided by AMReX, it performs `slope(i, j, n) *= a` for each output. 30 | With the `inputs.2d.cyl_in_cartcoords` input, somehow *a* is mostly 1.0. 31 | Thus, we can save one load and one store for each output if we perform `slope(i, j, n) *= a` only when *a* is not 1.0. 32 | Though this optimization does not achieve a significant speedup, it is worth exploring whether it also benefits other applications that use AMReX. -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'DrGPUM' 21 | copyright = '2023, University of California, Merced' 22 | author = 'Mao Lin' 23 | 24 | 25 | # -- General configuration --------------------------------------------------- 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be 28 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 29 | # ones. 30 | extensions = [ 31 | 'recommonmark' 32 | ] 33 | 34 | # Add any paths that contain templates here, relative to this directory. 35 | templates_path = ['_templates'] 36 | 37 | # List of patterns, relative to source directory, that match files and 38 | # directories to ignore when looking for source files. 39 | # This pattern also affects html_static_path and html_extra_path. 40 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 41 | 42 | 43 | # -- Options for HTML output ------------------------------------------------- 44 | 45 | # The theme to use for HTML and HTML Help pages. See the documentation for 46 | # a list of builtin themes. 47 | # 48 | html_theme = 'sphinx_rtd_theme' 49 | 50 | # Add any paths that contain custom static files (such as style sheets) here, 51 | # relative to this directory. They are copied after the builtin static files, 52 | # so a file named "default.css" will overwrite the builtin "default.css". 53 | html_static_path = [] 54 | 55 | source_suffix = { 56 | '.rst': 'restructuredtext', 57 | '.txt': 'markdown', 58 | '.md': 'markdown', 59 | } 60 | 61 | html_theme = "sphinx_rtd_theme" 62 | html_theme_path = ["_themes", ] 63 | -------------------------------------------------------------------------------- /docs/darknet.md: -------------------------------------------------------------------------------- 1 | # Darknet 2 | 3 | ## Introduction 4 | 5 | [Darknet](https://github.com/AlexeyAB/darknet) is an open source neural network framework written in C and CUDA. It is fast, easy to install, and supports CPU and GPU computation. 
6 | 7 | We check out Darknet version `312fd2e99a765949e468e18277d41f7992f08860`, study the `yolov4.cfg` and `yolov4-tiny.cfg` networks, and test an image `dog.jpg`. 8 | 9 | To compile darknet, we set the following knobs in the Makefile: 10 | 11 | ```bash 12 | GPU=1 13 | # append -lineinfo to the start of ARCH 14 | ARCH=-lineinfo ... 15 | # append -g to the start of CFLAGS 16 | CFLAGS=-g ... 17 | ``` 18 | 19 | ## Profiling 20 | 21 | For the data flow analysis, one can use gvprof to profile darknet directly. `-ck HPCRUN_SANITIZER_READ_TRACE_IGNORE=1` yields a significant speedup. 22 | 23 | For the value pattern analysis, we recommend using a whitelist to specify interesting GPU kernels and turning on block sampling and kernel sampling. 24 | In addition, if control-flow-graph-based analysis is desired, we don't recommend using `gvprof -cfg` directly because Darknet uses cuBLAS and cuDNN, which trigger hundreds of large binary loads at runtime. 25 | In fact, darknet's data type is almost uniform across all kernels, so one can gain insights even without `-cfg`. 26 | 27 | We can profile the fine-grained patterns of darknet using 28 | 29 | ```bash 30 | gvprof -e value_pattern@10 -ck HPCRUN_SANITIZER_WHITELIST=./whitelist -ck HPCRUN_SANITIZER_KERNEL_SAMPLING_FREQUENCY=20 <app> <app args> 31 | ``` 32 | 33 | In the `whitelist` file, we specify the following three kernels: 34 | 35 | ``` 36 | _Z15add_bias_kernelPfS_iiii 37 | _Z21im2col_gpu_kernel_extiPKfiiiiiiiiiiiiPf 38 | _Z26activate_array_mish_kernelPfiS_S_ 39 | ``` 40 | 41 | Other than a few kernels with dense value patterns when approximation is used, we didn't find other interesting patterns. 42 | 43 | **You may want to look up the real kernel names with `gvprof -v` or `readelf -s`, since compilers may generate different names.** 44 | 45 | ## Optimization 46 | 47 | - *data_flow* - *redundant values* 48 | 49 | `upsampling_layer.c: 91` and `convolution_kernels.cu: 559`. In the generated data flow graph, we found that the nodes annotated with the `fill_ongpu` kernel always have redundant accesses. 50 | Because we run the inference mode only, the arrays are initialized with zeros and then filled with zeros again. 51 | To optimize this, we can set up a flag for each array to indicate whether it is "clean"; a "clean" array shouldn't be filled with zeros again. -------------------------------------------------------------------------------- /docs/deepwave.md: -------------------------------------------------------------------------------- 1 | # Deepwave 2 | 3 | ## Introduction 4 | 5 | [Deepwave](https://github.com/ar4/deepwave) is wave propagation software implemented on top of PyTorch. 6 | 7 | We study deepwave version `1154692258da342accd21df02f7fa9ddd008f75f`. The input for deepwave is included in DrGPUM's samples. 8 | 9 | We first add `-lineinfo -g` to the `_make_cuda_extension` function in `setup.py`, and then add `-g` to the `_make_cpp_extension` function. Next we use `pip install .` to install deepwave. 10 | 11 | **Note that this pip is supposed to be the pip installed by conda, as we use conda across all the Python samples.** 12 | 13 | To run the deepwave example in DrGPUM, we need to install matplotlib via `conda install matplotlib`. 14 | 15 | ## Profiling 16 | 17 | Currently, using gvprof to profile Python applications is intricate. We use HPCToolkit to profile and analyze deepwave separately. Please refer to the [FAQ](https://gvprof.readthedocs.io/en/latest/faq.html) page for the complete guide. 18 | 19 | With the default configuration, this example takes a relatively long time. 
20 | We can change `num_epochs` to 1 and let it break after finishing the first batch. 21 | This deepwave application introduces higher overhead (150-200x) than other applications (~20x) because its kernels access millions of memory addresses with lots of gaps. 22 | As a result, we are not able to merge all of the memory access ranges on the GPU. 23 | We then spend a long time both copying memory addresses from the GPU to the host and updating host memory. 24 | 25 | For value pattern profiling, we monitor the most expensive `propagate` kernel using the following options. 26 | 27 | ```bash 28 | LD_LIBRARY_PATH=/path/to/python/install/lib/python/site-packages/torch:$LD_LIBRARY_PATH hpcrun -e gpu=nvidia,value_pattern@10000 -ck HPCRUN_SANITIZER_WHITELIST=./whitelist -ck HPCRUN_SANITIZER_KERNEL_SAMPLING_FREQUENCY=100000 python ./Deepwave_SEAM_example1.py 29 | ``` 30 | 31 | For data flow profiling, we turn on these knobs to accelerate the profiling process. 32 | 33 | ```bash 34 | LD_LIBRARY_PATH=/path/to/python/install/lib/python/site-packages/torch:$LD_LIBRARY_PATH hpcrun -e gpu=nvidia,data_flow -ck HPCRUN_SANITIZER_READ_TRACE_IGNORE=1 -ck HPCRUN_SANITIZER_DATA_FLOW_HASH=0 -ck HPCRUN_SANITIZER_GPU_ANALYSIS_BLOCKS=1 -ck HPCRUN_SANITIZER_GPU_PATCH_RECORD_NUM=131072 python ./Deepwave_SEAM_example1.py 35 | 36 | # this gives you additional speedup 37 | # export OMP_NUM_THREADS=16 38 | ``` 39 | 40 | More information about accelerating data flow and value pattern profiling can be found on the [FAQ](https://gvprof.readthedocs.io/en/latest/faq.html) page. 41 | 42 | ## Optimization 43 | 44 | Please refer to the `replication_pad3d` issue in [PyTorch](https://gvprof.readthedocs.io/en/latest/faq.html). -------------------------------------------------------------------------------- /docs/faq.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | 3 | ## Profile Python applications 4 | 5 | Please first refer to the MANUAL page for step-by-step profiling using HPCToolkit. 6 | 7 | In addition to the basic commands there, we also have to pay attention to a few minor issues. 8 | 9 | In the measurement stage, `LD_LIBRARY_PATH=/path/to/python/library/:$LD_LIBRARY_PATH` may be needed as a prefix before *hpcrun*. We have a detailed example for [profiling PyTorch](https://gvprof.readthedocs.io/en/latest/deepwave.html). 10 | 11 | Then, after getting the measurement data and GPU binaries, we analyze the CPU binaries to get the necessary line information. 12 | For GPU binaries, we use *hpcstruct --gpucfg no* on the measurement directory as suggested by the manual. 13 | For CPU binaries, the *python* binary does not contain all the program structure we need to understand program contexts. 14 | Instead, we have to analyze the binaries loaded dynamically at runtime. 15 | A Python application may load hundreds of libraries at runtime but not use all of them. 16 | Therefore, in order to run hpcstruct on a minimum set of binaries but still extract enough information to understand program contexts, we adopt a *test-and-analyze* strategy. 17 | Using this strategy, we first try hpcprof to correlate performance data with line maps; if hpcprof hangs because of a large line map in some binary, we kill hpcprof and run hpcstruct on that binary, whose binary analysis is fine-grained and faster than hpcprof's. 18 | 19 | When hpcprof begins analyzing a binary, it prints a message like the one below. 
In such a case, we can kill hpcprof, remove the temporary database, and use `hpcstruct` to analyze `libtorch_python.so`. 21 | 22 | ```bash 23 | msg: Begin analyzing : /path/to/python/lib/python3.8/site-packages/torch/lib/libtorch_python.so 24 | ``` 25 | 26 | ## Accelerate data flow profiling 27 | 28 | The following three knobs are helpful for accelerating the profiling of applications with many kernels. With all the options turned on, the expected end-to-end overhead of DrGPUM is approximately 20x, while the overhead could be over 1200x without these knobs. 29 | 30 | ```bash 31 | HPCRUN_SANITIZER_READ_TRACE_IGNORE=<0|1> 32 | HPCRUN_SANITIZER_DATA_FLOW_HASH=<0|1> 33 | HPCRUN_SANITIZER_GPU_ANALYSIS_BLOCKS=<blocks> 34 | ``` 35 | 36 | Note that these knobs can disable the generation of some information. 37 | 38 | When GPU analysis is enabled, one can enlarge the GPU-side buffer by adjusting the number of records to further reduce overhead. 39 | 40 | ```bash 41 | HPCRUN_SANITIZER_GPU_PATCH_RECORD_NUM=<records> 42 | ``` 43 | 44 | ## Accelerate value pattern profiling 45 | 46 | The following knobs are helpful for profiling the value patterns of specific kernels, focusing on just a few kernel instances. 47 | 48 | Besides, one can also apply `@N` to activate block sampling, which profiles a random GPU block out of every *N* blocks. 49 | 50 | ```bash 51 | HPCRUN_SANITIZER_KERNEL_SAMPLING_FREQUENCY=<frequency> 52 | HPCRUN_SANITIZER_WHITELIST=<file> 53 | HPCRUN_SANITIZER_BLACKLIST=<file> 54 | ``` -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. DrGPUM documentation master file, created by 2 | sphinx-quickstart on Sun Mar 21 02:21:40 2021. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | DrGPUM: A Memory Profiler for GPUs 7 | ================================== 8 | 9 | DrGPUM is an advanced memory profiler that locates memory inefficiencies in GPU-accelerated applications. 10 | DrGPUM's code is available on `Github <https://github.com/Lin-Mao/DrGPUM>`_. 11 | 12 | .. toctree:: 13 | :maxdepth: 3 14 | :caption: DrGPUM Basics 15 | 16 | preface 17 | install 18 | manual 19 | faq 20 | 21 | .. toctree:: 22 | :maxdepth: 1 23 | :caption: DrGPUM Development 24 | 25 | workflow 26 | roadmap 27 | 28 | .. toctree:: 29 | :maxdepth: 1 30 | :caption: Components 31 | 32 | hpctoolkit 33 | redshow 34 | gviewer 35 | 36 | .. toctree:: 37 | :maxdepth: 3 38 | :caption: DrGPUM Samples 39 | 40 | unit_tests 41 | rodinia 42 | qmcpack 43 | castro 44 | deepwave 45 | darknet 46 | pytorch 47 | namd 48 | barracuda 49 | 50 | Indices and Tables 51 | ================== 52 | 53 | * :ref:`genindex` 54 | * :ref:`search` 55 | -------------------------------------------------------------------------------- /docs/install.md: -------------------------------------------------------------------------------- 1 | # Install 2 | 3 | The documentation includes detailed instructions for every package required by DrGPUM. One can use `./bin/install` to install all these packages at once. 4 | 5 | The install script accepts three arguments in order: 6 | 7 | ```bash 8 | # Specify PyTorch dir 9 | export PYTORCH_DIR=/path_to_pytorch/torch 10 | 11 | ./bin/install <install_dir> <cuda_path> <sanitizer_path> 12 | # default values 13 | # <install_dir>=`pwd`/gvprof 14 | # <cuda_path>=/usr/local/cuda 15 | # <sanitizer_path>=<cuda_path>/compute-sanitizer 16 | ``` 17 | 18 | Before you install, make sure all the CUDA-related paths (e.g., `LD_LIBRARY_PATH`) are set up. 
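For reference, a minimal environment setup before running `./bin/install` might look like the following sketch (the CUDA location here is an assumption; adjust the paths to your system):

```bash
# Hypothetical CUDA install location -- point these at your actual toolkit
export CUDA_HOME=/usr/local/cuda
export PATH=$CUDA_HOME/bin:$PATH
export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH
```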
19 | 20 | ## GPU Patch 21 | 22 | If the CUDA toolkit is installed somewhere else, you need to change the value of `SANITIZER_PATH`. 23 | 24 | ```bash 25 | git clone --recursive https://github.com/Lin-Mao/DrGPUM.git 26 | cd DrGPUM 27 | make PREFIX=/path/to/gpu-patch/installation SANITIZER_PATH=/usr/local/cuda/compute-sanitizer/ install 28 | ``` 29 | ## Dependencies 30 | 31 | - spack 32 | 33 | ```bash 34 | git clone https://github.com/spack/spack.git 35 | export SPACK_ROOT=/path/to/spack 36 | source ${SPACK_ROOT}/share/spack/setup-env.sh 37 | ``` 38 | - required packages 39 | 40 | ```bash 41 | # Install hpctoolkit dependencies 42 | spack install --only dependencies hpctoolkit ^dyninst@master ^binutils@2.34+libiberty~nls 43 | spack install libmonitor@master+dlopen+hpctoolkit 44 | spack install libunwind 45 | 46 | spack install mbedtls gotcha 47 | 48 | # Python version for torch monitor 49 | PY_VERSION=3.8 50 | spack install python@$PY_VERSION 51 | ``` 52 | 53 | ## Redshow 54 | 55 | ```bash 56 | cd redshow 57 | # Tip: get the boost library path with 'spack find --path' and append include to that path 58 | make install -j8 PREFIX=/path/to/redshow/installation BOOST_DIR=/path/to/boost/installation GPU_PATCH_DIR=/path/to/gpu-patch/installation 59 | # Useful options: 60 | # make DEBUG=1 61 | # make OPENMP=1 62 | ``` 63 | 64 | ## HPCToolkit 65 | 66 | - profiling substrates 67 | 68 | ```bash 69 | cd /path/to/hpctoolkit 70 | mkdir build && cd build 71 | # Tip: check the spack libraries' root with 'spack find --path'. 72 | # For example: --with-spack=/home/username/spack/opt/spack/linux-ubuntu18.04-zen/gcc-7.4.0/ 73 | ../configure --prefix=/path/to/hpctoolkit/installation --with-cuda=/usr/local/cuda-11.0 --with-sanitizer=/path/to/sanitizer --with-gpu-patch=/path/to/gpu-patch/installation --with-redshow=/path/to/redshow/installation --with-spack=/path/to/spack/libraries/root 74 | make install -j8 75 | ``` 76 | 77 | - hpcviewer (optional) 78 | 79 | [http://hpctoolkit.org/download/hpcviewer/](http://hpctoolkit.org/download/hpcviewer/) 80 | 81 | ## Setup and Test 82 | 83 | Add the following lines to your `.bashrc` file and source it. 84 | 85 | ```bash 86 | export PATH=/path/to/hpctoolkit/install/bin/:$PATH 87 | export PATH=/path/to/DrGPUM/install/bin/:$PATH 88 | export PATH=/path/to/redshow/install/bin/:$PATH 89 | ``` 90 | 91 | Test whether gvprof works. 92 | 93 | ```bash 94 | cd ./samples/vectorAdd.f32 95 | make 96 | gvprof -v -e redundancy ./vectorAdd 97 | hpcviewer gvprof-database 98 | ``` -------------------------------------------------------------------------------- /docs/lammps.md: -------------------------------------------------------------------------------- 1 | # Lammps 2 | 3 | ## Introduction 4 | 5 | We check out Lammps version `69d41dc16cd3272da8e768414d972b32a36803c1` and test the input `lammps/bench/in.lj`. 6 | 7 | To compile lammps, we first edit `lammps/lib/kokkos/bin/nvcc_wrapper:37` to append `-lineinfo` to `cuda_args`. Then we create a build directory under lammps and use the following commands to compile it. 
8 | 9 | ``` 10 | cmake -DPKG_KOKKOS=ON -D Kokkos_ENABLE_CUDA=yes -D Kokkos_ENABLE_OPENMP=yes -D CMAKE_CXX_COMPILER=`pwd`/../lib/kokkos/bin/nvcc_wrapper ../cmake 11 | ../build/lmp -k on g 1 -sf kk -in in.lj 12 | ``` -------------------------------------------------------------------------------- /docs/manual.md: -------------------------------------------------------------------------------- 1 | # Manual 2 | 3 | ## Compile with Line Information 4 | 5 | DrGPUM relies on debug information in binaries to attribute fine-grained value metrics to individual lines, loops, and functions. 6 | 7 | For GPU binaries, we recommend using `-O3 -lineinfo`. 8 | 9 | For CPU binaries, we recommend using `-O3 -g`. 10 | 11 | For software compiled with the CMake build system, we can usually edit `CMAKE_C_FLAGS` and `CMAKE_CXX_FLAGS` to add line info flags. Additionally, CUDA line info can be added through `CMAKE_CUDA_FLAGS`. 12 | 13 | ## Profile Using DrGPUM 14 | 15 | The `gvprof` script automates a series of profiling and analysis processes, but supports only basic profiling features. For detailed profiling control, please refer to the next section. 16 | 17 | ```bash 18 | gvprof -h 19 | # Currently we offer three modes 20 | # gvprof -v is your friend for debugging 21 | gvprof -e <event> <app> <app args> 22 | ``` 23 | 24 | ## Profile Using HPCToolkit 25 | 26 | Using HPCToolkit to profile applications enables fine-grained control knobs, selective analysis of GPU/CPU binaries, and compatibility with various launchers (e.g., jsrun). 27 | We invoke `hpcrun` to profile an application twice using the same input. 28 | In the first pass, we dump the cubins loaded at runtime and profile each kernel's running time. Then we invoke `hpcstruct` to analyze program structure and instruction dependencies. 29 | In the second pass, we instrument the cubins and invoke the `redshow` redundancy analysis library to analyze the measurement data. 30 | 31 | 32 | ### First pass 33 | 34 | ```bash 35 | hpcrun -e gpu=nvidia <app> <app args> 36 | hpcstruct <app> 37 | # if '--gpucfg yes', hpcstruct will analyze the control flow graph of each GPU function and perform backward slicing, which is costly for large GPU binaries. 38 | hpcstruct --gpucfg no hpctoolkit-<app>-measurements 39 | # One can also use hpcstruct on selected GPU binaries only 40 | hpcstruct --gpucfg no <gpu binary> 41 | ``` 42 | 43 | ### Second pass 44 | 45 | ```bash 46 | # Before profiling, we remove all profile data dumped in the first pass 47 | rm -rf hpctoolkit-<app>-measurements/*.hpcrun 48 | 49 | hpcrun -e gpu=nvidia,<event> -ck <knob> -ck <knob> ... <app> <app args> 50 | hpcprof -S <app>.hpcstruct hpctoolkit-<app>-measurements 51 | # If only some binaries are analyzed using hpcstruct, 52 | # one has to supply the corresponding binaries' structure files 53 | hpcprof -S <binary1>.hpcstruct -S <binary2>.hpcstruct hpctoolkit-<app>-measurements 54 | ``` 55 | 56 | ### HPCToolkit separate pass 57 | 58 | Large-scale applications such as Castro heavily use lambda functions and template functions for GPU kernels. Therefore, tools like `nsys` and `ncu` cannot efficiently correlate each kernel's execution time with its name. Even though NVTX can provide some information to locate kernels, it is still not straightforward to map metrics back to source lines. Instead, we recommend using HPCToolkit, which provides an integrated calling context spanning CPUs and GPUs, to look up the calling context and running time of each kernel. The following commands can be used. 
59 | 60 | ```bash 61 | hpcrun -e gpu=nvidia,pc <app> <app args> 62 | hpcstruct <app> 63 | hpcstruct --gpucfg no hpctoolkit-<app>-measurements 64 | hpcprof -S <app>.hpcstruct hpctoolkit-<app>-measurements 65 | hpcviewer hpctoolkit-<app>-database 66 | ``` 67 | 68 | ## Control Knobs 69 | 70 | The following fine-grained options can be passed to either gvprof or hpcrun by joining the option name and option value as `-ck <option>=<value>`. 71 | 72 | ```bash 73 | HPCRUN_SANITIZER_GPU_PATCH_RECORD_NUM=<records> 74 | HPCRUN_SANITIZER_BUFFER_POOL_SIZE=<size> 75 | HPCRUN_SANITIZER_APPROX_LEVEL=<level> 76 | HPCRUN_SANITIZER_PC_VIEWS=<views> 77 | HPCRUN_SANITIZER_MEM_VIEWS=<views> 78 | HPCRUN_SANITIZER_DEFAULT_TYPE=<type> 79 | HPCRUN_SANITIZER_KERNEL_SAMPLING_FREQUENCY=<frequency> 80 | HPCRUN_SANITIZER_WHITELIST=<file> 81 | HPCRUN_SANITIZER_BLACKLIST=<file> 82 | HPCRUN_SANITIZER_READ_TRACE_IGNORE=<0|1> 83 | HPCRUN_SANITIZER_DATA_FLOW_HASH=<0|1> 84 | HPCRUN_SANITIZER_LIVENESS_ONGPU=<0|1> 85 | HPCRUN_SANITIZER_TORCH_ANALYSIS=<0|1> 86 | HPCRUN_SANITIZER_TORCH_ANALYSIS_ONGPU=<0|1> 87 | HPCRUN_SANITIZER_GPU_ANALYSIS_BLOCKS=<blocks> 88 | ``` 89 | 90 | ### Calling context view 91 | 92 | Only the CPU calling context is available now. 93 | The GPU calling context is under development. 94 | 95 | ```bash 96 | hpcviewer <database> 97 | ``` 98 | 99 | ### Data flow view 100 | 101 | ```bash 102 | gviewer -f <measurements>/data_flow.dot.context -cf file -p 103 | # gviewer -h for detailed options 104 | ``` 105 | The generated .svg can be visualized directly. To enable interactive control, we can rename the file to `demo.svg` and move it to `jquery.graphviz.svg`. After launching a server locally, we can visualize the graph, zoom in on important parts, and track each node's data flows. 106 | 107 | ### Fine-grained pattern views 108 | 109 | ```bash 110 | # value pattern 111 | less <measurements>/value_pattern_t.csv 112 | 113 | # redundancy 114 | less <measurements>/temporal_read_t.csv 115 | less <measurements>/temporal_write_t.csv 116 | less <measurements>/spatial_read_t.csv 117 | less <measurements>/spatial_write_t.csv 118 | ``` 119 | 120 | ## Example 121 | 122 | 123 | -------------------------------------------------------------------------------- /docs/namd.md: -------------------------------------------------------------------------------- 1 | # NAMD 2 | 3 | ## Introduction 4 | 5 | [NAMD](https://www.ks.uiuc.edu/Research/namd/) is a parallel molecular dynamics code designed for high-performance simulation of large biomolecular systems. NAMD uses the popular molecular graphics program VMD for simulation setup and trajectory analysis. 6 | 7 | We download the NAMD source code from its [official website](https://www.ks.uiuc.edu/Development/Download/download.cgi). We use NAMD version `4a41c6087f69c4cfe3edfdb19c6a5780ac20f5f1` and study the `alanin` input. 8 | 9 | The following flags are set up in `Make.config`: 10 | 11 | ``` 12 | CUDAGENCODE = -arch <arch> -g -lineinfo 13 | CXX_OPTS = -g -O3 14 | ``` 15 | 16 | ## Profiling 17 | 18 | For data flow profiling, we use the normal gvprof script with the `-ck HPCRUN_SANITIZER_READ_TRACE_IGNORE=1` option. 19 | 20 | For value pattern profiling, we monitor the most costly `nonbondedForceKernel` kernel of namd. Note that because this function accesses many arrays with different value types, we need the GPU control flow graph and backward slicing to derive the type of each array. 21 | For your reference, we use the command 22 | ``` 23 | gvprof -cfg -j 16 -e value_pattern -ck HPCRUN_SANITIZER_WHITELIST=./whitelist -ck HPCRUN_SANITIZER_KERNEL_SAMPLING_FREQUENCY=10 <app> <app args> 24 | ``` 25 | The CFG analysis phase could take up to an hour, consuming about **100GB** of main memory. 
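Since the whitelist matches full mangled symbol names, one way to recover them is the sketch below (it assumes the kernel symbol is visible in your NAMD binary; the binary name here is an example):

```bash
# List the symbols and search for the kernel of interest; substitute your
# actual NAMD binary, or a cubin dumped into the measurements directory.
readelf -sW ./namd2 | grep nonbondedForceKernel
```

Alternatively, `gvprof -v` logs the kernel names observed at runtime.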
29 | ## Optimization 30 | 31 | - *data_flow* - *redundant values* 32 | 33 | We find that the *submitHalf* kernels are repeatedly invoked, forming an interesting pattern in the data flow diagram. 34 | Looking carefully into the code, we find that the redundancy is introduced on purpose. 35 | The authors of NAMD pay close attention to its performance. They allocate some variables on the device to accumulate global sums and transfer these values back to the host only in the last block of the kernel. Moreover, at the end of these kernels, they reset the values to zero to make sure the buffers are clean the next time. 36 | 37 | You may wonder why they are doing this. There are two reasons: 38 | 39 | 1. If the variables are not cleared on the device, we have to reset them using either `cudaMemsetAsync` or implicit device-host communication, which triggers extra cost. In contrast, setting the variables directly in a GPU kernel can hide this latency by overlapping memory latency with computation, without an additional API invocation. 40 | 41 | 2. If the host variables were accessed every time, these kernels would be slowed down significantly. 42 | 43 | - *value_pattern* - *type overuse* 44 | 45 | `CudaComputeNonbondedKernel.cu: 579`. By profiling the value patterns of the `CudaComputeNonbondedKernel` kernel, we find that this array's type is overused. We can use `uint8_t` to replace the original `int` data type. -------------------------------------------------------------------------------- /docs/preface.md: -------------------------------------------------------------------------------- 1 | # Preface 2 | 3 | DrGPUM is a memory profiling tool for applications running on GPU clusters, with advanced features for value-based profiling and analysis. 4 | 5 | The following diagram describes how the components communicate with each other. 6 | 7 | ``` 8 | 9 | ------------- --------------------- ------------------------ ************************************* 10 | | GPU Patch | <-> | Profiling Runtime | <-> | Measurement Analysis | -> ** Program Analyzer and Aggregator ** -> Performance Reports 11 | ------------- --------------------- ------------------------ ************************************* 12 | | /|\ 13 | |--------------------------------------------------------------------| 14 | 15 | ``` 16 | 17 | ## HPCToolkit (Profiling Runtime) 18 | 19 | [*HPCToolkit*](http://hpctoolkit.org/) is a powerful profiling tool that measures application performance on the world's largest supercomputers. 20 | DrGPUM customizes HPCToolkit and uses it as the default profiling runtime. 21 | Currently, we are developing on top of HPCToolkit's [*sanitizer*](https://github.com/Lin-Mao/hpctoolkit) version. 22 | 23 | ## Redshow 24 | 25 | [*Redshow*](https://github.com/Lin-Mao/redshow) is a postmortem metrics analysis substrate. 26 | It receives data from the profiling runtime, performs the analyses enabled by the user, and stores the analysis results on disk. 27 | In addition, redshow maintains information about the data objects allocated at runtime. 28 | Redshow also contains binary analysis modules that map virtual addresses to function indices and symbol names and analyze GPU instruction characteristics. 29 | 30 | ## GPU Patch 31 | 32 | *GPU Patch* includes several implementations of instrumentation callbacks and a GPU-CPU communication system. 33 | It can collect GPU memory metrics, block enter/exit records, and GPU call/ret records (under development). 34 | The collected data are stored in a GPU buffer. 35 | The profiling runtime observes a signal once the GPU buffer is full and copies the data from the GPU to the CPU. 36 |
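This handshake is easiest to see from the `full` flag in `gpu_patch_buffer_t` (defined in `include/gpu-patch.h` and driven from `include/gpu-queue.h`, both shown later in this document). Below is a simplified host-side sketch of the protocol; it is an illustration only, not the actual HPCToolkit runtime code, and `copy_records_to_cpu` is a hypothetical helper.

```cpp
// Simplified sketch of the CPU side of the flush handshake (illustration only).
// GPU warps set buffer->full = 1 when the record buffer fills up (see
// gpu_queue_get in include/gpu-queue.h) and spin until the host clears it.
void monitor_gpu_buffer(volatile gpu_patch_buffer_t *buffer) {
  // num_threads == 0 marks kernel completion (see include/gpu-patch.h).
  while (buffer->num_threads != 0) {
    if (buffer->full == 1) {
      copy_records_to_cpu(buffer);  // hypothetical helper: drain the records
      buffer->full = 0;             // release the spinning GPU warps
    }
  }
}
```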
37 | ## Program Analyzer and Aggregator 38 | 39 | Some high-level performance metrics are output directly to the performance reports. 40 | Low-level detailed performance metrics are associated with individual functions and lines. 41 | Therefore, we analyze program structure to attribute these metrics. 42 | Moreover, when analyzing applications running on multiple nodes, we can aggregate the performance data to compute overall metrics that represent the entire execution. 43 | -------------------------------------------------------------------------------- /docs/pytorch.md: -------------------------------------------------------------------------------- 1 | # PyTorch 2 | 3 | ## Introduction 4 | 5 | [PyTorch](https://pytorch.org/) is a popular machine learning framework. 6 | 7 | We use PyTorch version `f5788898a928cb2489926c1a5418c94c598c361b`. We study the `resnet50`, `bert`, and `deepwave` models. 8 | 9 | We apply the following commands to compile PyTorch from source. 10 | 11 | ```bash 12 | spack install miniconda3 13 | 14 | conda install numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing_extensions future six requests dataclasses 15 | 16 | conda install -c pytorch magma-cuda110 17 | 18 | export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} 19 | export USE_CUDA=1 20 | export REL_WITH_DEB_INFO=1 21 | export MAX_JOBS=16 22 | export USE_NINJA=OFF 23 | python setup.py install 24 | ``` 25 | 26 | - *resnet* 27 | 28 | We get the `resnet` example from the [pytorch benchmark](https://github.com/pytorch/benchmark/tree/master/torchbenchmark/models/resnet50) repo. 29 | 30 | To ease the installation, we provide `1-spatial-convolution-model.py` and `1-spatial-convolution-unit.py` to check layer-wise and end-to-end performance. 31 | 32 | - *deepwave* 33 | 34 | We provide the instructions for installing deepwave here. 35 | 36 | To ease checking the problematic kernel, we provide the `2-replication-pad3d.py` script, which only has a single `ReplicationPad3d` kernel. 37 | 38 | - *bert* 39 | 40 | We get the `bert` example from the [pytorch benchmark](https://github.com/pytorch/benchmark/tree/master/torchbenchmark/models/resnet50). 41 | 42 | To ease checking the problematic kernel, we provide the `3-embedding-unit.py` script, which only has a single `Embedding` kernel (a minimal sketch appears at the end of this page). 43 | 44 | ## Profiling 45 | 46 | Profiling a Python application requires extra steps compared with a normal application. We have a general guide for profiling such applications on the [FAQ](https://gvprof.readthedocs.io/en/latest/faq.html) page. 47 | 48 | An example profiling command is attached below for reference: 49 | 50 | ```bash 51 | LD_LIBRARY_PATH=/path/to/python/install/lib/python/site-packages/torch:$LD_LIBRARY_PATH hpcrun -e gpu=nvidia,data_flow -ck HPCRUN_SANITIZER_READ_TRACE_IGNORE=1 -ck HPCRUN_SANITIZER_DATA_FLOW_HASH=0 -ck HPCRUN_SANITIZER_GPU_ANALYSIS_BLOCKS=1 -ck HPCRUN_SANITIZER_GPU_PATCH_RECORD_NUM=131072 python ./<script>.py 52 | ``` 53 | 54 | ## Optimization 55 | 56 | We don't provide an automated performance testing suite for PyTorch in DrGPUM because recompiling PyTorch for just small code changes still takes a long time and is painful on low-end servers. 57 | 58 | - *data_flow* - *redundant values* 59 | 60 | Please refer to this [issue](https://github.com/pytorch/pytorch/issues/48539) 61 | 62 | - *data_flow* - *redundant values* - *value_pattern* - *redundant zeros* 63 | 64 | Please refer to these two issues: [issue1](https://github.com/pytorch/pytorch/issues/48889) and [issue2](https://github.com/pytorch/pytorch/issues/49663)
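For reference, a unit script in the spirit of `3-embedding-unit.py` can be as small as the sketch below (a hypothetical reconstruction; the real script's vocabulary size, shapes, and iteration count may differ):

```python
import torch

# Hypothetical reconstruction of an Embedding-only unit test.
embedding = torch.nn.Embedding(num_embeddings=30522, embedding_dim=768).cuda()
indices = torch.randint(0, 30522, (8, 128), device='cuda')

for _ in range(10):
    out = embedding(indices)  # each call launches the single Embedding kernel
torch.cuda.synchronize()
```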
-------------------------------------------------------------------------------- /docs/qmcpack.md: -------------------------------------------------------------------------------- 1 | # QMCPACK 2 | 3 | ## Introduction 4 | 5 | [QMCPACK](https://github.com/QMCPACK/qmcpack) is an open-source, production-level many-body ab initio Quantum Monte Carlo code for computing the electronic structure of atoms, molecules, and solids. 6 | 7 | We study QMCPACK version `474062068a9f6348dbf7d55be7d1bd375c24f1fe`. 8 | 9 | A number of packages are required to compile QMCPACK, including clang, OpenMP (offloading), HDF5, FFTW, and Boost. These packages can be installed directly via Spack. 10 | 11 | To compile QMCPACK, we pass the following variables to cmake: 12 | 13 | ```bash 14 | CMAKE_C_COMPILER=mpicc 15 | CMAKE_CXX_COMPILER=mpicxx 16 | ENABLE_OFFLOAD=ON 17 | USE_OBJECT_TARGET=ON 18 | OFFLOAD_ARCH=<arch> 19 | ENABLE_CUDA=1 20 | CUDA_ARCH=<arch> 21 | CUDA_HOST_COMPILER=`which gcc` 22 | QMC_DATA=<path> 23 | ENABLE_TIMERS=1 24 | ``` 25 | 26 | The following environment variables are also required: 27 | 28 | ```bash 29 | export OMPI_CC=clang 30 | export OMPI_CXX=clang++ 31 | ``` 32 | 33 | ## Profiling 34 | 35 | First follow the instructions in `tests/performance/NiO/README` to enable and run the NiO tests. The configuration file used is `NiO-fcc-S1-dmc.xml` under the `batched_driver` folder. 36 | 37 | At runtime, we use four worker threads (`export OMP_NUM_THREADS=4`). For a small-scale run, one can adjust control variables such as `warmupSteps` to reduce the execution time. 38 | 39 | The data flow pattern can be profiled directly using gvprof. For the value pattern mode, one has to find the names of the functions of interest and use gvprof's whitelist to focus on them. 40 | 41 | ## Optimization 42 | 43 | - *data_flow* - *redundant values* 44 | 45 | [`MatrixDelayedUpdateCUDA.h: 627`](https://github.com/QMCPACK/qmcpack/blob/5c4776b747fefef0146765379461c6593108cf11/src/QMCWaveFunctions/Fermion/MatrixDelayedUpdateCUDA.h#L627). This line often copies the same base pointers to the arrays on the GPU. Though this is not a performance bottleneck for the current workload, it might deserve attention once the number of arrays increases. -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | recommonmark 2 | sphinx-rtd-theme -------------------------------------------------------------------------------- /docs/roadmap.md: -------------------------------------------------------------------------------- 1 | # Roadmap 2 | 3 | This document describes upcoming features and release plans for DrGPUM. Since DrGPUM is a growing project, it has many components that need fixes and enhancements. Suggestions and feature requests are welcome. Users can post questions on GitHub's [discussion forum](https://github.com/Lin-Mao/DrGPUM/issues). 4 | 5 | ## Release v2.2 6 | 7 | We plan to release *v2.2* around Fall 2021; this release will focus on enhancing the stability and compatibility of DrGPUM.
Also, a few new features, such as customized memory allocator support and more accessible function filters, are planned to be integrated. 8 | 9 | 10 | - Features 11 | 12 | - NVTX 13 | 14 | Register CUPTI's NVTX callback to monitor customized memory allocators. 15 | 16 | - CUDA Memory Pool 17 | 18 | Support memory pool allocators in CUDA 11.2 19 | 20 | - Bug Fixes 21 | 22 | - Function Filters 23 | 24 | Support substring match in whitelist and blacklist 25 | 26 | - Value Pattern Output 27 | 28 | Sort output arrays based on their access counts and fix incorrect numbers 29 | 30 | - Deployment and Test 31 | 32 | - CMake 33 | 34 | Add CMake configurations to DrGPUM in addition to the Makefile 35 | 36 | - Unittest 37 | 38 | Adopt the Python unittest package 39 | 40 | - Test configurations 41 | 42 | Adopt yaml files to configure test cases 43 | 44 | ## Pending Issues 45 | 46 | We haven't decided when to solve the following issues. 47 | 48 | - GViewer Website 49 | 50 | Launch a website to visualize data flow graphs. 51 | 52 | - Fine grain pattern and data flow integration 53 | 54 | Use the website described above to show both fine-grained patterns and data flow. 55 | 56 | - HPCToolkit Merge 57 | 58 | Merge the latest HPCToolkit master into DrGPUM. 59 | -------------------------------------------------------------------------------- /docs/rodinia.md: -------------------------------------------------------------------------------- 1 | # Rodinia GPU Benchmark 2 | 3 | ## backprop 4 | 5 | - vp-opt1: *value_pattern* - *redundant zeros* 6 | 7 | [`backprop_cuda_kernel.cu: 81`](https://github.com/FindHao/DrGPUM-samples/blob/110a7cdb0d57f5902941deb59899e6266f79844e/backprop/backprop_cuda_kernel.cu#L81). The *delta* array has many zeros. We can check each entry on the GPU side to execute a special branch that avoids the computation. 8 | 9 | - vp-opt2: *data_flow* - *duplicate values* 10 | 11 | [`backprop_cuda.cu: 180`](https://github.com/FindHao/DrGPUM-samples/blob/110a7cdb0d57f5902941deb59899e6266f79844e/backprop/backprop_cuda.cu#L180). *net->input_units* is copied to the GPU at *Line 118* and copied back at *Line 188*. Meanwhile, neither the GPU data nor the CPU data is changed. As a result, the copy at *Line 188* can be eliminated safely. 12 | 13 | ## bfs 14 | 15 | - vp-opt1: *value_pattern* - *type overuse* 16 | 17 | [`kernel.cu: 22`](https://github.com/FindHao/DrGPUM-samples/blob/110a7cdb0d57f5902941deb59899e6266f79844e/bfs/kernel.cu#L22). The *g_cost* array's values are within the range of `[-127, 128)`. We can specify this array's type as `int8_t` instead of `int` to reduce both the kernel execution time and the memory copy time. 18 | 19 | - vp-opt2: *value_pattern* - *dense values* 20 | 21 | [`bfs.cu: 107-109`](https://github.com/FindHao/DrGPUM-samples/blob/110a7cdb0d57f5902941deb59899e6266f79844e/bfs/bfs.cu#L107). Accesses to these arrays show a dense value pattern where zeros are read most of the time. We can replace the CPU-to-GPU memory copies of all zeros with memset, which is much faster, to reduce the memory copy time. 22 | 23 | ## cfd 24 | 25 | - vp-opt1: *value_pattern* - *dense values* 26 | 27 | [`euler3d.cu: 173`](https://github.com/FindHao/DrGPUM-samples/blob/110a7cdb0d57f5902941deb59899e6266f79844e/cfd/euler3d.cu#L173). The *cuda_initialize_variables* function writes values in a dense pattern. We can *hash* the accessing index of this array to limit memory accesses to a certain range and increase cache locality. Since this array is changed in the second iteration, this optimization only applies to the first iteration. 28 | 29 | - vp-opt2: *data_flow* - *redundant values* 30 | 31 | [`euler3d.cu: 570`](https://github.com/FindHao/DrGPUM-samples/blob/110a7cdb0d57f5902941deb59899e6266f79844e/cfd/euler3d.cu#L570). The *old_variables* array is originally initialized at *Line 551* with the same values as *variables* but copied again at *Line 570*. We can safely eliminate the second copy, which is redundant in the first iteration; a sketch follows below. 32 |
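A minimal sketch of the fix, with assumed names for the surrounding loop variables (illustrative only; the real code performs the device-to-device copy through a helper):

```cpp
// Skip the copy in the first iteration, where old_variables already
// holds the same values from its initialization at Line 551.
// ('iteration', 'size', and the buffer names are assumptions.)
if (iteration > 0) {
  cudaMemcpy(old_variables, variables, size, cudaMemcpyDeviceToDevice);
}
```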
33 | ## hotspot 34 | 35 | - vp-opt: *value_pattern* - *approximate* - *single value* 36 | 37 | [`hotspot.cu: 164`](https://github.com/FindHao/DrGPUM-samples/blob/110a7cdb0d57f5902941deb59899e6266f79844e/hotspot/hotspot.cu#L164). The *temp_src* array contains many very close floating-point numbers. 38 | Using the approximate mode, gvprof determines that the values in this array are approximately the same under a certain approximation level. 39 | Therefore, we can read just some neighbor points at *Line 195* and still get similar final results. 40 | 41 | ## hotspot3D 42 | 43 | - vp-opt: *value_pattern* - *approximate* - *single value* 44 | 45 | [`opt1.cu: 29`](https://github.com/FindHao/DrGPUM-samples/blob/110a7cdb0d57f5902941deb59899e6266f79844e/hotspot3D/opt1.cu#L29). Like the *hotspot* example, the *tIn* array contains many very close floating-point numbers, and gvprof determines that all the values in this array are approximately the same under a certain approximation level. In contrast to the *hotspot* example, which selectively chooses neighbors, we use loop perforation to compute half of the loop iterations and get similar results. 46 | 47 | ## huffman 48 | 49 | - vp-opt: *value_pattern* - *dense values* 50 | 51 | [`hist.cu: 51`](https://github.com/FindHao/DrGPUM-samples/blob/110a7cdb0d57f5902941deb59899e6266f79844e/huffman/hist.cu#L51). DrGPUM reports dense values for the *histo* array in both the write and read modes. Because the most frequently updated value is zero, we can perform the atomicAdd conditionally to reduce atomic operations. 52 | 53 | ## lavaMD 54 | 55 | - vp-opt: *value_pattern* - *type overuse* 56 | 57 | [`kernel_gpu_cuda.cu: 84`](https://github.com/FindHao/DrGPUM-samples/blob/110a7cdb0d57f5902941deb59899e6266f79844e/lavaMD/kernel/kernel_gpu_cuda.cu#L84). The *rA* array contains only a few distinct numbers. By checking its initialization on the CPU side, we note that there are only ten fixed values within 0.1 to 1.0. We can store these values using `uint8_t` instead of `double`, saving *8x* space. These values are then decoded on the GPU side. In this way, we trade compute time for memory copy time. 58 | 59 | ## pathfinder 60 | 61 | - vp-opt: *value_pattern* - *type overuse* 62 | 63 | [`pathfinder.cu: 144`](https://github.com/FindHao/DrGPUM-samples/blob/110a7cdb0d57f5902941deb59899e6266f79844e/pathfinder/pathfinder.cu#L144). The *gpuWall* array's values for this input will be within `[0, 255]`, so we can use `uint8_t` to replace `int` to reduce global memory traffic. 64 | 65 | ## srad 66 | 67 | - vp-opt1: *value_pattern* - *single value* 68 | 69 | [`srad_kernel.cu: 79`](https://github.com/FindHao/DrGPUM-samples/blob/110a7cdb0d57f5902941deb59899e6266f79844e/srad_v1/srad_kernel.cu#L79). *d_c_loc* is always one for this output. We can memset all the values of *d_c* to 1 beforehand to eliminate all the stores of 1s. 70 | 71 | - vp-opt2: *value_pattern* - *structured* 72 | 73 | [`srad_kernel.cu: 38`](https://github.com/DrGPUM/DrGPUM-samples/blob/a8c23e3aba/srad_v1/srad_kernel.cu#L38). *d_iN*, *d_iS*, *d_jW*, *d_jE* are used to indicate the adjacent nodes' coordinates, which have structured patterns. We removed these four arrays and replaced them with the corresponding calculations, as sketched below.
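The gist of the transformation, as a sketch with assumed local names (the boundary clamping mirrors how the original index arrays are initialized):

```cpp
// Before: neighbor coordinates are loaded from index arrays.
//   north = input[d_iN[row] * cols + col];
// After: compute the structured indices directly and drop the arrays.
int iN = (row == 0) ? 0 : row - 1;                // clamped north row
int iS = (row == rows - 1) ? rows - 1 : row + 1;  // clamped south row
float north = input[iN * cols + col];
float south = input[iS * cols + col];
```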
74 | 75 | ## streamcluster 76 | 77 | - vp-opt: *data_flow* - *redundant values* 78 | 79 | [`streamcluster_cuda.cu:221`](https://github.com/FindHao/DrGPUM-samples/blob/110a7cdb0d57f5902941deb59899e6266f79844e/streamcluster/streamcluster_cuda.cu#L221). The arrays *center_table_d*, *switch_membership_d*, and *p* are not changed in every iteration. Therefore, we can use flags on the CPU to detect whether these arrays will change and copy their values only when they do. 80 | -------------------------------------------------------------------------------- /docs/unit_tests.md: -------------------------------------------------------------------------------- 1 | # Unit Tests 2 | 3 | ## interval_merge 4 | 5 | This example is a carbon copy of DrGPUM's interval analysis GPU module. 6 | 7 | ## op_graph_simple 8 | 9 | This example has a few redundant and duplicate memory access patterns, and is used to test the basic functions of DrGPUM's data_flow mode. 10 | 11 | ## op_pattern_simple 12 | 13 | This example has kernels with various fine-grained memory access patterns, and is used to test the basic functions of DrGPUM's value_pattern mode. 14 | 15 | ## stress 16 | 17 | A multi-context, multi-stream proxy app to test DrGPUM's stability. 18 | 19 | ## vectorAdd 20 | 21 | A set of test cases for redshow's instruction parser. 22 | -------------------------------------------------------------------------------- /docs/workflow.md: -------------------------------------------------------------------------------- 1 | # Workflow 2 | 3 | ## Use GPU Patch 4 | 5 | GPU Patch is built upon the [Compute Sanitizer API](https://docs.nvidia.com/cuda/sanitizer-docs/index.html). 6 | As we are working closely with NVIDIA on this API, we will update GPU Patch to use new features as soon as a new release is available. 7 | You can find a complete usage example of the Sanitizer API in [`sanitizer-api.c`](https://github.com/HPCToolkit/hpctoolkit/blob/sanitizer/src/tool/hpcrun/gpu/nvidia/sanitizer-api.c). 8 | Some simple samples can be found in this [repository](https://github.com/NVIDIA/compute-sanitizer-samples). 9 | 10 | ## Use RedShow with HPCToolkit 11 | 12 | Please refer to the redshow [header file](https://github.com/Lin-Mao/redshow/blob/main/include/redshow.h) for the complete set of interfaces. 13 | 14 | If a new mode is added to DrGPUM, one should configure it through the following redshow functions and sanitizer variables in HPCToolkit. 15 | 16 | ``` 17 | redshow_analysis_enable 18 | redshow_output_dir_config 19 | 20 | sanitizer_gpu_patch_type 21 | sanitizer_gpu_patch_record_size 22 | sanitizer_gpu_analysis_type 23 | sanitizer_gpu_analysis_record_size 24 | sanitizer_analysis_async 25 | ``` 26 | 27 | Currently, using a runtime other than HPCToolkit with redshow is intricate; we will update the doc once we've gone through the whole process. 28 | 29 | ## DrGPUM Tests 30 | 31 | DrGPUM has end-to-end tests for each analysis mode plus a unit test for instruction analysis. Therefore, if a new analysis mode is added, we expect the developer to add a test in Python to verify its correctness. 32 | 33 | For each analysis mode, the developer should write at least one simple case that covers most situations and collect results from the samples. 34 | 35 | We are in the process of completing the testing framework.
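As a starting point, a new mode's test can mirror the existing ones, such as `python/tests/data_flow_test.py` shown later in this document. The skeleton below is a hypothetical sketch (`new_mode` and `new_mode.csv` are made-up names):

```python
from collections import namedtuple

from test_cases import Test
from utils import pipe_read


class NewModeTest(Test):
    # Hypothetical end-to-end test skeleton for a new analysis mode.
    Config = namedtuple('Config', ['files'])

    def __init__(self, arch):
        super().__init__('NewModeTest', arch)

    def setup(self, choices):
        for choice in choices:
            if choice == 'op_pattern_simple':
                self._configs[choice] = NewModeTest.Config(files=['new_mode.csv'])

    def _run_impl(self, case_name, version):
        if case_name not in self._configs:
            return
        command = Test.cases[case_name].command
        options = Test.cases[case_name].options
        pipe_read(['gvprof', '-e', 'new_mode', command] + options)
        # Compare each file under gvprof-database/ against the expected results.
```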
36 | 37 | To run DrGPUM test, we use the following command at DrGPUM's root directory. The instruction test could fail due to the default data type used, which is acceptable. 38 | 39 | ```bash 40 | python python/test.py -m all -a 41 | ``` 42 | -------------------------------------------------------------------------------- /include/gpu-patch.h: -------------------------------------------------------------------------------- 1 | #ifndef HPCTOOLKIT_GPU_PATCH_GPU_PATCH_H 2 | #define HPCTOOLKIT_GPU_PATCH_GPU_PATCH_H 3 | 4 | #include 5 | #include 6 | 7 | #define GPU_PATCH_MAX_ACCESS_SIZE (16) 8 | #define GPU_PATCH_WARP_SIZE (32) 9 | #define GPU_PATCH_ANALYSIS_THREADS (1024) 10 | #define GPU_PATCH_ANALYSIS_ITEMS (4) 11 | #define GPU_PATCH_ADDRESS_DICT_SIZE (1024) 12 | 13 | enum GPUPatchFlags { 14 | GPU_PATCH_NONE = 0, 15 | GPU_PATCH_READ = 0x1, 16 | GPU_PATCH_WRITE = 0x2, 17 | GPU_PATCH_ATOMSYS = 0x4, 18 | GPU_PATCH_LOCAL = 0x8, 19 | GPU_PATCH_SHARED = 0x10, 20 | GPU_PATCH_BLOCK_ENTER_FLAG = 0x20, 21 | GPU_PATCH_BLOCK_EXIT_FLAG = 0x40, 22 | GPU_PATCH_ANALYSIS = 0x80 23 | }; 24 | 25 | enum GPUPatchType { 26 | GPU_PATCH_TYPE_DEFAULT = 0, 27 | GPU_PATCH_TYPE_ADDRESS_PATCH = 1, 28 | GPU_PATCH_TYPE_ADDRESS_ANALYSIS = 2, 29 | GPU_PATCH_TYPE_COUNT = 3 30 | }; 31 | 32 | // Complete record 33 | typedef struct gpu_patch_record { 34 | uint64_t pc; 35 | uint32_t size; 36 | uint32_t active; 37 | uint32_t flat_thread_id; 38 | uint32_t flat_block_id; 39 | uint32_t flags; 40 | uint64_t address[GPU_PATCH_WARP_SIZE]; 41 | uint8_t value[GPU_PATCH_WARP_SIZE][GPU_PATCH_MAX_ACCESS_SIZE]; // STS.128->16 bytes 42 | } gpu_patch_record_t; 43 | 44 | // Address only 45 | typedef struct gpu_patch_record_address { 46 | uint32_t flags; 47 | uint32_t active; 48 | uint32_t size; 49 | uint64_t address[GPU_PATCH_WARP_SIZE]; 50 | } gpu_patch_record_address_t; 51 | 52 | // Address only, gpu analysis 53 | typedef struct gpu_patch_analysis_address { 54 | uint64_t start; 55 | uint64_t end; 56 | } gpu_patch_analysis_address_t; 57 | 58 | // Auxiliary data 59 | typedef struct gpu_patch_aux_address_dict { 60 | uint32_t size; 61 | gpu_patch_analysis_address_t start_end[GPU_PATCH_ADDRESS_DICT_SIZE]; 62 | uint8_t hit[GPU_PATCH_ADDRESS_DICT_SIZE]; 63 | uint8_t read[GPU_PATCH_ADDRESS_DICT_SIZE]; 64 | uint8_t write[GPU_PATCH_ADDRESS_DICT_SIZE]; 65 | } gpu_patch_aux_address_dict_t; 66 | 67 | typedef struct gpu_patch_buffer { 68 | volatile uint32_t full; 69 | volatile uint32_t analysis; 70 | volatile uint32_t head_index; 71 | volatile uint32_t tail_index; 72 | uint32_t size; 73 | uint32_t num_threads; // If num_threads == 0, the kernel is finished 74 | uint32_t block_sampling_offset; 75 | uint32_t block_sampling_frequency; 76 | uint32_t type; 77 | uint32_t flags; // read or write or both 78 | void *records; 79 | void *aux; 80 | void *torch_aux; 81 | } gpu_patch_buffer_t; 82 | 83 | #endif 84 | -------------------------------------------------------------------------------- /include/gpu-queue.h: -------------------------------------------------------------------------------- 1 | #ifndef HPCTOOLKIT_GPU_PATCH_GPU_QUEUE_H 2 | #define HPCTOOLKIT_GPU_PATCH_GPU_QUEUE_H 3 | 4 | #include 5 | 6 | #include "gpu-patch.h" 7 | 8 | /* 9 | * Get a gpu record 10 | */ 11 | extern "C" __device__ uint32_t gpu_queue_get(gpu_patch_buffer_t *buffer, uint32_t analysis = 0) { 12 | uint32_t size = buffer->size; 13 | uint32_t tail_index = 0; 14 | while (tail_index == 0) { 15 | tail_index = atomicAdd((uint32_t *)&buffer->tail_index, 1) + 1; 16 | // Write on tail_index - 1 17 
| if (tail_index - 1 >= size) { 18 | // First warp that found the buffer is full 19 | if (tail_index - 1 == size) { 20 | // Wait for previous warps to finish writing 21 | while (buffer->head_index < size); 22 | if (analysis == 1) { 23 | // Sync with GPU 24 | __threadfence(); 25 | buffer->analysis = 1; 26 | __threadfence(); 27 | while (buffer->analysis == 1); 28 | } else { 29 | // Sync with CPU 30 | __threadfence_system(); 31 | buffer->full = 1; 32 | __threadfence_system(); 33 | while (buffer->full == 1); 34 | } 35 | __threadfence(); 36 | buffer->head_index = 0; 37 | __threadfence(); 38 | buffer->tail_index = 0; 39 | } else { 40 | // Other warps 41 | while (buffer->tail_index >= size); 42 | } 43 | tail_index = 0; 44 | } 45 | } 46 | 47 | return tail_index - 1; 48 | } 49 | 50 | /* 51 | * Finish writing gpu records 52 | */ 53 | extern "C" __device__ void gpu_queue_push(gpu_patch_buffer_t *buffer) { 54 | // Make sure records are visible 55 | atomicAdd((uint32_t *)&buffer->head_index, 1); 56 | } 57 | 58 | #endif 59 | -------------------------------------------------------------------------------- /include/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef HPCTOOLKIT_GPU_PATCH_UTILITIES_H 2 | #define HPCTOOLKIT_GPU_PATCH_UTILITIES_H 3 | 4 | #include <stdint.h> 5 | 6 | /* 7 | * Utility functions 8 | */ 9 | __device__ __forceinline__ uint32_t get_flat_block_id() { 10 | return blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y; 11 | } 12 | 13 | __device__ __forceinline__ uint32_t get_flat_thread_id() { 14 | return threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; 15 | } 16 | 17 | __device__ __forceinline__ uint64_t get_unique_thread_id() { 18 | return get_flat_block_id() * blockDim.x * blockDim.y * blockDim.z + get_flat_thread_id(); 19 | } 20 | 21 | __device__ __forceinline__ uint64_t get_grid_num_threads() { 22 | return gridDim.x * gridDim.y * gridDim.z * blockDim.x * blockDim.y * blockDim.z; 23 | } 24 | 25 | __device__ __forceinline__ uint64_t get_block_num_threads() { 26 | return blockDim.x * blockDim.y * blockDim.z; 27 | } 28 | 29 | __device__ __forceinline__ uint32_t get_laneid() { 30 | uint32_t laneid = 0; 31 | asm volatile("mov.u32 %0, %%laneid;" : "=r"(laneid)); 32 | return laneid; 33 | } 34 | 35 | __device__ __forceinline__ bool sample_callback(uint32_t frequency, uint32_t offset) { 36 | if (frequency != 0) { 37 | // 1 : Sample all blocks 38 | // >1 : Sample a portion of blocks 39 | return get_flat_block_id() % frequency == offset; 40 | } 41 | // Skip all blocks 42 | return false; 43 | } 44 | 45 | __device__ __forceinline__ bool is_locked(uint32_t *lock, uint32_t id) { 46 | uint32_t old = *lock; 47 | // Read the newest value 48 | __threadfence(); 49 | return old == id; 50 | } 51 | 52 | __device__ __forceinline__ void read_shared_memory(uint32_t size, uint32_t ptr, uint8_t *buf) { 53 | for (uint32_t i = 0; i < size; ++i) { 54 | uint32_t ret = 0; 55 | asm volatile("ld.shared.b8 %0,[%1];" : "=r"(ret) : "r"(ptr + i) : "memory"); 56 | buf[i] = ret; 57 | } 58 | } 59 | 60 | __device__ __forceinline__ void read_global_memory(uint32_t size, uint64_t ptr, uint8_t *buf) { 61 | for (uint32_t i = 0; i < size; ++i) { 62 | uint32_t ret = 0; 63 | asm volatile("ld.b8 %0,[%1];" : "=r"(ret) : "l"(ptr + i) : "memory"); 64 | buf[i] = ret; 65 | } 66 | } 67 | 68 | __device__ __forceinline__ void read_local_memory(uint32_t size, uint32_t ptr, uint8_t *buf) { 69 | for (uint32_t i = 0; i < size; ++i) { 70 | uint32_t ret = 0; 71 | asm
volatile("ld.local.b8 %0,[%1];" : "=r"(ret) : "r"(ptr + i) : "memory"); 72 | buf[i] = ret; 73 | } 74 | } 75 | 76 | template 77 | __device__ __forceinline__ T shfl(T v, uint32_t srcline, uint32_t mask = 0xFFFFFFFF) { 78 | T ret; 79 | #if (__CUDA_ARCH__ >= 300) 80 | #if (__CUDACC_VER_MAJOR__ >= 9) 81 | ret = __shfl_sync(mask, v, srcline); 82 | #else 83 | ret = __shfl(v, srcline); 84 | #endif 85 | #endif 86 | return ret; 87 | } 88 | 89 | template 90 | __device__ __forceinline__ T shfl_up(T v, uint32_t delta, uint32_t width = GPU_PATCH_WARP_SIZE, 91 | uint32_t mask = 0xFFFFFFFF) { 92 | T ret; 93 | #if (__CUDA_ARCH__ >= 300) 94 | #if (__CUDACC_VER_MAJOR__ >= 9) 95 | ret = __shfl_up_sync(mask, v, delta, width); 96 | #else 97 | ret = __shfl_up(v, delta, width); 98 | #endif 99 | #endif 100 | return ret; 101 | } 102 | 103 | template 104 | __device__ __forceinline__ T shfl_xor(T v, uint32_t lane_mask, uint32_t mask = 0xFFFFFFFF) { 105 | T ret; 106 | #if (__CUDA_ARCH__ >= 300) 107 | #if (__CUDACC_VER_MAJOR__ >= 9) 108 | ret = __shfl_xor_sync(mask, v, lane_mask); 109 | #else 110 | ret = __shfl_xor(v, lane_mask); 111 | #endif 112 | #endif 113 | return ret; 114 | } 115 | 116 | __device__ __forceinline__ uint32_t ballot(int32_t predicate, uint32_t mask = 0xFFFFFFFF) { 117 | uint32_t ret; 118 | #if (__CUDA_ARCH__ >= 300) 119 | #if (__CUDACC_VER_MAJOR__ >= 9) 120 | ret = __ballot_sync(mask, predicate); 121 | #else 122 | ret = __ballot(predicate); 123 | #endif 124 | #endif 125 | return ret; 126 | } 127 | 128 | __device__ __forceinline__ uint32_t bfe(uint32_t source, uint32_t bit_index) { 129 | uint32_t bit; 130 | asm volatile("bfe.u32 %0, %1, %2, %3;" 131 | : "=r"(bit) 132 | : "r"((uint32_t)source), "r"(bit_index), "r"(1)); 133 | return bit; 134 | } 135 | 136 | __device__ __forceinline__ uint32_t brev(uint32_t source) { 137 | uint32_t dest; 138 | asm volatile("brev.b32 %0, %1;" : "=r"(dest) : "r"(source)); 139 | return dest; 140 | } 141 | 142 | __device__ __forceinline__ uint32_t bfind(uint32_t source) { 143 | uint32_t bit_index; 144 | asm volatile("bfind.u32 %0, %1;" : "=r"(bit_index) : "r"((uint32_t)source)); 145 | return bit_index; 146 | } 147 | 148 | __device__ __forceinline__ uint32_t fns(uint32_t source, uint32_t base_index) { 149 | uint32_t bit_index; 150 | asm volatile("fns.b32 %0, %1, %2, %3;" : "=r"(bit_index) : "r"(source), "r"(base_index), "r"(1)); 151 | return bit_index; 152 | } 153 | 154 | template 155 | __device__ __forceinline__ T comparator(T x, uint32_t lane_mask, bool dir, 156 | uint32_t mask = 0xFFFFFFFF) { 157 | T y = shfl_xor(x, lane_mask, mask); 158 | return x < y == dir ? 
y : x; 159 | } 160 | 161 | template 162 | __device__ __forceinline__ T warp_sort(T x, uint32_t laneid) { 163 | x = comparator(x, 1, bfe(laneid, 1) ^ bfe(laneid, 0)); // A, sorted sequences of length 2 164 | x = comparator(x, 2, bfe(laneid, 2) ^ bfe(laneid, 1)); // B 165 | x = comparator(x, 1, bfe(laneid, 2) ^ bfe(laneid, 0)); // C, sorted sequences of length 4 166 | x = comparator(x, 4, bfe(laneid, 3) ^ bfe(laneid, 2)); // D 167 | x = comparator(x, 2, bfe(laneid, 3) ^ bfe(laneid, 1)); // E 168 | x = comparator(x, 1, bfe(laneid, 3) ^ bfe(laneid, 0)); // F, sorted sequences of length 8 169 | x = comparator(x, 8, bfe(laneid, 4) ^ bfe(laneid, 3)); // G 170 | x = comparator(x, 4, bfe(laneid, 4) ^ bfe(laneid, 2)); // H 171 | x = comparator(x, 2, bfe(laneid, 4) ^ bfe(laneid, 1)); // I 172 | x = comparator(x, 1, bfe(laneid, 4) ^ bfe(laneid, 0)); // J, sorted sequences of length 16 173 | x = comparator(x, 16, bfe(laneid, 4)); // K 174 | x = comparator(x, 8, bfe(laneid, 3)); // L 175 | x = comparator(x, 4, bfe(laneid, 2)); // M 176 | x = comparator(x, 2, bfe(laneid, 1)); // N 177 | x = comparator(x, 1, bfe(laneid, 0)); // O, sorted sequences of length 32 178 | 179 | return x; 180 | } 181 | 182 | template 183 | __device__ __forceinline__ T atomic_load(const T *addr) { 184 | const volatile T *vaddr = addr; // volatile to bypass cache 185 | __threadfence(); // for seq_cst loads. Remove for acquire semantics. 186 | const T value = *vaddr; 187 | // fence to ensure that dependent reads are correctly ordered 188 | __threadfence(); 189 | return value; 190 | } 191 | 192 | template 193 | __device__ __forceinline__ void atomic_store(T *addr, T value) { 194 | volatile T *vaddr = addr; // volatile to bypass cache 195 | // fence to ensure that previous non-atomic stores are visible to other threads 196 | __threadfence(); 197 | *vaddr = value; 198 | } 199 | 200 | template 201 | __device__ __forceinline__ void atomic_store_system(T *addr, T value) { 202 | volatile T *vaddr = addr; // volatile to bypass cache 203 | // fence to ensure that previous non-atomic stores are visible to other threads 204 | __threadfence_system(); 205 | *vaddr = value; 206 | } 207 | 208 | template 209 | __device__ __forceinline__ uint32_t map_upper_bound(T *map, T value, uint32_t len, C cmp) { 210 | uint32_t low = 0; 211 | uint32_t high = len; 212 | uint32_t mid = 0; 213 | while (low < high) { 214 | mid = (high - low) / 2 + low; 215 | if (cmp(map[mid], value)) { 216 | low = mid + 1; 217 | } else { 218 | high = mid; 219 | } 220 | } 221 | return low; 222 | } 223 | 224 | template 225 | __device__ __forceinline__ uint32_t map_prev(T *map, T value, uint32_t len, C cmp) { 226 | uint32_t pos = map_upper_bound(map, value, len, cmp); 227 | if (pos != 0) { 228 | --pos; 229 | } else { 230 | pos = len; 231 | } 232 | return pos; 233 | } 234 | 235 | #endif 236 | -------------------------------------------------------------------------------- /python/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/* 2 | -------------------------------------------------------------------------------- /python/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lin-Mao/DrGPUM/c6ffb1665df35905bfa4b0d93ac75eca00e451ac/python/__init__.py -------------------------------------------------------------------------------- /python/bench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 
import time 3 | 4 | from collections import namedtuple 5 | 6 | from test_cases import Test 7 | from utils import pipe_read, nsys_profile 8 | 9 | 10 | class Benchmark(Test): 11 | # (kernel_name, is_template) 12 | Config = namedtuple('Config', ['kernels']) 13 | 14 | def __init__(self, arch, version): 15 | super().__init__('Benchmark', arch, version) 16 | self._kernel_time = dict() 17 | self._gpu_kernel_time = dict() 18 | self._gpu_mem_time = dict() 19 | self._time = dict() 20 | 21 | def setup(self, choices): 22 | for choice in choices: 23 | if choice == 'backprop': 24 | self._configs[choice] = Benchmark.Config( 25 | kernels=[('bpnn_adjust_weights_cuda', False)]) 26 | elif choice == 'bfs': 27 | self._configs[choice] = Benchmark.Config( 28 | kernels=[('Kernel', False)]) 29 | elif choice == 'cfd': 30 | self._configs[choice] = Benchmark.Config(kernels=[('cuda_compute_flux', True), 31 | ('cuda_time_step', True), 32 | ('cuda_compute_step_factor', True)]) 33 | elif choice == 'hotspot': 34 | self._configs[choice] = Benchmark.Config( 35 | kernels=[('calculate_temp', False)]) 36 | elif choice == 'hotspot3D': 37 | self._configs[choice] = Benchmark.Config( 38 | kernels=[('hotspotOpt1', False)]) 39 | elif choice == 'huffman': 40 | self._configs[choice] = Benchmark.Config( 41 | kernels=[('histo_kernel', False)]) 42 | elif choice == 'lavaMD': 43 | self._configs[choice] = Benchmark.Config( 44 | kernels=[('kernel_gpu_cuda', False)]) 45 | elif choice == 'pathfinder': 46 | self._configs[choice] = Benchmark.Config( 47 | kernels=[('dynproc_kernel', False)]) 48 | elif choice == 'srad': 49 | self._configs[choice] = Benchmark.Config( 50 | kernels=[('srad', False), ('srad2', False)]) 51 | elif choice == 'streamcluster': 52 | self._configs[choice] = Benchmark.Config(kernels=[]) 53 | 54 | def _run_impl(self, case_name, version): 55 | version_name = 'origin' if version is None else version 56 | 57 | def _init_time_dict(time_dict): 58 | if case_name not in time_dict: 59 | time_dict[case_name] = dict() 60 | 61 | if version_name not in time_dict[case_name]: 62 | time_dict[case_name][version_name] = 0.0 63 | 64 | _init_time_dict(self._kernel_time) 65 | _init_time_dict(self._gpu_kernel_time) 66 | _init_time_dict(self._gpu_mem_time) 67 | _init_time_dict(self._time) 68 | 69 | command = Test.cases[case_name].command 70 | options = Test.cases[case_name].options 71 | 72 | time_start = time.time() 73 | pipe_read([command] + options) 74 | time_end = time.time() 75 | elapse = time_end - time_start 76 | 77 | self._time[case_name][version_name] += elapse 78 | 79 | print('{}/{}: {}s'.format(case_name, version_name, elapse)) 80 | 81 | kernel_times, gpu_kernel_time, gpu_mem_time = nsys_profile( 82 | [command] + options, self._configs[case_name].kernels) 83 | 84 | self._gpu_kernel_time[case_name][version_name] += gpu_kernel_time / (1e9) 85 | self._gpu_mem_time[case_name][version_name] += gpu_mem_time / (1e9) 86 | 87 | for kernel, kernel_time in kernel_times.items(): 88 | self._kernel_time[case_name][version_name] += kernel_time / (1e9) 89 | print('{}/{}/{}: {}s'.format(case_name, 90 | version_name, kernel, kernel_time / (1e9))) 91 | print('{}/{}/gpu_kernel_time: {}s'.format(case_name, 92 | version_name, gpu_kernel_time / (1e9))) 93 | print('{}/{}/gpu_mem_time: {}s'.format(case_name, 94 | version_name, gpu_mem_time / (1e9))) 95 | 96 | def report(self): 97 | def _report_speedup(time_dict, dict_name): 98 | for case_name, times in time_dict.items(): 99 | for version_name, version_time in times.items(): 100 | if version_name == 'origin': 101 
| continue 102 | elif version_time != 0.0: 103 | sp = time_dict[case_name]['origin'] / version_time 104 | print('{}/{}/{}: {}x'.format(case_name, version_name, dict_name, sp)) 105 | 106 | _report_speedup(self._time, 'time') 107 | _report_speedup(self._kernel_time, 'kernel_time') 108 | _report_speedup(self._gpu_kernel_time, 'gpu_kernel_time') 109 | _report_speedup(self._gpu_mem_time, 'gpu_mem_time') 110 | 111 | 112 | parser = argparse.ArgumentParser( 113 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 114 | parser.add_argument('-c', '--case', help='case name') 115 | parser.add_argument('-v', '--version', default='all', help='benchmark version') 116 | parser.add_argument('-i', '--iterations', type=int, default=1) 117 | parser.add_argument('-a', '--arch', choices=['sm_70', 'sm_72', 118 | 'sm_75', 'sm_80', 'sm_86'], default='sm_70', help='gpu arch name') 119 | args = parser.parse_args() 120 | 121 | if args.case is None: 122 | choice = ['backprop', 'bfs', 'cfd', 'hotspot', 'hotspot3D', 'huffman', 'lavaMD', 'pathfinder', 'srad', 'streamcluster'] 123 | else: 124 | choice = [args.case] 125 | 126 | benchmark = Benchmark(args.arch, args.version) 127 | benchmark.setup(choice) 128 | benchmark.run(args.iterations) 129 | benchmark.report() 130 | -------------------------------------------------------------------------------- /python/filter_time.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import re 3 | 4 | iteration_num = 5 5 | 6 | 7 | def filter_time(file_path): 8 | with open(file_path, 'r') as fin: 9 | content = fin.read() 10 | reg = re.compile('real\t(\d+)m(.+?)s') 11 | results = reg.findall(content) 12 | if not results: 13 | print("empty") 14 | exit(1) 15 | 16 | ss = [] 17 | for x in results: 18 | minute = int(x[0]) 19 | second = float(x[1]) 20 | second_all = 60 * minute + second 21 | ss.append(second_all) 22 | 23 | return np.mean(ss) 24 | 25 | 26 | 27 | def work(): 28 | original_time_file = 'time.txt' 29 | data_flow_time_file = 'time_data_flow.txt' 30 | value_pattern_time_file = 'time_value_pattern.txt' 31 | original_time = filter_time(original_time_file) 32 | data_flow_time = filter_time(data_flow_time_file) 33 | value_pattern_time = filter_time(value_pattern_time_file) 34 | overhead = data_flow_time / original_time + value_pattern_time / original_time 35 | print("%.2f" % overhead) 36 | 37 | work() 38 | -------------------------------------------------------------------------------- /python/gviewer.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | import pygraphviz as pgv 4 | 5 | RED_LEVEL_0 = 0.33 6 | RED_LEVEL_1 = 0.66 7 | RED_LEVEL_2 = 0.99 8 | RED_LEVEL_3 = 1.0 9 | MAX_NODE_WIDTH = 3.0 10 | MAX_EDGE_WIDTH = 5.0 11 | 12 | 13 | class Graph: 14 | def __init__(self): 15 | self._nodes = dict() 16 | self._edges = dict() 17 | 18 | def read_agraph(self, agraph): 19 | for node in agraph.nodes(): 20 | attrs = dict() 21 | for key, value in node.attr.items(): 22 | attrs[key] = value 23 | self._nodes[node] = attrs 24 | 25 | for edge in agraph.edges(): 26 | e = (edge[0], edge[1], edge.attr['memory_node_id'], 27 | edge.attr['edge_type']) 28 | attrs = dict() 29 | for key, value in edge.attr.items(): 30 | attrs[key] = value 31 | self._edges[e] = attrs 32 | 33 | def new_agraph(self): 34 | agraph = pgv.AGraph(strict=False, directed=True) 35 | for node, attrs in self._nodes.items(): 36 | agraph.add_node(node, **attrs) 37 | for edge, attrs in self._edges.items(): 38 
| agraph.add_edge(edge[0], edge[1], **attrs) 39 | return agraph 40 | 41 | def add_edge(self, src, dst, memory_node_id, edge_type, attrs): 42 | self._edges[(src, dst, memory_node_id, edge_type)] = attrs 43 | 44 | def delete_edge(self, src, dst, memory_node_id, edge_type): 45 | self._edges.pop((src, dst, memory_node_id, edge_type), None) 46 | 47 | def delete_node(self, node): 48 | self._nodes.pop(node, None) 49 | edge_delete = [] 50 | for edge, _ in self._edges.items(): 51 | if edge[0] == node or edge[1] == node: 52 | edge_delete.append(edge) 53 | 54 | for edge in edge_delete: 55 | self._edges.pop(edge, None) 56 | 57 | def nodes(self): 58 | return self._nodes 59 | 60 | def edges(self): 61 | return self._edges 62 | 63 | 64 | def format_graph(args): 65 | def format_context(context, choice, known, leaf): 66 | ret = '' 67 | if choice == 'none': 68 | return ret 69 | frames = context.split('#') 70 | for frame in frames[::-1]: 71 | if frame == '' or frame == '\n': 72 | continue 73 | line, func = frame.split('\t') 74 | if known is True and (line.find('Unknown') != -1 or line.find('<unknown>') != -1): 75 | continue 76 | if choice == 'path': 77 | func = '' 78 | elif choice == 'file': 79 | last_slash = line.rfind('/') 80 | if last_slash != -1: 81 | line = line[last_slash+1:] 82 | elif choice == 'func': 83 | line = '' 84 | ret = line + ' ' + func + '\l' + ret 85 | if leaf is True: 86 | break 87 | 88 | # escape characters 89 | ret = ret.replace('<', '\<') 90 | ret = ret.replace('>', '\>') 91 | return ret 92 | 93 | file_path = args.file 94 | # clean up buggy line endings 95 | new_lines = [] 96 | with open(file_path, 'r') as fin: 97 | lines = fin.readlines() 98 | for line in lines: 99 | if line.endswith("'\\"): 100 | line = line[:-2] 101 | elif line.endswith("'\\\n"): 102 | line = line.replace("'\\\n", '\n') 103 | new_lines.append(line) 104 | with open(file_path, 'w') as fout: 105 | for line in new_lines: 106 | fout.write(line) 107 | 108 | 109 | agraph = pgv.AGraph(file_path, strict=False) 110 | 111 | G = Graph() 112 | G.read_agraph(agraph) 113 | 114 | for node, attrs in G.nodes().items(): 115 | for key, attr in attrs.items(): 116 | if key == 'context': 117 | value = format_context( 118 | attr, args.context_filter, args.known, args.leaf) 119 | attrs['context'] = value 120 | 121 | return G 122 | 123 | 124 | def prune_graph(G, node_threshold=0.0, edge_threshold=0.0, keep_redundancy=False): 125 | # 1. prune no edge nodes 126 | nodes_with_edges = dict() 127 | for node in G.nodes(): 128 | nodes_with_edges[node] = False 129 | 130 | for edge in G.edges(): 131 | nodes_with_edges[edge[0]] = True 132 | nodes_with_edges[edge[1]] = True 133 | 134 | for k, v in nodes_with_edges.items(): 135 | if v is False: 136 | # XXX(Keren): pay attention to complexity O(NE) 137 | G.delete_node(k) 138 | 139 | # 2. prune no context nodes 140 | nodes_without_context = dict() 141 | for node, attrs in G.nodes().items(): 142 | if 'context' not in attrs or attrs['context'] == '': 143 | nodes_without_context[node] = True 144 | 145 | for node in nodes_without_context: 146 | G.delete_node(node) 147 | 148 | # 3.
prune low importance nodes and edges 149 | node_total_count = 0 150 | for node, attrs in G.nodes().items(): 151 | if attrs['count'] is not None: 152 | node_total_count += float(attrs['count']) 153 | edge_total_count = 0 154 | for edge, attrs in G.edges().items(): 155 | if attrs['count'] is not None: 156 | edge_total_count += float(attrs['count']) 157 | 158 | delete_edges = [] 159 | node_reserve = dict() 160 | for edge, attrs in G.edges().items(): 161 | if attrs['count'] is not None: 162 | importance = float(attrs['count']) / edge_total_count 163 | if importance >= edge_threshold: 164 | node_reserve[edge[0]] = True 165 | node_reserve[edge[1]] = True 166 | elif keep_redundancy is True and float(attrs['redundancy']) >= RED_LEVEL_2: 167 | node_reserve[edge[0]] = True 168 | node_reserve[edge[1]] = True 169 | else: 170 | delete_edges.append(edge) 171 | delete_nodes = [] 172 | for node, attrs in G.nodes().items(): 173 | if attrs['count'] is not None: 174 | importance = float(attrs['count']) / node_total_count 175 | if importance < node_threshold: 176 | delete_nodes.append(node) 177 | 178 | for edge in delete_edges: 179 | G.delete_edge(edge[0], edge[1], edge[2], edge[3]) 180 | for node in delete_nodes: 181 | if node not in node_reserve: 182 | G.delete_node(node) 183 | 184 | return G 185 | 186 | 187 | def combine_graph(G): 188 | # Combine read write edges 189 | rw_edges = dict() 190 | for edge, attrs in G.edges().items(): 191 | edge_key = (edge[0], edge[1], edge[2]) 192 | if edge_key in rw_edges: 193 | rw_edge = rw_edges[edge_key][1] 194 | rw_edge['redundancy'] = max( 195 | float(rw_edge['redundancy']), float(attrs['redundancy'])) 196 | rw_edge['overwrite'] = max( 197 | float(rw_edge['overwrite']), float(attrs['overwrite'])) 198 | rw_edge['count'] = max(int(rw_edge['count']), int(attrs['count'])) 199 | rw_edges[edge_key] = (True, rw_edge) 200 | else: 201 | rw_edges[edge_key] = (False, attrs) 202 | 203 | for edge_key, attrs in rw_edges.items(): 204 | if attrs[0]: 205 | G.delete_edge(edge_key[0], edge_key[1], edge_key[2], 'READ') 206 | G.delete_edge(edge_key[0], edge_key[1], edge_key[2], 'WRITE') 207 | attrs[1]['edge_type'] = 'READ & WRITE' 208 | G.add_edge(edge_key[0], edge_key[1], 209 | edge_key[2], 'READ & WRITE', attrs[1]) 210 | 211 | return G 212 | 213 | 214 | def create_plain_graph(G): 215 | for node in G.nodes(): 216 | name = node.get_name() 217 | label = '{' 218 | label += ' ' + name + '|' 219 | for key, value in node.attr.items(): 220 | label += '{<' + key + '> ' + key.upper() + '|' + value + '}|' 221 | label = label[:-1] 222 | label += '}' 223 | node.attr['shape'] = 'record' 224 | node.attr['label'] = label 225 | 226 | for edge in G.edges(): 227 | label = '' 228 | if edge.attr['edge_type'] == 'READ': 229 | label = 'EDGE_TYPE: READ\nMEMORY_NODE_ID: ' + \ 230 | str(edge.attr['memory_node_id']) 231 | else: 232 | for key, value in edge.attr.items(): 233 | label += key.upper() + ': ' + value + '\n' 234 | edge.attr['label'] = label 235 | 236 | return G 237 | 238 | 239 | def create_pretty_graph(G): 240 | def color_edge_redundancy(G): 241 | for edge in G.edges(): 242 | if float(edge.attr['redundancy']) <= RED_LEVEL_0: 243 | edge.attr['color'] = '#cddc39' 244 | edge.attr['fillcolor'] = '#cddc39' 245 | elif float(edge.attr['redundancy']) <= RED_LEVEL_1: 246 | edge.attr['color'] = '#fffa55' 247 | edge.attr['fillcolor'] = '#fffa55' 248 | elif float(edge.attr['redundancy']) <= RED_LEVEL_2: 249 | edge.attr['color'] = '#fdcc3a' 250 | edge.attr['fillcolor'] = '#fdcc3a' 251 | else: 252 | edge.attr['color'] 
= '#f91100' 253 | edge.attr['fillcolor'] = '#f91100' 254 | return G 255 | 256 | def apportion_edge_width(G): 257 | edges = G.edges() 258 | max_edge = max(edges, key=lambda edge: float( 259 | edge.attr['overwrite']) * float(edge.attr['count'])) 260 | max_weight = float(max_edge.attr['overwrite']) * \ 261 | float(max_edge.attr['count']) 262 | 263 | for edge in edges: 264 | width = float(edge.attr['overwrite']) * \ 265 | float(edge.attr['count']) / max_weight * MAX_EDGE_WIDTH 266 | if width < 1.0: 267 | edge.attr['penwidth'] = 1.0 268 | else: 269 | edge.attr['penwidth'] = width 270 | 271 | return G 272 | 273 | def apportion_node_width(G): 274 | nodes = G.nodes() 275 | max_node = max(nodes, key=lambda node: float(node.attr['count'])) 276 | max_weight = float(max_node.attr['count']) 277 | 278 | for node in nodes: 279 | width = float(node.attr['count']) / max_weight * MAX_NODE_WIDTH 280 | if width < 1.0: 281 | node.attr['width'] = 0.6 282 | else: 283 | node.attr['width'] = width 284 | 285 | return G 286 | 287 | def label_node_duplicate(node): 288 | dup = node.attr['duplicate'] 289 | label = '' 290 | 291 | if dup is None: 292 | return label 293 | 294 | dup_entries = dup.split(';') 295 | from_node = node.get_name() 296 | 297 | for dup_entry in dup_entries: 298 | if len(dup_entry) > 0: 299 | dup_node = dup_entry.split(',')[0] 300 | label += dup_node + ' ' 301 | return 'DUPLICATE: ' + label 302 | 303 | #G.graph_attr['bgcolor'] = '#2e3e56' 304 | G.graph_attr['pad'] = '0.5' 305 | 306 | for node in G.nodes(): 307 | if node.attr['node_type'] == 'MEMORY': 308 | node.attr['shape'] = 'box' 309 | elif node.attr['node_type'] == 'KERNEL': 310 | node.attr['shape'] = 'ellipse' 311 | elif node.attr['node_type'] == 'MEMCPY' or node.attr['node_type'] == 'MEMSET': 312 | node.attr['shape'] = 'circle' 313 | else: 314 | node.attr['shape'] = 'box' 315 | node.attr['label'] = node.attr['node_type'] 316 | node.attr['style'] = 'filled' 317 | node.attr['penwidth'] = '0' 318 | tooltip = '' 319 | tooltip += 'TYPE: ' + node.attr['node_type'] + '\l' 320 | tooltip += 'COUNT: ' + node.attr['count'] + '\l' 321 | duplicate = label_node_duplicate(node) 322 | if duplicate != '': 323 | tooltip += duplicate + '\l' 324 | tooltip += 'CONTEXT: \l' + node.attr['context'] 325 | tooltip = tooltip.replace('\l', ' ') 326 | node.attr['tooltip'] = tooltip 327 | 328 | for edge in G.edges(): 329 | tooltip = 'MEMORY_NODE_ID: ' + edge.attr['memory_node_id'] + '\l' 330 | tooltip += 'TYPE: ' + edge.attr['edge_type'] + '\l' 331 | tooltip += 'REDUNDANCY: ' + str(edge.attr['redundancy']) + '\l' 332 | tooltip += 'OVERWRITE: ' + str(edge.attr['overwrite']) + '\l' 333 | tooltip += 'BYTES: ' + str(edge.attr['count']) + '\l' 334 | tooltip = tooltip.replace('\l', ' ') 335 | edge.attr['tooltip'] = tooltip 336 | edge.attr['fontname'] = 'helvetica Neue Ultra Light' 337 | 338 | G = apportion_node_width(G) 339 | 340 | G = color_edge_redundancy(G) 341 | 342 | G = apportion_edge_width(G) 343 | 344 | return G 345 | 346 | 347 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 348 | parser.add_argument('-f', '--file', help='file name') 349 | parser.add_argument('-cf', '--context-filter', choices=[ 350 | 'path', 'file', 'func', 'all', 'none'], default='all', help='show part of the calling context') 351 | parser.add_argument('-k', '--known', action='store_true', default=False, 352 | help='show only known function') 353 | parser.add_argument('-l', '--leaf', action='store_true', default=False, 354 | help='show only leaf function') 355 |
parser.add_argument('-of', '--output-format', 356 | choices=['svg', 'png', 'pdf'], default='svg', help='output format') 357 | parser.add_argument('-pn', '--prune-node', default=0.0, 358 | help='prune node lower bound') 359 | parser.add_argument('-pe', '--prune-edge', default=0.0, 360 | help='prune edge lower bound') 361 | parser.add_argument('-kr', '--keep-redundancy', action='store_true', default=False, 362 | help='keep all high redundancy edges') 363 | parser.add_argument( 364 | '-ly', '--layout', choices=['dot', 'neato', 'circo'], default='dot', help='svg layout') 365 | parser.add_argument('-pr', '--pretty', action='store_true', default=False, 366 | help='tune output graph') 367 | parser.add_argument('-v', '--verbose', action='store_true', help='print log') 368 | args = parser.parse_args() 369 | 370 | if args.verbose: 371 | print('Format graph...') 372 | G = format_graph(args) 373 | 374 | if float(args.prune_node) > 0.0 or float(args.prune_edge) > 0.0: 375 | if args.verbose: 376 | print('Prune graph: {} nodes and {} edges...'.format( 377 | len(G.nodes()), len(G.edges()))) 378 | G = prune_graph(G, float(args.prune_node), float(args.prune_edge), args.keep_redundancy) 379 | 380 | if args.verbose: 381 | print('Refine graph...') 382 | if args.pretty: 383 | G = combine_graph(G) 384 | agraph = create_pretty_graph(G.new_agraph()) 385 | else: 386 | agraph = create_plain_graph(G.new_agraph()) 387 | 388 | if args.verbose: 389 | print('Organize graph: {} nodes and {} edges...'.format( 390 | len(agraph.nodes()), len(agraph.edges()))) 391 | agraph.layout(prog=args.layout) 392 | 393 | if args.verbose: 394 | print('Output graph...') 395 | #G.write(args.file + '.dot') 396 | agraph.draw(args.file + '.' + args.output_format) 397 | -------------------------------------------------------------------------------- /python/overhead.sh: -------------------------------------------------------------------------------- 1 | GVPROF_source_path=/root/GVProf 2 | rodinia_path=/root/GVProf/samples 3 | target_log_file=/root/GVProf/overhead.txt 4 | iteration_num=1 5 | 6 | cd $rodinia_path 7 | 8 | run() { 9 | cur_path=$1 10 | block_sampling=$2 11 | kernel_sampling=$3 12 | EXEC_AND_PARAMS=${@:4} 13 | echo ${EXEC_AND_PARAMS} 14 | cd ${cur_path} 15 | echo ${cur_path} >> ${target_log_file} 16 | rm -rf time*.txt gvprof* 17 | for i in $(seq 1 ${iteration_num}); do 18 | { time $EXEC_AND_PARAMS; } 2>>time.txt 19 | done 20 | 21 | gvprof_overhead -i ${iteration_num} -v -e data_flow -ck HPCRUN_SANITIZER_READ_TRACE_IGNORE=1 -ck HPCRUN_SANITIZER_GPU_PATCH_RECORD_NUM=131072 -ck HPCRUN_SANITIZER_GPU_ANALYSIS_BLOCKS=1 $EXEC_AND_PARAMS 22 | 23 | rm -rf gvprof* 24 | 25 | gvprof_overhead -i ${iteration_num} -v -e value_pattern -s ${block_sampling} -ck HPCRUN_SANITIZER_KERNEL_SAMPLING_FREQUENCY=${kernel_sampling} -ck HPCRUN_SANITIZER_GPU_PATCH_RECORD_NUM=131072 -ck HPCRUN_SANITIZER_WHITELIST=w.txt $EXEC_AND_PARAMS 26 | 27 | /usr/bin/python3 ${GVPROF_source_path}/python/filter_time.py >> ${target_log_file} 28 | } 29 | 30 | run $rodinia_path/bfs 20 20 ./bfs ../data/graph1MW_6.txt 31 | run $rodinia_path/backprop 20 20 ./backprop 65536 32 | run $rodinia_path/srad_v1 20 20 ./srad 1 0.5 502 458 33 | run $rodinia_path/hotspot 20 20 ./hotspot 512 2 2 ../data/temp_512 ../data/power_512 output.out 34 | run $rodinia_path/pathfinder 20 20 ./pathfinder 100000 100 20 35 | run $rodinia_path/cfd 20 20 ./euler3d ../data/fvcorr.domn.097K 36 | run $rodinia_path/huffman 20 20 ./pavle ../data/test1024_H2.206587175259.in 37 | run $rodinia_path/lavaMD 100 100 ./lavaMD
-boxes1d 10 38 | run $rodinia_path/hotspot3D 20 20 ./3D 512 8 100 ../data/power_512x8 ../data/temp_512x8 output.out 39 | run $rodinia_path/streamcluster 100 100 ./sc_gpu 10 20 256 65536 65536 1000 none output.txt 1 40 | 41 | # # real applications 42 | # #darknet 43 | # run /root/gpuapps/darknet ./darknet detector test ./cfg/coco.data ./cfg/yolov4.cfg ./yolov4.weights data/dog.jpg -i 0 -thresh 0.25 44 | # # castro 45 | # run /root/gpuapps/Castro/Exec/hydro_tests/Sedov ./Castro2d.gnu.CUDA.ex inputs.2d.cyl_in_cartcoords 46 | # # barracuda 47 | # run /root/gpuapps/barracuda ./bin/barracuda aln sample_data/Saccharomyces_cerevisiae.SGD1.01.50.dna_rm.toplevel.fa sample_data/sample_reads.fastq >quicktest.sai 48 | # # pytorch 49 | # eval "$(/root/anaconda3/bin/conda shell.bash hook)" 50 | # conda activate pytorch 51 | 52 | # cd /root/gpuapps/pytorch_vp/pytorch 53 | # rm -rf gvprof* 54 | # rm w.txt 55 | # ln -s w_resnet.txt w.txt 56 | # run /root/gpuapps/pytorch_vp/pytorch python3 1-resnet50-unit.py 57 | # rm w.txt 58 | # ln -s w_deepwave.txt w.txt 59 | # run /root/gpuapps/pytorch_vp/pytorch python3 2-deepwave-unit.py 60 | # rm w.txt 61 | # ln -s w_bert.txt w.txt 62 | # run /root/gpuapps/pytorch_vp/pytorch python3 3-bert-unit.py 63 | 64 | # conda deactivate 65 | 66 | # # namd 67 | # run /root/gpuapps/NAMD/Linux-x86_64-g++ ./namd3 ../src/alanin 68 | 69 | # # qmcpack 70 | 71 | # #lammps 72 | # run /root/gpuapps/lammps/bench ../build/lmp -k on g 1 -sf kk -in in.lj 73 | -------------------------------------------------------------------------------- /python/test.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from tests.data_flow_test import DataFlowTest 4 | from tests.redundancy_test import RedundancyTest 5 | from tests.value_pattern_test import ValuePatternTest 6 | from tests.instruction_test import InstructionTest 7 | from test_cases import Test 8 | 9 | parser = argparse.ArgumentParser( 10 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 11 | parser.add_argument('-c', '--case', help='case name') 12 | parser.add_argument('-m', '--mode', choices=['data_flow', 'redundancy', 13 | 'value_pattern', 'instruction', 'all'], default='all', help='mode name') 14 | parser.add_argument('-a', '--arch', choices=['sm_70', 'sm_72', 15 | 'sm_75', 'sm_80', 'sm_85'], default='sm_70', help='gpu arch name') 16 | args = parser.parse_args() 17 | 18 | tests = [] 19 | 20 | if args.mode == 'data_flow' or args.mode == 'all': 21 | tests.append(DataFlowTest(args.arch)) 22 | 23 | if args.mode == 'redundancy' or args.mode == 'all': 24 | tests.append(RedundancyTest(args.arch)) 25 | 26 | if args.mode == 'value_pattern' or args.mode == 'all': 27 | tests.append(ValuePatternTest(args.arch)) 28 | 29 | if args.mode == 'instruction' or args.mode == 'all': 30 | tests.append(InstructionTest(args.arch)) 31 | 32 | for test in tests: 33 | print("{}...".format(test.name())) 34 | if args.case is None: 35 | # Test all cases 36 | choice = Test.cases.keys() 37 | else: 38 | choice = [args.case] 39 | test.setup(choice) 40 | test.run() 41 | -------------------------------------------------------------------------------- /python/test_cases.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import os 3 | 4 | from utils import cleanup 5 | 6 | 7 | class Test(object): 8 | Case = namedtuple('Case', ['path', 'versions', 9 | 'command', 'options', 'cleanup']) 10 | cases = dict() 11 | 12 | # unit test cases 13 | 
cases['vectorAdd.f128'] = Case( 14 | path='samples/vectorAdd.f128', versions=[], command='./vectorAdd', options=[], cleanup=True) 15 | cases['op_graph_simple'] = Case( 16 | path='samples/op_graph_simple', versions=[], command='./main', options=[], cleanup=True) 17 | cases['op_pattern_simple'] = Case( 18 | path='samples/op_pattern_simple', versions=[], command='./main', options=[], cleanup=True) 19 | cases['stress'] = Case(path='samples/stress', versions=[], 20 | command='./stress', options=[], cleanup=True) 21 | 22 | # sample test cases 23 | cases['bfs'] = Case(path='samples/bfs', command='./bfs', versions=['vp-opt1', 24 | 'vp-opt2', 'vp-opt'], options=['../data/graph1MW_6.txt'], cleanup=True) 25 | cases['backprop'] = Case(path='samples/backprop', command='./backprop', versions=[ 26 | 'vp-opt1', 'vp-opt2', 'vp-opt'], options=['65536'], cleanup=True) 27 | cases['cfd'] = Case(path='samples/cfd', command='./euler3d', versions=['vp-opt1', 28 | 'vp-opt2', 'vp-opt'], options=['../data/fvcorr.domn.097K'], cleanup=True) 29 | cases['hotspot'] = Case(path='samples/hotspot', command='./hotspot', versions=['vp-opt'], options=[ 30 | '512', '2', '2', '../data/temp_512', '../data/power_512', 'output.out'], cleanup=True) 31 | cases['hotspot3D'] = Case(path='samples/hotspot3D', command='./3D', versions=['vp-opt'], options=[ 32 | '512', '8', '100', '../data/power_512x8', '../data/temp_512x8', 'output.out'], cleanup=True) 33 | cases['huffman'] = Case(path='samples/huffman', command='./pavle', versions=[ 34 | 'vp-opt'], options=['../data/test1024_H2.206587175259.in'], cleanup=True) 35 | cases['lavaMD'] = Case(path='samples/lavaMD', command='./lavaMD', 36 | versions=['vp-opt'], options=['-boxes1d', '10'], cleanup=True) 37 | cases['particlefilter'] = Case(path='samples/particlefilter', command='./particlefilter_float', versions=[ 38 | 'vp-opt'], options=['-x', '128', '-y', '128', '-z', '10', '-np', '1000'], cleanup=True) 39 | cases['pathfinder'] = Case(path='samples/pathfinder', command='./pathfinder', 40 | versions=['vp-opt'], options=['100000', '100', '20'], cleanup=True) 41 | cases['srad'] = Case(path='samples/srad_v1', command='./srad', versions=['vp-opt1', 42 | 'vp-opt2', 'vp-opt'], options=['10', '0.5', '502', '458'], cleanup=True) 43 | cases['streamcluster'] = Case(path='samples/streamcluster', command='./sc_gpu', versions=['vp-opt'], options=[ 44 | '10', '20', '256', '65536', '65536', '1000', 'none', 'output.txt', '1'], cleanup=True) 45 | 46 | # application cases 47 | cases['barracuda'] = Case(path='samples/barracuda', command='./barracuda', versions=['vp-opt'], 48 | options=['aln', 'sample_data/Saccharomyces_cerevisiae.SGD1.01.50.dna_rm.toplevel.fa', 49 | 'sample_data/sample_reads.fastq', '>', 'quicktest.sai'], cleanup=False) 50 | 51 | cases['castro'] = Case(path='samples/Castro/Exec/hydro_tests/Sedov', command='Castro2d.gnu.CUDA.ex', versions=['vp-opt'], 52 | options=['./inputs.2d.cyl_in_cartcoords'], cleanup=False) 53 | 54 | cases['darknet'] = Case(path='samples/darknet', command='./darknet', versions=['vp-opt'], 55 | options=['detector', 'test', './cfg/coco.data', './cfg/yolov4.cfg', 56 | './yolov4.weights', 'data/dog.jpg', '-i', '0', '-thresh', '0.25'], cleanup=False) 57 | 58 | cases['deepwave'] = Case(path='samples/deepwave', command='./Deepwave_SEAM_example1.py', versions=['vp-opt'], 59 | options=[], cleanup=False) 60 | 61 | cases['namd'] = Case(path='samples/NAMD/Linux-x86_64-g++', command='./namd3', 62 | versions=['vp-opt'], options=['../alain'], cleanup=False) 63 | 64 | cases['qmcpack'] = 
Case(path='samples/qmcpack/workspace/NiO/dmc-a4-e48-batched_driver-DU8', 65 | command='../../../build/bin/qmcpack', versions=['vp-opt'], options=['./NiO-fcc-S1-dmc.xml'], cleanup=False) 66 | 67 | def __init__(self, name, arch, version=None): 68 | self._name = name 69 | self._arch = arch 70 | self._version = version 71 | self._configs = dict() 72 | 73 | def name(self): 74 | return self._name 75 | 76 | def setup(self, choices): 77 | pass 78 | 79 | def _run_impl(self, case_name, version): 80 | pass 81 | 82 | def run(self, iterations=1): 83 | cwd = os.getcwd() 84 | 85 | for i in range(iterations): 86 | for case_name, case in Test.cases.items(): 87 | if case_name not in self._configs: 88 | continue 89 | 90 | os.chdir(case.path) 91 | if i == 0 and case.cleanup: 92 | cleanup(self._arch) 93 | 94 | self._run_impl(case_name, None) 95 | 96 | os.chdir(cwd) 97 | 98 | if self._version is None: 99 | continue 100 | 101 | for version in case.versions: 102 | if version == self._version or self._version == 'all': 103 | os.chdir(case.path + '-' + version) 104 | if i == 0 and case.cleanup: 105 | cleanup(self._arch) 106 | 107 | self._run_impl(case_name, version) 108 | 109 | os.chdir(cwd) 110 | -------------------------------------------------------------------------------- /python/tests/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/* 2 | -------------------------------------------------------------------------------- /python/tests/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | sys.path.append("../") 4 | -------------------------------------------------------------------------------- /python/tests/data_flow_test.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import subprocess 3 | import os 4 | import sys 5 | 6 | import pygraphviz as pgv 7 | 8 | from test_cases import Test 9 | from utils import pipe_read 10 | 11 | 12 | class DataFlowTest(Test): 13 | Config = namedtuple('Config', ['files', 'nodes', 'edges']) 14 | 15 | def __init__(self, arch): 16 | super().__init__('DataFlowTest', arch) 17 | 18 | def setup(self, choices): 19 | for choice in choices: 20 | if choice == 'op_graph_simple': 21 | self._configs[choice] = DataFlowTest.Config(files=['data_flow.dot'], 22 | nodes=[17], 23 | edges=[20]) 24 | elif choice == 'bfs': 25 | self._configs[choice] = DataFlowTest.Config(files=['data_flow.dot'], 26 | nodes=[23], 27 | edges=[41]) 28 | 29 | def _run_impl(self, case_name, version): 30 | if case_name not in self._configs: 31 | return 32 | 33 | command = Test.cases[case_name].command 34 | options = Test.cases[case_name].options 35 | path = Test.cases[case_name].path 36 | 37 | pipe_read(['gvprof', '-cfg', '-e', 'data_flow', 38 | command] + options) 39 | 40 | files = self._configs[case_name].files 41 | nodes = self._configs[case_name].nodes 42 | edges = self._configs[case_name].edges 43 | 44 | # Just count the number of nodes and edges, 45 | # redundancy and overwrite is difficult for autotest 46 | for i, f in enumerate(files): 47 | f = 'gvprof-database/' + f 48 | agraph = pgv.AGraph(f, strict=False) 49 | correct = True 50 | if len(agraph.nodes()) != nodes[i]: 51 | print('Error {} nodes (true: {} vs test: {})'.format( 52 | path, nodes[i], len(agraph.nodes()))) 53 | correct = False 54 | if len(agraph.edges()) != edges[i]: 55 | print('Error {} edges (true: {} vs test: {})'.format( 56 | path, edges[i], 
len(agraph.edges()))) 57 | correct = False 58 | if correct is True: 59 | print('Pass ' + path) 60 | -------------------------------------------------------------------------------- /python/tests/instruction_test.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import os 3 | import sys 4 | 5 | from test_cases import Test 6 | from utils import pipe_read 7 | 8 | 9 | class InstructionTest(Test): 10 | Config = namedtuple('Config', ['insts']) 11 | 12 | def __init__(self, arch): 13 | super().__init__('InstructionTest', arch) 14 | 15 | def setup(self, choices): 16 | for choice in choices: 17 | if choice == 'op_pattern_simple': 18 | self._configs[choice] = InstructionTest.Config(insts={ 19 | 'sm_70': 20 | ['FUNC: 18, PC: 0xd0, ACCESS_KIND: INTEGER,v:32,u:32', 21 | 'FUNC: 19, PC: 0xc0, ACCESS_KIND: INTEGER,v:32,u:32', 22 | 'FUNC: 20, PC: 0xf0, ACCESS_KIND: UNKNOWN,v:32,u:32', 23 | 'FUNC: 21, PC: 0x250, ACCESS_KIND: FLOAT,v:64,u:64', 24 | 'FUNC: 22, PC: 0xe0, ACCESS_KIND: UNKNOWN,v:64,u:64', 25 | 'FUNC: 23, PC: 0xe0, ACCESS_KIND: FLOAT,v:64,u:64', 26 | 'FUNC: 23, PC: 0x100, ACCESS_KIND: FLOAT,v:64,u:64'], 27 | 'sm_75': 28 | ['FUNC: 17, PC: 0xb0, ACCESS_KIND: INTEGER,v:32,u:32', 29 | 'FUNC: 18, PC: 0x20, ACCESS_KIND: INTEGER,v:64,u:32', 30 | 'FUNC: 18, PC: 0xe0, ACCESS_KIND: UNKNOWN,v:32,u:32', 31 | 'FUNC: 19, PC: 0x20, ACCESS_KIND: INTEGER,v:64,u:32', 32 | 'FUNC: 19, PC: 0x240, ACCESS_KIND: FLOAT,v:64,u:64', 33 | 'FUNC: 20, PC: 0x20, ACCESS_KIND: INTEGER,v:64,u:32', 34 | 'FUNC: 20, PC: 0xd0, ACCESS_KIND: UNKNOWN,v:64,u:64', 35 | 'FUNC: 21, PC: 0x20, ACCESS_KIND: INTEGER,v:64,u:32', 36 | 'FUNC: 21, PC: 0xd0, ACCESS_KIND: FLOAT,v:64,u:64', 37 | 'FUNC: 21, PC: 0xf0, ACCESS_KIND: FLOAT,v:64,u:64'], 38 | 'sm_80': 39 | ['FUNC: 17, PC: 0xa0, ACCESS_KIND: INTEGER,v:64,u:32', 40 | 'FUNC: 17, PC: 0xc0, ACCESS_KIND: INTEGER,v:32,u:32', 41 | 'FUNC: 18, PC: 0x20, ACCESS_KIND: INTEGER,v:64,u:32', 42 | 'FUNC: 18, PC: 0xc0, ACCESS_KIND: INTEGER,v:64,u:32', 43 | 'FUNC: 18, PC: 0xe0, ACCESS_KIND: UNKNOWN,v:32,u:32', 44 | 'FUNC: 19, PC: 0x20, ACCESS_KIND: INTEGER,v:64,u:32', 45 | 'FUNC: 19, PC: 0xe0, ACCESS_KIND: INTEGER,v:64,u:32', 46 | 'FUNC: 19, PC: 0x230, ACCESS_KIND: FLOAT,v:64,u:64', 47 | 'FUNC: 20, PC: 0x20, ACCESS_KIND: INTEGER,v:64,u:32', 48 | 'FUNC: 20, PC: 0xb0, ACCESS_KIND: INTEGER,v:64,u:32', 49 | 'FUNC: 20, PC: 0xd0, ACCESS_KIND: UNKNOWN,v:64,u:32', 50 | 'FUNC: 21, PC: 0x20, ACCESS_KIND: INTEGER,v:64,u:32', 51 | 'FUNC: 21, PC: 0xb0, ACCESS_KIND: INTEGER,v:64,u:32', 52 | 'FUNC: 21, PC: 0xd0, ACCESS_KIND: FLOAT,v:64,u:64', 53 | 'FUNC: 21, PC: 0xf0, ACCESS_KIND: FLOAT,v:64,u:64'] 54 | }) 55 | elif choice == 'bfs': 56 | self._configs[choice] = InstructionTest.Config(insts={ 57 | 'sm_70': 58 | ['FUNC: 10, PC: 0xa0, ACCESS_KIND: INTEGER,v:8,u:8', 59 | 'FUNC: 10, PC: 0x170, ACCESS_KIND: INTEGER,v:8,u:8', 60 | 'FUNC: 10, PC: 0x180, ACCESS_KIND: INTEGER,v:8,u:8', 61 | 'FUNC: 10, PC: 0x190, ACCESS_KIND: INTEGER,v:8,u:8', 62 | 'FUNC: 10, PC: 0x1a0, ACCESS_KIND: UNKNOWN,v:8,u:8', 63 | 'FUNC: 11, PC: 0x90, ACCESS_KIND: INTEGER,v:8,u:8', 64 | 'FUNC: 11, PC: 0xd0, ACCESS_KIND: UNKNOWN,v:8,u:8', 65 | 'FUNC: 11, PC: 0xf0, ACCESS_KIND: INTEGER,v:32,u:32', 66 | 'FUNC: 11, PC: 0x120, ACCESS_KIND: INTEGER,v:32,u:32', 67 | 'FUNC: 11, PC: 0x1b0, ACCESS_KIND: INTEGER,v:32,u:32', 68 | 'FUNC: 11, PC: 0x1f0, ACCESS_KIND: INTEGER,v:8,u:8', 69 | 'FUNC: 11, PC: 0x210, ACCESS_KIND: INTEGER,v:32,u:32', 70 | 'FUNC: 11, PC: 0x290, ACCESS_KIND: 
INTEGER,v:32,u:32', 71 | 'FUNC: 11, PC: 0x2a0, ACCESS_KIND: INTEGER,v:8,u:8', 72 | 'FUNC: 11, PC: 0x2b0, ACCESS_KIND: INTEGER,v:32,u:32', 73 | 'FUNC: 11, PC: 0x2c0, ACCESS_KIND: INTEGER,v:32,u:32'], 74 | 'sm_75': 75 | ['FUNC: 10, PC: 0x70, ACCESS_KIND: INTEGER,v:64,u:64', 76 | 'FUNC: 10, PC: 0x80, ACCESS_KIND: INTEGER,v:8,u:8', 77 | 'FUNC: 10, PC: 0xc0, ACCESS_KIND: INTEGER,v:64,u:64', 78 | 'FUNC: 10, PC: 0xd0, ACCESS_KIND: INTEGER,v:64,u:64', 79 | 'FUNC: 10, PC: 0x100, ACCESS_KIND: UNKNOWN,v:8,u:8', 80 | 'FUNC: 10, PC: 0x110, ACCESS_KIND: INTEGER,v:64,u:64', 81 | 'FUNC: 10, PC: 0x120, ACCESS_KIND: UNKNOWN,v:8,u:8', 82 | 'FUNC: 10, PC: 0x130, ACCESS_KIND: INTEGER,v:64,u:64', 83 | 'FUNC: 10, PC: 0x140, ACCESS_KIND: UNKNOWN,v:8,u:8', 84 | 'FUNC: 10, PC: 0x150, ACCESS_KIND: UNKNOWN,v:8,u:8', 85 | 'FUNC: 11, PC: 0x70, ACCESS_KIND: INTEGER,v:64,u:64', 86 | 'FUNC: 11, PC: 0x80, ACCESS_KIND: INTEGER,v:8,u:8', 87 | 'FUNC: 11, PC: 0xd0, ACCESS_KIND: INTEGER,v:64,u:64', 88 | 'FUNC: 11, PC: 0x100, ACCESS_KIND: UNKNOWN,v:8,u:8', 89 | 'FUNC: 11, PC: 0x110, ACCESS_KIND: INTEGER,v:32,u:32', 90 | 'FUNC: 11, PC: 0x140, ACCESS_KIND: INTEGER,v:32,u:32', 91 | 'FUNC: 11, PC: 0x1c0, ACCESS_KIND: INTEGER,v:32,u:32', 92 | 'FUNC: 11, PC: 0x1d0, ACCESS_KIND: INTEGER,v:64,u:64', 93 | 'FUNC: 11, PC: 0x1f0, ACCESS_KIND: INTEGER,v:8,u:8', 94 | 'FUNC: 11, PC: 0x210, ACCESS_KIND: INTEGER,v:32,u:32', 95 | 'FUNC: 11, PC: 0x240, ACCESS_KIND: INTEGER,v:64,u:64', 96 | 'FUNC: 11, PC: 0x280, ACCESS_KIND: INTEGER,v:32,u:32', 97 | 'FUNC: 11, PC: 0x290, ACCESS_KIND: UNKNOWN,v:8,u:8', 98 | 'FUNC: 11, PC: 0x2a0, ACCESS_KIND: INTEGER,v:32,u:32', 99 | 'FUNC: 11, PC: 0x2b0, ACCESS_KIND: INTEGER,v:32,u:32'], 100 | 'sm_80': 101 | ['FUNC: 10, PC: 0x70, ACCESS_KIND: INTEGER,v:64,u:32', 102 | 'FUNC: 10, PC: 0xa0, ACCESS_KIND: INTEGER,v:8,u:8', 103 | 'FUNC: 10, PC: 0x150, ACCESS_KIND: INTEGER,v:8,u:8', 104 | 'FUNC: 10, PC: 0x180, ACCESS_KIND: INTEGER,v:8,u:8', 105 | 'FUNC: 10, PC: 0x190, ACCESS_KIND: INTEGER,v:8,u:8', 106 | 'FUNC: 10, PC: 0x1a0, ACCESS_KIND: UNKNOWN,v:8,u:8', 107 | 'FUNC: 11, PC: 0x70, ACCESS_KIND: INTEGER,v:64,u:32', 108 | 'FUNC: 11, PC: 0x90, ACCESS_KIND: INTEGER,v:8,u:8', 109 | 'FUNC: 11, PC: 0xd0, ACCESS_KIND: UNKNOWN,v:8,u:8', 110 | 'FUNC: 11, PC: 0xf0, ACCESS_KIND: INTEGER,v:32,u:32', 111 | 'FUNC: 11, PC: 0x120, ACCESS_KIND: INTEGER,v:32,u:32', 112 | 'FUNC: 11, PC: 0x1a0, ACCESS_KIND: INTEGER,v:32,u:32', 113 | 'FUNC: 11, PC: 0x1e0, ACCESS_KIND: INTEGER,v:8,u:8', 114 | 'FUNC: 11, PC: 0x200, ACCESS_KIND: INTEGER,v:32,u:32', 115 | 'FUNC: 11, PC: 0x280, ACCESS_KIND: INTEGER,v:32,u:32', 116 | 'FUNC: 11, PC: 0x290, ACCESS_KIND: INTEGER,v:8,u:8', 117 | 'FUNC: 11, PC: 0x2a0, ACCESS_KIND: INTEGER,v:32,u:32', 118 | 'FUNC: 11, PC: 0x2b0, ACCESS_KIND: INTEGER,v:32,u:32'] 119 | }) 120 | 121 | def _run_impl(self, case_name, version): 122 | command = Test.cases[case_name].command 123 | options = Test.cases[case_name].options 124 | path = Test.cases[case_name].path 125 | 126 | pipe_read(['gvprof', '-cfg', '-e', 'data_flow', command] + options) 127 | 128 | files = os.listdir('./gvprof-measurements/structs/nvidia/') 129 | 130 | insts = self._configs[case_name].insts 131 | 132 | for f in files: 133 | if f.find('.inst') != -1: 134 | bufs = pipe_read( 135 | ['redshow_parser', './gvprof-measurements/structs/nvidia/' + f]).decode('utf-8').splitlines() 136 | 137 | correct = True 138 | for n, buf in enumerate(bufs): 139 | if buf != insts[self._arch][n]: 140 | print('Error {} line {} (true: {} vs test: {})'.format( 141 | path, n, 
insts[self._arch][n], buf)) 142 | correct = False 143 | if correct is True: 144 | print('Pass ' + path + ' ' + f) 145 | -------------------------------------------------------------------------------- /python/tests/redundancy_test.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import subprocess 3 | import os 4 | import sys 5 | 6 | from test_cases import Test 7 | from utils import pipe_read 8 | 9 | 10 | class RedundancyTest(Test): 11 | Config = namedtuple('Config', ['spatial_read_files', 'spatial_read_reds', 'spatial_write_files', 'spatial_write_reds', 12 | 'temporal_read_files', 'temporal_read_reds', 'temporal_write_files', 'temporal_write_reds', 'total', 'sampling', 'tolerate']) 13 | 14 | def __init__(self, arch): 15 | super().__init__('RedundancyTest', arch) 16 | 17 | def setup(self, choices): 18 | for choice in choices: 19 | if choice == 'vectorAdd.f128': 20 | self._configs[choice] = RedundancyTest.Config( 21 | spatial_read_files=['spatial_read_t0.csv'], 22 | spatial_read_reds=[3], 23 | spatial_write_files=['spatial_write_t0.csv'], 24 | spatial_write_reds=[1], 25 | temporal_read_files=['temporal_read_t0.csv'], 26 | temporal_read_reds=[0], 27 | temporal_write_files=['temporal_write_t0.csv'], 28 | temporal_write_reds=[0], 29 | total=[12], 30 | sampling=0, 31 | tolerate=0.0) 32 | elif choice == 'bfs': 33 | self._configs[choice] = RedundancyTest.Config( 34 | spatial_read_files=['spatial_read_t0.csv'], 35 | spatial_read_reds=[27707987], 36 | spatial_write_files=['spatial_write_t0.csv'], 37 | spatial_write_reds=[7997516], 38 | temporal_read_files=['temporal_read_t0.csv'], 39 | temporal_read_reds=[5603846], 40 | temporal_write_files=['temporal_write_t0.csv'], 41 | temporal_write_reds=[0], 42 | total=[52653451], 43 | sampling=0, 44 | tolerate=0.02) 45 | elif choice == 'backprop': 46 | self._configs[choice] = [ 47 | RedundancyTest.Config( 48 | spatial_read_files=['spatial_read_t0.csv'], 49 | spatial_read_reds=[4194507], 50 | spatial_write_files=['spatial_write_t0.csv'], 51 | spatial_write_reds=[1048623], 52 | temporal_read_files=['temporal_read_t0.csv'], 53 | temporal_read_reds=[3149872], 54 | temporal_write_files=['temporal_write_t0.csv'], 55 | temporal_write_reds=[0], 56 | total=[19988592], 57 | sampling=0, 58 | tolerate=0.01), 59 | RedundancyTest.Config( 60 | spatial_read_files=['spatial_read_t0.csv'], 61 | spatial_read_reds=[84039], 62 | spatial_write_files=['spatial_write_t0.csv'], 63 | spatial_write_reds=[21009], 64 | temporal_read_files=['temporal_read_t0.csv'], 65 | temporal_read_reds=[63058], 66 | temporal_write_files=['temporal_write_t0.csv'], 67 | temporal_write_reds=[0], 68 | total=[400160], 69 | sampling=50, 70 | tolerate=0.05)] 71 | 72 | def _run_impl(self, case_name, version): 73 | runs = self._configs[case_name] 74 | if not isinstance(runs, list): 75 | runs = [runs] 76 | 77 | command = Test.cases[case_name].command 78 | options = Test.cases[case_name].options 79 | path = Test.cases[case_name].path 80 | 81 | for run in runs: 82 | sampling = '' 83 | if run.sampling != 0: 84 | sampling = 'sampling' 85 | pipe_read(['gvprof', '-cfg', '-e', 'redundancy@' + 86 | str(run.sampling), command] + options) 87 | else: 88 | pipe_read(['gvprof', '-cfg', '-e', 'redundancy', 89 | command] + options) 90 | 91 | def redundancy_compare(red_files, true_reds): 92 | for i, red_file in enumerate(red_files): 93 | red_file = 'gvprof-database/' + red_file 94 | res = pipe_read(['tail', '-n', '1', red_file]).decode() 95 | red = 
float(res.split(',')[0]) 96 | true_red = float(true_reds[i]) 97 | epsilon = red if true_red == 0.0 else abs( 98 | red - true_red) / true_red 99 | if epsilon > run.tolerate: 100 | print('Error {} {}: (true: {} vs test: {})'.format( 101 | path, red_file, true_red, red)) 102 | else: 103 | print('Pass ' + path + ' ' + red_file + ' ' + sampling) 104 | 105 | redundancy_compare(run.spatial_read_files, run.spatial_read_reds) 106 | redundancy_compare(run.spatial_write_files, run.spatial_write_reds) 107 | redundancy_compare(run.temporal_read_files, run.temporal_read_reds) 108 | redundancy_compare(run.temporal_write_files, 109 | run.temporal_write_reds) 110 | -------------------------------------------------------------------------------- /python/tests/value_pattern_test.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import subprocess 3 | import os 4 | import sys 5 | 6 | from test_cases import Test 7 | from utils import pipe_read 8 | 9 | 10 | class ValuePatternTest(Test): 11 | Config = namedtuple('Config', ['files', 'op_counts', 'kernel_patterns']) 12 | 13 | def __init__(self, arch): 14 | super().__init__('ValuePatternTest', arch) 15 | 16 | def setup(self, choices): 17 | for choice in choices: 18 | if choice == 'op_pattern_simple': 19 | self._configs[choice] = ValuePatternTest.Config( 20 | files=['value_pattern_t0.csv'], 21 | op_counts=[[[500, 500, 1, 500], [500, 500, 1, 500]], 22 | [[250, 250, 1, 250], [250, 250, 1, 250]], 23 | [[500, 500, 500, 500], [500, 500, 1, 500]], 24 | [[500, 500, 1, 500], [500, 500, 1, 500]], 25 | [[500, 500, 500, 500]], 26 | [[1000, 1000, 20, 1000]]], 27 | kernel_patterns=[['Redundant Zeros', 'Single Value'], 28 | ['Redundant Zeros', 'Redundant Zeros'], 29 | ['Type Overuse', 'Single Value'], 30 | ['Redundant Zeros', 'Single Value'], 31 | ['Structured'], ['Dense Value']]) 32 | elif choice == 'bfs': 33 | self._configs[choice] = ValuePatternTest.Config( 34 | files=['value_pattern_t0.csv'], 35 | op_counts=[[[5861406, 2000000, 1000014, 5861406], 36 | [5999970, 5999970, 1000000, 5999970], 37 | [12000000, 0, 0, 0], 38 | [5999970, 32710, 2, 119381], 39 | [1930703, 633664, 11, 1930703], 40 | [1000000, 1000000, 1, 1000000], 41 | [0, 0, 0, 0], 42 | [1000000, 1000000, 1, 1000000], 43 | [1930703, 999999, 1, 1930703], 44 | [1930703, 999999, 11, 1930703]], 45 | [[12000000, 1, 1, 12], 46 | [999999, 999999, 1, 999999], 47 | [999999, 999999, 1, 999999], 48 | [0, 0, 0, 0], 49 | [999999, 999999, 1, 999999], 50 | [999999, 999999, 1, 999999], 51 | [999999, 1, 1, 999999]]], 52 | kernel_patterns=[['No Pattern', 'No Pattern', 'No Pattern', 53 | 'No Pattern', 'Dense Value', 'Inappropriate', 54 | 'Dense Value', 'Redundant Zeros', 'Single Value', 55 | 'Dense Value'], 56 | ['No Pattern', 'Single Value', 'Inappropriate', 57 | 'Dense Value', 'Redundant Zeros', 'Single Value', 58 | 'Single Value']] 59 | ) 60 | 61 | def _run_impl(self, case_name, version): 62 | def check(op_counts, kernel_patterns, buf: str): 63 | lines = buf.splitlines() 64 | order = -1 65 | count = -1 66 | pattern = -1 67 | find_pattern = False 68 | for n, line in enumerate(lines): 69 | count_line = False 70 | pattern_line = False 71 | dist_line = False 72 | if line.find('kernel id') != -1: 73 | order += 1 74 | pattern = -1 75 | elif line.find('array id:') != -1: 76 | count = -1 77 | pattern += 1 78 | find_pattern = False 79 | elif line.find('count:') != -1: 80 | count += 1 81 | count_line = True 82 | elif line.find(' * ') != -1: 83 | pattern_line = True 84 
| elif line.find('TOP') != -1:
85 | dist_line = True
86 | if count_line is True:
87 | v = int(line.split(':')[1])
88 | if op_counts[order][pattern][count] != v:
89 | return False, ' line {} count error: (true: {} vs test: {})'.format(n, op_counts[order][pattern][count], v)
90 | elif pattern_line is True:
91 | if line.find(kernel_patterns[order][pattern]) != -1:  # find() returns -1 when absent; the bare truthiness test was inverted
92 | find_pattern = True
93 | elif dist_line is True:
94 | if find_pattern is False:
95 | return False, ' line {} pattern error: (true: {})'.format(n, kernel_patterns[order][pattern])
96 | return True, ''
97 |
98 | command = Test.cases[case_name].command
99 | options = Test.cases[case_name].options
100 | path = Test.cases[case_name].path
101 |
102 | pipe_read(['gvprof', '-cfg', '-e', 'value_pattern', command] + options)
103 |
104 | files = self._configs[case_name].files
105 | op_counts = self._configs[case_name].op_counts
106 | kernel_patterns = self._configs[case_name].kernel_patterns
107 |
108 | for f in files:
109 | buf = pipe_read(
110 | ['cat', 'gvprof-database/' + f]).decode('utf-8')
111 | res, msg = check(op_counts, kernel_patterns, buf)
112 | if res is False:
113 | print('Error ' + path + ' ' + msg)
114 | else:
115 | print('Pass ' + path + ' ' + f)
116 |
--------------------------------------------------------------------------------
/python/utils.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | import csv
3 |
4 |
5 | def pipe_read(command, debug=False):
6 | process = subprocess.Popen(command,
7 | stdout=subprocess.PIPE,
8 | stderr=subprocess.PIPE)
9 | stdout, stderr = process.communicate()
10 | if debug is True:
11 | print(stdout)
12 | print(stderr)
13 | return stdout
14 |
15 |
16 | def cleanup(arch):
17 | pipe_read(['make', 'clean'])
18 | if arch is not None:
19 | pipe_read(['make', 'GPU_ARCH=-arch {}'.format(arch)])  # format(), not join(): substitute the arch into the make variable
20 | else:
21 | pipe_read(['make'])
22 |
23 |
24 | def nsys_profile(command, kernels):
25 | pipe_read(['nsys', 'profile', '-f', 'true', '-o', 'tmp'] + command)
26 | pipe_read(['nsys', 'stats', '--report', 'gpukernsum', '--report', 'gpumemtimesum',
27 | '--format', 'csv', '-o', 'tmp', '--force-overwrite', './tmp.qdrep'])
28 |
29 | kernel_times = dict()
30 |
31 | gpu_kernel_time = 0.0
32 |
33 | with open('tmp_gpukernsum.csv', newline='') as csvfile:
34 | spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
35 |
36 | first_row = True
37 | for row in spamreader:
38 | if first_row is True:
39 | first_row = False
40 | continue
41 |
42 | time = row[1]
43 | kernel_args_name = row[6].replace('"', '').replace('void ', '')
44 | gpu_kernel_time += float(time)
45 |
46 | for kernel_name, template in kernels:
47 | if template is True:
48 | match_kernel_name = kernel_name
49 | else:
50 | match_kernel_name = kernel_name + '('
51 | if kernel_args_name.startswith(match_kernel_name) is True:
52 | kernel_times[kernel_name] = float(time)
53 | break
54 |
55 | gpu_mem_time = 0.0
56 |
57 | with open('tmp_gpumemtimesum.csv', newline='') as csvfile:
58 | spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
59 |
60 | first_row = True
61 | for row in spamreader:
62 | if first_row is True:
63 | first_row = False
64 | else:
65 | gpu_mem_time += float(row[1])
66 |
67 | return kernel_times, gpu_kernel_time, gpu_mem_time
68 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pygraphviz
2 | numpy
3 | argparse
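Taken together, the helpers above are enough to drive a profiling run by hand. A minimal sketch, assuming gvprof is on PATH and a built ./vectorAdd binary in the current directory (the binary name is only illustrative):

    from utils import pipe_read, nsys_profile

    # Profile once under GVProf, exactly as the tests above do;
    # pipe_read returns the captured stdout as bytes.
    out = pipe_read(['gvprof', '-cfg', '-e', 'data_flow', './vectorAdd']).decode('utf-8')

    # Time the same binary with Nsight Systems. The boolean in each
    # (name, template) pair tells nsys_profile whether to match the kernel
    # name as a template prefix (True) or literally as 'name(' (False).
    kernel_times, gpu_kernel_time, gpu_mem_time = nsys_profile(
        ['./vectorAdd'], [('vectorAdd', False)])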
--------------------------------------------------------------------------------
/src/gpu-analysis.cu:
--------------------------------------------------------------------------------
1 | #include "gpu-patch.h"
2 | #include "gpu-queue.h"
3 | #include "utils.h"
4 |
5 | #include <cub/cub.cuh>
6 |
7 | #define GPU_ANALYSIS_DEBUG 0
8 |
9 | #if GPU_ANALYSIS_DEBUG
10 | #define PRINT(...) \
11 | if (threadIdx.x == 0 && blockIdx.x == 0) { \
12 | printf(__VA_ARGS__); \
13 | }
14 | #define PRINT_ALL(...) \
15 | printf(__VA_ARGS__)
16 | #define PRINT_RECORDS(buffer) \
17 | __syncthreads(); \
18 | if (threadIdx.x == 0) { \
19 | gpu_patch_analysis_address_t *records = (gpu_patch_analysis_address_t *)buffer->records; \
20 | for (uint32_t i = 0; i < buffer->head_index; ++i) { \
21 | printf("gpu analysis-> merged <%p, %p> (%p)\n", records[i].start, records[i].end, records[i].end - records[i].start); \
22 | } \
23 | } \
24 | __syncthreads();
25 | #else
26 | #define PRINT(...)
27 | #define PRINT_ALL(...)
28 | #define PRINT_RECORDS(buffer)
29 | #endif
30 |
31 | #define MAX_U64 (0xFFFFFFFFFFFFFFFF)
32 | #define MAX_U32 (0xFFFFFFFF)
33 |
34 | static
35 | __device__
36 | void
37 | interval_compact
38 | (
39 | gpu_patch_buffer_t *patch_buffer,
40 | gpu_patch_buffer_t *read_buffer,
41 | gpu_patch_buffer_t *write_buffer
42 | )
43 | {
44 | auto warp_index = blockDim.x / GPU_PATCH_WARP_SIZE * blockIdx.x + threadIdx.x / GPU_PATCH_WARP_SIZE;
45 | auto num_warps = blockDim.x / GPU_PATCH_WARP_SIZE;
46 | auto laneid = get_laneid();
47 | gpu_patch_record_address_t *records = (gpu_patch_record_address_t *)patch_buffer->records;
48 | gpu_patch_analysis_address_t *read_records = (gpu_patch_analysis_address_t *)read_buffer->records;
49 | gpu_patch_analysis_address_t *write_records = (gpu_patch_analysis_address_t *)write_buffer->records;
50 |
51 | PRINT("gpu analysis->full: %u, analysis: %u, head_index: %u, tail_index: %u, size: %u, num_threads: %u",
52 | patch_buffer->full, patch_buffer->analysis, patch_buffer->head_index, patch_buffer->tail_index,
53 | patch_buffer->size, patch_buffer->num_threads)
54 |
55 | for (auto iter = warp_index; iter < patch_buffer->head_index; iter += num_warps) {
56 | gpu_patch_record_address_t *record = records + iter;
57 | uint64_t address_start = record->address[laneid];
58 | if (((0x1u << laneid) & record->active) == 0) {
59 | // Inactive lanes: their address_start does not matter
60 | address_start = 0;
61 | }
62 |
63 | // Sort addresses and check if they are contiguous
64 | address_start = warp_sort(address_start, laneid);
65 |
66 | // First non-zero lane
67 | uint32_t b = ballot((int32_t)(address_start != 0));
68 | uint32_t first_laneid = __ffs(b) - 1;
69 | uint64_t interval_start = 0;
70 | interval_start = shfl_up(address_start, 1);
71 |
72 | PRINT_ALL("gpu_analysis <%d, %d>->active: %x, interval_start: %p, address_start: %p\n",
73 | blockIdx.x, threadIdx.x, record->active, interval_start, address_start);
74 |
75 | int32_t interval_start_point = 0;
76 | if (first_laneid == laneid || (address_start != 0 && (interval_start + record->size < address_start))) {
77 | interval_start_point = 1;
78 | }
79 |
80 | // In the worst case, a for loop takes 31 * 3 steps (shift + compare + loop) to find
81 | // the right end. The following procedure finds the end with ~10 instructions.
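// A toy example of the start-point test above, assuming 5 active lanes and 4-byte accesses:
//   lane:                    0      1      2      3      4
//   address_start (sorted):  0x100  0x104  0x108  0x200  0x204
// Lane 0 is the first non-zero lane, and lane 3 sees interval_start + size = 0x10c < 0x200,
// so only lanes 0 and 3 set interval_start_point: the warp contributes exactly the two
// intervals [0x100, 0x10c) and [0x200, 0x208).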
82 | // Find the end position
83 | // 00100010b
84 | // 76543210
85 | // x
86 | // laneid = 1
87 | b = ballot(interval_start_point);
88 |
89 | PRINT_ALL("gpu_analysis <%d, %d>->ballot: %x, interval_start_point: %d, address_start: %p\n",
90 | blockIdx.x, threadIdx.x, b, interval_start_point, address_start);
91 |
92 | // 00100010b
93 | // b_rev
94 | // 01000100b
95 | // 76543210
96 | // x
97 | // laneid_rev = 8 - 1 - 1 = 6
98 | uint32_t b_rev = brev(b);
99 | uint32_t laneid_rev = GPU_PATCH_WARP_SIZE - laneid - 1;
100 | uint32_t laneid_rev_mask = (1 << laneid_rev) - 1;
101 |
102 | PRINT_ALL("gpu_analysis <%d, %d>->b_rev: %x, laneid_rev: %x, laneid_rev_mask: %x\n",
103 | blockIdx.x, threadIdx.x, b_rev, laneid_rev, laneid_rev_mask);
104 |
105 | // 00000100b
106 | // 76543210
107 | // x
108 | // p_rev = 2
109 | // p = 8 - 2 - 1 = 5
110 | uint32_t p = bfind(laneid_rev_mask & b_rev);
111 | if (p != MAX_U32) {
112 | // Get the end of the interval
113 | // max(p) = 30
114 | p = GPU_PATCH_WARP_SIZE - p - 1 - 1;
115 | } else {
116 | // Get last
117 | p = GPU_PATCH_WARP_SIZE - 1;
118 | }
119 | uint64_t address_end = address_start + record->size;
120 | address_end = shfl(address_end, p);
121 |
122 | PRINT_ALL("gpu_analysis <%d, %d>->p: %d, address_start: %p, address_end: %p\n",
123 | blockIdx.x, threadIdx.x, p, address_start, address_end);
124 |
125 | if (interval_start_point == 1) {
126 | gpu_patch_analysis_address_t *address_record = NULL;
127 |
128 | if (record->flags & GPU_PATCH_READ) {
129 | address_record = read_records + gpu_queue_get(read_buffer);
130 | address_record->start = address_start;
131 | address_record->end = address_end;
132 |
133 | PRINT_ALL("gpu_analysis <%d, %d>->push address_start: %p, address_end: %p\n",
134 | blockIdx.x, threadIdx.x, address_start, address_end);
135 | gpu_queue_push(read_buffer);
136 | }
137 |
138 | if (record->flags & GPU_PATCH_WRITE) {
139 | address_record = write_records + gpu_queue_get(write_buffer);
140 | address_record->start = address_start;
141 | address_record->end = address_end;
142 |
143 | PRINT_ALL("gpu_analysis <%d, %d>->push address_start: %p, address_end: %p\n",
144 | blockIdx.x, threadIdx.x, address_start, address_end);
145 | gpu_queue_push(write_buffer);
146 | }
147 | }
148 | }
149 | }
150 |
151 |
152 | template <int THREADS, int ITEMS>
153 | static
154 | __device__
155 | int
156 | interval_merge_impl
157 | (
158 | uint64_t *d_in,
159 | uint64_t *d_out,
160 | uint32_t valid_items
161 | )
162 | {
163 | // Specialize BlockLoad type for our thread block (uses warp-striped loads for coalescing, then transposes in shared memory to a blocked arrangement)
164 | typedef cub::BlockLoad<uint64_t, THREADS, ITEMS, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoadT;
165 | // Specialize BlockStore type for our thread block (transposes from a blocked arrangement in shared memory, then uses warp-striped stores for coalescing)
166 | typedef cub::BlockStore<uint64_t, THREADS, ITEMS, cub::BLOCK_STORE_WARP_TRANSPOSE> BlockStoreT;
167 | // Specialize BlockRadixSort type for our thread block
168 | typedef cub::BlockRadixSort<uint64_t, THREADS, ITEMS, int> BlockRadixSortT;
169 | // Specialize BlockScan type for our thread block
170 | typedef cub::BlockScan<int, THREADS> BlockScanT;
171 | // Specialize BlockDiscontinuity for a 1D block of THREADS threads on type int
172 | typedef cub::BlockDiscontinuity<int, THREADS> BlockDiscontinuity;
173 | // Shared memory
174 | __shared__ union TempStorage
175 | {
176 | typename BlockLoadT::TempStorage load;
177 | typename BlockStoreT::TempStorage store;
178 | typename BlockRadixSortT::TempStorage sort;
179 | typename BlockScanT::TempStorage scan;
180 | typename BlockDiscontinuity::TempStorage disc;
181 | } temp_storage;
182 |
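// Note on the TempStorage union above: the CUB primitives run in distinct phases that are
// separated by __syncthreads() below, so their temporary storage can safely alias the same
// shared memory; a struct instead of a union would multiply the shared-memory footprint
// and reduce occupancy.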
183 | // Per-thread tile items
184 | uint64_t items[ITEMS];
185 | int interval_start_point[ITEMS];
186 | int interval_end_point[ITEMS];
187 | int interval_start_index[ITEMS];
188 | int interval_end_index[ITEMS];
189 |
190 | // Load items into a blocked arrangement
191 | BlockLoadT(temp_storage.load).Load(d_in, items, valid_items, MAX_U64);
192 | __syncthreads();
193 |
194 | for (uint32_t i = 0; i < ITEMS / 2; ++i) {
195 | if (items[i * 2] != MAX_U64) {
196 | items[i * 2] = items[i * 2] << 1;
197 | }
198 | if (items[i * 2 + 1] != MAX_U64) {
199 | items[i * 2 + 1] = (items[i * 2 + 1] << 1) + 1;
200 | }
201 | }
202 |
203 | for (uint32_t i = 0; i < ITEMS / 2; ++i) {
204 | if (items[i * 2] != MAX_U64) {
205 | interval_start_point[i * 2] = 1;
206 | } else {
207 | interval_start_point[i * 2] = 0;
208 | }
209 | if (items[i * 2 + 1] != MAX_U64) {
210 | interval_start_point[i * 2 + 1] = -1;
211 | } else {
212 | interval_start_point[i * 2 + 1] = 0;
213 | }
214 | interval_end_point[i * 2] = 0;
215 | interval_end_point[i * 2 + 1] = 0;
216 | interval_start_index[i * 2] = 0;
217 | interval_start_index[i * 2 + 1] = 0;
218 | interval_end_index[i * 2] = 0;
219 | interval_end_index[i * 2 + 1] = 0;
220 | }
221 |
222 | // Sort keys
223 | BlockRadixSortT(temp_storage.sort).Sort(items, interval_start_point);
224 | __syncthreads();
225 |
226 | // Get end marks
227 | BlockScanT(temp_storage.scan).InclusiveSum(interval_start_point, interval_start_point);
228 | __syncthreads();
229 |
230 | for (uint32_t i = 0; i < ITEMS; ++i) {
231 | if (items[i] != MAX_U64 && interval_start_point[i] == 0) {
232 | interval_end_point[i] = 1;
233 | }
234 | }
235 |
236 | // Get start marks
237 | // XXX(Keren): this interface has a different input and output order.
238 | BlockDiscontinuity(temp_storage.disc).FlagHeads(interval_start_point, interval_end_point, cub::Inequality());
239 | __syncthreads();
240 |
241 | for (uint32_t i = 0; i < ITEMS; ++i) {
242 | if (items[i] != MAX_U64 && interval_start_point[i] == 1 && interval_end_point[i] != 1) {
243 | interval_start_point[i] = 1;
244 | } else {
245 | interval_start_point[i] = 0;
246 | }
247 | }
248 |
249 | // Get interval start index
250 | int aggregate = 0;
251 | BlockScanT(temp_storage.scan).InclusiveSum(interval_start_point, interval_start_index, aggregate);
252 | __syncthreads();
253 |
254 | // Get interval end index
255 | BlockScanT(temp_storage.scan).InclusiveSum(interval_end_point, interval_end_index);
256 | __syncthreads();
257 |
258 | // Put indices in the corresponding slots
259 | for (uint32_t i = 0; i < ITEMS; ++i) {
260 | if (interval_start_point[i] == 1) {
261 | d_out[(interval_start_index[i] - 1) * 2] = (items[i] >> 1);
262 | }
263 | if (interval_end_point[i] == 1) {
264 | d_out[(interval_end_index[i] - 1) * 2 + 1] = (items[i] - 1) >> 1;
265 | }
266 | }
267 |
268 | return aggregate;
269 | }
270 |
271 |
272 | template <int THREADS, int ITEMS>
273 | static
274 | __device__
275 | void
276 | interval_merge
277 | (
278 | gpu_patch_buffer_t *buffer
279 | )
280 | {
281 | uint32_t cur_index = 0;
282 | uint32_t items = 0;
283 | uint32_t tile_size = THREADS * ITEMS;
284 | uint64_t *records = (uint64_t *)buffer->records;
285 | for (; cur_index + (tile_size / 2) <= buffer->head_index; cur_index += (tile_size / 2)) {
286 | items += interval_merge_impl<THREADS, ITEMS>(records + cur_index * 2, records + items * 2, tile_size);
287 | PRINT("gpu analysis-> head_index %u, cur_index %u, tile_size %u, items %u\n", buffer->head_index, cur_index, tile_size, items);
288 | __syncthreads();
289 | }
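// Worked example (assumed values) for one tile: the intervals [0x100, 0x10c) and
// [0x104, 0x110) arrive as the keys 0x100<<1, 0x10c<<1|1, 0x104<<1, 0x110<<1|1. After the
// radix sort, the inclusive sum of the +1/-1 markers reads 1, 2, 1, 0: it only returns to
// zero at the key encoding 0x110, so interval_merge_impl emits the single merged interval
// [0x100, 0x110) and returns aggregate = 1.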
290 | // Remainder
291 | if (cur_index < buffer->head_index) {
292 | items += interval_merge_impl<THREADS, ITEMS>(records + cur_index * 2, records + items * 2, ((buffer->head_index - cur_index) * 2));
293 | PRINT("gpu analysis-> head_index %u, cur_index %u, tile_size %u, items %u\n", buffer->head_index, cur_index, (buffer->head_index - cur_index) * 2, items);
294 | __syncthreads();
295 | }
296 |
297 | // Second pass
298 | // Fake shuffle
299 | if (items < buffer->head_index) {
300 | cur_index = 0;
301 | items = 0;
302 | for (; cur_index + (tile_size / 2) <= buffer->head_index; cur_index += (tile_size / 2)) {
303 | items += interval_merge_impl<THREADS, ITEMS>(records + cur_index * 2, records + items * 2, tile_size);
304 | PRINT("gpu analysis-> head_index %u, cur_index %u, tile_size %u, items %u\n", buffer->head_index, cur_index, tile_size, items);
305 | __syncthreads();
306 | }
307 | // Remainder
308 | if (cur_index < buffer->head_index) {
309 | items += interval_merge_impl<THREADS, ITEMS>(records + cur_index * 2, records + items * 2, ((buffer->head_index - cur_index) * 2));
310 | PRINT("gpu analysis-> head_index %u, cur_index %u, tile_size %u, items %u\n", buffer->head_index, cur_index, (buffer->head_index - cur_index) * 2, items);
311 | __syncthreads();
312 | }
313 | }
314 |
315 | if (threadIdx.x == 0) {
316 | buffer->head_index = items;
317 | buffer->tail_index = items;
318 | }
319 | }
320 |
321 |
322 | // TODO(Keren): multiple buffers, no need to wait
323 | extern "C"
324 | __launch_bounds__(GPU_PATCH_ANALYSIS_THREADS, 1)
325 | __global__
326 | void
327 | gpu_analysis_interval_merge
328 | (
329 | gpu_patch_buffer_t *buffer,
330 | gpu_patch_buffer_t *read_buffer,
331 | gpu_patch_buffer_t *write_buffer
332 | )
333 | {
334 | // Continue processing until the CPU notifies that analysis is done
335 | while (true) {
336 | // Wait until the patch side notifies that the buffer is full, i.e., analysis can begin.
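// The handshake below: the instrumentation side sets buffer->analysis once its record
// buffer fills and decrements buffer->num_threads as blocks exit; this kernel spins until
// either changes, runs one round of compaction and merging, clears buffer->analysis so the
// producer can refill, and performs a final drain once num_threads reaches zero.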
337 | // Block sampling is not allowed
338 | while (buffer->analysis == 0 && atomic_load(&buffer->num_threads) != 0);
339 |
340 | if (atomic_load(&buffer->num_threads) == 0) {
341 | // buffer->analysis must be 0
342 | break;
343 | }
344 |
345 | // Compact addresses from contiguous thread accesses within each warp
346 | interval_compact(buffer, read_buffer, write_buffer);
347 |
348 | // Compact is done
349 | __syncthreads();
350 |
351 | if (threadIdx.x == 0) {
352 | buffer->analysis = 0;
353 | }
354 |
355 | // Merge read buffer
356 | if (read_buffer->head_index != 0) {
357 | interval_merge<GPU_PATCH_ANALYSIS_THREADS, GPU_PATCH_ANALYSIS_ITEMS>(read_buffer); // template arguments assumed to match the constants in gpu-patch.h
358 |
359 | PRINT("gpu analysis-> read buffer\n")
360 | PRINT_RECORDS(read_buffer)
361 | }
362 |
363 | // Merge write buffer
364 | if (write_buffer->head_index != 0) {
365 | interval_merge<GPU_PATCH_ANALYSIS_THREADS, GPU_PATCH_ANALYSIS_ITEMS>(write_buffer);
366 |
367 | PRINT("gpu analysis-> write buffer\n")
368 | PRINT_RECORDS(write_buffer)
369 | }
370 |
371 | __syncthreads();
372 | }
373 |
374 | // Last analysis
375 | interval_compact(buffer, read_buffer, write_buffer);
376 |
377 | // Compact is done
378 | __syncthreads();
379 |
380 | // Merge read buffer
381 | if (read_buffer->head_index != 0) {
382 | interval_merge<GPU_PATCH_ANALYSIS_THREADS, GPU_PATCH_ANALYSIS_ITEMS>(read_buffer);
383 |
384 | PRINT("gpu analysis-> read buffer\n")
385 | PRINT_RECORDS(read_buffer)
386 | }
387 |
388 | // Merge write buffer
389 | if (write_buffer->head_index != 0) {
390 | interval_merge<GPU_PATCH_ANALYSIS_THREADS, GPU_PATCH_ANALYSIS_ITEMS>(write_buffer);
391 |
392 | PRINT("gpu analysis-> write buffer\n")
393 | PRINT_RECORDS(write_buffer)
394 | }
395 |
396 | __syncthreads();
397 |
398 | if (threadIdx.x == 0) {
399 | atomic_store_system(&read_buffer->num_threads, (uint32_t)0);
400 | }
401 | }
402 |
--------------------------------------------------------------------------------
/src/gpu-patch-address.cu:
--------------------------------------------------------------------------------
1 | #include "gpu-patch.h"
2 | #include "gpu-queue.h"
3 | #include "utils.h"
4 |
5 | #include <sanitizer_patching.h>
6 |
7 | struct gpu_patch_analysis_address_comparator {
8 | __device__
9 | bool operator()(gpu_patch_analysis_address &l, gpu_patch_analysis_address &r) {
10 | return l.start <= r.start;
11 | }
12 | };
13 |
14 | /*
15 |  * Monitor each shared and global memory access.
16 |  */
17 | static
18 | __device__ __forceinline__
19 | SanitizerPatchResult
20 | memory_access_callback
21 | (
22 | void *user_data,
23 | uint64_t pc,
24 | void *address,
25 | uint32_t size,
26 | uint32_t flags,
27 | const void *new_value
28 | )
29 | {
30 | gpu_patch_buffer_t *buffer = (gpu_patch_buffer_t *)user_data;
31 |
32 | if (!sample_callback(buffer->block_sampling_frequency, buffer->block_sampling_offset)) {
33 | return SANITIZER_PATCH_SUCCESS;
34 | }
35 |
36 | // 1.
Init values 37 | uint32_t active_mask = __activemask(); 38 | uint32_t laneid = get_laneid(); 39 | uint32_t first_laneid = __ffs(active_mask) - 1; 40 | 41 | uint32_t keep = 1; 42 | if (buffer->aux != NULL && (flags & GPU_PATCH_READ) != 0 && 43 | (flags & (GPU_PATCH_WRITE | GPU_PATCH_SHARED | GPU_PATCH_LOCAL)) == 0) { 44 | // Read address can be filtered 45 | gpu_patch_aux_address_dict *address_dict = (gpu_patch_aux_address_dict *)buffer->aux; 46 | gpu_patch_analysis_address_t *start_end = address_dict->start_end; 47 | gpu_patch_analysis_address_t addr = { (uint64_t)address, 0 }; 48 | uint32_t pos = map_prev(start_end, addr, address_dict->size, gpu_patch_analysis_address_comparator()); 49 | 50 | if (pos != address_dict->size) { 51 | // Find an existing entry 52 | if (atomic_load(address_dict->hit + pos) == 0) { 53 | // Update 54 | atomic_store(address_dict->hit + pos, (uint8_t)1); 55 | } else { 56 | // Filter out 57 | keep = 0; 58 | } 59 | } 60 | } 61 | 62 | __syncwarp(active_mask); 63 | 64 | uint32_t all_keep = 0; 65 | all_keep = ballot((uint32_t)keep, active_mask); 66 | if (all_keep == 0) { 67 | // Fast path 68 | return SANITIZER_PATCH_SUCCESS; 69 | } 70 | 71 | gpu_patch_record_address_t *record = NULL; 72 | if (laneid == first_laneid) { 73 | // 3. Get a record 74 | gpu_patch_record_address_t *records = (gpu_patch_record_address_t *)buffer->records; 75 | record = records + gpu_queue_get(buffer, (buffer->flags & GPU_PATCH_ANALYSIS) != 0); 76 | 77 | // 4. Assign basic values 78 | record->flags = flags; 79 | record->size = size; 80 | record->active = all_keep & active_mask; 81 | } 82 | 83 | __syncwarp(active_mask); 84 | 85 | uint64_t r = (uint64_t)record; 86 | record = (gpu_patch_record_address_t *)shfl(r, first_laneid, active_mask); 87 | 88 | if (record != NULL && keep == 1) { 89 | record->address[laneid] = (uint64_t)address; 90 | } 91 | 92 | __syncwarp(active_mask); 93 | 94 | if (laneid == first_laneid) { 95 | // 5. 
Push a record 96 | gpu_queue_push(buffer); 97 | } 98 | 99 | return SANITIZER_PATCH_SUCCESS; 100 | } 101 | 102 | 103 | extern "C" 104 | __device__ __noinline__ 105 | SanitizerPatchResult 106 | sanitizer_global_memory_access_callback 107 | ( 108 | void *user_data, 109 | uint64_t pc, 110 | void *address, 111 | uint32_t size, 112 | uint32_t flags, 113 | const void *new_value 114 | ) 115 | { 116 | return memory_access_callback(user_data, pc, address, size, flags, new_value); 117 | } 118 | 119 | 120 | extern "C" 121 | __device__ __noinline__ 122 | SanitizerPatchResult 123 | sanitizer_shared_memory_access_callback 124 | ( 125 | void *user_data, 126 | uint64_t pc, 127 | void *address, 128 | uint32_t size, 129 | uint32_t flags, 130 | const void *new_value 131 | ) 132 | { 133 | return memory_access_callback(user_data, pc, address, size, flags | GPU_PATCH_SHARED, new_value); 134 | } 135 | 136 | 137 | extern "C" 138 | __device__ __noinline__ 139 | SanitizerPatchResult 140 | sanitizer_local_memory_access_callback 141 | ( 142 | void *user_data, 143 | uint64_t pc, 144 | void *address, 145 | uint32_t size, 146 | uint32_t flags, 147 | const void *new_value 148 | ) 149 | { 150 | return memory_access_callback(user_data, pc, address, size, flags | GPU_PATCH_LOCAL, new_value); 151 | } 152 | 153 | 154 | /* 155 | * Lock the corresponding hash entry for a block 156 | */ 157 | extern "C" 158 | __device__ __noinline__ 159 | SanitizerPatchResult 160 | sanitizer_block_exit_callback 161 | ( 162 | void *user_data, 163 | uint64_t pc 164 | ) 165 | { 166 | gpu_patch_buffer_t* buffer = (gpu_patch_buffer_t *)user_data; 167 | 168 | if (!sample_callback(buffer->block_sampling_frequency, buffer->block_sampling_offset)) { 169 | return SANITIZER_PATCH_SUCCESS; 170 | } 171 | 172 | uint32_t active_mask = __activemask(); 173 | uint32_t laneid = get_laneid(); 174 | uint32_t first_laneid = __ffs(active_mask) - 1; 175 | int32_t pop_count = __popc(active_mask); 176 | 177 | if (laneid == first_laneid) { 178 | // Finish a bunch of threads 179 | atomicAdd(&buffer->num_threads, -pop_count); 180 | } 181 | 182 | return SANITIZER_PATCH_SUCCESS; 183 | } -------------------------------------------------------------------------------- /src/gpu-patch-aux.cu: -------------------------------------------------------------------------------- 1 | #include "gpu-patch.h" 2 | #include "gpu-queue.h" 3 | #include "utils.h" 4 | 5 | #include 6 | 7 | struct gpu_patch_analysis_address_comparator { 8 | __device__ 9 | bool operator()(gpu_patch_analysis_address &l, gpu_patch_analysis_address &r) { 10 | return l.start <= r.start; 11 | } 12 | }; 13 | 14 | /* 15 | * Monitor each shared and global memory access. 16 | */ 17 | static 18 | __device__ __forceinline__ 19 | SanitizerPatchResult 20 | memory_access_callback 21 | ( 22 | void *user_data, 23 | uint64_t pc, 24 | void *address, 25 | uint32_t size, 26 | uint32_t flags, 27 | const void *new_value 28 | ) 29 | { 30 | gpu_patch_buffer_t *buffer = (gpu_patch_buffer_t *)user_data; 31 | 32 | // 1. 
Init values 33 | uint32_t active_mask = __activemask(); 34 | uint32_t laneid = get_laneid(); 35 | uint32_t first_laneid = __ffs(active_mask) - 1; 36 | 37 | uint32_t keep = 1; 38 | if (buffer->aux != NULL && (flags & (GPU_PATCH_SHARED | GPU_PATCH_LOCAL)) == 0) { 39 | // Read address can be filtered 40 | gpu_patch_aux_address_dict *address_dict = (gpu_patch_aux_address_dict *)buffer->aux; 41 | gpu_patch_analysis_address_t *start_end = address_dict->start_end; 42 | gpu_patch_analysis_address_t addr = { (uint64_t)address, 0 }; 43 | uint32_t pos = map_prev(start_end, addr, address_dict->size, gpu_patch_analysis_address_comparator()); 44 | 45 | if (pos != address_dict->size) { 46 | // Find an existing entry 47 | if (atomic_load(address_dict->hit + pos) == 0) 48 | { 49 | // Update 50 | atomic_store(address_dict->hit + pos, (uint8_t)1); 51 | } else { 52 | // Filter out 53 | keep = 0; 54 | } 55 | if (atomic_load(address_dict->read + pos) == 0 && static_cast(flags) == GPU_PATCH_READ) { 56 | atomic_store(address_dict->read + pos, (uint8_t)1); 57 | } 58 | if (atomic_load(address_dict->write + pos) == 0 && static_cast(flags) == GPU_PATCH_WRITE) { 59 | atomic_store(address_dict->write + pos, (uint8_t)1); 60 | } 61 | } 62 | } 63 | 64 | __syncwarp(active_mask); 65 | 66 | uint32_t all_keep = 0; 67 | all_keep = ballot((uint32_t)keep, active_mask); 68 | if (all_keep == 0) { 69 | // Fast path 70 | return SANITIZER_PATCH_SUCCESS; 71 | } 72 | 73 | __syncwarp(active_mask); 74 | 75 | if (laneid == first_laneid) { 76 | // 5. Push a record 77 | gpu_queue_push(buffer); 78 | } 79 | 80 | return SANITIZER_PATCH_SUCCESS; 81 | } 82 | 83 | 84 | extern "C" 85 | __device__ __noinline__ 86 | SanitizerPatchResult 87 | sanitizer_global_memory_access_callback 88 | ( 89 | void *user_data, 90 | uint64_t pc, 91 | void *address, 92 | uint32_t size, 93 | uint32_t flags, 94 | const void *new_value 95 | ) 96 | { 97 | return memory_access_callback(user_data, pc, address, size, flags, new_value); 98 | } 99 | 100 | 101 | extern "C" 102 | __device__ __noinline__ 103 | SanitizerPatchResult 104 | sanitizer_shared_memory_access_callback 105 | ( 106 | void *user_data, 107 | uint64_t pc, 108 | void *address, 109 | uint32_t size, 110 | uint32_t flags, 111 | const void *new_value 112 | ) 113 | { 114 | return memory_access_callback(user_data, pc, address, size, flags | GPU_PATCH_SHARED, new_value); 115 | } 116 | 117 | 118 | extern "C" 119 | __device__ __noinline__ 120 | SanitizerPatchResult 121 | sanitizer_local_memory_access_callback 122 | ( 123 | void *user_data, 124 | uint64_t pc, 125 | void *address, 126 | uint32_t size, 127 | uint32_t flags, 128 | const void *new_value 129 | ) 130 | { 131 | return memory_access_callback(user_data, pc, address, size, flags | GPU_PATCH_LOCAL, new_value); 132 | } 133 | 134 | 135 | /* 136 | * Lock the corresponding hash entry for a block 137 | */ 138 | extern "C" 139 | __device__ __noinline__ 140 | SanitizerPatchResult 141 | sanitizer_block_exit_callback 142 | ( 143 | void *user_data, 144 | uint64_t pc 145 | ) 146 | { 147 | gpu_patch_buffer_t* buffer = (gpu_patch_buffer_t *)user_data; 148 | 149 | if (!sample_callback(buffer->block_sampling_frequency, buffer->block_sampling_offset)) { 150 | return SANITIZER_PATCH_SUCCESS; 151 | } 152 | 153 | uint32_t active_mask = __activemask(); 154 | uint32_t laneid = get_laneid(); 155 | uint32_t first_laneid = __ffs(active_mask) - 1; 156 | int32_t pop_count = __popc(active_mask); 157 | 158 | if (laneid == first_laneid) { 159 | // Finish a bunch of threads 160 | 
atomicAdd(&buffer->num_threads, -pop_count); 161 | } 162 | 163 | return SANITIZER_PATCH_SUCCESS; 164 | } -------------------------------------------------------------------------------- /src/gpu-patch-torch-aux.cu: -------------------------------------------------------------------------------- 1 | #include "gpu-patch.h" 2 | #include "gpu-queue.h" 3 | #include "utils.h" 4 | 5 | #include 6 | 7 | struct gpu_patch_analysis_address_comparator { 8 | __device__ 9 | bool operator()(gpu_patch_analysis_address &l, gpu_patch_analysis_address &r) { 10 | return l.start <= r.start; 11 | } 12 | }; 13 | 14 | /* 15 | * Monitor each shared and global memory access. 16 | */ 17 | static 18 | __device__ __forceinline__ 19 | SanitizerPatchResult 20 | memory_access_callback 21 | ( 22 | void *user_data, 23 | uint64_t pc, 24 | void *address, 25 | uint32_t size, 26 | uint32_t flags, 27 | const void *new_value 28 | ) 29 | { 30 | gpu_patch_buffer_t *buffer = (gpu_patch_buffer_t *)user_data; 31 | 32 | // 1. Init values 33 | uint32_t active_mask = __activemask(); 34 | uint32_t laneid = get_laneid(); 35 | uint32_t first_laneid = __ffs(active_mask) - 1; 36 | 37 | uint32_t keep = 1; 38 | if (buffer->aux != NULL && buffer->torch_aux != NULL && (flags & (GPU_PATCH_SHARED | GPU_PATCH_LOCAL)) == 0) { 39 | 40 | gpu_patch_aux_address_dict *address_dict = (gpu_patch_aux_address_dict *)buffer->aux; 41 | gpu_patch_analysis_address_t *start_end = address_dict->start_end; 42 | gpu_patch_analysis_address_t addr = { (uint64_t)address, 0 }; 43 | uint32_t pos = map_prev(start_end, addr, address_dict->size, gpu_patch_analysis_address_comparator()); 44 | 45 | if (pos != address_dict->size) { 46 | // Find an existing entry 47 | if (atomic_load(address_dict->hit + pos) == 0) { 48 | // Update 49 | atomic_store(address_dict->hit + pos, (uint8_t)1); 50 | } else { 51 | // Filter out 52 | keep = 0; 53 | } 54 | } 55 | 56 | gpu_patch_aux_address_dict *torch_address_dict = (gpu_patch_aux_address_dict *)buffer->torch_aux; 57 | gpu_patch_analysis_address_t *torch_start_end = torch_address_dict->start_end; 58 | uint32_t torch_pos = map_prev(torch_start_end, addr, torch_address_dict->size, gpu_patch_analysis_address_comparator()); 59 | 60 | if (torch_pos != torch_address_dict->size) { 61 | // Find an existing entry 62 | if (atomic_load(torch_address_dict->hit + torch_pos) == 0) { 63 | // Update 64 | atomic_store(torch_address_dict->hit + torch_pos, (uint8_t)1); 65 | } else { 66 | // Filter out 67 | keep = 0; 68 | } 69 | } 70 | } 71 | 72 | __syncwarp(active_mask); 73 | 74 | uint32_t all_keep = 0; 75 | all_keep = ballot((uint32_t)keep, active_mask); 76 | if (all_keep == 0) { 77 | // Fast path 78 | return SANITIZER_PATCH_SUCCESS; 79 | } 80 | 81 | __syncwarp(active_mask); 82 | 83 | if (laneid == first_laneid) { 84 | // 5. 
Push a record 85 | gpu_queue_push(buffer); 86 | } 87 | 88 | return SANITIZER_PATCH_SUCCESS; 89 | } 90 | 91 | 92 | extern "C" 93 | __device__ __noinline__ 94 | SanitizerPatchResult 95 | sanitizer_global_memory_access_callback 96 | ( 97 | void *user_data, 98 | uint64_t pc, 99 | void *address, 100 | uint32_t size, 101 | uint32_t flags, 102 | const void *new_value 103 | ) 104 | { 105 | return memory_access_callback(user_data, pc, address, size, flags, new_value); 106 | } 107 | 108 | 109 | extern "C" 110 | __device__ __noinline__ 111 | SanitizerPatchResult 112 | sanitizer_shared_memory_access_callback 113 | ( 114 | void *user_data, 115 | uint64_t pc, 116 | void *address, 117 | uint32_t size, 118 | uint32_t flags, 119 | const void *new_value 120 | ) 121 | { 122 | return memory_access_callback(user_data, pc, address, size, flags | GPU_PATCH_SHARED, new_value); 123 | } 124 | 125 | 126 | extern "C" 127 | __device__ __noinline__ 128 | SanitizerPatchResult 129 | sanitizer_local_memory_access_callback 130 | ( 131 | void *user_data, 132 | uint64_t pc, 133 | void *address, 134 | uint32_t size, 135 | uint32_t flags, 136 | const void *new_value 137 | ) 138 | { 139 | return memory_access_callback(user_data, pc, address, size, flags | GPU_PATCH_LOCAL, new_value); 140 | } 141 | 142 | 143 | /* 144 | * Lock the corresponding hash entry for a block 145 | */ 146 | extern "C" 147 | __device__ __noinline__ 148 | SanitizerPatchResult 149 | sanitizer_block_exit_callback 150 | ( 151 | void *user_data, 152 | uint64_t pc 153 | ) 154 | { 155 | gpu_patch_buffer_t* buffer = (gpu_patch_buffer_t *)user_data; 156 | 157 | if (!sample_callback(buffer->block_sampling_frequency, buffer->block_sampling_offset)) { 158 | return SANITIZER_PATCH_SUCCESS; 159 | } 160 | 161 | uint32_t active_mask = __activemask(); 162 | uint32_t laneid = get_laneid(); 163 | uint32_t first_laneid = __ffs(active_mask) - 1; 164 | int32_t pop_count = __popc(active_mask); 165 | 166 | if (laneid == first_laneid) { 167 | // Finish a bunch of threads 168 | atomicAdd(&buffer->num_threads, -pop_count); 169 | } 170 | 171 | return SANITIZER_PATCH_SUCCESS; 172 | } -------------------------------------------------------------------------------- /src/gpu-patch.cu: -------------------------------------------------------------------------------- 1 | #include "gpu-patch.h" 2 | #include "gpu-queue.h" 3 | #include "utils.h" 4 | 5 | #include 6 | 7 | /* 8 | * Monitor each shared and global memory access. 9 | */ 10 | static 11 | __device__ __forceinline__ 12 | SanitizerPatchResult 13 | memory_access_callback 14 | ( 15 | void *user_data, 16 | uint64_t pc, 17 | void *address, 18 | uint32_t size, 19 | uint32_t flags, 20 | const void *new_value 21 | ) 22 | { 23 | gpu_patch_buffer_t *buffer = (gpu_patch_buffer_t *)user_data; 24 | 25 | if (!sample_callback(buffer->block_sampling_frequency, buffer->block_sampling_offset)) { 26 | return SANITIZER_PATCH_SUCCESS; 27 | } 28 | 29 | // 1. Init values 30 | uint32_t active_mask = __activemask(); 31 | uint32_t laneid = get_laneid(); 32 | uint32_t first_laneid = __ffs(active_mask) - 1; 33 | 34 | // 2. 
Read memory values 35 | uint8_t buf[GPU_PATCH_MAX_ACCESS_SIZE]; 36 | if (new_value == NULL) { 37 | // Read operation, old value can be on local memory, shared memory, or global memory 38 | if (flags & GPU_PATCH_SHARED) { 39 | read_shared_memory(size, (uint32_t)address, buf); 40 | } else if (flags & GPU_PATCH_LOCAL) { 41 | read_local_memory(size, (uint32_t)address, buf); 42 | } else if (flags != SANITIZER_MEMORY_DEVICE_FLAG_FORCE_INT) { 43 | read_global_memory(size, (uint64_t)address, buf); 44 | } 45 | } else { 46 | // Write operation, new value is on global memory 47 | read_global_memory(size, (uint64_t)new_value, buf); 48 | } 49 | 50 | gpu_patch_record_t *record = NULL; 51 | if (laneid == first_laneid) { 52 | // 3. Get a record 53 | gpu_patch_record_t *records = (gpu_patch_record_t *)buffer->records; 54 | record = records + gpu_queue_get(buffer, (buffer->flags & GPU_PATCH_ANALYSIS) != 0); 55 | 56 | // 4. Assign basic values 57 | record->flags = flags; 58 | record->active = active_mask; 59 | record->pc = pc; 60 | record->size = size; 61 | record->flat_thread_id = get_flat_thread_id(); 62 | record->flat_block_id = get_flat_block_id(); 63 | } 64 | 65 | __syncwarp(active_mask); 66 | 67 | uint64_t r = (uint64_t)record; 68 | record = (gpu_patch_record_t *)shfl(r, first_laneid, active_mask); 69 | 70 | if (record != NULL) { 71 | record->address[laneid] = (uint64_t)address; 72 | for (uint32_t i = 0; i < size; ++i) { 73 | record->value[laneid][i] = buf[i]; 74 | } 75 | } 76 | 77 | __syncwarp(active_mask); 78 | 79 | if (laneid == first_laneid) { 80 | // 5. Push a record 81 | gpu_queue_push(buffer); 82 | } 83 | 84 | return SANITIZER_PATCH_SUCCESS; 85 | } 86 | 87 | 88 | extern "C" 89 | __device__ __noinline__ 90 | SanitizerPatchResult 91 | sanitizer_global_memory_access_callback 92 | ( 93 | void *user_data, 94 | uint64_t pc, 95 | void *address, 96 | uint32_t size, 97 | uint32_t flags, 98 | const void *new_value 99 | ) 100 | { 101 | return memory_access_callback(user_data, pc, address, size, flags, new_value); 102 | } 103 | 104 | 105 | extern "C" 106 | __device__ __noinline__ 107 | SanitizerPatchResult 108 | sanitizer_shared_memory_access_callback 109 | ( 110 | void *user_data, 111 | uint64_t pc, 112 | void *address, 113 | uint32_t size, 114 | uint32_t flags, 115 | const void *new_value 116 | ) 117 | { 118 | return memory_access_callback(user_data, pc, address, size, flags | GPU_PATCH_SHARED, new_value); 119 | } 120 | 121 | 122 | extern "C" 123 | __device__ __noinline__ 124 | SanitizerPatchResult 125 | sanitizer_local_memory_access_callback 126 | ( 127 | void *user_data, 128 | uint64_t pc, 129 | void *address, 130 | uint32_t size, 131 | uint32_t flags, 132 | const void *new_value 133 | ) 134 | { 135 | return memory_access_callback(user_data, pc, address, size, flags | GPU_PATCH_LOCAL, new_value); 136 | } 137 | 138 | 139 | /* 140 | * Lock the corresponding hash entry for a block 141 | */ 142 | extern "C" 143 | __device__ __noinline__ 144 | SanitizerPatchResult 145 | sanitizer_block_exit_callback 146 | ( 147 | void *user_data, 148 | uint64_t pc 149 | ) 150 | { 151 | gpu_patch_buffer_t* buffer = (gpu_patch_buffer_t *)user_data; 152 | 153 | if (!sample_callback(buffer->block_sampling_frequency, buffer->block_sampling_offset)) { 154 | return SANITIZER_PATCH_SUCCESS; 155 | } 156 | 157 | uint32_t active_mask = __activemask(); 158 | uint32_t laneid = get_laneid(); 159 | uint32_t first_laneid = __ffs(active_mask) - 1; 160 | int32_t pop_count = __popc(active_mask); 161 | 162 | if (laneid == first_laneid) { 163 | 
gpu_patch_record_t *records = (gpu_patch_record_t *)buffer->records; 164 | gpu_patch_record_t *record = records + gpu_queue_get(buffer, (buffer->flags & GPU_PATCH_ANALYSIS) != 0); 165 | 166 | record->pc = pc; 167 | record->flags = GPU_PATCH_BLOCK_EXIT_FLAG; 168 | record->flat_block_id = get_flat_block_id(); 169 | record->flat_thread_id = get_flat_thread_id(); 170 | record->active = active_mask; 171 | 172 | gpu_queue_push(buffer); 173 | 174 | // Finish a bunch of threads 175 | atomicAdd(&(buffer->num_threads), -pop_count); 176 | } 177 | 178 | return SANITIZER_PATCH_SUCCESS; 179 | } 180 | 181 | 182 | /* 183 | * Sample the corresponding blocks 184 | */ 185 | extern "C" 186 | __device__ __noinline__ 187 | SanitizerPatchResult 188 | sanitizer_block_enter_callback 189 | ( 190 | void *user_data, 191 | uint64_t pc 192 | ) 193 | { 194 | gpu_patch_buffer_t* buffer = (gpu_patch_buffer_t *)user_data; 195 | 196 | if (!sample_callback(buffer->block_sampling_frequency, buffer->block_sampling_offset)) { 197 | return SANITIZER_PATCH_SUCCESS; 198 | } 199 | 200 | uint32_t active_mask = __activemask(); 201 | uint32_t laneid = get_laneid(); 202 | uint32_t first_laneid = __ffs(active_mask) - 1; 203 | 204 | if (laneid == first_laneid) { 205 | // Mark block begin 206 | gpu_patch_record_t *records = (gpu_patch_record_t *)buffer->records; 207 | gpu_patch_record_t *record = records + gpu_queue_get(buffer, (buffer->flags & GPU_PATCH_ANALYSIS) != 0); 208 | 209 | record->pc = pc; 210 | record->flags = GPU_PATCH_BLOCK_ENTER_FLAG; 211 | record->flat_block_id = get_flat_block_id(); 212 | record->flat_thread_id = get_flat_thread_id(); 213 | record->active = active_mask; 214 | 215 | gpu_queue_push(buffer); 216 | } 217 | 218 | return SANITIZER_PATCH_SUCCESS; 219 | } 220 | --------------------------------------------------------------------------------
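All of the patch files above gate their work on sample_callback, which lives in include/utils.h (not shown in this dump). A minimal sketch of the block-sampling idea, assuming the frequency/offset fields used above; the real helper may differ in details:

    static __device__ __forceinline__
    bool
    sample_callback
    (
      uint32_t block_sampling_frequency,
      uint32_t block_sampling_offset
    )
    {
      // Sampling disabled: monitor every block
      if (block_sampling_frequency == 0) {
        return true;
      }
      // Otherwise keep roughly one block out of every `block_sampling_frequency`,
      // shifted by `block_sampling_offset` so repeated runs can rotate coverage
      // across different blocks
      return get_flat_block_id() % block_sampling_frequency == block_sampling_offset;
    }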