├── plot-BarrierPoint.sh ├── run-BarrierPoint.sh ├── sample ├── Makefile └── matrixmul.c ├── .gitignore ├── configs_plot.json ├── configs.json ├── barrierpoint-libs ├── Makefile ├── sim_api.h ├── bp_dr.cc ├── bp_perfcntrs.cc └── omp_counters.h ├── Makefile ├── LICENSE-MIT.txt ├── dynamorio_client ├── region.cpp ├── include │ ├── reuse_distance.hpp │ ├── region.hpp │ ├── barrierpoint.hpp │ ├── thread_data.hpp │ └── reuse_distance_impl.h ├── CMakeLists.txt ├── README.md ├── reuse_distance.cpp ├── thread_data.cpp ├── barrierpoint.cpp ├── main.cpp └── LICENSE-BSD-LGPL.txt ├── simpoint.patch ├── LICENSE.txt ├── README.md ├── errorEstimate.py └── RunBenchmarks.py /plot-BarrierPoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | declare config_json='configs_plot.json' 4 | 5 | python errorEstimate.py -c $config_json 6 | -------------------------------------------------------------------------------- /run-BarrierPoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | declare config_json='configs.json' 4 | 5 | # DR analysis 6 | python DrAnalysis.py -c $config_json 7 | # Perfcntr analysis 8 | python RunBenchmarks.py -c $config_json 9 | -------------------------------------------------------------------------------- /sample/Makefile: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | 3 | OPTFLAGS = -g -O3 4 | CFLAGS = $(OPTFLAGS) -fopenmp -Wall 5 | 6 | all: 7 | $(CC) $(CFLAGS) matrixmul.c $(LDFLAGS) -o matrixmul 8 | 9 | clean: 10 | rm -f *.o matrixmul 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.d 3 | *.so 4 | *.pyc 5 | simpoint 6 | SimPoint.3.2 7 | dynamorio_client/build 8 | benchmarks 9 | outputs 10 | plots 11 | __pycache__ 12 | python-env/ 13 | sample/matrixmul 
14 | debug-barrierpoint.log 15 | debug-errorEstimate.log 16 | -------------------------------------------------------------------------------- /configs_plot.json: -------------------------------------------------------------------------------- 1 | { 2 | "paths": { 3 | "rootdir": "./", 4 | "outpath": "./outputs", 5 | "plotpath": "./plots" 6 | }, 7 | "Application": { 8 | "matrixmul":{ 9 | "Barriers_suffix":[ 10 | "sample" 11 | ], 12 | "Perfcntrs_suffix":[ 13 | "sample" 14 | ] 15 | } 16 | }, 17 | "threads": [ 18 | 4 19 | ], 20 | "plot_format":"pdf", 21 | "Debug":false 22 | } 23 | -------------------------------------------------------------------------------- /configs.json: -------------------------------------------------------------------------------- 1 | { 2 | "paths": { 3 | "rootdir": "./", 4 | "benchpath": "./benchmarks", 5 | "libspath": "./barrierpoint-libs", 6 | "outpath": "./outputs" 7 | }, 8 | "execution": { 9 | "bp_identification":true, 10 | "perfcntrs": true 11 | }, 12 | "threads": [ 13 | 4 14 | ], 15 | "Application":{ 16 | "matrixmul":"" 17 | }, 18 | "Suffix":[ 19 | "sample" 20 | ], 21 | "Debug":false, 22 | "DR_iterations":10, 23 | "Perfcntrs_iterations":20, 24 | "dry_run":false 25 | } 26 | -------------------------------------------------------------------------------- /barrierpoint-libs/Makefile: -------------------------------------------------------------------------------- 1 | CXX = g++ 2 | PAPI_DIR ?= 3 | 4 | ifdef PAPI_DIR 5 | PAPI_INCLUDE=$(PAPI_DIR)/include 6 | PAPI_LIBRARY=$(PAPI_DIR)/lib 7 | endif 8 | 9 | ifeq ($(DEBUG),ON) 10 | DBG = -DDEBUG 11 | endif 12 | 13 | all: bp_perfcntrs bp_dr 14 | 15 | bp_dr: bp_dr.cc 16 | $(CXX) -Wall -fPIC $(DBG) -shared bp_dr.cc -o libbp_dr.so -ldl 17 | 18 | bp_perfcntrs: bp_perfcntrs.cc 19 | $(CXX) -I$(PAPI_INCLUDE) -L$(PAPI_LIBRARY) -Wall -fPIC $(DBG) -shared bp_perfcntrs.cc -o libbp_perfcntrs.so -ldl -lpapi -fopenmp 20 | 21 | clean: 22 | rm -rf libbp_dr.so libbp_perfcntrs.so 23 | 
-------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all: simpoint sample BPlibs 2 | 3 | # Set custom PAPI directory, if needed 4 | PAPI_DIR ?= 5 | # Build the libraries in barrierpoint-libs, forwarding PAPI_DIR 6 | BPlibs: 7 | $(MAKE) PAPI_DIR=$(PAPI_DIR) -C barrierpoint-libs 8 | # Download and patch SimPoint 3.2 if it is not present, build it, and link the binary here 9 | simpoint: 10 | if [ ! -d SimPoint.3.2 ]; then\ 11 | wget -O - http://cseweb.ucsd.edu/~calder/simpoint/releases/SimPoint.3.2.tar.gz | tar -x -f - -z;\ 12 | patch -p0 < simpoint.patch;\ 13 | fi 14 | $(MAKE) -C SimPoint.3.2 15 | ln -sf SimPoint.3.2/bin/simpoint ./simpoint 16 | # 'sample' is what 'all' depends on; 'microbenchmarks' is kept as an alias 17 | sample microbenchmarks: 18 | $(MAKE) -C sample 19 | # rm -f so that clean (and distclean) succeed when ./simpoint was never created 20 | clean: 21 | $(MAKE) -C sample clean 22 | $(MAKE) -C barrierpoint-libs clean 23 | rm -f ./simpoint 24 | 25 | distclean: clean 26 | rm -rf SimPoint.3.2 27 | 28 | .PHONY: clean distclean sample 29 | -------------------------------------------------------------------------------- /LICENSE-MIT.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014, 2015 Trevor E. Carlson 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /barrierpoint-libs/sim_api.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, Arm Limited and Contributors. 3 | * 4 | * SPDX-License-Identifier: Apache-2.0 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | 19 | /* Region of interest start and stop markers for the DynamoRIO instrumentation */ 20 | 21 | void SimRoiStart(void) __attribute__((noinline)); 22 | void SimRoiEnd (void) __attribute__((noinline)); 23 | 24 | /* Define a compiler barrier to prevent compiler reordering */ 25 | void SimRoiStart(void){ 26 | __asm__ volatile("" ::: "memory"); 27 | } 28 | 29 | /* Define a compiler barrier to prevent compiler reordering */ 30 | void SimRoiEnd(void){ 31 | __asm__ volatile("" ::: "memory"); 32 | } 33 | -------------------------------------------------------------------------------- /dynamorio_client/region.cpp: -------------------------------------------------------------------------------- 1 | /* ********************************************************** 2 | * Copyright (c) 2020, Arm Limited and Contributors. 3 | * **********************************************************/ 4 | 5 | /* 6 | * SPDX-License-Identifier: Apache-2.0 7 | * 8 | * Licensed under the Apache License, Version 2.0 (the "License"); 9 | * you may not use this file except in compliance with the License. 10 | * You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
19 | */ 20 | 21 | #include 22 | #include 23 | #include"region.hpp" 24 | 25 | 26 | Region::Region(std::string synch_point_name, 27 | uint64_t total_instr, 28 | std::unordered_map current_bbv, 29 | std::vector lru_stack_hist){ 30 | synch_name = synch_point_name; 31 | instr_count = total_instr; 32 | bbv = current_bbv; 33 | lru_hist = lru_stack_hist; 34 | } -------------------------------------------------------------------------------- /dynamorio_client/include/reuse_distance.hpp: -------------------------------------------------------------------------------- 1 | /* ********************************************************** 2 | * Copyright (c) 2020, Arm Limited and Contributors. 3 | * **********************************************************/ 4 | 5 | /* 6 | * SPDX-License-Identifier: Apache-2.0 7 | * 8 | * Licensed under the Apache License, Version 2.0 (the "License"); 9 | * you may not use this file except in compliance with the License. 10 | * You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
19 | */ 20 | 21 | #ifndef REUSE_DISTANCE_H 22 | #define REUSE_DISTANCE_H 23 | 24 | #include "reuse_distance_impl.h" 25 | 26 | typedef uintptr_t addr_t; 27 | 28 | /* This class keeps track of the addresses reuse distances */ 29 | class ReuseDistance{ 30 | public: 31 | ReuseDistance(); 32 | ReuseDistance(unsigned int skip_distance, bool verify, unsigned int reuse_threshold); 33 | int_least64_t lookup_distance(addr_t tag); 34 | std::unique_ptr ref_list; 35 | std::unordered_map cache_map; 36 | 37 | private: 38 | int total_refs; 39 | bool verify_skip; 40 | unsigned int reuse_threshold; 41 | unsigned int skip_dist; 42 | }; 43 | 44 | #endif -------------------------------------------------------------------------------- /dynamorio_client/include/region.hpp: -------------------------------------------------------------------------------- 1 | /* ********************************************************** 2 | * Copyright (c) 2020, Arm Limited and Contributors. 3 | * **********************************************************/ 4 | 5 | /* 6 | * SPDX-License-Identifier: Apache-2.0 7 | * 8 | * Licensed under the Apache License, Version 2.0 (the "License"); 9 | * you may not use this file except in compliance with the License. 10 | * You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | */ 20 | 21 | #ifndef INTER_BARRIER_REGION_H 22 | #define INTER_BARRIER_REGION_H 23 | 24 | #include 25 | #include 26 | #include 27 | 28 | 29 | /* This class represents the region between different synchronization points. 
30 | * We use this to store and manage the performance data we gather during the 31 | * execution, such as BBV and LRU Stack Distance. 32 | */ 33 | class Region{ 34 | public: 35 | Region(std::string synch_point_name, uint64_t total_instr, 36 | std::unordered_map current_bbv, 37 | std::vector lru_stack_dist); 38 | 39 | std::string synch_name; 40 | uint64_t instr_count; 41 | std::unordered_map bbv; 42 | std::vector lru_hist; /* LRU Stack Distance histogram */ 43 | }; 44 | 45 | #endif -------------------------------------------------------------------------------- /barrierpoint-libs/bp_dr.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, Arm Limited and Contributors. 3 | * 4 | * SPDX-License-Identifier: Apache-2.0 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | /* Region of interest library for DynamoRIO BarrierPoint client */ 20 | 21 | #include 22 | #include 23 | #include "sim_api.h" 24 | 25 | namespace BarrierPointsNS { 26 | 27 | bool _inside_roi = false; 28 | bool _dr_flag = false; 29 | 30 | extern "C" void GOMP_barrier (void) { 31 | typedef void (*GOMP_barrier_t) (void); 32 | GOMP_barrier_t GOMP_barrier = (GOMP_barrier_t) dlsym(RTLD_NEXT, "GOMP_barrier"); 33 | 34 | char * s = secure_getenv("ROI_BP"); 35 | if(s) 36 | _inside_roi = strtol(s, NULL, 10) == 1 ? 
true : false; 37 | 38 | if(_inside_roi){ 39 | if (!_dr_flag){ 40 | _dr_flag = true; 41 | #ifdef DEBUG 42 | std::cout << "[BarrierPoint] Start of RoI" << std::endl; 43 | #endif 44 | SimRoiStart(); 45 | return; 46 | } 47 | }else 48 | if (_dr_flag){ 49 | _dr_flag = false; 50 | #ifdef DEBUG 51 | std::cout << "[BarrierPoint] End of RoI" << std::endl; 52 | #endif 53 | SimRoiEnd(); 54 | return; 55 | } 56 | 57 | return (GOMP_barrier)(); 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /dynamorio_client/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | 3 | # DynamoRIO compile options. Their value is by default the one used in the methodology 4 | option(WITH_VALIDATION "Basic Block Vector Detection Validation" OFF) 5 | if(WITH_VALIDATION) 6 | add_definitions(-DVALIDATE) 7 | endif() 8 | 9 | option(MASK_ADDRESS "Mask the memory addresses using the cache line value" ON) 10 | if(MASK_ADDRESS) 11 | add_definitions(-DMASK_ADDRESSES) 12 | endif() 13 | 14 | option(TRACE_BEFORE_ROI "Trace memory accesses before start of the ROI" ON) 15 | if(TRACE_BEFORE_ROI) 16 | add_definitions(-DTRACE_MEM_BEFORE_ROI) 17 | endif() 18 | 19 | set (CMAKE_CXX_FLAGS "-Wall -Werror") 20 | set (CMAKE_CXX_STANDARD 11) 21 | file(GLOB SOURCES RELATIVE ${CMAKE_SOURCE_DIR} "*.cpp") 22 | add_library(barrierpoint SHARED main.cpp ${SOURCES}) 23 | 24 | if(NOT DEFINED ENV{DYNAMORIO_BUILD_DIR}) 25 | message(FATAL_ERROR "Please define the following environment variable: export DYNAMORIO_BUILD_DIR= ") 26 | endif () 27 | 28 | string(CONCAT DR_PACKAGE $ENV{DYNAMORIO_BUILD_DIR} "/cmake/") 29 | set(DynamoRIO_DIR ${DR_PACKAGE} CACHE PATH 30 | "DynamoRIO installation's cmake directory") 31 | 32 | find_package(DynamoRIO) 33 | if (NOT DynamoRIO_FOUND) 34 | message(FATAL_ERROR "DynamoRIO package required to build") 35 | endif(NOT DynamoRIO_FOUND) 36 | 37 | string(CONCAT DR_INCLUDE 
$ENV{DYNAMORIO_BUILD_DIR} "/include/") 38 | string(CONCAT DR_EXT_INCLUDE $ENV{DYNAMORIO_BUILD_DIR} "/ext/include/") 39 | include_directories(${DR_INCLUDE}) 40 | include_directories("./include/") 41 | include_directories(${DR_EXT_INCLUDE}) 42 | 43 | configure_DynamoRIO_client(barrierpoint) 44 | use_DynamoRIO_extension(barrierpoint drmgr) 45 | use_DynamoRIO_extension(barrierpoint drwrap) 46 | use_DynamoRIO_extension(barrierpoint drutil) 47 | use_DynamoRIO_extension(barrierpoint drreg) 48 | use_DynamoRIO_extension(barrierpoint drx) 49 | use_DynamoRIO_extension(barrierpoint drsyms) 50 | use_DynamoRIO_extension(barrierpoint droption) 51 | -------------------------------------------------------------------------------- /sample/matrixmul.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, Arm Limited and Contributors. 3 | * 4 | * SPDX-License-Identifier: Apache-2.0 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | /* 20 | Double precision parallel matrix multiplication. 
21 | */ 22 | 23 | #include 24 | #include 25 | #include 26 | 27 | #define size 10 28 | #define iterations 10 29 | 30 | double A[size][size]; 31 | double B[size][size]; 32 | double C[size][size]; 33 | 34 | int main( int argc, char* argv[] ) 35 | { 36 | int i, j, k, itr; 37 | srand(1); 38 | 39 | // Begin Region of Interest 40 | setenv("ROI_BP", "1", 1); 41 | #pragma omp barrier 42 | 43 | // Initialize buffers. 44 | for (i = 0; i < size; ++i) { 45 | for (j = 0; j < size; ++j) { 46 | A[i][j] = (double)rand()/(double)RAND_MAX; 47 | B[i][j] = (double)rand()/(double)RAND_MAX; 48 | C[i][j] = 0.0; 49 | } 50 | } 51 | 52 | for (itr = 0 ; itr < iterations ; itr++) { 53 | printf("Iteration: %d\n", itr); 54 | #pragma omp parallel for private(j,k) shared(A,B,C) 55 | // C <- C + A x B 56 | for (i = 0; i < size; ++i) 57 | for (j = 0; j < size; ++j) 58 | for (k = 0; k < size; ++k) 59 | C[i][j] += A[i][k] * B[k][j]; 60 | } 61 | 62 | // End Region of Interest 63 | setenv("ROI_BP", "0", 1); 64 | #pragma omp barrier 65 | 66 | return 0; 67 | } 68 | -------------------------------------------------------------------------------- /dynamorio_client/include/barrierpoint.hpp: -------------------------------------------------------------------------------- 1 | /* ********************************************************** 2 | * Copyright (c) 2020, Arm Limited and Contributors. 3 | * **********************************************************/ 4 | 5 | /* 6 | * SPDX-License-Identifier: Apache-2.0 7 | * 8 | * Licensed under the Apache License, Version 2.0 (the "License"); 9 | * you may not use this file except in compliance with the License. 10 | * You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | */ 20 | 21 | #ifndef BARRIERPOINT_H 22 | #define BARRIERPOINT_H 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include "thread_data.hpp" 31 | #include "dr_api.h" 32 | 33 | /* This class gathers together all the pieces of data needed for running the 34 | * BarrierPoint methodology. 35 | */ 36 | class BarrierPoint{ 37 | public: 38 | BarrierPoint(); 39 | void incr_synch_count(void); 40 | void add_thread_data(ThreadData data); 41 | uint32_t synch_count; /* Current synchronization point id. */ 42 | void save(std::string output_path); 43 | void free(void); 44 | 45 | private: 46 | /* Per-thread data; presumably filled by add_thread_data() - TODO confirm in barrierpoint.cpp */ 47 | std::vector threads; 48 | /* Private helpers. NOTE(review): the earlier comment here described a vector of parallel function names, but no such member is declared in this class */ 49 | void save_bbv_inst_count(std::string out_path); 50 | void save_bp_id(std::string out_path); 51 | void save_bbv_count(std::string out_path); 52 | void save_ldv_hist(std::string out_path); 53 | void save_ldv_bb(std::string out_path); 54 | void generate_fake_tid(void); 55 | void align_synch_bb(void); 56 | }; 57 | 58 | #endif 59 | -------------------------------------------------------------------------------- /dynamorio_client/include/thread_data.hpp: -------------------------------------------------------------------------------- 1 | /* ********************************************************** 2 | * Copyright (c) 2020, Arm Limited and Contributors. 3 | * **********************************************************/ 4 | 5 | /* 6 | * SPDX-License-Identifier: Apache-2.0 7 | * 8 | * Licensed under the Apache License, Version 2.0 (the "License"); 9 | * you may not use this file except in compliance with the License. 
10 | * You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | */ 20 | 21 | #ifndef THREAD_DATA_H 22 | #define THREAD_DATA_H 23 | 24 | #include 25 | #include 26 | #include 27 | #include"region.hpp" 28 | #include"dr_defines.h" 29 | #include"reuse_distance.hpp" 30 | #include"dr_api.h" 31 | 32 | enum { 33 | MEMTRACE_TLS_OFFS_BUF_PTR, /* Allocated TLS slot offsets */ 34 | MEMTRACE_TLS_COUNT, /* total number of TLS slots allocated */ 35 | }; 36 | 37 | typedef struct _mem_ref_t{ 38 | app_pc addr; 39 | } mem_ref_t; 40 | 41 | extern reg_id_t tls_seg; 42 | extern uint tls_offs; 43 | 44 | /* Max number of mem_ref a buffer can have. It should be big enough to hold 45 | * all entries between clean calls. 46 | */ 47 | #define MAX_NUM_MEM_REFS 4096 48 | #define MEM_BUF_SIZE (sizeof(mem_ref_t) * MAX_NUM_MEM_REFS) 49 | 50 | #define TLS_SLOT(tls_base, enum_val) (void **)((byte *)(tls_base) + tls_offs + (enum_val)) 51 | #define BUF_PTR(tls_base) *(mem_ref_t **)TLS_SLOT(tls_base, MEMTRACE_TLS_OFFS_BUF_PTR) 52 | 53 | /* This class represents all thread data, including inter-barrier region information 54 | * and reuse distance. 55 | */ 56 | class ThreadData{ 57 | public: 58 | unsigned int tid; 59 | unsigned int fake_tid; /* Fake thread id for building output files. 
*/ 60 | bool is_master; /* checks for master thread */ 61 | bool in_single; /* omp single calls tracker */ 62 | ReuseDistance *reuse_dist; 63 | uint64_t cur_bb_tag; 64 | /* Region data associated with a global identifier */ 65 | std::unordered_map regions; 66 | 67 | /* All public functions are accessible for the rest of the program */ 68 | ThreadData(int thread_id, bool is_master_thread, uint32_t region_id); 69 | 70 | void add_bb(uint64_t key, uint64_t inst_n); 71 | void add_address(void); 72 | void clean_buffer(void); 73 | void save_barrier(std::string synch_point_name); 74 | const uint64_t get_instr_count(uint32_t synch_id); 75 | 76 | /* Define sorting according to the thread_id */ 77 | bool operator < (const ThreadData& data) const{ 78 | return (tid < data.tid); 79 | } 80 | 81 | #ifdef VALIDATE 82 | file_t disassemble_file; 83 | file_t memory_access_file; 84 | file_t runtime_bb_file; 85 | file_t region_file; 86 | #endif 87 | 88 | private: 89 | /* Status for the current region */ 90 | uint64_t cur_instr_count; 91 | uint32_t cur_synch_id; /* Current synchronization point id. */ 92 | std::unordered_map cur_bbv; 93 | std::vector cur_lru_hist; 94 | /* Memory buffer containing instructions which have not yet been fed to the treap */ 95 | byte *seg_base; 96 | mem_ref_t *buf_base; 97 | }; 98 | 99 | #endif -------------------------------------------------------------------------------- /barrierpoint-libs/bp_perfcntrs.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, Arm Limited and Contributors. 3 | * 4 | * SPDX-License-Identifier: Apache-2.0 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 
8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | /* Library of Region of interest and GOMP directives to trigger PAPI 20 | * Performance counters 21 | */ 22 | 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include "omp_counters.h" 28 | #include 29 | 30 | namespace BarrierPointsNS { 31 | 32 | int _BP_number=-1; 33 | bool _inside_single = false; 34 | bool _ignore_single = false; 35 | bool _measuring = false; 36 | bool _inside_roi = false; 37 | bool _inside_barrier = false; 38 | int total_threads = 0; 39 | 40 | void BarrierPoints(){ 41 | if(_inside_roi && omp_get_thread_num() == 0) { 42 | _BP_number++; 43 | #ifdef DEBUG 44 | std::cout << "[BarrierPoint] Executing BP: " << _BP_number << std::endl; 45 | #endif 46 | } 47 | } 48 | 49 | extern "C" void GOMP_parallel (void (*fn) (void *), void *data, unsigned num_threads, unsigned int flags){ 50 | typedef void (*GOMP_parallel_t) (void (*fn) (void *), void *data, unsigned num_threads, unsigned int flags); 51 | GOMP_parallel_t GOMP_parallel = (GOMP_parallel_t) dlsym(RTLD_NEXT, "GOMP_parallel"); 52 | 53 | #ifdef DEBUG 54 | std::cout << "[BarrierPoint] Intercepted GOMP_parallel ALL | Thread: " \ 55 | << omp_get_thread_num() << std::endl; 56 | #endif 57 | if(!_measuring){ 58 | #ifdef DEBUG 59 | std::cout << "[BarrierPoint] Intercepted GOMP_parallel" << std::endl; 60 | #endif 61 | 62 | if(_inside_roi) { 63 | BarrierPoints(); 64 | _measuring = true; 65 | parallelRegionPerformanceCounters(); 66 | _measuring = false; 67 | } 68 | } 69 | return (GOMP_parallel)(fn, data, num_threads, flags); 70 | } 71 | 72 | 
extern "C" void GOMP_barrier (void) { 73 | typedef void (*GOMP_barrier_t) (void); 74 | GOMP_barrier_t GOMP_barrier = (GOMP_barrier_t) dlsym(RTLD_NEXT, "GOMP_barrier"); 75 | 76 | char * s = secure_getenv("ROI_BP"); 77 | if(s) { 78 | #ifdef DEBUG 79 | std::cout << "ROI_BP: " << s << std::endl; 80 | #endif 81 | _inside_roi = strtol(s, NULL, 10) == 1 ? true : false; 82 | } 83 | 84 | if(_inside_roi){ 85 | BarrierPoints(); 86 | 87 | if(_BP_number == 0){ 88 | _measuring = true; 89 | initPerformanceCounters(); 90 | startPerformanceCounters(); 91 | _measuring = false; 92 | }else 93 | barrierRegionPerformanceCounters(); 94 | }else if(_BP_number > -1) { 95 | #ifdef DEBUG 96 | std::cout << "[BarrierPoint] Intercepted end of ROI barrier | Total number of threads: " \ 97 | << omp_get_num_threads() << std::endl; 98 | #endif 99 | stopPerformanceCounters(); 100 | } 101 | return (GOMP_barrier)(); 102 | } 103 | 104 | } 105 | -------------------------------------------------------------------------------- /dynamorio_client/README.md: -------------------------------------------------------------------------------- 1 | # DynamoRIO BarrierPoint Client 2 | 3 | ## How to build 4 | 5 | First download and build [DynamoRIO](https://github.com/DynamoRIO/dynamorio). 6 | Then cd into the BarrierPoint client folder (dynamorio_client) and build the client. 7 | 8 | BarrierPoint has been built and tested on different Aarch64 and x86-64 machines with GCC 7. 9 | 10 | ``` 11 | $ mkdir build && cd build 12 | $ export DYNAMORIO_BUILD_DIR=path/to/dynamorio_build 13 | $ cmake .. 14 | $ make 15 | ``` 16 | 17 | Make sure your application contains the Region-of-Interest start and stop functions. 18 | Check the main README file for more information. 19 | 20 | ## BarrierPoint Client Design 21 | 22 | The client identifies and selects synchronization points in the code (barrier points or BPs) 23 | and gathers architecture agnostic statistics to classify the different BPs. 
24 | 25 | ### Barrier Points (synchronization points) 26 | 27 | The synchronization points are identified by OpenMP directives such as `OMP Parallel` 28 | (which identifies a parallel region) or `OMP Barrier` (explicit synchronization point). 29 | 30 | To be able to detect these, the client parses the application libraries at loading time 31 | and adds a callback to all the relevant OpenMP synchronization functions. 32 | These callbacks are private to each thread, since each thread will execute 33 | its own synchronization code and gather its own performance statistics. 34 | We make use of the `ThreadData` class to allocate a 'per-thread local storage' to achieve this. 35 | 36 | At the end of the application computation, the tool post-processes all 37 | the gathered data per thread and dumps it into several output files. 38 | These operations are performed in the `BarrierPoint` class. 39 | 40 | ### Gathering agnostic metrics 41 | 42 | The client tracks architecture-agnostic statistics for each of the identified BPs. 43 | The statistics are collected between BPs (in the `Region` class) and are as follows: 44 | 45 | - **Basic Block (BB) information**: The application's execution is split 46 | into basic blocks, which are instruction sequences with a single point of entry and exit. 47 | For each BB, the client traces the address of its first instruction (identifier), 48 | its length in instructions and how many times it has been executed by the target application. 49 | 50 | - **Least Recently Used (LRU) Stack Distance**: The client also gathers the LRU Stack 51 | distance, which is the number of distinct memory addresses accessed between 52 | two consecutive references to the same memory address. 53 | 54 | ## Output Files 55 | 56 | The client outputs many files to the `outputs/barriers` directory, some of 57 | which will later be used by SimPoint. 
The list of outputs is as follows: 58 | 59 | - **.bp_id** - Contains the BPs that have been identified by the client. 60 | These are associated with a unique identifier and how many times the BP has been 61 | executed by the application. 62 | 63 | - **.bbv_inscount** - Displays the total number of instructions executed 64 | by each thread, for each BP. Each column represents a different thread and 65 | each line a different BP. 66 | 67 | - **.bbv_count** - Displays the basic blocks that have been executed by the 68 | application. A unique incremental identifier is associated with each BB, 69 | together with the BB length (in terms of instructions). 70 | It follows the format: ` : ` 71 | 72 | - **.ldv_hist** - Displays a histogram of the LRU stack distance for each thread. 73 | Each line represents a different BP. 74 | 75 | - **.ldv_bb** - Displays the corresponding LRU stack distance vectors. 76 | Each line represents a different BP. 77 | -------------------------------------------------------------------------------- /simpoint.patch: -------------------------------------------------------------------------------- 1 | --- SimPoint.3.2/analysiscode/CmdLineParser.cpp 2006-01-04 01:24:58.000000000 +0000 2 | +++ SimPoint.3.2/analysiscode/CmdLineParser.cpp 2019-06-17 16:18:37.309236626 +0100 3 | @@ -71,6 +71,7 @@ 4 | 5 | 6 | #include "CmdLineParser.h" 7 | +#include 8 | 9 | bool CmdLineParser::parseCmdLine(int argc, char **argv) { 10 | for (int argNdx = 0; argNdx < argc; argNdx++) { 11 | Only in SimPoint.3.2/analysiscode: CmdLineParser.d 12 | Only in SimPoint.3.2/analysiscode: CmdLineParser.o 13 | Only in SimPoint.3.2/analysiscode: Datapoint.d 14 | 15 | --- SimPoint.3.2/analysiscode/Datapoint.h 2006-01-31 02:17:57.000000000 +0000 16 | +++ SimPoint.3.2/analysiscode/Datapoint.h 2019-06-17 16:18:37.309236626 +0100 17 | @@ -83,6 +83,7 @@ 18 | 19 | #include 20 | #include 21 | +#include 22 | 23 | using namespace std; 24 | 25 | Only in SimPoint.3.2/analysiscode: Datapoint.o 26 | 
Only in SimPoint.3.2/analysiscode: Dataset.d 27 | 28 | --- SimPoint.3.2/analysiscode/Dataset.h 2006-01-31 02:17:57.000000000 +0000 29 | +++ SimPoint.3.2/analysiscode/Dataset.h 2019-06-17 16:18:37.313236565 +0100 30 | @@ -83,6 +83,7 @@ 31 | 32 | #include "Datapoint.h" 33 | #include 34 | +#include 35 | 36 | class Dataset : public vector { 37 | public: 38 | Only in SimPoint.3.2/analysiscode: Dataset.o 39 | 40 | --- SimPoint.3.2/analysiscode/FVParser.cpp 2005-06-30 21:20:51.000000000 +0100 41 | +++ SimPoint.3.2/analysiscode/FVParser.cpp 2019-06-17 16:18:37.313236565 +0100 42 | @@ -79,6 +79,7 @@ 43 | #include "FVParser.h" 44 | #include "Utilities.h" 45 | #include 46 | +#include 47 | 48 | // take care of a difference between G++ 2.96 and 3.x 49 | #if (__GNUC__ >= 3) 50 | Only in SimPoint.3.2/analysiscode: FVParser.d 51 | Only in SimPoint.3.2/analysiscode: FVParser.o 52 | Only in SimPoint.3.2/analysiscode: KMeans.d 53 | Only in SimPoint.3.2/analysiscode: KMeans.o 54 | Only in SimPoint.3.2/analysiscode: Logger.d 55 | Only in SimPoint.3.2/analysiscode: Logger.o 56 | 57 | --- SimPoint.3.2/analysiscode/Makefile 2005-06-30 22:55:04.000000000 +0100 58 | +++ SimPoint.3.2/analysiscode/Makefile 2019-06-17 16:18:37.313236565 +0100 59 | @@ -1,4 +1,5 @@ 60 | -CPPFLAGS = -Wall -pedantic -pedantic-errors -O3 61 | +CPPFLAGS = -Wall -pedantic -pedantic-errors -O3 -std=c++11 -D_GLIBCXX_USE_CXX11_ABI=1 62 | +# -D_GLIBCXX_USE_CXX11_ABI=1 has been added in order to support modern compiler version 63 | 64 | CXX = g++ 65 | 66 | Only in SimPoint.3.2/analysiscode: simpoint 67 | Only in SimPoint.3.2/analysiscode: Simpoint.d 68 | Only in SimPoint.3.2/analysiscode: Simpoint.o 69 | Only in SimPoint.3.2/analysiscode: SimpointOptions.d 70 | Only in SimPoint.3.2/analysiscode: SimpointOptions.o 71 | Only in SimPoint.3.2/analysiscode: Utilities.d 72 | 73 | --- SimPoint.3.2/analysiscode/Utilities.h 2006-02-01 19:39:48.000000000 +0000 74 | +++ SimPoint.3.2/analysiscode/Utilities.h 2019-06-17 
16:18:37.313236565 +0100 75 | @@ -86,6 +86,9 @@ 76 | #include "Dataset.h" 77 | #include "Logger.h" 78 | #include 79 | +#include 80 | +#include 81 | +#include 82 | 83 | string toString(int i); 84 | string toString(double d); 85 | @@ -131,6 +134,22 @@ 86 | exit(1); 87 | } 88 | } 89 | + 90 | + static inline void check(std::ifstream& checkval, const string &msg){ 91 | + if(! checkval.is_open()){ 92 | + Logger::log() << "\nError: " << msg << endl; 93 | + exit(1); 94 | + } 95 | + } 96 | + 97 | + static inline void check(std::ofstream& checkval, const string &msg){ 98 | + if(! checkval.is_open()){ 99 | + Logger::log() << "\nError: " << msg << endl; 100 | + exit(1); 101 | + } 102 | + } 103 | + 104 | + 105 | }; 106 | 107 | 108 | Only in SimPoint.3.2/analysiscode: Utilities.o 109 | Only in SimPoint.3.2/bin: simpoint 110 | -------------------------------------------------------------------------------- /dynamorio_client/reuse_distance.cpp: -------------------------------------------------------------------------------- 1 | /* ********************************************************** 2 | * Copyright (c) 2020, Arm Limited and Contributors. 3 | * **********************************************************/ 4 | 5 | /* 6 | * SPDX-License-Identifier: Apache-2.0 7 | * 8 | * Licensed under the Apache License, Version 2.0 (the "License"); 9 | * you may not use this file except in compliance with the License. 10 | * You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | */ 20 | 21 | /* ********************************************************** 22 | * Copyright (c) 2016-2019 Google, Inc. 
All rights reserved. 23 | * **********************************************************/ 24 | 25 | /* 26 | * Redistribution and use in source and binary forms, with or without 27 | * modification, are permitted provided that the following conditions are met: 28 | * 29 | * * Redistributions of source code must retain the above copyright notice, 30 | * this list of conditions and the following disclaimer. 31 | * 32 | * * Redistributions in binary form must reproduce the above copyright notice, 33 | * this list of conditions and the following disclaimer in the documentation 34 | * and/or other materials provided with the distribution. 35 | * 36 | * * Neither the name of Google, Inc. nor the names of its contributors may be 37 | * used to endorse or promote products derived from this software without 38 | * specific prior written permission. 39 | * 40 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 41 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 42 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 43 | * ARE DISCLAIMED. IN NO EVENT SHALL VMWARE, INC. OR CONTRIBUTORS BE LIABLE 44 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 45 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 46 | * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 47 | * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 48 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 49 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 50 | * DAMAGE. 
51 | */ 52 | 53 | /* Reuse distance functions */ 54 | #include "reuse_distance.hpp" 55 | 56 | ReuseDistance::ReuseDistance(){ 57 | skip_dist=500; 58 | verify_skip=false; 59 | reuse_threshold=100; 60 | ref_list = std::unique_ptr( 61 | new line_ref_list_t(reuse_threshold, skip_dist, verify_skip)); 62 | } 63 | 64 | 65 | ReuseDistance::ReuseDistance(unsigned int skip_distance, bool verify, unsigned int threshold){ 66 | verify_skip=verify; 67 | reuse_threshold=threshold; 68 | skip_dist=skip_distance; 69 | ref_list = std::unique_ptr( 70 | new line_ref_list_t(reuse_threshold, skip_dist, verify_skip)); 71 | } 72 | 73 | 74 | /* Given a memory address, it returns the LRU stack distance. 75 | * It returns -1 if it's the first time the address has been used. 76 | */ 77 | int_least64_t ReuseDistance::lookup_distance(addr_t tag){ 78 | int_least64_t dist; 79 | total_refs++; 80 | std::unordered_map::iterator it = cache_map.find(tag); 81 | 82 | if (it == cache_map.end()){ 83 | line_ref_t *ref = new line_ref_t(tag); 84 | /* insert into the map */ 85 | cache_map.insert(std::pair(tag, ref)); 86 | /* insert into the list */ 87 | ref_list->add_to_front(ref); 88 | dist = -1; 89 | }else 90 | dist = ref_list->move_to_front(it->second); 91 | return dist; 92 | } -------------------------------------------------------------------------------- /dynamorio_client/thread_data.cpp: -------------------------------------------------------------------------------- 1 | /* ********************************************************** 2 | * Copyright (c) 2020, Arm Limited and Contributors. 3 | * **********************************************************/ 4 | 5 | /* 6 | * SPDX-License-Identifier: Apache-2.0 7 | * 8 | * Licensed under the Apache License, Version 2.0 (the "License"); 9 | * you may not use this file except in compliance with the License. 
10 | * You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | */ 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include"thread_data.hpp" 26 | #include"dr_api.h" 27 | 28 | extern uint64_t cache_line_mask; 29 | 30 | /* Per-thread data functions: Saves and gathers the instrumented information to/from 31 | * each thread. This includes basic block information and LRU stack distances. 32 | */ 33 | ThreadData::ThreadData(int thread_id, bool is_master_thread, uint32_t region_id){ 34 | tid = (unsigned int)thread_id; 35 | cur_bb_tag = 0; 36 | in_single = false; 37 | is_master = is_master_thread; 38 | cur_synch_id = region_id; 39 | cur_instr_count = 0; 40 | 41 | #ifdef VALIDATE 42 | std::string region_file_name = std::to_string(thread_id) + "_region"; 43 | region_file = dr_open_file(region_file_name.c_str(), DR_FILE_WRITE_OVERWRITE); 44 | /* This file stores all the basic blocks that are seen by the tool before 45 | * being inserted in the code cache. 46 | */ 47 | std::string disassemble_file_name = std::to_string(thread_id); 48 | disassemble_file = dr_open_file(disassemble_file_name.c_str(), DR_FILE_WRITE_OVERWRITE); 49 | /* This file stores all the instructions that are performing any kind of 50 | * memory access: their operands are being taken into account by the tool for 51 | * computing the LRU Stack distance. 
52 | */ 53 | std::string memory_file_name = std::to_string(thread_id) + "_mem"; 54 | memory_access_file = dr_open_file(memory_file_name.c_str(), DR_FILE_WRITE_OVERWRITE); 55 | /* This file stores all the basic blocks that are seen by the tool when 56 | * actually executed from the target application: these basic blocks are the 57 | * ones taken into account for computing the basic block vectors (BBVs) within 58 | * synchronization points. 59 | */ 60 | std::string runtime_bb_file_name = std::to_string(thread_id) + "_run_bb"; 61 | 62 | runtime_bb_file = dr_open_file(runtime_bb_file_name.c_str(), DR_FILE_WRITE_OVERWRITE); 63 | #endif 64 | 65 | /* LRU Stack Distance Initialization */ 66 | reuse_dist = new ReuseDistance(500, false, 100); 67 | seg_base = reinterpret_cast(dr_get_dr_segment_base(tls_seg)); 68 | buf_base = reinterpret_cast(dr_raw_mem_alloc(MEM_BUF_SIZE, 69 | DR_MEMPROT_READ | DR_MEMPROT_WRITE, nullptr)); 70 | DR_ASSERT(seg_base != nullptr && buf_base != nullptr); 71 | BUF_PTR(seg_base) = buf_base; 72 | } 73 | 74 | 75 | /* Add the bb into the hash map. We store its address and instructions number 76 | * in the cur_bbv hash map. We update the actual total number of executed instructions 77 | * using cur_instr_count counter. 
78 | */ 79 | void ThreadData::add_bb(uint64_t key, uint64_t inst_n){ 80 | 81 | #ifdef VALIDATE 82 | dr_fprintf(runtime_bb_file, "[DR.thread_data] TID: %d Executed at runtime " 83 | PFX " with %d instructions\n", tid, key, inst_n); 84 | dr_flush_file(runtime_bb_file); 85 | #endif 86 | 87 | auto search = cur_bbv.find(key); 88 | if(search != cur_bbv.end()) 89 | cur_bbv[key] = cur_bbv[key] + inst_n; 90 | else 91 | cur_bbv[key] = inst_n; 92 | 93 | /* Increment instruction global counter */ 94 | cur_instr_count = cur_instr_count + inst_n; 95 | } 96 | 97 | 98 | /* Saving all the inter-barrier information */ 99 | void ThreadData::save_barrier(std::string synch_point_name){ 100 | regions.insert(std::make_pair(cur_synch_id, Region{synch_point_name, cur_instr_count, 101 | cur_bbv, cur_lru_hist})); 102 | DR_ASSERT_MSG(!regions.empty(), "[DR.thread_data] ERROR: Inter-barrier region is empty"); 103 | 104 | #ifdef VALIDATE 105 | /* Save a different dump file for each inter-barrier region detected */ 106 | dr_fprintf(region_file, "[DR.thread_data] Saving Barrier for synchronization name %s\n", 107 | synch_point_name.c_str()); 108 | dr_flush_file(region_file); 109 | dr_fprintf(region_file, "[DR.thread_data] Total Number of instruction is %" PRIu64 "\n", 110 | cur_instr_count); 111 | dr_flush_file(region_file); 112 | dr_fprintf(region_file, "[DR.thread_data] Thread id is %d\n", tid); 113 | dr_flush_file(region_file); 114 | dr_fprintf(region_file, "[DR.thread_data] Basic Block Saved is the following:\n"); 115 | for(auto& bb: cur_bbv){ 116 | dr_fprintf(region_file,"[DR.thread_data] BB @ %" PRIx64 " Instructions %" PRIu32 "\n", 117 | bb.first, bb.second); 118 | } 119 | dr_flush_file(region_file); 120 | 121 | dr_fprintf(region_file, "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"); 122 | 123 | #endif 124 | 125 | /* Update the next synchronization point */ 126 | cur_synch_id++; 127 | 128 | cur_instr_count=0; 129 | cur_bbv.clear(); 130 | cur_lru_hist.clear(); 131 | } 132 | 133 | 134 | /* Returns 
the instruction count for the given synchronization id */ 135 | const uint64_t ThreadData::get_instr_count(uint32_t synch_id){ 136 | auto search = regions.find(synch_id); 137 | if(search != regions.end()) 138 | return search->second.instr_count; 139 | else 140 | return 0; 141 | } 142 | 143 | 144 | /* Adds the memory addresses to the treap to extract the LRU stack distance 145 | * and stores the result in the histogram. 146 | */ 147 | void ThreadData::add_address(void){ 148 | mem_ref_t *mem_ref, *buf_ptr; 149 | buf_ptr = BUF_PTR(seg_base); 150 | 151 | for(mem_ref = (mem_ref_t *)(buf_base); mem_ref < buf_ptr; mem_ref++){ 152 | /* Address of cache read, depends on the cache block size! */ 153 | #ifdef MASK_ADDRESSES 154 | uint64_t address = reinterpret_cast(mem_ref->addr) & cache_line_mask; 155 | #else 156 | uint64_t address = reinterpret_cast(mem_ref->addr); 157 | #endif 158 | 159 | #ifdef VALIDATE 160 | /* The address we're dumping into the file changes whether you have 161 | * chosen to mask the memory address or not and according to the value 162 | * of the cache_line_mask specified 163 | */ 164 | dr_fprintf(memory_access_file ,"[DR.thread_data] Memory address @ " PFX " \n", address); 165 | #endif 166 | 167 | int_least64_t mem_access_diff = reuse_dist->lookup_distance(reinterpret_cast(address)); 168 | /* Check if this is the first time the address is added in the treap */ 169 | if (mem_access_diff != -1){ 170 | uint64_t log2_rd = mem_access_diff != 0 ? 
((sizeof(unsigned long long)*8-1) 171 | - __builtin_clzll(mem_access_diff)) : 0; 172 | if(log2_rd >= cur_lru_hist.size()) 173 | cur_lru_hist.resize(log2_rd+1); 174 | cur_lru_hist[log2_rd]++; 175 | } 176 | } 177 | /* Reset back the buffer pointer */ 178 | BUF_PTR(seg_base) = buf_base; 179 | return; 180 | } 181 | 182 | 183 | /* Clean the memory buffer discarding its data */ 184 | void ThreadData::clean_buffer(void){ 185 | BUF_PTR(seg_base) = buf_base; 186 | return; 187 | } -------------------------------------------------------------------------------- /dynamorio_client/barrierpoint.cpp: -------------------------------------------------------------------------------- 1 | /* ********************************************************** 2 | * Copyright (c) 2020, Arm Limited and Contributors. 3 | * **********************************************************/ 4 | 5 | /* 6 | * SPDX-License-Identifier: Apache-2.0 7 | * 8 | * Licensed under the Apache License, Version 2.0 (the "License"); 9 | * you may not use this file except in compliance with the License. 10 | * You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | */ 20 | 21 | /* BarrierPoint DR client: Gathers basic block information and computes 22 | * LRU stack distance with corresponding LDVs. Creates the necessary input files 23 | * for simpoint execution. 
24 | */ 25 | 26 | #include"barrierpoint.hpp" 27 | extern std::unordered_map parallel_omp_f; 28 | 29 | 30 | BarrierPoint::BarrierPoint(){ 31 | synch_count = 0; 32 | } 33 | 34 | 35 | /* Increment the synchronization number */ 36 | void BarrierPoint::incr_synch_count(void){ 37 | synch_count++; 38 | } 39 | 40 | 41 | /* Add the single thread data, to be able to post process it */ 42 | void BarrierPoint::add_thread_data(ThreadData data){ 43 | threads.push_back(data); 44 | } 45 | 46 | 47 | /* Save all the gathered data into files */ 48 | void BarrierPoint::save(std::string output_path){ 49 | std::cout << "Saving data into " << output_path << std::endl; 50 | generate_fake_tid(); 51 | align_synch_bb(); 52 | save_bbv_inst_count(output_path); 53 | save_bp_id(output_path); 54 | save_bbv_count(output_path); 55 | save_ldv_hist(output_path); 56 | save_ldv_bb(output_path); 57 | 58 | } 59 | 60 | 61 | /* DynamoRIO traces the basic blocks corresponding to the parallel 62 | * synchronization functions as belonging to the region being ended when, 63 | * instead, they have to be taken into account in the next region. 64 | */ 65 | void BarrierPoint::align_synch_bb(void){ 66 | uint64_t bb_count_balance=0; 67 | for(auto &t: threads){ 68 | for(uint32_t id=0; id < synch_count; id++){ 69 | auto region = t.regions.find(id); 70 | /* Update the instruction count balance from the previous region */ 71 | region->second.instr_count = region->second.instr_count + bb_count_balance; 72 | bb_count_balance=0; 73 | 74 | /* For all the omp parallel equivalent functions, 75 | * look for its corresponding basic block in this region: if found, print it. 
76 | */ 77 | for(auto &f_omp : parallel_omp_f){ 78 | uint64_t bb_addr = reinterpret_cast(f_omp.first); 79 | auto bb_found = region->second.bbv.find(bb_addr); 80 | if(bb_found != region->second.bbv.end()){ 81 | region->second.bbv.erase(bb_addr); 82 | bb_count_balance = bb_found->second; 83 | if(region->second.instr_count >= bb_count_balance) 84 | region->second.instr_count = region->second.instr_count - bb_count_balance; 85 | else 86 | region->second.instr_count = 0; 87 | } 88 | } 89 | } 90 | } 91 | } 92 | 93 | 94 | /* Generate fake thread ids: we need this for saving the thread information inside the output files */ 95 | void BarrierPoint::generate_fake_tid(void){ 96 | /* The master thread will be the first element in the vector. */ 97 | std::sort(threads.begin(), threads.end()); 98 | for(std::size_t i=0; i < threads.size(); i++){ 99 | threads[i].fake_tid = i; 100 | } 101 | } 102 | 103 | 104 | /* Dump the ldv histogram */ 105 | void BarrierPoint::save_ldv_hist(std::string out_path){ 106 | std::ofstream ldv_fp(out_path + ".ldv_hist", std::ofstream::out); 107 | for(auto &t: threads){ 108 | for(uint32_t id=0; id < synch_count; id++){ 109 | auto region = t.regions.find(id); 110 | ldv_fp << "Th:" << std::setw(3) << t.fake_tid << " b:" << std::setw(4) << id; 111 | if(region != t.regions.end()) 112 | /* Actually dump the histogram content (ent = entry) */ 113 | for(auto &ent : region->second.lru_hist) 114 | ldv_fp << " " << std::setw(10) << ent; 115 | /* If the thread has not seen that synchronization point, its histogram is empty */ 116 | ldv_fp << std::endl; 117 | } 118 | } 119 | ldv_fp.close(); 120 | } 121 | 122 | 123 | void BarrierPoint::save_ldv_bb(std::string out_path){ 124 | /* Compute the maximum size of the saved LRU Stack Distance */ 125 | size_t max_size = 0; 126 | for(auto &t: threads){ 127 | for(uint32_t id=0; id < synch_count; id++){ 128 | auto region = t.regions.find(id); 129 | if(region != t.regions.end()) 130 | max_size = 
std::max(max_size,region->second.lru_hist.size()); 131 | } 132 | } 133 | 134 | 135 | /* Actually dump the file */ 136 | std::ofstream ldv_fp(out_path + ".ldv_bb", std::ofstream::out); 137 | auto delim = "W"; 138 | auto actual_delim = "T"; 139 | for(uint32_t synch_id=0; synch_id < synch_count; synch_id++){ 140 | ldv_fp << delim; 141 | delim=actual_delim; 142 | for(auto &t: threads){ 143 | int entry_count=0; 144 | auto region = t.regions.find(synch_id); 145 | if(region != t.regions.end()) 146 | for(auto &ent : region->second.lru_hist){ 147 | if(ent !=0) 148 | ldv_fp << ":" << 1+entry_count+(t.fake_tid*max_size) << ":" << ent << " "; 149 | entry_count++; 150 | } 151 | } 152 | ldv_fp << std::endl; 153 | } 154 | ldv_fp.close(); 155 | system((std::string("gzip -f " + out_path +".ldv_bb").c_str())); 156 | } 157 | 158 | 159 | /* Dump the instruction count for all the inter-barrier regions of all threads */ 160 | void BarrierPoint::save_bbv_inst_count(std::string out_path){ 161 | std::ofstream fp_inst(out_path + ".bbv_inscount", std::ofstream::out); 162 | auto delim = ""; 163 | auto actual_delim = ","; 164 | for(uint32_t synch_id=0; synch_id < synch_count; synch_id++){ 165 | delim = ""; 166 | for(auto& t : threads){ 167 | fp_inst << delim << t.get_instr_count(synch_id); 168 | delim = actual_delim; 169 | } 170 | fp_inst << "\n"; 171 | } 172 | fp_inst.close(); 173 | system((std::string("gzip -f " + out_path + ".bbv_inscount").c_str())); 174 | } 175 | 176 | 177 | /* Save all the synchronization points seen by the master thread. 
*/ 178 | void BarrierPoint::save_bp_id(std::string out_path){ 179 | /* Gather the master thread */ 180 | std::ofstream bp_id(out_path + ".bp_id", std::ofstream::out); 181 | /* Temporary hash data structure for taking into account how many times we 182 | * have seen the same synchronization function 183 | */ 184 | std::unordered_map synch_times; 185 | 186 | bp_id << "BarrierPoint,Iteration,Routine" << std::endl; 187 | for(auto t : threads){ 188 | /* Just pick the routines for the master thread */ 189 | if(t.is_master){ 190 | /* The first synchronization point is implicitly the ROI_start: */ 191 | bp_id << "0,0,ROI_Start" << std::endl; 192 | for(uint32_t id=0; id < synch_count; id++){ 193 | auto region = t.regions.find(id); 194 | DR_ASSERT_MSG(region != t.regions.end(), 195 | "[DR.barrierpoint] Master Thread is missing some piece of info"); 196 | 197 | /* Keep track of how many times we've seen this function so far */ 198 | std::string f_name = region->second.synch_name; 199 | 200 | if(synch_times.count(f_name) == 0) 201 | synch_times[f_name] = 0; 202 | else 203 | synch_times[f_name]++; 204 | /* We output id+1 because of the implicit starting point for ROI_start+1 205 | * Inter-barrier regions are identified by the name of the initial sync point 206 | */ 207 | if(f_name != "thread_exit") 208 | bp_id << id+1 << "," << synch_times[region->second.synch_name] 209 | << "," << f_name << std::endl; 210 | } 211 | } 212 | } 213 | bp_id.close(); 214 | } 215 | 216 | /* Iterate over the basic blocks and associate all of them to a unique identifier, 217 | * saving to output at the end. We do this with an unordered map. 
218 | */ 219 | void BarrierPoint::save_bbv_count(std::string out_path){ 220 | int64_t id = 1; 221 | /* bb_ids in the format: --> id */ 222 | std::unordered_map bb_ids; 223 | /* Initialize bb_ids, which associates BBs to IDs */ 224 | for(auto &t : threads){ 225 | /* For each region, forcing the time ordering */ 226 | for(uint32_t synch_id=0; synch_id < synch_count; synch_id++){ 227 | auto search = t.regions.find(synch_id); 228 | if(search != t.regions.end()) 229 | for(auto &bb : search->second.bbv){ 230 | /* If it's the first time we see a new address, associate it with a new ID */ 231 | if(bb_ids.count(bb.first) == 0){ 232 | bb_ids[bb.first] = id; 233 | id++; 234 | } 235 | } 236 | } 237 | } 238 | int bb_total_number = id - 1; 239 | 240 | /* Save the actual file */ 241 | std::ofstream fp_count(out_path + ".bbv_count", std::ofstream::out); 242 | 243 | /* Synch_id is initialized as 1 since all synchronization points 244 | * are counted starting from 1 245 | */ 246 | auto delim = "W"; 247 | auto actual_delim = "T"; 248 | for(uint32_t synch_id=0; synch_id < synch_count; synch_id++){ 249 | fp_count << delim; 250 | delim = actual_delim; 251 | for(auto &t : threads){ 252 | /* If the thread has seen that synchronization point, you dump all the bb 253 | * seen with their id and their instruction count (within the bb itself, 254 | * the actual number of executed instructions is instead saved in the inst_count file). 
255 | */ 256 | uint64_t bb_complex_identifier = 0; 257 | auto search = t.regions.find(synch_id); 258 | if(search != t.regions.end()) 259 | for(auto &bb: search->second.bbv){ 260 | /* The output has the format: bb_identifier:instruction_count */ 261 | bb_complex_identifier = bb_ids[bb.first] + (t.fake_tid * bb_total_number); 262 | fp_count << ":" << bb_complex_identifier << ":" << bb.second << " "; 263 | } 264 | } 265 | fp_count << "\n"; 266 | } 267 | fp_count.close(); 268 | system((std::string("gzip -f " + out_path + ".bbv_count").c_str())); 269 | } 270 | 271 | 272 | void BarrierPoint::free(void){ 273 | for(auto &t : threads){ 274 | for(std::pair elem : t.reuse_dist->cache_map){ 275 | delete elem.second; 276 | } 277 | } 278 | threads.clear(); 279 | } -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /dynamorio_client/include/reuse_distance_impl.h: -------------------------------------------------------------------------------- 1 | /* ********************************************************** 2 | * Copyright (c) 2020, Arm Limited and Contributors. 3 | * **********************************************************/ 4 | 5 | /* 6 | * SPDX-License-Identifier: Apache-2.0 7 | * 8 | * Licensed under the Apache License, Version 2.0 (the "License"); 9 | * you may not use this file except in compliance with the License. 10 | * You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | */ 20 | 21 | /* ********************************************************** 22 | * Copyright (c) 2016-2019 Google, Inc. All rights reserved. 23 | * **********************************************************/ 24 | /* 25 | * Redistribution and use in source and binary forms, with or without 26 | * modification, are permitted provided that the following conditions are met: 27 | * 28 | * * Redistributions of source code must retain the above copyright notice, 29 | * this list of conditions and the following disclaimer. 30 | * 31 | * * Redistributions in binary form must reproduce the above copyright notice, 32 | * this list of conditions and the following disclaimer in the documentation 33 | * and/or other materials provided with the distribution. 34 | * 35 | * * Neither the name of Google, Inc. nor the names of its contributors may be 36 | * used to endorse or promote products derived from this software without 37 | * specific prior written permission. 38 | * 39 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 40 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 41 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 42 | * ARE DISCLAIMED. IN NO EVENT SHALL VMWARE, INC. OR CONTRIBUTORS BE LIABLE 43 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 44 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 45 | * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 46 | * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 47 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 48 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 49 | * DAMAGE. 
/* We see noticeable overhead in release build with an if() that directly
 * checks knob_verbose, so for debug-only we turn it into something the
 * compiler can remove for better performance without going so far as ifdef-ing
 * big code chunks and impairing readability.
 */
#ifdef DEBUG
#    define DEBUG_VERBOSE(level) (reuse_distance_t::knob_verbose >= (level))
#else
#    define DEBUG_VERBOSE(level) (false)
#endif

typedef uintptr_t addr_t;
struct line_ref_t;
struct line_ref_list_t;

/* A doubly linked list node for the cache line reference info.
 *
 * Every node is a member of the main list (prev/next).  A node may
 * additionally be a member of the one-layer skip list, in which case
 * depth != -1 and prev_skip/next_skip link it to its skip-list neighbours.
 */
struct line_ref_t {
    struct line_ref_t *prev; /* the prev line_ref in the list */
    struct line_ref_t *next; /* the next line_ref in the list */
    uint64_t time_stamp;     /* the most recent reference time stamp on this line */
    uint64_t total_refs;     /* the total number of references on this line */
    uint64_t distant_refs;   /* the total number of distant references on this line */
    addr_t tag;

    /* We have a one-layer skip list for more efficient depth computation.
     * We inline the fields in every node for simplicity and to reduce allocs.
     */
    struct line_ref_t *prev_skip; /* the prev line_ref in the skip list */
    struct line_ref_t *next_skip; /* the next line_ref in the skip list */
    int_least64_t depth;          /* only valid for skip list nodes; -1 for others */

    /* Creates an unlinked node for tag `val` that has been referenced once
     * and is not a skip-list node (depth == -1).
     *
     * time_stamp is set to 0 here so that it never holds an indeterminate
     * value; line_ref_list_t::add_to_front() assigns the real time stamp when
     * the node is inserted.
     */
    line_ref_t(addr_t val)
        : prev(NULL)
        , next(NULL)
        , time_stamp(0)
        , total_refs(1)
        , distant_refs(0)
        , tag(val)
        , prev_skip(NULL)
        , next_skip(NULL)
        , depth(-1)
    {
    }
};
109 | The head of the list is the most recently accessed cache line. 110 | The earlier a cache line was accessed last time, the deeper that cache line 111 | is in the list. 112 | If a cache line is accessed, its time stamp is set as current, and it is 113 | added/moved to the front of the list. The cache line reference reuse distance 114 | is the cache line position in the list before moving. 115 | We also keep a pointer to the the earliest cache line referenced within the threshold (gate). 116 | Thus, we can quickly check whether a cache line is recently accessed 117 | by comparing the time stamp of the referenced cache line and the gate cache line. 118 | 119 | We have a second doubly-linked list, a one-layer skip list, for 120 | more efficient computation of the depth. Each node in the skip 121 | list stores its depth from the front. 122 | */ 123 | struct line_ref_list_t { 124 | line_ref_t *head; /* the most recently accessed cache line */ 125 | line_ref_t *gate; /* the earliest cache line refs within the threshold */ 126 | uint64_t cur_time; /* current time stamp */ 127 | uint64_t unique_lines; /* the total number of unique cache lines accessed */ 128 | uint64_t threshold; /* the reuse distance threshold */ 129 | uint64_t skip_distance; /* distance between skip list nodes */ 130 | bool verify_skip; /* check results using brute-force walks */ 131 | 132 | line_ref_list_t(uint64_t reuse_threshold, uint64_t skip_dist, bool verify) 133 | : head(NULL) 134 | , gate(NULL) 135 | , cur_time(0) 136 | , unique_lines(0) 137 | , threshold(reuse_threshold) 138 | , skip_distance(skip_dist) 139 | , verify_skip(verify) 140 | { 141 | } 142 | 143 | virtual ~line_ref_list_t(){ 144 | line_ref_t *ref; 145 | line_ref_t *next; 146 | if (head == NULL) 147 | return; 148 | for (ref = head; ref != NULL; ref = next) { 149 | next = ref->next; 150 | delete ref; 151 | } 152 | } 153 | 154 | 155 | bool ref_is_distant(line_ref_t *ref){ 156 | if (gate == NULL || ref->time_stamp >= gate->time_stamp) 
157 | return false; 158 | return true; 159 | } 160 | 161 | /* For debug purposes */ 162 | void print_list(){ 163 | std::cerr << "Reuse tag list:\n"; 164 | for (line_ref_t *node = head; node != NULL; node = node->next) { 165 | std::cerr << "\tTag 0x" << std::hex << node->tag; 166 | if (node->depth != -1) { 167 | std::cerr << " depth=" << std::dec << node->depth << " prev=" << std::hex 168 | << (node->prev_skip == NULL ? 0 : node->prev_skip->tag) 169 | << " next=" << std::hex 170 | << (node->next_skip == NULL ? 0 : node->next_skip->tag); 171 | assert(node->next_skip == NULL || node->next_skip->prev_skip == node); 172 | }else 173 | assert(node->next_skip == NULL && node->prev_skip == NULL); 174 | std::cerr << "\n"; 175 | } 176 | } 177 | 178 | 179 | void move_skip_fields(line_ref_t *src, line_ref_t *dst){ 180 | dst->prev_skip = src->prev_skip; 181 | dst->next_skip = src->next_skip; 182 | dst->depth = src->depth; 183 | if (src->prev_skip != NULL) 184 | src->prev_skip->next_skip = dst; 185 | if (src->next_skip != NULL) 186 | src->next_skip->prev_skip = dst; 187 | src->prev_skip = NULL; 188 | src->next_skip = NULL; 189 | src->depth = -1; 190 | } 191 | 192 | 193 | /* Add a new cache line to the front of the list. 194 | * We may need to move the gate forward if there are more cache lines 195 | * than the threshold so that the gate points to the earliest 196 | * referenced cache line within the threshold. 197 | */ 198 | void add_to_front(line_ref_t *ref){ 199 | if (DEBUG_VERBOSE(3)) 200 | std::cerr << "Add tag 0x" << std::hex << ref->tag << "\n"; 201 | 202 | /* update head */ 203 | ref->next = head; 204 | if (head != NULL) 205 | head->prev = ref; 206 | head = ref; 207 | if (gate == NULL) 208 | gate = head; 209 | 210 | /* move gate forward if necessary */ 211 | if (unique_lines > threshold) 212 | gate = gate->prev; 213 | unique_lines++; 214 | head->time_stamp = cur_time++; 215 | 216 | /* Add a new skip node if necessary. 
217 | * Don't bother keeping one right at the front: too much overhead. 218 | */ 219 | uint64_t count = 0; 220 | line_ref_t *node, *skip = NULL; 221 | for (node = head; node != NULL && node->depth == -1; node = node->next) { 222 | ++count; 223 | if (count == skip_distance) 224 | skip = node; 225 | } 226 | if (count >= 2 * skip_distance - 1) { 227 | assert(skip != NULL); 228 | if (DEBUG_VERBOSE(3)) 229 | std::cerr << "New skip node for tag 0x" << std::hex << skip->tag << "\n"; 230 | skip->depth = skip_distance - 1; 231 | if (node != NULL) { 232 | assert(node->prev_skip == NULL); 233 | node->prev_skip = skip; 234 | } 235 | skip->next_skip = node; 236 | assert(skip->prev_skip == NULL); 237 | } 238 | 239 | /* Update skip list depths */ 240 | for (; node != NULL; node = node->next_skip) 241 | ++node->depth; 242 | if (DEBUG_VERBOSE(3)) 243 | print_list(); 244 | } 245 | 246 | 247 | /* Move a referenced cache line to the front of the list. 248 | * We need to move the gate pointer forward if the referenced cache 249 | * line is the gate cache line or any cache line after. 250 | * Returns the reuse distance of ref. 251 | */ 252 | int_least64_t move_to_front(line_ref_t *ref){ 253 | if (DEBUG_VERBOSE(3)) 254 | std::cerr << "Move tag 0x" << std::hex << ref->tag << " to front\n"; 255 | 256 | line_ref_t *prev; 257 | line_ref_t *next; 258 | 259 | ref->total_refs++; 260 | if (ref == head) 261 | return 0; 262 | if (ref_is_distant(ref)) { 263 | ref->distant_refs++; 264 | gate = gate->prev; 265 | /* move gate if ref is the gate */ 266 | }else if (ref == gate) 267 | gate = gate->prev; 268 | 269 | /* Compute reuse distance */ 270 | int_least64_t dist = 0; 271 | line_ref_t *skip; 272 | for (skip = ref; skip != NULL && skip->depth == -1; skip = skip->prev) 273 | ++dist; 274 | if (skip != NULL) 275 | dist += skip->depth; 276 | else 277 | --dist; /* Don't count self */ 278 | if (DEBUG_VERBOSE(0) && verify_skip) { 279 | /* Compute reuse distance with a full list walk as a sanity check. 
280 | * This is a debug-only option, so we guard with DEBUG_VERBOSE(0). 281 | * The option check branch shows noticeable overhead without it. 282 | */ 283 | int_least64_t brute_dist = 0; 284 | for (prev = head; prev != ref; prev = prev->next) 285 | ++brute_dist; 286 | if (brute_dist != dist) { 287 | std::cerr << "Mismatch! Brute=" << brute_dist << " vs skip=" << dist << "\n"; 288 | print_list(); 289 | assert(false); 290 | } 291 | } 292 | 293 | /* Shift skip nodes between where ref was and head one earlier to 294 | * maintain spacing. This means their depths remain the same. 295 | */ 296 | if (skip != NULL) { 297 | for (; skip != NULL; skip = next) { 298 | next = skip->prev_skip; 299 | assert(skip->prev != NULL); 300 | move_skip_fields(skip, skip->prev); 301 | } 302 | }else 303 | assert(ref->depth == -1); 304 | 305 | /* remove ref from the list */ 306 | prev = ref->prev; 307 | next = ref->next; 308 | prev->next = next; 309 | /* ref could be the last */ 310 | if (next != NULL) 311 | next->prev = prev; 312 | /* move ref to the front */ 313 | ref->prev = NULL; 314 | ref->next = head; 315 | head->prev = ref; 316 | head = ref; 317 | head->time_stamp = cur_time++; 318 | if (DEBUG_VERBOSE(3)) 319 | print_list(); 320 | /* XXX: we should keep a running mean of the distance, and adjust 321 | * knob_reuse_skip_dist to stay close to the mean, for best performance. 322 | */ 323 | return dist; 324 | } 325 | }; 326 | 327 | #endif /* _REUSE_DISTANCE_H_ */ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BarrierPoint 2 | 3 | A cross-architecture tool to identify, select and analyse representative regions of parallel OpenMP applications. It allows the representation of whole applications through only their most representative regions, with high accuracy rates. 4 | Based on the work of Trevor Carlson et al. [P1 & R1](#references). 
5 | 6 | For a quick run check the [build](#build) and [run](#run-barrierpoint) sections. A sample OpenMP matrix multiplication code is provided. 7 | 8 | The BarrierPoint tool is composed of two parts: the identification and selection of representative regions; and the performance analysis of the selected regions. 9 | The generated output files can be optionally plotted to estimate the error that the representative regions have over executing the whole application. 10 | 11 | ## Identification and Selection of BPs 12 | 13 | BarrierPoint dynamically identifies barrier points (BPs) in an application through a custom DynamoRIO [R2](#references) instrumentation client. BPs are synchronisation points in the code where all OpenMP threads are guaranteed to wait before resuming. Examples of synchronisation points include the start of a parallel section or OMP barriers. 14 | 15 | Each BP is represented by architecture-agnostic metrics, namely basic block information (includes number of instructions) and Least Recently Used (LRU) Stack Distance. Check the README file inside `dynamorio_client` for more information on how the DynamoRIO client extracts these metrics. 16 | 17 | After BPs are identified, BarrierPoint invokes Simpoint [R3](#references) to cluster and weight the BPs and select the most representative ones. Check the Simpoint parameters used inside `DrAnalysis.py`. 18 | 19 | 20 | ## BP Performance Analysis 21 | 22 | BarrierPoint can also provide a performance analysis of each identified BP as well as of the whole application, through performance counters obtained with PAPI [R4](#references). 23 | 24 | Currently, BarrierPoint gathers the following performance statistics: cycles, instructions, L1D and L2D misses. We have found that these four statistics can be used to estimate the error of the representative regions versus the whole application.
25 | To facilitate this, we provide plotting capabilities in BarrierPoint that estimate the reconstruction error of an application for any given number of runs. 26 | The number of runs is relevant, since multithreaded execution is bound to have high variability between threads. The tool was designed with this in mind, easily allowing multiple executions of an application, for both the BP identification and selection, as well as the performance analysis. 27 | 28 | 29 | ## Dependencies 30 | 31 | To run BarrierPoint you will need: 32 | 33 | - [DynamoRIO](https://github.com/DynamoRIO) 34 | - [Simpoint 3.2](http://cseweb.ucsd.edu/~calder/simpoint/releases/SimPoint.3.2.tar.gz) (fetched through the included Makefile) 35 | - [PAPI](http://icl.utk.edu/papi/software/) (Tested with version 5.7.0) 36 | - CMake 37 | - Python 2 or 3 38 | - Matplotlib (for plotting) 39 | 40 | ## Build 41 | 42 | BarrierPoint has been built and tested on different Aarch64 and x86-64 machines with GCC 7. 43 | 44 | ### 1. Install dependencies 45 | 46 | You can get DynamoRIO from [R2](https://www.dynamorio.org/) and PAPI from [R4](https://icl.utk.edu/papi/). 47 | Simpoint is automatically downloaded and installed during the install process. 48 | 49 | ### 2. Build BP libraries and DR Client 50 | 51 | ``` 52 | # Build BP libraries and simpoint with top-level Makefile 53 | $ make 54 | 55 | # Build the DR client 56 | $ cd dynamorio_client && mkdir build && cd build 57 | $ export DYNAMORIO_BUILD_DIR=path/to/dynamorio_build 58 | $ cmake .. 59 | $ make 60 | ``` 61 | 62 | Note: If you are using a custom PAPI build, please set `$PAPI_DIR` inside the main BarrierPoint Makefile. 63 | 64 | ## Prepare Application Binaries 65 | 66 | To be compatible with BarrierPoint, applications need to define a Region-of-Interest (RoI) in their source code. BarrierPoint will only identify representative regions inside the RoI.
67 | To set a compatible RoI, just add the following code snippets to your code, delimiting the sections you want to analyse (please note that only a single RoI is supported at the moment): 68 | 69 | **Start RoI:** 70 | ``` 71 | setenv("ROI_BP", "1", 1); 72 | #pragma omp barrier 73 | ``` 74 | 75 | **Stop RoI:** 76 | ``` 77 | setenv("ROI_BP", "0", 1); 78 | #pragma omp barrier 79 | ``` 80 | 81 | A sample matrix multiplication application is provided inside the `sample` directory, with the RoI set. By default, the configuration files are already set up for this application. You just need to compile the app and move it to the `benchmarks` folder before running the tool. 82 | 83 | ## Run BarrierPoint 84 | 85 | Modify the configuration file `configs.json` with the parameters you want to run. Check the [Configuration section](#configuration-parameters) below for additional information on the configuration parameters. 86 | 87 | Before running BarrierPoint make sure that the application to execute is available in the correct path (set in the configuration file) and its name matches the one in the configuration file. 88 | 89 | **Also make sure that the `$DYNAMORIO_BUILD_DIR` environment variable is set**, so BarrierPoint knows where DynamoRIO is located. 90 | 91 | ``` 92 | $ ./run-BarrierPoint.sh 93 | ``` 94 | 95 | ## Plotting the Error 96 | 97 | Modify the configuration file `configs_plot.json` with the parameters you want to plot, then start the plot generation. Check the [Configuration section](#configuration-parameters) below for additional information on the configuration parameters. 98 | 99 | ``` 100 | $ ./plot-BarrierPoint.sh 101 | ``` 102 | The plotting step of BarrierPoint generates the error estimation of running just the identified BPs versus the whole application. It contains the errors for all four performance statistics: cycles, instructions, L1D and L2D misses.
103 | Due to variability across runs (largely caused by load distribution across threads) each figure contains multiple plots, one for each run of the application. All these parameters can be modified in the configuration files. 104 | 105 | ## Configuration Parameters 106 | 107 | BarrierPoint uses JSON configuration files to set the parameters required to run the tool and plot the error estimates. 108 | This section goes through the configuration file format used in BarrierPoint. 109 | 110 | ### configs.json 111 | This configuration file refers to the BP identification, selection and performance analysis. All parameters are featured in the provided file and must not be removed (or else the configuration will be invalid). 112 | The default parameters are tailored for the included sample matrix multiplication app. Please change the configuration according to your needs. 113 | 114 | - **paths**: List of paths used by BarrierPoint. All default paths are set. 115 | - **rootdir**: BarrierPoint path. 116 | - **benchpath**: Application binaries path. 117 | - **libspath**: BarrierPoint libraries path. 118 | - **outpath**: Output path. 119 | 120 | - **execution**: Defines which BarrierPoint steps will be executed. Set to true or false (Default: both true). 121 | - **bp_identification**: Run the BP identification and selection (uses DynamoRIO). 122 | - **perfcntrs**: Run the performance analysis (uses PAPI). 123 | 124 | - **threads**: Set the number of threads used by the applications. This parameter is a list of multiple comma-separated values. The tool will do multiple executions of each application for each number of threads set. This option sets the `OMP_NUM_THREADS` to the number of threads specified. It also substitutes any input in the application parameters in the form of ``{}``, for applications that set the number of threads through an explicit parameter.
125 | - E.g.: `[2, 4, 8]` 126 | 127 | - **Application**: Set of applications and their respective input parameters, comma-separated. If one of the input parameters is the number of threads, instead of hardcoding the number, you can pass ``{}`` instead. BarrierPoint will then substitute the braces with the number of threads set in the previous configuration parameter. 128 | - E.g.: ` { "matrixmul" : " ", "matrixmul2":"-threads {}" } ` 129 | - In the above example we are running two apps: `matrixmul` and `matrixmul2`, the latter of which takes the number of threads as an input. 130 | 131 | - **Suffix**: Strings to append to the generated files and folders, comma-separated. This is useful to identify different runs of the same application under different circumstances (e.g. running the same app with modifications or at a different date). The adopted naming convention for the output files and folders already uses the name of the application and the thread count. If multiple suffixes are provided, multiple executions of the application/threads are done. 132 | - E.g.: `["sample", "dd-mm-yyyy"]` 133 | - The above example will run the provided applications once for every suffix and for every thread count. 134 | - Using `matrixmul` as an example, the output files will look like: `matrixmul-sample.4t` and `matrixmul-dd-mm-yyyy.2t` 135 | 136 | - **Debug**: Prints additional debug information to a logfile. Default is false. 137 | 138 | - **DR_iterations**: Number of iterations for the BP identification and selection step of BarrierPoint. More iterations will result in better error estimates when plotting. This step uses DynamoRIO, so expect it to get slower the more iterations you add. The generated output files will contain the corresponding number of the run. 139 | - Default is set to 10 iterations. Reduce if it takes too long to generate the barrier points. 140 | 141 | - **Perfcntrs_iterations**: Number of iterations for the performance analysis step of BarrierPoint.
More iterations will result in better error estimates when plotting. The generated output files will contain the corresponding number of the run. 142 | - Default is set to 20 iterations. Reduce if it takes too long to gather performance statistics for the app. 143 | 144 | - **dry_run**: Dry BarrierPoint run, without DynamoRIO instrumentation or PAPI performance analysis. This flag requires previously generated files. When on, it will still run the simpoint clustering step and the performance statistics parsing. Useful when sharing BP files and you do not want to repeat the whole generation process again. Default is set to false. 145 | 146 | 147 | ### configs_plot.json 148 | This configuration file refers to the plotting of the error estimate between the selected BPs and the whole application. All parameters are featured in the provided file and must not be removed (or else the configuration will be invalid). 149 | The default parameters are tailored for the included sample matrix multiplication app (provided you have previously run BarrierPoint to identify, select and analyse the BPs). Please change the configuration according to your needs. 150 | 151 | Note that each plot figure will contain a plot for each of the iterations set in the earlier application execution. As long as the application name and suffixes match, the tool will do this automatically. This is due to the variability across runs (largely caused by load distribution across threads). 152 | 153 | - **paths**: List of paths used by BarrierPoint plotting. All default paths are set. 154 | - **rootdir**: BarrierPoint path. 155 | - **outpath**: BP Output path (generated by the tool when running an application). 156 | - **plotpath**: Plot output path. Location where the plots are saved. 157 | 158 | - **Application**: Set of applications and their suffixes, comma-separated. It will plot, for each app, the combination of all *barrier* and *perfcntrs* suffixes. All the files must exist in the `outpath` folder.
For each application, you can set: 159 | - **Barriers_suffix** refers to the suffixes generated during the BP identification and selection step (using DynamoRIO). Comma-separated. 160 | - **Perfcntrs_suffix** refers to the suffixes generated during the BP performance analysis (using PAPI). Comma-separated. 161 | - E.g.: ` { "matrixmul" : { "Barriers_suffix": ["sample", "dd-mm-yyyy"] , "Perfcntrs_suffix": ["sample"] } }` 162 | - In the above example we are plotting 2 graphs for `matrixmul`: the first estimating the error of the selected BPs with suffix `sample` and the second estimating the error of the selected BPs with suffix `dd-mm-yyyy`. In both cases, the error estimation uses the same performance statistics, generated in the run with the suffix `sample`. 163 | 164 | - **threads**: List of thread counts to plot, comma-separated. These have to match the ones previously generated and available in the `outpath` folder. Each thread count will generate a different graph. 165 | - E.g. `[2, 4, 8]` 166 | 167 | - **plot_format**: Output plot file format. Accepts `png` or `pdf`. Default is `pdf`. 168 | 169 | - **Debug**: Prints additional debug information to a logfile. Default is false. 170 | 171 | 172 | ## Publications 173 | 174 | - [P1](http://dx.doi.org/10.1109/ISPASS.2014.6844456) Carlson, T. E., Heirman, W., Van Craeynest, K., & Eeckhout, L. (2014, March). Barrierpoint: Sampled simulation of multi-threaded applications. In 2014 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS) (pp. 2-12). IEEE. 175 | - [P2](https://ieeexplore.ieee.org/abstract/document/7581284) Ferrerón, A., Jagtap, R., & Rusitoru, R. (2016, September). Identifying representative regions of parallel HPC applications: a cross-architectural evaluation. In 2016 IEEE International Symposium on Workload Characterization (IISWC) (pp. 1-2). IEEE. 176 | - [P3](https://ieeexplore.ieee.org/abstract/document/7975275) Ferrerón, A., Jagtap, R., Bischoff, S., & Ruşitoru, R.
(2017, April). Crossing the architectural barrier: Evaluating representative regions of parallel HPC applications. In 2017 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS) (pp. 109-120). IEEE. 177 | - [P4](https://ieeexplore.ieee.org/abstract/document/8366944) Tairum Cruz, M., Bischoff, S., & Rusitoru, R. (2018, April). Shifting the barrier: Extending the boundaries of the BarrierPoint methodology. In 2018 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS) (pp. 120-122). IEEE. 178 | 179 | ## References 180 | - [R1] [Trevor Carlson's BarrierPoint](https://github.com/trevorcarlson/barrierpoint) 181 | - [R2] [DynamoRIO](https://www.dynamorio.org/) 182 | - [R3] [Simpoint](https://cseweb.ucsd.edu/~calder/simpoint/simpoint_overview.htm) 183 | - [R4] [PAPI](https://icl.utk.edu/papi/) 184 | 185 | 186 | ## License 187 | 188 | This project is licensed under [Apache-2.0](https://www.apache.org/licenses/LICENSE-2.0). For more information, see LICENSE.txt. 189 | 190 | This project also contains code derived from other projects as listed below: 191 | 192 | - Some code in `DrAnalysis.py` is derived from the [original BarrierPoint tool](https://github.com/trevorcarlson/barrierpoint/tree/91d1f54f10ed5a442dfd1b5b50276c249964481b) by Trevor Carlson, which uses the MIT license. The original license text can be found in `LICENSE-MIT.txt`. 193 | 194 | - Some code featured in the BarrierPoint `dynamorio_client` is derived from existing DynamoRIO clients, under BSD and LGPL licenses. The original license text can be found in `dynamorio_client/LICENSE-BSD-LGPL.txt`. 195 | 196 | 197 | ## Contributions / Pull Requests 198 | 199 | Contributions are accepted under Apache-2.0. Only submit contributions where you have authored all of the code. If you do this on work time, make sure you have your employer's approval.
200 | -------------------------------------------------------------------------------- /barrierpoint-libs/omp_counters.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, Arm Limited and Contributors. 3 | * 4 | * SPDX-License-Identifier: Apache-2.0 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | /* Auxiliary functions for PAPI performance counters in Barrierpoint */ 20 | 21 | #ifndef __BP_PERFCNTRS_H__ 22 | #define __BP_PERFCNTRS_H__ 23 | 24 | #ifdef __cplusplus 25 | extern "C" { 26 | #endif 27 | 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | 38 | int papi_max_num_threads = 0; 39 | int papi_event_set; 40 | #define MAX_STR_LEN 128 41 | #pragma omp threadprivate(papi_event_set) 42 | 43 | #define MAX_CNTRS 6 44 | 45 | int BP_AVAILABLE_CNTRS = 0; 46 | 47 | int _BP_PERFCNTRS = 0; 48 | int _BP_PERFCNTRS_VERBOSE = 0; 49 | int _BP_PERFCNTRS_OMPPARALLEL = 0; 50 | int _BP_PERFCNTRS_SAMPLING = 0; 51 | int BP_PERFCNTRS_INITIALIZED = 0; 52 | int PERF_CNTRS_STARTED = 0; 53 | 54 | int t_events = 0; 55 | char event_output_filename[1024]; 56 | int event_codes[MAX_CNTRS]; 57 | long long event_values[MAX_CNTRS + 1]; /* +1 is the cycle counter */ 58 | 59 | int perfc_thread_started = 1; 60 | 61 | /* Only when instrumenting OMP parallel regions */ 62 | #define MAX_PARALLEL_PHASES 
25800 63 | unsigned long long event_values_pbarrier[MAX_PARALLEL_PHASES][MAX_CNTRS + 1]; /* +1 is the cycle counter */ 64 | int perfc_omp_parallel_region_count = 0; 65 | 66 | /* Only when instrumenting samplings */ 67 | unsigned long long event_values_sample[MAX_CNTRS + 1]; /* +1 is the cycle counter */ 68 | 69 | #pragma omp threadprivate(perfc_thread_started, perfc_omp_parallel_region_count, \ 70 | event_values, event_values_pbarrier, event_values_sample) 71 | 72 | /****************************** AUX FUNCTIONS ******************************/ 73 | inline __attribute__((always_inline)) int env2int(const char *name){ 74 | const char *val = getenv(name); 75 | if(val) 76 | return strtol(val, NULL, 0); 77 | return 0; 78 | } 79 | 80 | inline __attribute__((always_inline)) void _papi_error(char *error_str){ 81 | PAPI_perror(error_str); 82 | exit(2); 83 | } 84 | /**************************** END AUX FUNCTIONS ****************************/ 85 | 86 | /* Set performance counters. This function must be called before starting any instrumentation */ 87 | inline __attribute__((always_inline)) void initPerformanceCounters(){ 88 | #pragma omp master 89 | { 90 | if (BP_PERFCNTRS_INITIALIZED == 0) { 91 | BP_PERFCNTRS_INITIALIZED = 1; 92 | _BP_PERFCNTRS = env2int("BP_PERFCNTRS"); 93 | _BP_PERFCNTRS_VERBOSE = env2int("BP_PERFCNTRS_VERBOSE"); 94 | _BP_PERFCNTRS_OMPPARALLEL = env2int("BP_PERFCNTRS_OMPPARALLEL"); 95 | _BP_PERFCNTRS_SAMPLING = env2int("BP_PERFCNTRS_SAMPLING"); 96 | 97 | /* Ensure the maximum number of counters is respected */ 98 | if (_BP_PERFCNTRS) { 99 | BP_AVAILABLE_CNTRS = PAPI_num_counters(); 100 | 101 | if (_BP_PERFCNTRS_VERBOSE) 102 | std::cout << "[OMP PerfCntrs] Initializing\n \ 103 | [OMP PerfCntrs] Number of available counters: " 104 | << BP_AVAILABLE_CNTRS << std::endl; 105 | 106 | if (BP_AVAILABLE_CNTRS > MAX_CNTRS) { 107 | BP_AVAILABLE_CNTRS = MAX_CNTRS; 108 | std::cout << "[OMP PerfCntrs] This module only supports up to " 109 | << MAX_CNTRS << " counters\n"; 
110 | } 111 | 112 | if (_BP_PERFCNTRS_SAMPLING && _BP_PERFCNTRS_OMPPARALLEL) { 113 | std::cout << "[OMP PerfCntrs] Only Sampling-based or Parallel Region-based \ 114 | can be enabled, switching to Parallel Region-based\n"; 115 | _BP_PERFCNTRS_SAMPLING = 0; 116 | } 117 | 118 | /* Output file to store the results */ 119 | char *_output_filename = getenv("BP_PERFCNTRS_OUTPUT_FILE"); 120 | if (_output_filename) 121 | strcpy(event_output_filename, _output_filename); 122 | else 123 | strcpy(event_output_filename, "/tmp/perfcntrs_events.out"); 124 | 125 | if (_BP_PERFCNTRS_VERBOSE) 126 | std::cout << "[OMP PerfCntrs] Dumping results in: " 127 | << event_output_filename << std::endl; 128 | 129 | if (PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT) 130 | _papi_error((char *)"[OMP PerfCntrs] PAPI_library_init error"); 131 | papi_max_num_threads = omp_get_max_threads(); 132 | if (PAPI_thread_init((long unsigned int (*)()) omp_get_thread_num) != PAPI_OK) 133 | _papi_error((char *)"[OMP PerfCntrs] PAPI_thread_init error"); 134 | 135 | /* Get the events; specify events separated by commas */ 136 | char *events_str_ptr = getenv("BP_PERFCNTRS_EVENTS"); 137 | char events_str[1024]; 138 | 139 | if (events_str_ptr) 140 | strcpy(events_str, events_str_ptr); 141 | else{ 142 | strcpy(events_str, "PAPI_TOT_INS,PAPI_TOT_CYC"); /* Default counters */ 143 | std::cout << "[OMP PerfCntrs] No PAPI counters set. 
Using the default counters: \ 144 | PAPI_TOT_INS and PAPI_TOT_CYC\n"; 145 | } 146 | 147 | char *event = strtok(events_str, ","); 148 | while (event) { 149 | if (t_events == BP_AVAILABLE_CNTRS) { 150 | std::cerr << "[OMP PerfCntrs] Too many events!\n"; 151 | exit(1); 152 | } 153 | if (PAPI_event_name_to_code(event, &event_codes[t_events]) != PAPI_OK){ 154 | std::cerr << "[OMP PerfCntrs] Invalid PAPI event: " << event << std::endl; 155 | exit(1); 156 | } 157 | t_events++; 158 | event = strtok(NULL, ","); 159 | } 160 | 161 | if (_BP_PERFCNTRS_VERBOSE) { 162 | std::cout << "[OMP PerfCntrs] Registering " << t_events << " events\n"; 163 | char event_names[MAX_CNTRS + 1][MAX_STR_LEN]; 164 | int i; 165 | for (i = 0; i < t_events; i++) { 166 | PAPI_event_code_to_name(event_codes[i], event_names[i]); 167 | std::cout << " " << event_names[i] << ","; 168 | } 169 | std::cout << "\n"; 170 | } 171 | } 172 | } /* end if _BP_PERFCNTRS_INITIALIZED */ 173 | } /* end of pragma omp master */ 174 | 175 | #pragma omp barrier 176 | 177 | if (_BP_PERFCNTRS) { 178 | /* Init all threads */ 179 | #pragma omp parallel 180 | { 181 | int i; 182 | 183 | if (_BP_PERFCNTRS_VERBOSE) 184 | std::cout << "[OMP PerfCntrs] Thread " << omp_get_thread_num() 185 | << " in its init phase\n"; 186 | 187 | papi_event_set = PAPI_NULL; 188 | if (PAPI_create_eventset(&papi_event_set) != PAPI_OK) 189 | _papi_error((char *)"[OMP PerfCntrs] PAPI_create_eventset"); 190 | 191 | for (i = 0; i < t_events; i++) 192 | if (PAPI_add_event(papi_event_set, event_codes[i]) != PAPI_OK) 193 | _papi_error((char *)"[OMP PerfCntrs] PAPI_add_event"); 194 | } 195 | } 196 | } 197 | 198 | /* Initalize and reset performance counters */ 199 | inline __attribute__((always_inline)) void startPerformanceCounters(){ 200 | if (_BP_PERFCNTRS) { 201 | #pragma omp parallel 202 | { 203 | if (_BP_PERFCNTRS_VERBOSE) 204 | std::cout << "[OMP PerfCntrs] Thread " << omp_get_thread_num() 205 | << " in its start phase\n"; 206 | 207 | perfc_thread_started 
= 1; 208 | perfc_omp_parallel_region_count = 0; 209 | 210 | int rc = PAPI_start(papi_event_set); 211 | if (rc == PAPI_EISRUN) { 212 | if (PAPI_stop(papi_event_set, event_values) != PAPI_OK) 213 | _papi_error((char *)"[OMP PerfCntrs] PAPI_stop in startPerformanceCounters"); 214 | if (PAPI_reset(papi_event_set) != PAPI_OK) 215 | _papi_error((char *)"[OMP PerfCntrs] PAPI_reset in startPerformanceCounters"); 216 | rc = PAPI_start(papi_event_set); 217 | } 218 | if (rc != PAPI_OK) 219 | _papi_error((char *)"[OMP PerfCntrs] PAPI_start in startPerformanceCounters"); 220 | } 221 | PERF_CNTRS_STARTED = 1; 222 | } 223 | } 224 | 225 | /* Stop instrumentation. Store the values in the output file */ 226 | inline __attribute__((always_inline)) void stopPerformanceCounters(){ 227 | if (_BP_PERFCNTRS) { 228 | if (!PERF_CNTRS_STARTED) 229 | std::cout << "[OMP PerfCntrs] Trying to stop non-started counters\n"; 230 | 231 | #pragma omp parallel 232 | { 233 | if (!perfc_thread_started) { 234 | std::cout << "[OMP PerfCntrs] Trying to stop non-started counters (thread " 235 | << omp_get_thread_num() << ")\n"; 236 | } 237 | 238 | /* Get the counters values */ 239 | int i, j; 240 | 241 | if (PAPI_stop(papi_event_set, event_values) != PAPI_OK) 242 | _papi_error((char *)"[OMP PerfCntrs] PAPI_stop in stopPerformanceCounters"); 243 | 244 | int tid = omp_get_thread_num(); 245 | int pid = getpid(); 246 | 247 | if (_BP_PERFCNTRS_VERBOSE) 248 | std::cout << "[OMP PerfCntrs] Thread " << tid << " in its stop phase\n"; 249 | 250 | FILE *outfile = fopen(event_output_filename, "a"); 251 | if (!outfile){ 252 | std::cerr << "[OMP PerfCntrs] Could not create output file\n"; 253 | exit(1); 254 | } 255 | 256 | char event_names[MAX_CNTRS + 1][MAX_STR_LEN]; 257 | for (i = 0; i < t_events; i++) 258 | if (PAPI_event_code_to_name(event_codes[i], event_names[i]) != PAPI_OK) 259 | _papi_error((char *)"[OMP PerfCntrs] PAPI_event_code_to_name in stopPerformanceCounters"); 260 | 261 | #pragma omp critical 262 | { 263 
| if (_BP_PERFCNTRS_OMPPARALLEL) { 264 | for (j = 0; j < perfc_omp_parallel_region_count; j++) 265 | for (i = 0; i < t_events; i++) 266 | fprintf(outfile, "%s[%d][%d][%d]=%lld\n", event_names[i], tid, \ 267 | pid, j, event_values_pbarrier[j][i]); 268 | } 269 | /* When BP_PERFCNTRS_OMPPARALLEL is set, the last phase goes from 270 | * the last "omp parallel" to the end of the region of interest. 271 | * Otherwise perfc_omp_parallel_region_count will be 0. 272 | */ 273 | for (i = 0; i < t_events; i++) 274 | fprintf(outfile, "%s[%d][%d][%d]=%lld\n", event_names[i], tid, pid, 275 | perfc_omp_parallel_region_count, event_values[i]); 276 | } 277 | 278 | fclose(outfile); 279 | perfc_thread_started = 0; 280 | } 281 | 282 | PERF_CNTRS_STARTED = 0; 283 | } 284 | } 285 | 286 | /* Store the performance counters values of the parallel region and do a reset */ 287 | inline __attribute__((always_inline)) void parallelRegionPerformanceCounters(){ 288 | if (_BP_PERFCNTRS && _BP_PERFCNTRS_OMPPARALLEL && PERF_CNTRS_STARTED) { 289 | /* We might have parallel regions BEFORE the region of interest, so 290 | * it is easier to check if the instrumentation has started here 291 | */ 292 | 293 | /* Check for a nested parallelism */ 294 | if(omp_in_parallel() == 1){ 295 | if (!perfc_thread_started) 296 | std::cout << "[OMP PerfCntrs] Trying to restart non-started counters (thread " 297 | << omp_get_thread_num() << ")\n"; 298 | 299 | /* Get the counters values */ 300 | int i; 301 | if (PAPI_read(papi_event_set, event_values) != PAPI_OK) 302 | _papi_error((char *)"[OMP PerfCntrs] PAPI_read in parallelRegionPerformanceCounters"); 303 | 304 | int tid = omp_get_thread_num(); 305 | 306 | if (_BP_PERFCNTRS_VERBOSE) 307 | std::cout << "[OMP PerfCntrs] Thread " << tid << " saw an OMP parallel region\n"; 308 | 309 | if (perfc_omp_parallel_region_count < MAX_PARALLEL_PHASES) 310 | for (i = 0; i < t_events; i++) 311 | event_values_pbarrier[perfc_omp_parallel_region_count][i] = event_values[i]; 312 | else 
313 | std::cout << "[OMP PerfCntrs] Run out of space for storing intermediate values \ 314 | (thread " << tid << ", region " << perfc_omp_parallel_region_count << ")\n"; 315 | 316 | perfc_omp_parallel_region_count++; 317 | 318 | if (PAPI_reset(papi_event_set) != PAPI_OK) 319 | _papi_error((char *)"[OMP PerfCntrs] PAPI_reset in parallelRegionPerformanceCounters"); 320 | 321 | }else{ /* No nested parallelism */ 322 | #pragma omp parallel 323 | { 324 | if (!perfc_thread_started) { 325 | std::cout << "[OMP PerfCntrs] Trying to restart non-started counters \ 326 | (thread " << omp_get_thread_num() << ")\n"; 327 | } 328 | 329 | /* Get the counters values */ 330 | int i; 331 | 332 | if (PAPI_read(papi_event_set, event_values) != PAPI_OK) 333 | _papi_error((char *)"[OMP PerfCntrs] PAPI_read in parallelRegionPerformanceCounters"); 334 | 335 | int tid = omp_get_thread_num(); 336 | 337 | if (_BP_PERFCNTRS_VERBOSE) 338 | std::cout << "[OMP PerfCntrs] Thread " << tid << " saw an OMP parallel region\n"; 339 | 340 | if (perfc_omp_parallel_region_count < MAX_PARALLEL_PHASES) 341 | for (i = 0; i < t_events; i++) 342 | event_values_pbarrier[perfc_omp_parallel_region_count][i] = event_values[i]; 343 | else 344 | std::cout << "[OMP PerfCntrs] Run out of space for storing intermediate values \ 345 | (thread " << tid << ", region " << perfc_omp_parallel_region_count << ")\n"; 346 | 347 | perfc_omp_parallel_region_count++; 348 | 349 | if (PAPI_reset(papi_event_set) != PAPI_OK) 350 | _papi_error((char *)"[OMP PerfCntrs] PAPI_reset in parallelRegionPerformanceCounters"); 351 | } 352 | } 353 | } 354 | } 355 | 356 | /* Store the performance counters values of the barrier region and do a reset */ 357 | inline __attribute__((always_inline)) void barrierRegionPerformanceCounters(){ 358 | if (_BP_PERFCNTRS && _BP_PERFCNTRS_OMPPARALLEL && PERF_CNTRS_STARTED) { 359 | /* We might have parallel regions BEFORE the region of interest, so 360 | * it is easier to check if the instrumentation has started 
here 361 | */ 362 | if (!perfc_thread_started) { 363 | std::cout << "[OMP PerfCntrs] Trying to restart non-started counters (thread " 364 | << omp_get_thread_num() << ")\n"; 365 | } 366 | 367 | /* Get the counters values */ 368 | int i; 369 | 370 | if (PAPI_read(papi_event_set, event_values) != PAPI_OK) 371 | _papi_error((char *)"[OMP PerfCntrs] PAPI_read in parallelRegionPerformanceCounters"); 372 | 373 | int tid = omp_get_thread_num(); 374 | 375 | if (_BP_PERFCNTRS_VERBOSE) 376 | std::cout << "[OMP PerfCntrs] Thread " << tid << " saw an OMP parallel region\n"; 377 | 378 | if (perfc_omp_parallel_region_count < MAX_PARALLEL_PHASES) 379 | for (i = 0; i < t_events; i++) 380 | event_values_pbarrier[perfc_omp_parallel_region_count][i] = event_values[i]; 381 | else 382 | std::cout << "[OMP PerfCntrs] Run out of space for storing intermediate values \ 383 | (thread " << tid << ", region " << perfc_omp_parallel_region_count << ")\n"; 384 | 385 | perfc_omp_parallel_region_count++; 386 | 387 | if (PAPI_reset(papi_event_set) != PAPI_OK) 388 | _papi_error((char *)"[OMP PerfCntrs] PAPI_reset in parallelRegionPerformanceCounters"); 389 | } 390 | } 391 | 392 | 393 | #ifdef __cplusplus 394 | } 395 | #endif 396 | 397 | #endif /* __BP_PERFCNTRS_H__ */ 398 | -------------------------------------------------------------------------------- /errorEstimate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2020, Arm Limited and Contributors. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Module to parse performance statistics collected with PAPI, validate the
# output files and estimate the average reconstruction error between the
# selected BPs and the whole application. It also generates error plots.

import logging
import os
import sys
import argparse
import csv
import json
from collections import defaultdict
from itertools import product

import numpy as np


def _mean_std(values):
    """Return (mean, sample standard deviation) of `values`.

    The sample standard deviation (ddof=1) is undefined for a single value,
    so a lone measurement reports a deviation of 0 instead of NaN.
    """
    ddof = 1 if len(values) > 1 else 0
    return np.mean(values), np.std(values, ddof=ddof)


class errorEstimate(object):
    """Performance statistics of one perfcntr run plus the barrier point sets.

    Attributes filled in by the constructor:
      BPs / BPs_mult -- per BP set id: barrier ids and their multipliers
      STATSt         -- per thread: {stat name: [iteration][barrier] values}
                        for the whole application (baseline CSV)
      STATSbp        -- same layout, for the per-barrier CSV
      miniApp, iterations, nthreads -- taken from the baseline CSV
      barriers       -- barrier id of the last row of the per-barrier CSV

    If an input file cannot be read the error is logged and construction
    stops early, leaving the remaining attributes unset.
    """

    def __init__(self, fbaseline, gbarriers, bbpoints, prfcntrs_suf):

        self.logger = logging.getLogger('errorEstimate')
        # bperf check CSV file (generated by PAPI)
        self.fbaselineFN = fbaseline
        # bperf check BP CSV file (per BP) (generated by PAPI)
        self.gbarriersFN = gbarriers
        self.bbpointsP = bbpoints  # Barriers folder (generated by DR)
        self.prfcntrs_suf = prfcntrs_suf
        # Storing the barrier points
        self.BPs = defaultdict(list)
        self.BPs_mult = defaultdict(list)
        # Storing the stats
        self.STATSt = []
        self.STATSbp = []

        if not self._load_barrierpoints():
            return
        if not self._load_baseline():
            return
        self._load_barriers()

    def _load_barrierpoints(self):
        """Parse the *.barrierpoints file of every BP repetition directory.

        The files are generated by DR and live in sub-directories of
        `bbpointsP` whose name is the integer id of the BP set.
        Returns False if a file could not be read.
        """
        # First os.walk entry is the root itself; repetitions are below it
        entries = [(dirpath, files)
                   for dirpath, _dirs, files in os.walk(self.bbpointsP)][1:]
        self.logger.debug("Found {} BP iterations".format(len(entries)))

        for dirpath, files in entries:
            candidates = sorted(f for f in files if f.endswith('.barrierpoints'))
            if not candidates:
                self.logger.warning(
                    'No .barrierpoints file found in {}'.format(dirpath))
                continue
            try:
                bps_id = int(os.path.basename(dirpath))
            except ValueError:
                self.logger.warning(
                    'Ignoring directory without a numeric BP set id: {}'.format(dirpath))
                continue

            bbpointsFN = '{}/{}'.format(dirpath, candidates[0])
            try:
                with open(bbpointsFN, 'r') as bp_file:
                    self.logger.debug(
                        'Parsing barrierpoints file: {}'.format(bbpointsFN))
                    for line in bp_file:
                        fields = line.split()
                        if not fields:  # tolerate blank lines
                            continue
                        self.BPs[bps_id].append(int(fields[0]))
                        self.BPs_mult[bps_id].append(float(fields[1]))
            except (IOError, OSError) as err:  # IOError: Python 2 open()
                self.logger.error(
                    '{0} bbpointsFN: {1}'.format(err, bbpointsFN))
                return False

            self.logger.debug('Found {} barrier points: {}'.format(
                len(self.BPs[bps_id]), ','.join(str(x) for x in self.BPs[bps_id])))
            self.logger.debug('Multipliers: {}'.format(
                ','.join(str(x) for x in self.BPs_mult[bps_id])))
        return True

    @staticmethod
    def _read_stats_csv(filename, stats):
        """Load a PAPI CSV file into `stats` (a list indexed by thread id).

        Columns: application, thread id, iteration, barrier id, then one
        column per statistic (named by the header line). Rows are expected in
        increasing thread / iteration / barrier order.
        Returns (application, thread id, barrier id) of the last row, or None
        when the file holds no data rows.
        """
        last_row = None
        with open(filename, 'r') as csv_file:
            reader = csv.reader(csv_file)
            header = next(reader, None)  # works on Python 2 and 3
            if header is None:
                return None
            stat = header[4:]

            for row in reader:
                if not row:
                    continue
                tid = int(row[1])
                itr = int(row[2])
                bid = int(row[3])

                # Create a dict of stats per thread (taken from .csv)
                # Each statistic has a list of values corresponding to the iterations
                if len(stats) < (tid + 1):
                    stats.append(defaultdict(list))
                for i, value in enumerate(row[4:]):
                    series = stats[tid][stat[i]]
                    if len(series) < (itr + 1):
                        series.append([])
                    series[itr].append(float(value))
                    assert len(series[itr]) == (bid + 1)
                last_row = (row[0], tid, bid)
        return last_row

    def _load_baseline(self):
        """Parse fbaseline (performance statistics of the whole app)."""
        self.logger.debug(
            'Parsing baseline file: {}'.format(self.fbaselineFN))
        try:
            last_row = self._read_stats_csv(self.fbaselineFN, self.STATSt)
        except (IOError, OSError) as err:
            self.logger.error(
                '{} fbaselineFN: {}'.format(err, self.fbaselineFN))
            return False
        if last_row is None:
            self.logger.error(
                'No statistics found in baseline file: {}'.format(self.fbaselineFN))
            return False

        self.miniApp, tid, _bid = last_row
        self.iterations = len(self.STATSt[tid]['Cycles'])
        self.nthreads = len(self.STATSt)
        return True

    def _load_barriers(self):
        """Parse gbarriers (performance statistics per BP)."""
        self.logger.debug(
            'Parsing barriers file: {}'.format(self.gbarriersFN))
        try:
            last_row = self._read_stats_csv(self.gbarriersFN, self.STATSbp)
        except (IOError, OSError) as err:
            self.logger.error(
                '{0} gbarriersFN: {1}'.format(err, self.gbarriersFN))
            return False
        if last_row is None:
            self.logger.error(
                'No statistics found in barriers file: {}'.format(self.gbarriersFN))
            return False

        # Last line always contains the highest Barrier ID
        self.barriers = last_row[2]
        return True

    def Metric(self, metric, tid, bid=0):
        """Return the average across iterations and stdev for thread `tid`
        of `metric`, measured at barrier `bid` of the baseline run."""
        values = [run[bid] for run in self.STATSt[tid][metric]]
        return _mean_std(values)

    def EstMetric(self, metric, tid, bps_id=0):
        """Return the estimated average across iterations and stdev for thread
        `tid` of `metric`, reconstructed from BP set `bps_id`.

        Each iteration's estimate is the sum over the selected barrier points
        of multiplier * per-barrier value. Barrier points without measured
        data are skipped.
        """
        values = [0.0 for _ in range(self.iterations)]
        per_iteration = self.STATSbp[tid][metric]
        n_barriers = len(per_iteration[0]) if per_iteration else 0

        for mult, bp in zip(self.BPs_mult[bps_id], self.BPs[bps_id]):
            # Valid barrier indices are 0 .. n_barriers - 1
            if bp >= n_barriers:
                continue
            values = [acc + mult * run[bp]
                      for acc, run in zip(values, per_iteration)]

        return _mean_std(values)


def plotError(plots_data, outpath, debug, suffix, plot_format):
    """Plot the reconstruction error estimations for different metrics
    and different BP iterations.

    One figure is written per BP identification suffix; it holds one subplot
    per BP repetition, and each subplot shows one error bar (with std dev) per
    metric and per performance counter run in `plots_data`.

    plots_data  -- list of errorEstimate objects (one per perfcntr suffix)
    outpath     -- directory for the figure (created if missing)
    debug       -- unused, kept for interface compatibility
    suffix      -- BP identification suffix, used in title and file name
    plot_format -- "pdf" or "png" (anything else falls back to "pdf")
    """
    logger = logging.getLogger('errorEstimate')
    logger.info("Plotting Error")

    if not plots_data:
        logger.warning("No performance counter data to plot for {}".format(suffix))
        return

    # The non-interactive backend has to be selected before pyplot is
    # imported, so both happen here, right before the first plotting call.
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt

    # Check plot format (available formats: pdf, png)
    if plot_format not in ("pdf", "png"):
        logger.warning(
            "Format {} not supported for the plot figures. "
            "Please choose between png or pdf. Using pdf format.".format(plot_format))
        plot_format = "pdf"

    # Number of plots in the figure (N rows, 2 columns for better visibility).
    # add_subplot needs integers; 2.0 keeps the division exact on Python 2.
    set_ids = sorted(plots_data[0].BPs.keys())
    n_y = 2
    n_x = max(1, int(np.ceil(len(set_ids) / 2.0)))

    # Metrics to plot
    metrics = ['Cycles', 'Instructions', 'L1D', 'L2D']
    metrics_labels = ['Cycles', 'Instrs', 'L1DMiss', 'L2DMiss']

    # Set plot figure size
    # How many bars per metric in the plots
    num_bars = len(plots_data)
    width_sep = 0.9
    # The more bars in the plot, the wider the figure (better readability)
    fig_width = 12. + 0.5 * num_bars
    fig = plt.figure(figsize=(fig_width, 12.))
    fig.subplots_adjust(hspace=0.5)
    hatches = [' ', '///', '---', '\\\\\\']

    # One subplot per BP repetition
    for i, bps in enumerate(set_ids):
        axis = fig.add_subplot(n_x, n_y, i + 1)
        logger.info("i: {}, bps: {}".format(i, bps))
        ind = np.arange(len(metrics))  # X positions of the plot
        width = width_sep / num_bars  # Width of the bars

        # Multiple bars (one for each perfcntrs suffix) for each of the four metrics
        for j, m in enumerate(metrics):
            logger.info("Metrics: {}".format(m))
            stat_name = m if m in ('Cycles', 'Instructions') else m + 'Misses'
            for k, suf in enumerate(plots_data):
                Yl, Yerrl = [], []
                # Calculate the average values of all threads for the metrics
                for tid in range(suf.nthreads):
                    b, _berr = suf.Metric(stat_name, tid)
                    y, yerr = suf.EstMetric(stat_name, tid, bps_id=bps)

                    logger.info("b={}, y={}".format(b, y))

                    Yl.append(abs(y / b - 1) * 100)
                    Yerrl.append(100 * yerr / b)

                axis.bar(x=ind[j] + k * width, height=np.mean(Yl),
                         width=width, edgecolor='black',
                         hatch=hatches[k % len(hatches)], yerr=max(Yerrl))

            # Resets the color cycle for each metric
            axis.set_prop_cycle(None)

        # Pretty axis
        _ymin, ymax = axis.get_ylim()
        axis.set_ylim(0.0, max(ymax, 1.5))
        plt.grid(True)

        # Center xtick for any number of bars
        axis.set_xticks(ind + (width_sep / 2) - (width / 2))
        axis.set_xticklabels(metrics_labels)
        axis.set_ylabel('Error [%]')
        axis.set_title('BarrierPoint Set {}'.format(i), fontsize=11)

    # Legend placement
    _legend = [data.prfcntrs_suf for data in plots_data]
    plt.legend(_legend, title="Perfcntr runs", loc="center",
               bbox_to_anchor=(-0.13, -0.5), ncol=4)
    plt.suptitle('Estimation Errors\n(Barrierpoint identification: {}.{}t-{})'.
                 format(plots_data[0].miniApp, plots_data[0].nthreads, suffix), fontsize=18)

    # Set figure name and save it
    figname = '{}/{}.{}t.Error-{}.{}'.format(
        outpath, plots_data[0].miniApp, plots_data[0].nthreads, suffix, plot_format)

    logger.info("Figname: {}".format(figname))

    outdir = os.path.dirname(figname)
    if not os.path.exists(outdir):
        try:
            os.makedirs(outdir)
        except OSError as err:
            logger.error(
                "Could not create plot directory {}: {}".format(outdir, err))
            sys.exit(1)

    fig.savefig(figname, format=plot_format, bbox_inches='tight')

    fig.clf()
    plt.close(fig)


def find_csv_file(name, path):
    """Locate the perfcntr CSV pair stored directly inside `path`.

    Returns (check_csv, BP_check_csv): the file whose name contains `name`
    but not "BP", and the one containing both. Hidden files are ignored.
    Logs an error and exits with status 1 if the directory cannot be listed,
    a file is missing, or the "repN" parts of the two names differ.
    """
    logger = logging.getLogger('errorEstimate')

    # Only the files of `path` itself are relevant (first os.walk entry)
    top = next(os.walk(path), None)
    if top is None:
        logger.error("Directory {} not found or not readable".format(path))
        sys.exit(1)

    files = sorted(f for f in top[2] if not f.startswith('.'))  # ignore hidden files
    # Look for the *.check.csv files
    check_csv = [s for s in files if name in s and "BP" not in s]
    if len(check_csv) == 0:
        logger.error("CSV file `xx.{}.csv` not found in {}".format(name, path))
        sys.exit(1)

    BP_check_csv = [s for s in files if name in s and "BP" in s]
    if len(BP_check_csv) == 0:
        logger.error("CSV file `xx.BP.repX.{}.csv` not found in {}".format(name, path))
        sys.exit(1)

    # Sanity check - repetition number in the filename has to match
    rep = [s for s in check_csv[0].split('.') if "rep" in s]
    rep_bp = [s for s in BP_check_csv[0].split('.') if "rep" in s]
    try:
        same_rep = int(rep[0][3:]) == int(rep_bp[0][3:])
    except (IndexError, ValueError):
        logger.error("CSV file names do not contain a repetition number: {}, {}".format(
            check_csv, BP_check_csv))
        sys.exit(1)
    if not same_rep:
        logger.error("CSV files do not have a match rep number {} != {}".format(
            check_csv, BP_check_csv))
        sys.exit(1)
    return check_csv[0], BP_check_csv[0]


def main():
    """Read the JSON configuration and generate the error plots."""

    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config_file',
                        required=True, help='JSON config file')
    # parser.add_argument('-m', '--metrics', default=0, help='Plot Metrics per BP')

    args = parser.parse_args()

    # Decode JSON configuration file
    with open(args.config_file, 'r') as f:
        json_data = json.load(f)

    # rootdir = json_data["paths"]["rootdir"]
    outpath = json_data["paths"]["outpath"]
    plotpath = json_data["paths"]["plotpath"]
    apps = json_data["Application"]
    nthreads = json_data["threads"]
    plot_format = json_data["plot_format"]
    debug_mode = json_data["Debug"]

    # Define the logger (debug is written to an out file)
    formatter = logging.Formatter('[%(asctime)s] [%(levelname)s] - (%(name)s) - %(message)s')
    handler = logging.FileHandler("debug-errorEstimate.log", "w")
    handler.setFormatter(formatter)

    # Console handler (only INFO level)
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    ch.setFormatter(formatter)

    logger = logging.getLogger("errorEstimate")
    logger.setLevel(logging.DEBUG if debug_mode else logging.INFO)

    # Add handlers to logger
    logger.addHandler(handler)
    logger.addHandler(ch)

    required_keys = ("Barriers_suffix", "Perfcntrs_suffix")

    # Generate error plots for every combination of applications and threads
    for (ap, nt) in product(apps, nthreads):
        logger.info("== Plotting {}.{}t".format(ap, nt))
        app_cfg = apps[ap]

        # Detect errors in the configuration file; an application that is
        # not fully configured is reported and skipped.
        missing = [key for key in required_keys if key not in app_cfg]
        if missing:
            for key in missing:
                logger.error(
                    "{} parameter not found in the configuration file for {}".format(key, ap))
            continue
        empty = [key for key in required_keys if not app_cfg[key]]
        if empty:
            for key in empty:
                logger.warning("{} list is empty for {}".format(key, ap))
            continue

        # Generate plots for each identified BP set (marked by different suffixes)
        for bp in app_cfg['Barriers_suffix']:
            logger.debug("= Barrier suffix: {}".format(bp))

            plots_data = []
            bp_filename = "{}-{}.{}t".format(ap, bp, nt)
            bbpoints = "{}/barriers/{}".format(outpath, bp_filename)

            # Plot the reconstruction error estimate (selected BPs vs. whole app)
            # of each perfcntr run against the set of identified BPs
            for prfcntrs in app_cfg['Perfcntrs_suffix']:
                logger.debug("= Perfcntr suffix: {}".format(prfcntrs))
                perf_filename = "{}-{}.{}t".format(ap, prfcntrs, nt)
                perf_dir = "{}/bperf.{}/{}".format(outpath,
                                                   prfcntrs, perf_filename)
                # Check the output directory for the perfcntrs CSV files
                fbaseline, gbarriers = find_csv_file("check", perf_dir)
                fbaseline = "{}/{}".format(perf_dir, fbaseline)
                gbarriers = "{}/{}".format(perf_dir, gbarriers)
                logger.debug("\n> fbaseline: {}\n> gbarriers: {}\n> bbpoints: {}".format(
                    fbaseline, gbarriers, bbpoints))

                plots_data.append(errorEstimate(
                    fbaseline, gbarriers, bbpoints, prfcntrs))

            plotError(plots_data, plotpath, debug_mode, bp, plot_format)


if __name__ == '__main__':
    main()

# --------------------------------------------------------------------------------
# /dynamorio_client/main.cpp:
-------------------------------------------------------------------------------- 1 | /* ********************************************************** 2 | * Copyright (c) 2020, Arm Limited and Contributors. 3 | * **********************************************************/ 4 | 5 | /* 6 | * SPDX-License-Identifier: Apache-2.0 7 | * 8 | * Licensed under the Apache License, Version 2.0 (the "License"); 9 | * you may not use this file except in compliance with the License. 10 | * You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | */ 20 | 21 | /* ********************************************************** 22 | * Copyright (c) 2011-2018 Google, Inc. All rights reserved. 23 | * Copyright (c) 2010 Massachusetts Institute of Technology All rights reserved. 24 | * **********************************************************/ 25 | 26 | /* 27 | * Redistribution and use in source and binary forms, with or without 28 | * modification, are permitted provided that the following conditions are met: 29 | * 30 | * * Redistributions of source code must retain the above copyright notice, 31 | * this list of conditions and the following disclaimer. 32 | * 33 | * * Redistributions in binary form must reproduce the above copyright notice, 34 | * this list of conditions and the following disclaimer in the documentation 35 | * and/or other materials provided with the distribution. 36 | * 37 | * * Neither the name of Google, Inc., nor the names of its contributors may be 38 | * used to endorse or promote products derived from this software without 39 | * specific prior written permission. 
40 | * 41 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 42 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 43 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 44 | * ARE DISCLAIMED. IN NO EVENT SHALL GOOGLE, INC. OR CONTRIBUTORS BE LIABLE 45 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 46 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 47 | * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 48 | * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 49 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 50 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 51 | * DAMAGE. 52 | */ 53 | 54 | /* DynamoRIO client to detect OMP synchronization points in the code and gather 55 | * basic block (BB) information as well as the Least Recently Used (LRU) stack distance 56 | * for each identified barrier. 57 | * This client works by: 58 | * - Parsing the application libraries at load time and adding callbacks to all relevant 59 | * OMP synchronization functions. These callbacks are private to each thread. 60 | * - During execution, each thread gathers its own performance statistics, 61 | * basic blocks (BB) and Least Recently Used (LRU) stack distance in its TLS. 62 | * - For each BB, gather number of instructions (event_app_instruction) 63 | * and memory references (instrument_mem). 64 | * - The memory references are stored into a treap structure (thread_data.cpp) 65 | * to calculate the LRU.
66 | * - All the collected data is computed and outputed to simpoint-ready files (barrierpoint.cpp) 67 | */ 68 | 69 | /* DynamoRIO dependencies */ 70 | #include "dr_api.h" 71 | #include "drmgr.h" 72 | #include "drwrap.h" 73 | #include "drreg.h" 74 | #include "drutil.h" 75 | #include "drsyms.h" 76 | #include "droption.h" 77 | #include "drx.h" 78 | /* BarrierPoint dependencies */ 79 | #include "region.hpp" 80 | #include "thread_data.hpp" 81 | #include "barrierpoint.hpp" 82 | 83 | #include 84 | #include 85 | #include 86 | #include 87 | 88 | void register_trace_events(void); 89 | void unregister_trace_events(void); 90 | static void event_exit(void); 91 | static void event_omp_parallel(void *wrapcxt, OUT void **user_data); 92 | static void event_roi_init(void *wrapcxt, OUT void **user_data); 93 | static void event_roi_end(void *wrapcxt, OUT void **user_data); 94 | static bool enumerate_symbols_event(const char *name, size_t modoffs, void *data); 95 | static void event_barrier(void *wrapcxt, OUT void **user_data); 96 | static void event_single(void *wrapcxt, OUT void **user_data); 97 | 98 | /* Thread specific callback functions */ 99 | static void event_thread_init(void* drcontext); 100 | static void event_thread_exit(void* drcontext); 101 | 102 | void save_data(void *drcontext, std::string omp_name); 103 | 104 | /* Index used for keeping track of thread local storage */ 105 | static void register_roi_f(const module_data_t *mod); 106 | static int tls_idx; 107 | 108 | /* Main module initial address. We use this to filter out whether a basic block 109 | * belongs to the application itself or to a dynamically loaded module. 
110 | */ 111 | static app_pc exe_start; 112 | reg_id_t tls_seg; 113 | uint tls_offs; 114 | 115 | #define MINSERT instrlist_meta_preinsert 116 | 117 | typedef struct{ 118 | char const *f_name; 119 | void (*f_pre)(void *wrapcxt, OUT void **user_data); 120 | void (*f_post)(void *wrapcxt, void *user_data); 121 | } wrap_callback_t; 122 | 123 | static bool in_roi=false; 124 | static bool roi_has_ended=false; 125 | static bool roi_start_detected = false; 126 | static bool roi_end_detected = false; 127 | 128 | /* Mutex global variable for synchronizing access to all_thread_data */ 129 | BarrierPoint barrierpoint; 130 | static void *synch_mutex; 131 | 132 | /* Optional client option for tracing basic block vectors belonging to dynamic libraries */ 133 | static droption_t trace_libraries( 134 | DROPTION_SCOPE_CLIENT, "trace_libraries", false, 135 | "Count, in addition, lib instructions", 136 | "Count, along with the instructions in the application itself, the instructions in " 137 | "shared libraries." 
138 | ); 139 | 140 | /* Output folder and file option */ 141 | static droption_t out_path( 142 | DROPTION_SCOPE_CLIENT, "out_path", "./barrierpoint", 143 | "Specify output folder and file name", 144 | "Specify output folder and file name" 145 | ); 146 | 147 | uint64_t cache_line_mask; 148 | 149 | droption_t cache_line_size_bytes( 150 | DROPTION_SCOPE_CLIENT, "cache_line_size_bytes", 64, 151 | "Trace each single specific memory address instead of masking them with the cache line block", 152 | "Trace each single specific memory address instead of masking them with the cache line block" 153 | ); 154 | 155 | /* Option to ignore identified barriers generated by OMP_SINGLE */ 156 | static droption_t ignore_single( 157 | DROPTION_SCOPE_CLIENT, "ignore_single", false, 158 | "Ignore omp single and thus its corresponding implicit parallel", 159 | "Ignore omp single and thus its correspondign implicit parallel" 160 | ); 161 | 162 | #define GOMP_F_SIZE 2 163 | 164 | /* OMP function names to track */ 165 | static wrap_callback_t gomp_f[] = { 166 | {.f_name="GOMP_barrier", .f_pre=event_barrier, .f_post=NULL}, 167 | {.f_name="GOMP_single_start", .f_pre=event_single, .f_post=NULL} 168 | }; 169 | 170 | /* Regions of interest */ 171 | static wrap_callback_t roi_f[] = { 172 | {.f_name="SimRoiStart", .f_pre=event_roi_init, .f_post=nullptr}, 173 | {.f_name="SimRoiEnd", .f_pre=event_roi_end, .f_post=nullptr} 174 | }; 175 | 176 | /* Dictionary of functions called right afterwards the parallel omp function call. 177 | * We use this to understand, when called back, their exact function name. 178 | */ 179 | std::unordered_map parallel_omp_f; 180 | 181 | /* Function list to be registered for a callback */ 182 | std::vector> omp_functions; 183 | 184 | 185 | /* Callback called upon module loading for matching all of the 186 | * non-synchronization but omp related function calls. 
187 | */ 188 | static bool enumerate_symbols_event(const char *name, size_t modoffs, void *data){ 189 | std::string f_name(name); 190 | 191 | std::size_t omp_f_found = f_name.find("omp_fn."); 192 | /* Add the found openMP function call to this global variable. 193 | * We are going to register it in the register_omp function 194 | */ 195 | if(omp_f_found != std::string::npos) 196 | omp_functions.push_back(std::tuple(f_name, modoffs)); 197 | 198 | return true; 199 | } 200 | 201 | 202 | static void event_roi_init(void *wrapcxt, OUT void **user_data){ 203 | dr_printf("[DR] RoI has been initialized\n"); 204 | roi_start_detected = true; 205 | 206 | /* Register all the useful callbacks, but only once. 207 | * This may be executed by multiple threads. 208 | */ 209 | static bool only_once = true; 210 | if(only_once){ 211 | register_trace_events(); 212 | in_roi=true; 213 | only_once = false; 214 | }else 215 | DR_ASSERT_MSG(false,"[DR] ERROR: You have defined multiple regions of interest"); 216 | } 217 | 218 | 219 | static void event_roi_end(void *wrapcxt, OUT void **user_data){ 220 | dr_printf("[DR] RoI has ended\n"); 221 | roi_end_detected = true; 222 | 223 | /* Unregister all the useful callbacks, but only once. 224 | * This may be executed by multiple threads. 
225 | */ 226 | static bool only_once = true; 227 | if(only_once){ 228 | unregister_trace_events(); 229 | in_roi=false; 230 | roi_has_ended=true; 231 | only_once = false; 232 | }else 233 | DR_ASSERT_MSG(false, "[DR] ERROR: You have defined multiple regions of interest"); 234 | 235 | /* roi_end is equivalent to 'thread_exit' for the master thread: we save here 236 | * its last synchronization point 237 | */ 238 | ThreadData *data = reinterpret_cast( 239 | drmgr_get_tls_field(drwrap_get_drcontext(wrapcxt), tls_idx)); 240 | if(data->is_master) 241 | save_data(drwrap_get_drcontext(wrapcxt), "thread_exit"); 242 | else 243 | DR_ASSERT_MSG(false, "[DR] ERROR: Thread executing ROI end function is NOT master"); 244 | } 245 | 246 | 247 | static void register_roi_f(const module_data_t *mod){ 248 | /* Initialize drsyms */ 249 | for(int i=0; i < GOMP_F_SIZE ; i++){ 250 | size_t modoffs = 0; 251 | drsym_error_t symres = DRSYM_ERROR; 252 | symres = drsym_lookup_symbol(mod->full_path, roi_f[i].f_name, &modoffs, DRSYM_DEMANGLE); 253 | if(symres == DRSYM_SUCCESS){ 254 | app_pc startup_wrap = modoffs + mod->start; 255 | dr_printf("[DR] wrapping %s @" PFX "\n", roi_f[i].f_name, startup_wrap); 256 | drwrap_wrap(startup_wrap,roi_f[i].f_pre, roi_f[i].f_post); 257 | } 258 | } 259 | } 260 | 261 | 262 | /* After OMP parallel, the compiler adds in the code an invocation for a symbol containing 263 | * "FUNCTION_WHERE_OMP_PARALLEL_HAS_BEEN_DEFINED.omp_fn.INCREMENTAL_NUMBER". 264 | * We want to trace that back in such a way that in the bp_id file we know 265 | * exactly where in the code the omp parallel invocations are defined in the code. 
266 | */ 267 | static void register_omp_f(const module_data_t * mod){ 268 | drsym_enumerate_symbols(mod->full_path, enumerate_symbols_event, NULL, DRSYM_LEAVE_MANGLED); 269 | /* If a new omp function has been found in a global variable, register that function */ 270 | size_t modoffs; 271 | std::string f_name; 272 | if(!omp_functions.empty()){ 273 | for(auto& f : omp_functions){ 274 | std::tie(f_name, modoffs) = f; 275 | app_pc startup_wrap = modoffs + mod->start ; 276 | dr_fprintf(STDERR, "[DR] wrapping %s @" PFX "\n", f_name.c_str(), startup_wrap); 277 | drwrap_wrap(startup_wrap, event_omp_parallel, nullptr); 278 | /* Register the wrapped functions in a dictionary to be used in the callback event */ 279 | auto search = parallel_omp_f.find(startup_wrap); 280 | if(search == parallel_omp_f.end()) 281 | parallel_omp_f[startup_wrap] = f_name; 282 | } 283 | omp_functions.clear(); 284 | } 285 | } 286 | 287 | 288 | static void event_omp_parallel(void *wrapcxt, OUT void **user_data){ 289 | /* Get current virtual address for this function */ 290 | app_pc f_addr = drwrap_get_func(wrapcxt); 291 | /* Gather the current function name matching the current address to the 292 | * parallel_omp_f dictionary. 
293 | */ 294 | auto search = parallel_omp_f.find(f_addr); 295 | if(search != parallel_omp_f.end()){ 296 | if(in_roi){ 297 | void* drcontext = drwrap_get_drcontext(wrapcxt); 298 | save_data(drcontext, search->second); 299 | } 300 | }else{ 301 | dr_printf("[DR] ERROR: Failed looking up for function at address: " PFX "\n", f_addr); 302 | DR_ASSERT(false); 303 | } 304 | } 305 | 306 | #ifdef VALIDATE 307 | static file_t modules_f; 308 | #endif 309 | 310 | 311 | /* Register omp callback functions when loading libGOMP, roi_init and roi_end */ 312 | static void module_load_event(void *drcontext, const module_data_t *mod, bool loaded){ 313 | #ifdef VALIDATE 314 | static bool first_time = true; 315 | /* Keep an external file with a list of all loaded modules (for validation only) */ 316 | if(first_time){ 317 | modules_f = dr_open_file("loaded.modules", DR_FILE_WRITE_OVERWRITE); 318 | dr_fprintf(modules_f, "[DR] loading %s @" PFX "\n", mod->full_path, mod->start); 319 | first_time = false; 320 | }else 321 | dr_fprintf(modules_f, "[DR] loading %s @" PFX "\n", mod->full_path, mod->start); 322 | #endif 323 | 324 | /* Register ROI detection functions */ 325 | register_roi_f(mod); 326 | /* Register omp generic functions */ 327 | register_omp_f(mod); 328 | 329 | /* Register function call */ 330 | const char *ret = strstr(dr_module_preferred_name(mod), "libgomp.so"); 331 | if (ret != NULL){ 332 | for(int i=0; i < GOMP_F_SIZE; i++){ 333 | app_pc towrap = (app_pc)dr_get_proc_address(mod->handle, gomp_f[i].f_name); 334 | if (towrap != NULL){ 335 | bool ok = drwrap_wrap(towrap, gomp_f[i].f_pre, gomp_f[i].f_post); 336 | if (ok) 337 | dr_fprintf(STDERR, "[DR] wrapped %s @" PFX "\n", gomp_f[i].f_name, towrap); 338 | else 339 | dr_fprintf(STDERR, "[DR] FAILED to wrap %s @" PFX 340 | ": already wrapped?\n", gomp_f[i].f_name ,towrap); 341 | } 342 | } 343 | } 344 | } 345 | 346 | 347 | /* Dumps the memory reference info to the log file */ 348 | static void clean_call(uint64_t inst_count, 
uint64_t address){ 349 | void *drcontext = dr_get_current_drcontext(); 350 | ThreadData *data = reinterpret_cast(drmgr_get_tls_field(drcontext, tls_idx)); 351 | #ifdef TRACE_MEM_BEFORE_ROI 352 | /* Save basic blocks */ 353 | if(in_roi) 354 | data->add_bb(address, inst_count); 355 | if(!roi_has_ended) /* Save memory addresses being accessed in this bb */ 356 | data->add_address(); 357 | else 358 | data->clean_buffer(); 359 | #else 360 | if(in_roi){ 361 | data->add_bb(address, inst_count); 362 | data->add_address(); 363 | }else 364 | data->clean_buffer(); 365 | #endif 366 | } 367 | 368 | 369 | static void 370 | insert_load_buf_ptr(void *drcontext, instrlist_t *ilist, instr_t *where, reg_id_t reg_ptr){ 371 | dr_insert_read_raw_tls(drcontext, ilist, where, tls_seg, 372 | tls_offs + MEMTRACE_TLS_OFFS_BUF_PTR, reg_ptr); 373 | } 374 | 375 | 376 | static void 377 | insert_update_buf_ptr(void *drcontext, instrlist_t *ilist, instr_t *where, 378 | reg_id_t reg_ptr, int adjust){ 379 | MINSERT( 380 | ilist, where, 381 | XINST_CREATE_add(drcontext, opnd_create_reg(reg_ptr), OPND_CREATE_INT16(adjust)) 382 | ); 383 | dr_insert_write_raw_tls(drcontext, ilist, where, tls_seg, 384 | tls_offs + MEMTRACE_TLS_OFFS_BUF_PTR, reg_ptr); 385 | } 386 | 387 | 388 | static void 389 | insert_save_addr(void *drcontext, instrlist_t *ilist, instr_t *where, opnd_t ref, 390 | reg_id_t reg_ptr, reg_id_t reg_addr){ 391 | bool ok; 392 | /* we use reg_ptr as scratch to get addr */ 393 | ok = drutil_insert_get_mem_addr(drcontext, ilist, where, ref, reg_addr, reg_ptr); 394 | DR_ASSERT(ok); 395 | insert_load_buf_ptr(drcontext, ilist, where, reg_ptr); 396 | MINSERT(ilist, where, 397 | XINST_CREATE_store(drcontext, 398 | OPND_CREATE_MEMPTR(reg_ptr, offsetof(mem_ref_t, addr)), 399 | opnd_create_reg(reg_addr))); 400 | } 401 | 402 | 403 | /* Insert inline code to add a memory reference info entry into the buffer */ 404 | static void 405 | instrument_mem(void *drcontext, instrlist_t *ilist, instr_t *where, opnd_t 
ref, 406 | bool write){ 407 | #ifdef VALIDATE 408 | ThreadData *data = reinterpret_cast(drmgr_get_tls_field(drcontext, tls_idx)); 409 | dr_fprintf(data->memory_access_file ,"[DR] TID %d mem instrumenting PC " PFX "\n", 410 | data->tid, instr_get_app_pc(where)); 411 | dr_flush_file(data->disassemble_file); 412 | #endif 413 | 414 | /* We need two scratch registers */ 415 | reg_id_t reg_ptr, reg_tmp; 416 | if (drreg_reserve_register(drcontext, ilist, where, NULL, ®_ptr) != DRREG_SUCCESS || 417 | drreg_reserve_register(drcontext, ilist, where, NULL, ®_tmp) !=DRREG_SUCCESS){ 418 | DR_ASSERT(false); /* cannot recover */ 419 | return; 420 | } 421 | /* Inject code that saves the address into the buffer */ 422 | insert_save_addr(drcontext, ilist, where, ref, reg_ptr, reg_tmp); 423 | insert_update_buf_ptr(drcontext, ilist, where, reg_ptr, sizeof(mem_ref_t)); 424 | /* Restore scratch registers */ 425 | if (drreg_unreserve_register(drcontext, ilist, where, reg_ptr) != DRREG_SUCCESS || 426 | drreg_unreserve_register(drcontext, ilist, where, reg_tmp) != DRREG_SUCCESS) 427 | DR_ASSERT(false); 428 | } 429 | 430 | 431 | /* For each memory reference app instr, we insert inline code to fill the buffer 432 | * with an instruction entry and memory reference entries. 433 | */ 434 | static dr_emit_flags_t 435 | event_app_instruction(void *drcontext, void *tag, instrlist_t *bb, instr_t *instr, 436 | bool for_trace, bool translating, void *user_data){ 437 | /* drmgr enables auto-predication by default, which predicates 438 | * all instructions with the predicate of the current instruction on Arm. 439 | * We disable it because we want to unconditionally execute the 440 | * following lines of instrumentation 441 | */ 442 | drmgr_disable_auto_predication(drcontext,bb); 443 | 444 | /* By default, take into account only BBs belonging to the application itself. 
445 | * Setting the 'trace_libraries' parameter enables DR to trace all dynamic libraries 446 | */ 447 | module_data_t *mod = dr_lookup_module(dr_fragment_app_pc(tag)); 448 | if (!trace_libraries.get_value()) 449 | if (mod != NULL){ 450 | bool belongs_to_app = (mod->start == exe_start); 451 | dr_free_module_data(mod); 452 | if (!belongs_to_app) 453 | return DR_EMIT_DEFAULT; 454 | } 455 | 456 | if (instr_is_app(instr)){ 457 | if (instr_reads_memory(instr) || instr_writes_memory(instr)){ 458 | int i; 459 | /* insert code to add an entry for each memory reference opnd */ 460 | for (i = 0; i < instr_num_srcs(instr); i++){ 461 | if (opnd_is_memory_reference(instr_get_src(instr, i))) 462 | instrument_mem(drcontext, bb, instr, instr_get_src(instr, i), false); 463 | } 464 | 465 | for (i = 0; i < instr_num_dsts(instr); i++){ 466 | if (opnd_is_memory_reference(instr_get_dst(instr, i))) 467 | instrument_mem(drcontext, bb, instr, instr_get_dst(instr, i), true); 468 | } 469 | } 470 | 471 | if(drmgr_is_first_instr(drcontext, instr)){ 472 | /* Extract the number of instructions in the basic block */ 473 | uint64_t instr_count = 0; 474 | instr_t *instr_it; 475 | for(instr_it = instrlist_first_app(bb); instr_it != NULL; 476 | instr_it = instr_get_next_app(instr_it)){ 477 | if(!instr_is_meta(instr_it)) 478 | instr_count++; 479 | } 480 | uint64_t address = reinterpret_cast(tag); 481 | 482 | if (/* XXX i#1698: there are constraints for code between ldrex/strex pairs, 483 | * so we minimize the instrumentation in between by skipping the clean call. 484 | * As we're only inserting instrumentation on a memory reference, and the 485 | * app should be avoiding memory accesses in between the ldrex...strex, 486 | * the only problematic point should be before the strex. 487 | * However, there is still a chance that the instrumentation code may clear the 488 | * exclusive monitor state. 
489 | * Using a fault to handle a full buffer should be more robust, and the 490 | * forthcoming buffer filling API (i#513) will provide that. 491 | */ 492 | IF_AARCHXX_ELSE(!instr_is_exclusive_store(instr), true)){ 493 | dr_insert_clean_call(drcontext, bb, instr, (void *)clean_call, false, 2, 494 | OPND_CREATE_INT64(instr_count), OPND_CREATE_INT64(address)); 495 | } 496 | /* Since we want to keep track for the execution of a whole basic block, we save its 497 | * statistics the first time we see it and skip the other ones. */ 498 | 499 | #ifdef VALIDATE 500 | ThreadData *data = reinterpret_cast(drmgr_get_tls_field(drcontext, tls_idx)); 501 | instrlist_disassemble(drcontext, (app_pc)tag, bb, data->disassemble_file); 502 | dr_flush_file(data->disassemble_file); 503 | #endif 504 | 505 | } 506 | } 507 | return DR_EMIT_DEFAULT; 508 | } 509 | 510 | 511 | /* Memtrace Callback. 512 | * We transform string loops into regular loops so that we can more easily 513 | * monitor every memory reference. 
514 | */ 515 | static dr_emit_flags_t 516 | event_bb_app2app(void *drcontext, void *tag, instrlist_t *bb, bool for_trace, 517 | bool translating){ 518 | DR_ASSERT_MSG(drutil_expand_rep_string(drcontext, bb), "[DR] Error: Failed expanding rep string"); 519 | return DR_EMIT_DEFAULT; 520 | } 521 | 522 | 523 | void register_trace_events(void){ 524 | /* Ignore thread information that started and finished execution before the ROI */ 525 | drmgr_register_thread_exit_event(event_thread_exit); 526 | } 527 | 528 | 529 | void unregister_trace_events(void){ 530 | drmgr_unregister_thread_init_event(event_thread_init); 531 | drmgr_unregister_bb_app2app_event(event_bb_app2app); 532 | drmgr_unregister_bb_insertion_event(event_app_instruction); 533 | } 534 | 535 | 536 | /* Upon execution ending, save all the data we've gathered into files */ 537 | static void event_exit(void){ 538 | dr_printf("%s\n", "[DR] BarrierPoint client execution is ending"); 539 | drmgr_unregister_tls_field(tls_idx); 540 | drmgr_unregister_module_load_event(module_load_event); 541 | drmgr_unregister_thread_exit_event(event_thread_exit); 542 | /* Assert if ROI was set */ 543 | if(!roi_start_detected) 544 | DR_ASSERT_MSG(false, "[DR] SimRoiStart has not been detected. \ 545 | Have you defined it in the source code?"); 546 | if(!roi_end_detected) 547 | DR_ASSERT_MSG(false, "[DR] SimRoiEnd has not been detected. 
\ 548 | Have you defined it in the source code?"); 549 | 550 | /* Dump Traces into a file */ 551 | barrierpoint.save(out_path.get_value()); 552 | 553 | barrierpoint.free(); 554 | dr_mutex_destroy(synch_mutex); 555 | drwrap_exit(); 556 | drutil_exit(); 557 | drreg_exit(); 558 | drmgr_exit(); 559 | drsym_exit(); 560 | } 561 | 562 | 563 | static void event_barrier(void *wrapcxt, OUT void **user_data){ 564 | if(in_roi){ 565 | void* drcontext = drwrap_get_drcontext(wrapcxt); 566 | ThreadData *data = reinterpret_cast(drmgr_get_tls_field(drcontext, tls_idx)); 567 | if(!data->in_single) 568 | save_data(drcontext, "GOMP_barrier"); 569 | /* If set, ignore implicit barriers generated by OMP Single */ 570 | else{ 571 | dr_printf("[DR] Barrier Ignored due to a single\n"); 572 | data->in_single=false; 573 | } 574 | } 575 | } 576 | 577 | 578 | /* Called on synchronization points, this function saves all the current intra-barrier 579 | * region information gathered so far. 580 | */ 581 | void save_data(void *drcontext, std::string omp_name){ 582 | ThreadData *data = reinterpret_cast(drmgr_get_tls_field(drcontext, tls_idx)); 583 | data->save_barrier(omp_name); 584 | /* Update the synchronization point number */ 585 | if(data->is_master) 586 | barrierpoint.incr_synch_count(); 587 | } 588 | 589 | 590 | /* Updates the thread internal status for tracking that the next barrier 591 | * will be implicitly generated by omp single 592 | */ 593 | static void event_single(void *wrapcxt, OUT void **user_data){ 594 | if(in_roi && ignore_single.get_value()){ 595 | ThreadData *data = reinterpret_cast(drmgr_get_tls_field(drwrap_get_drcontext(wrapcxt), tls_idx)); 596 | data->in_single = true; 597 | } 598 | } 599 | 600 | 601 | /* Initialize, per each thread created, its corresponding data structure representation. 602 | * If a thread is spawned outside of the ROI, it will not take into account 603 | * synchronization points or bb/ldv collecting. 
*/ 604 | static void event_thread_init(void* drcontext){ 605 | static bool first_time_called = true; 606 | bool is_master_thread = false; 607 | 608 | if(first_time_called){ 609 | is_master_thread = true; 610 | first_time_called = false; 611 | } 612 | 613 | /* Allocate per thread specific data structure */ 614 | ThreadData* data = reinterpret_cast(dr_thread_alloc(drcontext, sizeof(data))); 615 | DR_ASSERT_MSG(data != NULL,"Failed Allocating per_thread data"); 616 | data = new ThreadData{dr_get_thread_id(drcontext),is_master_thread, barrierpoint.synch_count}; 617 | drmgr_set_tls_field(drcontext,tls_idx,data); 618 | } 619 | 620 | 621 | /* Upon thread exiting, we save all the important collected data. 622 | * event_thread_exit can be seen as a "Synchronization point". 623 | */ 624 | static void event_thread_exit(void* drcontext){ 625 | ThreadData *data = reinterpret_cast(drmgr_get_tls_field(drcontext, tls_idx)); 626 | /* We treat thread_exit as a last synchronization point. 627 | * The master thread has its correspondent in roi_end. 
628 | */ 629 | if(!data->is_master) 630 | save_data(drcontext, "thread_exit"); 631 | 632 | /* Add the new per_thread_t with global scope to all_thread_data */ 633 | dr_mutex_lock(synch_mutex); 634 | 635 | barrierpoint.add_thread_data(*(data)); 636 | 637 | dr_mutex_unlock(synch_mutex); 638 | 639 | #ifdef VALIDATE 640 | dr_close_file(data->disassemble_file); 641 | dr_close_file(data->memory_access_file); 642 | dr_close_file(data->runtime_bb_file); 643 | #endif 644 | 645 | dr_thread_free(drcontext, data, sizeof(data)); 646 | 647 | return; 648 | } 649 | 650 | 651 | DR_EXPORT void dr_client_main(client_id_t id, int argc, const char *argv[]){ 652 | dr_set_client_name("BarrierPoint client", 653 | "https://github.com/ARM-software/BarrierPoint"); 654 | dr_log(NULL, DR_LOG_ALL, 1, "[DR] Barrierpoint initializing\n"); 655 | if (dr_is_notify_on()) 656 | dr_fprintf(STDERR, "[DR] BarrierPoint is running\n"); 657 | if (!droption_parser_t::parse_argv(DROPTION_SCOPE_CLIENT, argc, argv, NULL, NULL)) 658 | DR_ASSERT_MSG(false, "[DR] ERROR: Could't parse correctly the flag options"); 659 | 660 | /* Initialize Cache Line mask */ 661 | cache_line_mask = ~(cache_line_size_bytes.get_value()-1); 662 | 663 | /* Get main module address */ 664 | module_data_t *exe = dr_get_main_module(); 665 | if (exe != NULL) 666 | exe_start = exe->start; 667 | else 668 | DR_ASSERT_MSG(false, "[DR] ERROR: Couldn't find where the main module starts"); 669 | dr_free_module_data(exe); 670 | 671 | /* We need 2 reg slots beyond drreg's eflag slots => 3 slots */ 672 | drreg_options_t ops = { sizeof(ops), 3, false}; 673 | /* Initialize the client */ 674 | if(!drmgr_init() || !drwrap_init() || drreg_init(&ops) != DRREG_SUCCESS || !drutil_init()) 675 | DR_ASSERT(false); 676 | 677 | drsym_init(0); 678 | 679 | /* Reserves a thread-local storage for every thread */ 680 | tls_idx = drmgr_register_tls_field(); 681 | DR_ASSERT_MSG(tls_idx > 0," drmgr_register_tls_field() Failed\n"); 682 | 
drmgr_register_module_load_event(module_load_event); 683 | /* We want to trace threads initialization even if we are outside of the ROI */ 684 | drmgr_register_thread_init_event(event_thread_init); 685 | dr_register_exit_event(event_exit); 686 | 687 | /* We instrument basic blocks no matter if they are inside the ROI or not */ 688 | drmgr_register_bb_instrumentation_event(nullptr, event_app_instruction, nullptr); 689 | drmgr_register_bb_app2app_event(event_bb_app2app, nullptr); 690 | 691 | /*Initialize the mutex */ 692 | synch_mutex = dr_mutex_create(); 693 | /* The TLS field provided by DR cannot be directly accessed from the code 694 | * cache. For better performance, we allocate a raw TLS so that we can 695 | * directly access and update it with a single instrucion. 696 | */ 697 | DR_ASSERT(dr_raw_tls_calloc(&tls_seg, &tls_offs, MEMTRACE_TLS_COUNT, 0)); 698 | } -------------------------------------------------------------------------------- /dynamorio_client/LICENSE-BSD-LGPL.txt: -------------------------------------------------------------------------------- 1 | Some parts of the DynamoRIO BarrierPoint client are based from existent DynamoRIO clients. 2 | Those clients are licensed under the following licenses: 3 | 4 | =========================================================================== 5 | Primary DynamoRIO License: BSD 6 | 7 | Copyright (c) 2010-2018 Google, Inc. licensed under the terms of the BSD. All other rights reserved. 8 | Copyright (c) 2000-2009 VMware, Inc. licensed under the terms of the BSD. All other rights reserved. 9 | 10 | Redistribution and use in source and binary forms, with or without 11 | modification, are permitted provided that the following conditions are met: 12 | 13 | * Redistributions of source code must retain the above copyright notice, 14 | this list of conditions and the following disclaimer. 
15 | 16 | * Redistributions in binary form must reproduce the above copyright notice, 17 | this list of conditions and the following disclaimer in the documentation 18 | and/or other materials provided with the distribution. 19 | 20 | * Neither the name of VMware, Inc. nor the names of its contributors may be 21 | used to endorse or promote products derived from this software without 22 | specific prior written permission. 23 | 24 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 25 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 | ARE DISCLAIMED. IN NO EVENT SHALL VMWARE, INC. OR CONTRIBUTORS BE LIABLE 28 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 30 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 31 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 34 | DAMAGE. 35 | 36 | =========================================================================== 37 | libelftc License 38 | 39 | The drsyms Extension contains libelftc code 40 | (http://elftoolchain.sourceforge.net/) in binary form for obtaining file 41 | and line information from DWARF2 debug information on Linux and 42 | Cygwin/MinGW. That libelftc code has the following copyright and license: 43 | 44 | Copyright (c) 2006-2012 Kai Wang, Joseph Koshy, et al. 45 | Copyright (c) 1990-2005 The Regents of the University of California, et al. 46 | All rights reserved. 47 | 48 | Redistribution and use in source and binary forms, with or without 49 | modification, are permitted provided that the following conditions 50 | are met: 51 | 1. 
Redistributions of source code must retain the above copyright 52 | notice, this list of conditions and the following disclaimer. 53 | 2. Redistributions in binary form must reproduce the above copyright 54 | notice, this list of conditions and the following disclaimer in the 55 | documentation and/or other materials provided with the distribution. 56 | 57 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 58 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 59 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 60 | ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 61 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 62 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 63 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 64 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 65 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 66 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 67 | SUCH DAMAGE. 68 | 69 | =========================================================================== 70 | Certain Extensions are instead under the LGPL 2.1 License 71 | 72 | The drwrap and drutil Extensions, along with Dr. Memory and all of its 73 | libraries in the Dr. Memory Framework (if packaged with DynamoRIO), are 74 | licensed under the LGPL 2.1 License and NOT the BSD license used for the 75 | rest of DynamoRIO. These Extensions are provided as libraries distinct 76 | from the rest of DynamoRIO. The details of this license are below: 77 | 78 | This library is free software; you can redistribute it and/or 79 | modify it under the terms of the GNU Lesser General Public 80 | License as published by the Free Software Foundation; 81 | version 2.1 of the License, and no later version. 
82 | 83 | This library is distributed in the hope that it will be useful, 84 | but WITHOUT ANY WARRANTY; without even the implied warranty of 85 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 86 | Library General Public License for more details. 87 | 88 | You should have received a copy of the GNU Lesser General Public 89 | License along with this library; if not, write to the Free Software 90 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 91 | 92 | 93 | GNU LESSER GENERAL PUBLIC LICENSE 94 | Version 2.1, February 1999 95 | 96 | Copyright (C) 1991, 1999 Free Software Foundation, Inc. 97 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 98 | Everyone is permitted to copy and distribute verbatim copies 99 | of this license document, but changing it is not allowed. 100 | 101 | [This is the first released version of the Lesser GPL. It also counts 102 | as the successor of the GNU Library Public License, version 2, hence 103 | the version number 2.1.] 104 | 105 | Preamble 106 | 107 | The licenses for most software are designed to take away your 108 | freedom to share and change it. By contrast, the GNU General Public 109 | Licenses are intended to guarantee your freedom to share and change 110 | free software--to make sure the software is free for all its users. 111 | 112 | This license, the Lesser General Public License, applies to some 113 | specially designated software packages--typically libraries--of the 114 | Free Software Foundation and other authors who decide to use it. You 115 | can use it too, but we suggest you first think carefully about whether 116 | this license or the ordinary General Public License is the better 117 | strategy to use in any particular case, based on the explanations below. 118 | 119 | When we speak of free software, we are referring to freedom of use, 120 | not price. 
Our General Public Licenses are designed to make sure that 121 | you have the freedom to distribute copies of free software (and charge 122 | for this service if you wish); that you receive source code or can get 123 | it if you want it; that you can change the software and use pieces of 124 | it in new free programs; and that you are informed that you can do 125 | these things. 126 | 127 | To protect your rights, we need to make restrictions that forbid 128 | distributors to deny you these rights or to ask you to surrender these 129 | rights. These restrictions translate to certain responsibilities for 130 | you if you distribute copies of the library or if you modify it. 131 | 132 | For example, if you distribute copies of the library, whether gratis 133 | or for a fee, you must give the recipients all the rights that we gave 134 | you. You must make sure that they, too, receive or can get the source 135 | code. If you link other code with the library, you must provide 136 | complete object files to the recipients, so that they can relink them 137 | with the library after making changes to the library and recompiling 138 | it. And you must show them these terms so they know their rights. 139 | 140 | We protect your rights with a two-step method: (1) we copyright the 141 | library, and (2) we offer you this license, which gives you legal 142 | permission to copy, distribute and/or modify the library. 143 | 144 | To protect each distributor, we want to make it very clear that 145 | there is no warranty for the free library. Also, if the library is 146 | modified by someone else and passed on, the recipients should know 147 | that what they have is not the original version, so that the original 148 | author's reputation will not be affected by problems that might be 149 | introduced by others. 150 | 151 | Finally, software patents pose a constant threat to the existence of 152 | any free program. 
We wish to make sure that a company cannot 153 | effectively restrict the users of a free program by obtaining a 154 | restrictive license from a patent holder. Therefore, we insist that 155 | any patent license obtained for a version of the library must be 156 | consistent with the full freedom of use specified in this license. 157 | 158 | Most GNU software, including some libraries, is covered by the 159 | ordinary GNU General Public License. This license, the GNU Lesser 160 | General Public License, applies to certain designated libraries, and 161 | is quite different from the ordinary General Public License. We use 162 | this license for certain libraries in order to permit linking those 163 | libraries into non-free programs. 164 | 165 | When a program is linked with a library, whether statically or using 166 | a shared library, the combination of the two is legally speaking a 167 | combined work, a derivative of the original library. The ordinary 168 | General Public License therefore permits such linking only if the 169 | entire combination fits its criteria of freedom. The Lesser General 170 | Public License permits more lax criteria for linking other code with 171 | the library. 172 | 173 | We call this license the "Lesser" General Public License because it 174 | does Less to protect the user's freedom than the ordinary General 175 | Public License. It also provides other free software developers Less 176 | of an advantage over competing non-free programs. These disadvantages 177 | are the reason we use the ordinary General Public License for many 178 | libraries. However, the Lesser license provides advantages in certain 179 | special circumstances. 180 | 181 | For example, on rare occasions, there may be a special need to 182 | encourage the widest possible use of a certain library, so that it becomes 183 | a de-facto standard. To achieve this, non-free programs must be 184 | allowed to use the library. 
A more frequent case is that a free 185 | library does the same job as widely used non-free libraries. In this 186 | case, there is little to gain by limiting the free library to free 187 | software only, so we use the Lesser General Public License. 188 | 189 | In other cases, permission to use a particular library in non-free 190 | programs enables a greater number of people to use a large body of 191 | free software. For example, permission to use the GNU C Library in 192 | non-free programs enables many more people to use the whole GNU 193 | operating system, as well as its variant, the GNU/Linux operating 194 | system. 195 | 196 | Although the Lesser General Public License is Less protective of the 197 | users' freedom, it does ensure that the user of a program that is 198 | linked with the Library has the freedom and the wherewithal to run 199 | that program using a modified version of the Library. 200 | 201 | The precise terms and conditions for copying, distribution and 202 | modification follow. Pay close attention to the difference between a 203 | "work based on the library" and a "work that uses the library". The 204 | former contains code derived from the library, whereas the latter must 205 | be combined with the library in order to run. 206 | 207 | GNU LESSER GENERAL PUBLIC LICENSE 208 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 209 | 210 | 0. This License Agreement applies to any software library or other 211 | program which contains a notice placed by the copyright holder or 212 | other authorized party saying it may be distributed under the terms of 213 | this Lesser General Public License (also called "this License"). 214 | Each licensee is addressed as "you". 215 | 216 | A "library" means a collection of software functions and/or data 217 | prepared so as to be conveniently linked with application programs 218 | (which use some of those functions and data) to form executables. 
219 | 220 | The "Library", below, refers to any such software library or work 221 | which has been distributed under these terms. A "work based on the 222 | Library" means either the Library or any derivative work under 223 | copyright law: that is to say, a work containing the Library or a 224 | portion of it, either verbatim or with modifications and/or translated 225 | straightforwardly into another language. (Hereinafter, translation is 226 | included without limitation in the term "modification".) 227 | 228 | "Source code" for a work means the preferred form of the work for 229 | making modifications to it. For a library, complete source code means 230 | all the source code for all modules it contains, plus any associated 231 | interface definition files, plus the scripts used to control compilation 232 | and installation of the library. 233 | 234 | Activities other than copying, distribution and modification are not 235 | covered by this License; they are outside its scope. The act of 236 | running a program using the Library is not restricted, and output from 237 | such a program is covered only if its contents constitute a work based 238 | on the Library (independent of the use of the Library in a tool for 239 | writing it). Whether that is true depends on what the Library does 240 | and what the program that uses the Library does. 241 | 242 | 1. You may copy and distribute verbatim copies of the Library's 243 | complete source code as you receive it, in any medium, provided that 244 | you conspicuously and appropriately publish on each copy an 245 | appropriate copyright notice and disclaimer of warranty; keep intact 246 | all the notices that refer to this License and to the absence of any 247 | warranty; and distribute a copy of this License along with the 248 | Library. 249 | 250 | You may charge a fee for the physical act of transferring a copy, 251 | and you may at your option offer warranty protection in exchange for a 252 | fee. 253 | 254 | 2. 
You may modify your copy or copies of the Library or any portion 255 | of it, thus forming a work based on the Library, and copy and 256 | distribute such modifications or work under the terms of Section 1 257 | above, provided that you also meet all of these conditions: 258 | 259 | a) The modified work must itself be a software library. 260 | 261 | b) You must cause the files modified to carry prominent notices 262 | stating that you changed the files and the date of any change. 263 | 264 | c) You must cause the whole of the work to be licensed at no 265 | charge to all third parties under the terms of this License. 266 | 267 | d) If a facility in the modified Library refers to a function or a 268 | table of data to be supplied by an application program that uses 269 | the facility, other than as an argument passed when the facility 270 | is invoked, then you must make a good faith effort to ensure that, 271 | in the event an application does not supply such function or 272 | table, the facility still operates, and performs whatever part of 273 | its purpose remains meaningful. 274 | 275 | (For example, a function in a library to compute square roots has 276 | a purpose that is entirely well-defined independent of the 277 | application. Therefore, Subsection 2d requires that any 278 | application-supplied function or table used by this function must 279 | be optional: if the application does not supply it, the square 280 | root function must still compute square roots.) 281 | 282 | These requirements apply to the modified work as a whole. If 283 | identifiable sections of that work are not derived from the Library, 284 | and can be reasonably considered independent and separate works in 285 | themselves, then this License, and its terms, do not apply to those 286 | sections when you distribute them as separate works. 
But when you 287 | distribute the same sections as part of a whole which is a work based 288 | on the Library, the distribution of the whole must be on the terms of 289 | this License, whose permissions for other licensees extend to the 290 | entire whole, and thus to each and every part regardless of who wrote 291 | it. 292 | 293 | Thus, it is not the intent of this section to claim rights or contest 294 | your rights to work written entirely by you; rather, the intent is to 295 | exercise the right to control the distribution of derivative or 296 | collective works based on the Library. 297 | 298 | In addition, mere aggregation of another work not based on the Library 299 | with the Library (or with a work based on the Library) on a volume of 300 | a storage or distribution medium does not bring the other work under 301 | the scope of this License. 302 | 303 | 3. You may opt to apply the terms of the ordinary GNU General Public 304 | License instead of this License to a given copy of the Library. To do 305 | this, you must alter all the notices that refer to this License, so 306 | that they refer to the ordinary GNU General Public License, version 2, 307 | instead of to this License. (If a newer version than version 2 of the 308 | ordinary GNU General Public License has appeared, then you can specify 309 | that version instead if you wish.) Do not make any other change in 310 | these notices. 311 | 312 | Once this change is made in a given copy, it is irreversible for 313 | that copy, so the ordinary GNU General Public License applies to all 314 | subsequent copies and derivative works made from that copy. 315 | 316 | This option is useful when you wish to copy part of the code of 317 | the Library into a program that is not a library. 318 | 319 | 4. 
You may copy and distribute the Library (or a portion or 320 | derivative of it, under Section 2) in object code or executable form 321 | under the terms of Sections 1 and 2 above provided that you accompany 322 | it with the complete corresponding machine-readable source code, which 323 | must be distributed under the terms of Sections 1 and 2 above on a 324 | medium customarily used for software interchange. 325 | 326 | If distribution of object code is made by offering access to copy 327 | from a designated place, then offering equivalent access to copy the 328 | source code from the same place satisfies the requirement to 329 | distribute the source code, even though third parties are not 330 | compelled to copy the source along with the object code. 331 | 332 | 5. A program that contains no derivative of any portion of the 333 | Library, but is designed to work with the Library by being compiled or 334 | linked with it, is called a "work that uses the Library". Such a 335 | work, in isolation, is not a derivative work of the Library, and 336 | therefore falls outside the scope of this License. 337 | 338 | However, linking a "work that uses the Library" with the Library 339 | creates an executable that is a derivative of the Library (because it 340 | contains portions of the Library), rather than a "work that uses the 341 | library". The executable is therefore covered by this License. 342 | Section 6 states terms for distribution of such executables. 343 | 344 | When a "work that uses the Library" uses material from a header file 345 | that is part of the Library, the object code for the work may be a 346 | derivative work of the Library even though the source code is not. 347 | Whether this is true is especially significant if the work can be 348 | linked without the Library, or if the work is itself a library. The 349 | threshold for this to be true is not precisely defined by law. 
350 | 351 | If such an object file uses only numerical parameters, data 352 | structure layouts and accessors, and small macros and small inline 353 | functions (ten lines or less in length), then the use of the object 354 | file is unrestricted, regardless of whether it is legally a derivative 355 | work. (Executables containing this object code plus portions of the 356 | Library will still fall under Section 6.) 357 | 358 | Otherwise, if the work is a derivative of the Library, you may 359 | distribute the object code for the work under the terms of Section 6. 360 | Any executables containing that work also fall under Section 6, 361 | whether or not they are linked directly with the Library itself. 362 | 363 | 6. As an exception to the Sections above, you may also combine or 364 | link a "work that uses the Library" with the Library to produce a 365 | work containing portions of the Library, and distribute that work 366 | under terms of your choice, provided that the terms permit 367 | modification of the work for the customer's own use and reverse 368 | engineering for debugging such modifications. 369 | 370 | You must give prominent notice with each copy of the work that the 371 | Library is used in it and that the Library and its use are covered by 372 | this License. You must supply a copy of this License. If the work 373 | during execution displays copyright notices, you must include the 374 | copyright notice for the Library among them, as well as a reference 375 | directing the user to the copy of this License. 
Also, you must do one 376 | of these things: 377 | 378 | a) Accompany the work with the complete corresponding 379 | machine-readable source code for the Library including whatever 380 | changes were used in the work (which must be distributed under 381 | Sections 1 and 2 above); and, if the work is an executable linked 382 | with the Library, with the complete machine-readable "work that 383 | uses the Library", as object code and/or source code, so that the 384 | user can modify the Library and then relink to produce a modified 385 | executable containing the modified Library. (It is understood 386 | that the user who changes the contents of definitions files in the 387 | Library will not necessarily be able to recompile the application 388 | to use the modified definitions.) 389 | 390 | b) Use a suitable shared library mechanism for linking with the 391 | Library. A suitable mechanism is one that (1) uses at run time a 392 | copy of the library already present on the user's computer system, 393 | rather than copying library functions into the executable, and (2) 394 | will operate properly with a modified version of the library, if 395 | the user installs one, as long as the modified version is 396 | interface-compatible with the version that the work was made with. 397 | 398 | c) Accompany the work with a written offer, valid for at 399 | least three years, to give the same user the materials 400 | specified in Subsection 6a, above, for a charge no more 401 | than the cost of performing this distribution. 402 | 403 | d) If distribution of the work is made by offering access to copy 404 | from a designated place, offer equivalent access to copy the above 405 | specified materials from the same place. 406 | 407 | e) Verify that the user has already received a copy of these 408 | materials or that you have already sent this user a copy. 
409 | 410 | For an executable, the required form of the "work that uses the 411 | Library" must include any data and utility programs needed for 412 | reproducing the executable from it. However, as a special exception, 413 | the materials to be distributed need not include anything that is 414 | normally distributed (in either source or binary form) with the major 415 | components (compiler, kernel, and so on) of the operating system on 416 | which the executable runs, unless that component itself accompanies 417 | the executable. 418 | 419 | It may happen that this requirement contradicts the license 420 | restrictions of other proprietary libraries that do not normally 421 | accompany the operating system. Such a contradiction means you cannot 422 | use both them and the Library together in an executable that you 423 | distribute. 424 | 425 | 7. You may place library facilities that are a work based on the 426 | Library side-by-side in a single library together with other library 427 | facilities not covered by this License, and distribute such a combined 428 | library, provided that the separate distribution of the work based on 429 | the Library and of the other library facilities is otherwise 430 | permitted, and provided that you do these two things: 431 | 432 | a) Accompany the combined library with a copy of the same work 433 | based on the Library, uncombined with any other library 434 | facilities. This must be distributed under the terms of the 435 | Sections above. 436 | 437 | b) Give prominent notice with the combined library of the fact 438 | that part of it is a work based on the Library, and explaining 439 | where to find the accompanying uncombined form of the same work. 440 | 441 | 8. You may not copy, modify, sublicense, link with, or distribute 442 | the Library except as expressly provided under this License. 
Any 443 | attempt otherwise to copy, modify, sublicense, link with, or 444 | distribute the Library is void, and will automatically terminate your 445 | rights under this License. However, parties who have received copies, 446 | or rights, from you under this License will not have their licenses 447 | terminated so long as such parties remain in full compliance. 448 | 449 | 9. You are not required to accept this License, since you have not 450 | signed it. However, nothing else grants you permission to modify or 451 | distribute the Library or its derivative works. These actions are 452 | prohibited by law if you do not accept this License. Therefore, by 453 | modifying or distributing the Library (or any work based on the 454 | Library), you indicate your acceptance of this License to do so, and 455 | all its terms and conditions for copying, distributing or modifying 456 | the Library or works based on it. 457 | 458 | 10. Each time you redistribute the Library (or any work based on the 459 | Library), the recipient automatically receives a license from the 460 | original licensor to copy, distribute, link with or modify the Library 461 | subject to these terms and conditions. You may not impose any further 462 | restrictions on the recipients' exercise of the rights granted herein. 463 | You are not responsible for enforcing compliance by third parties with 464 | this License. 465 | 466 | 11. If, as a consequence of a court judgment or allegation of patent 467 | infringement or for any other reason (not limited to patent issues), 468 | conditions are imposed on you (whether by court order, agreement or 469 | otherwise) that contradict the conditions of this License, they do not 470 | excuse you from the conditions of this License. If you cannot 471 | distribute so as to satisfy simultaneously your obligations under this 472 | License and any other pertinent obligations, then as a consequence you 473 | may not distribute the Library at all. 
For example, if a patent 474 | license would not permit royalty-free redistribution of the Library by 475 | all those who receive copies directly or indirectly through you, then 476 | the only way you could satisfy both it and this License would be to 477 | refrain entirely from distribution of the Library. 478 | 479 | If any portion of this section is held invalid or unenforceable under any 480 | particular circumstance, the balance of the section is intended to apply, 481 | and the section as a whole is intended to apply in other circumstances. 482 | 483 | It is not the purpose of this section to induce you to infringe any 484 | patents or other property right claims or to contest validity of any 485 | such claims; this section has the sole purpose of protecting the 486 | integrity of the free software distribution system which is 487 | implemented by public license practices. Many people have made 488 | generous contributions to the wide range of software distributed 489 | through that system in reliance on consistent application of that 490 | system; it is up to the author/donor to decide if he or she is willing 491 | to distribute software through any other system and a licensee cannot 492 | impose that choice. 493 | 494 | This section is intended to make thoroughly clear what is believed to 495 | be a consequence of the rest of this License. 496 | 497 | 12. If the distribution and/or use of the Library is restricted in 498 | certain countries either by patents or by copyrighted interfaces, the 499 | original copyright holder who places the Library under this License may add 500 | an explicit geographical distribution limitation excluding those countries, 501 | so that distribution is permitted only in or among countries not thus 502 | excluded. In such case, this License incorporates the limitation as if 503 | written in the body of this License. 504 | 505 | 13. 
The Free Software Foundation may publish revised and/or new 506 | versions of the Lesser General Public License from time to time. 507 | Such new versions will be similar in spirit to the present version, 508 | but may differ in detail to address new problems or concerns. 509 | 510 | Each version is given a distinguishing version number. If the Library 511 | specifies a version number of this License which applies to it and 512 | "any later version", you have the option of following the terms and 513 | conditions either of that version or of any later version published by 514 | the Free Software Foundation. If the Library does not specify a 515 | license version number, you may choose any version ever published by 516 | the Free Software Foundation. 517 | 518 | 14. If you wish to incorporate parts of the Library into other free 519 | programs whose distribution conditions are incompatible with these, 520 | write to the author to ask for permission. For software which is 521 | copyrighted by the Free Software Foundation, write to the Free 522 | Software Foundation; we sometimes make exceptions for this. Our 523 | decision will be guided by the two goals of preserving the free status 524 | of all derivatives of our free software and of promoting the sharing 525 | and reuse of software generally. 526 | 527 | NO WARRANTY 528 | 529 | 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO 530 | WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 531 | EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR 532 | OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY 533 | KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE 534 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 535 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE 536 | LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME 537 | THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 
538 | 539 | 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN 540 | WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY 541 | AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU 542 | FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR 543 | CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE 544 | LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING 545 | RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A 546 | FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF 547 | SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH 548 | DAMAGES. 549 | 550 | END OF TERMS AND CONDITIONS 551 | 552 | =========================================================================== -------------------------------------------------------------------------------- /RunBenchmarks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2020, Arm Limited and Contributors. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | # Collect statistics with performance counters (PAPI) 18 | 19 | import logging 20 | import os 21 | import stat 22 | import subprocess 23 | import platform 24 | import sys 25 | import argparse 26 | import re 27 | import json 28 | from datetime import date 29 | from itertools import product 30 | from collections import defaultdict 31 | import numpy as np 32 | 33 | PAPI_rounds_x86_64 = [ 34 | 'PAPI_TOT_INS,PAPI_TOT_CYC', # CPI, InsMIX 35 | 'PAPI_TOT_INS,PAPI_L1_DCM,PAPI_L1_ICM,PAPI_L1_TCM,PAPI_L1_LDM,PAPI_L1_STM', # L1 cache 36 | 'PAPI_TOT_INS,PAPI_L2_DCM,PAPI_L2_ICM,PAPI_L2_TCM,PAPI_L2_STM,PAPI_L2_DCA', # L2 cache 37 | ] 38 | 39 | PAPI_rounds_aarch64 = [ 40 | 'PAPI_TOT_INS,PAPI_TOT_CYC,PAPI_L1_DCA,PAPI_L1_DCM', 41 | 'PAPI_TOT_INS,PAPI_L1_ICM,PAPI_L1_ICA,PAPI_L2_DCM', 42 | ] 43 | 44 | 45 | class StatsPerfCounters(object): 46 | 47 | p_PAPICounter = ( 48 | r"PAPI_(?P[\w\d]+_[\w\d]+)\[(?P\d+)\]\[(?P\d+)\]\[(?P\d+)\]=(?P\d+)") 49 | # E.g., INST_RETIRED[1][6260][3]=12557713 ==> stat[tid][pid][bid]=value 50 | 51 | def __init__(self, shfileN, iterations, nthreads, platform, barriers): 52 | 53 | self.logger = logging.getLogger('Perfcntrs.StatsPerfCounters') 54 | self.iterations = iterations 55 | self.nthreads = nthreads 56 | self.runningOn = platform # Currently supports aarch64 or x86_64 57 | self.barriers = barriers # if True we are collecting counters per parallel region 58 | self.filePrefix = shfileN[:-3] # .sh extension out 59 | # repetitions out, only prefix remains 60 | self.filePrefix = self.filePrefix[:-len(str(self.iterations))] 61 | 62 | # The main class structure, the statistics is a list of dictionaries, one per thread: 63 | # stats per barrier; each barrier has a [defaultdict(list) for _ in range(nthreads)]] 64 | self.STATS = [] 65 | # Each dictionary will be indexed by the event (e.g., CPU_CYCLES, INST_RETIRED, etc); 66 | # each element will be a list with an entry per iteration (iteration = execution). 
67 | # Some counters are sampled several times (e.g., when gathering cache stats). 68 | # Thus, we keep tuples to compute metrics in a more accurate way. 69 | # The tuples always follow the format: TOT_INS, OTHER_STAT 70 | 71 | # we will parse each line of the file, compile to be faster 72 | self.p = re.compile(self.p_PAPICounter) 73 | 74 | _t_inst = [] # this will help us to construct the tuples 75 | 76 | # Parse all the files 77 | for i in range(self.iterations): 78 | 79 | try: 80 | _fni = '{}{}.papi'.format(self.filePrefix, i) 81 | _fi = open(_fni, 'r') 82 | self.logger.debug('Parsing file: {}'.format(_fni)) 83 | 84 | for line in _fi: 85 | m = re.search(self.p, line) 86 | if m: 87 | tid, pid, bid = int(m.group("tid")), int( 88 | m.group("pid")), int(m.group("bid")) 89 | stat, value = m.group("stat"), int(m.group("value")) 90 | # self.logger.debug('Parsed: {}, {}, {}, {}, {}'.format(stat, tid, pid, bid, value)) 91 | 92 | if len(self.STATS) < (bid + 1): # increase the list of barriers 93 | # self.logger.debug('prev_len(STATS) %d', len(self.STATS)) 94 | self.STATS.append([defaultdict(list) 95 | for _ in range(nthreads)]) 96 | # self.logger.debug('len(STATS) %d', len(self.STATS)) 97 | _t_inst.append([0 for _ in range(self.nthreads)]) 98 | 99 | if stat == 'TOT_INS': 100 | _t_inst[bid][tid] = value 101 | 102 | # self.logger.info('bid %d tid %d stat %s', bid, tid, stat) 103 | self.STATS[bid][tid][stat].append( 104 | (_t_inst[bid][tid], value)) 105 | 106 | else: 107 | self.logger.warn('Error parsing line: {}'.format(line)) 108 | 109 | _fi.close() 110 | 111 | except OSError as err: 112 | self.logger.error('{} _fni: {}'.format(err, _fni)) 113 | continue 114 | 115 | def getNumberBarriers(self): 116 | # Returns the number of barriers parsed 117 | return len(self.STATS) 118 | 119 | # Obtain statistics 120 | def getTotalCycles(self, threadID, iteration=0, barrierID=0): 121 | # For a given iteration, return the cpu cycles for threadID and barrierID. 
122 | # If barrierID is < 0, return the total number of cycles for threadID 123 | # across all barriers 124 | STAT = 'TOT_CYC' 125 | if barrierID < 0: 126 | return sum(map(lambda x: x[threadID][STAT][iteration][1], self.STATS)) 127 | else: 128 | return self.STATS[barrierID][threadID][STAT][iteration][1] 129 | 130 | def getAvgCycles(self, threadID, barrierID=0, std=0): 131 | # For a given threadID and barrierID, return the average number of cycles across iterations 132 | STAT = 'TOT_CYC' 133 | cycles = list( 134 | map(lambda x: x[1], self.STATS[barrierID][threadID][STAT])) 135 | if std: 136 | return np.std(cycles, ddof=1) 137 | else: 138 | return np.mean(cycles) 139 | 140 | def getTotalInstructions(self, threadID, iteration=0, barrierID=0): 141 | # For a given iteration, return the number of instructions executed 142 | # for threadID and barrierID. 143 | # If barrierID is < 0, return the total number of executed instructions 144 | # for threadID across all barriers. 145 | STAT = 'TOT_INS' 146 | if barrierID < 0: 147 | return sum(map(lambda x: x[threadID][STAT][iteration][1], self.STATS)) 148 | else: 149 | return self.STATS[barrierID][threadID][STAT][iteration][1] 150 | 151 | def getAvgInstructions(self, threadID, barrierID=0, std=0): 152 | # For a given threadID and barrierID, return the average number of 153 | # instructions across iterations 154 | STAT = 'TOT_INS' 155 | insts = list( 156 | map(lambda x: x[1], self.STATS[barrierID][threadID][STAT])) 157 | if std: 158 | return np.std(insts, ddof=1) 159 | else: 160 | return np.mean(insts) 161 | 162 | def getTotalL1DataMisses(self, threadID, iteration=0, barrierID=0): 163 | # For a given iteration, return the number of L1 data misses for threadID and barrierID. 
164 | # If barrierID < 0, it returns the total number of L1 data cache misses for threadID across all barriers 165 | STATd = 'L1_DCM' 166 | if barrierID < 0: 167 | return sum(map(lambda x: x[threadID][STATd][iteration][1], self.STATS)) 168 | else: 169 | return self.STATS[barrierID][threadID][STATd][iteration][1] 170 | 171 | def getAvgL1DataMisses(self, threadID, barrierID=0, std=0): 172 | # For a given threadID and barrierID, return the average number of 173 | # L1 data misses across iterations 174 | STATd = 'L1_DCM' 175 | misses = list( 176 | map(lambda x: x[1], self.STATS[barrierID][threadID][STATd])) 177 | if std: 178 | return np.std(misses, ddof=1) 179 | else: 180 | return np.mean(misses) 181 | 182 | def getTotalL1InstMisses(self, threadID, iteration=0, barrierID=0): 183 | # For a given iteration, return the number of L1 inst misses for threadID and barrierID. 184 | # If barrierID < 0, return the total number of L1 inst cache misses 185 | # for threadID across all barriers 186 | STATi = 'L1_ICM' 187 | if barrierID < 0: 188 | return sum(map(lambda x: x[threadID][STATi][iteration][1], self.STATS)) 189 | else: 190 | return self.STATS[barrierID][threadID][STATi][iteration][1] 191 | 192 | def getAvgL1InstMisses(self, threadID, barrierID=0, std=0): 193 | # For a given threadID and barrierID, return the average number of 194 | # L1 instruction misses across iterations 195 | STATi = 'L1_ICM' 196 | misses = list( 197 | map(lambda x: x[1], self.STATS[barrierID][threadID][STATi])) 198 | if std: 199 | return np.std(misses, ddof=1) 200 | else: 201 | return np.mean(misses) 202 | 203 | def getTotalL1DataAccesses(self, threadID, iteration=0, barrierID=0): 204 | # For a given iteration, return the number of L1 data cache accesses 205 | # for threadID and barrierID 206 | # If barrierID < 0, return the total number of L1 data cache accesses 207 | # for threadID across all barriers 208 | # Only for aarch64 209 | if self.runningOn != 'aarch64': 210 | return 0.0 211 | STATd = 
'L1_DCA' 212 | if barrierID < 0: 213 | return sum(map(lambda x: x[threadID][STATd][iteration][1], self.STATS)) 214 | else: 215 | return self.STATS[barrierID][threadID][STATd][iteration][1] 216 | 217 | def getAvgL1DataAccesses(self, threadID, barrierID=0, std=0): 218 | # For a given threadID and barrierID, return the average number of 219 | # L1 data accesses across iterations 220 | # Only for aarch64 221 | if self.runningOn != 'aarch64': 222 | return 0.0 223 | STATd = 'L1_DCA' 224 | accesses = list( 225 | map(lambda x: x[1], self.STATS[barrierID][threadID][STATd])) 226 | if std: 227 | return np.std(accesses, ddof=1) 228 | else: 229 | return np.mean(accesses) 230 | 231 | def getTotalL1InstAccesses(self, threadID, iteration=0, barrierID=0): 232 | # For a given iteration, return the number of L1 inst cache accesses 233 | # for threadID and barrierID 234 | # If barrierID < 0, return the total number of L1 inst cache accesses 235 | # for threadID across all barriers 236 | # Only for aarch64 237 | if self.runningOn != 'aarch64': 238 | return 0.0 239 | STATi = 'L1_ICA' 240 | if barrierID < 0: 241 | return sum(map(lambda x: x[threadID][STATi][iteration][1], self.STATS)) 242 | else: 243 | return self.STATS[barrierID][threadID][STATi][iteration][1] 244 | 245 | def getAvgL1InstAccesses(self, threadID, barrierID=0, std=0): 246 | # For a given threadID and barrierID, return the average number of 247 | # L1 inst accesses across iterations 248 | # Only for aarch64 249 | if self.runningOn != 'aarch64': 250 | return 0.0 251 | STATi = 'L1_ICA' 252 | accesses = list( 253 | map(lambda x: x[1], self.STATS[barrierID][threadID][STATi])) 254 | if std: 255 | return np.std(accesses, ddof=1) 256 | else: 257 | return np.mean(accesses) 258 | 259 | def getTotalL2DataMisses(self, threadID, iteration=0, barrierID=0): 260 | # For a given iteration, return the number of L2 data cache misses 261 | # for threadID and barrierID 262 | # If barrierID < 0, return the total number of L2 data cache misses 
for 263 | # threadID across all barriers 264 | STATd = 'L2_DCM' 265 | if barrierID < 0: 266 | return sum(map(lambda x: x[threadID][STATd][iteration][1], self.STATS)) 267 | else: 268 | error = False 269 | if barrierID >= len(self.STATS): 270 | self.logger.error( 271 | "GetTotalL2DataMisses: number of barriers does not match") 272 | error = True 273 | if threadID >= len(self.STATS[barrierID]): 274 | self.logger.error( 275 | "GetTotalL2DataMisses: number of threads does not match") 276 | error = True 277 | if STATd not in self.STATS[barrierID][threadID]: 278 | self.logger.error("GetTotalL2DataMisses: " + 279 | STATd + " not in stats") 280 | error = True 281 | if iteration >= len(self.STATS[barrierID][threadID][STATd]): 282 | self.logger.error( 283 | "GetTotalL2DataMisses: Iterations do not match") 284 | error = True 285 | 286 | if error: 287 | return 0 288 | else: 289 | return self.STATS[barrierID][threadID][STATd][iteration][1] 290 | 291 | def getAvgL2DataMisses(self, threadID, barrierID=0, std=0): 292 | # For a given threadID and barrierID, return the average number of 293 | # L2 data misses across iterations 294 | STATd = 'L2_DCM' 295 | misses = list( 296 | map(lambda x: x[1], self.STATS[barrierID][threadID][STATd])) 297 | if std: 298 | return np.std(misses, ddof=1) 299 | else: 300 | return np.mean(misses) 301 | 302 | def getTotalL2DataAccesses(self, threadID, iteration=0, barrierID=0): 303 | # For a given iteration, return the number of L2 data cache accesses for 304 | # threadID and barrierID 305 | # If barrierID < 0, return the total number of L2 data cache accesses for 306 | # threadID across all barriers 307 | if self.runningOn == 'aarch64': 308 | return 0.0 309 | STATd = 'L2_DCA' 310 | if barrierID < 0: 311 | return sum(map(lambda x: x[threadID][STATd][iteration][1], self.STATS)) 312 | else: 313 | return self.STATS[barrierID][threadID][STATd][iteration][1] 314 | 315 | def getAvgL2DataAccesses(self, threadID, barrierID=0, std=0): 316 | # For a given threadID 
and barrierID, return the average number of 317 | # L2 data accesses across iterations 318 | if self.runningOn == 'aarch64': 319 | return 0.0 320 | STATd = 'L2_DCA' 321 | accesses = list( 322 | map(lambda x: x[1], self.STATS[barrierID][threadID][STATd])) 323 | if std: 324 | return np.std(accesses, ddof=1) 325 | else: 326 | return np.mean(accesses) 327 | 328 | 329 | class Experiment(object): 330 | # Creates and runs experiment files to gather statistics based on performance counters 331 | def __init__(self, miniApp, nthreads, barriers, appParams, iterations=1, bpdir='.', \ 332 | outpath='./tmp', out_suffix='', libpath='./barrierpoint-libs', \ 333 | debug=False, dryRun=False, benchpath='./benchmarks'): 334 | 335 | # Define your own Logger flavour 336 | self.logger = logging.getLogger('Perfcntrs') 337 | self.debug = debug 338 | self.dryRun = dryRun 339 | 340 | # Attributes & properties 341 | self.miniApp = miniApp # miniApp name 342 | self.nthreads = nthreads # number of threads 343 | self.barriers = barriers # running the analysis per parallel region 344 | self.iterations = iterations # How many times the application is executed 345 | self.benchpath = benchpath # benchmark path 346 | self.libpath = libpath # BarrierPoint library path 347 | self.out_suffix = out_suffix # Suffix for output folders 348 | 349 | self.bpdir = bpdir # BP methodology directory 350 | self.appParams = self.GetBenchParams(appParams, nthreads) 351 | 352 | # path to store the output files (PAPI results) 353 | if out_suffix: 354 | self._toolOutPath = "{}/{}-{}.{}t".format( 355 | outpath, miniApp, out_suffix, nthreads) 356 | else: 357 | self._toolOutPath = "{}/{}.{}t".format(outpath, miniApp, nthreads) 358 | 359 | # Get the running platform (aarch64 or x86 are supported) 360 | _plf = platform.uname() 361 | if _plf[-2] == 'aarch64': # running on Arm 362 | self.runningOn = 'aarch64' 363 | elif _plf[-2] == 'x86_64': # running on x86 364 | self.runningOn = 'x86_64' 365 | else: 366 | self.runningOn =
'Unknown' 367 | self.logger.info('Runnning on {} machine'.format(self.runningOn)) 368 | 369 | @property 370 | def toolOutPath(self): 371 | return self._toolOutPath 372 | 373 | @toolOutPath.setter 374 | def toolOutPath(self, toolOutPath): 375 | self._toolOutPath = toolOutPath 376 | 377 | def GetBenchPath(self, benchpath, miniApp): 378 | if os.path.exists(benchpath + "/" + miniApp): 379 | return benchpath + "/" + miniApp 380 | else: 381 | self.logger.error( 382 | "App \"{}\" not found in benchpath {}".format(miniApp, benchpath)) 383 | return None 384 | 385 | def GetBenchParams(self, appParams, nthreads): 386 | if "{}" in appParams: # Substitute {} in app parameters for number of threads 387 | return appParams.format(nthreads) 388 | else: 389 | return appParams 390 | 391 | def CreateExperimentBatch(self): 392 | # Creates an experiment batch (.sh file) with the application/threads 393 | # to obtain performance counters for the parallel region (ROI) 394 | self.logger.info("Creating experiments bash scripts for {}.{}t".format( 395 | self.miniApp, self.nthreads)) 396 | 397 | # Get the platform specific PAPI counters (aarch64 or x86 are supported) 398 | if self.runningOn == 'x86_64': 399 | self.PAPI_rounds = PAPI_rounds_x86_64 400 | elif self.runningOn == 'aarch64': 401 | self.PAPI_rounds = PAPI_rounds_aarch64 402 | else: 403 | self.logger.error('Unknown platform') 404 | return False 405 | 406 | # Try to create the output folder, if it does not exist 407 | if not os.path.isdir(self.toolOutPath): 408 | try: 409 | os.makedirs(self.toolOutPath) 410 | except OSError: 411 | self.logger.warn( 412 | 'Could not create output path {}. Storing results in root folder'.format(self.toolOutPath)) 413 | self.toolOutPath = '.'
414 | 415 | # Create a .sh file with the experiment commands 416 | try: 417 | # Analysis per parallel region 418 | if self.barriers: 419 | if self.out_suffix: 420 | self.shfileN = '{}/{}-{}.{}t.BP.rep{}.sh'.format( 421 | self.toolOutPath, self.miniApp, self.out_suffix, self.nthreads, self.iterations) 422 | else: 423 | self.shfileN = '{}/{}.{}t.BP.rep{}.sh'.format( 424 | self.toolOutPath, self.miniApp, self.nthreads, 425 | self.iterations) 426 | # Analysis of whole application 427 | else: 428 | if self.out_suffix: 429 | self.shfileN = '{}/{}-{}.{}t.rep{}.sh'.format( 430 | self.toolOutPath, self.miniApp, self.out_suffix, self.nthreads, self.iterations) 431 | else: 432 | self.shfileN = '{}/{}.{}t.rep{}.sh'.format( 433 | self.toolOutPath, self.miniApp, self.nthreads, 434 | self.iterations) 435 | self.shfile = open(self.shfileN, 'w') 436 | self.logger.debug('Creating sh file: {}'.format(self.shfileN)) 437 | 438 | except OSError as err: 439 | self.logger.error('{} shfileN: {}'.format(err, self.shfileN)) 440 | return False 441 | 442 | # Header of .sh file 443 | self.shfile.write('#!/bin/sh\n\n') 444 | self.shfile.write('# {} - {} threads - {} iteration(s)\n\n' 445 | .format(self.miniApp, self.nthreads, self.iterations)) 446 | 447 | self.shfile.write('# Exports\n') 448 | self.shfile.write('export BP_PERFCNTRS=1\n') 449 | self.shfile.write( 450 | 'export BP_PERFCNTRS_VERBOSE={}\n'.format(1 if self.debug else 0)) 451 | self.shfile.write('export BP_PERFCNTRS_OMPPARALLEL={}\n'.format( 452 | 1 if self.barriers else 0)) 453 | self.shfile.write('export BP_PERFCNTRS_SAMPLING={}\n'.format( 454 | 0 if self.barriers else 1)) 455 | self.shfile.write('export OMP_NUM_THREADS={}\n'.format(self.nthreads)) 456 | self.shfile.write( 457 | 'export GOMP_CPU_AFFINITY=\"0-{}\"\n\n'.format(self.nthreads - 1)) 458 | 459 | self.shfile.write('# Remove old files\n') 460 | # Remove old CSV files 461 | if self.barriers: 462 | if self.out_suffix: 463 | self.shfile.write('rm -f {}/{}-{}.{}t.BP.*.csv\n' 
464 | .format(self.toolOutPath, self.miniApp, self.out_suffix, 465 | self.nthreads)) 466 | else: 467 | self.shfile.write('rm -f {}/{}.{}t.BP.*.csv\n' 468 | .format(self.toolOutPath, self.miniApp, 469 | self.nthreads)) 470 | else: 471 | if self.out_suffix: 472 | self.shfile.write('rm -f {}/{}-{}.{}t.*.csv\n' 473 | .format(self.toolOutPath, self.miniApp, self.out_suffix, 474 | self.nthreads)) 475 | else: 476 | self.shfile.write('rm -f {}/{}.{}t.*.csv\n' 477 | .format(self.toolOutPath, self.miniApp, 478 | self.nthreads)) 479 | 480 | # Remove old .papi files 481 | self.shfile.write( 482 | 'for i in $(seq 0 {})\ndo\n'.format(self.iterations - 1)) 483 | # Per parallel region 484 | if self.barriers: 485 | if self.out_suffix: 486 | self.shfile.write('\trm -f {}/{}-{}.{}t.BP.rep$((i)).papi\n' 487 | .format(self.toolOutPath, self.miniApp, self.out_suffix, 488 | self.nthreads)) 489 | else: 490 | self.shfile.write('\trm -f {}/{}.{}t.BP.rep$((i)).papi\n' 491 | .format(self.toolOutPath, self.miniApp, self.nthreads)) 492 | # Whole application 493 | else: 494 | if self.out_suffix: 495 | self.shfile.write('\trm -f {}/{}-{}.{}t.rep$((i)).papi\n' 496 | .format(self.toolOutPath, self.miniApp, self.out_suffix, 497 | self.nthreads)) 498 | else: 499 | self.shfile.write('\trm -f {}/{}.{}t.rep$((i)).papi\n' 500 | .format(self.toolOutPath, self.miniApp, self.nthreads)) 501 | self.shfile.write('done\n\n') 502 | 503 | # Run experiments command 504 | self.shfile.write( 505 | 'for i in $(seq 0 {})\ndo\n'.format(self.iterations - 1)) 506 | 507 | # The PAPI output file will be open in "append" mode to add 508 | # the different iterations at the end of the file 509 | # We follow the naming convention: 510 | # .... 511 | # E.g. 
rsbench.4t.2020_03_20.aarch64.rep1.papi 512 | 513 | # Per parallel region 514 | if self.barriers: 515 | if self.out_suffix: 516 | self.shfile.write('\texport BP_PERFCNTRS_OUTPUT_FILE=\"{}/{}-{}.{}t.BP.rep$((i)).papi"\n\n' 517 | .format(self.toolOutPath, self.miniApp, self.out_suffix, 518 | self.nthreads)) 519 | else: 520 | self.shfile.write('\texport BP_PERFCNTRS_OUTPUT_FILE=\"{}/{}.{}t.BP.rep$((i)).papi"\n\n' 521 | .format(self.toolOutPath, self.miniApp, self.nthreads)) 522 | # Whole application 523 | else: 524 | if self.out_suffix: 525 | self.shfile.write('\texport BP_PERFCNTRS_OUTPUT_FILE=\"{}/{}-{}.{}t.rep$((i)).papi\"\n\n' 526 | .format(self.toolOutPath, self.miniApp, self.out_suffix, 527 | self.nthreads)) 528 | else: 529 | self.shfile.write('\texport BP_PERFCNTRS_OUTPUT_FILE=\"{}/{}.{}t.rep$((i)).papi\"\n\n' 530 | .format(self.toolOutPath, self.miniApp, self.nthreads)) 531 | 532 | for _PAPI_set in self.PAPI_rounds: 533 | self.shfile.write('\texport BP_PERFCNTRS_EVENTS=\"{}\"\n'. 534 | format(_PAPI_set)) 535 | self.shfile.write('\texport LD_PRELOAD=\"{}\"\n'. 
536 | format(self.bpdir + "/" + self.libpath + "/libbp_perfcntrs.so")) 537 | self.shfile.write('\t{} {}\n\n'.format(self.GetBenchPath( 538 | self.benchpath, self.miniApp), self.appParams)) 539 | 540 | self.shfile.write('done\n\n') 541 | self.shfile.close() 542 | 543 | self.logger.info("Finishing creating the bash scripts for {}.{}t: {}".format( 544 | self.miniApp, self.nthreads, self.shfileN)) 545 | 546 | return True 547 | 548 | def RunExperimentBatch(self): 549 | # Runs the experiments and parses the results 550 | 551 | if not os.path.isfile(self.shfileN): 552 | self.logger.error( 553 | 'Could not locate sh file {}'.format(self.shfileN)) 554 | return False 555 | 556 | self.logger.info( 557 | "Running perfcntrs experiments batch: {}".format(self.shfileN)) 558 | 559 | cmd = self.shfileN 560 | if not self.dryRun: 561 | self.logger.debug('Executing perfcntrs command: {}'.format(cmd)) 562 | # add exec permissions to sh file 563 | os.chmod(self.shfileN, os.stat( 564 | self.shfileN).st_mode | stat.S_IEXEC) 565 | proc = subprocess.call(cmd, shell=True) 566 | # proc = subprocess.check_call(cmd, shell = True) 567 | # Instead of raising our own error, we could use check_call above for a traceback 568 | if proc != 0: 569 | self.logger.error("Unsuccessful command: {}") 570 | sys.exit() 571 | 572 | else: 573 | self.logger.info('Dry run for: {}'.format(cmd)) 574 | 575 | # Parsing experiments 576 | self.stats = StatsPerfCounters( 577 | self.shfileN, self.iterations, self.nthreads, self.runningOn, self.barriers) 578 | 579 | # Create CSV file as output with all the metrics 580 | try: 581 | self.outfileN = '{}.check.csv'.format(self.shfileN[:-3]) 582 | self.outfile = open(self.outfileN, 'w') 583 | self.logger.debug('Creating csv file: {}'.format(self.outfileN)) 584 | self.outfile.write('Benchmark,Thread,Iteration,Barrier,' 585 | 'Cycles,Instructions,' 586 | 'L1DAccesses,L1DMisses,L1IAccesses,L1IMisses,' 587 | 'L2DAccesses,L2DMisses\n') 588 | 589 | for i in range(self.nthreads): 590 
| for j in range(self.iterations): 591 | for k in range(self.stats.getNumberBarriers()): 592 | self.outfile.write('{},{},{},{},{},{},{},{},{},{},{},{}\n' 593 | .format(self.miniApp, i, j, k, 594 | self.stats.getTotalCycles( 595 | i, j, k), self.stats.getTotalInstructions(i, j, k), 596 | self.stats.getTotalL1DataAccesses( 597 | i, j, k), self.stats.getTotalL1DataMisses(i, j, k), 598 | self.stats.getTotalL1InstAccesses( 599 | i, j, k), self.stats.getTotalL1InstMisses(i, j, k), 600 | self.stats.getTotalL2DataAccesses(i, j, k), self.stats.getTotalL2DataMisses(i, j, k))) 601 | self.outfile.close() 602 | 603 | except OSError as err: 604 | self.logger.error('{} outfileN: {}'.format(err, self.outfileN)) 605 | return False 606 | 607 | try: 608 | self.outfileN = '{}.csv'.format(self.shfileN[:-3]) 609 | self.outfile = open(self.outfileN, 'w') 610 | self.logger.debug('Creating csv file: {}'.format(self.outfileN)) 611 | self.outfile.write('Benchmark,Thread,Barrier,Cycles,Instructions,' 612 | 'L1DAccesses,L1DMisses,L1IAccesses,L1IMisses,' 613 | 'L2DAccesses,L2DMisses\n') 614 | 615 | for i in range(self.nthreads): 616 | for j in range(self.stats.getNumberBarriers()): 617 | self.outfile.write('{},{},{},{},{},{},{},{},{},{},{}\n' 618 | .format(self.miniApp, i, j, 619 | self.stats.getAvgCycles(i, j), 620 | self.stats.getAvgInstructions( 621 | i, j), 622 | self.stats.getAvgL1DataAccesses( 623 | i, j), self.stats.getAvgL1DataMisses(i, j), 624 | self.stats.getAvgL1InstAccesses( 625 | i, j), self.stats.getAvgL1InstMisses(i, j), 626 | self.stats.getAvgL2DataAccesses(i, j), self.stats.getAvgL2DataMisses(i, j))) 627 | self.outfile.close() 628 | 629 | except OSError as err: 630 | self.logger.error('{} outfileN: {}'.format(err, self.outfileN)) 631 | return False 632 | 633 | self.logger.info("Finished perfcntrs experiments batch for {}.{}t".format( 634 | self.miniApp, self.nthreads)) 635 | 636 | return True 637 | 638 | 639 | def main(): 640 | parser = argparse.ArgumentParser() 641 | 
parser.add_argument('-c', '--config_file', 642 | required=True, help='JSON config file') 643 | args = parser.parse_args() 644 | 645 | # Decode JSON configuration file 646 | with open(args.config_file, 'r') as f: 647 | json_data = json.load(f) 648 | 649 | rootdir = json_data["paths"]["rootdir"] 650 | benchpath = json_data["paths"]["benchpath"] 651 | libspath = json_data["paths"]["libspath"] 652 | dry_run = json_data["dry_run"] 653 | debug_mode = json_data["Debug"] 654 | suffix = json_data["Suffix"] 655 | outpath = json_data["paths"]["outpath"] 656 | nthreads = json_data["threads"] 657 | perf_iters = json_data["Perfcntrs_iterations"] 658 | # Captures perfcntr execution for the whole app or for each BP 659 | barrier_mode = [0, 1] 660 | 661 | # Define the logger 662 | formatter = logging.Formatter('[%(asctime)s] [%(levelname)s] - (%(name)s) - %(message)s') 663 | 664 | # File handler for debug info 665 | # If both DR instrumentation and performance analysis are executed, 666 | # the debug file will be appended to the previous existent one from the DR instrumentation. 
667 | if json_data["execution"]["bp_identification"]: 668 | handler = logging.FileHandler("debug-barrierpoint.log") # Append to file 669 | else: 670 | handler = logging.FileHandler("debug-barrierpoint.log", 'w') 671 | 672 | handler.setFormatter(formatter) 673 | 674 | # Console handler (only INFO level) 675 | ch = logging.StreamHandler() 676 | ch.setLevel(logging.INFO) 677 | ch.setFormatter(formatter) 678 | 679 | logger = logging.getLogger("Perfcntrs") 680 | if debug_mode: 681 | logger.setLevel(logging.DEBUG) 682 | else: 683 | logger.setLevel(logging.INFO) 684 | 685 | # Add handlers to logger 686 | logger.addHandler(handler) 687 | logger.addHandler(ch) 688 | 689 | if json_data["execution"]["perfcntrs"]: 690 | logger.info("BarrierPoint performance analysis set to [ON]") 691 | else: 692 | logger.info("BarrierPoint performance analysis set to [OFF]") 693 | sys.exit() 694 | 695 | app_dict = {} 696 | for app in json_data["Application"]: 697 | app_dict[app] = json_data["Application"][app] 698 | 699 | # Settings print 700 | print("*** [BarrierPoint Methodology] ***\n") 701 | print("Performance Counters execution with the following settings:") 702 | print("> BP directory: {}\n> Benchmarks directory: {}\n> Libraries directory: {}\ 703 | \n> Output directory: {}".format(rootdir, benchpath, libspath, outpath)) 704 | print("> nThreads: {}\n> Suffix: {}\n> Debug: {}\n> Execution repetitions: {}\ 705 | \n> Dry-run: {}".format(nthreads, suffix, debug_mode, perf_iters, dry_run)) 706 | print("> Applications and Inputs: {}".format(app_dict)) 707 | print("*************************\n") 708 | 709 | theApps = [] 710 | for (ap, nt, suf, b) in product(app_dict, nthreads, suffix, barrier_mode): 711 | appParams = app_dict[ap] 712 | outpath_suf = outpath + "/bperf." 
+ suf 713 | logger.info("Running {}.{}t (mode {}, input: {})".format( 714 | ap, nt, b, appParams)) 715 | theApps.append(Experiment(ap, nt, b, appParams, perf_iters, rootdir, 716 | outpath_suf, suf, libspath, debug_mode, dry_run, benchpath)) 717 | if not theApps[-1].CreateExperimentBatch(): 718 | logger.error('Create Experiments returned with an error') 719 | sys.exit() 720 | else: 721 | if not theApps[-1].RunExperimentBatch(): 722 | logger.error('Run Experiments returned with an error') 723 | sys.exit() 724 | else: 725 | logger.info('== Finished Performance Counter executions ==') 726 | 727 | 728 | if __name__ == '__main__': 729 | main() 730 | --------------------------------------------------------------------------------