├── .gitignore ├── .gitmodules ├── DEV-Def ├── A100PCIE40G.config └── QV100.config ├── ISA-Def ├── accelwattch_component_mapping.h ├── ampere_opcode.h ├── kepler_opcode.h ├── pascal_opcode.h ├── trace_opcode.h ├── turing_opcode.h └── volta_opcode.h ├── Makefile ├── README.md ├── common ├── CLI │ ├── App.hpp │ ├── Argv.hpp │ ├── CLI.hpp │ ├── Config.hpp │ ├── ConfigFwd.hpp │ ├── Encoding.hpp │ ├── Error.hpp │ ├── Formatter.hpp │ ├── FormatterFwd.hpp │ ├── Macros.hpp │ ├── Option.hpp │ ├── Split.hpp │ ├── StringTools.hpp │ ├── Timer.hpp │ ├── TypeTools.hpp │ ├── Validators.hpp │ ├── Version.hpp │ └── impl │ │ ├── App_inl.hpp │ │ ├── Argv_inl.hpp │ │ ├── Config_inl.hpp │ │ ├── Encoding_inl.hpp │ │ ├── Formatter_inl.hpp │ │ ├── Option_inl.hpp │ │ ├── Split_inl.hpp │ │ ├── StringTools_inl.hpp │ │ └── Validators_inl.hpp ├── common_def.cc ├── common_def.h ├── option_parser.cc ├── option_parser.h └── vector_types.h ├── hw-component ├── IBuffer.cc ├── IBuffer.h ├── OperandCollector.cc ├── OperandCollector.h ├── PipelineUnit.cc ├── PipelineUnit.h ├── PrivateSM.cc ├── PrivateSM.h ├── RegBankAlloc.cc ├── RegBankAlloc.h ├── Scoreboard.cc └── Scoreboard.h ├── hw-parser ├── hw-parser.cc └── hw-parser.h ├── main.cc ├── merge_report.py ├── parda ├── .gitignore ├── LICENSE ├── README.md ├── main.c ├── makefile ├── narray.c ├── narray.h ├── parda.c ├── parda.h ├── parda_mpi.c ├── parda_mpi.h ├── parda_omp.c ├── parda_omp.h ├── parda_print.c ├── process_args.c ├── process_args.h ├── run.sh ├── seperate.c ├── seperate.h ├── splay.c └── splay.h ├── sass-split ├── .gitignore ├── Makefile ├── process_sass_dir.cpp └── sass-split.sh ├── trace-driven ├── entry.h ├── hw-stt.cc ├── hw-stt.h ├── inst-stt.cc ├── inst-stt.h ├── kernel-info.cc ├── kernel-info.h ├── kernel-trace.cc ├── kernel-trace.h ├── mem-access.cc ├── mem-access.h ├── register-set.h ├── trace-warp-inst.cc └── trace-warp-inst.h ├── trace-parser ├── inst-memadd-info.cc ├── inst-memadd-info.h ├── inst-trace.cc ├── inst-trace.h ├── 
memory-space.cc ├── memory-space.h ├── sass-inst.cc ├── sass-inst.h ├── sass-split.py ├── trace-parser.cc └── trace-parser.h └── tracing-tool ├── .gitignore ├── Makefile ├── README.md ├── common.h ├── inject_funcs.cu ├── nvbit ├── cuda.h ├── generated_cuda_meta.h ├── instr_types.h ├── libnvbit.a ├── nvbit.h ├── nvbit_reg_rw.h ├── nvbit_tool.h ├── tools_cuda_api_meta.h └── utils │ ├── channel.hpp │ └── utils.h └── tracer.cu /.gitignore: -------------------------------------------------------------------------------- 1 | obj/* 2 | gpu-simulator.x 3 | .vscode/ 4 | *.mem 5 | *.sass 6 | *.temp.txt 7 | *-summary.txt 8 | app.config 9 | instn.config 10 | issue.config 11 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /DEV-Def/A100PCIE40G.config: -------------------------------------------------------------------------------- 1 | 2 | ########################################################################################### 3 | ### ### 4 | ### Ampere A100 PCIe 40 GB Config ### 5 | ### ### 6 | ########################################################################################### 7 | 8 | 9 | # Device Limits 10 | -gpgpu_stack_size_limit 1024 11 | -gpgpu_heap_size_limit 8388608 12 | -gpgpu_kernel_launch_latency 5000 13 | -gpgpu_thread_block_launch_latency 0 14 | -gpgpu_max_concurrent_kernel 128 15 | 16 | # High Level Architecture Configuration 17 | -gpgpu_num_clusters 108 18 | -gpgpu_num_sms_per_cluster 1 19 | -gpgpu_num_memory_controllers 40 20 | -gpgpu_num_sub_partition_per_memory_channel 2 21 | 22 | # Clock Domain Frequencies in MHZ 23 | -gpgpu_core_clock_mhz 1065.0 24 | -gpgpu_icnt_clock_mhz 1065.0 25 | -gpgpu_l2d_clock_mhz 1065.0 26 | -gpgpu_dram_clock_mhz 1215.0 27 | 28 | # SM Pipeline Config 29 | -gpgpu_max_registers_per_sm 65536 30 | 
-gpgpu_max_registers_per_cta 65536 31 | 32 | # SM Warp Config 33 | -gpgpu_max_threads_per_sm 2048 34 | -gpgpu_warp_size 32 35 | -gpgpu_max_ctas_per_sm 32 36 | 37 | # Pipeline Widths 38 | -gpgpu_ID_OC_SP_pipeline_width 4 39 | -gpgpu_ID_OC_DP_pipeline_width 4 40 | -gpgpu_ID_OC_INT_pipeline_width 4 41 | -gpgpu_ID_OC_SFU_pipeline_width 4 42 | -gpgpu_ID_OC_MEM_pipeline_width 4 43 | -gpgpu_OC_EX_SP_pipeline_width 4 44 | -gpgpu_OC_EX_DP_pipeline_width 4 45 | -gpgpu_OC_EX_INT_pipeline_width 4 46 | -gpgpu_OC_EX_SFU_pipeline_width 4 47 | -gpgpu_OC_EX_MEM_pipeline_width 4 48 | -gpgpu_EX_WB_pipeline_width 8 49 | -gpgpu_ID_OC_TENSOR_CORE_pipeline_width 4 50 | -gpgpu_OC_EX_TENSOR_CORE_pipeline_width 4 51 | 52 | # Number of FUs 53 | -gpgpu_num_sp_units 4 54 | -gpgpu_num_sfu_units 4 55 | -gpgpu_num_dp_units 8 56 | -gpgpu_num_int_units 8 57 | -gpgpu_num_tensor_core_units 4 58 | -gpgpu_num_mem_units 160 59 | 60 | # Instruction Latencies, ADD,MAX,MUL,MAD,DIV,[SHFL] 61 | #-gpgpu_opcode_latency_int 4,13,4,5,145,21 62 | #-gpgpu_opcode_latency_fp 4,13,4,5,39 63 | #-gpgpu_opcode_latency_dp 8,19,8,8,330 64 | #-gpgpu_opcode_latency_sfu 100 65 | #-gpgpu_opcode_latency_tensor_core 64 66 | -gpgpu_opcode_latency_int 3,12,3,4,144,20 67 | -gpgpu_opcode_latency_fp 3,12,3,4,38 68 | -gpgpu_opcode_latency_dp 7,18,7,7,329 69 | -gpgpu_opcode_latency_sfu 99 70 | -gpgpu_opcode_latency_tensor_core 63 71 | 72 | # Initiation Intervals, ADD,MAX,MUL,MAD,DIV,[SHFL] 73 | -gpgpu_opcode_initiation_interval_int 2,2,2,2,8,4 74 | -gpgpu_opcode_initiation_interval_fp 2,2,2,2,4 75 | -gpgpu_opcode_initiation_interval_dp 4,4,4,4,130 76 | -gpgpu_opcode_initiation_interval_sfu 8 77 | -gpgpu_opcode_initiation_interval_tensor_core 64 78 | 79 | # Sub Core Model, warp schedulers are isolated 80 | -gpgpu_sub_core_model 1 81 | 82 | # Generic Operand Collectors 83 | -gpgpu_operand_collector_num_units_gen 8 84 | -gpgpu_operand_collector_num_in_ports_gen 8 85 | -gpgpu_operand_collector_num_out_ports_gen 8 86 | 87 | # Register Banks
88 | -gpgpu_num_reg_banks 32 89 | -gpgpu_reg_file_port_throughput 2 90 | 91 | # Shared Memory Bank Conflict Detection 92 | -gpgpu_shmem_num_banks 64 93 | -gpgpu_shmem_limited_broadcast 0 94 | -gpgpu_shmem_warp_parts 1 95 | -gpgpu_coalesce_arch 80 96 | 97 | # Warp Schedulers 98 | -gpgpu_inst_fetch_throughput 4 99 | -gpgpu_num_sched_per_sm 4 100 | # for Volta, a warp scheduler can issue 1 inst per cycle 101 | -gpgpu_max_insn_issue_per_warp 1 102 | # for Volta, dual issue only occurs when using two different execution units 103 | -gpgpu_dual_issue_diff_exec_units 1 104 | 105 | # L1/Shared Memory Configuration 106 | # L1 cache + shared memory = 192 KB 107 | -gpgpu_unified_l1d_size 192 108 | -gpgpu_l1d_cache_banks 4 109 | -gpgpu_l1d_cache_sets 4 110 | -gpgpu_l1d_cache_block_size 128 111 | -gpgpu_l1d_cache_associative 64 112 | -gpgpu_l1d_latency 37 113 | # Size of shared memory per SM (Byte) 114 | -gpgpu_shmem_size_per_sm 167936 115 | # Size of shared memory per CTA (Byte) 116 | -gpgpu_shmem_size_per_cta 167936 117 | -gpgpu_shmem_latency 37 118 | 119 | # L2 Configuration 120 | -gpgpu_l2d_size_per_sub_partition 512 121 | # 256 sets, each 128 bytes 16-way for each memory sub partition (512 KB) 122 | -gpgpu_l2d_cache_sets 256 123 | -gpgpu_l2d_cache_block_size 128 124 | -gpgpu_l2d_cache_associative 16 125 | -gpgpu_dram_partition_queues_icnt_to_l2 64 126 | -gpgpu_dram_partition_queues_l2_to_dram 64 127 | -gpgpu_dram_partition_queues_dram_to_l2 64 128 | -gpgpu_dram_partition_queues_l2_to_icnt 64 129 | 130 | # Cluster Ejection Buffer 131 | -gpgpu_num_pkts_cluster_ejection_buffer 32 132 | 133 | # Interconnection 134 | -gpgpu_icnt_in_buffer_limit 512 135 | -gpgpu_icnt_out_buffer_limit 512 136 | -gpgpu_icnt_subnets 2 137 | -gpgpu_icnt_flit_size 40 138 | 139 | # DRAM Configuration 140 | -gpgpu_dram_latency 100 141 | 142 | # Trace OpCode Latency and Initiation Interval 143 | #-gpgpu_trace_opcode_latency_initiation_int 2,2 144 | #-gpgpu_trace_opcode_latency_initiation_sp 2,2 145 |
#-gpgpu_trace_opcode_latency_initiation_dp 8,4 146 | #-gpgpu_trace_opcode_latency_initiation_sfu 20,8 147 | #-gpgpu_trace_opcode_latency_initiation_tensor 2,2 148 | -gpgpu_trace_opcode_latency_initiation_int 2,1 149 | -gpgpu_trace_opcode_latency_initiation_sp 2,1 150 | -gpgpu_trace_opcode_latency_initiation_dp 8,2 151 | -gpgpu_trace_opcode_latency_initiation_sfu 20,6 152 | -gpgpu_trace_opcode_latency_initiation_tensor 2,1 153 | 154 | # execute branch insts on spec unit 1 155 | # in Volta, there is a dedicated branch unit 156 | # <enabled>,<num_units>,<max_latency>,<ID_OC_SPEC>,<OC_EX_SPEC>,<NAME> 157 | -gpgpu_specialized_unit_1 1,4,4,4,4,BRA 158 | -gpgpu_trace_opcode_latency_initiation_spec_op_1 4,4 159 | 160 | # TEX unit, make fixed latency for all tex insts 161 | -gpgpu_specialized_unit_2 1,4,200,4,4,TEX 162 | -gpgpu_trace_opcode_latency_initiation_spec_op_2 200,4 163 | 164 | # tensor unit 165 | -gpgpu_specialized_unit_3 1,4,8,4,4,TENSOR 166 | -gpgpu_trace_opcode_latency_initiation_spec_op_3 2,2 167 | 168 | # shared memory allocation size 169 | -gpgpu_smem_allocation_size 256 170 | -gpgpu_register_allocation_size 256 171 | 172 | # L1 cache configurations 173 | -gpgpu_l1_cache_line_size_for_reuse_distance 32 174 | # L2 cache configurations 175 | -gpgpu_l2_cache_line_size_for_reuse_distance 64 176 | 177 | # dram/l1/l2 mem access latency 178 | -gpgpu_dram_mem_access_latency 302 179 | -gpgpu_l1_cache_access_latency 37 180 | -gpgpu_l2_cache_access_latency 213 181 | -gpgpu_const_mem_access_latency 8 182 | -------------------------------------------------------------------------------- /DEV-Def/QV100.config: -------------------------------------------------------------------------------- 1 | 2 | ########################################################################################### 3 | ### ### 4 | ### Volta Quadro V100 Config ### 5 | ### ### 6 | ########################################################################################### 7 | 8 | 9 | # This config models the Volta Quadro V100 10 | # For more info about volta
architecture: 11 | # http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf 12 | # https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# 13 | # http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf 14 | # https://en.wikipedia.org/wiki/Volta_(microarchitecture) 15 | # https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf 16 | # https://devblogs.nvidia.com/inside-volta/ 17 | # http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf 18 | # https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications-technical-specifications-per-compute-capability 19 | 20 | 21 | # Device Limits 22 | -gpgpu_stack_size_limit 1024 23 | -gpgpu_heap_size_limit 8388608 24 | -gpgpu_kernel_launch_latency 5000 25 | -gpgpu_thread_block_launch_latency 0 26 | -gpgpu_max_concurrent_kernel 128 27 | 28 | # High Level Architecture Configuration 29 | -gpgpu_num_clusters 80 30 | -gpgpu_num_sms_per_cluster 1 31 | -gpgpu_num_memory_controllers 32 32 | -gpgpu_num_sub_partition_per_memory_channel 2 33 | 34 | # Clock Domain Frequencies in MHZ 35 | -gpgpu_core_clock_mhz 1447.0 36 | -gpgpu_icnt_clock_mhz 1447.0 37 | -gpgpu_l2d_clock_mhz 1447.0 38 | -gpgpu_dram_clock_mhz 850.0 39 | 40 | # SM Pipeline Config 41 | -gpgpu_max_registers_per_sm 65536 42 | -gpgpu_max_registers_per_cta 65536 43 | 44 | # SM Warp Config 45 | -gpgpu_max_threads_per_sm 2048 46 | -gpgpu_warp_size 32 47 | -gpgpu_max_ctas_per_sm 32 48 | 49 | # Pipeline Widths 50 | -gpgpu_ID_OC_SP_pipeline_width 4 51 | -gpgpu_ID_OC_DP_pipeline_width 4 52 | -gpgpu_ID_OC_INT_pipeline_width 4 53 | -gpgpu_ID_OC_SFU_pipeline_width 4 54 | -gpgpu_ID_OC_MEM_pipeline_width 4 55 | -gpgpu_OC_EX_SP_pipeline_width 4 56 | -gpgpu_OC_EX_DP_pipeline_width 4 57 |
-gpgpu_OC_EX_INT_pipeline_width 4 58 | -gpgpu_OC_EX_SFU_pipeline_width 4 59 | -gpgpu_OC_EX_MEM_pipeline_width 4 60 | -gpgpu_EX_WB_pipeline_width 8 61 | -gpgpu_ID_OC_TENSOR_CORE_pipeline_width 4 62 | -gpgpu_OC_EX_TENSOR_CORE_pipeline_width 4 63 | 64 | # Number of FUs 65 | -gpgpu_num_sp_units 4 66 | -gpgpu_num_sfu_units 4 67 | -gpgpu_num_dp_units 8 68 | -gpgpu_num_int_units 8 69 | -gpgpu_num_tensor_core_units 4 70 | -gpgpu_num_mem_units 160 71 | 72 | # Instruction Latencies, ADD,MAX,MUL,MAD,DIV,[SHFL] 73 | #-gpgpu_opcode_latency_int 4,13,4,5,145,21 74 | #-gpgpu_opcode_latency_fp 4,13,4,5,39 75 | #-gpgpu_opcode_latency_dp 8,19,8,8,330 76 | #-gpgpu_opcode_latency_sfu 100 77 | #-gpgpu_opcode_latency_tensor_core 64 78 | -gpgpu_opcode_latency_int 3,12,3,4,144,20 79 | -gpgpu_opcode_latency_fp 3,12,3,4,38 80 | -gpgpu_opcode_latency_dp 7,18,7,7,329 81 | -gpgpu_opcode_latency_sfu 99 82 | -gpgpu_opcode_latency_tensor_core 63 83 | 84 | # Initiation Intervals, ADD,MAX,MUL,MAD,DIV,[SHFL] 85 | -gpgpu_opcode_initiation_interval_int 2,2,2,2,8,4 86 | -gpgpu_opcode_initiation_interval_fp 2,2,2,2,4 87 | -gpgpu_opcode_initiation_interval_dp 4,4,4,4,130 88 | -gpgpu_opcode_initiation_interval_sfu 8 89 | -gpgpu_opcode_initiation_interval_tensor_core 64 90 | 91 | # Sub Core Model, warp schedulers are isolated 92 | -gpgpu_sub_core_model 1 93 | 94 | # Generic Operand Collectors 95 | -gpgpu_operand_collector_num_units_gen 8 96 | -gpgpu_operand_collector_num_in_ports_gen 8 97 | -gpgpu_operand_collector_num_out_ports_gen 8 98 | 99 | # Register Banks 100 | -gpgpu_num_reg_banks 16 101 | -gpgpu_reg_file_port_throughput 2 102 | 103 | # Shared Memory Bankconflict Detection 104 | -gpgpu_shmem_num_banks 32 105 | -gpgpu_shmem_limited_broadcast 0 106 | -gpgpu_shmem_warp_parts 1 107 | -gpgpu_coalesce_arch 70 108 | 109 | # Warp Schedulers 110 | -gpgpu_inst_fetch_throughput 4 111 | -gpgpu_num_sched_per_sm 4 112 | # for Volta, a warp scheduler can issue 1 inst per cycle 113 | -gpgpu_max_insn_issue_per_warp 1 
114 | # for Volta, dual issue only occurs with using two different execution unit 115 | -gpgpu_dual_issue_diff_exec_units 1 116 | 117 | # L1/Shared Memory Configuration 118 | # L1 cache + shared memory = 128 KB 119 | -gpgpu_unified_l1d_size 128 120 | -gpgpu_l1d_cache_banks 4 121 | -gpgpu_l1d_cache_sets 4 122 | -gpgpu_l1d_cache_block_size 128 123 | -gpgpu_l1d_cache_associative 64 124 | -gpgpu_l1d_latency 20 125 | # Size of shared memory per SM (Byte) 126 | -gpgpu_shmem_size_per_sm 98304 127 | # Size of shared memory per CTA (Byte) 128 | -gpgpu_shmem_size_per_cta 65536 129 | -gpgpu_shmem_latency 20 130 | 131 | # L2 Configuration 132 | -gpgpu_l2d_size_per_sub_partition 96 133 | # 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB) 134 | -gpgpu_l2d_cache_sets 32 135 | -gpgpu_l2d_cache_block_size 128 136 | -gpgpu_l2d_cache_associative 24 137 | -gpgpu_dram_partition_queues_icnt_to_l2 64 138 | -gpgpu_dram_partition_queues_l2_to_dram 64 139 | -gpgpu_dram_partition_queues_dram_to_l2 64 140 | -gpgpu_dram_partition_queues_l2_to_icnt 64 141 | 142 | # Cluster Ejection Buffer 143 | -gpgpu_num_pkts_cluster_ejection_buffer 32 144 | 145 | # Interconnection 146 | -gpgpu_icnt_in_buffer_limit 512 147 | -gpgpu_icnt_out_buffer_limit 512 148 | -gpgpu_icnt_subnets 2 149 | -gpgpu_icnt_flit_size 40 150 | 151 | # DRAM Configuration 152 | -gpgpu_dram_latency 100 153 | 154 | # Trace OpCode Latency and Initiation Interval 155 | #-gpgpu_trace_opcode_latency_initiation_int 2,2 156 | #-gpgpu_trace_opcode_latency_initiation_sp 2,2 157 | #-gpgpu_trace_opcode_latency_initiation_dp 8,4 158 | #-gpgpu_trace_opcode_latency_initiation_sfu 20,8 159 | #-gpgpu_trace_opcode_latency_initiation_tensor 2,2 160 | -gpgpu_trace_opcode_latency_initiation_int 2,1 161 | -gpgpu_trace_opcode_latency_initiation_sp 2,1 162 | -gpgpu_trace_opcode_latency_initiation_dp 8,2 163 | -gpgpu_trace_opcode_latency_initiation_sfu 20,6 164 | -gpgpu_trace_opcode_latency_initiation_tensor 2,1 165 | 166 | # execute 
branch insts on spec unit 1 167 | # in Volta, there is a dedicated branch unit 168 | # <enabled>,<num_units>,<max_latency>,<ID_OC_SPEC>,<OC_EX_SPEC>,<NAME> 169 | -gpgpu_specialized_unit_1 1,4,4,4,4,BRA 170 | -gpgpu_trace_opcode_latency_initiation_spec_op_1 4,4 171 | 172 | # TEX unit, make fixed latency for all tex insts 173 | -gpgpu_specialized_unit_2 1,4,200,4,4,TEX 174 | -gpgpu_trace_opcode_latency_initiation_spec_op_2 200,4 175 | 176 | # tensor unit 177 | -gpgpu_specialized_unit_3 1,4,8,4,4,TENSOR 178 | -gpgpu_trace_opcode_latency_initiation_spec_op_3 2,2 179 | 180 | # shared memory allocation size 181 | -gpgpu_smem_allocation_size 256 182 | -gpgpu_register_allocation_size 256 183 | 184 | # L1 cache configurations 185 | -gpgpu_l1_cache_line_size_for_reuse_distance 32 186 | # L2 cache configurations 187 | -gpgpu_l2_cache_line_size_for_reuse_distance 64 188 | 189 | # dram/l1/l2 mem access latency 190 | -gpgpu_dram_mem_access_latency 302 191 | -gpgpu_l1_cache_access_latency 33 192 | -gpgpu_l2_cache_access_latency 213 193 | -gpgpu_const_mem_access_latency 8 194 | -------------------------------------------------------------------------------- /ISA-Def/kepler_opcode.h: -------------------------------------------------------------------------------- 1 | // developed by Mahmoud Khairy, Purdue Univ 2 | // abdallm@purdue.edu 3 | 4 | #ifndef KEPLER_OPCODE_H 5 | #define KEPLER_OPCODE_H 6 | 7 | #include 8 | #include 9 | #include "trace_opcode.h" 10 | 11 | #define KEPLER_BINART_VERSION 35 12 | 13 | /// Kepler ISA 14 | // see: https://docs.nvidia.com/cuda/cuda-binary-utilities/index.html 15 | static const std::unordered_map Kepler_OpcodeMap = { 16 | // Floating Point 32 Instructions 17 | {"FFMA", OpcodeChar(OP_FFMA, SP_OP)}, 18 | {"FFMA32I", OpcodeChar(OP_FFMA32I, SP_OP)}, 19 | {"FADD", OpcodeChar(OP_FADD, SP_OP)}, 20 | {"FADD32I", OpcodeChar(OP_FADD32I, SP_OP)}, 21 | {"FCMP", OpcodeChar(OP_FCMP, SP_OP)}, 22 | {"FMUL", OpcodeChar(OP_FMUL, SP_OP)}, 23 | {"FMUL32I", OpcodeChar(OP_FMUL32I, SP_OP)}, 24 | {"FMNMX", OpcodeChar(OP_FMNMX,
SP_OP)}, 25 | {"FSWZ", OpcodeChar(OP_FSWZ, SP_OP)}, 26 | {"FSET", OpcodeChar(OP_FSET, SP_OP)}, 27 | {"FSETP", OpcodeChar(OP_FSETP, SP_OP)}, 28 | {"FCHK", OpcodeChar(OP_FCHK, SP_OP)}, 29 | {"RRO", OpcodeChar(OP_RRO, SP_OP)}, 30 | // SFU 31 | {"MUFU", OpcodeChar(OP_MUFU, SFU_OP)}, 32 | 33 | // Double Point Instructions 34 | {"DFMA", OpcodeChar(OP_DFMA, DP_OP)}, 35 | {"DADD", OpcodeChar(OP_DADD, DP_OP)}, 36 | {"DMUL", OpcodeChar(OP_DMUL, DP_OP)}, 37 | {"DMNMX", OpcodeChar(OP_DMNMX, DP_OP)}, 38 | {"DSET", OpcodeChar(OP_DSET, DP_OP)}, 39 | {"DSETP", OpcodeChar(OP_DSETP, DP_OP)}, 40 | 41 | // Integer Instructions 42 | {"IMAD", OpcodeChar(OP_IMAD, INTP_OP)}, 43 | {"IMADSP", OpcodeChar(OP_IMADSP, INTP_OP)}, 44 | {"IMUL", OpcodeChar(OP_IMUL, INTP_OP)}, 45 | {"IMUL32I", OpcodeChar(OP_IMUL32I, INTP_OP)}, 46 | {"IADD", OpcodeChar(OP_IADD, INTP_OP)}, 47 | {"IADD32I", OpcodeChar(OP_IADD32I, INTP_OP)}, 48 | {"ISUB", OpcodeChar(OP_ISUB, INTP_OP)}, 49 | {"ISCADD", OpcodeChar(OP_ISCADD, INTP_OP)}, 50 | {"ISCADD32I", OpcodeChar(OP_ISCADD32I, INTP_OP)}, 51 | {"ISAD", OpcodeChar(OP_ISAD, INTP_OP)}, 52 | {"IMNMX", OpcodeChar(OP_IMNMX, INTP_OP)}, 53 | {"BFE", OpcodeChar(OP_BFE, INTP_OP)}, 54 | {"BFI", OpcodeChar(OP_BFI, INTP_OP)}, 55 | {"SHR", OpcodeChar(OP_SHR, INTP_OP)}, 56 | {"SHL", OpcodeChar(OP_SHL, INTP_OP)}, 57 | {"SHF", OpcodeChar(OP_SHF, INTP_OP)}, 58 | {"LOP", OpcodeChar(OP_LOP, INTP_OP)}, 59 | {"LOP32I", OpcodeChar(OP_LOP32I, INTP_OP)}, 60 | {"FLO", OpcodeChar(OP_FLO, INTP_OP)}, 61 | {"ISET", OpcodeChar(OP_ISET, INTP_OP)}, 62 | {"ISETP", OpcodeChar(OP_ISETP, INTP_OP)}, 63 | {"ICMP", OpcodeChar(OP_ICMP, INTP_OP)}, 64 | {"POPC", OpcodeChar(OP_POPC, INTP_OP)}, 65 | 66 | // Conversion Instructions 67 | {"F2F", OpcodeChar(OP_F2F, ALU_OP)}, 68 | {"F2I", OpcodeChar(OP_F2I, ALU_OP)}, 69 | {"I2F", OpcodeChar(OP_I2F, ALU_OP)}, 70 | {"I2I", OpcodeChar(OP_I2I, ALU_OP)}, 71 | 72 | // Movement Instructions 73 | {"MOV", OpcodeChar(OP_MOV, ALU_OP)}, 74 | {"MOV32I", OpcodeChar(OP_MOV32I, 
ALU_OP)}, 75 | {"SEL", OpcodeChar(OP_SEL, ALU_OP)}, 76 | {"PRMT", OpcodeChar(OP_PRMT, ALU_OP)}, 77 | {"SHFL", OpcodeChar(OP_SHFL, ALU_OP)}, 78 | 79 | // Predicate Instructions 80 | {"P2R", OpcodeChar(OP_P2R, ALU_OP)}, 81 | {"R2P", OpcodeChar(OP_R2P, ALU_OP)}, 82 | {"CSET", OpcodeChar(OP_CSET, ALU_OP)}, 83 | {"CSETP", OpcodeChar(OP_CSETP, ALU_OP)}, 84 | {"PSET", OpcodeChar(OP_PSET, ALU_OP)}, 85 | {"PSETP", OpcodeChar(OP_PSETP, ALU_OP)}, 86 | 87 | // Texture Instructions 88 | // For now, we ignore texture loads, consider it as ALU_OP 89 | {"TEX", OpcodeChar(OP_TEX, ALU_OP)}, 90 | {"TLD", OpcodeChar(OP_TLD, ALU_OP)}, 91 | {"TLD4", OpcodeChar(OP_TLD4, ALU_OP)}, 92 | {"TXQ", OpcodeChar(OP_TXQ, ALU_OP)}, 93 | 94 | // Load/Store Instructions 95 | // For now, we ignore constant loads, consider it as ALU_OP, TO DO 96 | {"LDC", OpcodeChar(OP_LDC, ALU_OP)}, 97 | // in Kepler, LD is load global so set it to LDG 98 | {"LD", OpcodeChar(OP_LDG, LOAD_OP)}, 99 | {"LDG", OpcodeChar(OP_LDG, LOAD_OP)}, 100 | {"LDL", OpcodeChar(OP_LDL, LOAD_OP)}, 101 | {"LDS", OpcodeChar(OP_LDS, LOAD_OP)}, 102 | {"LDSLK", OpcodeChar(OP_LDSLK, LOAD_OP)}, 103 | {"ST", OpcodeChar(OP_STG, STORE_OP)}, 104 | {"STL", OpcodeChar(OP_STL, STORE_OP)}, 105 | {"STS", OpcodeChar(OP_STS, STORE_OP)}, 106 | {"STSCUL", OpcodeChar(OP_STSCUL, STORE_OP)}, 107 | {"ATOM", OpcodeChar(OP_ATOM, STORE_OP)}, 108 | {"RED", OpcodeChar(OP_RED, STORE_OP)}, 109 | {"CCTL", OpcodeChar(OP_CCTL, ALU_OP)}, 110 | {"CCTLL", OpcodeChar(OP_CCTLL, ALU_OP)}, 111 | {"MEMBAR", OpcodeChar(OP_MEMBAR, MEMORY_BARRIER_OP)}, 112 | 113 | // surface memory instructions 114 | {"SUCLAMP", OpcodeChar(OP_SUCLAMP, LOAD_OP)}, 115 | {"SUBFM", OpcodeChar(OP_SUBFM, LOAD_OP)}, 116 | {"SUEAU", OpcodeChar(OP_SUEAU, LOAD_OP)}, 117 | {"SULDGA", OpcodeChar(OP_SULDGA, LOAD_OP)}, 118 | {"SUSTGA", OpcodeChar(OP_SUSTGA, STORE_OP)}, 119 | 120 | // Control Instructions 121 | {"BRA", OpcodeChar(OP_BRA, BRANCH_OP)}, 122 | {"BRX", OpcodeChar(OP_BRX, BRANCH_OP)}, 123 | {"JMP", 
OpcodeChar(OP_JMP, BRANCH_OP)}, 124 | {"JMX", OpcodeChar(OP_JMX, BRANCH_OP)}, 125 | {"CAL", OpcodeChar(OP_CAL, CALL_OPS)}, 126 | {"JCAL", OpcodeChar(OP_JCAL, CALL_OPS)}, 127 | {"RET", OpcodeChar(OP_RET, RET_OPS)}, 128 | {"BRK", OpcodeChar(OP_BRK, RET_OPS)}, 129 | {"CONT", OpcodeChar(OP_CONT, RET_OPS)}, 130 | {"SSY", OpcodeChar(OP_SSY, RET_OPS)}, 131 | {"PBK", OpcodeChar(OP_PBK, RET_OPS)}, 132 | {"PCNT", OpcodeChar(OP_PCNT, RET_OPS)}, 133 | {"PRET", OpcodeChar(OP_PRET, RET_OPS)}, 134 | {"BPT", OpcodeChar(OP_BPT, BRANCH_OP)}, 135 | {"EXIT", OpcodeChar(OP_EXIT, EXIT_OPS)}, 136 | 137 | // Miscellaneous Instructions 138 | {"NOP", OpcodeChar(OP_NOP, ALU_OP)}, 139 | {"S2R", OpcodeChar(OP_S2R, ALU_OP)}, 140 | {"B2R", OpcodeChar(OP_B2R, ALU_OP)}, 141 | {"BAR", OpcodeChar(OP_BAR, BARRIER_OP)}, 142 | {"VOTE", OpcodeChar(OP_VOTE, ALU_OP)}, 143 | }; 144 | 145 | #endif 146 | -------------------------------------------------------------------------------- /ISA-Def/trace_opcode.h: -------------------------------------------------------------------------------- 1 | // developed by Mahmoud Khairy, Purdue Univ 2 | // abdallm@purdue.edu 3 | 4 | #ifndef TRACE_OPCODE_H 5 | #define TRACE_OPCODE_H 6 | 7 | #include 8 | #include 9 | 10 | #define SPEC_UNIT_START_ID 100 11 | 12 | enum TraceInstrOpcode { 13 | 14 | // Volta (includes common insts for others cards as well) 15 | OP_FADD = 1, 16 | OP_FADD32I, 17 | OP_FCHK, 18 | OP_FFMA32I, 19 | OP_FFMA, 20 | OP_FMNMX, 21 | OP_FMUL, 22 | OP_FMUL32I, 23 | OP_FSEL, 24 | OP_FSET, 25 | OP_FSETP, 26 | OP_FSWZADD, 27 | OP_MUFU, 28 | OP_HADD2, 29 | OP_HADD2_32I, 30 | OP_HFMA2, 31 | OP_HFMA2_32I, 32 | OP_HMUL2, 33 | OP_HMUL2_32I, 34 | OP_HSET2, 35 | OP_HSETP2, 36 | OP_HMMA, 37 | OP_DADD, 38 | OP_DFMA, 39 | OP_DMUL, 40 | OP_DSETP, 41 | OP_BMSK, 42 | OP_BREV, 43 | OP_FLO, 44 | OP_IABS, 45 | OP_IADD, 46 | OP_IADD3, 47 | OP_IADD32I, 48 | OP_IDP, 49 | OP_IDP4A, 50 | OP_IMAD, 51 | OP_IMMA, 52 | OP_IMNMX, 53 | OP_IMUL, 54 | OP_IMUL32I, 55 | OP_ISCADD, 56 | 
OP_ISCADD32I, 57 | OP_ISETP, 58 | OP_LEA, 59 | OP_LOP, 60 | OP_LOP3, 61 | OP_LOP32I, 62 | OP_POPC, 63 | OP_SHF, 64 | OP_SHR, 65 | OP_VABSDIFF, 66 | OP_VABSDIFF4, 67 | OP_VADD, 68 | OP_F2F, 69 | OP_F2I, 70 | OP_I2F, 71 | OP_I2I, 72 | OP_I2IP, 73 | OP_FRND, 74 | OP_MOV, 75 | OP_MOV32I, 76 | OP_PRMT, 77 | OP_SEL, 78 | OP_SGXT, 79 | OP_SHFL, 80 | OP_PLOP3, 81 | OP_PSETP, 82 | OP_P2R, 83 | OP_R2P, 84 | OP_LD, 85 | OP_LDC, 86 | OP_LDG, 87 | OP_LDL, 88 | OP_LDS, 89 | OP_ST, 90 | OP_STG, 91 | OP_STL, 92 | OP_STS, 93 | OP_MATCH, 94 | OP_QSPC, 95 | OP_ATOM, 96 | OP_ATOMS, 97 | OP_ATOMG, 98 | OP_RED, 99 | OP_CCTL, 100 | OP_CCTLL, 101 | OP_ERRBAR, 102 | OP_MEMBAR, 103 | OP_CCTLT, 104 | OP_TEX, 105 | OP_TLD, 106 | OP_TLD4, 107 | OP_TMML, 108 | OP_TXD, 109 | OP_TXQ, 110 | OP_BMOV, 111 | OP_BPT, 112 | OP_BRA, 113 | OP_BREAK, 114 | OP_BRX, 115 | OP_BSSY, 116 | OP_BSYNC, 117 | OP_CALL, 118 | OP_EXIT, 119 | OP_JMP, 120 | OP_JMX, 121 | OP_KILL, 122 | OP_NANOSLEEP, 123 | OP_RET, 124 | OP_RPCMOV, 125 | OP_RTT, 126 | OP_WARPSYNC, 127 | OP_YIELD, 128 | OP_B2R, 129 | OP_BAR, 130 | OP_CS2R, 131 | OP_CSMTEST, 132 | OP_DEPBAR, 133 | OP_GETLMEMBASE, 134 | OP_LEPC, 135 | OP_NOP, 136 | OP_PMTRIG, 137 | OP_R2B, 138 | OP_S2R, 139 | OP_SETCTAID, 140 | OP_SETLMEMBASE, 141 | OP_VOTE, 142 | OP_VOTE_VTG, 143 | 144 | // unique insts for pascal 145 | OP_RRO, 146 | OP_DMNMX, 147 | OP_DSET, 148 | OP_BFE, 149 | OP_BFI, 150 | OP_ICMP, 151 | OP_IMADSP, 152 | OP_SHL, 153 | OP_XMAD, 154 | OP_CSET, 155 | OP_CSETP, 156 | OP_TEXS, 157 | OP_TLD4S, 158 | OP_TLDS, 159 | OP_CAL, 160 | OP_JCAL, 161 | OP_PRET, 162 | OP_BRK, 163 | OP_PBK, 164 | OP_CONT, 165 | OP_PCNT, 166 | OP_PEXIT, 167 | OP_SSY, 168 | OP_SYNC, 169 | OP_PSET, 170 | OP_VMNMX, 171 | OP_ISET, 172 | 173 | // unique insts for turing 174 | OP_BMMA, 175 | OP_MOVM, 176 | OP_LDSM, 177 | OP_R2UR, 178 | OP_S2UR, 179 | OP_UBMSK, 180 | OP_UBREV, 181 | OP_UCLEA, 182 | OP_UFLO, 183 | OP_UIADD3, 184 | OP_UIMAD, 185 | OP_UISETP, 186 | OP_ULDC, 187 | OP_ULEA, 188 | 
OP_ULOP, 189 | OP_ULOP3, 190 | OP_ULOP32I, 191 | OP_UMOV, 192 | OP_UP2UR, 193 | OP_UPLOP3, 194 | OP_UPOPC, 195 | OP_UPRMT, 196 | OP_UPSETP, 197 | OP_UR2UP, 198 | OP_USEL, 199 | OP_USGXT, 200 | OP_USHF, 201 | OP_USHL, 202 | OP_USHR, 203 | OP_VOTEU, 204 | OP_SUATOM, 205 | OP_SULD, 206 | OP_SURED, 207 | OP_SUST, 208 | OP_BRXU, 209 | OP_JMXU, 210 | 211 | // unique insts for kepler 212 | OP_FCMP, 213 | OP_FSWZ, 214 | OP_ISAD, 215 | OP_LDSLK, 216 | OP_STSCUL, 217 | OP_SUCLAMP, 218 | OP_SUBFM, 219 | OP_SUEAU, 220 | OP_SULDGA, 221 | OP_SUSTGA, 222 | OP_ISUB, 223 | 224 | // unique insts for ampere 225 | OP_HMNMX2, 226 | OP_DMMA, 227 | OP_I2FP, 228 | OP_F2IP, 229 | OP_LDGDEPBAR, 230 | OP_LDGSTS, 231 | OP_REDUX, 232 | OP_UF2FP, 233 | OP_SUQUERY, 234 | SASS_NUM_OPCODES /* The total number of opcodes. */ 235 | 236 | }; 237 | 238 | typedef enum TraceInstrOpcode sass_op_type; 239 | 240 | struct OpcodeChar { 241 | OpcodeChar(unsigned m_opcode, unsigned m_opcode_category) { 242 | opcode = m_opcode; 243 | opcode_category = m_opcode_category; 244 | } 245 | unsigned opcode; 246 | unsigned opcode_category; 247 | }; 248 | 249 | enum special_operations_t { 250 | OTHER_OP, 251 | INT__OP, 252 | INT_MUL24_OP, 253 | INT_MUL32_OP, 254 | INT_MUL_OP, 255 | INT_DIV_OP, 256 | FP_MUL_OP, 257 | FP_DIV_OP, 258 | FP__OP, 259 | FP_SQRT_OP, 260 | FP_LG_OP, 261 | FP_SIN_OP, 262 | FP_EXP_OP, 263 | DP_MUL_OP, 264 | DP_DIV_OP, 265 | DP___OP, 266 | TENSOR__OP, 267 | TEX__OP 268 | }; 269 | 270 | typedef enum special_operations_t special_ops; // Required to identify for the power model 271 | 272 | // Type of operation 273 | enum uarch_op_t { 274 | NO_OP = -1, 275 | ALU_OP = 1, 276 | SFU_OP, 277 | TENSOR_CORE_OP, 278 | DP_OP, 279 | SP_OP, 280 | INTP_OP, 281 | ALU_SFU_OP, 282 | LOAD_OP, 283 | TENSOR_CORE_LOAD_OP, 284 | TENSOR_CORE_STORE_OP, 285 | STORE_OP, 286 | BRANCH_OP, 287 | BARRIER_OP, 288 | MEMORY_BARRIER_OP, 289 | CALL_OPS, 290 | RET_OPS, 291 | EXIT_OPS, 292 | SPECIALIZED_UNIT_1_OP = SPEC_UNIT_START_ID, 
293 | SPECIALIZED_UNIT_2_OP, 294 | SPECIALIZED_UNIT_3_OP, 295 | SPECIALIZED_UNIT_4_OP, 296 | SPECIALIZED_UNIT_5_OP, 297 | SPECIALIZED_UNIT_6_OP, 298 | SPECIALIZED_UNIT_7_OP, 299 | SPECIALIZED_UNIT_8_OP 300 | }; 301 | 302 | typedef enum uarch_op_t op_type; 303 | 304 | enum uarch_operand_type_t { UN_OP = -1, INT_OP, FP_OP }; 305 | 306 | typedef enum uarch_operand_type_t types_of_operands; 307 | 308 | #endif 309 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | USE_BOOST ?= 1 2 | DEBUG ?= 0 3 | USE_GPROF ?= 0 4 | 5 | BOOST_PATH := $(shell echo $$LD_LIBRARY_PATH | tr ':' '\n' | grep boost/lib | head -n 1) 6 | ifeq ($(BOOST_PATH),) 7 | BOOST_HOME ?= 8 | else 9 | BOOST_HOME := $(shell dirname $(BOOST_PATH)) 10 | # $(info Using BOOST_HOME: $(BOOST_HOME)) 11 | endif 12 | 13 | MPICC_PATH := $(shell which mpicc) 14 | # $(info Using MPICC_PATH: $(MPICC_PATH)) 15 | MPI_PATH := $(shell dirname $(MPICC_PATH)) 16 | # $(info Using MPI_PATH: $(MPI_PATH)) 17 | MPI_HOME ?= $(shell dirname $(MPI_PATH)) 18 | # $(info Using MPI_HOME: $(MPI_HOME)) 19 | 20 | MPICXX = $(shell which mpic++) 21 | MPIRUN = $(shell which mpirun) 22 | 23 | ifeq ($(USE_BOOST),1) 24 | CXX = $(MPICXX) 25 | CC = $(MPICXX) 26 | else 27 | CXX = g++ 28 | CC = gcc 29 | endif 30 | 31 | CXXFLAGS = -Wall -pthread -finline-functions -funswitch-loops -MMD -MP 32 | 33 | ifeq ($(USE_GPROF),1) 34 | CXXFLAGS += -pg 35 | endif 36 | 37 | CFLAGS = $(CXXFLAGS) 38 | 39 | # Detect Support for C++11 (C++0x) from GCC Version 40 | GNUC_CPP0X := $(shell mpic++ --version | perl -ne 'if (/g++\s+\(.*\)\s+([0-9.]+)/){ if($$1 >= 4.3) {$$n=1} else {$$n=0;} } END { print $$n; }') 41 | 42 | ifeq ($(GNUC_CPP0X), 1) 43 | CXXFLAGS += -std=c++11 44 | endif 45 | 46 | INC_DIRS = -I./hw-parser -I./hw-component -I./ISA-Def -I./DEV-Def -I./trace-parser -I./trace-driven -I./common -I./common/CLI -I./common/CLI/impl 
-I$(MPI_HOME)/include -I$(BOOST_HOME)/include -I./parda 47 | CXXFLAGS += $(INC_DIRS) $(shell pkg-config --cflags glib-2.0) 48 | CFLAGS += $(INC_DIRS) 49 | 50 | LIBRARIES = -L$(BOOST_HOME)/lib -lboost_mpi -lboost_serialization 51 | LIBRARIES += $(shell pkg-config --libs glib-2.0) 52 | 53 | ifeq ($(DEBUG),1) 54 | OPTFLAGS = -O0 -g3 -fPIC 55 | else 56 | OPTFLAGS = -O3 -fPIC 57 | endif 58 | 59 | OBJ_PATH = obj 60 | 61 | TARGET = gpu-simulator.x 62 | 63 | exist_OBJ_PATH = $(shell if [ -d $(OBJ_PATH) ]; then echo "exist"; else echo "noexist"; fi) 64 | 65 | ifeq ("$(exist_OBJ_PATH)", "noexist") 66 | $(shell mkdir $(OBJ_PATH)) 67 | endif 68 | 69 | CC_SRCS := $(wildcard *.c) $(wildcard parda/*.c) 70 | CC_SRCS := $(filter-out parda/parda_mpi.c parda/parda_omp.c parda/main.c parda/seperate.c, $(CC_SRCS)) 71 | 72 | CXX_SRCS := $(wildcard *.cc) $(wildcard trace-parser/*.cc) $(wildcard trace-driven/*.cc) 73 | CXX_SRCS += $(wildcard hw-component/*.cc) $(wildcard hw-parser/*.cc) $(wildcard common/*.cc) 74 | 75 | SRCS := $(CC_SRCS) $(CXX_SRCS) 76 | 77 | CC_OBJS := $(CC_SRCS:%.c=$(OBJ_PATH)/%.o) 78 | CXX_OBJS := $(CXX_SRCS:%.cc=$(OBJ_PATH)/%.o) 79 | 80 | OBJS := $(CXX_OBJS) $(CC_OBJS) 81 | 82 | default: all 83 | 84 | all: $(TARGET) 85 | 86 | $(TARGET): $(OBJS) 87 | $(CXX) $(CXXFLAGS) $(OPTFLAGS) -o $@ $^ $(LIBRARIES) 88 | 89 | $(OBJ_PATH)/%.o: %.cc 90 | @mkdir -p $(@D) 91 | $(CXX) $(CXXFLAGS) $(OPTFLAGS) -c $< -o $@ 92 | 93 | $(OBJ_PATH)/%.o: %.c 94 | @mkdir -p $(@D) 95 | $(CC) $(CFLAGS) $(OPTFLAGS) -c $< -o $@ 96 | 97 | DEPS = $(shell find $(OBJ_PATH) -name "*.d") 98 | -include $(DEPS) 99 | 100 | .PHONY: clean 101 | 102 | clean: 103 | rm -f $(OBJS) 104 | rm -f $(DEPS) 105 | rm -f $(TARGET) 106 | rm -rf $(OBJ_PATH) 107 | -------------------------------------------------------------------------------- /common/CLI/Argv.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Cincinnati, developed by Henry 
Schreiner 2 | // under NSF AWARD 1414736 and by the respective contributors. 3 | // All rights reserved. 4 | // 5 | // SPDX-License-Identifier: BSD-3-Clause 6 | 7 | #pragma once 8 | 9 | // [CLI11:public_includes:set] 10 | #include 11 | #include 12 | // [CLI11:public_includes:end] 13 | 14 | #include 15 | 16 | namespace CLI { 17 | // [CLI11:argv_hpp:verbatim] 18 | namespace detail { 19 | #ifdef _WIN32 20 | /// Decode and return UTF-8 argv from GetCommandLineW. 21 | CLI11_INLINE std::vector compute_win32_argv(); 22 | #endif 23 | } // namespace detail 24 | 25 | /// argc as passed in to this executable. 26 | CLI11_INLINE int argc(); 27 | 28 | /// argv as passed in to this executable, converted to utf-8 on Windows. 29 | CLI11_INLINE const char *const *argv(); 30 | 31 | // [CLI11:argv_hpp:end] 32 | } // namespace CLI 33 | 34 | #ifndef CLI11_COMPILE 35 | #include "impl/Argv_inl.hpp" 36 | #endif 37 | -------------------------------------------------------------------------------- /common/CLI/CLI.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner 2 | // under NSF AWARD 1414736 and by the respective contributors. 3 | // All rights reserved. 
4 | // 5 | // SPDX-License-Identifier: BSD-3-Clause 6 | 7 | #pragma once 8 | 9 | // CLI Library includes 10 | // Order is important for combiner script 11 | 12 | #include "Version.hpp" 13 | 14 | #include "Macros.hpp" 15 | 16 | #include "Encoding.hpp" 17 | 18 | #include "Argv.hpp" 19 | 20 | #include "StringTools.hpp" 21 | 22 | #include "Error.hpp" 23 | 24 | #include "TypeTools.hpp" 25 | 26 | #include "Split.hpp" 27 | 28 | #include "ConfigFwd.hpp" 29 | 30 | #include "Validators.hpp" 31 | 32 | #include "FormatterFwd.hpp" 33 | 34 | #include "Option.hpp" 35 | 36 | #include "App.hpp" 37 | 38 | #include "Config.hpp" 39 | 40 | #include "Formatter.hpp" 41 | -------------------------------------------------------------------------------- /common/CLI/Config.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner 2 | // under NSF AWARD 1414736 and by the respective contributors. 3 | // All rights reserved. 
4 | // 5 | // SPDX-License-Identifier: BSD-3-Clause 6 | 7 | #pragma once 8 | 9 | // [CLI11:public_includes:set] 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | // [CLI11:public_includes:end] 18 | 19 | #include "App.hpp" 20 | #include "ConfigFwd.hpp" 21 | #include "StringTools.hpp" 22 | 23 | namespace CLI { 24 | // [CLI11:config_hpp:verbatim] 25 | namespace detail { 26 | 27 | std::string convert_arg_for_ini(const std::string &arg, char stringQuote = '"', char characterQuote = '\''); 28 | 29 | /// Comma separated join, adds quotes if needed 30 | std::string ini_join(const std::vector &args, 31 | char sepChar = ',', 32 | char arrayStart = '[', 33 | char arrayEnd = ']', 34 | char stringQuote = '"', 35 | char characterQuote = '\''); 36 | 37 | std::vector generate_parents(const std::string &section, std::string &name, char parentSeparator); 38 | 39 | /// assuming non default segments do a check on the close and open of the segments in a configItem structure 40 | void checkParentSegments(std::vector &output, const std::string &currentSection, char parentSeparator); 41 | } // namespace detail 42 | 43 | // [CLI11:config_hpp:end] 44 | } // namespace CLI 45 | 46 | #ifndef CLI11_COMPILE 47 | #include "impl/Config_inl.hpp" 48 | #endif 49 | -------------------------------------------------------------------------------- /common/CLI/ConfigFwd.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner 2 | // under NSF AWARD 1414736 and by the respective contributors. 3 | // All rights reserved. 
4 | // 5 | // SPDX-License-Identifier: BSD-3-Clause 6 | 7 | #pragma once 8 | 9 | // [CLI11:public_includes:set] 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | // [CLI11:public_includes:end] 16 | 17 | #include "Error.hpp" 18 | #include "StringTools.hpp" 19 | 20 | namespace CLI { 21 | // [CLI11:config_fwd_hpp:verbatim] 22 | 23 | class App; 24 | 25 | /// Holds values to load into Options 26 | struct ConfigItem { 27 | /// This is the list of parents 28 | std::vector parents{}; 29 | 30 | /// This is the name 31 | std::string name{}; 32 | 33 | /// Listing of inputs 34 | std::vector inputs{}; 35 | 36 | /// The list of parents and name joined by "." 37 | CLI11_NODISCARD std::string fullname() const { 38 | std::vector tmp = parents; 39 | tmp.emplace_back(name); 40 | return detail::join(tmp, "."); 41 | } 42 | }; 43 | 44 | /// This class provides a converter for configuration files. 45 | class Config { 46 | protected: 47 | std::vector items{}; 48 | 49 | public: 50 | /// Convert an app into a configuration 51 | virtual std::string to_config(const App *, bool, bool, std::string) const = 0; 52 | 53 | /// Convert a configuration into an app 54 | virtual std::vector from_config(std::istream &) const = 0; 55 | 56 | /// Get a flag value 57 | CLI11_NODISCARD virtual std::string to_flag(const ConfigItem &item) const { 58 | if(item.inputs.size() == 1) { 59 | return item.inputs.at(0); 60 | } 61 | if(item.inputs.empty()) { 62 | return "{}"; 63 | } 64 | throw ConversionError::TooManyInputsFlag(item.fullname()); // LCOV_EXCL_LINE 65 | } 66 | 67 | /// Parse a config file, throw an error (ParseError:ConfigParseError or FileError) on failure 68 | CLI11_NODISCARD std::vector from_file(const std::string &name) const { 69 | std::ifstream input{name}; 70 | if(!input.good()) 71 | throw FileError::Missing(name); 72 | 73 | return from_config(input); 74 | } 75 | 76 | /// Virtual destructor 77 | virtual ~Config() = default; 78 | }; 79 | 80 | /// This converter works with 
INI/TOML files; to write INI files use ConfigINI 81 | class ConfigBase : public Config { 82 | protected: 83 | /// the character used for comments 84 | char commentChar = '#'; 85 | /// the character used to start an array '\0' is a default to not use 86 | char arrayStart = '['; 87 | /// the character used to end an array '\0' is a default to not use 88 | char arrayEnd = ']'; 89 | /// the character used to separate elements in an array 90 | char arraySeparator = ','; 91 | /// the character used to separate the name from the value 92 | char valueDelimiter = '='; 93 | /// the character to use around strings 94 | char stringQuote = '"'; 95 | /// the character to use around single characters 96 | char characterQuote = '\''; 97 | /// the maximum number of layers to allow 98 | uint8_t maximumLayers{255}; 99 | /// the separator used to separate parent layers 100 | char parentSeparatorChar{'.'}; 101 | /// Specify the configuration index to use for arrayed sections 102 | int16_t configIndex{-1}; 103 | /// Specify the configuration section that should be used 104 | std::string configSection{}; 105 | 106 | public: 107 | std::string 108 | to_config(const App * /*app*/, bool default_also, bool write_description, std::string prefix) const override; 109 | 110 | std::vector from_config(std::istream &input) const override; 111 | /// Specify the configuration for comment characters 112 | ConfigBase *comment(char cchar) { 113 | commentChar = cchar; 114 | return this; 115 | } 116 | /// Specify the start and end characters for an array 117 | ConfigBase *arrayBounds(char aStart, char aEnd) { 118 | arrayStart = aStart; 119 | arrayEnd = aEnd; 120 | return this; 121 | } 122 | /// Specify the delimiter character for an array 123 | ConfigBase *arrayDelimiter(char aSep) { 124 | arraySeparator = aSep; 125 | return this; 126 | } 127 | /// Specify the delimiter between a name and value 128 | ConfigBase *valueSeparator(char vSep) { 129 | valueDelimiter = vSep; 130 | return this; 131 | } 132 | /// 
Specify the quote characters used around strings and characters 133 | ConfigBase *quoteCharacter(char qString, char qChar) { 134 | stringQuote = qString; 135 | characterQuote = qChar; 136 | return this; 137 | } 138 | /// Specify the maximum number of parents 139 | ConfigBase *maxLayers(uint8_t layers) { 140 | maximumLayers = layers; 141 | return this; 142 | } 143 | /// Specify the separator to use for parent layers 144 | ConfigBase *parentSeparator(char sep) { 145 | parentSeparatorChar = sep; 146 | return this; 147 | } 148 | /// get a reference to the configuration section 149 | std::string &sectionRef() { return configSection; } 150 | /// get the section 151 | CLI11_NODISCARD const std::string &section() const { return configSection; } 152 | /// specify a particular section of the configuration file to use 153 | ConfigBase *section(const std::string &sectionName) { 154 | configSection = sectionName; 155 | return this; 156 | } 157 | 158 | /// get a reference to the configuration index 159 | int16_t &indexRef() { return configIndex; } 160 | /// get the section index 161 | CLI11_NODISCARD int16_t index() const { return configIndex; } 162 | /// specify a particular index in the section to use (-1) for all sections to use 163 | ConfigBase *index(int16_t sectionIndex) { 164 | configIndex = sectionIndex; 165 | return this; 166 | } 167 | }; 168 | 169 | /// the default Config is the TOML file format 170 | using ConfigTOML = ConfigBase; 171 | 172 | /// ConfigINI generates a "standard" INI compliant output 173 | class ConfigINI : public ConfigTOML { 174 | 175 | public: 176 | ConfigINI() { 177 | commentChar = ';'; 178 | arrayStart = '\0'; 179 | arrayEnd = '\0'; 180 | arraySeparator = ' '; 181 | valueDelimiter = '='; 182 | } 183 | }; 184 | // [CLI11:config_fwd_hpp:end] 185 | } // namespace CLI 186 | -------------------------------------------------------------------------------- /common/CLI/Encoding.hpp: -------------------------------------------------------------------------------- 1 | 
// Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner 2 | // under NSF AWARD 1414736 and by the respective contributors. 3 | // All rights reserved. 4 | // 5 | // SPDX-License-Identifier: BSD-3-Clause 6 | 7 | #pragma once 8 | 9 | #include 10 | 11 | // [CLI11:public_includes:set] 12 | #include 13 | // [CLI11:public_includes:end] 14 | 15 | // [CLI11:encoding_includes:verbatim] 16 | #ifdef CLI11_CPP17 17 | #include 18 | #endif // CLI11_CPP17 19 | 20 | #if defined CLI11_HAS_FILESYSTEM && CLI11_HAS_FILESYSTEM > 0 21 | #include 22 | #include // NOLINT(build/include) 23 | #endif // CLI11_HAS_FILESYSTEM 24 | // [CLI11:encoding_includes:end] 25 | 26 | namespace CLI { 27 | // [CLI11:encoding_hpp:verbatim] 28 | 29 | /// Convert a wide string to a narrow string. 30 | CLI11_INLINE std::string narrow(const std::wstring &str); 31 | CLI11_INLINE std::string narrow(const wchar_t *str); 32 | CLI11_INLINE std::string narrow(const wchar_t *str, std::size_t size); 33 | 34 | /// Convert a narrow string to a wide string. 35 | CLI11_INLINE std::wstring widen(const std::string &str); 36 | CLI11_INLINE std::wstring widen(const char *str); 37 | CLI11_INLINE std::wstring widen(const char *str, std::size_t size); 38 | 39 | #ifdef CLI11_CPP17 40 | CLI11_INLINE std::string narrow(std::wstring_view str); 41 | CLI11_INLINE std::wstring widen(std::string_view str); 42 | #endif // CLI11_CPP17 43 | 44 | #if defined CLI11_HAS_FILESYSTEM && CLI11_HAS_FILESYSTEM > 0 45 | /// Convert a char-string to a native path correctly. 
46 | CLI11_INLINE std::filesystem::path to_path(std::string_view str); 47 | #endif // CLI11_HAS_FILESYSTEM 48 | 49 | // [CLI11:encoding_hpp:end] 50 | } // namespace CLI 51 | 52 | #ifndef CLI11_COMPILE 53 | #include "impl/Encoding_inl.hpp" 54 | #endif 55 | -------------------------------------------------------------------------------- /common/CLI/Formatter.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner 2 | // under NSF AWARD 1414736 and by the respective contributors. 3 | // All rights reserved. 4 | // 5 | // SPDX-License-Identifier: BSD-3-Clause 6 | 7 | #pragma once 8 | 9 | // [CLI11:public_includes:set] 10 | #include 11 | #include 12 | #include 13 | // [CLI11:public_includes:end] 14 | 15 | #include "App.hpp" 16 | #include "FormatterFwd.hpp" 17 | 18 | namespace CLI { 19 | // [CLI11:formatter_hpp:verbatim] 20 | // [CLI11:formatter_hpp:end] 21 | } // namespace CLI 22 | 23 | #ifndef CLI11_COMPILE 24 | #include "impl/Formatter_inl.hpp" 25 | #endif 26 | -------------------------------------------------------------------------------- /common/CLI/FormatterFwd.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner 2 | // under NSF AWARD 1414736 and by the respective contributors. 3 | // All rights reserved. 4 | // 5 | // SPDX-License-Identifier: BSD-3-Clause 6 | 7 | #pragma once 8 | 9 | // [CLI11:public_includes:set] 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | // [CLI11:public_includes:end] 16 | 17 | #include "StringTools.hpp" 18 | 19 | namespace CLI { 20 | // [CLI11:formatter_fwd_hpp:verbatim] 21 | 22 | class Option; 23 | class App; 24 | 25 | /// This enum signifies the type of help requested 26 | /// 27 | /// This is passed in by App; all user classes must accept this as 28 | /// the second argument. 
29 | 30 | enum class AppFormatMode { 31 | Normal, ///< The normal, detailed help 32 | All, ///< A fully expanded help 33 | Sub, ///< Used when printed as part of expanded subcommand 34 | }; 35 | 36 | /// This is the minimum requirements to run a formatter. 37 | /// 38 | /// A user can subclass this is if they do not care at all 39 | /// about the structure in CLI::Formatter. 40 | class FormatterBase { 41 | protected: 42 | /// @name Options 43 | ///@{ 44 | 45 | /// The width of the first column 46 | std::size_t column_width_{30}; 47 | 48 | /// @brief The required help printout labels (user changeable) 49 | /// Values are Needs, Excludes, etc. 50 | std::map labels_{}; 51 | 52 | ///@} 53 | /// @name Basic 54 | ///@{ 55 | 56 | public: 57 | FormatterBase() = default; 58 | FormatterBase(const FormatterBase &) = default; 59 | FormatterBase(FormatterBase &&) = default; 60 | FormatterBase &operator=(const FormatterBase &) = default; 61 | FormatterBase &operator=(FormatterBase &&) = default; 62 | 63 | /// Adding a destructor in this form to work around bug in GCC 4.7 64 | virtual ~FormatterBase() noexcept {} // NOLINT(modernize-use-equals-default) 65 | 66 | /// This is the key method that puts together help 67 | virtual std::string make_help(const App *, std::string, AppFormatMode) const = 0; 68 | 69 | ///@} 70 | /// @name Setters 71 | ///@{ 72 | 73 | /// Set the "REQUIRED" label 74 | void label(std::string key, std::string val) { labels_[key] = val; } 75 | 76 | /// Set the column width 77 | void column_width(std::size_t val) { column_width_ = val; } 78 | 79 | ///@} 80 | /// @name Getters 81 | ///@{ 82 | 83 | /// Get the current value of a name (REQUIRED, etc.) 
84 | CLI11_NODISCARD std::string get_label(std::string key) const { 85 | if(labels_.find(key) == labels_.end()) 86 | return key; 87 | return labels_.at(key); 88 | } 89 | 90 | /// Get the current column width 91 | CLI11_NODISCARD std::size_t get_column_width() const { return column_width_; } 92 | 93 | ///@} 94 | }; 95 | 96 | /// This is a specialty override for lambda functions 97 | class FormatterLambda final : public FormatterBase { 98 | using funct_t = std::function; 99 | 100 | /// The lambda to hold and run 101 | funct_t lambda_; 102 | 103 | public: 104 | /// Create a FormatterLambda with a lambda function 105 | explicit FormatterLambda(funct_t funct) : lambda_(std::move(funct)) {} 106 | 107 | /// Adding a destructor (mostly to make GCC 4.7 happy) 108 | ~FormatterLambda() noexcept override {} // NOLINT(modernize-use-equals-default) 109 | 110 | /// This will simply call the lambda function 111 | std::string make_help(const App *app, std::string name, AppFormatMode mode) const override { 112 | return lambda_(app, name, mode); 113 | } 114 | }; 115 | 116 | /// This is the default Formatter for CLI11. It pretty prints help output, and is broken into quite a few 117 | /// overridable methods, to be highly customizable with minimal effort. 
118 | class Formatter : public FormatterBase { 119 | public: 120 | Formatter() = default; 121 | Formatter(const Formatter &) = default; 122 | Formatter(Formatter &&) = default; 123 | Formatter &operator=(const Formatter &) = default; 124 | Formatter &operator=(Formatter &&) = default; 125 | 126 | /// @name Overridables 127 | ///@{ 128 | 129 | /// This prints out a group of options with title 130 | /// 131 | CLI11_NODISCARD virtual std::string 132 | make_group(std::string group, bool is_positional, std::vector opts) const; 133 | 134 | /// This prints out just the positionals "group" 135 | virtual std::string make_positionals(const App *app) const; 136 | 137 | /// This prints out all the groups of options 138 | std::string make_groups(const App *app, AppFormatMode mode) const; 139 | 140 | /// This prints out all the subcommands 141 | virtual std::string make_subcommands(const App *app, AppFormatMode mode) const; 142 | 143 | /// This prints out a subcommand 144 | virtual std::string make_subcommand(const App *sub) const; 145 | 146 | /// This prints out a subcommand in help-all 147 | virtual std::string make_expanded(const App *sub) const; 148 | 149 | /// This prints out all the groups of options 150 | virtual std::string make_footer(const App *app) const; 151 | 152 | /// This displays the description line 153 | virtual std::string make_description(const App *app) const; 154 | 155 | /// This displays the usage line 156 | virtual std::string make_usage(const App *app, std::string name) const; 157 | 158 | /// This puts everything together 159 | std::string make_help(const App * /*app*/, std::string, AppFormatMode) const override; 160 | 161 | ///@} 162 | /// @name Options 163 | ///@{ 164 | 165 | /// This prints out an option help line, either positional or optional form 166 | virtual std::string make_option(const Option *opt, bool is_positional) const { 167 | std::stringstream out; 168 | detail::format_help( 169 | out, make_option_name(opt, is_positional) + 
make_option_opts(opt), make_option_desc(opt), column_width_); 170 | return out.str(); 171 | } 172 | 173 | /// @brief This is the name part of an option, Default: left column 174 | virtual std::string make_option_name(const Option *, bool) const; 175 | 176 | /// @brief This is the options part of the name, Default: combined into left column 177 | virtual std::string make_option_opts(const Option *) const; 178 | 179 | /// @brief This is the description. Default: Right column, on new line if left column too large 180 | virtual std::string make_option_desc(const Option *) const; 181 | 182 | /// @brief This is used to print the name on the USAGE line 183 | virtual std::string make_option_usage(const Option *opt) const; 184 | 185 | ///@} 186 | }; 187 | 188 | // [CLI11:formatter_fwd_hpp:end] 189 | } // namespace CLI 190 | -------------------------------------------------------------------------------- /common/CLI/Macros.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner 2 | // under NSF AWARD 1414736 and by the respective contributors. 3 | // All rights reserved. 
4 | // 5 | // SPDX-License-Identifier: BSD-3-Clause 6 | 7 | #pragma once 8 | 9 | // [CLI11:macros_hpp:verbatim] 10 | 11 | // The following version macro is very similar to the one in pybind11 12 | #if !(defined(_MSC_VER) && __cplusplus == 199711L) && !defined(__INTEL_COMPILER) 13 | #if __cplusplus >= 201402L 14 | #define CLI11_CPP14 15 | #if __cplusplus >= 201703L 16 | #define CLI11_CPP17 17 | #if __cplusplus > 201703L 18 | #define CLI11_CPP20 19 | #endif 20 | #endif 21 | #endif 22 | #elif defined(_MSC_VER) && __cplusplus == 199711L 23 | // MSVC sets _MSVC_LANG rather than __cplusplus (supposedly until the standard is fully implemented) 24 | // Unless you use the /Zc:__cplusplus flag on Visual Studio 2017 15.7 Preview 3 or newer 25 | #if _MSVC_LANG >= 201402L 26 | #define CLI11_CPP14 27 | #if _MSVC_LANG > 201402L && _MSC_VER >= 1910 28 | #define CLI11_CPP17 29 | #if _MSVC_LANG > 201703L && _MSC_VER >= 1910 30 | #define CLI11_CPP20 31 | #endif 32 | #endif 33 | #endif 34 | #endif 35 | 36 | #if defined(CLI11_CPP14) 37 | #define CLI11_DEPRECATED(reason) [[deprecated(reason)]] 38 | #elif defined(_MSC_VER) 39 | #define CLI11_DEPRECATED(reason) __declspec(deprecated(reason)) 40 | #else 41 | #define CLI11_DEPRECATED(reason) __attribute__((deprecated(reason))) 42 | #endif 43 | 44 | // GCC < 10 doesn't ignore this in unevaluated contexts 45 | #if !defined(CLI11_CPP17) || \ 46 | (defined(__GNUC__) && !defined(__llvm__) && !defined(__INTEL_COMPILER) && __GNUC__ < 10 && __GNUC__ > 4) 47 | #define CLI11_NODISCARD 48 | #else 49 | #define CLI11_NODISCARD [[nodiscard]] 50 | #endif 51 | 52 | /** detection of rtti */ 53 | #ifndef CLI11_USE_STATIC_RTTI 54 | #if(defined(_HAS_STATIC_RTTI) && _HAS_STATIC_RTTI) 55 | #define CLI11_USE_STATIC_RTTI 1 56 | #elif defined(__cpp_rtti) 57 | #if(defined(_CPPRTTI) && _CPPRTTI == 0) 58 | #define CLI11_USE_STATIC_RTTI 1 59 | #else 60 | #define CLI11_USE_STATIC_RTTI 0 61 | #endif 62 | #elif(defined(__GCC_RTTI) && __GXX_RTTI) 63 | #define 
CLI11_USE_STATIC_RTTI 0 64 | #else 65 | #define CLI11_USE_STATIC_RTTI 1 66 | #endif 67 | #endif 68 | 69 | /** availability */ 70 | #if defined CLI11_CPP17 && defined __has_include && !defined CLI11_HAS_FILESYSTEM 71 | #if __has_include() 72 | // Filesystem cannot be used if targeting macOS < 10.15 73 | #if defined __MAC_OS_X_VERSION_MIN_REQUIRED && __MAC_OS_X_VERSION_MIN_REQUIRED < 101500 74 | #define CLI11_HAS_FILESYSTEM 0 75 | #elif defined(__wasi__) 76 | // As of wasi-sdk-14, filesystem is not implemented 77 | #define CLI11_HAS_FILESYSTEM 0 78 | #else 79 | #include 80 | #if defined __cpp_lib_filesystem && __cpp_lib_filesystem >= 201703 81 | #if defined _GLIBCXX_RELEASE && _GLIBCXX_RELEASE >= 9 82 | #define CLI11_HAS_FILESYSTEM 1 83 | #elif defined(__GLIBCXX__) 84 | // if we are using gcc and Version <9 default to no filesystem 85 | #define CLI11_HAS_FILESYSTEM 0 86 | #else 87 | #define CLI11_HAS_FILESYSTEM 1 88 | #endif 89 | #else 90 | #define CLI11_HAS_FILESYSTEM 0 91 | #endif 92 | #endif 93 | #endif 94 | #endif 95 | 96 | /** availability */ 97 | #if defined(__GNUC__) && !defined(__llvm__) && !defined(__INTEL_COMPILER) && __GNUC__ < 5 98 | #define CLI11_HAS_CODECVT 0 99 | #else 100 | #define CLI11_HAS_CODECVT 1 101 | #include 102 | #endif 103 | 104 | /** disable deprecations */ 105 | #if defined(__GNUC__) // GCC or clang 106 | #define CLI11_DIAGNOSTIC_PUSH _Pragma("GCC diagnostic push") 107 | #define CLI11_DIAGNOSTIC_POP _Pragma("GCC diagnostic pop") 108 | 109 | #define CLI11_DIAGNOSTIC_IGNORE_DEPRECATED _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") 110 | 111 | #elif defined(_MSC_VER) 112 | #define CLI11_DIAGNOSTIC_PUSH __pragma(warning(push)) 113 | #define CLI11_DIAGNOSTIC_POP __pragma(warning(pop)) 114 | 115 | #define CLI11_DIAGNOSTIC_IGNORE_DEPRECATED __pragma(warning(disable : 4996)) 116 | 117 | #else 118 | #define CLI11_DIAGNOSTIC_PUSH 119 | #define CLI11_DIAGNOSTIC_POP 120 | 121 | #define CLI11_DIAGNOSTIC_IGNORE_DEPRECATED 122 | 123 | 
#endif 124 | 125 | /** Inline macro **/ 126 | #ifdef CLI11_COMPILE 127 | #define CLI11_INLINE 128 | #else 129 | #define CLI11_INLINE inline 130 | #endif 131 | // [CLI11:macros_hpp:end] 132 | -------------------------------------------------------------------------------- /common/CLI/Split.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner 2 | // under NSF AWARD 1414736 and by the respective contributors. 3 | // All rights reserved. 4 | // 5 | // SPDX-License-Identifier: BSD-3-Clause 6 | 7 | #pragma once 8 | 9 | // [CLI11:public_includes:set] 10 | #include 11 | #include 12 | #include 13 | #include 14 | // [CLI11:public_includes:end] 15 | 16 | #include "Macros.hpp" 17 | 18 | namespace CLI { 19 | // [CLI11:split_hpp:verbatim] 20 | 21 | namespace detail { 22 | 23 | // Returns false if not a short option. Otherwise, sets opt name and rest and returns true 24 | CLI11_INLINE bool split_short(const std::string &current, std::string &name, std::string &rest); 25 | 26 | // Returns false if not a long option. Otherwise, sets opt name and other side of = and returns true 27 | CLI11_INLINE bool split_long(const std::string &current, std::string &name, std::string &value); 28 | 29 | // Returns false if not a windows style option. Otherwise, sets opt name and value and returns true 30 | CLI11_INLINE bool split_windows_style(const std::string &current, std::string &name, std::string &value); 31 | 32 | // Splits a string into multiple long and short names 33 | CLI11_INLINE std::vector split_names(std::string current); 34 | 35 | /// extract default flag values either {def} or starting with a ! 
36 | CLI11_INLINE std::vector> get_default_flag_values(const std::string &str); 37 | 38 | /// Get a vector of short names, one of long names, and a single name 39 | CLI11_INLINE std::tuple, std::vector, std::string> 40 | get_names(const std::vector &input); 41 | 42 | } // namespace detail 43 | // [CLI11:split_hpp:end] 44 | } // namespace CLI 45 | 46 | #ifndef CLI11_COMPILE 47 | #include "impl/Split_inl.hpp" 48 | #endif 49 | -------------------------------------------------------------------------------- /common/CLI/Timer.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner 2 | // under NSF AWARD 1414736 and by the respective contributors. 3 | // All rights reserved. 4 | // 5 | // SPDX-License-Identifier: BSD-3-Clause 6 | 7 | #pragma once 8 | 9 | // On GCC < 4.8, the following define is often missing. Due to the 10 | // fact that this library only uses sleep_for, this should be safe 11 | #if defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 5 && __GNUC_MINOR__ < 8 12 | #define _GLIBCXX_USE_NANOSLEEP 13 | #endif 14 | 15 | #include 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | namespace CLI { 25 | 26 | /// This is a simple timer with pretty printing. Creating the timer starts counting. 
27 | class Timer { 28 | protected: 29 | /// This is a typedef to make clocks easier to use 30 | using clock = std::chrono::steady_clock; 31 | 32 | /// This typedef is for points in time 33 | using time_point = std::chrono::time_point; 34 | 35 | /// This is the type of a printing function, you can make your own 36 | using time_print_t = std::function; 37 | 38 | /// This is the title of the timer 39 | std::string title_; 40 | 41 | /// This is the function that is used to format most of the timing message 42 | time_print_t time_print_; 43 | 44 | /// This is the starting point (when the timer was created) 45 | time_point start_; 46 | 47 | /// This is the number of times cycles (print divides by this number) 48 | std::size_t cycles{1}; 49 | 50 | public: 51 | /// Standard print function, this one is set by default 52 | static std::string Simple(std::string title, std::string time) { return title + ": " + time; } 53 | 54 | /// This is a fancy print function with --- headers 55 | static std::string Big(std::string title, std::string time) { 56 | return std::string("-----------------------------------------\n") + "| " + title + " | Time = " + time + "\n" + 57 | "-----------------------------------------"; 58 | } 59 | 60 | public: 61 | /// Standard constructor, can set title and print function 62 | explicit Timer(std::string title = "Timer", time_print_t time_print = Simple) 63 | : title_(std::move(title)), time_print_(std::move(time_print)), start_(clock::now()) {} 64 | 65 | /// Time a function by running it multiple times. Target time is the len to target. 
66 | std::string time_it(std::function f, double target_time = 1) { 67 | time_point start = start_; 68 | double total_time = NAN; 69 | 70 | start_ = clock::now(); 71 | std::size_t n = 0; 72 | do { 73 | f(); 74 | std::chrono::duration elapsed = clock::now() - start_; 75 | total_time = elapsed.count(); 76 | } while(n++ < 100u && total_time < target_time); 77 | 78 | std::string out = make_time_str(total_time / static_cast(n)) + " for " + std::to_string(n) + " tries"; 79 | start_ = start; 80 | return out; 81 | } 82 | 83 | /// This formats the numerical value for the time string 84 | std::string make_time_str() const { // NOLINT(modernize-use-nodiscard) 85 | time_point stop = clock::now(); 86 | std::chrono::duration elapsed = stop - start_; 87 | double time = elapsed.count() / static_cast(cycles); 88 | return make_time_str(time); 89 | } 90 | 91 | // LCOV_EXCL_START 92 | /// This prints out a time string from a time 93 | std::string make_time_str(double time) const { // NOLINT(modernize-use-nodiscard) 94 | auto print_it = [](double x, std::string unit) { 95 | const unsigned int buffer_length = 50; 96 | std::array buffer; 97 | std::snprintf(buffer.data(), buffer_length, "%.5g", x); 98 | return buffer.data() + std::string(" ") + unit; 99 | }; 100 | 101 | if(time < .000001) 102 | return print_it(time * 1000000000, "ns"); 103 | if(time < .001) 104 | return print_it(time * 1000000, "us"); 105 | if(time < 1) 106 | return print_it(time * 1000, "ms"); 107 | return print_it(time, "s"); 108 | } 109 | // LCOV_EXCL_STOP 110 | 111 | /// This is the main function, it creates a string 112 | std::string to_string() const { return time_print_(title_, make_time_str()); } // NOLINT(modernize-use-nodiscard) 113 | 114 | /// Division sets the number of cycles to divide by (no graphical change) 115 | Timer &operator/(std::size_t val) { 116 | cycles = val; 117 | return *this; 118 | } 119 | }; 120 | 121 | /// This class prints out the time upon destruction 122 | class AutoTimer : public Timer { 
123 | public: 124 | /// Reimplementing the constructor is required in GCC 4.7 125 | explicit AutoTimer(std::string title = "Timer", time_print_t time_print = Simple) : Timer(title, time_print) {} 126 | // GCC 4.7 does not support using inheriting constructors. 127 | 128 | /// This destructor prints the string 129 | ~AutoTimer() { std::cout << to_string() << std::endl; } 130 | }; 131 | 132 | } // namespace CLI 133 | 134 | /// This prints out the time if shifted into a std::cout like stream. 135 | inline std::ostream &operator<<(std::ostream &in, const CLI::Timer &timer) { return in << timer.to_string(); } 136 | -------------------------------------------------------------------------------- /common/CLI/Version.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner 2 | // under NSF AWARD 1414736 and by the respective contributors. 3 | // All rights reserved. 4 | // 5 | // SPDX-License-Identifier: BSD-3-Clause 6 | 7 | #pragma once 8 | 9 | // [CLI11:version_hpp:verbatim] 10 | 11 | #define CLI11_VERSION_MAJOR 2 12 | #define CLI11_VERSION_MINOR 3 13 | #define CLI11_VERSION_PATCH 2 14 | #define CLI11_VERSION "2.3.2" 15 | 16 | // [CLI11:version_hpp:end] 17 | -------------------------------------------------------------------------------- /common/CLI/impl/Argv_inl.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner 2 | // under NSF AWARD 1414736 and by the respective contributors. 3 | // All rights reserved. 
4 | // 5 | // SPDX-License-Identifier: BSD-3-Clause 6 | 7 | #pragma once 8 | 9 | // This include is only needed for IDEs to discover symbols 10 | #include 11 | 12 | #include 13 | 14 | // [CLI11:public_includes:set] 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | // [CLI11:public_includes:end] 21 | 22 | // [CLI11:argv_inl_includes:verbatim] 23 | #if defined(_WIN32) 24 | #if !(defined(_AMD64_) || defined(_X86_) || defined(_ARM_)) 25 | #if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || \ 26 | defined(_M_AMD64) 27 | #define _AMD64_ 28 | #elif defined(i386) || defined(__i386) || defined(__i386__) || defined(__i386__) || defined(_M_IX86) 29 | #define _X86_ 30 | #elif defined(__arm__) || defined(_M_ARM) || defined(_M_ARMT) 31 | #define _ARM_ 32 | #elif defined(__aarch64__) || defined(_M_ARM64) 33 | #define _ARM64_ 34 | #elif defined(_M_ARM64EC) 35 | #define _ARM64EC_ 36 | #endif 37 | #endif 38 | 39 | // first 40 | #ifndef NOMINMAX 41 | // if NOMINMAX is already defined we don't want to mess with that either way 42 | #define NOMINMAX 43 | #include 44 | #undef NOMINMAX 45 | #else 46 | #include 47 | #endif 48 | 49 | // second 50 | #include 51 | // third 52 | #include 53 | #include 54 | 55 | #elif defined(__APPLE__) 56 | #include 57 | #endif 58 | // [CLI11:argv_inl_includes:end] 59 | 60 | namespace CLI { 61 | // [CLI11:argv_inl_hpp:verbatim] 62 | 63 | namespace detail { 64 | 65 | #ifdef __APPLE__ 66 | // Copy argc and argv as early as possible to avoid modification 67 | static const std::vector static_args = [] { 68 | static const std::vector static_args_as_strings = [] { 69 | std::vector args_as_strings; 70 | int argc = *_NSGetArgc(); 71 | char **argv = *_NSGetArgv(); 72 | 73 | args_as_strings.reserve(static_cast(argc)); 74 | for(size_t i = 0; i < static_cast(argc); i++) { 75 | args_as_strings.push_back(argv[i]); 76 | } 77 | 78 | return args_as_strings; 79 | }(); 80 | 81 | std::vector 
static_args_result; 82 | static_args_result.reserve(static_args_as_strings.size()); 83 | 84 | for(const auto &arg : static_args_as_strings) { 85 | static_args_result.push_back(arg.data()); 86 | } 87 | 88 | return static_args_result; 89 | }(); 90 | #endif 91 | 92 | #ifdef _WIN32 93 | CLI11_INLINE std::vector compute_win32_argv() { 94 | std::vector result; 95 | int argc = 0; 96 | 97 | auto deleter = [](wchar_t **ptr) { LocalFree(ptr); }; 98 | // NOLINTBEGIN(*-avoid-c-arrays) 99 | auto wargv = std::unique_ptr(CommandLineToArgvW(GetCommandLineW(), &argc), deleter); 100 | // NOLINTEND(*-avoid-c-arrays) 101 | 102 | if(wargv == nullptr) { 103 | throw std::runtime_error("CommandLineToArgvW failed with code " + std::to_string(GetLastError())); 104 | } 105 | 106 | result.reserve(static_cast(argc)); 107 | for(size_t i = 0; i < static_cast(argc); ++i) { 108 | result.push_back(narrow(wargv[i])); 109 | } 110 | 111 | return result; 112 | } 113 | #endif 114 | 115 | /// Command-line arguments, as passed in to this executable, converted to utf-8 on Windows. 
116 | CLI11_INLINE const std::vector &args() { 117 | // This function uses initialization via lambdas extensively to take advantage of the thread safety of static 118 | // variable initialization [stmt.dcl.3] 119 | 120 | #ifdef _WIN32 121 | static const std::vector static_args = [] { 122 | static const std::vector static_args_as_strings = compute_win32_argv(); 123 | 124 | std::vector static_args_result; 125 | static_args_result.reserve(static_args_as_strings.size()); 126 | 127 | for(const auto &arg : static_args_as_strings) { 128 | static_args_result.push_back(arg.data()); 129 | } 130 | 131 | return static_args_result; 132 | }(); 133 | 134 | return static_args; 135 | 136 | #elif defined(__APPLE__) 137 | 138 | return static_args; 139 | 140 | #else 141 | static const std::vector static_args = [] { 142 | static const std::vector static_cmdline = [] { 143 | // On posix, retrieve arguments from /proc/self/cmdline, separated by null terminators. 144 | std::vector cmdline; 145 | 146 | auto deleter = [](FILE *f) { std::fclose(f); }; 147 | std::unique_ptr fp_unique(std::fopen("/proc/self/cmdline", "r"), deleter); 148 | FILE *fp = fp_unique.get(); 149 | if(!fp) { 150 | throw std::runtime_error("could not open /proc/self/cmdline for reading"); // LCOV_EXCL_LINE 151 | } 152 | 153 | size_t size = 0; 154 | while(std::feof(fp) == 0) { 155 | cmdline.resize(size + 128); 156 | size += std::fread(cmdline.data() + size, 1, 128, fp); 157 | 158 | if(std::ferror(fp) != 0) { 159 | throw std::runtime_error("error during reading /proc/self/cmdline"); // LCOV_EXCL_LINE 160 | } 161 | } 162 | cmdline.resize(size); 163 | 164 | return cmdline; 165 | }(); 166 | 167 | std::size_t argc = static_cast(std::count(static_cmdline.begin(), static_cmdline.end(), '\0')); 168 | std::vector static_args_result; 169 | static_args_result.reserve(argc); 170 | 171 | for(auto it = static_cmdline.begin(); it != static_cmdline.end(); 172 | it = std::find(it, static_cmdline.end(), '\0') + 1) { 173 | 
static_args_result.push_back(static_cmdline.data() + (it - static_cmdline.begin())); 174 | } 175 | 176 | return static_args_result; 177 | }(); 178 | 179 | return static_args; 180 | #endif 181 | } 182 | 183 | } // namespace detail 184 | 185 | CLI11_INLINE const char *const *argv() { return detail::args().data(); } 186 | CLI11_INLINE int argc() { return static_cast(detail::args().size()); } 187 | 188 | // [CLI11:argv_inl_hpp:end] 189 | } // namespace CLI 190 | -------------------------------------------------------------------------------- /common/CLI/impl/Encoding_inl.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner 2 | // under NSF AWARD 1414736 and by the respective contributors. 3 | // All rights reserved. 4 | // 5 | // SPDX-License-Identifier: BSD-3-Clause 6 | 7 | #pragma once 8 | 9 | // This include is only needed for IDEs to discover symbols 10 | #include 11 | #include 12 | 13 | // [CLI11:public_includes:set] 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | // [CLI11:public_includes:end] 25 | 26 | namespace CLI { 27 | // [CLI11:encoding_inl_hpp:verbatim] 28 | 29 | namespace detail { 30 | 31 | #if !CLI11_HAS_CODECVT 32 | /// Attempt to set one of the acceptable unicode locales for conversion 33 | CLI11_INLINE void set_unicode_locale() { 34 | static const std::array unicode_locales{{"C.UTF-8", "en_US.UTF-8", ".UTF-8"}}; 35 | 36 | for(const auto &locale_name : unicode_locales) { 37 | if(std::setlocale(LC_ALL, locale_name) != nullptr) { 38 | return; 39 | } 40 | } 41 | throw std::runtime_error("CLI::narrow: could not set locale to C.UTF-8"); 42 | } 43 | 44 | template struct scope_guard_t { 45 | F closure; 46 | 47 | explicit scope_guard_t(F closure_) : closure(closure_) {} 48 | ~scope_guard_t() { closure(); } 49 | }; 50 | 51 | template 
CLI11_NODISCARD CLI11_INLINE scope_guard_t scope_guard(F &&closure) { 52 | return scope_guard_t{std::forward(closure)}; 53 | } 54 | 55 | #endif // !CLI11_HAS_CODECVT 56 | 57 | CLI11_DIAGNOSTIC_PUSH 58 | CLI11_DIAGNOSTIC_IGNORE_DEPRECATED 59 | 60 | CLI11_INLINE std::string narrow_impl(const wchar_t *str, std::size_t str_size) { 61 | #if CLI11_HAS_CODECVT 62 | #ifdef _WIN32 63 | return std::wstring_convert>().to_bytes(str, str + str_size); 64 | 65 | #else 66 | return std::wstring_convert>().to_bytes(str, str + str_size); 67 | 68 | #endif // _WIN32 69 | #else // CLI11_HAS_CODECVT 70 | (void)str_size; 71 | std::mbstate_t state = std::mbstate_t(); 72 | const wchar_t *it = str; 73 | 74 | std::string old_locale = std::setlocale(LC_ALL, nullptr); 75 | auto sg = scope_guard([&] { std::setlocale(LC_ALL, old_locale.c_str()); }); 76 | set_unicode_locale(); 77 | 78 | std::size_t new_size = std::wcsrtombs(nullptr, &it, 0, &state); 79 | if(new_size == static_cast(-1)) { 80 | throw std::runtime_error("CLI::narrow: conversion error in std::wcsrtombs at offset " + 81 | std::to_string(it - str)); 82 | } 83 | std::string result(new_size, '\0'); 84 | std::wcsrtombs(const_cast(result.data()), &str, new_size, &state); 85 | 86 | return result; 87 | 88 | #endif // CLI11_HAS_CODECVT 89 | } 90 | 91 | CLI11_INLINE std::wstring widen_impl(const char *str, std::size_t str_size) { 92 | #if CLI11_HAS_CODECVT 93 | #ifdef _WIN32 94 | return std::wstring_convert>().from_bytes(str, str + str_size); 95 | 96 | #else 97 | return std::wstring_convert>().from_bytes(str, str + str_size); 98 | 99 | #endif // _WIN32 100 | #else // CLI11_HAS_CODECVT 101 | (void)str_size; 102 | std::mbstate_t state = std::mbstate_t(); 103 | const char *it = str; 104 | 105 | std::string old_locale = std::setlocale(LC_ALL, nullptr); 106 | auto sg = scope_guard([&] { std::setlocale(LC_ALL, old_locale.c_str()); }); 107 | set_unicode_locale(); 108 | 109 | std::size_t new_size = std::mbsrtowcs(nullptr, &it, 0, &state); 110 | 
if(new_size == static_cast(-1)) { 111 | throw std::runtime_error("CLI::widen: conversion error in std::mbsrtowcs at offset " + 112 | std::to_string(it - str)); 113 | } 114 | std::wstring result(new_size, L'\0'); 115 | std::mbsrtowcs(const_cast(result.data()), &str, new_size, &state); 116 | 117 | return result; 118 | 119 | #endif // CLI11_HAS_CODECVT 120 | } 121 | 122 | CLI11_DIAGNOSTIC_POP 123 | 124 | } // namespace detail 125 | 126 | CLI11_INLINE std::string narrow(const wchar_t *str, std::size_t str_size) { return detail::narrow_impl(str, str_size); } 127 | CLI11_INLINE std::string narrow(const std::wstring &str) { return detail::narrow_impl(str.data(), str.size()); } 128 | // Flawfinder: ignore 129 | CLI11_INLINE std::string narrow(const wchar_t *str) { return detail::narrow_impl(str, std::wcslen(str)); } 130 | 131 | CLI11_INLINE std::wstring widen(const char *str, std::size_t str_size) { return detail::widen_impl(str, str_size); } 132 | CLI11_INLINE std::wstring widen(const std::string &str) { return detail::widen_impl(str.data(), str.size()); } 133 | // Flawfinder: ignore 134 | CLI11_INLINE std::wstring widen(const char *str) { return detail::widen_impl(str, std::strlen(str)); } 135 | 136 | #ifdef CLI11_CPP17 137 | CLI11_INLINE std::string narrow(std::wstring_view str) { return detail::narrow_impl(str.data(), str.size()); } 138 | CLI11_INLINE std::wstring widen(std::string_view str) { return detail::widen_impl(str.data(), str.size()); } 139 | #endif // CLI11_CPP17 140 | 141 | #if defined CLI11_HAS_FILESYSTEM && CLI11_HAS_FILESYSTEM > 0 142 | CLI11_INLINE std::filesystem::path to_path(std::string_view str) { 143 | return std::filesystem::path{ 144 | #ifdef _WIN32 145 | widen(str) 146 | #else 147 | str 148 | #endif // _WIN32 149 | }; 150 | } 151 | #endif // CLI11_HAS_FILESYSTEM 152 | 153 | // [CLI11:encoding_inl_hpp:end] 154 | } // namespace CLI 155 | -------------------------------------------------------------------------------- /common/CLI/impl/Split_inl.hpp: 
-------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner 2 | // under NSF AWARD 1414736 and by the respective contributors. 3 | // All rights reserved. 4 | // 5 | // SPDX-License-Identifier: BSD-3-Clause 6 | 7 | #pragma once 8 | 9 | // This include is only needed for IDEs to discover symbols 10 | #include 11 | 12 | // [CLI11:public_includes:set] 13 | #include 14 | #include 15 | #include 16 | #include 17 | // [CLI11:public_includes:end] 18 | 19 | #include 20 | #include 21 | 22 | namespace CLI { 23 | // [CLI11:split_inl_hpp:verbatim] 24 | 25 | namespace detail { 26 | 27 | CLI11_INLINE bool split_short(const std::string ¤t, std::string &name, std::string &rest) { 28 | if(current.size() > 1 && current[0] == '-' && valid_first_char(current[1])) { 29 | name = current.substr(1, 1); 30 | rest = current.substr(2); 31 | return true; 32 | } 33 | return false; 34 | } 35 | 36 | CLI11_INLINE bool split_long(const std::string ¤t, std::string &name, std::string &value) { 37 | if(current.size() > 2 && current.compare(0, 2, "--") == 0 && valid_first_char(current[2])) { 38 | auto loc = current.find_first_of('='); 39 | if(loc != std::string::npos) { 40 | name = current.substr(2, loc - 2); 41 | value = current.substr(loc + 1); 42 | } else { 43 | name = current.substr(2); 44 | value = ""; 45 | } 46 | return true; 47 | } 48 | return false; 49 | } 50 | 51 | CLI11_INLINE bool split_windows_style(const std::string ¤t, std::string &name, std::string &value) { 52 | if(current.size() > 1 && current[0] == '/' && valid_first_char(current[1])) { 53 | auto loc = current.find_first_of(':'); 54 | if(loc != std::string::npos) { 55 | name = current.substr(1, loc - 1); 56 | value = current.substr(loc + 1); 57 | } else { 58 | name = current.substr(1); 59 | value = ""; 60 | } 61 | return true; 62 | } 63 | return false; 64 | } 65 | 66 | CLI11_INLINE std::vector split_names(std::string current) { 
67 | std::vector output; 68 | std::size_t val = 0; 69 | while((val = current.find(',')) != std::string::npos) { 70 | output.push_back(trim_copy(current.substr(0, val))); 71 | current = current.substr(val + 1); 72 | } 73 | output.push_back(trim_copy(current)); 74 | return output; 75 | } 76 | 77 | CLI11_INLINE std::vector> get_default_flag_values(const std::string &str) { 78 | std::vector flags = split_names(str); 79 | flags.erase(std::remove_if(flags.begin(), 80 | flags.end(), 81 | [](const std::string &name) { 82 | return ((name.empty()) || (!(((name.find_first_of('{') != std::string::npos) && 83 | (name.back() == '}')) || 84 | (name[0] == '!')))); 85 | }), 86 | flags.end()); 87 | std::vector> output; 88 | output.reserve(flags.size()); 89 | for(auto &flag : flags) { 90 | auto def_start = flag.find_first_of('{'); 91 | std::string defval = "false"; 92 | if((def_start != std::string::npos) && (flag.back() == '}')) { 93 | defval = flag.substr(def_start + 1); 94 | defval.pop_back(); 95 | flag.erase(def_start, std::string::npos); // NOLINT(readability-suspicious-call-argument) 96 | } 97 | flag.erase(0, flag.find_first_not_of("-!")); 98 | output.emplace_back(flag, defval); 99 | } 100 | return output; 101 | } 102 | 103 | CLI11_INLINE std::tuple, std::vector, std::string> 104 | get_names(const std::vector &input) { 105 | 106 | std::vector short_names; 107 | std::vector long_names; 108 | std::string pos_name; 109 | 110 | for(std::string name : input) { 111 | if(name.length() == 0) { 112 | continue; 113 | } 114 | if(name.length() > 1 && name[0] == '-' && name[1] != '-') { 115 | if(name.length() == 2 && valid_first_char(name[1])) 116 | short_names.emplace_back(1, name[1]); 117 | else if(name.length() > 2) 118 | throw BadNameString::MissingDash(name); 119 | else 120 | throw BadNameString::OneCharName(name); 121 | } else if(name.length() > 2 && name.substr(0, 2) == "--") { 122 | name = name.substr(2); 123 | if(valid_name_string(name)) 124 | long_names.push_back(name); 125 | else 
126 | throw BadNameString::BadLongName(name); 127 | } else if(name == "-" || name == "--") { 128 | throw BadNameString::DashesOnly(name); 129 | } else { 130 | if(pos_name.length() > 0) 131 | throw BadNameString::MultiPositionalNames(name); 132 | pos_name = name; 133 | } 134 | } 135 | 136 | return std::make_tuple(short_names, long_names, pos_name); 137 | } 138 | 139 | } // namespace detail 140 | // [CLI11:split_inl_hpp:end] 141 | } // namespace CLI 142 | -------------------------------------------------------------------------------- /common/common_def.cc: -------------------------------------------------------------------------------- 1 | #include "common_def.h" 2 | 3 | int kernel_info_m_next_uid = 0; 4 | 5 | unsigned long long GLOBAL_HEAP_START = 0xC0000000; 6 | 7 | unsigned long long SHARED_MEM_SIZE_MAX = 96 * (1 << 10); 8 | 9 | unsigned long long LOCAL_MEM_SIZE_MAX = 1 << 14; 10 | 11 | unsigned MAX_STREAMING_MULTIPROCESSORS = 80; 12 | 13 | unsigned MAX_THREAD_PER_SM = 1 << 11; 14 | 15 | unsigned MAX_WARP_PER_SM = 1 << 6; 16 | unsigned long long TOTAL_LOCAL_MEM_PER_SM = 17 | MAX_THREAD_PER_SM * LOCAL_MEM_SIZE_MAX; 18 | unsigned long long TOTAL_SHARED_MEM = 19 | MAX_STREAMING_MULTIPROCESSORS * SHARED_MEM_SIZE_MAX; 20 | unsigned long long TOTAL_LOCAL_MEM = 21 | MAX_STREAMING_MULTIPROCESSORS * MAX_THREAD_PER_SM * LOCAL_MEM_SIZE_MAX; 22 | unsigned long long SHARED_GENERIC_START = GLOBAL_HEAP_START - TOTAL_SHARED_MEM; 23 | unsigned long long LOCAL_GENERIC_START = SHARED_GENERIC_START - TOTAL_LOCAL_MEM; 24 | unsigned long long STATIC_ALLOC_LIMIT = 25 | GLOBAL_HEAP_START - (TOTAL_LOCAL_MEM + TOTAL_SHARED_MEM); 26 | -------------------------------------------------------------------------------- /common/common_def.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #ifndef COMMON_DEF_H 6 | #define COMMON_DEF_H 7 | 8 | #define USE_BOOST 9 | #define gpgpu_concurrent_kernel_sm false 10 | 11 | 
#define ENABLE_SAMPLING_POINT 12 | 13 | // #define DUMP_THREAD_NUM 14 | // #define DUMP_TIME_SUMMARY 15 | 16 | #define WARP_SIZE 32 17 | #define MAX_DST 1 18 | #define MAX_SRC 4 19 | 20 | #define MAX_WARP_PER_SHADER 64 21 | 22 | #define MAX_INPUT_VALUES 24 23 | #define MAX_OUTPUT_VALUES 8 24 | 25 | #define MAX_REG_OPERANDS 32 26 | 27 | enum command_type { 28 | kernel_launch = 1, 29 | cpu_gpu_mem_copy, 30 | gpu_cpu_mem_copy, 31 | }; 32 | 33 | enum address_space { GLOBAL_MEM = 1, SHARED_MEM, LOCAL_MEM, TEX_MEM }; 34 | 35 | enum address_scope { 36 | L1_CACHE = 1, 37 | L2_CACHE, 38 | SYS_MEM, 39 | }; 40 | 41 | enum address_format { list_all = 0, base_stride = 1, base_delta = 2 }; 42 | 43 | const unsigned MAX_WARP_SIZE = 32; 44 | typedef std::bitset active_mask_t; 45 | 46 | const unsigned MAX_ACCESSES_PER_INSN_PER_THREAD = 8; 47 | 48 | typedef unsigned long long new_addr_type; 49 | 50 | const unsigned MAX_MEMORY_ACCESS_SIZE = 128; 51 | typedef std::bitset mem_access_byte_mask_t; 52 | 53 | const unsigned SECTOR_CHUNCK_SIZE = 4; 54 | const unsigned SECTOR_SIZE = 32; 55 | typedef std::bitset mem_access_sector_mask_t; 56 | 57 | enum _memory_op_t { no_memory_op = 0, memory_load, memory_store }; 58 | 59 | enum mem_operation_t { NOT_TEX, TEX }; 60 | typedef enum mem_operation_t mem_operation; 61 | 62 | #define MEM_ACCESS_TYPE_TUP_DEF \ 63 | MA_TUP_BEGIN(mem_access_type) \ 64 | MA_TUP(GLOBAL_ACC_R), MA_TUP(LOCAL_ACC_R), MA_TUP(CONST_ACC_R), \ 65 | MA_TUP(TEXTURE_ACC_R), MA_TUP(GLOBAL_ACC_W), MA_TUP(LOCAL_ACC_W), \ 66 | MA_TUP(L1_WRBK_ACC), MA_TUP(L2_WRBK_ACC), MA_TUP(INST_ACC_R), \ 67 | MA_TUP(L1_WR_ALLOC_R), MA_TUP(L2_WR_ALLOC_R), \ 68 | MA_TUP(NUM_MEM_ACCESS_TYPE) MA_TUP_END(mem_access_type) 69 | 70 | #define MA_TUP_BEGIN(X) enum X { 71 | #define MA_TUP(X) X 72 | #define MA_TUP_END(X) \ 73 | } \ 74 | ; 75 | enum mem_access_type { 76 | GLOBAL_ACC_R, 77 | LOCAL_ACC_R, 78 | CONST_ACC_R, 79 | TEXTURE_ACC_R, 80 | GLOBAL_ACC_W, 81 | LOCAL_ACC_W, 82 | L1_WRBK_ACC, 83 | L2_WRBK_ACC, 
84 | INST_ACC_R, 85 | L1_WR_ALLOC_R, 86 | L2_WR_ALLOC_R, 87 | NUM_MEM_ACCESS_TYPE 88 | }; 89 | #undef MA_TUP_BEGIN 90 | #undef MA_TUP 91 | #undef MA_TUP_END 92 | 93 | enum _memory_space_t { 94 | undefined_space = 0, 95 | reg_space, 96 | local_space, 97 | shared_space, 98 | sstarr_space, 99 | param_space_unclassified, 100 | param_space_kernel, 101 | param_space_local, 102 | const_space, 103 | tex_space, 104 | surf_space, 105 | global_space, 106 | generic_space, 107 | instruction_space 108 | }; 109 | 110 | enum cache_operator_type { 111 | CACHE_UNDEFINED, 112 | 113 | CACHE_ALL, 114 | CACHE_LAST_USE, 115 | CACHE_VOLATILE, 116 | CACHE_L1, 117 | 118 | CACHE_STREAMING, 119 | CACHE_GLOBAL, 120 | 121 | CACHE_WRITE_BACK, 122 | CACHE_WRITE_THROUGH 123 | }; 124 | 125 | #define MAX_REG_OPERANDS 32 126 | 127 | #define MAX_KERNELS_NUM 300 128 | 129 | #ifdef USE_BOOST 130 | 131 | #include 132 | #include 133 | #include 134 | #endif 135 | 136 | #ifdef USE_BOOST 137 | void simple_mpi_test(int argc, char **argv); 138 | #endif 139 | 140 | extern int kernel_info_m_next_uid; 141 | 142 | extern unsigned long long GLOBAL_HEAP_START; 143 | 144 | extern unsigned long long SHARED_MEM_SIZE_MAX; 145 | 146 | extern unsigned long long LOCAL_MEM_SIZE_MAX; 147 | 148 | extern unsigned MAX_STREAMING_MULTIPROCESSORS; 149 | 150 | extern unsigned MAX_THREAD_PER_SM; 151 | 152 | extern unsigned MAX_WARP_PER_SM; 153 | extern unsigned long long TOTAL_LOCAL_MEM_PER_SM; 154 | extern unsigned long long TOTAL_SHARED_MEM; 155 | extern unsigned long long TOTAL_LOCAL_MEM; 156 | extern unsigned long long SHARED_GENERIC_START; 157 | extern unsigned long long LOCAL_GENERIC_START; 158 | extern unsigned long long STATIC_ALLOC_LIMIT; 159 | 160 | #endif 161 | -------------------------------------------------------------------------------- /common/option_parser.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2009-2011, Tor M. Aamodt, Wilson W.L. 
Fung 2 | // The University of British Columbia 3 | // All rights reserved. 4 | // 5 | // Redistribution and use in source and binary forms, with or without 6 | // modification, are permitted provided that the following conditions are met: 7 | // 8 | // Redistributions of source code must retain the above copyright notice, this 9 | // list of conditions and the following disclaimer. 10 | // Redistributions in binary form must reproduce the above copyright notice, 11 | // this list of conditions and the following disclaimer in the documentation 12 | // and/or other materials provided with the distribution. Neither the name of 13 | // The University of British Columbia nor the names of its contributors may be 14 | // used to endorse or promote products derived from this software without 15 | // specific prior written permission. 16 | // 17 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 | // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 | // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 21 | // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 22 | // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 23 | // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 24 | // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 25 | // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 26 | // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 27 | // POSSIBILITY OF SUCH DAMAGE. 
28 | 29 | #pragma once 30 | 31 | #include 32 | #include 33 | #include 34 | 35 | typedef class OptionParser *option_parser_t; 36 | 37 | enum option_dtype { 38 | OPT_INT32, 39 | OPT_UINT32, 40 | OPT_INT64, 41 | OPT_UINT64, 42 | OPT_BOOL, 43 | OPT_FLOAT, 44 | OPT_DOUBLE, 45 | OPT_CHAR, 46 | OPT_CSTR 47 | }; 48 | 49 | option_parser_t option_parser_create(); 50 | void option_parser_destroy(option_parser_t opp); 51 | 52 | void option_parser_register(option_parser_t opp, const char *name, 53 | enum option_dtype type, void *variable, 54 | const char *desc, const char *defaultvalue); 55 | 56 | void option_parser_cmdline(option_parser_t opp, int argc, const char *argv[]); 57 | 58 | void option_parser_cfgfile(option_parser_t opp, const char *filename); 59 | 60 | void option_parser_delimited_string(option_parser_t opp, 61 | const char *inputstring, 62 | const char *delimiters); 63 | 64 | void option_parser_print(option_parser_t opp, FILE *fout); 65 | void option_parser_print_limited(option_parser_t opp, FILE *fout, int limited, 66 | std::string pattern1, std::string pattern2); 67 | -------------------------------------------------------------------------------- /hw-component/IBuffer.cc: -------------------------------------------------------------------------------- 1 | #include "IBuffer.h" 2 | 3 | #define PRINT_AT for(unsigned i=0; i<40; ++i) std::cout << "@"; std::cout << std::endl; 4 | 5 | IBuffer::IBuffer(const unsigned smid, const unsigned num_warps) 6 | : m_smid(smid), m_num_warps(num_warps) { 7 | m_ibuffer.resize(num_warps); 8 | } 9 | 10 | void IBuffer::print_ibuffer() const { 11 | for (unsigned i = 0; i < m_num_warps; i++) { 12 | std::cout << "warp - " << i << ": "; 13 | for (auto it = m_ibuffer[i].begin(); it != m_ibuffer[i].end(); it++) { 14 | std::cout << "(" << it->pc << ", " << it->wid << ", " << it->kid 15 | << "), "; 16 | } 17 | std::cout << std::endl; 18 | } 19 | } 20 | 21 | void IBuffer::print_ibuffer(const unsigned gwarp_start, const unsigned gwarp_end) const 
{ 22 | PRINT_AT; 23 | for (unsigned i = gwarp_start; i < gwarp_end; i++) { 24 | std::cout << " Ibuffer (pc, wid, kid) warp - " << i << ": "; 25 | for (auto it = m_ibuffer[i].begin(); it != m_ibuffer[i].end(); it++) { 26 | std::cout << "(" << it->pc << ", " << it->wid << ", " << it->kid 27 | << "), "; 28 | } 29 | std::cout << std::endl; 30 | } 31 | PRINT_AT; 32 | } 33 | 34 | void IBuffer::print_ibuffer(const unsigned gwarp_id) const { 35 | std::cout << " Ibuffer (pc, wid, kid) warp - " << gwarp_id << ": "; 36 | for (auto it = m_ibuffer[gwarp_id].begin(); it != m_ibuffer[gwarp_id].end(); 37 | it++) { 38 | std::cout << "(" << it->pc << ", " << it->wid << ", " << it->kid << "), "; 39 | } 40 | std::cout << std::endl; 41 | } 42 | -------------------------------------------------------------------------------- /hw-component/IBuffer.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #ifndef IBUFFER_H 6 | #define IBUFFER_H 7 | 8 | struct ibuffer_entry { 9 | ibuffer_entry(const unsigned pc, const unsigned wid, 10 | const unsigned kid, const unsigned uid) 11 | : pc(pc), wid(wid), kid(kid), uid(uid) {} 12 | 13 | unsigned pc, wid, kid, uid; 14 | }; 15 | 16 | class IBuffer { 17 | public: 18 | IBuffer(const unsigned smid, const unsigned num_warps); 19 | 20 | /// The `allKernelsWarpID` uniquely identifies each warp across all 21 | /// kernels within the application. For an application with multiple 22 | /// kernels (e.g., 100 kernels), each containing warps numbered from 23 | /// 0 to 10, the `allKernelsWarpID` ranges from 0 to 1000, which pro- 24 | /// vides a global unique identifier for every warp. 
25 | inline bool is_empty(const unsigned allKernelsWarpID) const { 26 | return m_ibuffer[allKernelsWarpID].empty(); 27 | } 28 | 29 | inline bool has_free_slot(const unsigned allKernelsWarpID) const { 30 | return m_ibuffer[allKernelsWarpID].size() < 2; 31 | } 32 | 33 | inline bool is_not_empty(const unsigned allKernelsWarpID) const { 34 | return !is_empty(allKernelsWarpID); 35 | } 36 | 37 | void push_back(const unsigned allKernelsWarpID, ibuffer_entry entry) { 38 | m_ibuffer[allKernelsWarpID].push_back(entry); 39 | } 40 | 41 | /// TODO: Using double-ended queues instead of vectors may speed up 42 | /// the `pop_front` function, because std::d eque supports efficient 43 | /// header deletion operations. 44 | ibuffer_entry pop_front(const unsigned allKernelsWarpID) { 45 | ibuffer_entry entry = std::move(m_ibuffer[allKernelsWarpID].front()); 46 | m_ibuffer[allKernelsWarpID].erase( 47 | m_ibuffer[allKernelsWarpID].begin()); 48 | return entry; 49 | } 50 | 51 | /// TODO: Merging `pop_front` and `front` to reduce the overhead of 52 | /// duplicate moves. 53 | inline const ibuffer_entry& front(const unsigned allKernelsWarpID) const { 54 | return m_ibuffer[allKernelsWarpID].front(); 55 | } 56 | 57 | /// Return the size of `m_ibuffer`. 
58 | inline std::size_t size() const { return m_ibuffer.size(); } 59 | 60 | void print_ibuffer() const; 61 | void print_ibuffer(const unsigned gwarp_id) const; 62 | void print_ibuffer(const unsigned gwarp_start, const unsigned gwarp_end) const; 63 | 64 | private: 65 | unsigned m_smid; 66 | std::vector> m_ibuffer; 67 | unsigned m_num_warps; 68 | }; 69 | 70 | #endif 71 | -------------------------------------------------------------------------------- /hw-component/RegBankAlloc.cc: -------------------------------------------------------------------------------- 1 | #include "RegBankAlloc.h" 2 | 3 | regBankAlloc::regBankAlloc( 4 | const unsigned smid, 5 | const unsigned num_banks, 6 | const unsigned num_warp_scheds, 7 | const unsigned bank_warp_shift, 8 | const unsigned num_banks_per_sched) 9 | : m_smid(smid), m_num_banks(num_banks), 10 | m_num_warp_scheds(num_warp_scheds), 11 | m_bank_warp_shift(bank_warp_shift), 12 | m_num_banks_per_sched(num_banks_per_sched), 13 | m_sub_core_model(num_warp_scheds > 1) { 14 | isNumBankPowerOfTwo = (m_num_banks & (m_num_banks - 1)) == 0; 15 | m_bank_state.resize(num_banks, FREE); 16 | } 17 | 18 | unsigned regBankAlloc::register_bank(const unsigned regnum, 19 | const unsigned wid, 20 | const unsigned sched_id) const { 21 | unsigned bank = regnum; 22 | if (m_bank_warp_shift) bank += wid; 23 | if (m_sub_core_model) { 24 | unsigned bank_num = (bank % m_num_banks_per_sched) + sched_id * m_num_banks_per_sched; 25 | assert(bank_num < m_num_banks); 26 | return bank_num; 27 | } else { 28 | // Use the `isPowerOfTwo` variable to decide whether to use bitwise 29 | // operations for optimization. 30 | return isNumBankPowerOfTwo ? 
bank & (m_num_banks - 1) : bank % m_num_banks; 31 | } 32 | } 33 | 34 | const RegBankState& regBankAlloc::getBankState( 35 | const unsigned regnum, const unsigned wid, 36 | const unsigned sched_id) const { 37 | unsigned bank_id = register_bank(regnum, wid, sched_id); 38 | return getBankState(bank_id); 39 | } 40 | 41 | void regBankAlloc::setBankState(const unsigned regnum, 42 | const unsigned wid, 43 | const unsigned sched_id, 44 | const RegBankState state) noexcept{ 45 | unsigned bank_id = register_bank(regnum, wid, sched_id); 46 | setBankState(bank_id, state); 47 | } 48 | 49 | void regBankAlloc::releaseBankState(const unsigned regnum, 50 | const unsigned wid, 51 | const unsigned sched_id) noexcept { 52 | setBankState(regnum, wid, sched_id, FREE); 53 | } 54 | 55 | void regBankAlloc::printBankState() const { 56 | printf("Register Bank State (smid=%u): \n", m_smid); 57 | for (unsigned i = 0; i < m_num_banks; ++i) { 58 | printf(" bank %2u: %d\n", i, m_bank_state[i]); 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /hw-component/RegBankAlloc.h: -------------------------------------------------------------------------------- 1 | #include "stdio.h" 2 | #include 3 | #include 4 | 5 | #ifndef REG_BANK_ALLOC_H 6 | #define REG_BANK_ALLOC_H 7 | 8 | enum RegBankState { 9 | FREE = 0, 10 | ON_READING, 11 | ON_WRITING, 12 | RegBankStateNUM, 13 | }; 14 | 15 | class regBankAlloc { 16 | public: 17 | regBankAlloc(const unsigned smid, 18 | const unsigned num_banks, 19 | const unsigned num_warp_scheds, 20 | const unsigned bank_warp_shift, 21 | const unsigned num_banks_per_sched); 22 | 23 | unsigned register_bank(const unsigned regnum, const unsigned wid, 24 | const unsigned sched_id) const; 25 | 26 | inline const RegBankState& getBankState(const unsigned bank_id) const { 27 | return m_bank_state[bank_id]; 28 | } 29 | 30 | const RegBankState& getBankState(const unsigned regnum, const unsigned wid, 31 | const unsigned sched_id) const; 32 
| 33 | inline void setBankState(const unsigned bank_id, 34 | const RegBankState state) noexcept { 35 | m_bank_state[bank_id] = state; 36 | }; 37 | 38 | void setBankState(const unsigned regnum, const unsigned wid, 39 | const unsigned sched_id, const RegBankState state) noexcept; 40 | 41 | inline void releaseBankState(const unsigned bank_id) noexcept { 42 | setBankState(bank_id, FREE); 43 | } 44 | 45 | void releaseBankState(const unsigned regnum, 46 | const unsigned wid, 47 | const unsigned sched_id) noexcept; 48 | 49 | /// TODO: Using `std::fill_n(m_bank_state.begin(), m_num_banks, FREE);` 50 | /// to replace the loop can cause additional overhead due to the in- 51 | /// ability to take advantage of inline functions, as well as the in- 52 | /// ternal iterator performing bounds checks. 53 | inline void releaseAllBankStates() noexcept { 54 | for (unsigned i = 0; i < m_num_banks; ++i) { 55 | setBankState(i, FREE); 56 | } 57 | }; 58 | 59 | void printBankState() const; 60 | 61 | private: 62 | unsigned m_smid; 63 | unsigned m_num_banks; 64 | unsigned m_num_warp_scheds; 65 | unsigned m_bank_warp_shift; 66 | unsigned m_num_banks_per_sched; 67 | bool m_sub_core_model; 68 | std::vector m_bank_state; 69 | 70 | // When `m_num_banks` is a power of 2, a shift operation can be 71 | // used instead of a modulo operation (%) to improve efficiency. 72 | bool isNumBankPowerOfTwo; 73 | }; 74 | 75 | #endif 76 | -------------------------------------------------------------------------------- /hw-component/Scoreboard.cc: -------------------------------------------------------------------------------- 1 | #include "Scoreboard.h" 2 | 3 | Scoreboard::Scoreboard(const unsigned smid, 4 | const unsigned n_warps) 5 | : m_smid(smid) { 6 | reg_table.resize(n_warps); 7 | longopregs.resize(n_warps); 8 | 9 | /// TODO: For `std::vector> reg_table;`, we need 10 | /// to determine in advance how many registers will be inserted, then use 11 | /// `reg_table[wid].reserve(size)` to pre-allocate memory. 
This should 12 | /// improve performance by reducing the number of dynamic memory allocs 13 | /// that occur when inserting an element. 14 | } 15 | 16 | void Scoreboard::reserveRegister(const unsigned wid, 17 | const int regnum) noexcept { 18 | auto [iter, inserted] = reg_table[wid].insert(regnum); 19 | if (!inserted) { 20 | printf("Error: trying to reserve an already reserved register (sid=%u, " 21 | "wid=%u, regnum=%d).\n", m_smid, wid, regnum); 22 | abort(); 23 | } 24 | } 25 | 26 | const bool Scoreboard::islongop(const unsigned wid, const int regnum) const { 27 | if (regnum == -1) return false; 28 | else return longopregs[wid].find(regnum) != longopregs[wid].end(); 29 | } 30 | 31 | void Scoreboard::reserveRegisters(const unsigned wid, std::vector ®nums, 32 | bool is_load) noexcept { 33 | std::unordered_set prev_regs; 34 | for (auto ®num : regnums) { 35 | if (regnum > 0 && prev_regs.insert(regnum).second) { 36 | reserveRegister(wid, regnum); 37 | } 38 | } 39 | 40 | if (is_load) 41 | for (auto ®num : regnums) 42 | if (regnum > 0) longopregs[wid].insert(regnum); 43 | } 44 | 45 | void Scoreboard::releaseRegisters(const unsigned wid, 46 | std::vector ®nums) noexcept { 47 | for (auto ®num : regnums) 48 | releaseRegister(wid, regnum); 49 | } 50 | 51 | bool Scoreboard::checkCollision(const unsigned wid, std::vector ®nums, 52 | const int pred, const int ar1, const int ar2) const { 53 | if (pred > 0 && reg_table[wid].find(pred) != reg_table[wid].end()) return true; 54 | if (ar1 > 0 && reg_table[wid].find(ar1) != reg_table[wid].end()) return true; 55 | if (ar2 > 0 && reg_table[wid].find(ar2) != reg_table[wid].end()) return true; 56 | for (auto ® : regnums) 57 | if (reg > 0 && reg_table[wid].find(reg) != reg_table[wid].end()) 58 | return true; 59 | 60 | return false; 61 | } 62 | 63 | void Scoreboard::printContents() const { 64 | printf(" Scoreboard contents (sid=%u): \n", m_smid); 65 | for (unsigned i = 0; i < reg_table.size(); i++) { 66 | if (reg_table[i].size() == 0) 67 | 
continue; 68 | printContents(i); 69 | } 70 | } 71 | 72 | void Scoreboard::printContents(unsigned i) const { 73 | printf(" wid = %2u: ", i); 74 | std::unordered_set::const_iterator it; 75 | for (it = reg_table[i].begin(); it != reg_table[i].end(); it++) 76 | printf("R%d ", *it); 77 | printf("\n"); 78 | } 79 | -------------------------------------------------------------------------------- /hw-component/Scoreboard.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #ifndef SCOREBOARD_H 11 | #define SCOREBOARD_H 12 | 13 | class Scoreboard { 14 | public: 15 | Scoreboard(const unsigned smid, const unsigned n_warps); 16 | 17 | void reserveRegisters(const unsigned wid, std::vector ®nums, 18 | bool is_load) noexcept; 19 | 20 | void releaseRegisters(const unsigned wid, std::vector ®nums) noexcept; 21 | 22 | inline void releaseRegister(const unsigned wid, const int regnum) noexcept { 23 | if (regnum != -1) reg_table[wid].erase(regnum); 24 | } 25 | 26 | bool checkCollision(const unsigned wid, std::vector ®nums, const int pred, 27 | const int ar1, const int ar2) const; 28 | 29 | /// TODO: Maybe don't need this again. 30 | inline bool pendingWrites(const unsigned wid) const { 31 | return !reg_table[wid].empty(); 32 | } 33 | 34 | /// TODO: Maybe don't need this again. 
35 | const bool islongop(const unsigned wid, const int regnum) const; 36 | 37 | inline const unsigned regs_size(const unsigned wid) const { 38 | return reg_table[wid].size(); 39 | } 40 | 41 | void printContents() const; 42 | void printContents(unsigned i) const; 43 | 44 | private: 45 | void reserveRegister(const unsigned wid, const int regnum) noexcept; 46 | 47 | int get_sid() const { return m_smid; } 48 | 49 | unsigned m_smid; 50 | 51 | std::vector> reg_table; 52 | std::vector> longopregs; 53 | }; 54 | 55 | #endif 56 | -------------------------------------------------------------------------------- /parda/.gitignore: -------------------------------------------------------------------------------- 1 | *.hist 2 | *.o 3 | *.x 4 | -------------------------------------------------------------------------------- /parda/README.md: -------------------------------------------------------------------------------- 1 | Reuse distance is a well established approach to characterizing data cache locality based on the stack histogram model. 2 | This analysis so far has been restricted to offline use due to the high cost, often several orders of magnitude larger than the execution time of the analyzed code. Parda is the first parallel algorithm to compute accurate reuse distances by analysis of memory address traces. The algorithm uses a tunable parameter that enables faster analysis when the maximum needed reuse distance is limited by a cache size upper bound. 3 | 4 | This program is a Parda implementation on file input. parda omp implementation is mainly in `parda_omp.c` and `parda_omp.h`. 5 | 6 | ## Instructions to run file input Parda. 7 | 8 | ### A. Setup and compile 9 | 10 | `Step 0:` parda use glib standard linux library. If on ubuntu system just execute following sudo command. 11 | 12 | ```shell 13 | sudo apt-get install glib 14 | ``` 15 | 16 | `Step 1:` Download sample trace files from project git web page. 
17 | 18 | `normal_137979.trace` is a text file and 19 | `binary_137979.trace` is the binary file. 20 | These two files record trace data of the `ls` command. 21 | 22 | `Step 2:` 23 | 24 | ```shell 25 | cd /path/to/parda 26 | ``` 27 | 28 | The program has only been tested with `gcc` and `icc`. 29 | Edit the first three lines of the makefile. 30 | If the machine has `mpicc`, set `MPI=1` to enable MPI parallelism. 31 | Otherwise, set `OMP=1`. To use only the sequential algorithm, comment out both `OMP` and `MPI`. 32 | 33 | ```makefile 34 | DEBUG = 1 35 | OMP = 1 36 | MPI = 1 37 | ``` 38 | 39 | ```shell 40 | make 41 | ``` 42 | 43 | ### B. Execution instructions 44 | 45 | ```shell 46 | ./parda.x --help to see how to run with different flags and run with sequential algorithm. 47 | ``` 48 | 49 | #### Execution arguments: 50 | 51 | ```makefile 52 | --input: the input trace file name. 53 | --lines: the total number of lines in the input trace file. 54 | --enable-omp: enable program to parallelly run with OpenMP threads. 55 | --enable-mpi: enable program to parallelly run with MPI. 56 | --enable-seperate: Separate the input file to prepare for running with parallelization. 57 | ``` 58 | 59 | #### 1) Sequential execution: 60 | 61 | ```shell 62 | ./parda.x --input=normal_137979.trace --lines=137979 > seq.hist 63 | ``` 64 | 65 | #### 2) Run parda with OpenMP `--enable-omp` flag. 66 | 67 | Before running with OpenMP, the trace file must be split into one file per thread. For example, to run with 4 threads: 
68 | 69 | ```shell 70 | ./parda.x --enable-seperate --input=normal_137979.trace --lines=137979 --threads=4 71 | ``` 72 | 73 | This produces 4 separated trace files: 74 | 75 | `4_normal_137979.trace_p0.txt` `4_normal_137979.trace_p1.txt` 76 | `4_normal_137979.trace_p2.txt` `4_normal_137979.trace_p3.txt` 77 | 78 | ```shell 79 | ./parda.x --enable-omp --input=normal_137979.trace --lines=137979 --threads=4 > omp.re 80 | ``` 81 | 82 | #### 3) Run parda with MPI 83 | 84 | ```shell 85 | mpirun -np 4 ./parda.x --input=normal_137979.trace --lines=137979 --enable-mpi 86 | ``` 87 | 88 | ### Parda 89 | 90 | Parda is free software: you can redistribute it and/or modify 91 | it under the terms of the GNU General Public License as published by 92 | the Free Software Foundation, either version 3 of the License, or 93 | (at your option) any later version. 94 | 95 | Parda is distributed in the hope that it will be useful, 96 | but WITHOUT ANY WARRANTY; without even the implied warranty of 97 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 98 | GNU General Public License for more details. 99 | 100 | ### Author: 101 | Qingpeng Niu 102 | 103 | ### Contact: 104 | niuqingpeng at gmail.com 105 | 106 | ### Documentation 107 | 108 | #### Related publications: 109 | 110 | ``` 111 | PARDA: A Fast Parallel Reuse Distance Analysis Algorithm. 112 | Qingpeng Niu, James Dinan, Qingda Lu and P. Sadayappan. 113 | IEEE IPDPS (IPDPS'12), May 2012, Shanghai, China. 
114 | ``` -------------------------------------------------------------------------------- /parda/main.c: -------------------------------------------------------------------------------- 1 | #include "parda.h" 2 | #ifdef enable_mpi 3 | #include "parda_mpi.h" 4 | #endif 5 | #ifdef enable_omp 6 | #include "parda_omp.h" 7 | #endif 8 | #include "process_args.h" 9 | #include "seperate.h" 10 | 11 | int main(int argc, char **argv) { 12 | process_args(argc, argv); 13 | if (is_seperate == 1) { 14 | parda_seperate_file(inputFileName, threads, lines); 15 | } else if (is_omp == 0 && is_mpi == 0) { 16 | DEBUG(printf("This is seq stackdist\n");) 17 | classical_tree_based_stackdist(inputFileName, lines); 18 | } else if (is_omp == 1 && is_mpi == 0) { 19 | DEBUG(printf("This is omp stackdist\n");) 20 | #ifdef enable_omp 21 | parda_omp_stackdist(inputFileName, lines, threads); 22 | #else 23 | printf("openmp is not enabled, try to define enable_omp and add OMP " 24 | "variable in Makefile\n"); 25 | abort(); 26 | #endif 27 | } else if (is_omp == 0 && is_mpi == 1) { 28 | DEBUG(printf("This is mpistackdist\n");) 29 | #ifdef enable_mpi 30 | parda_mpi_stackdist(inputFileName, lines, threads, argc, argv); 31 | #else 32 | printf("mpi is not enabled, try to define enable_omp and add MPI variable " 33 | "in Makefile\n"); 34 | abort(); 35 | #endif 36 | } else if (is_omp == 1 && is_mpi == 1) { 37 | DEBUG(printf("This is hybrid stackdist\n");) 38 | #if defined(enable_omp) && defined(enable_mpi) 39 | parda_hybrid_stackdist(inputFileName, lines, threads, argc, argv); 40 | #else 41 | printf("hybridis not enabled, try to define enable_omp and enable_mpi and " 42 | "add MPI and OMP variable in Makefile\n"); 43 | abort(); 44 | #endif 45 | } 46 | return 0; 47 | } 48 | -------------------------------------------------------------------------------- /parda/makefile: -------------------------------------------------------------------------------- 1 | #DEBUG = 1 2 | #OMP = 1 3 | #MPI = 0 4 | 5 | BASE=g++ 6 
/* Debug tracing hook: expands to its argument only when enable_mdebugging
 * is defined. */
#ifdef enable_mdebugging
#define mdebug(cmd) cmd
#else
#define mdebug(cmd)
#endif

/* Growable byte buffer holding fixed-size elements.
 * `len` and `capacity` are measured in BYTES, not elements. */
typedef struct narray_s {
  void *data;
  unsigned len, capacity, element_size;
} narray_t;

/*
 * Creates an empty array with room for `capacity` elements of
 * `element_size` bytes each. The storage is zero-initialised.
 * Returns NULL when memory cannot be allocated.
 */
narray_t *narray_new(unsigned element_size, unsigned capacity) {
  narray_t *na = (narray_t *)malloc(sizeof(narray_t));
  if (na == NULL)
    return NULL;
  na->element_size = element_size;
  na->len = 0;
  na->capacity = capacity * element_size;
  na->data = calloc(capacity, element_size);
  /* calloc may legitimately return NULL for a zero-byte request. */
  if (na->data == NULL && na->capacity != 0) {
    free(na);
    return NULL;
  }
  return na;
}

/*
 * Appends one element (element_size bytes read from `value`) to `na`,
 * growing the storage when the element does not fit.
 * Aborts with a message on stderr if the storage cannot be grown.
 */
void narray_append_val(narray_t *na, const void *value) {
  /* Grow whenever the next element would not fit. Testing only for
   * len == capacity overruns the buffer when capacity is not an exact
   * multiple of element_size. */
  if (na->len + na->element_size > na->capacity) {
    unsigned new_capacity = na->capacity + na->capacity + 10 * na->element_size;
    void *ndata = calloc(new_capacity, 1);
    if (ndata == NULL) {
      fprintf(stderr, "narray_append_val: out of memory\n");
      abort();
    }
    if (na->len != 0)
      memcpy(ndata, na->data, na->len);
    free(na->data);
    na->data = ndata;
    na->capacity = new_capacity;
  }
  memcpy((char *)na->data + na->len, value, na->element_size);
  na->len += na->element_size;
}

/* Releases the element storage and the array header. NULL is ignored. */
void narray_free(narray_t *na) {
  if (na == NULL)
    return;
  free(na->data);
  free(na);
}

/*
 * Calls show_element(data, index, fp) once per stored element, in order.
 */
void narray_print(narray_t *na, void (*show_element)(void *, int, FILE *),
                  FILE *fp) {
  mdebug(fprintf(fp, "enter narray_print len=%u\n", na->len);)
  /* Element count = bytes used / bytes per element. */
  const unsigned count = na->element_size ? na->len / na->element_size : 0;
  unsigned i;
  for (i = 0; i < count; i++) {
    show_element(na->data, (int)i, fp);
  }
}

/*
 * Wraps an existing heap buffer of `len` bytes in an array header without
 * copying it. The array takes ownership: narray_free() and growth in
 * narray_append_val() will free() `data`.
 * Returns NULL when the header cannot be allocated.
 */
narray_t *narray_heaparray_new(void *data, const unsigned len,
                               const unsigned element_size) {
  narray_t *na = (narray_t *)malloc(sizeof(narray_t));
  if (na == NULL)
    return NULL;
  na->data = data;
  na->len = len;
  na->capacity = len;
  na->element_size = element_size;
  return na;
}
unsigned narray_get_len(const narray_t *na) { 27 | return na->len / na->element_size; 28 | } 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /parda/parda.h: -------------------------------------------------------------------------------- 1 | #ifndef _PARDA_H 2 | #define _PARDA_H 3 | 4 | #include "narray.h" 5 | #include "process_args.h" 6 | #include "splay.h" 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #ifdef enable_mpi 18 | #ifdef enable_omp 19 | #define enable_hybrid 20 | #endif 21 | #endif 22 | 23 | #define enable_timing 24 | #ifdef enable_timing 25 | #define PTIME(cmd) cmd 26 | #else 27 | #define PTIME(cmd) 28 | #endif 29 | 30 | #ifdef enable_debugging 31 | #define DEBUG(cmd) cmd 32 | #else 33 | #define DEBUG(cmd) 34 | #endif 35 | 36 | #ifdef enable_profiling 37 | #define PROF(cmd) cmd 38 | #else 39 | #define PROF(cmd) 40 | #endif 41 | 42 | #define DEFAULT_NBUCKETS 1000000 43 | #define B_OVFL nbuckets 44 | #define B_INF nbuckets + 1 45 | #define SLEN 20 46 | 47 | extern int nbuckets; 48 | #ifdef ENABLE_PROFILING 49 | extern char pfile[30]; 50 | extern FILE *pid_fp; 51 | #endif 52 | 53 | typedef char HKEY[SLEN]; 54 | 55 | typedef struct end_keytime_s { 56 | narray_t *gkeys; 57 | narray_t *gtimes; 58 | } end_keytime_t; 59 | 60 | typedef struct processor_info_s { 61 | int pid, psize; 62 | long tstart, tlen, tend, sum; 63 | } processor_info_t; 64 | 65 | typedef struct program_data_s { 66 | GHashTable *gh; 67 | narray_t *ga; 68 | end_keytime_t ekt; 69 | Tree *root; 70 | unsigned int *histogram; 71 | } program_data_t; 72 | 73 | void classical_tree_based_stackdist(char *inputFileName, long lines); 74 | 75 | gboolean compare_strings(gconstpointer a, gconstpointer b); 76 | void iterator(gpointer key, gpointer value, gpointer ekt); 77 | 78 | program_data_t parda_init(void); 79 | void parda_input_with_filename(char *inFileName, 
program_data_t *pdt, 80 | long begin, long end); 81 | void parda_input_with_textfilepointer(FILE *fp, program_data_t *pdt, long begin, 82 | long end); 83 | void parda_input_with_binaryfilepointer(FILE *fp, program_data_t *pdt, 84 | long begin, long end); 85 | void parda_free(program_data_t *pdt); 86 | end_keytime_t parda_generate_end(const program_data_t *pdt); 87 | processor_info_t parda_get_processor_info(int pid, int psize, long sum); 88 | void parda_get_abfront(program_data_t *pdt_a, const narray_t *gb, 89 | const processor_info_t *pit_a); 90 | int parda_get_abend(program_data_t *pdt_b, const end_keytime_t *ekt_a); 91 | program_data_t parda_merge(program_data_t *pdt_a, program_data_t *pdt_b, 92 | const processor_info_t *pit_b); 93 | 94 | void parda_print_front(const program_data_t *pdt); 95 | void parda_print_end(const end_keytime_t *ekt); 96 | void parda_print_tree(const program_data_t *pdt); 97 | void parda_print_hash(const program_data_t *pdt); 98 | void parda_print(const program_data_t *pdt); 99 | void print_iterator(gpointer key, gpointer value, gpointer ekt); 100 | void parda_print_histogram(const unsigned *histogram); 101 | void parda_fprintf_histogram(const unsigned *histogram, FILE *file); 102 | float parda_fprintf_histogram_r(const unsigned *histogram, FILE *file, 103 | bool print); 104 | 105 | int parda_findopt(char *option, char **value, int *argc, char ***argv); 106 | void parda_process(char *input, T tim, program_data_t *pdt); 107 | 108 | void show_hkey(void *data, int i, FILE *fp); 109 | void show_T(void *data, int i, FILE *fp); 110 | 111 | double rtclock(void); 112 | 113 | static inline T parda_low(int pid, int psize, T sum) { 114 | return (((long long)(pid)) * (sum) / (psize)); 115 | } 116 | 117 | static inline T parda_high(int pid, int psize, T sum) { 118 | return parda_low(pid + 1, psize, sum) - 1; 119 | } 120 | 121 | static inline T parda_size(int pid, int psize, T sum) { 122 | return (parda_low(pid + 1, psize, sum)) - (parda_low(pid, psize, 
sum)); 123 | } 124 | 125 | static inline T parda_owner(T index, int psize, T sum) { 126 | return (((long long)psize) * (index + 1) - 1) / sum; 127 | } 128 | 129 | static inline char *parda_generate_pfilename(char filename[], int pid, 130 | int psize) { 131 | char pfilename[30]; 132 | sprintf(pfilename, "%d_%s_p%d.txt", psize, filename, pid); 133 | return strdup(pfilename); 134 | } 135 | 136 | static inline void process_one_access(char *input, program_data_t *pdt, 137 | const long tim) { 138 | int distance; 139 | int *lookup; 140 | lookup = (T *)g_hash_table_lookup(pdt->gh, input); 141 | 142 | if (lookup == NULL) { 143 | char *data = strdup(input); 144 | pdt->root = insert(tim, pdt->root); 145 | long *p_data; 146 | narray_append_val(pdt->ga, input); 147 | if (!(p_data = (long *)malloc(sizeof(long)))) { 148 | printf("no memory for p_data\n"); 149 | assert(0); 150 | exit(-1); 151 | } 152 | *p_data = tim; 153 | g_hash_table_insert(pdt->gh, data, p_data); 154 | } 155 | 156 | else { 157 | char *data = strdup(input); 158 | pdt->root = insert((*lookup), pdt->root); 159 | distance = node_size(pdt->root->right); 160 | pdt->root = delete_(*lookup, pdt->root); 161 | pdt->root = insert(tim, pdt->root); 162 | int *p_data; 163 | if (!(p_data = (int *)malloc(sizeof(int)))) { 164 | printf("no memory for p_data\n"); 165 | assert(0); 166 | exit(-1); 167 | } 168 | *p_data = tim; 169 | g_hash_table_replace(pdt->gh, data, p_data); 170 | 171 | if (distance > nbuckets) 172 | pdt->histogram[B_OVFL] += 1; 173 | else 174 | pdt->histogram[distance] += 1; 175 | } 176 | } 177 | 178 | static inline int process_one_access_and_get_distance(char *input, 179 | program_data_t *pdt, 180 | const long tim) { 181 | int distance; 182 | int *lookup; 183 | lookup = (T *)g_hash_table_lookup(pdt->gh, input); 184 | 185 | if (lookup == NULL) { 186 | char *data = strdup(input); 187 | pdt->root = insert(tim, pdt->root); 188 | long *p_data; 189 | narray_append_val(pdt->ga, input); 190 | if (!(p_data = (long 
*)malloc(sizeof(long)))) { 191 | printf("no memory for p_data\n"); 192 | assert(0); 193 | exit(-1); 194 | } 195 | *p_data = tim; 196 | g_hash_table_insert(pdt->gh, data, p_data); 197 | 198 | return B_INF; 199 | } 200 | 201 | else { 202 | char *data = strdup(input); 203 | pdt->root = insert((*lookup), pdt->root); 204 | distance = node_size(pdt->root->right); 205 | pdt->root = delete_(*lookup, pdt->root); 206 | pdt->root = insert(tim, pdt->root); 207 | int *p_data; 208 | if (!(p_data = (int *)malloc(sizeof(int)))) { 209 | printf("no memory for p_data\n"); 210 | assert(0); 211 | exit(-1); 212 | } 213 | *p_data = tim; 214 | g_hash_table_replace(pdt->gh, data, p_data); 215 | 216 | if (distance > nbuckets) 217 | pdt->histogram[B_OVFL] += 1; 218 | else 219 | pdt->histogram[distance] += 1; 220 | 221 | if (distance > nbuckets) 222 | return B_OVFL; 223 | else 224 | return distance; 225 | } 226 | } 227 | #endif 228 | -------------------------------------------------------------------------------- /parda/parda_mpi.c: -------------------------------------------------------------------------------- 1 | #include "parda.h" 2 | #ifdef enable_omp 3 | #include "parda_omp.h" 4 | #endif 5 | #include "parda_mpi.h" 6 | 7 | narray_t *parda_recv_array(int source, int *tag, unsigned element_size) { 8 | narray_t *ga; 9 | MPI_Status status; 10 | unsigned blen; 11 | void *bdata; 12 | MPI_Recv(&blen, 1, MPI_UNSIGNED, source, (*tag)++, MPI_COMM_WORLD, &status); 13 | bdata = (char *)calloc(blen, 1); 14 | MPI_Recv(bdata, blen, MPI_CHAR, source, (*tag)++, MPI_COMM_WORLD, &status); 15 | ga = narray_heaparray_new(bdata, blen, element_size); 16 | return ga; 17 | } 18 | 19 | void parda_send_array(narray_t *ga, int dest, int *tag) { 20 | MPI_Send(&ga->len, 1, MPI_UNSIGNED, dest, (*tag)++, MPI_COMM_WORLD); 21 | MPI_Send(ga->data, ga->len, MPI_CHAR, dest, (*tag)++, MPI_COMM_WORLD); 22 | } 23 | 24 | unsigned *parda_mpi_merge(program_data_t *pdt, processor_info_t *pit) { 25 | int i, len; 26 | int psize = 
pit->psize; 27 | int pid = pit->pid; 28 | int var, tag = 1; 29 | for (var = pid, len = 1; var % 2 == 1; var = (var >> 1), len = (len << 1)) { 30 | end_keytime_t ekt_a; 31 | int dest = pid - len; 32 | parda_send_array(pdt->ga, dest, &tag); 33 | ekt_a.gkeys = parda_recv_array(dest, &tag, sizeof(HKEY)); 34 | ekt_a.gtimes = parda_recv_array(dest, &tag, sizeof(T)); 35 | parda_get_abend(pdt, &ekt_a); 36 | narray_t *mga = parda_recv_array(dest, &tag, sizeof(HKEY)); 37 | narray_free(pdt->ga); 38 | pdt->ga = mga; 39 | } 40 | if (pid + len < psize) { 41 | int source = pid + len; 42 | pdt->ekt = parda_generate_end(pdt); 43 | narray_t *gb = parda_recv_array(source, &tag, sizeof(HKEY)); 44 | parda_send_array(pdt->ekt.gkeys, source, &tag); 45 | parda_send_array(pdt->ekt.gtimes, source, &tag); 46 | parda_get_abfront(pdt, gb, pit); 47 | parda_send_array(pdt->ga, source, &tag); 48 | narray_free(pdt->ekt.gkeys); 49 | narray_free(pdt->ekt.gtimes); 50 | } else if (pid == psize - 1) { 51 | pdt->histogram[B_INF] += narray_get_len(pdt->ga); 52 | } 53 | unsigned *global_his = (unsigned *)malloc(sizeof(unsigned) * (nbuckets + 2)); 54 | 55 | for (i = 0; i < nbuckets + 2; i++) { 56 | global_his[i] = 0; 57 | } 58 | MPI_Reduce(pdt->histogram, global_his, nbuckets + 2, MPI_UNSIGNED, MPI_SUM, 0, 59 | MPI_COMM_WORLD); 60 | return global_his; 61 | } 62 | int parda_MPI_IO_binary_input(program_data_t *pdt, char filename[], 63 | const processor_info_t *pit) { 64 | MPI_File thefile; 65 | MPI_Status status; 66 | MPI_File_open(MPI_COMM_WORLD, filename, MPI_MODE_RDONLY, MPI_INFO_NULL, 67 | &thefile); 68 | MPI_File_set_view(thefile, pit->tstart * sizeof(void *), MPI_LONG, MPI_LONG, 69 | "native", MPI_INFO_NULL); 70 | #ifdef ENABLE_PROFILING 71 | double t3, t4; 72 | MPI_Barrier(MPI_COMM_WORLD); 73 | t3 = MPI_Wtime(); 74 | #endif 75 | GHashTable *gh = pdt->gh; 76 | Tree *root = pdt->root; 77 | narray_t *ga = pdt->ga; 78 | int bufsize = 10000; 79 | void **buf = (void **)malloc(bufsize * sizeof(void *)); 80 | 
unsigned int *histogram = pdt->histogram; 81 | HKEY input; 82 | long tim, begin; 83 | int count, i; 84 | for (tim = begin = pit->tstart; begin <= pit->tend; begin += count) { 85 | MPI_File_read(thefile, buf, bufsize, MPI_LONG, &status); 86 | MPI_Get_count(&status, MPI_LONG, &count); 87 | if (begin + count > pit->tend + 1) { 88 | count = pit->tend + 1 - begin; 89 | } 90 | for (i = 0; i < count; i++) { 91 | sprintf(input, "%p", buf[i]); 92 | int distance; 93 | T *lookup; 94 | lookup = g_hash_table_lookup(gh, input); 95 | 96 | if (lookup == NULL) { 97 | char *data = strdup(input); 98 | root = insert(tim, root); 99 | T *p_data; 100 | 101 | narray_append_val(ga, input); 102 | if (!(p_data = (T *)malloc(sizeof(T)))) 103 | return -1; 104 | *p_data = tim; 105 | g_hash_table_insert(gh, data, p_data); 106 | } 107 | 108 | else { 109 | root = insert((*lookup), root); 110 | distance = node_size(root->right); 111 | root = delete_(*lookup, root); 112 | root = insert(tim, root); 113 | int *p_data; 114 | if (!(p_data = (int *)malloc(sizeof(int)))) 115 | return -1; 116 | *p_data = tim; 117 | g_hash_table_replace(gh, strdup(input), p_data); 118 | 119 | if (distance > nbuckets) 120 | histogram[B_OVFL]++; 121 | else 122 | histogram[distance]++; 123 | } 124 | tim++; 125 | } 126 | } 127 | printf("start from %ld to %ld\n", pit->tstart, tim); 128 | #ifdef ENABLE_PROFILING 129 | t4 = MPI_Wtime(); 130 | int pid = pit->pid; 131 | fprintf(pid_fp, 132 | "parda input time with barrier = %.3lf sec for processor %d; \n", 133 | t4 - t3, pid); 134 | #endif 135 | pdt->root = root; 136 | return 1; 137 | } 138 | 139 | void parda_mpi_stackdist(char *inputFileName, long lines, int processors, 140 | int argc, char **argv) { 141 | int pid, psize; 142 | program_data_t pdt; 143 | long psum; 144 | processor_info_t pit; 145 | MPI_Init(&argc, &argv); 146 | process_args(argc, argv); 147 | MPI_Comm_rank(MPI_COMM_WORLD, &pid); 148 | MPI_Comm_size(MPI_COMM_WORLD, &psize); 149 | psum = lines; 150 | MPI_Bcast(&psum, 
1, MPI_INT, 0, MPI_COMM_WORLD); 151 | #ifdef enable_timing 152 | double ts, te, t_init, t_input, t_print, t_free; 153 | ts = MPI_Wtime(); 154 | #endif 155 | pit = parda_get_processor_info(pid, psize, psum); 156 | pdt = parda_init(); 157 | PTIME(MPI_Barrier(MPI_COMM_WORLD);) 158 | PTIME(te = MPI_Wtime();) 159 | PTIME(t_init = te - ts;) 160 | parda_input_with_filename(parda_generate_pfilename(inputFileName, pid, psize), 161 | &pdt, pit.tstart, pit.tend); 162 | unsigned *global_his = parda_mpi_merge(&pdt, &pit); 163 | PTIME(MPI_Barrier(MPI_COMM_WORLD);) 164 | PTIME(ts = MPI_Wtime();) 165 | PTIME(t_input = ts - te;) 166 | if (pid == 0) { 167 | parda_print_histogram(global_his); 168 | } 169 | PTIME(te = MPI_Wtime();) 170 | PTIME(t_print = te - ts;) 171 | parda_free(&pdt); 172 | free(global_his); 173 | PTIME(ts = MPI_Wtime();) 174 | PTIME(t_free = ts - te;) 175 | #ifdef enable_timing 176 | if (pid == 0) { 177 | printf("mpi\n"); 178 | printf("init time is %lf\n", t_init); 179 | printf("input time is %lf\n", t_input); 180 | printf("print time is %lf\n", t_print); 181 | printf("free time is %lf\n", t_free); 182 | } 183 | #endif 184 | MPI_Finalize(); 185 | } 186 | 187 | #if defined(enable_omp) && defined(enable_mpi) 188 | void parda_hybrid_stackdist(char *inputFileName, long lines, int processors, 189 | int argc, char **argv) { 190 | int pid, psize; 191 | program_data_t pdt; 192 | long psum; 193 | processor_info_t pit; 194 | MPI_Init(&argc, &argv); 195 | process_args(argc, argv); 196 | MPI_Comm_rank(MPI_COMM_WORLD, &pid); 197 | MPI_Comm_size(MPI_COMM_WORLD, &psize); 198 | DEBUG(if (pid == 0)) 199 | DEBUG(printf("enter hybrid\n");) 200 | 201 | psum = lines; 202 | MPI_Bcast(&psum, 1, MPI_INT, 0, MPI_COMM_WORLD); 203 | pit = parda_get_processor_info(pid, psize, psum); 204 | program_data_t *pdt_a = parda_omp_init(threads); 205 | 206 | pdt = parda_omp_input(inputFileName, pdt_a, pit.tstart, pit.tend, pid, psize); 207 | parda_omp_free(pdt_a, threads); 208 | 209 | unsigned 
*global_his = parda_mpi_merge(&pdt, &pit); 210 | 211 | if (pid == 0) { 212 | parda_print_histogram(global_his); 213 | } 214 | parda_free(&pdt); 215 | free(global_his); 216 | MPI_Finalize(); 217 | } 218 | #endif 219 | -------------------------------------------------------------------------------- /parda/parda_mpi.h: -------------------------------------------------------------------------------- 1 | #ifndef _PARDA_MPI_H 2 | #define _PARDA_MPI_H 3 | 4 | #include "parda.h" 5 | #include 6 | 7 | narray_t *parda_recv_array(int source, int *tag, unsigned element_size); 8 | void parda_send_array(narray_t *ga, int dest, int *tag); 9 | unsigned *parda_mpi_merge(program_data_t *pdt, processor_info_t *pit); 10 | void parda_mpi_free(program_data_t *pdt, unsigned *global_his); 11 | int parda_MPI_IO_binary_input(program_data_t *pdt, char filename[], 12 | const processor_info_t *pit); 13 | void parda_mpi_stackdist(char *inputFileName, long lines, int processors, 14 | int argc, char **argv); 15 | #if defined(enable_omp) && defined(enable_mpi) 16 | void parda_hybrid_stackdist(char *inputFileName, long lines, int processors, 17 | int argc, char **argv); 18 | #endif 19 | #endif 20 | -------------------------------------------------------------------------------- /parda/parda_omp.c: -------------------------------------------------------------------------------- 1 | #include "parda_omp.h" 2 | 3 | processor_info_t parda_get_thread_info(long lines, long begin, int pid, 4 | int psize) { 5 | processor_info_t pit_c; 6 | pit_c.pid = pid; 7 | pit_c.psize = psize; 8 | pit_c.sum = lines; 9 | pit_c.tstart = parda_low(pit_c.pid, pit_c.psize, lines); 10 | pit_c.tstart += begin; 11 | pit_c.tend = parda_high(pit_c.pid, pit_c.psize, lines); 12 | pit_c.tend += begin; 13 | pit_c.tlen = parda_size(pit_c.pid, pit_c.psize, lines); 14 | return pit_c; 15 | } 16 | 17 | program_data_t *parda_omp_init(int nthreads) { 18 | g_thread_init(NULL); 19 | program_data_t *pdt_a = 20 | (program_data_t *)malloc(nthreads 
* sizeof(program_data_t)); 21 | int i; 22 | for (i = 0; i < nthreads; i++) 23 | pdt_a[i] = parda_init(); 24 | omp_set_num_threads(nthreads); 25 | return pdt_a; 26 | } 27 | 28 | void parda_omp_openfile(char inputFileName[], const int pid, const int nthreads, 29 | const int psize, FILE *fpa[]) { 30 | int i; 31 | for (i = 0; i < nthreads; i++) { 32 | char *pfilename = parda_generate_pfilename( 33 | inputFileName, pid * nthreads + i, psize * nthreads); 34 | fpa[i] = fopen(pfilename, "r"); 35 | } 36 | } 37 | 38 | program_data_t parda_omp_input_with_filename(char inputFileName[], 39 | program_data_t *pdt_a, long begin, 40 | long end, int pid, int psize) { 41 | int nthreads = threads; 42 | long lines = end + 1 - begin; 43 | FILE *fpa[8]; 44 | parda_omp_openfile(inputFileName, pid, nthreads, psize, fpa); 45 | #pragma omp parallel default(none) \ 46 | firstprivate(begin, pid, psize, nthreads, lines, is_binary) \ 47 | shared(pdt_a, fpa) 48 | { 49 | int i = omp_get_thread_num(); 50 | FILE *fp = fpa[i]; 51 | processor_info_t pit = parda_get_thread_info(lines, begin, i, nthreads); 52 | program_data_t pdt_c = pdt_a[i]; 53 | if (!is_binary) { 54 | parda_input_with_textfilepointer(fp, &pdt_c, pit.tstart, pit.tend); 55 | } else { 56 | parda_input_with_binaryfilepointer(fp, &pdt_c, pit.tstart, pit.tend); 57 | } 58 | pdt_a[i] = pdt_c; 59 | int tid = i; 60 | int var, len; 61 | int mlen = nthreads >> 1; 62 | for (var = tid, len = 1; len <= mlen; len = (len << 1)) { 63 | if (var & 1) { 64 | program_data_t pdt_A = pdt_a[tid - len]; 65 | program_data_t pdt_B = pdt_a[tid]; 66 | pdt_a[tid] = parda_merge(&pdt_A, &pdt_B, &pit); 67 | var >>= 1; 68 | } 69 | #pragma omp barrier 70 | } 71 | } 72 | program_data_t pdt_c = pdt_a[nthreads - 1]; 73 | return pdt_c; 74 | } 75 | 76 | program_data_t parda_omp_input(char inputFileName[], program_data_t *pdt_a, 77 | long begin, long end, int pid, int psize) { 78 | int nthreads = threads; 79 | 80 | long lines = end + 1 - begin; 81 | processor_info_t 
pit_a[8]; 82 | int syn[8 << 6]; 83 | memset(syn, 0, sizeof(syn)); 84 | int i; 85 | #pragma omp parallel default(none) private(i) \ 86 | firstprivate(begin, pid, psize, nthreads, lines) \ 87 | shared(pdt_a, pit_a, syn, inputFileName) 88 | { 89 | DEBUG(printf("enter parallel for\n");) 90 | __sync_synchronize(); 91 | #pragma omp for 92 | for (i = 0; i < nthreads; i++) { 93 | printf("i=%d executed by thread=%d\n", i, omp_get_thread_num()); 94 | pit_a[i] = parda_get_thread_info(lines, begin, i, nthreads); 95 | parda_input_with_filename(parda_generate_pfilename(inputFileName, 96 | pid * nthreads + i, 97 | psize * nthreads), 98 | &pdt_a[i], pit_a[i].tstart, pit_a[i].tend); 99 | #ifdef enable_debugging 100 | printf("after input in for\n"); 101 | #endif 102 | int tid = i; 103 | int var, len; 104 | for (var = tid, len = 1; var % 2 == 1; 105 | var = (var >> 1), len = (len << 1)) { 106 | DEBUG(printf("before while in for %d and %d\n", tid - len, tid);) 107 | while (syn[(tid - len) << 8] == 0) { 108 | #pragma omp flush(syn) 109 | } 110 | DEBUG(printf("after while in for and will merge %d and %d\n", tid - len, 111 | tid);) 112 | pdt_a[tid] = parda_merge(&pdt_a[tid - len], &pdt_a[tid], &pit_a[tid]); 113 | DEBUG(printf("after merged %d and %d\n", tid - len, tid);) 114 | } 115 | syn[tid << 8]++; 116 | #pragma omp flush(syn) 117 | } 118 | } 119 | program_data_t pdt = pdt_a[nthreads - 1]; 120 | return pdt; 121 | } 122 | 123 | void parda_omp_free(program_data_t *pdt_a, int psize) { 124 | int i; 125 | #pragma omp parallel private(i) shared(pdt_a, psize) 126 | { 127 | #pragma omp for 128 | for (i = 0; i < psize - 1; i++) { 129 | g_hash_table_destroy(pdt_a[i].gh); 130 | } 131 | } 132 | free(pdt_a); 133 | } 134 | 135 | void parda_omp_stackdist(char *inputFileName, long lines, int threads) { 136 | #ifdef enable_timing 137 | double ts, te, t_init, t_input, t_print, t_free; 138 | ts = rtclock(); 139 | #endif 140 | program_data_t *pdt_a = parda_omp_init(threads); 141 | PTIME(te = rtclock();) 
142 | PTIME(t_init = te - ts;) 143 | DEBUG(printf("after omp init\n");) 144 | program_data_t pdt_c = 145 | parda_omp_input_with_filename(inputFileName, pdt_a, 0, lines - 1, 0, 1); 146 | 147 | DEBUG(printf("after omp input\n");) 148 | program_data_t *pdt = &pdt_c; 149 | pdt->histogram[B_INF] += narray_get_len(pdt->ga); 150 | PTIME(ts = rtclock();) 151 | PTIME(t_input = ts - te;) 152 | parda_print_histogram(pdt->histogram); 153 | PTIME(te = rtclock();) 154 | PTIME(t_print = te - ts;) 155 | parda_omp_free(pdt_a, threads); 156 | parda_free(pdt); 157 | PTIME(ts = rtclock();) 158 | PTIME(t_free = ts - te;) 159 | #ifdef enable_timing 160 | printf("omp\n"); 161 | printf("init time is %lf\n", t_init); 162 | printf("input time is %lf\n", t_input); 163 | printf("print time is %lf\n", t_print); 164 | printf("free time is %lf\n", t_free); 165 | #endif 166 | } 167 | -------------------------------------------------------------------------------- /parda/parda_omp.h: -------------------------------------------------------------------------------- 1 | #ifndef _PARDA_OMP_H 2 | #define _PARDA_OMP_H 3 | 4 | #include "parda.h" 5 | #include 6 | 7 | processor_info_t parda_get_thread_info(long lines, long begin, int pid, 8 | int psize); 9 | program_data_t *parda_omp_init(int psize); 10 | program_data_t parda_omp_input(char inputFileName[], program_data_t *pdt_a, 11 | long begin, long end, int pid, int psize); 12 | void parda_omp_free(program_data_t *pdt_a, int psize); 13 | void parda_omp_stackdist(char *inputFileName, long lines, int threads); 14 | #endif 15 | -------------------------------------------------------------------------------- /parda/parda_print.c: -------------------------------------------------------------------------------- 1 | #include "parda.h" 2 | 3 | void parda_print_front(const program_data_t *pdt) { 4 | narray_t *ga = pdt->ga; 5 | unsigned i; 6 | unsigned len = narray_get_len(ga); 7 | printf("< "); 8 | for (i = 0; i < len; i++) { 9 | printf("%s ", ((HKEY 
*)ga->data)[i]); 10 | } 11 | printf(">\n"); 12 | } 13 | 14 | void parda_print_end(const end_keytime_t *ekt) { 15 | narray_t *gkeys = ekt->gkeys; 16 | narray_t *gtimes = ekt->gtimes; 17 | 18 | unsigned len = narray_get_len(gkeys); 19 | 20 | unsigned i; 21 | printf("[ "); 22 | for (i = 0; i < len; i++) { 23 | printf("(%s:%d) ", ((HKEY *)(gkeys->data))[i], ((T *)(gtimes->data))[i]); 24 | } 25 | printf("]\n"); 26 | } 27 | 28 | void parda_print_tree(const program_data_t *pdt) { 29 | Tree *root = pdt->root; 30 | printtree(root, 0); 31 | } 32 | 33 | void print_iterator(gpointer key, gpointer value, gpointer ekt) { 34 | printf("(%s:%d) ", (char *)key, *(T *)value); 35 | } 36 | 37 | void parda_print_hash(const program_data_t *pdt) { 38 | printf("[ "); 39 | g_hash_table_foreach(pdt->gh, (GHFunc)print_iterator, NULL); 40 | printf("]\n"); 41 | } 42 | 43 | void parda_print(const program_data_t *pdt) { 44 | parda_print_front(pdt); 45 | parda_print_tree(pdt); 46 | parda_print_hash(pdt); 47 | } 48 | 49 | void parda_print_histogram(const unsigned *histogram) { 50 | int last_bucket; 51 | int i; 52 | unsigned long long sum = 0; 53 | unsigned long long cum = 0; 54 | 55 | last_bucket = nbuckets - 1; 56 | while (histogram[last_bucket] == 0) 57 | last_bucket--; 58 | 59 | for (i = 0; i <= last_bucket; i++) 60 | sum += histogram[i]; 61 | sum += histogram[B_OVFL]; 62 | sum += histogram[B_INF]; 63 | 64 | printf("# Dist\t Refs\t Refs(%%)\t Cum_Ref\tCum_Ref(%%)\n"); 65 | 66 | for (i = 0; i <= last_bucket; i++) { 67 | cum += histogram[i]; 68 | if (histogram[i]) 69 | printf("%6d\t%9u\t%0.8lf\t%9llu\t%0.8lf\n", i, histogram[i], 70 | histogram[i] / (double)sum, cum, cum / (double)sum); 71 | } 72 | 73 | cum += histogram[B_OVFL]; 74 | printf("#OVFL \t%9u\t%0.8f\t%9llu\t%0.8lf\n", histogram[B_OVFL], 75 | histogram[B_OVFL] / (double)sum, cum, cum / (double)sum); 76 | cum += histogram[B_INF]; 77 | printf("#INF \t%9u\t%0.8f\t%9llu\t%0.8lf\n", histogram[B_INF], 78 | histogram[B_INF] / (double)sum, cum, 
cum / (double)sum); 79 | } 80 | 81 | void parda_fprintf_histogram(const unsigned *histogram, FILE *file) { 82 | int last_bucket; 83 | int i; 84 | unsigned long long sum = 0; 85 | unsigned long long cum = 0; 86 | 87 | last_bucket = nbuckets - 1; 88 | while (histogram[last_bucket] == 0) 89 | last_bucket--; 90 | 91 | for (i = 0; i <= last_bucket; i++) 92 | sum += histogram[i]; 93 | sum += histogram[B_OVFL]; 94 | sum += histogram[B_INF]; 95 | 96 | fprintf(file, "# Dist\t Refs\t Refs(%%)\t Cum_Ref\tCum_Ref(%%)\n"); 97 | 98 | for (i = 0; i <= last_bucket; i++) { 99 | cum += histogram[i]; 100 | if (histogram[i]) 101 | fprintf(file, "%6d\t%9u\t%0.8lf\t%9llu\t%0.8lf\n", i, histogram[i], 102 | histogram[i] / (double)sum, cum, cum / (double)sum); 103 | } 104 | 105 | cum += histogram[B_OVFL]; 106 | fprintf(file, "#OVFL \t%9u\t%0.8f\t%9llu\t%0.8lf\n", histogram[B_OVFL], 107 | histogram[B_OVFL] / (double)sum, cum, cum / (double)sum); 108 | cum += histogram[B_INF]; 109 | fprintf(file, "#INF \t%9u\t%0.8f\t%9llu\t%0.8lf\n", histogram[B_INF], 110 | histogram[B_INF] / (double)sum, cum, cum / (double)sum); 111 | } 112 | 113 | float parda_fprintf_histogram_r(const unsigned *histogram, FILE *file, 114 | bool print = true) { 115 | int last_bucket; 116 | int i; 117 | unsigned long long sum = 0; 118 | unsigned long long cum = 0; 119 | 120 | last_bucket = nbuckets - 1; 121 | while (histogram[last_bucket] == 0) 122 | last_bucket--; 123 | 124 | for (i = 0; i <= last_bucket; i++) 125 | sum += histogram[i]; 126 | sum += histogram[B_OVFL]; 127 | sum += histogram[B_INF]; 128 | 129 | if (print) 130 | fprintf(file, "# Dist\t Refs\t Refs(%%)\t Cum_Ref\tCum_Ref(%%)\n"); 131 | 132 | for (i = 0; i <= last_bucket; i++) { 133 | cum += histogram[i]; 134 | if (histogram[i] && print) 135 | fprintf(file, "%6d\t%9u\t%0.8lf\t%9llu\t%0.8lf\n", i, histogram[i], 136 | histogram[i] / (double)sum, cum, cum / (double)sum); 137 | } 138 | 139 | cum += histogram[B_OVFL]; 140 | if (print) 141 | fprintf(file, "#OVFL 
\t%9u\t%0.8f\t%9llu\t%0.8lf\n", histogram[B_OVFL], 142 | histogram[B_OVFL] / (double)sum, cum, cum / (double)sum); 143 | 144 | float hit_rate = cum / (double)sum; 145 | 146 | cum += histogram[B_INF]; 147 | if (print) 148 | fprintf(file, "#INF \t%9u\t%0.8f\t%9llu\t%0.8lf\n", histogram[B_INF], 149 | histogram[B_INF] / (double)sum, cum, cum / (double)sum); 150 | 151 | return hit_rate; 152 | } -------------------------------------------------------------------------------- /parda/process_args.c: -------------------------------------------------------------------------------- 1 | #include "process_args.h" 2 | 3 | int is_omp = 0; 4 | int is_mpi = 0; 5 | int is_seperate = 0; 6 | int is_binary = 0; 7 | int threads = 1; 8 | char inputFileName[200] = "d.in"; 9 | long lines = -1; 10 | int buffersize = 10000; 11 | 12 | int process_args(int argc, char **argv) { 13 | int c; 14 | is_omp = is_mpi = is_binary = 0; 15 | while (1) { 16 | static struct option long_options[] = { 17 | 18 | {"enable-omp", no_argument, 0, 'o'}, 19 | {"enable-mpi", no_argument, 0, 'm'}, 20 | 21 | {"fileformat", required_argument, 0, 'f'}, 22 | {"input", required_argument, 0, 'i'}, 23 | {"lines", required_argument, 0, 'l'}, 24 | {"threads", required_argument, 0, 't'}, 25 | {"enable-seperate", no_argument, 0, 's'}, 26 | {"help", no_argument, 0, 'h'}, 27 | {0, 0, 0, 0}}; 28 | 29 | int option_index = 0; 30 | 31 | c = getopt_long(argc, argv, "omb:", long_options, &option_index); 32 | 33 | if (c == -1) 34 | break; 35 | 36 | switch (c) { 37 | case 0: 38 | 39 | if (long_options[option_index].flag != 0) 40 | break; 41 | printf("option %s", long_options[option_index].name); 42 | if (optarg) 43 | printf(" with arg %s", optarg); 44 | printf("\n"); 45 | break; 46 | 47 | case 's': 48 | is_seperate = 1; 49 | break; 50 | case 'o': 51 | is_omp = 1; 52 | break; 53 | 54 | case 'm': 55 | is_mpi = 1; 56 | break; 57 | 58 | case 'f': 59 | if (!strcmp(optarg, "binary")) 60 | is_binary = 1; 61 | else if (!strcmp(optarg, "text")) 
62 | is_binary = 0; 63 | else 64 | printf("wrong value for fileformat. Try help\n"), abort(); 65 | break; 66 | 67 | case 'i': 68 | strcpy(inputFileName, optarg); 69 | 70 | break; 71 | case 'l': 72 | lines = atol(optarg); 73 | break; 74 | case 't': 75 | threads = atol(optarg); 76 | break; 77 | case 'h': 78 | printf("case 1: seperate file\n"); 79 | printf("./parda.x --enable-seperate --input=normal_137979.trace " 80 | "--lines=137979 --threads=4\n"); 81 | printf("case 2: run with sequential algorithm\n"); 82 | printf("./parda.x --input=normal_343684.trace --lines=343684\n"); 83 | printf("case 3: run with OpenMp flag\n"); 84 | printf("./parda.x --input=normal_343684.trace --lines=343684 " 85 | "--enable-omp --threads=4\n"); 86 | printf("case 4: run with binary file input\n"); 87 | printf("./parda.x --fileformat=binary --input=binary_167024.trace " 88 | "--lines=167024 > binary.re\n"); 89 | exit(0); 90 | break; 91 | case '?': 92 | 93 | break; 94 | 95 | default: 96 | abort(); 97 | } 98 | } 99 | 100 | if (optind < argc) { 101 | printf("non-option ARGV-elements: "); 102 | while (optind < argc) 103 | printf("%s ", argv[optind++]); 104 | putchar('\n'); 105 | } 106 | if (lines == -1) 107 | printf("total lines number must be provided\n"), abort(); 108 | return 0; 109 | } 110 | -------------------------------------------------------------------------------- /parda/process_args.h: -------------------------------------------------------------------------------- 1 | #ifndef _PROCESS_ARGS_H 2 | #define _PROCESS_ARGS_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | extern int is_omp; 13 | extern int is_mpi; 14 | extern int is_seperate; 15 | extern int is_binary; 16 | extern char inputFileName[200]; 17 | extern long lines; 18 | extern int threads; 19 | extern int buffersize; 20 | 21 | int process_args(int argc, char **argv); 22 | #endif 23 | -------------------------------------------------------------------------------- 
/*
 * Count the whitespace-separated tokens in a text file.
 *
 * filename: path of the file to scan.
 *
 * Returns the number of tokens, or -1 when the file cannot be opened or the
 * counter wraps negative.
 */
long get_file_lines(char filename[]) {
  FILE *fp = fopen(filename, "r");
  long sum = 0;

  if (fp == NULL) {
    printf("cannot open trace file %s\n", filename);
    return -1;
  }

  /* "%*s" consumes one token without storing it, so a token of any length
   * is counted once and no fixed-size buffer can be overrun.  fscanf returns
   * 0 for each consumed token and EOF once only whitespace remains. */
  while (fscanf(fp, "%*s") != EOF) {
    sum++;
    if (sum < 0) {
      printf("Trace length is out of 32 bit integer type\n");
      fclose(fp); /* do not leak the stream on the error path */
      return -1;
    }
  }
  fclose(fp);
  return sum;
}
processor_number, sum); 57 | tend = parda_high(i, processor_number, sum); 58 | for (t = tstart; t <= tend; t += count) { 59 | count = min(tend + 1 - t, buffersize); 60 | count = fread(buffer, sizeof(void *), count, fp); 61 | fwrite(buffer, sizeof(void *), count, fw); 62 | } 63 | fclose(fw); 64 | } 65 | fclose(fp); 66 | return sum; 67 | } 68 | 69 | long parda_seperate_file(char inputFileName[], int processor_number, 70 | long lines) { 71 | if (lines == -1) 72 | lines = get_file_lines(inputFileName); 73 | int psize = processor_number; 74 | if (!is_binary) 75 | seperate_textfile(inputFileName, psize, lines); 76 | else 77 | seperate_binaryfile(inputFileName, psize, lines); 78 | char linesFile[50]; 79 | sprintf(linesFile, "%s_lines_%ld.txt", inputFileName, lines); 80 | FILE *tfile = fopen(linesFile, "w"); 81 | fprintf(tfile, "%ld", lines); 82 | fclose(tfile); 83 | return lines; 84 | } 85 | -------------------------------------------------------------------------------- /parda/seperate.h: -------------------------------------------------------------------------------- 1 | #ifndef _SEPERATE_H 2 | #define _SEPERATE_H 3 | 4 | #include "parda.h" 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #ifndef min 16 | #define min(a, b) (((a) < (b)) ? 
(a) : (b)) 17 | #endif 18 | 19 | long get_file_lines(char filename[]); 20 | long seperate_textfile(char filename[], int processor_number, long lines); 21 | long seperate_binaryfile(char inputFileName[], int processor_number, 22 | long lines); 23 | long parda_seperate_file(char inputFileName[], int processor_number, 24 | long lines); 25 | #endif 26 | -------------------------------------------------------------------------------- /parda/splay.c: -------------------------------------------------------------------------------- 1 | #include "splay.h" 2 | 3 | Tree *splay(T i, Tree *t) 4 | 5 | { 6 | Tree N, *l, *r, *y; 7 | T comp, l_size, r_size; 8 | if (t == NULL) 9 | return t; 10 | N.left = N.right = NULL; 11 | l = r = &N; 12 | l_size = r_size = 0; 13 | 14 | for (;;) { 15 | comp = compare(i, t->key); 16 | if (comp < 0) { 17 | if (t->left == NULL) 18 | break; 19 | if (compare(i, t->left->key) < 0) { 20 | y = t->left; 21 | t->left = y->right; 22 | y->right = t; 23 | t->size = node_size(t->left) + node_size(t->right) + 1; 24 | t = y; 25 | if (t->left == NULL) 26 | break; 27 | } 28 | r->left = t; 29 | r = t; 30 | t = t->left; 31 | r_size += 1 + node_size(r->right); 32 | } else if (comp > 0) { 33 | if (t->right == NULL) 34 | break; 35 | if (compare(i, t->right->key) > 0) { 36 | y = t->right; 37 | t->right = y->left; 38 | y->left = t; 39 | t->size = node_size(t->left) + node_size(t->right) + 1; 40 | t = y; 41 | if (t->right == NULL) 42 | break; 43 | } 44 | l->right = t; 45 | l = t; 46 | t = t->right; 47 | l_size += 1 + node_size(l->left); 48 | } else { 49 | break; 50 | } 51 | } 52 | l_size += node_size(t->left); 53 | r_size += node_size(t->right); 54 | t->size = l_size + r_size + 1; 55 | 56 | l->right = r->left = NULL; 57 | 58 | for (y = N.right; y != NULL; y = y->right) { 59 | y->size = l_size; 60 | l_size -= 1 + node_size(y->left); 61 | } 62 | for (y = N.left; y != NULL; y = y->left) { 63 | y->size = r_size; 64 | r_size -= 1 + node_size(y->right); 65 | } 66 | 67 | l->right 
= t->left; 68 | r->left = t->right; 69 | t->left = N.right; 70 | t->right = N.left; 71 | 72 | return t; 73 | } 74 | 75 | Tree *insert(T i, Tree *t) { 76 | 77 | Tree *new_; 78 | 79 | if (t != NULL) { 80 | t = splay(i, t); 81 | if (compare(i, t->key) == 0) { 82 | return t; 83 | } 84 | } 85 | new_ = (Tree *)malloc(sizeof(Tree)); 86 | if (new_ == NULL) { 87 | printf("Ran out of space\n"); 88 | exit(1); 89 | } 90 | if (t == NULL) { 91 | new_->left = new_->right = NULL; 92 | } else if (compare(i, t->key) < 0) { 93 | new_->left = t->left; 94 | new_->right = t; 95 | t->left = NULL; 96 | t->size = 1 + node_size(t->right); 97 | } else { 98 | new_->right = t->right; 99 | new_->left = t; 100 | t->right = NULL; 101 | t->size = 1 + node_size(t->left); 102 | } 103 | new_->key = i; 104 | new_->size = 1 + node_size(new_->left) + node_size(new_->right); 105 | return new_; 106 | } 107 | 108 | Tree *delete_(T i, Tree *t) { 109 | 110 | Tree *x; 111 | T tsize; 112 | 113 | if (t == NULL) 114 | return NULL; 115 | tsize = t->size; 116 | t = splay(i, t); 117 | if (compare(i, t->key) == 0) { 118 | if (t->left == NULL) { 119 | x = t->right; 120 | } else { 121 | x = splay(i, t->left); 122 | x->right = t->right; 123 | } 124 | free(t); 125 | if (x != NULL) { 126 | x->size = tsize - 1; 127 | } 128 | return x; 129 | } else { 130 | return t; 131 | } 132 | } 133 | 134 | Tree *find_rank(T r, Tree *t) { 135 | 136 | T lsize; 137 | if ((r < 0) || (r >= node_size(t))) 138 | return NULL; 139 | for (;;) { 140 | lsize = node_size(t->left); 141 | if (r < lsize) { 142 | t = t->left; 143 | } else if (r > lsize) { 144 | r = r - lsize - 1; 145 | t = t->right; 146 | } else { 147 | return t; 148 | } 149 | } 150 | } 151 | void freetree(Tree *t) { 152 | if (t == NULL) 153 | return; 154 | freetree(t->right); 155 | freetree(t->left); 156 | free(t); 157 | } 158 | void printtree(Tree *t, int d) { 159 | 160 | int i; 161 | if (t == NULL) 162 | return; 163 | printtree(t->right, d + 1); 164 | for (i = 0; i < d; i++) 165 | 
printf(" "); 166 | printf("%d(%d)\n", t->key, t->size); 167 | printtree(t->left, d + 1); 168 | } 169 | -------------------------------------------------------------------------------- /parda/splay.h: -------------------------------------------------------------------------------- 1 | #ifndef _splay_h 2 | #define _splay_h 3 | #include 4 | #include 5 | 6 | typedef struct tree_node Tree; 7 | typedef int T; 8 | struct tree_node { 9 | Tree *left, *right; 10 | T key; 11 | T size; 12 | }; 13 | 14 | #define compare(i, j) ((i) - (j)) 15 | 16 | #define node_size(x) (((x) == NULL) ? 0 : ((x)->size)) 17 | 18 | Tree *splay(T i, Tree *t); 19 | Tree *insert(T i, Tree *t); 20 | Tree *delete_(T i, Tree *t); 21 | Tree *find_rank(T r, Tree *t); 22 | void printtree(Tree *t, int d); 23 | void freetree(Tree *t); 24 | #endif 25 | -------------------------------------------------------------------------------- /sass-split/.gitignore: -------------------------------------------------------------------------------- 1 | process_sass_dir 2 | process_sass_dir.o 3 | -------------------------------------------------------------------------------- /sass-split/Makefile: -------------------------------------------------------------------------------- 1 | CXX=g++ 2 | CXXFLAGS=-std=c++11 -Wall -Wextra -O2 3 | LDFLAGS= 4 | OBJ_FILES=process_sass_dir.o 5 | TARGET=process_sass_dir 6 | 7 | all: $(TARGET) 8 | 9 | $(TARGET): $(OBJ_FILES) 10 | $(CXX) $(LDFLAGS) -o $@ $^ 11 | 12 | %.o: %.cpp 13 | $(CXX) $(CXXFLAGS) -c -o $@ $< 14 | 15 | clean: 16 | rm -f $(OBJ_FILES) $(TARGET) 17 | 18 | .PHONY: all clean 19 | -------------------------------------------------------------------------------- /sass-split/sass-split.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SEARCH_ROOT="../apps/OursTracesCollection" 4 | 5 | CMD="./process_sass_dir --dir" 6 | 7 | find "$SEARCH_ROOT" -type d -name "sass_traces" | while read dir; do 8 | echo "Processing directory: 
// One slot of the instruction fetch buffer.
//
// A default-constructed slot is invalid (m_valid == false) and has both
// latency and initial_interval set to unsigned(-1).  A slot built from
// (pc, wid, kid, uid) is valid, keeps latency at unsigned(-1) and starts with
// initial_interval == 0.
struct inst_fetch_buffer_entry {
  unsigned pc = 0;
  unsigned wid = 0;
  unsigned kid = 0;
  unsigned uid = 0;
  bool m_valid = false;
  unsigned latency = static_cast<unsigned>(-1);
  unsigned initial_interval = static_cast<unsigned>(-1);
  unsigned initial_interval_dec_counter = 0;

  // Empty, invalid slot: every field keeps its default above.
  inst_fetch_buffer_entry() {}

  // Valid slot for the given instruction identifiers.
  inst_fetch_buffer_entry(unsigned _pc, unsigned _wid, unsigned _kid,
                          unsigned _uid)
      : pc(_pc), wid(_wid), kid(_kid), uid(_uid), m_valid(true),
        initial_interval(0) {}

  // inst_fetch_buffer_entry(inst_fetch_buffer_entry&& other) noexcept
  //   : pc(other.pc), wid(other.wid), kid(other.kid), uid(other.uid),
  //     m_valid(other.m_valid), latency(other.latency),
  //     initial_interval(other.initial_interval),
  //     initial_interval_dec_counter(other.initial_interval_dec_counter) {
  //   other.m_valid = false;
  // }

  void set_latency(unsigned _latency) { latency = _latency; }

  // Sets the interval and reloads the down-counter with the same value.
  void set_initial_interval(unsigned _initial_interval) {
    initial_interval_dec_counter = _initial_interval;
    initial_interval = _initial_interval;
  }
};
curr_instn_id_per_warp_entry &lhs, 58 | const curr_instn_id_per_warp_entry &rhs); 59 | 60 | #endif -------------------------------------------------------------------------------- /trace-driven/hw-stt.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ConvolutedDog/HyFiSS/e1447e8826c35a23169a63b6fc748a6fb6d5da9c/trace-driven/hw-stt.cc -------------------------------------------------------------------------------- /trace-driven/hw-stt.h: -------------------------------------------------------------------------------- 1 | 2 | #include "inst-stt.h" 3 | 4 | #ifndef HW_STT_H 5 | #define HW_STT_H 6 | 7 | #endif -------------------------------------------------------------------------------- /trace-driven/inst-stt.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "../ISA-Def/accelwattch_component_mapping.h" 12 | #include "../ISA-Def/ampere_opcode.h" 13 | #include "../ISA-Def/kepler_opcode.h" 14 | #include "../ISA-Def/pascal_opcode.h" 15 | #include "../ISA-Def/trace_opcode.h" 16 | #include "../ISA-Def/turing_opcode.h" 17 | #include "../ISA-Def/volta_opcode.h" 18 | #include "inst-stt.h" 19 | 20 | inst_stt::inst_stt() { 21 | 22 | fetch_stage = false; 23 | 24 | wr_bk_stage = false; 25 | 26 | warp_exit_stage = false; 27 | } 28 | -------------------------------------------------------------------------------- /trace-driven/inst-stt.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #ifndef INST_STT_H 11 | #define INST_STT_H 12 | 13 | typedef bool inst_stage_t; 14 | 15 | class inst_stt { 16 | public: 17 | inst_stt(); 18 | 19 | private: 20 | inst_stage_t fetch_stage; 21 | 22 | inst_stage_t 
wr_bk_stage; 23 | inst_stage_t warp_exit_stage; 24 | }; 25 | 26 | class mem_stat_t { 27 | public: 28 | mem_stat_t(); 29 | }; 30 | 31 | struct SM_computation_instance {}; 32 | 33 | #endif -------------------------------------------------------------------------------- /trace-driven/kernel-info.cc: -------------------------------------------------------------------------------- 1 | #include "kernel-info.h" 2 | 3 | kernel_info_t::kernel_info_t(dim3 gridDim, dim3 blockDim) { 4 | 5 | m_grid_dim = gridDim; 6 | 7 | m_block_dim = blockDim; 8 | 9 | m_uid = kernel_info_m_next_uid++; 10 | } 11 | 12 | trace_kernel_info_t::trace_kernel_info_t(dim3 gridDim, dim3 blockDim, 13 | trace_parser *parser, 14 | 15 | kernel_trace_t *kernel_trace_info) 16 | : kernel_info_t(gridDim, blockDim) { 17 | m_parser = parser; 18 | m_kernel_trace_info = kernel_trace_info; 19 | 20 | if (kernel_trace_info->binary_verion == AMPERE_RTX_BINART_VERSION || 21 | kernel_trace_info->binary_verion == AMPERE_A100_BINART_VERSION) 22 | OpcodeMap = &Ampere_OpcodeMap; 23 | else if (kernel_trace_info->binary_verion == VOLTA_BINART_VERSION) 24 | OpcodeMap = &Volta_OpcodeMap; 25 | else if (kernel_trace_info->binary_verion == PASCAL_TITANX_BINART_VERSION || 26 | kernel_trace_info->binary_verion == PASCAL_P100_BINART_VERSION) 27 | OpcodeMap = &Pascal_OpcodeMap; 28 | else if (kernel_trace_info->binary_verion == KEPLER_BINART_VERSION) 29 | OpcodeMap = &Kepler_OpcodeMap; 30 | else if (kernel_trace_info->binary_verion == TURING_BINART_VERSION) 31 | OpcodeMap = &Turing_OpcodeMap; 32 | else { 33 | printf("unsupported binary version: %d\n", 34 | kernel_trace_info->binary_verion); 35 | fflush(stdout); 36 | exit(0); 37 | } 38 | } 39 | 40 | std::vector & 41 | trace_kernel_info_t::get_one_kernel_one_threadblock_traces(unsigned kernel_id, 42 | unsigned block_id) { 43 | return m_parser->get_one_kernel_one_threadblcok_mem_instns(kernel_id, 44 | block_id); 45 | } 46 | 47 | std::vector *> 48 | 
trace_kernel_info_t::get_next_threadblock_traces( 49 | std::string kernel_name, unsigned kernel_id, 50 | unsigned num_warps_per_thread_block) { 51 | return m_parser->get_next_threadblock_traces( 52 | m_kernel_trace_info->trace_verion, m_kernel_trace_info->enable_lineinfo, 53 | m_kernel_trace_info->ifs, kernel_name, kernel_id, 54 | num_warps_per_thread_block); 55 | } -------------------------------------------------------------------------------- /trace-driven/kernel-info.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "../common/vector_types.h" 6 | #include "../trace-parser/trace-parser.h" 7 | #include "mem-access.h" 8 | 9 | #ifndef KERNEL_INFO_H 10 | #define KERNEL_INFO_H 11 | 12 | class kernel_info_t { 13 | public: 14 | kernel_info_t(dim3 gridDim, dim3 blockDim); 15 | ~kernel_info_t(){}; 16 | 17 | size_t num_blocks() const { 18 | return m_grid_dim.x * m_grid_dim.y * m_grid_dim.z; 19 | } 20 | 21 | size_t threads_per_cta() const { 22 | return m_block_dim.x * m_block_dim.y * m_block_dim.z; 23 | } 24 | 25 | dim3 get_grid_dim() const { return m_grid_dim; } 26 | 27 | dim3 get_cta_dim() const { return m_block_dim; } 28 | 29 | unsigned get_uid() const { return m_uid; } 30 | 31 | unsigned m_uid; 32 | 33 | dim3 m_grid_dim; 34 | dim3 m_block_dim; 35 | }; 36 | 37 | class trace_kernel_info_t : public kernel_info_t { 38 | public: 39 | trace_kernel_info_t(dim3 gridDim, dim3 blockDim, trace_parser *parser, 40 | 41 | kernel_trace_t *kernel_trace_info); 42 | ~trace_kernel_info_t() { delete m_kernel_trace_info; }; 43 | std::vector *> 44 | get_next_threadblock_traces(std::string kernel_name, unsigned kernel_id, 45 | unsigned num_warps_per_thread_block); 46 | std::vector & 47 | get_one_kernel_one_threadblock_traces(unsigned kernel_id, unsigned block_id); 48 | 49 | unsigned long get_cuda_stream_id() { 50 | return m_kernel_trace_info->cuda_stream_id; 51 | } 52 | 53 | kernel_trace_t 
*get_trace_info() { return m_kernel_trace_info; } 54 | 55 | private: 56 | const std::unordered_map *OpcodeMap; 57 | trace_parser *m_parser; 58 | kernel_trace_t *m_kernel_trace_info; 59 | }; 60 | 61 | #endif -------------------------------------------------------------------------------- /trace-driven/kernel-trace.cc: -------------------------------------------------------------------------------- 1 | #include "kernel-trace.h" 2 | 3 | kernel_trace_t::kernel_trace_t() { 4 | kernel_name = "Empty"; 5 | shmem_base_addr = 0; 6 | local_base_addr = 0; 7 | binary_verion = 0; 8 | trace_verion = 0; 9 | } -------------------------------------------------------------------------------- /trace-driven/kernel-trace.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "../common/common_def.h" 5 | 6 | #ifndef KERNEL_TRACE_H 7 | #define KERNEL_TRACE_H 8 | 9 | struct kernel_trace_t { 10 | kernel_trace_t(); 11 | 12 | std::string kernel_name; 13 | unsigned kernel_id; 14 | unsigned grid_dim_x; 15 | unsigned grid_dim_y; 16 | unsigned grid_dim_z; 17 | unsigned tb_dim_x; 18 | unsigned tb_dim_y; 19 | unsigned tb_dim_z; 20 | unsigned shmem; 21 | unsigned nregs; 22 | unsigned long cuda_stream_id; 23 | unsigned binary_verion; 24 | unsigned enable_lineinfo; 25 | unsigned trace_verion; 26 | std::string nvbit_verion; 27 | unsigned long long shmem_base_addr; 28 | unsigned long long local_base_addr; 29 | 30 | #ifdef ENABLE_SAMPLING_POINT 31 | unsigned sampling_point; 32 | #endif 33 | 34 | std::ifstream *ifs; 35 | }; 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /trace-driven/mem-access.cc: -------------------------------------------------------------------------------- 1 | #include "mem-access.h" 2 | 3 | mem_access_t::mem_access_t(mem_access_type type, new_addr_type address, 4 | unsigned size, bool wr) { 5 | 6 | m_type = type; 7 | 8 | m_addr = address; 9 | 10 | 
m_req_size = size; 11 | 12 | m_write = wr; 13 | } 14 | 15 | mem_access_t::mem_access_t(mem_access_type type, new_addr_type address, 16 | unsigned size, bool wr, 17 | const active_mask_t &active_mask, 18 | const mem_access_byte_mask_t &byte_mask, 19 | const mem_access_sector_mask_t §or_mask) 20 | : m_warp_mask(active_mask), m_byte_mask(byte_mask), 21 | m_sector_mask(sector_mask) { 22 | m_type = type; 23 | m_addr = address; 24 | m_req_size = size; 25 | m_write = wr; 26 | } 27 | 28 | void mem_access_t::print(FILE *fp) const { 29 | fprintf(fp, "addr=0x%llx, %s, size=%u, ", m_addr, m_write ? "store" : "load ", 30 | m_req_size); 31 | switch (m_type) { 32 | case GLOBAL_ACC_R: 33 | fprintf(fp, "GLOBAL_R"); 34 | break; 35 | case LOCAL_ACC_R: 36 | fprintf(fp, "LOCAL_R "); 37 | break; 38 | case CONST_ACC_R: 39 | fprintf(fp, "CONST "); 40 | break; 41 | case TEXTURE_ACC_R: 42 | fprintf(fp, "TEXTURE "); 43 | break; 44 | case GLOBAL_ACC_W: 45 | fprintf(fp, "GLOBAL_W"); 46 | break; 47 | case LOCAL_ACC_W: 48 | fprintf(fp, "LOCAL_W "); 49 | break; 50 | case L2_WRBK_ACC: 51 | fprintf(fp, "L2_WRBK "); 52 | break; 53 | case INST_ACC_R: 54 | fprintf(fp, "INST "); 55 | break; 56 | case L1_WRBK_ACC: 57 | fprintf(fp, "L1_WRBK "); 58 | break; 59 | default: 60 | fprintf(fp, "unknown "); 61 | break; 62 | } 63 | } -------------------------------------------------------------------------------- /trace-driven/mem-access.h: -------------------------------------------------------------------------------- 1 | #include "../common/common_def.h" 2 | 3 | #ifndef MEM_ACCESS_H 4 | #define MEM_ACCESS_H 5 | 6 | class mem_access_t { 7 | public: 8 | mem_access_t() {} 9 | 10 | mem_access_t(mem_access_type type, new_addr_type address, unsigned size, 11 | bool wr); 12 | 13 | mem_access_t(mem_access_type type, new_addr_type address, unsigned size, 14 | bool wr, const active_mask_t &active_mask, 15 | const mem_access_byte_mask_t &byte_mask, 16 | const mem_access_sector_mask_t §or_mask); 17 | 18 | new_addr_type 
get_addr() const { return m_addr; } 19 | 20 | void set_addr(new_addr_type addr) { m_addr = addr; } 21 | 22 | unsigned get_size() const { return m_req_size; } 23 | 24 | const active_mask_t &get_warp_mask() const { return m_warp_mask; } 25 | 26 | bool is_write() const { return m_write; } 27 | 28 | enum mem_access_type get_type() const { return m_type; } 29 | 30 | mem_access_byte_mask_t get_byte_mask() const { return m_byte_mask; } 31 | 32 | mem_access_sector_mask_t get_sector_mask() const { return m_sector_mask; } 33 | 34 | void print(FILE *fp) const; 35 | 36 | private: 37 | unsigned m_uid; 38 | 39 | new_addr_type m_addr; 40 | 41 | bool m_write; 42 | 43 | unsigned m_req_size; 44 | 45 | mem_access_type m_type; 46 | 47 | active_mask_t m_warp_mask; 48 | 49 | mem_access_byte_mask_t m_byte_mask; 50 | 51 | mem_access_sector_mask_t m_sector_mask; 52 | }; 53 | 54 | #endif -------------------------------------------------------------------------------- /trace-driven/trace-warp-inst.cc: -------------------------------------------------------------------------------- 1 | #include "trace-warp-inst.h" 2 | 3 | inline types_of_operands get_oprnd_type(op_type op, special_ops sp_op) { 4 | switch (op) { 5 | case SP_OP: 6 | case SFU_OP: 7 | case SPECIALIZED_UNIT_2_OP: 8 | case SPECIALIZED_UNIT_3_OP: 9 | case DP_OP: 10 | case LOAD_OP: 11 | case STORE_OP: 12 | return FP_OP; 13 | case INTP_OP: 14 | case SPECIALIZED_UNIT_4_OP: 15 | return INT_OP; 16 | case ALU_OP: 17 | if ((sp_op == FP__OP) || (sp_op == TEX__OP) || (sp_op == OTHER_OP)) 18 | return FP_OP; 19 | else if (sp_op == INT__OP) 20 | return INT_OP; 21 | default: 22 | return UN_OP; 23 | } 24 | } 25 | 26 | bool trace_warp_inst_t::parse_from_trace_struct( 27 | const _inst_trace_t *trace, 28 | const std::unordered_map *OpcodeMap, 29 | unsigned gwarp_id) { 30 | 31 | active_mask_t active_mask = trace->mask; 32 | set_active(active_mask); 33 | 34 | m_decoded = true; 35 | pc = (address_type)trace->m_pc; 36 | m_gwarp_id = gwarp_id; 37 | 38 | 
isize = 16; 39 | for (unsigned i = 0; i < MAX_OUTPUT_VALUES; i++) { 40 | out[i] = 0; 41 | } 42 | for (unsigned i = 0; i < MAX_INPUT_VALUES; i++) { 43 | in[i] = 0; 44 | } 45 | 46 | is_vectorin = false; 47 | is_vectorout = false; 48 | ar1 = -1; 49 | ar2 = -1; 50 | memory_op = no_memory_op; 51 | data_size = 0; 52 | op = ALU_OP; 53 | sp_op = OTHER_OP; 54 | mem_op = NOT_TEX; 55 | const_cache_operand = 0; 56 | oprnd_type = UN_OP; 57 | 58 | const std::vector &opcode_tokens = 59 | trace->get_opcode_tokens_directly(); 60 | std::string opcode1 = opcode_tokens[0]; 61 | 62 | std::unordered_map::const_iterator it = 63 | OpcodeMap->find(opcode1); 64 | 65 | if (it != OpcodeMap->end()) { 66 | 67 | m_opcode = it->second.opcode; 68 | op = (op_type)(it->second.opcode_category); 69 | const std::unordered_map *OpcPowerMap = &OpcodePowerMap; 70 | 71 | std::unordered_map::const_iterator it2 = 72 | OpcPowerMap->find(m_opcode); 73 | if (it2 != OpcPowerMap->end()) 74 | sp_op = (special_ops)(it2->second); 75 | oprnd_type = get_oprnd_type(op, sp_op); 76 | } else { 77 | std::cout << "ERROR: undefined instruction : " << trace->opcode 78 | << " Opcode: " << opcode1 << std::endl; 79 | assert(0 && "undefined instruction"); 80 | } 81 | 82 | std::string opcode = trace->opcode; 83 | if (opcode1 == "MUFU") { 84 | 85 | if ((opcode.find("MUFU.SIN") != std::string::npos) || 86 | (opcode.find("MUFU.COS") != std::string::npos)) 87 | sp_op = FP_SIN_OP; 88 | if ((opcode.find("MUFU.EX2") != std::string::npos) || 89 | (opcode.find("MUFU.RCP") != std::string::npos)) 90 | sp_op = FP_EXP_OP; 91 | if (opcode.find("MUFU.RSQ") != std::string::npos) 92 | sp_op = FP_SQRT_OP; 93 | if (opcode.find("MUFU.LG2") != std::string::npos) 94 | sp_op = FP_LG_OP; 95 | } 96 | 97 | if (opcode1 == "IMAD") { 98 | 99 | if ((opcode.find("IMAD.MOV") != std::string::npos) || 100 | (opcode.find("IMAD.IADD") != std::string::npos)) 101 | sp_op = INT__OP; 102 | } 103 | 104 | num_regs = trace->reg_srcs_num + trace->reg_dsts_num; 105 | 
num_operands = num_regs; 106 | outcount = trace->reg_dsts_num; 107 | for (unsigned m = 0; m < trace->reg_dsts_num; ++m) { 108 | out[m] = trace->reg_dest[m]; 109 | arch_reg.dst[m] = trace->reg_dest[m]; 110 | } 111 | 112 | incount = trace->reg_srcs_num; 113 | for (unsigned m = 0; m < trace->reg_srcs_num; ++m) { 114 | in[m] = trace->reg_src[m]; 115 | arch_reg.src[m] = trace->reg_src[m]; 116 | } 117 | 118 | if (trace->memadd_info != NULL) { 119 | data_size = trace->memadd_info->width; 120 | } 121 | 122 | switch (m_opcode) { 123 | case OP_LDC: 124 | data_size = 4; 125 | memory_op = memory_load; 126 | const_cache_operand = 1; 127 | 128 | break; 129 | case OP_LDG: 130 | case OP_LDL: 131 | assert(data_size > 0); 132 | memory_op = memory_load; 133 | 134 | break; 135 | case OP_STG: 136 | case OP_STL: 137 | assert(data_size > 0); 138 | memory_op = memory_store; 139 | 140 | break; 141 | case OP_ATOMG: 142 | case OP_RED: 143 | case OP_ATOM: 144 | assert(data_size > 0); 145 | memory_op = memory_load; 146 | op = LOAD_OP; 147 | 148 | m_isatomic = true; 149 | should_do_atomic = true; 150 | 151 | break; 152 | case OP_LDS: 153 | assert(data_size > 0); 154 | memory_op = memory_load; 155 | 156 | break; 157 | case OP_STS: 158 | assert(data_size > 0); 159 | memory_op = memory_store; 160 | 161 | break; 162 | case OP_ATOMS: 163 | assert(data_size > 0); 164 | m_isatomic = true; 165 | memory_op = memory_load; 166 | 167 | should_do_atomic = true; 168 | break; 169 | case OP_LDSM: 170 | assert(data_size > 0); 171 | 172 | break; 173 | case OP_ST: 174 | case OP_LD: 175 | assert(data_size > 0); 176 | if (m_opcode == OP_LD) 177 | memory_op = memory_load; 178 | else 179 | memory_op = memory_store; 180 | 181 | break; 182 | case OP_BAR: 183 | 184 | break; 185 | case OP_HADD2: 186 | case OP_HADD2_32I: 187 | case OP_HFMA2: 188 | case OP_HFMA2_32I: 189 | case OP_HMUL2_32I: 190 | case OP_HSET2: 191 | case OP_HSETP2:; 192 | break; 193 | default: 194 | break; 195 | } 196 | 197 | if 
(!trace->pred_str.empty()) { 198 | size_t pos_P = trace->pred_str.find('P'); 199 | if (pos_P != std::string::npos) { 200 | size_t pos_space = trace->pred_str.find(' ', pos_P); 201 | size_t count = (pos_space != std::string::npos) ? pos_space - pos_P - 1 202 | : std::string::npos; 203 | std::string num_str = trace->pred_str.substr(pos_P + 1, count); 204 | pred = std::stoul(num_str); 205 | } 206 | } 207 | 208 | m_empty = false; 209 | 210 | return true; 211 | } 212 | 213 | inline void trace_warp_inst_t::set_active(const active_mask_t &active) { 214 | m_warp_active_mask = active; 215 | } -------------------------------------------------------------------------------- /trace-driven/trace-warp-inst.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "../ISA-Def/accelwattch_component_mapping.h" 10 | #include "../ISA-Def/trace_opcode.h" 11 | #include "../common/common_def.h" 12 | #include "../common/vector_types.h" 13 | #include "../trace-parser/inst-trace.h" 14 | 15 | #include "kernel-trace.h" 16 | #include "mem-access.h" 17 | 18 | #ifndef TRACE_WARP_INST_H 19 | #define TRACE_WARP_INST_H 20 | 21 | types_of_operands get_oprnd_type(op_type op, special_ops sp_op); 22 | 23 | class trace_warp_inst_t { 24 | public: 25 | /* Builds an empty, not-yet-decoded instruction: counters zeroed, operand arrays cleared, pred/ar1/ar2 and every arch_reg slot set to -1. */ trace_warp_inst_t() { 26 | m_opcode = 0; 27 | m_uid = 0; 28 | m_empty = true; 29 | m_isatomic = false; 30 | 31 | m_decoded = false; 32 | pc = (address_type)-1; 33 | isize = 0; 34 | 35 | num_operands = 0; 36 | num_regs = 0; 37 | 38 | /* memset counts bytes: sizeof(unsigned) would clear only element 0, so use the full array size. */ memset(out, 0, sizeof(out)); 39 | outcount = 0; 40 | memset(in, 0, sizeof(in)); 41 | incount = 0; 42 | 43 | is_vectorin = false; 44 | is_vectorout = false; 45 | 46 | pred = -1; 47 | ar1 = -1; 48 | ar2 = -1; 49 | 50 | for (unsigned i = 0; i < MAX_REG_OPERANDS; i++) { 51 | arch_reg.src[i] = -1; 52 | arch_reg.dst[i] = -1; 53 | } 54 | 55 | memory_op = no_memory_op; 56 | data_size = 0; 57 | 
58 | op = NO_OP; 59 | sp_op = OTHER_OP; 60 | mem_op = NOT_TEX; 61 | 62 | const_cache_operand = 0; 63 | 64 | oprnd_type = UN_OP; 65 | 66 | m_is_printf = false; 67 | should_do_atomic = false; 68 | 69 | m_gwarp_id = 0; 70 | m_warp_id = 0; 71 | m_dynamic_warp_id = 0; 72 | 73 | space = memory_space_t(); 74 | cache_op = CACHE_UNDEFINED; 75 | } 76 | 77 | bool parse_from_trace_struct( 78 | const _inst_trace_t *trace, 79 | const std::unordered_map *OpcodeMap, 80 | unsigned gwarp_id); 81 | 82 | inline void set_active(const active_mask_t &active); 83 | 84 | unsigned get_opcode() const { return m_opcode; } 85 | unsigned get_uid() const { return m_uid; } 86 | bool isempty() const { return m_empty; } 87 | bool isatomic() const { return m_isatomic; } 88 | bool isdecoded() const { return m_decoded; } 89 | address_type get_pc() const { return pc; } 90 | unsigned get_isize() const { return isize; } 91 | unsigned get_outcount() const { return outcount; } 92 | unsigned get_incount() const { return incount; } 93 | unsigned get_in(unsigned i) const { 94 | assert(i < incount); 95 | return in[i]; 96 | } 97 | unsigned get_out(unsigned i) const { 98 | assert(i < outcount); 99 | return out[i]; 100 | } 101 | bool get_is_vectorin() const { return is_vectorin; } 102 | bool get_is_vectorout() const { return is_vectorout; } 103 | int get_pred() const { return pred; } 104 | int get_ar1() const { return ar1; } 105 | int get_ar2() const { return ar2; } 106 | int get_arch_reg_dst(unsigned i) const { 107 | assert(i < outcount); 108 | return arch_reg.dst[i]; 109 | } 110 | /// Determines whether all result registers are written back, and 111 | /// the value of the register is set to -1 after being written back. 
112 | const bool allArchRegDstWriteBack() const { 113 | // Another implementation logic: 114 | // bool all_write_back = true; 115 | // for (unsigned i = 0; i < outcount; ++i) { 116 | // if (trace_warp_inst.get_arch_reg_dst(i) != -1) { 117 | // all_write_back = false; 118 | // break; 119 | // } 120 | // } 121 | // return all_write_back; 122 | return std::all_of( 123 | std::begin(arch_reg.dst), std::end(arch_reg.dst), 124 | [&](int dstRegValue){ return dstRegValue == -1; }); 125 | } 126 | int get_arch_reg_src(unsigned i) const { 127 | assert(i < incount); 128 | return arch_reg.src[i]; 129 | } 130 | void set_arch_reg_dst(unsigned i, int reg) { 131 | assert(i < outcount); 132 | arch_reg.dst[i] = reg; 133 | } 134 | void set_arch_reg_src(unsigned i, int reg) { 135 | assert(i < incount); 136 | arch_reg.src[i] = reg; 137 | } 138 | _memory_op_t get_memory_op() const { return memory_op; } 139 | unsigned get_num_operands() const { return num_operands; } 140 | unsigned get_num_regs() const { return num_regs; } 141 | unsigned get_data_size() const { return data_size; } 142 | op_type get_op() const { return op; } 143 | special_ops get_sp_op() const { return sp_op; } 144 | mem_operation get_mem_op() const { return mem_op; } 145 | bool get_const_cache_operand() const { return const_cache_operand; } 146 | types_of_operands get_oprnd_type_() const { return oprnd_type; } 147 | bool get_should_do_atomic() const { return should_do_atomic; } 148 | bool get_is_printf() const { return m_is_printf; } 149 | unsigned get_gwarp_id() const { return m_gwarp_id; } 150 | unsigned get_warp_id() const { return m_warp_id; } 151 | unsigned get_dynamic_warp_id() const { return m_dynamic_warp_id; } 152 | active_mask_t get_active_mask() const { return m_warp_active_mask; } 153 | active_mask_t &get_active_mask_ref() { return m_warp_active_mask; } 154 | unsigned get_activate_count() const { return m_warp_active_mask.count(); } 155 | 156 | private: 157 | unsigned m_opcode; 158 | unsigned m_uid; 159 | bool 
m_empty; 160 | bool m_isatomic; 161 | 162 | bool m_decoded = false; 163 | address_type pc = (address_type)-1; 164 | unsigned isize; 165 | 166 | unsigned out[8]; 167 | 168 | unsigned outcount; 169 | 170 | unsigned in[24]; 171 | 172 | unsigned incount; 173 | 174 | bool is_vectorin; 175 | bool is_vectorout; 176 | 177 | int pred; 178 | int ar1, ar2; 179 | 180 | struct { 181 | int dst[MAX_REG_OPERANDS]; 182 | int src[MAX_REG_OPERANDS]; 183 | } arch_reg; 184 | 185 | _memory_op_t memory_op; 186 | 187 | unsigned num_operands; 188 | unsigned num_regs; 189 | 190 | unsigned data_size; 191 | 192 | op_type op; 193 | special_ops sp_op; 194 | mem_operation mem_op; 195 | 196 | bool const_cache_operand; 197 | 198 | types_of_operands oprnd_type; 199 | 200 | bool should_do_atomic; 201 | bool m_is_printf; 202 | 203 | unsigned m_gwarp_id; 204 | unsigned m_warp_id; 205 | 206 | unsigned m_dynamic_warp_id; 207 | 208 | active_mask_t m_warp_active_mask; 209 | 210 | memory_space_t space; 211 | cache_operator_type cache_op; 212 | }; 213 | 214 | #endif -------------------------------------------------------------------------------- /trace-parser/inst-memadd-info.cc: -------------------------------------------------------------------------------- 1 | #include "inst-memadd-info.h" 2 | 3 | /* Expands a base+stride record into addrs[]: the first set mask bit gets base_address, each following consecutive set bit gets the previous address plus stride, and every other lane is written as 0. Clears the empty flag. */ void inst_memadd_info_t::base_stride_decompress( 4 | unsigned long long base_address, int stride, 5 | const std::bitset &mask) { 6 | bool first_bit1_found = false; 7 | bool last_bit1_found = false; 8 | unsigned long long addra = base_address; 9 | for (int s = 0; s < WARP_SIZE; s++) { 10 | if (mask.test(s) && !first_bit1_found) { 11 | first_bit1_found = true; 12 | addrs[s] = base_address; 13 | } else if (first_bit1_found && !last_bit1_found) { 14 | if (mask.test(s)) { 15 | addra += stride; 16 | addrs[s] = addra; 17 | } else { 18 | last_bit1_found = true; /* first inactive lane after the run: zero it like the other inactive lanes instead of leaving its previous (possibly indeterminate) value */ addrs[s] = 0; } 19 | } else 20 | addrs[s] = 0; 21 | } 22 | empty = false; 23 | } 24 | 25 | void inst_memadd_info_t::base_delta_decompress( 26 | unsigned long long base_address, 
const std::vector &deltas, 27 | const std::bitset &mask) { 28 | bool first_bit1_found = false; 29 | long long last_address = 0; 30 | unsigned delta_index = 0; 31 | /* Walk every lane of addrs[WARP_SIZE]: the first set mask bit gets base_address, each later set bit gets the previous address plus the next delta, unset lanes get 0. The bound is WARP_SIZE (not a literal 32) so it always matches the array size. */ for (int s = 0; s < WARP_SIZE; s++) { 32 | if (mask.test(s) && !first_bit1_found) { 33 | addrs[s] = base_address; 34 | first_bit1_found = true; 35 | last_address = base_address; 36 | } else if (mask.test(s) && first_bit1_found) { 37 | assert(delta_index < deltas.size()); 38 | addrs[s] = last_address + deltas[delta_index++]; 39 | last_address = addrs[s]; 40 | } else 41 | addrs[s] = 0; 42 | } 43 | empty = false; 44 | } -------------------------------------------------------------------------------- /trace-parser/inst-memadd-info.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "../common/common_def.h" 4 | #include "../common/vector_types.h" 5 | 6 | #ifndef INST_MEMADD_INFO_H 7 | #define INST_MEMADD_INFO_H 8 | 9 | class inst_memadd_info_t { 10 | public: 11 | uint64_t addrs[WARP_SIZE]; 12 | int32_t width = 0; 13 | bool empty = true; 14 | 15 | void base_stride_decompress(unsigned long long base_address, int stride, 16 | const std::bitset &mask); 17 | void base_delta_decompress(unsigned long long base_address, 18 | const std::vector &deltas, 19 | const std::bitset &mask); 20 | }; 21 | 22 | #endif -------------------------------------------------------------------------------- /trace-parser/inst-trace.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "../ISA-Def/trace_opcode.h" 7 | #include "../ISA-Def/volta_opcode.h" 8 | #include "../common/common_def.h" 9 | #include "../hw-parser/hw-parser.h" 10 | #include "inst-memadd-info.h" 11 | #include "memory-space.h" 12 | #include "sass-inst.h" 13 | 14 | #ifndef INST_TRACE_H 15 | #define INST_TRACE_H 16 | 17 | enum FUNC_UNITS_NAME { 18 | 19 | NON_UNIT = 0, 20 | SP_UNIT, 21 | SFU_UNIT, 22 | INT_UNIT, 23 | 
DP_UNIT, 24 | TENSOR_CORE_UNIT, 25 | LDST_UNIT, 26 | SPEC_UNIT_1, 27 | SPEC_UNIT_2, 28 | SPEC_UNIT_3, 29 | NUM_FUNC_UNITS 30 | }; 31 | 32 | struct inst_trace_t { 33 | inst_trace_t(); 34 | inst_trace_t(const inst_trace_t &b); 35 | 36 | unsigned line_num; 37 | unsigned m_pc; 38 | unsigned mask; 39 | unsigned reg_dsts_num; 40 | unsigned reg_dest[MAX_DST]; 41 | std::string opcode; 42 | unsigned reg_srcs_num; 43 | unsigned reg_src[MAX_SRC]; 44 | inst_memadd_info_t *memadd_info; 45 | 46 | bool parse_from_string(std::string trace, unsigned tracer_version, 47 | unsigned enable_lineinfo, std::string kernel_name, 48 | unsigned kernel_id); 49 | 50 | bool check_opcode_contain(const std::vector &opcode, 51 | std::string param) const; 52 | 53 | unsigned 54 | get_datawidth_from_opcode(const std::vector &opcode) const; 55 | 56 | std::vector get_opcode_tokens() const; 57 | 58 | ~inst_trace_t(); 59 | }; 60 | 61 | struct _inst_trace_t { 62 | 63 | _inst_trace_t(unsigned _kernel_id, unsigned _pc, std::string _instn_str) { 64 | kernel_id = _kernel_id; 65 | m_pc = _pc; 66 | instn_str = _instn_str; 67 | 68 | for (unsigned it = 0; it < MAX_DST; it++) { 69 | reg_dest_is_pred[it] = false; 70 | } 71 | 72 | memadd_info = NULL; 73 | parse_from_string(_instn_str, _kernel_id); 74 | 75 | opcode_tokens = get_opcode_tokens(); 76 | memadd_info->width = get_datawidth_from_opcode(opcode_tokens); 77 | m_valid = true; 78 | mask = 0x0; 79 | }; 80 | 81 | _inst_trace_t(unsigned _kernel_id, unsigned _pc, std::string _instn_str, 82 | hw_config *hw_cfg) { 83 | kernel_id = _kernel_id; 84 | m_pc = _pc; 85 | instn_str = _instn_str; 86 | 87 | for (unsigned it = 0; it < MAX_DST; it++) { 88 | reg_dest_is_pred[it] = false; 89 | } 90 | 91 | memadd_info = NULL; 92 | parse_from_string(_instn_str, _kernel_id); 93 | 94 | opcode_tokens = get_opcode_tokens(); 95 | memadd_info->width = get_datawidth_from_opcode(opcode_tokens); 96 | this->hw_cfg = hw_cfg; 97 | 98 | parse_opcode_latency_info(); 99 | m_valid = true; 100 | mask 
= 0x0; 101 | }; 102 | 103 | bool m_valid = false; 104 | 105 | unsigned kernel_id; 106 | unsigned m_pc; 107 | unsigned mask = 0x0; 108 | unsigned reg_dsts_num; 109 | int reg_dest[MAX_DST]; 110 | bool reg_dest_is_pred[MAX_DST]; 111 | std::string opcode; 112 | 113 | unsigned reg_srcs_num; 114 | int reg_src[MAX_SRC]; 115 | inst_memadd_info_t *memadd_info; 116 | std::string instn_str; 117 | 118 | std::vector opcode_tokens; 119 | 120 | std::string pred_str = ""; 121 | 122 | unsigned initiation_interval; 123 | unsigned latency; 124 | enum FUNC_UNITS_NAME func_unit; 125 | hw_config *hw_cfg; 126 | 127 | bool parse_from_string(std::string trace, unsigned kernel_id); 128 | 129 | bool check_opcode_contain(const std::vector &opcode, 130 | std::string param) const; 131 | 132 | unsigned 133 | get_datawidth_from_opcode(const std::vector &opcode) const; 134 | 135 | std::vector get_opcode_tokens() const; 136 | 137 | inline std::vector get_opcode_tokens_directly() const { 138 | return opcode_tokens; 139 | } 140 | 141 | void parse_opcode_latency_info(); 142 | 143 | unsigned get_latency() const; 144 | unsigned get_initiation_interval() const; 145 | enum FUNC_UNITS_NAME get_func_unit() const; 146 | 147 | ~_inst_trace_t(); 148 | }; 149 | 150 | #endif -------------------------------------------------------------------------------- /trace-parser/memory-space.cc: -------------------------------------------------------------------------------- 1 | #include "memory-space.h" 2 | 3 | memory_space_t::memory_space_t() { 4 | m_type = undefined_space; 5 | m_bank = 0; 6 | } 7 | 8 | memory_space_t::memory_space_t(const enum _memory_space_t &from) { 9 | m_type = from; 10 | m_bank = 0; 11 | } 12 | 13 | bool memory_space_t::operator==(const memory_space_t &x) const { 14 | return (m_bank == x.m_bank) && (m_type == x.m_type); 15 | } 16 | 17 | bool memory_space_t::operator!=(const memory_space_t &x) const { 18 | return !(*this == x); 19 | } 20 | 21 | bool memory_space_t::operator<(const memory_space_t &x) 
const { 22 | if (m_type < x.m_type) 23 | return true; 24 | else if (m_type > x.m_type) 25 | return false; 26 | else if (m_bank < x.m_bank) 27 | return true; 28 | return false; 29 | } 30 | 31 | enum _memory_space_t memory_space_t::get_type() const { return m_type; } 32 | 33 | void memory_space_t::set_type(enum _memory_space_t t) { m_type = t; } 34 | 35 | unsigned memory_space_t::get_bank() const { return m_bank; } 36 | 37 | void memory_space_t::set_bank(unsigned b) { m_bank = b; } 38 | 39 | bool memory_space_t::is_const() const { 40 | return (m_type == const_space) || (m_type == param_space_kernel); 41 | } 42 | 43 | bool memory_space_t::is_local() const { 44 | return (m_type == local_space) || (m_type == param_space_local); 45 | } 46 | 47 | bool memory_space_t::is_global() const { return (m_type == global_space); } 48 | -------------------------------------------------------------------------------- /trace-parser/memory-space.h: -------------------------------------------------------------------------------- 1 | #include "../common/common_def.h" 2 | #include "../common/vector_types.h" 3 | 4 | #ifndef MEMORY_SPACE_H 5 | #define MEMORY_SPACE_H 6 | 7 | class memory_space_t { 8 | public: 9 | memory_space_t(); 10 | 11 | memory_space_t(const enum _memory_space_t &from); 12 | 13 | bool operator==(const memory_space_t &x) const; 14 | bool operator!=(const memory_space_t &x) const; 15 | bool operator<(const memory_space_t &x) const; 16 | enum _memory_space_t get_type() const; 17 | void set_type(enum _memory_space_t t); 18 | unsigned get_bank() const; 19 | void set_bank(unsigned b); 20 | bool is_const() const; 21 | bool is_local() const; 22 | bool is_global() const; 23 | 24 | private: 25 | enum _memory_space_t m_type; 26 | unsigned m_bank; 27 | }; 28 | 29 | #endif -------------------------------------------------------------------------------- /trace-parser/sass-inst.cc: -------------------------------------------------------------------------------- 1 | #include 
"sass-inst.h" 2 | 3 | std::map pc_to_sassStr; 4 | std::vector have_readed_insn_pcs; 5 | 6 | bool have_print_sass_during_this_execution = false; 7 | 8 | sass_inst_t find_sass_inst_by_pc(unsigned pc) { 9 | std::map::iterator iter; 10 | iter = pc_to_sassStr.find(pc); 11 | if (iter != pc_to_sassStr.end()) { 12 | return iter->second; 13 | } else { 14 | std::cout << "Can't find sass inst by pc: " << std::hex << pc << std::endl; 15 | sass_inst_t null_ = sass_inst_t(); 16 | return null_; 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /trace-parser/sass-inst.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "../ISA-Def/trace_opcode.h" 8 | #include "../common/common_def.h" 9 | #include "../common/vector_types.h" 10 | 11 | #ifndef SASS_INST_H 12 | #define SASS_INST_H 13 | 14 | struct sass_inst_t { 15 | std::string insnStr; 16 | std::string kernel_name; 17 | unsigned kernel_id; 18 | 19 | unsigned line_num; 20 | unsigned m_pc; 21 | unsigned mask; 22 | unsigned reg_dsts_num; 23 | unsigned reg_dest[MAX_DST]; 24 | std::string opcode; 25 | unsigned reg_srcs_num; 26 | unsigned reg_src[MAX_SRC]; 27 | 28 | std::string m_source_file; 29 | unsigned m_source_line; 30 | 31 | const char *source_file() const { return m_source_file.c_str(); } 32 | unsigned source_line() const { return m_source_line; } 33 | 34 | bool m_empty = true; 35 | }; 36 | 37 | extern std::map pc_to_sassStr; 38 | extern std::vector have_readed_insn_pcs; 39 | 40 | sass_inst_t find_sass_inst_by_pc(unsigned pc); 41 | 42 | #endif -------------------------------------------------------------------------------- /trace-parser/sass-split.py: -------------------------------------------------------------------------------- 1 | #!/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import argparse 6 | 7 | parser = 
argparse.ArgumentParser(description='Process sass dir.') 8 | 9 | parser.add_argument('--dir', type=str, required=True, 10 | help='The directory of sass files') 11 | 12 | args = parser.parse_args() 13 | 14 | sass_dir = args.dir 15 | sass_dir = os.path.abspath(sass_dir) 16 | 17 | files = os.listdir(sass_dir) 18 | sass_files = [os.path.join(sass_dir, file) for file in files if (file.endswith(".sass") and not file.endswith(".split.sass"))] 19 | 20 | f_open = {} 21 | warp_content = {} 22 | 23 | for sass_file in sass_files: 24 | print("Processing ", sass_file) 25 | content = open(sass_file, "r").read().split(" ") 26 | kernel_id = int(sass_file.split("/")[-1].split("_")[1].split(".sass")[0]) 27 | 28 | for i in range(int(len(content)/3)): 29 | gwarp_id = int(content[i*3 + 2], 16) 30 | entry = (kernel_id, gwarp_id) 31 | 32 | # Use dictionaries to accumulate content instead of writing files directly 33 | if entry not in warp_content: 34 | warp_content[entry] = [] 35 | warp_content[entry].append(content[i*3] + " " + content[i*3 + 1]) 36 | 37 | for (kernel_id, gwarp_id), lines in warp_content.items(): 38 | file_path = os.path.join(sass_dir, "kernel_" + str(kernel_id) + "_gwarp_id_" + str(gwarp_id) + ".split.sass") 39 | with open(file_path, "w") as file: 40 | file.write("\n".join(lines)) 41 | -------------------------------------------------------------------------------- /tracing-tool/.gitignore: -------------------------------------------------------------------------------- 1 | inject_funcs.o 2 | tracer.so 3 | tracer.o 4 | -------------------------------------------------------------------------------- /tracing-tool/Makefile: -------------------------------------------------------------------------------- 1 | NVCC=/usr/local/cuda/bin/nvcc -ccbin=$(CXX) -D_FORCE_INLINES --compiler-options "-pipe" 2 | ARCH=70 75 80 3 | 4 | NVCC_VER_REQ=11 5 | NVCC_VER=$(shell $(NVCC) --version | grep release | cut -f2 -d, | cut -f3 -d' ') 6 | NVCC_VER_CHECK=$(shell echo "${NVCC_VER} >= 
$(NVCC_VER_REQ)" | bc) 7 | 8 | ifeq ($(NVCC_VER_CHECK),0) 9 | $(error ERROR: nvcc version >= $(NVCC_VER_REQ) required to compile an nvbit tool! Instrumented applications can still use lower versions of nvcc.) 10 | endif 11 | 12 | NVBIT_PATH=nvbit 13 | INCLUDES=-I$(NVBIT_PATH) 14 | 15 | LIBS=-L$(NVBIT_PATH) -lnvbit 16 | NVCC_PATH=-L $(subst bin/nvcc,lib64,$(shell which nvcc | tr -s /)) 17 | 18 | SOURCES=$(wildcard *.cu) 19 | 20 | OBJECTS=$(SOURCES:.cu=.o) 21 | 22 | $(foreach sm,$(ARCH),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 23 | 24 | mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST))) 25 | current_dir := $(notdir $(patsubst %/,%,$(dir $(mkfile_path)))) 26 | 27 | NVBIT_TOOL=tracer.so 28 | 29 | all: $(NVBIT_TOOL) 30 | 31 | $(NVBIT_TOOL): $(OBJECTS) $(NVBIT_PATH)/libnvbit.a 32 | $(NVCC) $(GENCODE_FLAGS) -O3 $(OBJECTS) $(LIBS) $(NVCC_PATH) -lcuda -lcudart_static -shared -o $@ 33 | 34 | %.o: %.cu 35 | $(NVCC) -dc -c -std=c++11 $(INCLUDES) -Xptxas -cloning=no -Xcompiler -Wall $(GENCODE_FLAGS) -O3 -Xcompiler -fPIC $< -o $@ 36 | 37 | inject_funcs.o: inject_funcs.cu 38 | $(NVCC) $(INCLUDES) -maxrregcount=24 -Xptxas -astoolspatch --keep-device-functions $(GENCODE_FLAGS) -Xcompiler -Wall -Xcompiler -fPIC -c $< -o $@ 39 | 40 | clean: 41 | rm -f *.so *.o -------------------------------------------------------------------------------- /tracing-tool/README.md: -------------------------------------------------------------------------------- 1 | ## Tracing Tool 2 | 3 | The ***tracing-tool*** is used to extract the memory and compute traces. This tool uses and extends NVBit (NVidia Binary Instrumentation Tool) which is a research prototype of a dynamic binary instrumentation library for NVIDIA GPUs. 
The license and agreement for NVBit can be found in the original [NVBit repo](https://github.com/NVlabs/NVBit) ("This software contains source code provided by NVIDIA Corporation"). 4 | 5 | NVBit does not require application source code; any pre-compiled GPU application should work regardless of which compiler (or version) has been used (e.g. nvcc, pgicc, etc.). 6 | 7 | ## Usage 8 | 9 | * Set the `MAX_KERNELS` variable in `tracer.cu` to define the limit on the number of kernels you want to instrument in the application. The `MAX_KERNELS` value we used for collecting traces is 300. 10 | 11 | * For standalone building and running of the ***tracing-tool***, please see below: 12 | 13 | #### 1. Building the tool 14 | 15 | * Set the `ARCH` and `NVCC` variables in the Makefile. For the Volta architecture, you need to set: 16 | ```shell 17 | NVCC=/usr/local/cuda/bin/nvcc -ccbin=$(CXX) -D_FORCE_INLINES --compiler-options "-pipe" 18 | ARCH=70 19 | ``` 20 | It is important to note that this tool is not sensitive to CUDA versions, so your default version should be fine. 21 | * Compile the ***tracing-tool***: 22 | ``` 23 | make clean && make 24 | ``` 25 | 26 | #### 2. Extracting the traces 27 | 28 | ``` 29 | LD_PRELOAD=/path/to/tracing-tool/tracer.so /path/to/app [parameters of app] 30 | ``` 31 | 32 | The above command outputs two folders, ***memory_traces*** and ***sass_traces***, each of which contains the application's kernel traces. It also outputs a ***configs*** file, which has information about the kernels executed inside the application. 
33 | -------------------------------------------------------------------------------- /tracing-tool/common.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | typedef struct { 4 | int pred_inst; 5 | int pred_off_threads; 6 | int pred_num; 7 | int sm_id; 8 | int cta_id_x; 9 | int cta_id_y; 10 | int cta_id_z; 11 | int warp_id; 12 | int opcode_id; 13 | int pc; 14 | int is_mem_inst; 15 | int mref_id; 16 | uint64_t mem_addrs1[32]; 17 | uint64_t mem_addrs2[32]; 18 | int dst_oprnd; 19 | int dst_oprnd_type; 20 | int src_oprnds[5]; 21 | int src_oprnds_type[5]; 22 | uint64_t curr_clk; 23 | int gwarp_id; 24 | bool isPredNeg; 25 | bool isPredUniform; 26 | uint32_t active_mask; 27 | uint32_t predicate_mask; 28 | bool stride_or_delta; 29 | } inst_access_t; 30 | 31 | #define cta_addresses_size_width 10000 32 | #define cta_addresses_size_depth 10000 -------------------------------------------------------------------------------- /tracing-tool/inject_funcs.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "common.h" 5 | #include "utils/channel.hpp" 6 | #include "utils/utils.h" 7 | 8 | #include "nvbit_reg_rw.h" 9 | 10 | extern "C" __device__ __noinline__ void 11 | instrument_inst(int pred, int pc, int opcode_id, int is_mem_inst, 12 | uint64_t addr1, int mref_id, uint64_t addr2, int dst_oprnd, 13 | int dst_oprnd_type, int src_oprnd1, int src_oprnd1_type, 14 | int src_oprnd2, int src_oprnd2_type, int src_oprnd3, 15 | int src_oprnd3_type, int src_oprnd4, int src_oprnd4_type, 16 | int src_oprnd5, int src_oprnd5_type, int pred_num, 17 | int isPredNeg, int isPredUniform, uint64_t pchannel_dev) { 18 | 19 | inst_access_t ia; 20 | 21 | /* TODO: some instructions about using %clock64 */ 22 | uint64_t current_clk; 23 | asm("mov.u64 %0, %clock64;" : "=l"(current_clk)); 24 | ia.curr_clk = current_clk; 25 | 26 | if (!pred) { 27 | ia.pred_inst = 1; 28 | } else { 29 | 
ia.pred_inst = 0; 30 | } 31 | ia.pred_num = pred_num; 32 | 33 | ia.sm_id = get_smid(); 34 | int4 cta = get_ctaid(); 35 | ia.cta_id_x = cta.x; 36 | ia.cta_id_y = cta.y; 37 | ia.cta_id_z = cta.z; 38 | /* warp id within a thread block. */ 39 | ia.warp_id = get_warpid(); 40 | /* global warp id within all thread blocks of one kernel. */ 41 | ia.gwarp_id = get_global_warp_id(); 42 | ia.opcode_id = opcode_id; 43 | ia.pc = pc; 44 | ia.is_mem_inst = is_mem_inst; 45 | ia.mref_id = mref_id; 46 | ia.isPredNeg = isPredNeg; 47 | ia.isPredUniform = isPredUniform; 48 | 49 | // ia.pred_reg_value = nvbit_read_pred_reg(); 50 | 51 | const uint32_t active_mask = __ballot_sync(__activemask(), 1); 52 | const int laneid = get_laneid(); 53 | const int first_laneid = __ffs(active_mask) - 1; 54 | const uint32_t predicate_mask = __ballot_sync(__activemask(), pred); 55 | const int active_threads = __popc(active_mask); 56 | 57 | /* active threads that are not predicated off per instruction executed */ 58 | ia.pred_off_threads = active_threads - __popc(predicate_mask); 59 | 60 | if (is_mem_inst) { 61 | /* collect memory address information from other threads */ 62 | for (int i = 0; i < 32; i++) { 63 | ia.mem_addrs1[i] = __shfl_sync(active_mask, addr1, i); 64 | if (mref_id == 2) 65 | ia.mem_addrs2[i] = __shfl_sync(active_mask, addr2, i); 66 | } 67 | } 68 | 69 | /* Judge if the addr is strid-mode or delta-mode. START */ 70 | /* Judge if the addr is strid-mode or delta-mode. 
END */ 71 | 72 | ia.dst_oprnd = dst_oprnd; 73 | ia.dst_oprnd_type = dst_oprnd_type; 74 | 75 | ia.active_mask = active_mask; 76 | ia.predicate_mask = predicate_mask; 77 | 78 | ia.src_oprnds[0] = src_oprnd1; 79 | ia.src_oprnds_type[0] = src_oprnd1_type; 80 | ia.src_oprnds[1] = src_oprnd2; 81 | ia.src_oprnds_type[1] = src_oprnd2_type; 82 | ia.src_oprnds[2] = src_oprnd3; 83 | ia.src_oprnds_type[2] = src_oprnd3_type; 84 | ia.src_oprnds[3] = src_oprnd4; 85 | ia.src_oprnds_type[3] = src_oprnd4_type; 86 | ia.src_oprnds[4] = src_oprnd5; 87 | ia.src_oprnds_type[4] = src_oprnd5_type; 88 | 89 | /* first active lane pushes information on the channel */ 90 | if (first_laneid == laneid) { 91 | ChannelDev *channel_dev = (ChannelDev *)pchannel_dev; 92 | channel_dev->push(&ia, sizeof(inst_access_t)); 93 | } 94 | } -------------------------------------------------------------------------------- /tracing-tool/nvbit/instr_types.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 
14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | #pragma once 29 | 30 | #include 31 | #include 32 | 33 | namespace InstrType { 34 | 35 | /* all supported arch have at most 255 general purpose registers */ 36 | constexpr const int RZ = 255; 37 | /* the always true predicate is indicated as "7" on all the archs */ 38 | constexpr const int PT = 7; 39 | /* the entire predicate register is ecoded as "8" */ 40 | constexpr const int PR = 8; 41 | constexpr const int URZ = 63; 42 | constexpr const int UPT = 7; // uniform predicate true 43 | constexpr const int UPR = 8; // entire uniform predicate register 44 | constexpr const int MAX_CHARS = 256; 45 | 46 | // loads and stores have 1, LDGSTS has 2 47 | constexpr const int MAX_NUM_MREF_PER_INSTR = 2; 48 | 49 | enum class MemorySpace { 50 | NONE, 51 | LOCAL, // local memory operation 52 | GENERIC, // generic memory operation 53 | GLOBAL, // global memory operation 54 | SHARED, // shared memory operation 55 | CONSTANT, // constant memory operation 56 | GLOBAL_TO_SHARED, // read from global memory then write to shared memory 57 | }; 58 | constexpr const char* MemorySpaceStr[] = { 59 | "NONE", "LOCAL", "GENERIC", "GLOBAL", "SHARED", "CONSTANT", 60 | 
"GLOBAL_TO_SHARED", 61 | }; 62 | 63 | enum class OperandType { 64 | IMM_UINT64, 65 | IMM_DOUBLE, 66 | REG, 67 | PRED, 68 | UREG, 69 | UPRED, 70 | CBANK, 71 | MREF, 72 | GENERIC 73 | }; 74 | 75 | constexpr const char* OperandTypeStr[] = { 76 | "IMM_UINT64", "IMM_DOUBLE", "REG", "PRED", "UREG", 77 | "UPRED", "CBANK", "MREF", "GENERIC"}; 78 | 79 | enum class RegModifierType { 80 | /* stride modifiers */ 81 | X1, 82 | X4, 83 | X8, 84 | X16, 85 | /* size modifiers */ 86 | U32, 87 | U64, 88 | NO_MOD 89 | }; 90 | constexpr const char* RegModifierTypeStr[] = { 91 | "X1", "X4", "X8", "X16", "U32", /* no U */ "64", "NO_MOD"}; 92 | 93 | typedef struct { 94 | /* operand type */ 95 | OperandType type; 96 | /* operand string */ 97 | std::string str; 98 | /* is negative */ 99 | bool is_neg; 100 | /* is not */ 101 | bool is_not; 102 | /* is absolute */ 103 | bool is_abs; 104 | /* operand size in byte */ 105 | int nbytes; 106 | 107 | union { 108 | struct { 109 | uint64_t value; 110 | } imm_uint64; 111 | 112 | struct { 113 | double value; 114 | } imm_double; 115 | 116 | struct { 117 | int num; 118 | /* register properties .XXX */ 119 | char prop[MAX_CHARS]; 120 | } reg; 121 | 122 | struct { 123 | int num; 124 | } pred; 125 | 126 | struct { 127 | int id; 128 | bool has_imm_offset; 129 | int imm_offset; 130 | bool has_reg_offset; 131 | int reg_offset; 132 | } cbank; 133 | 134 | struct { 135 | bool has_ra; 136 | int ra_num; 137 | RegModifierType ra_mod; 138 | bool has_ur; 139 | int ur_num; 140 | bool has_imm; 141 | int imm; 142 | } mref; 143 | 144 | struct { 145 | char array[MAX_CHARS]; 146 | } generic; 147 | 148 | } u; 149 | } operand_t; 150 | }; 151 | -------------------------------------------------------------------------------- /tracing-tool/nvbit/libnvbit.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ConvolutedDog/HyFiSS/e1447e8826c35a23169a63b6fc748a6fb6d5da9c/tracing-tool/nvbit/libnvbit.a 
-------------------------------------------------------------------------------- /tracing-tool/nvbit/nvbit_reg_rw.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | /* This file needs to be include once in your nvbit tool, it provides hooks to 29 | * the nvbit core library to properly load this tool. 
* Do not modify!!! */
#pragma once
/* NOTE(review): the header names of the three #include directives below are
 * missing in this copy of the file -- restore them from upstream NVBit. */
#include
#include
#include

/* Scratch counter touched by every hook below.  Because it is `volatile`,
 * each read-modify-write in the loops has to be emitted by the compiler. */
__managed__ volatile int32_t __nvbit_var = 0;

/* parameters need to be used in the function to prevent compiler optimizing
 * them away. */

/* NOTE(review): none of the bodies below accesses a register itself; they only
 * update __nvbit_var and reference their parameters.  Presumably the NVBit
 * core substitutes the real implementation when the tool is loaded, and the
 * different trip counts (1024 / 512 / 32) only give each hook a distinct
 * shape -- confirm against the NVBit documentation before relying on this. */

/* Hook for reading a register identified by `reg_num`.
 * As written: adds 0..1023 to __nvbit_var, asserts the result equals
 * `reg_num`, and returns __nvbit_var. */
extern "C" __device__ __noinline__ int32_t nvbit_read_reg(uint64_t reg_num) {
#pragma unroll
    for (int i = 0; i < 1024; i++) __nvbit_var += i;
    assert(__nvbit_var == reg_num);
    return __nvbit_var;
}

/* Hook for writing `reg_val` to the register identified by `reg_num`.
 * As written: adds 0..1023 to __nvbit_var and asserts the result equals
 * `reg_num + reg_val`; nothing is returned. */
extern "C" __device__ __noinline__ void nvbit_write_reg(uint64_t reg_num,
                                                        int32_t reg_val) {
#pragma unroll
    for (int i = 0; i < 1024; i++) __nvbit_var += i;
    assert(__nvbit_var == reg_num + reg_val);
}

/* Hook for reading a "ureg" identified by `reg_num` (presumably a uniform
 * register -- confirm).  Same structure as nvbit_read_reg with 512
 * iterations. */
extern "C" __device__ __noinline__ int32_t nvbit_read_ureg(uint64_t reg_num) {
#pragma unroll
    for (int i = 0; i < 512; i++) __nvbit_var += i;
    assert(__nvbit_var == reg_num);
    return __nvbit_var;
}

/* Hook for writing `reg_val` to a "ureg" identified by `reg_num`.  Same
 * structure as nvbit_write_reg with 512 iterations. */
extern "C" __device__ __noinline__ void nvbit_write_ureg(uint64_t reg_num,
                                                         int32_t reg_val) {
#pragma unroll
    for (int i = 0; i < 512; i++) __nvbit_var += i;
    assert(__nvbit_var == reg_num + reg_val);
}

/* Hook for reading the predicate register.
 * As written: adds 0..31 to __nvbit_var and returns it. */
extern "C" __device__ __noinline__ int32_t nvbit_read_pred_reg() {
#pragma unroll
    for (int i = 0; i < 32; i++) __nvbit_var += i;
    return __nvbit_var;
}

/* Hook for writing `reg_val` to the predicate register.
 * As written: adds `reg_val` to __nvbit_var 32 times. */
extern "C" __device__ __noinline__ void nvbit_write_pred_reg(int32_t reg_val) {
#pragma unroll
    for (int i = 0; i < 32; i++) __nvbit_var += reg_val;
}

/* Hook for reading the "upred" register (presumably the uniform predicate
 * register -- confirm).  Same body as nvbit_read_pred_reg. */
extern "C" __device__ __noinline__ int32_t nvbit_read_upred_reg() {
#pragma unroll
    for (int i = 0; i < 32; i++) __nvbit_var += i;
    return __nvbit_var;
}

/* Hook for writing `reg_val` to the "upred" register.  Same body as
 * nvbit_write_pred_reg. */
extern "C" __device__ __noinline__ void nvbit_write_upred_reg(int32_t reg_val) {
#pragma unroll
    for (int i = 0; i < 32; i++) __nvbit_var += reg_val;
}
--------------------------------------------------------------------------------
/tracing-tool/nvbit/nvbit_tool.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | /* This file needs to be include once in your nvbit tool, it provides hooks to 29 | * the nvbit core library to properly load this tool. 30 | * Do not modify!!! 
*/

#pragma once
/* NOTE(review): the header names of the three #include directives below are
 * missing in this copy of the file -- restore them from upstream NVBit. */
#include
#include
#include

/* Sign-extends the low 32 bits of `x` to 64 bits: the value is converted to
 * int64_t, shifted left by 32 and then shifted right by 32 again. */
#define SIGN_EXTEND64(x) ((((int64_t)(x)) << 32) >> 32)

/* generic address generation code */
/* Builds a 64-bit address as the sum of three contributions:
 *   - the "ra" part: if is_ra64 is non-zero, (ra_high << 32) OR-ed with
 *     (ra_low * ra_stride); otherwise the 32-bit product ra_low * ra_stride,
 *     sign-extended to 64 bits;
 *   - the "ru" part: if is_ru64 is non-zero, (ru_high << 32) | ru_low;
 *     otherwise ru_low sign-extended to 64 bits;
 *   - the immediate `imm`.
 * `mref_idx` is not used.
 * NOTE(review): "ra"/"ru" presumably stand for the regular and uniform
 * address registers of a memory reference -- confirm against NVBit docs. */
extern "C" __device__ __noinline__ uint64_t
gen_mref_addr(uint32_t ra_high, int is_ra64, uint32_t ra_low, int ra_stride,
              uint32_t ru_high, int is_ru64, uint32_t ru_low, int32_t imm,
              uint32_t mref_idx /* unused */) {
    int64_t base_addr = 0;

    if (is_ra64) {
        base_addr +=
            (((uint64_t)ra_high) << 32) | ((uint64_t)ra_low * ra_stride);
    } else {
        /* the product is formed in 32 bits before being sign-extended */
        base_addr += SIGN_EXTEND64(ra_low * ra_stride);
    }

    if (is_ru64) {
        base_addr += (((uint64_t)ru_high) << 32) | ((uint64_t)ru_low);
    } else {
        base_addr += SIGN_EXTEND64(ru_low);
    }

    uint64_t addr = base_addr + imm;
/* debug trace of all inputs and the result; compiled out */
#if 0
    printf(
        "ra_high %d - is_ra64 %d - ra_low %d - ra_stride %d - ru_high %d - "
        "is_ru64 %d - ru_low %d - imm %d base_addr %lx addr %lx\n",
        ra_high, is_ra64, ra_low, ra_stride, ru_high, is_ru64, ru_low, imm,
        base_addr, addr);
#endif
    return addr;
}

/* Kernel launched once by nvbit_at_context_init_hook with var == 0, so the
 * branch below is not taken on that launch.
 * NOTE(review): presumably the printf and gen_mref_addr references exist only
 * so that both end up in the loaded module -- confirm. */
__global__ void load_module_nvbit_kernel(int var) {
    printf("");
    if (var) {
        /* the 64-bit result is narrowed to int here */
        int tmp = gen_mref_addr(var, var, var, var, var, var, var, var, var);
        printf("%d\n", tmp);
    }
}
/* Defined elsewhere (not in this file). */
extern "C" void __nvbit_start();

/* Calls __nvbit_start(), launches load_module_nvbit_kernel on one block of
 * one thread with argument 0, waits for the device, and asserts that no CUDA
 * error is pending.
 * NOTE(review): the name suggests the NVBit core calls this when a CUDA
 * context is initialised -- confirm.  When NDEBUG is defined the assert, and
 * with it the cudaGetLastError() call, is compiled out. */
extern "C" void nvbit_at_context_init_hook() {
    __nvbit_start();
    load_module_nvbit_kernel<<<1, 1>>>(0);
    cudaDeviceSynchronize();
    assert(cudaGetLastError() == cudaSuccess);
}
--------------------------------------------------------------------------------
/tracing-tool/nvbit/utils/utils.h:
--------------------------------------------------------------------------------
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#pragma once
/* NOTE(review): the header name of the original #include was missing in this
 * copy of the file; <stdint.h> is restored because csleep() uses uint64_t --
 * confirm against upstream.  <new> is needed for std::bad_alloc below. */
#include <stdint.h>
#include <new>

#undef CEILING
/* Integer division of x by y, rounded up (for non-negative x and positive y). */
#define CEILING(x, y) (((x) + (y)-1) / (y))

/* Evaluates `call`, then queries cudaGetLastError(); on any error prints the
 * call text, file, line and CUDA error string to stderr and terminates the
 * process with _exit(EXIT_FAILURE).
 * The expansion is a plain brace block, so `CUDA_SAFECALL(x);` followed by an
 * `else` does not parse -- wrap such uses in braces.
 * The includer must provide fprintf/fflush, _exit and EXIT_FAILURE. */
#define CUDA_SAFECALL(call)                                                 \
    {                                                                       \
        call;                                                               \
        cudaError err = cudaGetLastError();                                 \
        if (cudaSuccess != err) {                                           \
            fprintf(                                                        \
                stderr,                                                     \
                "Cuda error in function '%s' file '%s' in line %i : %s.\n", \
                #call, __FILE__, __LINE__, cudaGetErrorString(err));        \
            fflush(stderr);                                                 \
            _exit(EXIT_FAILURE);                                            \
        }                                                                   \
    }

/*********************************************************************
 *
 * Device level utility functions
 *
 **********************************************************************/

// Get the SM id (PTX special register %smid).
// Read with `asm volatile` so the compiler does not cache or merge reads; the
// PTX ISA documents this register as volatile.
__device__ __forceinline__ unsigned int get_smid(void) {
    unsigned int ret;
    asm volatile("mov.u32 %0, %smid;" : "=r"(ret));
    return ret;
}

// Get the warp id (PTX special register %warpid).
// Per the PTX ISA this number is unique within a CTA but not across CTAs, and
// it is volatile; use get_global_warp_id() for a grid-wide identifier.
__device__ __forceinline__ unsigned int get_warpid(void) {
    unsigned int ret;
    asm volatile("mov.u32 %0, %warpid;" : "=r"(ret));
    return ret;
}

// Get the lane id within the warp (PTX special register %laneid)
__device__ __forceinline__ unsigned int get_laneid(void) {
    unsigned int laneid;
    asm volatile("mov.u32 %0, %laneid;" : "=r"(laneid));
    return laneid;
}

// Get a global warp id: linearised block index times the number of warps per
// block, plus the index of this thread's warp inside its block.
__device__ __forceinline__ int get_global_warp_id() {
    // x-major linear index of this block within the grid
    int block_id = blockIdx.x + blockIdx.y * gridDim.x +
                   gridDim.x * gridDim.y * blockIdx.z;

    // x-major linear index of this thread within its block
    int l_thread_id = (threadIdx.z * (blockDim.x * blockDim.y)) +
                      (threadIdx.y * blockDim.x) + threadIdx.x;

    int l_warp_id = l_thread_id / 32;

    // warps per block, counting a trailing partial warp
    int n_warps = CEILING(blockDim.x * blockDim.y * blockDim.z, 32);

    int g_warp_id = block_id * n_warps + l_warp_id;

    return g_warp_id;
}

// Get a thread's CTA ID: x/y/z come from %ctaid; w is always 0.
__device__ __forceinline__ int4 get_ctaid(void) {
    int4 ret;
    asm("mov.u32 %0, %ctaid.x;" : "=r"(ret.x));
    asm("mov.u32 %0, %ctaid.y;" : "=r"(ret.y));
    asm("mov.u32 %0, %ctaid.z;" : "=r"(ret.z));
    ret.w = 0;  // previously left uninitialized
    return ret;
}

// Get the number of CTA ids per grid: x/y/z come from %nctaid; w is always 0.
__device__ __forceinline__ int4 get_nctaid(void) {
    int4 ret;
    asm("mov.u32 %0, %nctaid.x;" : "=r"(ret.x));
    asm("mov.u32 %0, %nctaid.y;" : "=r"(ret.y));
    asm("mov.u32 %0, %nctaid.z;" : "=r"(ret.z));
    ret.w = 0;  // previously left uninitialized
    return ret;
}

// Device level sleep function: busy-waits until at least `clock_count` ticks
// of clock64() have elapsed; returns immediately when clock_count is 0.
__device__ __forceinline__ void csleep(uint64_t clock_count) {
    if (clock_count == 0) return;
    // clock64() returns long long; keep the full 64 bits on every platform
    const long long start_clock = clock64();
    while (static_cast<uint64_t>(clock64() - start_clock) < clock_count) {
        // spin
    }
}

// Base class whose new/delete operators place objects in CUDA managed memory
// (cudaMallocManaged / cudaFree).
class Managed {
  public:
    // Throws std::bad_alloc if the managed allocation fails.
    void *operator new(size_t len) { return allocate(len); }

    // void Managed::operator delete(void *ptr)
    void operator delete(void *ptr) { cudaFree(ptr); }

    // Throws std::bad_alloc if the managed allocation fails.
    void *operator new[](size_t len) { return allocate(len); }

    // void Managed::operator delete[] (void* ptr) {
    void operator delete[](void *ptr) { cudaFree(ptr); }

  private:
    // Shared allocation path for new and new[].  The CUDA status is checked
    // so that a failed allocation is reported instead of handing back an
    // uninitialized pointer.
    static void *allocate(size_t len) {
        void *ptr = nullptr;
        // a zero-byte request is rejected by cudaMallocManaged; round it up
        if (len == 0) len = 1;
        if (cudaMallocManaged(&ptr, len) != cudaSuccess || ptr == nullptr) {
            throw std::bad_alloc();
        }
        return ptr;
    }
};
--------------------------------------------------------------------------------