├── .gitignore
├── .gitmodules
├── DEV-Def
    ├── A100PCIE40G.config
    └── QV100.config
├── ISA-Def
    ├── accelwattch_component_mapping.h
    ├── ampere_opcode.h
    ├── kepler_opcode.h
    ├── pascal_opcode.h
    ├── trace_opcode.h
    ├── turing_opcode.h
    └── volta_opcode.h
├── Makefile
├── README.md
├── common
    ├── CLI
    │   ├── App.hpp
    │   ├── Argv.hpp
    │   ├── CLI.hpp
    │   ├── Config.hpp
    │   ├── ConfigFwd.hpp
    │   ├── Encoding.hpp
    │   ├── Error.hpp
    │   ├── Formatter.hpp
    │   ├── FormatterFwd.hpp
    │   ├── Macros.hpp
    │   ├── Option.hpp
    │   ├── Split.hpp
    │   ├── StringTools.hpp
    │   ├── Timer.hpp
    │   ├── TypeTools.hpp
    │   ├── Validators.hpp
    │   ├── Version.hpp
    │   └── impl
    │   │   ├── App_inl.hpp
    │   │   ├── Argv_inl.hpp
    │   │   ├── Config_inl.hpp
    │   │   ├── Encoding_inl.hpp
    │   │   ├── Formatter_inl.hpp
    │   │   ├── Option_inl.hpp
    │   │   ├── Split_inl.hpp
    │   │   ├── StringTools_inl.hpp
    │   │   └── Validators_inl.hpp
    ├── common_def.cc
    ├── common_def.h
    ├── option_parser.cc
    ├── option_parser.h
    └── vector_types.h
├── hw-component
    ├── IBuffer.cc
    ├── IBuffer.h
    ├── OperandCollector.cc
    ├── OperandCollector.h
    ├── PipelineUnit.cc
    ├── PipelineUnit.h
    ├── PrivateSM.cc
    ├── PrivateSM.h
    ├── RegBankAlloc.cc
    ├── RegBankAlloc.h
    ├── Scoreboard.cc
    └── Scoreboard.h
├── hw-parser
    ├── hw-parser.cc
    └── hw-parser.h
├── main.cc
├── merge_report.py
├── parda
    ├── .gitignore
    ├── LICENSE
    ├── README.md
    ├── main.c
    ├── makefile
    ├── narray.c
    ├── narray.h
    ├── parda.c
    ├── parda.h
    ├── parda_mpi.c
    ├── parda_mpi.h
    ├── parda_omp.c
    ├── parda_omp.h
    ├── parda_print.c
    ├── process_args.c
    ├── process_args.h
    ├── run.sh
    ├── seperate.c
    ├── seperate.h
    ├── splay.c
    └── splay.h
├── sass-split
    ├── .gitignore
    ├── Makefile
    ├── process_sass_dir.cpp
    └── sass-split.sh
├── trace-driven
    ├── entry.h
    ├── hw-stt.cc
    ├── hw-stt.h
    ├── inst-stt.cc
    ├── inst-stt.h
    ├── kernel-info.cc
    ├── kernel-info.h
    ├── kernel-trace.cc
    ├── kernel-trace.h
    ├── mem-access.cc
    ├── mem-access.h
    ├── register-set.h
    ├── trace-warp-inst.cc
    └── trace-warp-inst.h
├── trace-parser
    ├── inst-memadd-info.cc
    ├── inst-memadd-info.h
    ├── inst-trace.cc
    ├── inst-trace.h
    ├── memory-space.cc
    ├── memory-space.h
    ├── sass-inst.cc
    ├── sass-inst.h
    ├── sass-split.py
    ├── trace-parser.cc
    └── trace-parser.h
└── tracing-tool
    ├── .gitignore
    ├── Makefile
    ├── README.md
    ├── common.h
    ├── inject_funcs.cu
    ├── nvbit
        ├── cuda.h
        ├── generated_cuda_meta.h
        ├── instr_types.h
        ├── libnvbit.a
        ├── nvbit.h
        ├── nvbit_reg_rw.h
        ├── nvbit_tool.h
        ├── tools_cuda_api_meta.h
        └── utils
        │   ├── channel.hpp
        │   └── utils.h
    └── tracer.cu


/.gitignore:
--------------------------------------------------------------------------------
 1 | obj/*
 2 | gpu-simulator.x
 3 | .vscode/
 4 | *.mem
 5 | *.sass
 6 | *.temp.txt
 7 | *-summary.txt
 8 | app.config
 9 | instn.config
10 | issue.config
11 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/DEV-Def/A100PCIE40G.config:
--------------------------------------------------------------------------------
  1 | 
  2 | ###########################################################################################
  3 | ###                                                                                     ###
  4 | ###                             Ampere A100 PCIe 40 GB Config                           ###
  5 | ###                                                                                     ###
  6 | ###########################################################################################
  7 | 
  8 | 
  9 | # Device Limits
 10 | -gpgpu_stack_size_limit 1024
 11 | -gpgpu_heap_size_limit 8388608
 12 | -gpgpu_kernel_launch_latency 5000
 13 | -gpgpu_thread_block_launch_latency 0
 14 | -gpgpu_max_concurrent_kernel 128
 15 | 
 16 | # High Level Architecture Configuration
 17 | -gpgpu_num_clusters 108
 18 | -gpgpu_num_sms_per_cluster 1
 19 | -gpgpu_num_memory_controllers 40
 20 | -gpgpu_num_sub_partition_per_memory_channel 2
 21 | 
 22 | # Clock Domain Frequencies in MHZ
 23 | -gpgpu_core_clock_mhz 1065.0
 24 | -gpgpu_icnt_clock_mhz 1065.0
 25 | -gpgpu_l2d_clock_mhz  1065.0
 26 | -gpgpu_dram_clock_mhz 1215.0
 27 | 
 28 | # SM Pipeline Config
 29 | -gpgpu_max_registers_per_sm 65536
 30 | -gpgpu_max_registers_per_cta 65536
 31 | 
 32 | # SM Warp Config
 33 | -gpgpu_max_threads_per_sm 2048
 34 | -gpgpu_warp_size 32
 35 | -gpgpu_max_ctas_per_sm 32
 36 | 
 37 | # Pipeline Widths
 38 | -gpgpu_ID_OC_SP_pipeline_width 4
 39 | -gpgpu_ID_OC_DP_pipeline_width 4
 40 | -gpgpu_ID_OC_INT_pipeline_width 4
 41 | -gpgpu_ID_OC_SFU_pipeline_width 4
 42 | -gpgpu_ID_OC_MEM_pipeline_width 4
 43 | -gpgpu_OC_EX_SP_pipeline_width 4
 44 | -gpgpu_OC_EX_DP_pipeline_width 4
 45 | -gpgpu_OC_EX_INT_pipeline_width 4
 46 | -gpgpu_OC_EX_SFU_pipeline_width 4
 47 | -gpgpu_OC_EX_MEM_pipeline_width 4
 48 | -gpgpu_EX_WB_pipeline_width 8
 49 | -gpgpu_ID_OC_TENSOR_CORE_pipeline_width 4
 50 | -gpgpu_OC_EX_TENSOR_CORE_pipeline_width 4
 51 | 
 52 | # Number of FUs
 53 | -gpgpu_num_sp_units 4
 54 | -gpgpu_num_sfu_units 4
 55 | -gpgpu_num_dp_units 8
 56 | -gpgpu_num_int_units 8
 57 | -gpgpu_num_tensor_core_units 4
 58 | -gpgpu_num_mem_units 160
 59 | 
 60 | # Instruction Latencies, ADD,MAX,MUL,MAD,DIV,[SHFL]
 61 | #-gpgpu_opcode_latency_int 4,13,4,5,145,21
 62 | #-gpgpu_opcode_latency_fp 4,13,4,5,39
 63 | #-gpgpu_opcode_latency_dp 8,19,8,8,330
 64 | #-gpgpu_opcode_latency_sfu 100
 65 | #-gpgpu_opcode_latency_tensor_core 64
 66 | -gpgpu_opcode_latency_int 3,12,3,4,144,20
 67 | -gpgpu_opcode_latency_fp 3,12,3,4,38
 68 | -gpgpu_opcode_latency_dp 7,18,7,7,329
 69 | -gpgpu_opcode_latency_sfu 99
 70 | -gpgpu_opcode_latency_tensor_core 63
 71 | 
 72 | # Initiation Intervals, ADD,MAX,MUL,MAD,DIV,[SHFL]
 73 | -gpgpu_opcode_initiation_interval_int 2,2,2,2,8,4
 74 | -gpgpu_opcode_initiation_interval_fp 2,2,2,2,4
 75 | -gpgpu_opcode_initiation_interval_dp 4,4,4,4,130
 76 | -gpgpu_opcode_initiation_interval_sfu 8
 77 | -gpgpu_opcode_initiation_interval_tensor_core 64
 78 | 
 79 | # Sub Core Model, warp schedulers are isolated
 80 | -gpgpu_sub_core_model 1
 81 | 
 82 | # Generic Operand Collectors
 83 | -gpgpu_operand_collector_num_units_gen 8
 84 | -gpgpu_operand_collector_num_in_ports_gen 8
 85 | -gpgpu_operand_collector_num_out_ports_gen 8
 86 | 
 87 | # Register Banks
 88 | -gpgpu_num_reg_banks 32
 89 | -gpgpu_reg_file_port_throughput 2
 90 | 
 91 | # Shared Memory Bankconflict Detection
 92 | -gpgpu_shmem_num_banks 64
 93 | -gpgpu_shmem_limited_broadcast 0
 94 | -gpgpu_shmem_warp_parts 1
 95 | -gpgpu_coalesce_arch 80
 96 | 
 97 | # Warp Schedulers
 98 | -gpgpu_inst_fetch_throughput 4
 99 | -gpgpu_num_sched_per_sm 4
100 | # for Volta, a warp scheduler can issue 1 inst per cycle
101 | -gpgpu_max_insn_issue_per_warp 1
102 | # for Volta, dual issue only occurs when using two different execution units
103 | -gpgpu_dual_issue_diff_exec_units 1
104 | 
105 | # L1/Shared Memory Configuration
106 | # L1 cache + shared memory = 192 KB
107 | -gpgpu_unified_l1d_size 192
108 | -gpgpu_l1d_cache_banks 4
109 | -gpgpu_l1d_cache_sets 4
110 | -gpgpu_l1d_cache_block_size 128
111 | -gpgpu_l1d_cache_associative 64
112 | -gpgpu_l1d_latency 37
113 | # Size of shared memory per SM (Byte)
114 | -gpgpu_shmem_size_per_sm 167936
115 | # Size of shared memory per CTA (Byte)
116 | -gpgpu_shmem_size_per_cta 167936
117 | -gpgpu_shmem_latency 37
118 | 
119 | # L2 Configuration
120 | -gpgpu_l2d_size_per_sub_partition 512
121 | # 256 sets, each 128 bytes 16-way for each memory sub partition (512 KB)
122 | -gpgpu_l2d_cache_sets 256
123 | -gpgpu_l2d_cache_block_size 128
124 | -gpgpu_l2d_cache_associative 16
125 | -gpgpu_dram_partition_queues_icnt_to_l2 64
126 | -gpgpu_dram_partition_queues_l2_to_dram 64
127 | -gpgpu_dram_partition_queues_dram_to_l2 64
128 | -gpgpu_dram_partition_queues_l2_to_icnt 64
129 | 
130 | # Cluster Ejection Buffer
131 | -gpgpu_num_pkts_cluster_ejection_buffer 32
132 | 
133 | # Interconnection
134 | -gpgpu_icnt_in_buffer_limit 512
135 | -gpgpu_icnt_out_buffer_limit 512
136 | -gpgpu_icnt_subnets 2
137 | -gpgpu_icnt_flit_size 40
138 | 
139 | # DRAM Configuration
140 | -gpgpu_dram_latency 100
141 | 
142 | # Trace OpCode Latency and Initiation Interval
143 | #-gpgpu_trace_opcode_latency_initiation_int 2,2
144 | #-gpgpu_trace_opcode_latency_initiation_sp 2,2
145 | #-gpgpu_trace_opcode_latency_initiation_dp 8,4
146 | #-gpgpu_trace_opcode_latency_initiation_sfu 20,8
147 | #-gpgpu_trace_opcode_latency_initiation_tensor 2,2
148 | -gpgpu_trace_opcode_latency_initiation_int 2,1
149 | -gpgpu_trace_opcode_latency_initiation_sp 2,1
150 | -gpgpu_trace_opcode_latency_initiation_dp 8,2
151 | -gpgpu_trace_opcode_latency_initiation_sfu 20,6
152 | -gpgpu_trace_opcode_latency_initiation_tensor 2,1
153 | 
154 | # execute branch insts on spec unit 1
155 | # in Volta, there is a dedicated branch unit
156 | # <enabled>,<num_units>,<max_latency>,<ID_OC_SPEC>,<OC_EX_SPEC>,<NAME>
157 | -gpgpu_specialized_unit_1 1,4,4,4,4,BRA
158 | -gpgpu_trace_opcode_latency_initiation_spec_op_1 4,4
159 | 
160 | # TEX unit, make fixed latency for all tex insts
161 | -gpgpu_specialized_unit_2 1,4,200,4,4,TEX
162 | -gpgpu_trace_opcode_latency_initiation_spec_op_2 200,4
163 | 
164 | # tensor unit
165 | -gpgpu_specialized_unit_3 1,4,8,4,4,TENSOR
166 | -gpgpu_trace_opcode_latency_initiation_spec_op_3 2,2
167 | 
168 | # shared memory and register allocation sizes
169 | -gpgpu_smem_allocation_size 256
170 | -gpgpu_register_allocation_size 256
171 | 
172 | # L1 cache configurations
173 | -gpgpu_l1_cache_line_size_for_reuse_distance 32
174 | # L2 cache configurations
175 | -gpgpu_l2_cache_line_size_for_reuse_distance 64
176 | 
177 | # dram/l1/l2/constant memory access latencies
178 | -gpgpu_dram_mem_access_latency 302
179 | -gpgpu_l1_cache_access_latency 37
180 | -gpgpu_l2_cache_access_latency 213
181 | -gpgpu_const_mem_access_latency 8
182 | 


--------------------------------------------------------------------------------
/DEV-Def/QV100.config:
--------------------------------------------------------------------------------
  1 | 
  2 | ###########################################################################################
  3 | ###                                                                                     ###
  4 | ###                             Volta Quadro V100 Config                                ###
  5 | ###                                                                                     ###
  6 | ###########################################################################################
  7 | 
  8 | 
  9 | # This config models the Volta Quadro V100
 10 | # For more info about volta architecture:
 11 | # http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf
 12 | # https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# 
 13 | # http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf
 14 | # https://en.wikipedia.org/wiki/Volta_(microarchitecture)
 15 | # https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf
 16 | # https://devblogs.nvidia.com/inside-volta/
 17 | # http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf
 18 | # https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications-technical-specifications-per-compute-capability
 19 | 
 20 | 
 21 | # Device Limits
 22 | -gpgpu_stack_size_limit 1024
 23 | -gpgpu_heap_size_limit 8388608
 24 | -gpgpu_kernel_launch_latency 5000
 25 | -gpgpu_thread_block_launch_latency 0
 26 | -gpgpu_max_concurrent_kernel 128
 27 | 
 28 | # High Level Architecture Configuration
 29 | -gpgpu_num_clusters 80
 30 | -gpgpu_num_sms_per_cluster 1
 31 | -gpgpu_num_memory_controllers 32
 32 | -gpgpu_num_sub_partition_per_memory_channel 2
 33 | 
 34 | # Clock Domain Frequencies in MHZ
 35 | -gpgpu_core_clock_mhz 1447.0
 36 | -gpgpu_icnt_clock_mhz 1447.0
 37 | -gpgpu_l2d_clock_mhz  1447.0
 38 | -gpgpu_dram_clock_mhz  850.0
 39 | 
 40 | # SM Pipeline Config
 41 | -gpgpu_max_registers_per_sm 65536
 42 | -gpgpu_max_registers_per_cta 65536
 43 | 
 44 | # SM Warp Config
 45 | -gpgpu_max_threads_per_sm 2048
 46 | -gpgpu_warp_size 32
 47 | -gpgpu_max_ctas_per_sm 32
 48 | 
 49 | # Pipeline Widths
 50 | -gpgpu_ID_OC_SP_pipeline_width 4
 51 | -gpgpu_ID_OC_DP_pipeline_width 4
 52 | -gpgpu_ID_OC_INT_pipeline_width 4
 53 | -gpgpu_ID_OC_SFU_pipeline_width 4
 54 | -gpgpu_ID_OC_MEM_pipeline_width 4
 55 | -gpgpu_OC_EX_SP_pipeline_width 4
 56 | -gpgpu_OC_EX_DP_pipeline_width 4
 57 | -gpgpu_OC_EX_INT_pipeline_width 4
 58 | -gpgpu_OC_EX_SFU_pipeline_width 4
 59 | -gpgpu_OC_EX_MEM_pipeline_width 4
 60 | -gpgpu_EX_WB_pipeline_width 8
 61 | -gpgpu_ID_OC_TENSOR_CORE_pipeline_width 4
 62 | -gpgpu_OC_EX_TENSOR_CORE_pipeline_width 4
 63 | 
 64 | # Number of FUs
 65 | -gpgpu_num_sp_units 4
 66 | -gpgpu_num_sfu_units 4
 67 | -gpgpu_num_dp_units 8
 68 | -gpgpu_num_int_units 8
 69 | -gpgpu_num_tensor_core_units 4
 70 | -gpgpu_num_mem_units 160
 71 | 
 72 | # Instruction Latencies, ADD,MAX,MUL,MAD,DIV,[SHFL]
 73 | #-gpgpu_opcode_latency_int 4,13,4,5,145,21
 74 | #-gpgpu_opcode_latency_fp 4,13,4,5,39
 75 | #-gpgpu_opcode_latency_dp 8,19,8,8,330
 76 | #-gpgpu_opcode_latency_sfu 100
 77 | #-gpgpu_opcode_latency_tensor_core 64
 78 | -gpgpu_opcode_latency_int 3,12,3,4,144,20
 79 | -gpgpu_opcode_latency_fp 3,12,3,4,38
 80 | -gpgpu_opcode_latency_dp 7,18,7,7,329
 81 | -gpgpu_opcode_latency_sfu 99
 82 | -gpgpu_opcode_latency_tensor_core 63
 83 | 
 84 | # Initiation Intervals, ADD,MAX,MUL,MAD,DIV,[SHFL]
 85 | -gpgpu_opcode_initiation_interval_int 2,2,2,2,8,4
 86 | -gpgpu_opcode_initiation_interval_fp 2,2,2,2,4
 87 | -gpgpu_opcode_initiation_interval_dp 4,4,4,4,130
 88 | -gpgpu_opcode_initiation_interval_sfu 8
 89 | -gpgpu_opcode_initiation_interval_tensor_core 64
 90 | 
 91 | # Sub Core Model, warp schedulers are isolated
 92 | -gpgpu_sub_core_model 1
 93 | 
 94 | # Generic Operand Collectors
 95 | -gpgpu_operand_collector_num_units_gen 8
 96 | -gpgpu_operand_collector_num_in_ports_gen 8
 97 | -gpgpu_operand_collector_num_out_ports_gen 8
 98 | 
 99 | # Register Banks
100 | -gpgpu_num_reg_banks 16
101 | -gpgpu_reg_file_port_throughput 2
102 | 
103 | # Shared Memory Bankconflict Detection
104 | -gpgpu_shmem_num_banks 32
105 | -gpgpu_shmem_limited_broadcast 0
106 | -gpgpu_shmem_warp_parts 1
107 | -gpgpu_coalesce_arch 70
108 | 
109 | # Warp Schedulers
110 | -gpgpu_inst_fetch_throughput 4
111 | -gpgpu_num_sched_per_sm 4
112 | # for Volta, a warp scheduler can issue 1 inst per cycle
113 | -gpgpu_max_insn_issue_per_warp 1
114 | # for Volta, dual issue only occurs when using two different execution units
115 | -gpgpu_dual_issue_diff_exec_units 1
116 | 
117 | # L1/Shared Memory Configuration
118 | # L1 cache + shared memory = 128 KB
119 | -gpgpu_unified_l1d_size 128
120 | -gpgpu_l1d_cache_banks 4
121 | -gpgpu_l1d_cache_sets 4
122 | -gpgpu_l1d_cache_block_size 128
123 | -gpgpu_l1d_cache_associative 64
124 | -gpgpu_l1d_latency 20
125 | # Size of shared memory per SM (Byte)
126 | -gpgpu_shmem_size_per_sm 98304
127 | # Size of shared memory per CTA (Byte)
128 | -gpgpu_shmem_size_per_cta 65536
129 | -gpgpu_shmem_latency 20
130 | 
131 | # L2 Configuration
132 | -gpgpu_l2d_size_per_sub_partition 96
133 | # 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB)
134 | -gpgpu_l2d_cache_sets 32
135 | -gpgpu_l2d_cache_block_size 128
136 | -gpgpu_l2d_cache_associative 24
137 | -gpgpu_dram_partition_queues_icnt_to_l2 64
138 | -gpgpu_dram_partition_queues_l2_to_dram 64
139 | -gpgpu_dram_partition_queues_dram_to_l2 64
140 | -gpgpu_dram_partition_queues_l2_to_icnt 64
141 | 
142 | # Cluster Ejection Buffer
143 | -gpgpu_num_pkts_cluster_ejection_buffer 32
144 | 
145 | # Interconnection
146 | -gpgpu_icnt_in_buffer_limit 512
147 | -gpgpu_icnt_out_buffer_limit 512
148 | -gpgpu_icnt_subnets 2
149 | -gpgpu_icnt_flit_size 40
150 | 
151 | # DRAM Configuration
152 | -gpgpu_dram_latency 100
153 | 
154 | # Trace OpCode Latency and Initiation Interval
155 | #-gpgpu_trace_opcode_latency_initiation_int 2,2
156 | #-gpgpu_trace_opcode_latency_initiation_sp 2,2
157 | #-gpgpu_trace_opcode_latency_initiation_dp 8,4
158 | #-gpgpu_trace_opcode_latency_initiation_sfu 20,8
159 | #-gpgpu_trace_opcode_latency_initiation_tensor 2,2
160 | -gpgpu_trace_opcode_latency_initiation_int 2,1
161 | -gpgpu_trace_opcode_latency_initiation_sp 2,1
162 | -gpgpu_trace_opcode_latency_initiation_dp 8,2
163 | -gpgpu_trace_opcode_latency_initiation_sfu 20,6
164 | -gpgpu_trace_opcode_latency_initiation_tensor 2,1
165 | 
166 | # execute branch insts on spec unit 1
167 | # in Volta, there is a dedicated branch unit
168 | # <enabled>,<num_units>,<max_latency>,<ID_OC_SPEC>,<OC_EX_SPEC>,<NAME>
169 | -gpgpu_specialized_unit_1 1,4,4,4,4,BRA
170 | -gpgpu_trace_opcode_latency_initiation_spec_op_1 4,4
171 | 
172 | # TEX unit, make fixed latency for all tex insts
173 | -gpgpu_specialized_unit_2 1,4,200,4,4,TEX
174 | -gpgpu_trace_opcode_latency_initiation_spec_op_2 200,4
175 | 
176 | # tensor unit
177 | -gpgpu_specialized_unit_3 1,4,8,4,4,TENSOR
178 | -gpgpu_trace_opcode_latency_initiation_spec_op_3 2,2
179 | 
180 | # shared memory and register allocation sizes
181 | -gpgpu_smem_allocation_size 256
182 | -gpgpu_register_allocation_size 256
183 | 
184 | # L1 cache configurations
185 | -gpgpu_l1_cache_line_size_for_reuse_distance 32
186 | # L2 cache configurations
187 | -gpgpu_l2_cache_line_size_for_reuse_distance 64
188 | 
189 | # dram/l1/l2/constant memory access latencies
190 | -gpgpu_dram_mem_access_latency 302
191 | -gpgpu_l1_cache_access_latency 33
192 | -gpgpu_l2_cache_access_latency 213
193 | -gpgpu_const_mem_access_latency 8
194 | 


--------------------------------------------------------------------------------
/ISA-Def/kepler_opcode.h:
--------------------------------------------------------------------------------
  1 | // developed by Mahmoud Khairy, Purdue Univ
  2 | // abdallm@purdue.edu
  3 | 
  4 | #ifndef KEPLER_OPCODE_H
  5 | #define KEPLER_OPCODE_H
  6 | 
  7 | #include <string>
  8 | #include <unordered_map>
  9 | #include "trace_opcode.h"
 10 | 
 11 | #define KEPLER_BINART_VERSION 35
 12 | 
 13 | /// Kepler ISA
 14 | // see: https://docs.nvidia.com/cuda/cuda-binary-utilities/index.html
 15 | static const std::unordered_map<std::string, OpcodeChar> Kepler_OpcodeMap = {  // Kepler mnemonic string -> OpcodeChar(opcode, category)
 16 |     // Floating Point 32 Instructions
 17 |     {"FFMA", OpcodeChar(OP_FFMA, SP_OP)},
 18 |     {"FFMA32I", OpcodeChar(OP_FFMA32I, SP_OP)},
 19 |     {"FADD", OpcodeChar(OP_FADD, SP_OP)},
 20 |     {"FADD32I", OpcodeChar(OP_FADD32I, SP_OP)},
 21 |     {"FCMP", OpcodeChar(OP_FCMP, SP_OP)},
 22 |     {"FMUL", OpcodeChar(OP_FMUL, SP_OP)},
 23 |     {"FMUL32I", OpcodeChar(OP_FMUL32I, SP_OP)},
 24 |     {"FMNMX", OpcodeChar(OP_FMNMX, SP_OP)},
 25 |     {"FSWZ", OpcodeChar(OP_FSWZ, SP_OP)},
 26 |     {"FSET", OpcodeChar(OP_FSET, SP_OP)},
 27 |     {"FSETP", OpcodeChar(OP_FSETP, SP_OP)},
 28 |     {"FCHK", OpcodeChar(OP_FCHK, SP_OP)},
 29 |     {"RRO", OpcodeChar(OP_RRO, SP_OP)},
 30 |     // SFU
 31 |     {"MUFU", OpcodeChar(OP_MUFU, SFU_OP)},
 32 | 
 33 |     // Double Precision Instructions
 34 |     {"DFMA", OpcodeChar(OP_DFMA, DP_OP)},
 35 |     {"DADD", OpcodeChar(OP_DADD, DP_OP)},
 36 |     {"DMUL", OpcodeChar(OP_DMUL, DP_OP)},
 37 |     {"DMNMX", OpcodeChar(OP_DMNMX, DP_OP)},
 38 |     {"DSET", OpcodeChar(OP_DSET, DP_OP)},
 39 |     {"DSETP", OpcodeChar(OP_DSETP, DP_OP)},
 40 | 
 41 |     // Integer Instructions
 42 |     {"IMAD", OpcodeChar(OP_IMAD, INTP_OP)},
 43 |     {"IMADSP", OpcodeChar(OP_IMADSP, INTP_OP)},
 44 |     {"IMUL", OpcodeChar(OP_IMUL, INTP_OP)},
 45 |     {"IMUL32I", OpcodeChar(OP_IMUL32I, INTP_OP)},
 46 |     {"IADD", OpcodeChar(OP_IADD, INTP_OP)},
 47 |     {"IADD32I", OpcodeChar(OP_IADD32I, INTP_OP)},
 48 |     {"ISUB", OpcodeChar(OP_ISUB, INTP_OP)},
 49 |     {"ISCADD", OpcodeChar(OP_ISCADD, INTP_OP)},
 50 |     {"ISCADD32I", OpcodeChar(OP_ISCADD32I, INTP_OP)},
 51 |     {"ISAD", OpcodeChar(OP_ISAD, INTP_OP)},
 52 |     {"IMNMX", OpcodeChar(OP_IMNMX, INTP_OP)},
 53 |     {"BFE", OpcodeChar(OP_BFE, INTP_OP)},
 54 |     {"BFI", OpcodeChar(OP_BFI, INTP_OP)},
 55 |     {"SHR", OpcodeChar(OP_SHR, INTP_OP)},
 56 |     {"SHL", OpcodeChar(OP_SHL, INTP_OP)},
 57 |     {"SHF", OpcodeChar(OP_SHF, INTP_OP)},
 58 |     {"LOP", OpcodeChar(OP_LOP, INTP_OP)},
 59 |     {"LOP32I", OpcodeChar(OP_LOP32I, INTP_OP)},
 60 |     {"FLO", OpcodeChar(OP_FLO, INTP_OP)},
 61 |     {"ISET", OpcodeChar(OP_ISET, INTP_OP)},
 62 |     {"ISETP", OpcodeChar(OP_ISETP, INTP_OP)},
 63 |     {"ICMP", OpcodeChar(OP_ICMP, INTP_OP)},
 64 |     {"POPC", OpcodeChar(OP_POPC, INTP_OP)},
 65 | 
 66 |     // Conversion Instructions
 67 |     {"F2F", OpcodeChar(OP_F2F, ALU_OP)},
 68 |     {"F2I", OpcodeChar(OP_F2I, ALU_OP)},
 69 |     {"I2F", OpcodeChar(OP_I2F, ALU_OP)},
 70 |     {"I2I", OpcodeChar(OP_I2I, ALU_OP)},
 71 | 
 72 |     // Movement Instructions
 73 |     {"MOV", OpcodeChar(OP_MOV, ALU_OP)},
 74 |     {"MOV32I", OpcodeChar(OP_MOV32I, ALU_OP)},
 75 |     {"SEL", OpcodeChar(OP_SEL, ALU_OP)},
 76 |     {"PRMT", OpcodeChar(OP_PRMT, ALU_OP)},
 77 |     {"SHFL", OpcodeChar(OP_SHFL, ALU_OP)},
 78 | 
 79 |     // Predicate Instructions
 80 |     {"P2R", OpcodeChar(OP_P2R, ALU_OP)},
 81 |     {"R2P", OpcodeChar(OP_R2P, ALU_OP)},
 82 |     {"CSET", OpcodeChar(OP_CSET, ALU_OP)},
 83 |     {"CSETP", OpcodeChar(OP_CSETP, ALU_OP)},
 84 |     {"PSET", OpcodeChar(OP_PSET, ALU_OP)},
 85 |     {"PSETP", OpcodeChar(OP_PSETP, ALU_OP)},
 86 | 
 87 |     // Texture Instructions
 88 |     // For now, texture loads are ignored and treated as ALU_OP
 89 |     {"TEX", OpcodeChar(OP_TEX, ALU_OP)},
 90 |     {"TLD", OpcodeChar(OP_TLD, ALU_OP)},
 91 |     {"TLD4", OpcodeChar(OP_TLD4, ALU_OP)},
 92 |     {"TXQ", OpcodeChar(OP_TXQ, ALU_OP)},
 93 | 
 94 |     // Load/Store Instructions
 95 |     // TODO: constant loads are ignored for now; LDC is treated as ALU_OP
 96 |     {"LDC", OpcodeChar(OP_LDC, ALU_OP)},
 97 |     // in Kepler, LD is load global so set it to LDG
 98 |     {"LD", OpcodeChar(OP_LDG, LOAD_OP)},
 99 |     {"LDG", OpcodeChar(OP_LDG, LOAD_OP)},
100 |     {"LDL", OpcodeChar(OP_LDL, LOAD_OP)},
101 |     {"LDS", OpcodeChar(OP_LDS, LOAD_OP)},
102 |     {"LDSLK", OpcodeChar(OP_LDSLK, LOAD_OP)},
103 |     {"ST", OpcodeChar(OP_STG, STORE_OP)},  // ST is mapped to OP_STG, mirroring the LD -> OP_LDG mapping above
104 |     {"STL", OpcodeChar(OP_STL, STORE_OP)},
105 |     {"STS", OpcodeChar(OP_STS, STORE_OP)},
106 |     {"STSCUL", OpcodeChar(OP_STSCUL, STORE_OP)},
107 |     {"ATOM", OpcodeChar(OP_ATOM, STORE_OP)},  // atomics are categorized as STORE_OP in this table
108 |     {"RED", OpcodeChar(OP_RED, STORE_OP)},  // reductions are categorized as STORE_OP in this table
109 |     {"CCTL", OpcodeChar(OP_CCTL, ALU_OP)},
110 |     {"CCTLL", OpcodeChar(OP_CCTLL, ALU_OP)},
111 |     {"MEMBAR", OpcodeChar(OP_MEMBAR, MEMORY_BARRIER_OP)},
112 | 
113 |     // surface memory instructions
114 |     {"SUCLAMP", OpcodeChar(OP_SUCLAMP, LOAD_OP)},
115 |     {"SUBFM", OpcodeChar(OP_SUBFM, LOAD_OP)},
116 |     {"SUEAU", OpcodeChar(OP_SUEAU, LOAD_OP)},
117 |     {"SULDGA", OpcodeChar(OP_SULDGA, LOAD_OP)},
118 |     {"SUSTGA", OpcodeChar(OP_SUSTGA, STORE_OP)},
119 | 
120 |     // Control Instructions
121 |     {"BRA", OpcodeChar(OP_BRA, BRANCH_OP)},
122 |     {"BRX", OpcodeChar(OP_BRX, BRANCH_OP)},
123 |     {"JMP", OpcodeChar(OP_JMP, BRANCH_OP)},
124 |     {"JMX", OpcodeChar(OP_JMX, BRANCH_OP)},
125 |     {"CAL", OpcodeChar(OP_CAL, CALL_OPS)},
126 |     {"JCAL", OpcodeChar(OP_JCAL, CALL_OPS)},
127 |     {"RET", OpcodeChar(OP_RET, RET_OPS)},
128 |     {"BRK", OpcodeChar(OP_BRK, RET_OPS)},
129 |     {"CONT", OpcodeChar(OP_CONT, RET_OPS)},
130 |     {"SSY", OpcodeChar(OP_SSY, RET_OPS)},
131 |     {"PBK", OpcodeChar(OP_PBK, RET_OPS)},
132 |     {"PCNT", OpcodeChar(OP_PCNT, RET_OPS)},
133 |     {"PRET", OpcodeChar(OP_PRET, RET_OPS)},
134 |     {"BPT", OpcodeChar(OP_BPT, BRANCH_OP)},
135 |     {"EXIT", OpcodeChar(OP_EXIT, EXIT_OPS)},
136 | 
137 |     // Miscellaneous Instructions
138 |     {"NOP", OpcodeChar(OP_NOP, ALU_OP)},
139 |     {"S2R", OpcodeChar(OP_S2R, ALU_OP)},
140 |     {"B2R", OpcodeChar(OP_B2R, ALU_OP)},
141 |     {"BAR", OpcodeChar(OP_BAR, BARRIER_OP)},
142 |     {"VOTE", OpcodeChar(OP_VOTE, ALU_OP)},
143 | };
144 | 
145 | #endif
146 | 


--------------------------------------------------------------------------------
/ISA-Def/trace_opcode.h:
--------------------------------------------------------------------------------
  1 | // developed by Mahmoud Khairy, Purdue Univ
  2 | // abdallm@purdue.edu
  3 | 
  4 | #ifndef TRACE_OPCODE_H
  5 | #define TRACE_OPCODE_H
  6 | 
  7 | #include <string>
  8 | #include <unordered_map>
  9 | 
 10 | #define SPEC_UNIT_START_ID 100
 11 | 
 12 | enum TraceInstrOpcode {  // One enumerator per SASS opcode mnemonic, grouped by architecture (see section comments)
 13 |   
 14 |   // Volta (includes common insts for other cards as well)
 15 |   OP_FADD = 1,  // numbering starts at 1, so no enumerator has the value 0; the rest increment implicitly
 16 |   OP_FADD32I,
 17 |   OP_FCHK,
 18 |   OP_FFMA32I,
 19 |   OP_FFMA,
 20 |   OP_FMNMX,
 21 |   OP_FMUL,
 22 |   OP_FMUL32I,
 23 |   OP_FSEL,
 24 |   OP_FSET,
 25 |   OP_FSETP,
 26 |   OP_FSWZADD,
 27 |   OP_MUFU,
 28 |   OP_HADD2,
 29 |   OP_HADD2_32I,
 30 |   OP_HFMA2,
 31 |   OP_HFMA2_32I,
 32 |   OP_HMUL2,
 33 |   OP_HMUL2_32I,
 34 |   OP_HSET2,
 35 |   OP_HSETP2,
 36 |   OP_HMMA,
 37 |   OP_DADD,
 38 |   OP_DFMA,
 39 |   OP_DMUL,
 40 |   OP_DSETP,
 41 |   OP_BMSK,
 42 |   OP_BREV,
 43 |   OP_FLO,
 44 |   OP_IABS,
 45 |   OP_IADD,
 46 |   OP_IADD3,
 47 |   OP_IADD32I,
 48 |   OP_IDP,
 49 |   OP_IDP4A,
 50 |   OP_IMAD,
 51 |   OP_IMMA,
 52 |   OP_IMNMX,
 53 |   OP_IMUL,
 54 |   OP_IMUL32I,
 55 |   OP_ISCADD,
 56 |   OP_ISCADD32I,
 57 |   OP_ISETP,
 58 |   OP_LEA,
 59 |   OP_LOP,
 60 |   OP_LOP3,
 61 |   OP_LOP32I,
 62 |   OP_POPC,
 63 |   OP_SHF,
 64 |   OP_SHR,
 65 |   OP_VABSDIFF,
 66 |   OP_VABSDIFF4,
 67 |   OP_VADD,
 68 |   OP_F2F,
 69 |   OP_F2I,
 70 |   OP_I2F,
 71 |   OP_I2I,
 72 |   OP_I2IP,
 73 |   OP_FRND,
 74 |   OP_MOV,
 75 |   OP_MOV32I,
 76 |   OP_PRMT,
 77 |   OP_SEL,
 78 |   OP_SGXT,
 79 |   OP_SHFL,
 80 |   OP_PLOP3,
 81 |   OP_PSETP,
 82 |   OP_P2R,
 83 |   OP_R2P,
 84 |   OP_LD,
 85 |   OP_LDC,
 86 |   OP_LDG,
 87 |   OP_LDL,
 88 |   OP_LDS,
 89 |   OP_ST,
 90 |   OP_STG,
 91 |   OP_STL,
 92 |   OP_STS,
 93 |   OP_MATCH,
 94 |   OP_QSPC,
 95 |   OP_ATOM,
 96 |   OP_ATOMS,
 97 |   OP_ATOMG,
 98 |   OP_RED,
 99 |   OP_CCTL,
100 |   OP_CCTLL,
101 |   OP_ERRBAR,
102 |   OP_MEMBAR,
103 |   OP_CCTLT,
104 |   OP_TEX,
105 |   OP_TLD,
106 |   OP_TLD4,
107 |   OP_TMML,
108 |   OP_TXD,
109 |   OP_TXQ,
110 |   OP_BMOV,
111 |   OP_BPT,
112 |   OP_BRA,
113 |   OP_BREAK,
114 |   OP_BRX,
115 |   OP_BSSY,
116 |   OP_BSYNC,
117 |   OP_CALL,
118 |   OP_EXIT,
119 |   OP_JMP,
120 |   OP_JMX,
121 |   OP_KILL,
122 |   OP_NANOSLEEP,
123 |   OP_RET,
124 |   OP_RPCMOV,
125 |   OP_RTT,
126 |   OP_WARPSYNC,
127 |   OP_YIELD,
128 |   OP_B2R,
129 |   OP_BAR,
130 |   OP_CS2R,
131 |   OP_CSMTEST,
132 |   OP_DEPBAR,
133 |   OP_GETLMEMBASE,
134 |   OP_LEPC,
135 |   OP_NOP,
136 |   OP_PMTRIG,
137 |   OP_R2B,
138 |   OP_S2R,
139 |   OP_SETCTAID,
140 |   OP_SETLMEMBASE,
141 |   OP_VOTE,
142 |   OP_VOTE_VTG,
143 |   
144 |   // unique insts for pascal
145 |   OP_RRO,
146 |   OP_DMNMX,
147 |   OP_DSET,
148 |   OP_BFE,
149 |   OP_BFI,
150 |   OP_ICMP,
151 |   OP_IMADSP,
152 |   OP_SHL,
153 |   OP_XMAD,
154 |   OP_CSET,
155 |   OP_CSETP,
156 |   OP_TEXS,
157 |   OP_TLD4S,
158 |   OP_TLDS,
159 |   OP_CAL,
160 |   OP_JCAL,
161 |   OP_PRET,
162 |   OP_BRK,
163 |   OP_PBK,
164 |   OP_CONT,
165 |   OP_PCNT,
166 |   OP_PEXIT,
167 |   OP_SSY,
168 |   OP_SYNC,
169 |   OP_PSET,
170 |   OP_VMNMX,
171 |   OP_ISET,
172 |   
173 |   // unique insts for turing
174 |   OP_BMMA,
175 |   OP_MOVM,
176 |   OP_LDSM,
177 |   OP_R2UR,
178 |   OP_S2UR,
179 |   OP_UBMSK,
180 |   OP_UBREV,
181 |   OP_UCLEA,
182 |   OP_UFLO,
183 |   OP_UIADD3,
184 |   OP_UIMAD,
185 |   OP_UISETP,
186 |   OP_ULDC,
187 |   OP_ULEA,
188 |   OP_ULOP,
189 |   OP_ULOP3,
190 |   OP_ULOP32I,
191 |   OP_UMOV,
192 |   OP_UP2UR,
193 |   OP_UPLOP3,
194 |   OP_UPOPC,
195 |   OP_UPRMT,
196 |   OP_UPSETP,
197 |   OP_UR2UP,
198 |   OP_USEL,
199 |   OP_USGXT,
200 |   OP_USHF,
201 |   OP_USHL,
202 |   OP_USHR,
203 |   OP_VOTEU,
204 |   OP_SUATOM,
205 |   OP_SULD,
206 |   OP_SURED,
207 |   OP_SUST,
208 |   OP_BRXU,
209 |   OP_JMXU,
210 |   
211 |   // unique insts for kepler
212 |   OP_FCMP,
213 |   OP_FSWZ,
214 |   OP_ISAD,
215 |   OP_LDSLK,
216 |   OP_STSCUL,
217 |   OP_SUCLAMP,
218 |   OP_SUBFM,
219 |   OP_SUEAU,
220 |   OP_SULDGA,
221 |   OP_SUSTGA,
222 |   OP_ISUB,
223 |   
224 |   // unique insts for ampere
225 |   OP_HMNMX2,
226 |   OP_DMMA,
227 |   OP_I2FP,
228 |   OP_F2IP,
229 |   OP_LDGDEPBAR,
230 |   OP_LDGSTS,
231 |   OP_REDUX,
232 |   OP_UF2FP,
233 |   OP_SUQUERY,
234 |   SASS_NUM_OPCODES /* Sentinel: one past the last opcode value. Numbering starts at 1, so this equals (number of opcodes + 1). */
235 | 
236 | };
237 | 
238 | typedef enum TraceInstrOpcode sass_op_type;
239 | 
240 | struct OpcodeChar {  // Pairs an opcode value with the category value it is classified under.
241 |   OpcodeChar(unsigned m_opcode, unsigned m_opcode_category) {  // stores both arguments unchanged
242 |     opcode = m_opcode;
243 |     opcode_category = m_opcode_category;
244 |   }
245 |   unsigned opcode;  // the per-architecture opcode maps pass a TraceInstrOpcode enumerator here (e.g. OP_FFMA)
246 |   unsigned opcode_category;  // the per-architecture opcode maps pass a uarch_op_t enumerator here (e.g. SP_OP)
247 | };
248 | 
249 | enum special_operations_t {  // Operation classes; aliased below as special_ops for the power model
250 |   OTHER_OP,  // = 0 (implicit); every following enumerator is the previous value + 1
251 |   INT__OP,
252 |   INT_MUL24_OP,
253 |   INT_MUL32_OP,
254 |   INT_MUL_OP,
255 |   INT_DIV_OP,
256 |   FP_MUL_OP,
257 |   FP_DIV_OP,
258 |   FP__OP,
259 |   FP_SQRT_OP,
260 |   FP_LG_OP,
261 |   FP_SIN_OP,
262 |   FP_EXP_OP,
263 |   DP_MUL_OP,
264 |   DP_DIV_OP,
265 |   DP___OP,
266 |   TENSOR__OP,
267 |   TEX__OP  // = 17, the last enumerator
268 | };
269 | 
270 | typedef enum special_operations_t special_ops;  // Required to identify for the power model
271 | 
272 | // Type of operation
273 | enum uarch_op_t {  // Category values used as the second OpcodeChar argument in the opcode maps
274 |   NO_OP = -1,
275 |   ALU_OP = 1,  // numbering jumps from -1 to 1: no enumerator has the value 0
276 |   SFU_OP,
277 |   TENSOR_CORE_OP,
278 |   DP_OP,
279 |   SP_OP,
280 |   INTP_OP,
281 |   ALU_SFU_OP,
282 |   LOAD_OP,
283 |   TENSOR_CORE_LOAD_OP,
284 |   TENSOR_CORE_STORE_OP,
285 |   STORE_OP,
286 |   BRANCH_OP,
287 |   BARRIER_OP,
288 |   MEMORY_BARRIER_OP,
289 |   CALL_OPS,
290 |   RET_OPS,
291 |   EXIT_OPS,  // = 17, the last value before the specialized-unit range
292 |   SPECIALIZED_UNIT_1_OP = SPEC_UNIT_START_ID,  // SPEC_UNIT_START_ID is #defined as 100 in this header
293 |   SPECIALIZED_UNIT_2_OP,  // units 2..8 follow consecutively (101..107)
294 |   SPECIALIZED_UNIT_3_OP,
295 |   SPECIALIZED_UNIT_4_OP,
296 |   SPECIALIZED_UNIT_5_OP,
297 |   SPECIALIZED_UNIT_6_OP,
298 |   SPECIALIZED_UNIT_7_OP,
299 |   SPECIALIZED_UNIT_8_OP
300 | };
301 | 
302 | typedef enum uarch_op_t op_type;
303 | 
304 | enum uarch_operand_type_t { UN_OP = -1, INT_OP, FP_OP };  // values: UN_OP = -1, INT_OP = 0, FP_OP = 1
305 | 
306 | typedef enum uarch_operand_type_t types_of_operands;
307 | 
308 | #endif
309 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
  1 | USE_BOOST ?= 1
  2 | DEBUG ?= 0
  3 | USE_GPROF ?= 0
  4 | # BOOST_HOME: parent directory of the first LD_LIBRARY_PATH entry containing "boost/lib" (overrides a caller value); if none is found the caller's value, empty by default, is kept
  5 | BOOST_PATH := $(shell echo $$LD_LIBRARY_PATH | tr ':' '\n' | grep boost/lib | head -n 1)
  6 | ifeq ($(BOOST_PATH),)
  7 | 	BOOST_HOME ?=
  8 | else
  9 | 	BOOST_HOME := $(shell dirname $(BOOST_PATH))
 10 | #	$(info Using BOOST_HOME: $(BOOST_HOME))
 11 | endif
 12 | # MPI_HOME defaults to two directory levels above the mpicc found by `which`; a caller-supplied MPI_HOME wins. NOTE(review): dirname gets no operand when mpicc is not on PATH - confirm that case is acceptable
 13 | MPICC_PATH := $(shell which mpicc)
 14 | # $(info Using MPICC_PATH: $(MPICC_PATH))
 15 | MPI_PATH := $(shell dirname $(MPICC_PATH))
 16 | # $(info Using MPI_PATH: $(MPI_PATH))
 17 | MPI_HOME ?= $(shell dirname $(MPI_PATH))
 18 | # $(info Using MPI_HOME: $(MPI_HOME))
 19 | # MPI wrapper tools located on PATH; MPIRUN is not referenced again in this Makefile
 20 | MPICXX = $(shell which mpic++)
 21 | MPIRUN = $(shell which mpirun)
 22 | # USE_BOOST=1 compiles both C and C++ sources with mpic++; otherwise plain g++/gcc are used
 23 | ifeq ($(USE_BOOST),1)
 24 | 	CXX = $(MPICXX)
 25 | 	CC = $(MPICXX)
 26 | else
 27 | 	CXX = g++
 28 | 	CC = gcc
 29 | endif
 30 | 
 31 | CXXFLAGS = -Wall -pthread -finline-functions -funswitch-loops -MMD -MP
 32 | # USE_GPROF=1 appends -pg to the flags
 33 | ifeq ($(USE_GPROF),1)
 34 | 	CXXFLAGS += -pg
 35 | endif
 36 | # CFLAGS is defined with '=' (recursive expansion), so it also sees everything appended to CXXFLAGS later in this file
 37 | CFLAGS = $(CXXFLAGS)
 38 | 
 39 | # Detect support for C++11 (C++0x) from the g++ version that mpic++ reports; 4.3 or newer enables -std=c++11
 40 | GNUC_CPP0X := $(shell mpic++ --version | perl -ne 'if (/g\+\+\s+\(.*\)\s+([0-9.]+)/){ if($$1 >= 4.3) {$$n=1} else {$$n=0;} } END { print $$n; }')
 41 | # The '+' signs of "g++" are escaped above: unescaped they act as regex quantifiers and never match the literal compiler name, leaving GNUC_CPP0X empty
 42 | ifeq ($(GNUC_CPP0X), 1)
 43 | 	CXXFLAGS += -std=c++11
 44 | endif
 45 | 
 46 | INC_DIRS = -I./hw-parser -I./hw-component -I./ISA-Def -I./DEV-Def -I./trace-parser -I./trace-driven -I./common -I./common/CLI -I./common/CLI/impl -I$(MPI_HOME)/include -I$(BOOST_HOME)/include -I./parda
 47 | CXXFLAGS += $(INC_DIRS) $(shell pkg-config --cflags glib-2.0)
 48 | CFLAGS += $(INC_DIRS)
 49 | # NOTE(review): the Boost MPI/serialization libraries below are linked unconditionally, even when USE_BOOST=0 selects g++/gcc - confirm this is intended
 50 | LIBRARIES = -L$(BOOST_HOME)/lib -lboost_mpi -lboost_serialization
 51 | LIBRARIES += $(shell pkg-config --libs glib-2.0)
 52 | # DEBUG=1 builds with -O0 and -g3; any other value builds with -O3
 53 | ifeq ($(DEBUG),1)
 54 | 	OPTFLAGS = -O0 -g3 -fPIC
 55 | else
 56 | 	OPTFLAGS = -O3 -fPIC
 57 | endif
 58 | 
 59 | OBJ_PATH = obj
 60 | 
 61 | TARGET = gpu-simulator.x
 62 | # Create the object directory while the Makefile is being parsed if it does not exist yet
 63 | exist_OBJ_PATH = $(shell if [ -d $(OBJ_PATH) ]; then echo "exist"; else echo "noexist"; fi)
 64 | 
 65 | ifeq ("$(exist_OBJ_PATH)", "noexist")
 66 | $(shell mkdir $(OBJ_PATH))
 67 | endif
 68 | # C sources: top-level *.c plus parda/*.c, minus the four parda files filtered out on the second line
 69 | CC_SRCS := $(wildcard *.c) $(wildcard parda/*.c)
 70 | CC_SRCS := $(filter-out parda/parda_mpi.c parda/parda_omp.c parda/main.c parda/seperate.c, $(CC_SRCS))
 71 | # C++ sources: top-level *.cc plus the *.cc files of the listed subdirectories
 72 | CXX_SRCS := $(wildcard *.cc) $(wildcard trace-parser/*.cc) $(wildcard trace-driven/*.cc) 
 73 | CXX_SRCS += $(wildcard hw-component/*.cc) $(wildcard hw-parser/*.cc) $(wildcard common/*.cc)
 74 | 
 75 | SRCS := $(CC_SRCS) $(CXX_SRCS)
 76 | # Each object file mirrors its source path under $(OBJ_PATH)
 77 | CC_OBJS := $(CC_SRCS:%.c=$(OBJ_PATH)/%.o)
 78 | CXX_OBJS := $(CXX_SRCS:%.cc=$(OBJ_PATH)/%.o)
 79 | 
 80 | OBJS := $(CXX_OBJS) $(CC_OBJS) 
 81 | 
 82 | default: all
 83 | 
 84 | all: $(TARGET)
 85 | # Link every object into the simulator binary
 86 | $(TARGET): $(OBJS)
 87 | 	$(CXX) $(CXXFLAGS) $(OPTFLAGS) -o $@ $^ $(LIBRARIES) 
 88 | # Compile C++ sources; the object's directory is created on demand
 89 | $(OBJ_PATH)/%.o: %.cc
 90 | 	@mkdir -p $(@D)
 91 | 	$(CXX) $(CXXFLAGS) $(OPTFLAGS) -c $< -o $@
 92 | # Compile C sources; the object's directory is created on demand
 93 | $(OBJ_PATH)/%.o: %.c
 94 | 	@mkdir -p $(@D)
 95 | 	$(CC) $(CFLAGS) $(OPTFLAGS) -c $< -o $@
 96 | # Pull in the .d dependency files written by -MMD -MP; the leading '-' ignores missing ones
 97 | DEPS = $(shell find $(OBJ_PATH) -name "*.d")
 98 | -include $(DEPS)
 99 | # default and all produce no file of their own name, so they are phony too: a stray file called "all" or "default" must not suppress the build
100 | .PHONY: default all clean
101 | 
102 | clean:
103 | 	rm -f $(OBJS)
104 | 	rm -f $(DEPS)
105 | 	rm -f $(TARGET)
106 | 	rm -rf $(OBJ_PATH)
107 | 


--------------------------------------------------------------------------------
/common/CLI/Argv.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
 2 | // under NSF AWARD 1414736 and by the respective contributors.
 3 | // All rights reserved.
 4 | //
 5 | // SPDX-License-Identifier: BSD-3-Clause
 6 | 
 7 | #pragma once
 8 | 
 9 | // [CLI11:public_includes:set]
10 | #include <string>
11 | #include <vector>
12 | // [CLI11:public_includes:end]
13 | 
14 | #include <CLI/Macros.hpp>
15 | 
16 | namespace CLI {
17 | // [CLI11:argv_hpp:verbatim]
18 | namespace detail {
19 | #ifdef _WIN32
20 | /// Decode and return UTF-8 argv from GetCommandLineW.
21 | CLI11_INLINE std::vector<std::string> compute_win32_argv();
22 | #endif
23 | }  // namespace detail
24 | 
25 | /// argc as passed in to this executable.
26 | CLI11_INLINE int argc();
27 | 
28 | /// argv as passed in to this executable, converted to utf-8 on Windows.
29 | CLI11_INLINE const char *const *argv();
30 | 
31 | // [CLI11:argv_hpp:end]
32 | }  // namespace CLI
33 | 
34 | #ifndef CLI11_COMPILE
35 | #include "impl/Argv_inl.hpp"
36 | #endif
37 | 


--------------------------------------------------------------------------------
/common/CLI/CLI.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
 2 | // under NSF AWARD 1414736 and by the respective contributors.
 3 | // All rights reserved.
 4 | //
 5 | // SPDX-License-Identifier: BSD-3-Clause
 6 | 
 7 | #pragma once
 8 | 
 9 | // CLI Library includes
10 | // Order is important for combiner script
11 | 
12 | #include "Version.hpp"
13 | 
14 | #include "Macros.hpp"
15 | 
16 | #include "Encoding.hpp"
17 | 
18 | #include "Argv.hpp"
19 | 
20 | #include "StringTools.hpp"
21 | 
22 | #include "Error.hpp"
23 | 
24 | #include "TypeTools.hpp"
25 | 
26 | #include "Split.hpp"
27 | 
28 | #include "ConfigFwd.hpp"
29 | 
30 | #include "Validators.hpp"
31 | 
32 | #include "FormatterFwd.hpp"
33 | 
34 | #include "Option.hpp"
35 | 
36 | #include "App.hpp"
37 | 
38 | #include "Config.hpp"
39 | 
40 | #include "Formatter.hpp"
41 | 


--------------------------------------------------------------------------------
/common/CLI/Config.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
 2 | // under NSF AWARD 1414736 and by the respective contributors.
 3 | // All rights reserved.
 4 | //
 5 | // SPDX-License-Identifier: BSD-3-Clause
 6 | 
 7 | #pragma once
 8 | 
 9 | // [CLI11:public_includes:set]
10 | #include <algorithm>
11 | #include <cctype>
12 | #include <fstream>
13 | #include <iostream>
14 | #include <string>
15 | #include <utility>
16 | #include <vector>
17 | // [CLI11:public_includes:end]
18 | 
19 | #include "App.hpp"
20 | #include "ConfigFwd.hpp"
21 | #include "StringTools.hpp"
22 | 
23 | namespace CLI {
24 | // [CLI11:config_hpp:verbatim]
25 | namespace detail {
26 | 
27 | std::string convert_arg_for_ini(const std::string &arg, char stringQuote = '"', char characterQuote = '\'');
28 | 
29 | /// Comma separated join, adds quotes if needed
30 | std::string ini_join(const std::vector<std::string> &args,
31 |                      char sepChar = ',',
32 |                      char arrayStart = '[',
33 |                      char arrayEnd = ']',
34 |                      char stringQuote = '"',
35 |                      char characterQuote = '\'');
36 | 
37 | std::vector<std::string> generate_parents(const std::string &section, std::string &name, char parentSeparator);
38 | 
39 | /// assuming non default segments do a check on the close and open of the segments in a configItem structure
40 | void checkParentSegments(std::vector<ConfigItem> &output, const std::string &currentSection, char parentSeparator);
41 | }  // namespace detail
42 | 
43 | // [CLI11:config_hpp:end]
44 | }  // namespace CLI
45 | 
46 | #ifndef CLI11_COMPILE
47 | #include "impl/Config_inl.hpp"
48 | #endif
49 | 


--------------------------------------------------------------------------------
/common/CLI/ConfigFwd.hpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
  2 | // under NSF AWARD 1414736 and by the respective contributors.
  3 | // All rights reserved.
  4 | //
  5 | // SPDX-License-Identifier: BSD-3-Clause
  6 | 
  7 | #pragma once
  8 | 
  9 | // [CLI11:public_includes:set]
 10 | #include <algorithm>
 11 | #include <fstream>
 12 | #include <iostream>
 13 | #include <string>
 14 | #include <vector>
 15 | // [CLI11:public_includes:end]
 16 | 
 17 | #include "Error.hpp"
 18 | #include "StringTools.hpp"
 19 | 
 20 | namespace CLI {
 21 | // [CLI11:config_fwd_hpp:verbatim]
 22 | 
 23 | class App;
 24 | 
 25 | /// Holds one configuration entry (parent names, name, and input values) to load into Options
 26 | struct ConfigItem {
 27 |     /// This is the list of parents; fullname() places them before the name
 28 |     std::vector<std::string> parents{};
 29 | 
 30 |     /// This is the name
 31 |     std::string name{};
 32 | 
 33 |     /// Listing of inputs
 34 |     std::vector<std::string> inputs{};
 35 | 
 36 |     /// The list of parents followed by the name, joined by "."
 37 |     CLI11_NODISCARD std::string fullname() const {
 38 |         std::vector<std::string> tmp = parents;  // work on a copy; the method is const
 39 |         tmp.emplace_back(name);
 40 |         return detail::join(tmp, ".");
 41 |     }
 42 | };
 43 | 
 44 | /// Abstract base class for converters between an App and a configuration file.
 45 | class Config {
 46 |   protected:
 47 |     std::vector<ConfigItem> items{};  // not read or written by the members defined in this class
 48 | 
 49 |   public:
 50 |     /// Convert an app into a configuration (pure virtual; returns the configuration as a string)
 51 |     virtual std::string to_config(const App *, bool, bool, std::string) const = 0;
 52 | 
 53 |     /// Read a configuration from a stream and return its items (pure virtual)
 54 |     virtual std::vector<ConfigItem> from_config(std::istream &) const = 0;
 55 | 
 56 |     /// Get a flag value: the single input if there is exactly one, "{}" if there are none; throws ConversionError::TooManyInputsFlag otherwise
 57 |     CLI11_NODISCARD virtual std::string to_flag(const ConfigItem &item) const {
 58 |         if(item.inputs.size() == 1) {
 59 |             return item.inputs.at(0);
 60 |         }
 61 |         if(item.inputs.empty()) {
 62 |             return "{}";
 63 |         }
 64 |         throw ConversionError::TooManyInputsFlag(item.fullname());  // LCOV_EXCL_LINE
 65 |     }
 66 | 
 67 |     /// Parse a config file with from_config(); throws FileError::Missing if the file stream is not good() after opening (parse failures are reported by from_config, documented as ParseError:ConfigParseError)
 68 |     CLI11_NODISCARD std::vector<ConfigItem> from_file(const std::string &name) const {
 69 |         std::ifstream input{name};
 70 |         if(!input.good())
 71 |             throw FileError::Missing(name);
 72 | 
 73 |         return from_config(input);
 74 |     }
 75 | 
 76 |     /// Virtual destructor
 77 |     virtual ~Config() = default;
 78 | };
 79 | 
 80 | /// This converter works with INI/TOML files; to write INI files use ConfigINI
 81 | class ConfigBase : public Config {
 82 |   protected:
 83 |     /// the character used for comments
 84 |     char commentChar = '#';
 85 |     /// the character used to start an array '\0' is a default to not use
 86 |     char arrayStart = '[';
 87 |     /// the character used to end an array '\0' is a default to not use
 88 |     char arrayEnd = ']';
 89 |     /// the character used to separate elements in an array
 90 |     char arraySeparator = ',';
 91 |     /// the character used to separate the name from the value
 92 |     char valueDelimiter = '=';
 93 |     /// the character to use around strings
 94 |     char stringQuote = '"';
 95 |     /// the character to use around single characters
 96 |     char characterQuote = '\'';
 97 |     /// the maximum number of layers to allow
 98 |     uint8_t maximumLayers{255};  // NOTE(review): uint8_t/int16_t are used without a direct <cstdint> include in this header - confirm it arrives transitively
 99 |     /// the separator used to separate parent layers
100 |     char parentSeparatorChar{'.'};
101 |     /// Specify the configuration index to use for arrayed sections
102 |     int16_t configIndex{-1};
103 |     /// Specify the configuration section that should be used
104 |     std::string configSection{};
105 | 
106 |   public:
107 |     std::string
108 |     to_config(const App * /*app*/, bool default_also, bool write_description, std::string prefix) const override;
109 | 
110 |     std::vector<ConfigItem> from_config(std::istream &input) const override;
111 |     /// Specify the configuration for comment characters (each setter below returns this so calls can be chained)
112 |     ConfigBase *comment(char cchar) {
113 |         commentChar = cchar;
114 |         return this;
115 |     }
116 |     /// Specify the start and end characters for an array
117 |     ConfigBase *arrayBounds(char aStart, char aEnd) {
118 |         arrayStart = aStart;
119 |         arrayEnd = aEnd;
120 |         return this;
121 |     }
122 |     /// Specify the delimiter character for an array
123 |     ConfigBase *arrayDelimiter(char aSep) {
124 |         arraySeparator = aSep;
125 |         return this;
126 |     }
127 |     /// Specify the delimiter between a name and value
128 |     ConfigBase *valueSeparator(char vSep) {
129 |         valueDelimiter = vSep;
130 |         return this;
131 |     }
132 |     /// Specify the quote characters used around strings and characters
133 |     ConfigBase *quoteCharacter(char qString, char qChar) {
134 |         stringQuote = qString;
135 |         characterQuote = qChar;
136 |         return this;
137 |     }
138 |     /// Specify the maximum number of parents
139 |     ConfigBase *maxLayers(uint8_t layers) {
140 |         maximumLayers = layers;
141 |         return this;
142 |     }
143 |     /// Specify the separator to use for parent layers
144 |     ConfigBase *parentSeparator(char sep) {
145 |         parentSeparatorChar = sep;
146 |         return this;
147 |     }
148 |     /// get a reference to the configuration section
149 |     std::string &sectionRef() { return configSection; }
150 |     /// get the section
151 |     CLI11_NODISCARD const std::string &section() const { return configSection; }
152 |     /// specify a particular section of the configuration file to use
153 |     ConfigBase *section(const std::string &sectionName) {
154 |         configSection = sectionName;
155 |         return this;
156 |     }
157 | 
158 |     /// get a reference to the configuration index
159 |     int16_t &indexRef() { return configIndex; }
160 |     /// get the section index
161 |     CLI11_NODISCARD int16_t index() const { return configIndex; }
162 |     /// specify a particular index in the section to use (-1) for all sections to use
163 |     ConfigBase *index(int16_t sectionIndex) {
164 |         configIndex = sectionIndex;
165 |         return this;
166 |     }
167 | };
168 | 
169 | /// the default Config is the TOML file format
170 | using ConfigTOML = ConfigBase;
171 | 
172 | /// ConfigINI generates a "standard" INI compliant output
173 | class ConfigINI : public ConfigTOML {
174 |     // The constructor replaces the ConfigBase defaults: ';' comments, no array start/end characters, space-separated array elements
175 |   public:
176 |     ConfigINI() {
177 |         commentChar = ';';
178 |         arrayStart = '\0';
179 |         arrayEnd = '\0';
180 |         arraySeparator = ' ';
181 |         valueDelimiter = '=';  // same value as the ConfigBase default, set explicitly
182 |     }
183 | };
184 | // [CLI11:config_fwd_hpp:end]
185 | }  // namespace CLI
186 | 


--------------------------------------------------------------------------------
/common/CLI/Encoding.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
 2 | // under NSF AWARD 1414736 and by the respective contributors.
 3 | // All rights reserved.
 4 | //
 5 | // SPDX-License-Identifier: BSD-3-Clause
 6 | 
 7 | #pragma once
 8 | 
 9 | #include <CLI/Macros.hpp>
10 | 
11 | // [CLI11:public_includes:set]
12 | #include <string>
13 | // [CLI11:public_includes:end]
14 | 
15 | // [CLI11:encoding_includes:verbatim]
16 | #ifdef CLI11_CPP17
17 | #include <string_view>
18 | #endif  // CLI11_CPP17
19 | 
20 | #if defined CLI11_HAS_FILESYSTEM && CLI11_HAS_FILESYSTEM > 0
21 | #include <filesystem>
22 | #include <string_view>  // NOLINT(build/include)
23 | #endif                  // CLI11_HAS_FILESYSTEM
24 | // [CLI11:encoding_includes:end]
25 | 
26 | namespace CLI {
27 | // [CLI11:encoding_hpp:verbatim]
28 | 
29 | /// Convert a wide string to a narrow string.
30 | CLI11_INLINE std::string narrow(const std::wstring &str);
31 | CLI11_INLINE std::string narrow(const wchar_t *str);
32 | CLI11_INLINE std::string narrow(const wchar_t *str, std::size_t size);
33 | 
34 | /// Convert a narrow string to a wide string.
35 | CLI11_INLINE std::wstring widen(const std::string &str);
36 | CLI11_INLINE std::wstring widen(const char *str);
37 | CLI11_INLINE std::wstring widen(const char *str, std::size_t size);
38 | 
39 | #ifdef CLI11_CPP17
40 | CLI11_INLINE std::string narrow(std::wstring_view str);
41 | CLI11_INLINE std::wstring widen(std::string_view str);
42 | #endif  // CLI11_CPP17
43 | 
44 | #if defined CLI11_HAS_FILESYSTEM && CLI11_HAS_FILESYSTEM > 0
45 | /// Convert a char-string to a native path correctly.
46 | CLI11_INLINE std::filesystem::path to_path(std::string_view str);
47 | #endif  // CLI11_HAS_FILESYSTEM
48 | 
49 | // [CLI11:encoding_hpp:end]
50 | }  // namespace CLI
51 | 
52 | #ifndef CLI11_COMPILE
53 | #include "impl/Encoding_inl.hpp"
54 | #endif
55 | 


--------------------------------------------------------------------------------
/common/CLI/Formatter.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
 2 | // under NSF AWARD 1414736 and by the respective contributors.
 3 | // All rights reserved.
 4 | //
 5 | // SPDX-License-Identifier: BSD-3-Clause
 6 | 
 7 | #pragma once
 8 | 
 9 | // [CLI11:public_includes:set]
10 | #include <algorithm>
11 | #include <string>
12 | #include <vector>
13 | // [CLI11:public_includes:end]
14 | 
15 | #include "App.hpp"
16 | #include "FormatterFwd.hpp"
17 | 
18 | namespace CLI {
19 | // [CLI11:formatter_hpp:verbatim]
20 | // [CLI11:formatter_hpp:end]
21 | }  // namespace CLI
22 | 
23 | #ifndef CLI11_COMPILE
24 | #include "impl/Formatter_inl.hpp"
25 | #endif
26 | 


--------------------------------------------------------------------------------
/common/CLI/FormatterFwd.hpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
  2 | // under NSF AWARD 1414736 and by the respective contributors.
  3 | // All rights reserved.
  4 | //
  5 | // SPDX-License-Identifier: BSD-3-Clause
  6 | 
  7 | #pragma once
  8 | 
  9 | // [CLI11:public_includes:set]
 10 | #include <functional>
 11 | #include <map>
 12 | #include <string>
 13 | #include <utility>
 14 | #include <vector>
 15 | // [CLI11:public_includes:end]
 16 | 
 17 | #include "StringTools.hpp"
 18 | 
 19 | namespace CLI {
 20 | // [CLI11:formatter_fwd_hpp:verbatim]
 21 | 
 22 | class Option;
 23 | class App;
 24 | 
 25 | /// This enum signifies the type of help requested
 26 | ///
 27 | /// This is passed in by App; in the make_help signatures declared in this
 28 | /// header it is the third (last) parameter.
 29 | 
 30 | enum class AppFormatMode {
 31 |     Normal,  ///< The normal, detailed help
 32 |     All,     ///< A fully expanded help
 33 |     Sub,     ///< Used when printed as part of expanded subcommand
 34 | };
 35 | 
 36 | /// This is the minimum requirements to run a formatter.
 37 | ///
 38 | /// A user can subclass this if they do not care at all
 39 | /// about the structure in CLI::Formatter.
 40 | class FormatterBase {
 41 |   protected:
 42 |     /// @name Options
 43 |     ///@{
 44 | 
 45 |     /// The width of the first column
 46 |     std::size_t column_width_{30};
 47 | 
 48 |     /// @brief The required help printout labels (user changeable)
 49 |     /// Values are Needs, Excludes, etc.
 50 |     std::map<std::string, std::string> labels_{};
 51 | 
 52 |     ///@}
 53 |     /// @name Basic
 54 |     ///@{
 55 | 
 56 |   public:
 57 |     FormatterBase() = default;
 58 |     FormatterBase(const FormatterBase &) = default;
 59 |     FormatterBase(FormatterBase &&) = default;
 60 |     FormatterBase &operator=(const FormatterBase &) = default;
 61 |     FormatterBase &operator=(FormatterBase &&) = default;
 62 | 
 63 |     /// Adding a destructor in this form to work around bug in GCC 4.7
 64 |     virtual ~FormatterBase() noexcept {}  // NOLINT(modernize-use-equals-default)
 65 | 
 66 |     /// This is the key method that puts together help (pure virtual)
 67 |     virtual std::string make_help(const App *, std::string, AppFormatMode) const = 0;
 68 | 
 69 |     ///@}
 70 |     /// @name Setters
 71 |     ///@{
 72 | 
 73 |     /// Set the replacement text for a label key (for example the "REQUIRED" label); an existing entry is overwritten
 74 |     void label(std::string key, std::string val) { labels_[key] = val; }
 75 | 
 76 |     /// Set the column width
 77 |     void column_width(std::size_t val) { column_width_ = val; }
 78 | 
 79 |     ///@}
 80 |     /// @name Getters
 81 |     ///@{
 82 | 
 83 |     /// Get the current value of a name (REQUIRED, etc.); returns the key itself when no replacement has been stored
 84 |     CLI11_NODISCARD std::string get_label(std::string key) const {
 85 |         if(labels_.find(key) == labels_.end())
 86 |             return key;
 87 |         return labels_.at(key);
 88 |     }
 89 | 
 90 |     /// Get the current column width
 91 |     CLI11_NODISCARD std::size_t get_column_width() const { return column_width_; }
 92 | 
 93 |     ///@}
 94 | };
 95 | 
 96 | /// This is a specialty override for lambda functions: make_help is delegated to a stored callable
 97 | class FormatterLambda final : public FormatterBase {
 98 |     using funct_t = std::function<std::string(const App *, std::string, AppFormatMode)>;
 99 | 
100 |     /// The lambda to hold and run
101 |     funct_t lambda_;
102 | 
103 |   public:
104 |     /// Create a FormatterLambda with a lambda function (the callable is moved into the object)
105 |     explicit FormatterLambda(funct_t funct) : lambda_(std::move(funct)) {}
106 | 
107 |     /// Adding a destructor (mostly to make GCC 4.7 happy)
108 |     ~FormatterLambda() noexcept override {}  // NOLINT(modernize-use-equals-default)
109 | 
110 |     /// This will simply call the lambda function with the three arguments and return its result
111 |     std::string make_help(const App *app, std::string name, AppFormatMode mode) const override {
112 |         return lambda_(app, name, mode);
113 |     }
114 | };
115 | 
116 | /// This is the default Formatter for CLI11. It pretty prints help output, and is broken into quite a few
117 | /// overridable methods, to be highly customizable with minimal effort.
118 | class Formatter : public FormatterBase {
119 |   public:
120 |     Formatter() = default;
121 |     Formatter(const Formatter &) = default;
122 |     Formatter(Formatter &&) = default;
123 |     Formatter &operator=(const Formatter &) = default;
124 |     Formatter &operator=(Formatter &&) = default;
125 | 
126 |     /// @name Overridables
127 |     ///@{
128 | 
129 |     /// This prints out a group of options with title
130 |     ///
131 |     CLI11_NODISCARD virtual std::string
132 |     make_group(std::string group, bool is_positional, std::vector<const Option *> opts) const;
133 | 
134 |     /// This prints out just the positionals "group"
135 |     virtual std::string make_positionals(const App *app) const;
136 | 
137 |     /// This prints out all the groups of options (not declared virtual, unlike its neighbours)
138 |     std::string make_groups(const App *app, AppFormatMode mode) const;
139 | 
140 |     /// This prints out all the subcommands
141 |     virtual std::string make_subcommands(const App *app, AppFormatMode mode) const;
142 | 
143 |     /// This prints out a subcommand
144 |     virtual std::string make_subcommand(const App *sub) const;
145 | 
146 |     /// This prints out a subcommand in help-all
147 |     virtual std::string make_expanded(const App *sub) const;
148 | 
149 |     /// This prints out the footer
150 |     virtual std::string make_footer(const App *app) const;
151 | 
152 |     /// This displays the description line
153 |     virtual std::string make_description(const App *app) const;
154 | 
155 |     /// This displays the usage line
156 |     virtual std::string make_usage(const App *app, std::string name) const;
157 | 
158 |     /// This puts everything together
159 |     std::string make_help(const App * /*app*/, std::string, AppFormatMode) const override;
160 | 
161 |     ///@}
162 |     /// @name Options
163 |     ///@{
164 | 
165 |     /// This prints out an option help line, either positional or optional form
166 |     virtual std::string make_option(const Option *opt, bool is_positional) const {
167 |         std::stringstream out;  // NOTE(review): <sstream> is not included directly by this header; presumably it arrives via StringTools.hpp - confirm
168 |         detail::format_help(
169 |             out, make_option_name(opt, is_positional) + make_option_opts(opt), make_option_desc(opt), column_width_);
170 |         return out.str();
171 |     }
172 | 
173 |     /// @brief This is the name part of an option, Default: left column
174 |     virtual std::string make_option_name(const Option *, bool) const;
175 | 
176 |     /// @brief This is the options part of the name, Default: combined into left column
177 |     virtual std::string make_option_opts(const Option *) const;
178 | 
179 |     /// @brief This is the description. Default: Right column, on new line if left column too large
180 |     virtual std::string make_option_desc(const Option *) const;
181 | 
182 |     /// @brief This is used to print the name on the USAGE line
183 |     virtual std::string make_option_usage(const Option *opt) const;
184 | 
185 |     ///@}
186 | };
187 | 
188 | // [CLI11:formatter_fwd_hpp:end]
189 | }  // namespace CLI
190 | 


--------------------------------------------------------------------------------
/common/CLI/Macros.hpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
  2 | // under NSF AWARD 1414736 and by the respective contributors.
  3 | // All rights reserved.
  4 | //
  5 | // SPDX-License-Identifier: BSD-3-Clause
  6 | 
  7 | #pragma once
  8 | 
  9 | // [CLI11:macros_hpp:verbatim]
 10 | 
 11 | // The following version macro is very similar to the one in pybind11
 12 | #if !(defined(_MSC_VER) && __cplusplus == 199711L) && !defined(__INTEL_COMPILER)
 13 | #if __cplusplus >= 201402L
 14 | #define CLI11_CPP14
 15 | #if __cplusplus >= 201703L
 16 | #define CLI11_CPP17
 17 | #if __cplusplus > 201703L
 18 | #define CLI11_CPP20
 19 | #endif
 20 | #endif
 21 | #endif
 22 | #elif defined(_MSC_VER) && __cplusplus == 199711L
 23 | // MSVC sets _MSVC_LANG rather than __cplusplus (supposedly until the standard is fully implemented)
 24 | // Unless you use the /Zc:__cplusplus flag on Visual Studio 2017 15.7 Preview 3 or newer
 25 | #if _MSVC_LANG >= 201402L
 26 | #define CLI11_CPP14
 27 | #if _MSVC_LANG > 201402L && _MSC_VER >= 1910
 28 | #define CLI11_CPP17
 29 | #if _MSVC_LANG > 201703L && _MSC_VER >= 1910
 30 | #define CLI11_CPP20
 31 | #endif
 32 | #endif
 33 | #endif
 34 | #endif
 35 | 
 36 | #if defined(CLI11_CPP14)
 37 | #define CLI11_DEPRECATED(reason) [[deprecated(reason)]]
 38 | #elif defined(_MSC_VER)
 39 | #define CLI11_DEPRECATED(reason) __declspec(deprecated(reason))
 40 | #else
 41 | #define CLI11_DEPRECATED(reason) __attribute__((deprecated(reason)))
 42 | #endif
 43 | 
 44 | // GCC < 10 doesn't ignore this in unevaluated contexts
 45 | #if !defined(CLI11_CPP17) ||                                                                                           \
 46 |     (defined(__GNUC__) && !defined(__llvm__) && !defined(__INTEL_COMPILER) && __GNUC__ < 10 && __GNUC__ > 4)
 47 | #define CLI11_NODISCARD
 48 | #else
 49 | #define CLI11_NODISCARD [[nodiscard]]
 50 | #endif
 51 | 
 52 | /** detection of rtti: sets CLI11_USE_STATIC_RTTI to 0 or 1 unless it is already defined; the GCC branch tests __GXX_RTTI, the macro GCC defines when RTTI is enabled */
 53 | #ifndef CLI11_USE_STATIC_RTTI
 54 | #if(defined(_HAS_STATIC_RTTI) && _HAS_STATIC_RTTI)
 55 | #define CLI11_USE_STATIC_RTTI 1
 56 | #elif defined(__cpp_rtti)
 57 | #if(defined(_CPPRTTI) && _CPPRTTI == 0)
 58 | #define CLI11_USE_STATIC_RTTI 1
 59 | #else
 60 | #define CLI11_USE_STATIC_RTTI 0
 61 | #endif
 62 | #elif(defined(__GXX_RTTI) && __GXX_RTTI)
 63 | #define CLI11_USE_STATIC_RTTI 0
 64 | #else
 65 | #define CLI11_USE_STATIC_RTTI 1
 66 | #endif
 67 | #endif
 68 | 
 69 | /** <filesystem> availability */
 70 | #if defined CLI11_CPP17 && defined __has_include && !defined CLI11_HAS_FILESYSTEM
 71 | #if __has_include(<filesystem>)
 72 | // Filesystem cannot be used if targeting macOS < 10.15
 73 | #if defined __MAC_OS_X_VERSION_MIN_REQUIRED && __MAC_OS_X_VERSION_MIN_REQUIRED < 101500
 74 | #define CLI11_HAS_FILESYSTEM 0
 75 | #elif defined(__wasi__)
 76 | // As of wasi-sdk-14, filesystem is not implemented
 77 | #define CLI11_HAS_FILESYSTEM 0
 78 | #else
 79 | #include <filesystem>
 80 | #if defined __cpp_lib_filesystem && __cpp_lib_filesystem >= 201703
 81 | #if defined _GLIBCXX_RELEASE && _GLIBCXX_RELEASE >= 9
 82 | #define CLI11_HAS_FILESYSTEM 1
 83 | #elif defined(__GLIBCXX__)
 84 | // if we are using gcc and Version <9 default to no filesystem
 85 | #define CLI11_HAS_FILESYSTEM 0
 86 | #else
 87 | #define CLI11_HAS_FILESYSTEM 1
 88 | #endif
 89 | #else
 90 | #define CLI11_HAS_FILESYSTEM 0
 91 | #endif
 92 | #endif
 93 | #endif
 94 | #endif
 95 | 
 96 | /** <codecvt> availability */
 97 | #if defined(__GNUC__) && !defined(__llvm__) && !defined(__INTEL_COMPILER) && __GNUC__ < 5
 98 | #define CLI11_HAS_CODECVT 0
 99 | #else
100 | #define CLI11_HAS_CODECVT 1
101 | #include <codecvt>
102 | #endif
103 | 
104 | /** disable deprecations */
105 | #if defined(__GNUC__)  // GCC or clang
106 | #define CLI11_DIAGNOSTIC_PUSH _Pragma("GCC diagnostic push")
107 | #define CLI11_DIAGNOSTIC_POP _Pragma("GCC diagnostic pop")
108 | 
109 | #define CLI11_DIAGNOSTIC_IGNORE_DEPRECATED _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"")
110 | 
111 | #elif defined(_MSC_VER)
112 | #define CLI11_DIAGNOSTIC_PUSH __pragma(warning(push))
113 | #define CLI11_DIAGNOSTIC_POP __pragma(warning(pop))
114 | 
115 | #define CLI11_DIAGNOSTIC_IGNORE_DEPRECATED __pragma(warning(disable : 4996))
116 | 
117 | #else
118 | #define CLI11_DIAGNOSTIC_PUSH
119 | #define CLI11_DIAGNOSTIC_POP
120 | 
121 | #define CLI11_DIAGNOSTIC_IGNORE_DEPRECATED
122 | 
123 | #endif
124 | 
125 | /** Inline macro **/
126 | #ifdef CLI11_COMPILE
127 | #define CLI11_INLINE
128 | #else
129 | #define CLI11_INLINE inline
130 | #endif
131 | // [CLI11:macros_hpp:end]
132 | 


--------------------------------------------------------------------------------
/common/CLI/Split.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
 2 | // under NSF AWARD 1414736 and by the respective contributors.
 3 | // All rights reserved.
 4 | //
 5 | // SPDX-License-Identifier: BSD-3-Clause
 6 | 
 7 | #pragma once
 8 | 
 9 | // [CLI11:public_includes:set]
10 | #include <string>
11 | #include <tuple>
12 | #include <utility>
13 | #include <vector>
14 | // [CLI11:public_includes:end]
15 | 
16 | #include "Macros.hpp"
17 | 
18 | namespace CLI {
19 | // [CLI11:split_hpp:verbatim]
20 | 
21 | namespace detail {
22 | 
23 | // Returns false if not a short option. Otherwise, sets opt name and rest and returns true
24 | CLI11_INLINE bool split_short(const std::string &current, std::string &name, std::string &rest);
25 | 
26 | // Returns false if not a long option. Otherwise, sets opt name and other side of = and returns true
27 | CLI11_INLINE bool split_long(const std::string &current, std::string &name, std::string &value);
28 | 
29 | // Returns false if not a windows style option. Otherwise, sets opt name and value and returns true
30 | CLI11_INLINE bool split_windows_style(const std::string &current, std::string &name, std::string &value);
31 | 
32 | // Splits a string into multiple long and short names
33 | CLI11_INLINE std::vector<std::string> split_names(std::string current);
34 | 
35 | /// extract default flag values either {def} or starting with a !
36 | CLI11_INLINE std::vector<std::pair<std::string, std::string>> get_default_flag_values(const std::string &str);
37 | 
38 | /// Get a vector of short names, one of long names, and a single name
39 | CLI11_INLINE std::tuple<std::vector<std::string>, std::vector<std::string>, std::string>
40 | get_names(const std::vector<std::string> &input);
41 | 
42 | }  // namespace detail
43 | // [CLI11:split_hpp:end]
44 | }  // namespace CLI
45 | 
46 | #ifndef CLI11_COMPILE
47 | #include "impl/Split_inl.hpp"
48 | #endif
49 | 


--------------------------------------------------------------------------------
/common/CLI/Timer.hpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
  2 | // under NSF AWARD 1414736 and by the respective contributors.
  3 | // All rights reserved.
  4 | //
  5 | // SPDX-License-Identifier: BSD-3-Clause
  6 | 
  7 | #pragma once
  8 | 
  9 | // On GCC < 4.8, the following define is often missing. Due to the
 10 | // fact that this library only uses sleep_for, this should be safe
 11 | #if defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 5 && __GNUC_MINOR__ < 8
 12 | #define _GLIBCXX_USE_NANOSLEEP
 13 | #endif
 14 | 
 15 | #include <cmath>
 16 | 
 17 | #include <array>
 18 | #include <chrono>
 19 | #include <functional>
 20 | #include <iostream>
 21 | #include <string>
 22 | #include <utility>
 23 | 
 24 | namespace CLI {
 25 | 
 26 | /// This is a simple timer with pretty printing. Creating the timer starts counting.
 27 | class Timer {
 28 |   protected:
 29 |     /// Clock every measurement is taken with
 30 |     using clock = std::chrono::steady_clock;
 31 | 
 32 |     /// An instant as reported by `clock`
 33 |     using time_point = std::chrono::time_point<clock>;
 34 | 
 35 |     /// Formatter signature: called as formatter(title, time string), returns the message to show
 36 |     using time_print_t = std::function<std::string(std::string, std::string)>;
 37 | 
 38 |     /// Title passed to the formatter
 39 |     std::string title_;
 40 | 
 41 |     /// Formatter applied by to_string()
 42 |     time_print_t time_print_;
 43 | 
 44 |     /// Beginning of the measured interval (taken in the constructor)
 45 |     time_point start_;
 46 | 
 47 |     /// Number of cycles the elapsed time is divided by when printing; set through operator/
 48 |     std::size_t cycles{1};
 49 | 
 50 |   public:
 51 |     /// Default formatter: "<title>: <time>"
 52 |     static std::string Simple(std::string title, std::string time) { return title.append(": ").append(time); }
 53 | 
 54 |     /// Banner formatter: a "| title | Time = ..." line framed by two dashed rules
 55 |     static std::string Big(std::string title, std::string time) {
 56 |         const std::string body = "| " + title + " | Time = " + time + "\n";
 57 |         return std::string("-----------------------------------------\n") + body + "-----------------------------------------";
 58 |     }
 59 | 
 60 |   public:
 61 |     /// Construct and start timing; both the title and the formatter are optional
 62 |     explicit Timer(std::string title = "Timer", time_print_t time_print = Simple)
 63 |         : title_(std::move(title)), time_print_(std::move(time_print)), start_(clock::now()) {}
 64 | 
 65 |     /// Run `f` repeatedly until `target_time` seconds have accumulated or the run cap is hit; report the mean per run.
 66 |     std::string time_it(std::function<void()> f, double target_time = 1) {
 67 |         const time_point saved_start = start_;  // put back before returning
 68 |         double seconds = NAN;
 69 |         std::size_t runs = 0;
 70 |         start_ = clock::now();
 71 |         for(;;) {
 72 |             f();
 73 |             seconds = std::chrono::duration<double>(clock::now() - start_).count();
 74 |             const bool under_run_cap = runs++ < 100u;
 75 |             if(!under_run_cap || !(seconds < target_time)) break;
 76 |         }
 77 | 
 78 |         const std::string mean = make_time_str(seconds / static_cast<double>(runs));
 79 |         start_ = saved_start;
 80 |         return mean + " for " + std::to_string(runs) + " tries";
 81 |     }
 82 | 
 83 |     /// Time elapsed since the start, divided by `cycles`, as a formatted string
 84 |     std::string make_time_str() const {  // NOLINT(modernize-use-nodiscard)
 85 |         // duration<double> counts in seconds, so the fractional part is kept
 86 |         const std::chrono::duration<double> since_start = clock::now() - start_;
 87 |         const double per_cycle = since_start.count() / static_cast<double>(cycles);
 88 |         return make_time_str(per_cycle);
 89 |     }
 90 | 
 91 |     // LCOV_EXCL_START
 92 |     /// Format a duration given in seconds, choosing ns / us / ms / s by magnitude
 93 |     std::string make_time_str(double time) const {  // NOLINT(modernize-use-nodiscard)
 94 |         const char *unit = "s";
 95 |         if(time < .000001) {
 96 |             time *= 1000000000;
 97 |             unit = "ns";
 98 |         } else if(time < .001) {
 99 |             time *= 1000000;
100 |             unit = "us";
101 |         } else if(time < 1) {
102 |             time *= 1000;
103 |             unit = "ms";
104 |         }
105 |         std::array<char, 50> text;
106 |         std::snprintf(text.data(), text.size(), "%.5g", time);
107 |         return text.data() + std::string(" ") + unit;
108 |     }
109 |     // LCOV_EXCL_STOP
110 | 
111 |     /// Build the full message by handing the title and the elapsed-time string to the formatter
112 |     std::string to_string() const { return time_print_(title_, make_time_str()); }  // NOLINT(modernize-use-nodiscard)
113 | 
114 |     /// Set the cycle count the elapsed time is divided by; returns this timer so the call can be chained
115 |     Timer &operator/(std::size_t val) {
116 |         cycles = val;
117 |         return *this;
118 |     }
119 | };
120 | 
121 | /// Timer that writes its to_string() result to std::cout when it is destroyed
122 | class AutoTimer : public Timer {
123 |   public:
124 |     /// Reimplementing the constructor is required in GCC 4.7; the by-value arguments are moved on to Timer
125 |     explicit AutoTimer(std::string title = "Timer", time_print_t time_print = Simple) : Timer(std::move(title), std::move(time_print)) {}
126 |     // GCC 4.7 does not support using inheriting constructors.
127 | 
128 |     /// This destructor prints the string (followed by std::endl, which also flushes std::cout)
129 |     ~AutoTimer() { std::cout << to_string() << std::endl; }
130 | };
131 | 
132 | }  // namespace CLI
133 | 
134 | /// Stream insertion for timers: writes timer.to_string() to the stream and hands the same stream back.
135 | inline std::ostream &operator<<(std::ostream &in, const CLI::Timer &timer) { in << timer.to_string(); return in; }
136 | 


--------------------------------------------------------------------------------
/common/CLI/Version.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
 2 | // under NSF AWARD 1414736 and by the respective contributors.
 3 | // All rights reserved.
 4 | //
 5 | // SPDX-License-Identifier: BSD-3-Clause
 6 | 
 7 | #pragma once
 8 | 
 9 | // [CLI11:version_hpp:verbatim]
10 | 
11 | #define CLI11_VERSION_MAJOR 2
12 | #define CLI11_VERSION_MINOR 3
13 | #define CLI11_VERSION_PATCH 2
14 | #define CLI11_VERSION "2.3.2"
15 | 
16 | // [CLI11:version_hpp:end]
17 | 


--------------------------------------------------------------------------------
/common/CLI/impl/Argv_inl.hpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
  2 | // under NSF AWARD 1414736 and by the respective contributors.
  3 | // All rights reserved.
  4 | //
  5 | // SPDX-License-Identifier: BSD-3-Clause
  6 | 
  7 | #pragma once
  8 | 
  9 | // This include is only needed for IDEs to discover symbols
 10 | #include <CLI/Argv.hpp>
 11 | 
 12 | #include <CLI/Encoding.hpp>
 13 | 
 14 | // [CLI11:public_includes:set]
 15 | #include <algorithm>
 16 | #include <memory>
 17 | #include <stdexcept>
 18 | #include <string>
 19 | #include <vector>
 20 | // [CLI11:public_includes:end]
 21 | 
 22 | // [CLI11:argv_inl_includes:verbatim]
 23 | #if defined(_WIN32)
 24 | #if !(defined(_AMD64_) || defined(_X86_) || defined(_ARM_))
 25 | #if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) ||           \
 26 |     defined(_M_AMD64)
 27 | #define _AMD64_
 28 | #elif defined(i386) || defined(__i386) || defined(__i386__) || defined(__i386__) || defined(_M_IX86)
 29 | #define _X86_
 30 | #elif defined(__arm__) || defined(_M_ARM) || defined(_M_ARMT)
 31 | #define _ARM_
 32 | #elif defined(__aarch64__) || defined(_M_ARM64)
 33 | #define _ARM64_
 34 | #elif defined(_M_ARM64EC)
 35 | #define _ARM64EC_
 36 | #endif
 37 | #endif
 38 | 
 39 | // first
 40 | #ifndef NOMINMAX
 41 | // if NOMINMAX is already defined we don't want to mess with that either way
 42 | #define NOMINMAX
 43 | #include <windef.h>
 44 | #undef NOMINMAX
 45 | #else
 46 | #include <windef.h>
 47 | #endif
 48 | 
 49 | // second
 50 | #include <winbase.h>
 51 | // third
 52 | #include <processthreadsapi.h>
 53 | #include <shellapi.h>
 54 | 
 55 | #elif defined(__APPLE__)
 56 | #include <crt_externs.h>
 57 | #endif
 58 | // [CLI11:argv_inl_includes:end]
 59 | 
 60 | namespace CLI {
 61 | // [CLI11:argv_inl_hpp:verbatim]
 62 | 
 63 | namespace detail {
 64 | 
 65 | #ifdef __APPLE__
 66 | // Snapshot of argc/argv built while this static is initialized (copied as early as possible to avoid modification)
 67 | static const std::vector<const char *> static_args = [] {
 68 |     static const std::vector<std::string> static_args_as_strings = [] {
 69 |         std::vector<std::string> args_as_strings;
 70 |         int argc = *_NSGetArgc();
 71 |         char **argv = *_NSGetArgv();
 72 |         // Deep-copy every argument into an owned std::string.
 73 |         args_as_strings.reserve(static_cast<size_t>(argc));
 74 |         for(size_t i = 0; i < static_cast<size_t>(argc); i++) {
 75 |             args_as_strings.push_back(argv[i]);
 76 |         }
 77 | 
 78 |         return args_as_strings;
 79 |     }();
 80 |     // The pointers collected below refer to the strings owned by the function-local static above.
 81 |     std::vector<const char *> static_args_result;
 82 |     static_args_result.reserve(static_args_as_strings.size());
 83 | 
 84 |     for(const auto &arg : static_args_as_strings) {
 85 |         static_args_result.push_back(arg.data());
 86 |     }
 87 | 
 88 |     return static_args_result;
 89 | }();
 90 | #endif
 91 | 
 92 | #ifdef _WIN32
 93 | CLI11_INLINE std::vector<std::string> compute_win32_argv() {  // wide command line -> vector of narrow() strings
 94 |     std::vector<std::string> result;
 95 |     int argc = 0;
 96 |     // The array returned by CommandLineToArgvW is released with LocalFree when wargv goes out of scope.
 97 |     auto deleter = [](wchar_t **ptr) { LocalFree(ptr); };
 98 |     // NOLINTBEGIN(*-avoid-c-arrays)
 99 |     auto wargv = std::unique_ptr<wchar_t *[], decltype(deleter)>(CommandLineToArgvW(GetCommandLineW(), &argc), deleter);
100 |     // NOLINTEND(*-avoid-c-arrays)
101 |     // A null result is reported as std::runtime_error carrying the GetLastError() code.
102 |     if(wargv == nullptr) {
103 |         throw std::runtime_error("CommandLineToArgvW failed with code " + std::to_string(GetLastError()));
104 |     }
105 |     // Convert each wide argument with narrow() and collect the results in order.
106 |     result.reserve(static_cast<size_t>(argc));
107 |     for(size_t i = 0; i < static_cast<size_t>(argc); ++i) {
108 |         result.push_back(narrow(wargv[i]));
109 |     }
110 | 
111 |     return result;
112 | }
113 | #endif
114 | 
115 | /// Command-line arguments, as passed in to this executable, converted to utf-8 on Windows.
116 | CLI11_INLINE const std::vector<const char *> &args() {
117 |     // This function uses initialization via lambdas extensively to take advantage of the thread safety of static
118 |     // variable initialization [stmt.dcl.3]
119 |     // The returned pointers refer to storage owned by function-local statics, so they stay valid after the call.
120 | #ifdef _WIN32
121 |     static const std::vector<const char *> static_args = [] {
122 |         static const std::vector<std::string> static_args_as_strings = compute_win32_argv();
123 | 
124 |         std::vector<const char *> static_args_result;
125 |         static_args_result.reserve(static_args_as_strings.size());
126 | 
127 |         for(const auto &arg : static_args_as_strings) {
128 |             static_args_result.push_back(arg.data());
129 |         }
130 | 
131 |         return static_args_result;
132 |     }();
133 | 
134 |     return static_args;
135 | 
136 | #elif defined(__APPLE__)
137 | 
138 |     return static_args;
139 | 
140 | #else
141 |     static const std::vector<const char *> static_args = [] {
142 |         static const std::vector<char> static_cmdline = [] {
143 |             // On posix, retrieve arguments from /proc/self/cmdline, separated by null terminators.
144 |             std::vector<char> cmdline;
145 | 
146 |             auto deleter = [](FILE *f) { std::fclose(f); };
147 |             std::unique_ptr<FILE, decltype(deleter)> fp_unique(std::fopen("/proc/self/cmdline", "r"), deleter);
148 |             FILE *fp = fp_unique.get();
149 |             if(!fp) {
150 |                 throw std::runtime_error("could not open /proc/self/cmdline for reading");  // LCOV_EXCL_LINE
151 |             }
152 | 
153 |             size_t size = 0;
154 |             while(std::feof(fp) == 0) {
155 |                 cmdline.resize(size + 128);
156 |                 size += std::fread(cmdline.data() + size, 1, 128, fp);
157 | 
158 |                 if(std::ferror(fp) != 0) {
159 |                     throw std::runtime_error("error during reading /proc/self/cmdline");  // LCOV_EXCL_LINE
160 |                 }
161 |             }
162 |             cmdline.resize(size);
163 |             if(!cmdline.empty() && cmdline.back() != '\0') { cmdline.push_back('\0'); }  // nothing above guarantees a final NUL
164 |             return cmdline;
165 |         }();
166 | 
167 |         std::size_t argc = static_cast<std::size_t>(std::count(static_cmdline.begin(), static_cmdline.end(), '\0'));
168 |         std::vector<const char *> static_args_result;
169 |         static_args_result.reserve(argc);
170 |         // The buffer is empty or NUL-terminated here, so find() always hits a NUL and `+ 1` never steps past end().
171 |         for(auto it = static_cmdline.begin(); it != static_cmdline.end();
172 |             it = std::find(it, static_cmdline.end(), '\0') + 1) {
173 |             static_args_result.push_back(static_cmdline.data() + (it - static_cmdline.begin()));
174 |         }
175 | 
176 |         return static_args_result;
177 |     }();
178 | 
179 |     return static_args;
180 | #endif
181 | }
182 | 
183 | }  // namespace detail
184 | 
185 | CLI11_INLINE const char *const *argv() { const auto &captured = detail::args(); return captured.data(); }
186 | CLI11_INLINE int argc() { const auto count = detail::args().size(); return static_cast<int>(count); }
187 | 
188 | // [CLI11:argv_inl_hpp:end]
189 | }  // namespace CLI
190 | 


--------------------------------------------------------------------------------
/common/CLI/impl/Encoding_inl.hpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
  2 | // under NSF AWARD 1414736 and by the respective contributors.
  3 | // All rights reserved.
  4 | //
  5 | // SPDX-License-Identifier: BSD-3-Clause
  6 | 
  7 | #pragma once
  8 | 
  9 | // This include is only needed for IDEs to discover symbols
 10 | #include <CLI/Encoding.hpp>
 11 | #include <CLI/Macros.hpp>
 12 | 
 13 | // [CLI11:public_includes:set]
 14 | #include <array>
 15 | #include <clocale>
 16 | #include <cstdlib>
 17 | #include <cstring>
 18 | #include <cwchar>
 19 | #include <locale>
 20 | #include <stdexcept>
 21 | #include <string>
 22 | #include <type_traits>
 23 | #include <utility>
 24 | // [CLI11:public_includes:end]
 25 | 
 26 | namespace CLI {
 27 | // [CLI11:encoding_inl_hpp:verbatim]
 28 | 
 29 | namespace detail {
 30 | 
 31 | #if !CLI11_HAS_CODECVT
 32 | /// Try each acceptable unicode locale in order and keep the first one std::setlocale accepts; throw if none is.
 33 | CLI11_INLINE void set_unicode_locale() {
 34 |     static const std::array<const char *, 3> candidates{{"C.UTF-8", "en_US.UTF-8", ".UTF-8"}};
 35 |     bool installed = false;
 36 |     for(std::size_t idx = 0; idx < candidates.size() && !installed; ++idx) {
 37 |         installed = std::setlocale(LC_ALL, candidates[idx]) != nullptr;
 38 |     }
 39 |     if(!installed) {
 40 |         throw std::runtime_error("CLI::narrow: could not set locale to C.UTF-8");
 41 |     }
 42 | }
 43 | /// Holds a callable by value and invokes it from the destructor, i.e. when the guard object is destroyed.
 44 | template <typename F> struct scope_guard_t {
 45 |     F closure;
 46 |     // NOTE(review): copy/move are not disabled, so every copy of a guard would run the closure again — confirm intended.
 47 |     explicit scope_guard_t(F closure_) : closure(closure_) {}
 48 |     ~scope_guard_t() { closure(); }
 49 | };
 50 | /// Factory for scope_guard_t. NOTE(review): returned by value; relies on the temporary being elided — confirm pre-C++17.
 51 | template <typename F> CLI11_NODISCARD CLI11_INLINE scope_guard_t<F> scope_guard(F &&closure) {
 52 |     return scope_guard_t<F>{std::forward<F>(closure)};
 53 | }
 54 | 
 55 | #endif  // !CLI11_HAS_CODECVT
 56 | 
 57 | CLI11_DIAGNOSTIC_PUSH
 58 | CLI11_DIAGNOSTIC_IGNORE_DEPRECATED
 59 | /// Convert the wide range [str, str + str_size) to a narrow std::string; throws std::runtime_error on conversion failure.
 60 | CLI11_INLINE std::string narrow_impl(const wchar_t *str, std::size_t str_size) {
 61 | #if CLI11_HAS_CODECVT
 62 | #ifdef _WIN32
 63 |     return std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>>().to_bytes(str, str + str_size);
 64 | 
 65 | #else
 66 |     return std::wstring_convert<std::codecvt_utf8<wchar_t>>().to_bytes(str, str + str_size);
 67 | 
 68 | #endif  // _WIN32
 69 | #else   // CLI11_HAS_CODECVT
 70 |     const std::wstring bounded(str, str_size);  // wcsrtombs reads up to a terminator, so pin one at str_size
 71 |     std::mbstate_t state = std::mbstate_t();
 72 |     const wchar_t *it = bounded.c_str();   // cursor for the sizing pass
 73 |     const wchar_t *src = bounded.c_str();  // cursor for the converting pass
 74 |     std::string old_locale = std::setlocale(LC_ALL, nullptr);
 75 |     auto sg = scope_guard([&] { std::setlocale(LC_ALL, old_locale.c_str()); });  // puts the previous locale back on exit
 76 |     set_unicode_locale();
 77 | 
 78 |     std::size_t new_size = std::wcsrtombs(nullptr, &it, 0, &state);
 79 |     if(new_size == static_cast<std::size_t>(-1)) {
 80 |         throw std::runtime_error("CLI::narrow: conversion error in std::wcsrtombs at offset " +
 81 |                                  std::to_string(it - bounded.c_str()));
 82 |     }
 83 |     std::string result(new_size, '\0');
 84 |     std::wcsrtombs(const_cast<char *>(result.data()), &src, new_size, &state);
 85 | 
 86 |     return result;
 87 | 
 88 | #endif  // CLI11_HAS_CODECVT
 89 | }
 90 | /// Convert the narrow range [str, str + str_size) to a std::wstring; throws std::runtime_error on conversion failure.
 91 | CLI11_INLINE std::wstring widen_impl(const char *str, std::size_t str_size) {
 92 | #if CLI11_HAS_CODECVT
 93 | #ifdef _WIN32
 94 |     return std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>>().from_bytes(str, str + str_size);
 95 | 
 96 | #else
 97 |     return std::wstring_convert<std::codecvt_utf8<wchar_t>>().from_bytes(str, str + str_size);
 98 | 
 99 | #endif  // _WIN32
100 | #else   // CLI11_HAS_CODECVT
101 |     const std::string bounded(str, str_size);  // mbsrtowcs reads up to a terminator, so pin one at str_size
102 |     std::mbstate_t state = std::mbstate_t();
103 |     const char *it = bounded.c_str();   // cursor for the sizing pass
104 |     const char *src = bounded.c_str();  // cursor for the converting pass
105 |     std::string old_locale = std::setlocale(LC_ALL, nullptr);
106 |     auto sg = scope_guard([&] { std::setlocale(LC_ALL, old_locale.c_str()); });  // puts the previous locale back on exit
107 |     set_unicode_locale();
108 | 
109 |     std::size_t new_size = std::mbsrtowcs(nullptr, &it, 0, &state);
110 |     if(new_size == static_cast<std::size_t>(-1)) {
111 |         throw std::runtime_error("CLI::widen: conversion error in std::mbsrtowcs at offset " +
112 |                                  std::to_string(it - bounded.c_str()));
113 |     }
114 |     std::wstring result(new_size, L'\0');
115 |     std::mbsrtowcs(const_cast<wchar_t *>(result.data()), &src, new_size, &state);
116 | 
117 |     return result;
118 | 
119 | #endif  // CLI11_HAS_CODECVT
120 | }
121 | 
122 | CLI11_DIAGNOSTIC_POP
123 | 
124 | }  // namespace detail
125 | /// narrow(): wide text -> std::string. Every overload forwards a (pointer, length) pair to detail::narrow_impl.
126 | CLI11_INLINE std::string narrow(const wchar_t *str, std::size_t str_size) { return detail::narrow_impl(str, str_size); }
127 | CLI11_INLINE std::string narrow(const std::wstring &str) { return detail::narrow_impl(str.data(), str.size()); }
128 | // Flawfinder: ignore
129 | CLI11_INLINE std::string narrow(const wchar_t *str) { return detail::narrow_impl(str, std::wcslen(str)); }
130 | /// widen(): narrow text -> std::wstring. Every overload forwards a (pointer, length) pair to detail::widen_impl.
131 | CLI11_INLINE std::wstring widen(const char *str, std::size_t str_size) { return detail::widen_impl(str, str_size); }
132 | CLI11_INLINE std::wstring widen(const std::string &str) { return detail::widen_impl(str.data(), str.size()); }
133 | // Flawfinder: ignore
134 | CLI11_INLINE std::wstring widen(const char *str) { return detail::widen_impl(str, std::strlen(str)); }
135 | // The string_view overloads below are only compiled when CLI11_CPP17 is defined.
136 | #ifdef CLI11_CPP17
137 | CLI11_INLINE std::string narrow(std::wstring_view str) { return detail::narrow_impl(str.data(), str.size()); }
138 | CLI11_INLINE std::wstring widen(std::string_view str) { return detail::widen_impl(str.data(), str.size()); }
139 | #endif  // CLI11_CPP17
140 | /// to_path(): build a std::filesystem::path from text; the text is widened first on Windows, passed as-is elsewhere.
141 | #if defined CLI11_HAS_FILESYSTEM && CLI11_HAS_FILESYSTEM > 0
142 | CLI11_INLINE std::filesystem::path to_path(std::string_view str) {
143 |     return std::filesystem::path{
144 | #ifdef _WIN32
145 |         widen(str)
146 | #else
147 |         str
148 | #endif  // _WIN32
149 |     };
150 | }
151 | #endif  // CLI11_HAS_FILESYSTEM
152 | 
153 | // [CLI11:encoding_inl_hpp:end]
154 | }  // namespace CLI
155 | 


--------------------------------------------------------------------------------
/common/CLI/impl/Split_inl.hpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2017-2023, University of Cincinnati, developed by Henry Schreiner
  2 | // under NSF AWARD 1414736 and by the respective contributors.
  3 | // All rights reserved.
  4 | //
  5 | // SPDX-License-Identifier: BSD-3-Clause
  6 | 
  7 | #pragma once
  8 | 
  9 | // This include is only needed for IDEs to discover symbols
 10 | #include <CLI/Split.hpp>
 11 | 
 12 | // [CLI11:public_includes:set]
 13 | #include <string>
 14 | #include <tuple>
 15 | #include <utility>
 16 | #include <vector>
 17 | // [CLI11:public_includes:end]
 18 | 
 19 | #include <CLI/Error.hpp>
 20 | #include <CLI/StringTools.hpp>
 21 | 
 22 | namespace CLI {
 23 | // [CLI11:split_inl_hpp:verbatim]
 24 | 
 25 | namespace detail {
 26 | /// Split "-xREST" into name "x" and rest "REST"; returns false and leaves both outputs alone for anything else.
 27 | CLI11_INLINE bool split_short(const std::string &current, std::string &name, std::string &rest) {
 28 |     if(current.size() <= 1 || current[0] != '-' || !valid_first_char(current[1])) {
 29 |         return false;
 30 |     }
 31 |     name = current.substr(1, 1);
 32 |     rest = current.substr(2);
 33 |     return true;
 34 | }
 35 | /// Split "--name" or "--name=value"; returns false and leaves both outputs alone when `current` is not a long option.
 36 | CLI11_INLINE bool split_long(const std::string &current, std::string &name, std::string &value) {
 37 |     if(current.size() <= 2 || current.compare(0, 2, "--") != 0 || !valid_first_char(current[2])) {
 38 |         return false;
 39 |     }
 40 |     const auto eq_pos = current.find('=');
 41 |     if(eq_pos == std::string::npos) {
 42 |         name = current.substr(2);
 43 |         value = "";
 44 |         return true;
 45 |     }
 46 |     name = current.substr(2, eq_pos - 2);
 47 |     value = current.substr(eq_pos + 1);
 48 |     return true;
 49 | }
 50 | /// Split "/name" or "/name:value"; returns false and leaves both outputs alone when `current` is not in that form.
 51 | CLI11_INLINE bool split_windows_style(const std::string &current, std::string &name, std::string &value) {
 52 |     if(current.size() <= 1 || current[0] != '/' || !valid_first_char(current[1])) {
 53 |         return false;
 54 |     }
 55 |     const auto colon_pos = current.find(':');
 56 |     if(colon_pos == std::string::npos) {
 57 |         name = current.substr(1);
 58 |         value = "";
 59 |         return true;
 60 |     }
 61 |     name = current.substr(1, colon_pos - 1);
 62 |     value = current.substr(colon_pos + 1);
 63 |     return true;
 64 | }
 65 | /// Cut `current` at every ',' and return the pieces in order, each passed through trim_copy (always at least one piece).
 66 | CLI11_INLINE std::vector<std::string> split_names(std::string current) {
 67 |     std::vector<std::string> pieces;
 68 |     std::size_t begin = 0;
 69 |     for(std::size_t comma = current.find(','); comma != std::string::npos; comma = current.find(',', begin)) {
 70 |         pieces.push_back(trim_copy(current.substr(begin, comma - begin)));
 71 |         begin = comma + 1;
 72 |     }
 73 |     pieces.push_back(trim_copy(current.substr(begin)));
 74 |     return pieces;
 75 | }
 76 | /// From a comma-separated name list, return (bare name, default) for each "name{default}" or '!'-prefixed entry.
 77 | CLI11_INLINE std::vector<std::pair<std::string, std::string>> get_default_flag_values(const std::string &str) {
 78 |     std::vector<std::string> flags = split_names(str);
 79 |     // Drop every entry that carries no default, i.e. neither "...{...}" nor a leading '!'.
 80 |     const auto lacks_default = [](const std::string &name) {
 81 |         return name.empty() || !(((name.find('{') != std::string::npos) && (name.back() == '}')) || (name[0] == '!'));
 82 |     };
 83 |     flags.erase(std::remove_if(flags.begin(), flags.end(), lacks_default), flags.end());
 84 | 
 85 |     std::vector<std::pair<std::string, std::string>> output;
 86 |     output.reserve(flags.size());
 87 |     for(auto &flag : flags) {
 88 |         const auto brace_pos = flag.find('{');
 89 |         const bool braced = (brace_pos != std::string::npos) && (flag.back() == '}');
 90 |         std::string defval = braced ? flag.substr(brace_pos + 1) : std::string("false");
 91 |         if(braced) {
 92 |             defval.pop_back();      // strip the closing '}'
 93 |             flag.erase(brace_pos);  // keep only the text before '{'
 94 |         }
 95 |         // remove the leading '-' / '!' characters so only the bare name is stored
 96 |         flag.erase(0, flag.find_first_not_of("-!"));
 97 |         output.emplace_back(flag, defval);
 98 |     }
 99 | 
100 |     return output;
101 | }
102 | /// Sort names into (short names, long names, positional name); throws a BadNameString for any malformed entry.
103 | CLI11_INLINE std::tuple<std::vector<std::string>, std::vector<std::string>, std::string>
104 | get_names(const std::vector<std::string> &input) {
105 |     std::vector<std::string> shorts;
106 |     std::vector<std::string> longs;
107 |     std::string positional;
108 | 
109 |     for(const std::string &entry : input) {
110 |         if(entry.empty()) {
111 |             continue;
112 |         }
113 |         const bool one_dash = entry.size() > 1 && entry[0] == '-' && entry[1] != '-';
114 |         if(one_dash) {
115 |             // only exactly "-x" is accepted: longer text is missing a dash, and x must pass valid_first_char
116 |             if(entry.size() > 2)
117 |                 throw BadNameString::MissingDash(entry);
118 |             if(!valid_first_char(entry[1]))
119 |                 throw BadNameString::OneCharName(entry);
120 |             shorts.emplace_back(1, entry[1]);
121 |             continue;
122 |         }
123 |         if(entry.size() > 2 && entry.compare(0, 2, "--") == 0) {
124 |             const std::string stripped = entry.substr(2);
125 |             if(!valid_name_string(stripped))
126 |                 throw BadNameString::BadLongName(stripped);
127 |             longs.push_back(stripped);
128 |             continue;
129 |         }
130 |         if(entry == "-" || entry == "--")
131 |             throw BadNameString::DashesOnly(entry);
132 |         if(!positional.empty())
133 |             throw BadNameString::MultiPositionalNames(entry);
134 |         positional = entry;
135 |     }
136 |     return std::make_tuple(shorts, longs, positional);
137 | }
138 | 
139 | }  // namespace detail
140 | // [CLI11:split_inl_hpp:end]
141 | }  // namespace CLI
142 | 


--------------------------------------------------------------------------------
/common/common_def.cc:
--------------------------------------------------------------------------------
 1 | #include "common_def.h"
 2 | 
 3 | int kernel_info_m_next_uid = 0;
 4 | // Address-layout inputs. NOTE(review): sizes look like byte counts — confirm against the users of these globals.
 5 | unsigned long long GLOBAL_HEAP_START = 0xC0000000;
 6 | // 96 * 1024
 7 | unsigned long long SHARED_MEM_SIZE_MAX = 96 * (1 << 10);
 8 | // 1 << 14 == 16384
 9 | unsigned long long LOCAL_MEM_SIZE_MAX = 1 << 14;
10 | 
11 | unsigned MAX_STREAMING_MULTIPROCESSORS = 80;
12 | 
13 | unsigned MAX_THREAD_PER_SM = 1 << 11;
14 | // Values after MAX_WARP_PER_SM are derived once, at static initialization; later writes to the inputs do not update them.
15 | unsigned MAX_WARP_PER_SM = 1 << 6;
16 | unsigned long long TOTAL_LOCAL_MEM_PER_SM =
17 |     static_cast<unsigned long long>(MAX_THREAD_PER_SM) * LOCAL_MEM_SIZE_MAX;
18 | unsigned long long TOTAL_SHARED_MEM =
19 |     static_cast<unsigned long long>(MAX_STREAMING_MULTIPROCESSORS) * SHARED_MEM_SIZE_MAX;
20 | unsigned long long TOTAL_LOCAL_MEM =  // widen first: SM count * threads per SM would otherwise multiply in 32-bit unsigned
21 |     static_cast<unsigned long long>(MAX_STREAMING_MULTIPROCESSORS) * MAX_THREAD_PER_SM * LOCAL_MEM_SIZE_MAX;
22 | unsigned long long SHARED_GENERIC_START = GLOBAL_HEAP_START - TOTAL_SHARED_MEM;
23 | unsigned long long LOCAL_GENERIC_START = SHARED_GENERIC_START - TOTAL_LOCAL_MEM;
24 | unsigned long long STATIC_ALLOC_LIMIT =
25 |     GLOBAL_HEAP_START - (TOTAL_LOCAL_MEM + TOTAL_SHARED_MEM);
26 | 


--------------------------------------------------------------------------------
/common/common_def.h:
--------------------------------------------------------------------------------
  1 | #include <bitset>
  2 | #include <fstream>
  3 | #include <iostream>
  4 | 
  5 | #ifndef COMMON_DEF_H
  6 | #define COMMON_DEF_H
  7 | 
  8 | #define USE_BOOST
  9 | #define gpgpu_concurrent_kernel_sm false
 10 | 
 11 | #define ENABLE_SAMPLING_POINT
 12 | 
 13 | // #define DUMP_THREAD_NUM
 14 | // #define DUMP_TIME_SUMMARY
 15 | 
 16 | #define WARP_SIZE 32
 17 | #define MAX_DST 1
 18 | #define MAX_SRC 4
 19 | 
 20 | #define MAX_WARP_PER_SHADER 64
 21 | 
 22 | #define MAX_INPUT_VALUES 24
 23 | #define MAX_OUTPUT_VALUES 8
 24 | 
 25 | #define MAX_REG_OPERANDS 32
 26 | 
 27 | enum command_type {
 28 |   kernel_launch = 1,
 29 |   cpu_gpu_mem_copy,
 30 |   gpu_cpu_mem_copy,
 31 | };
 32 | 
 33 | enum address_space { GLOBAL_MEM = 1, SHARED_MEM, LOCAL_MEM, TEX_MEM };
 34 | 
 35 | enum address_scope {
 36 |   L1_CACHE = 1,
 37 |   L2_CACHE,
 38 |   SYS_MEM,
 39 | };
 40 | 
 41 | enum address_format { list_all = 0, base_stride = 1, base_delta = 2 };
 42 | 
 43 | const unsigned MAX_WARP_SIZE = 32;
 44 | typedef std::bitset<MAX_WARP_SIZE> active_mask_t;
 45 | 
 46 | const unsigned MAX_ACCESSES_PER_INSN_PER_THREAD = 8;
 47 | 
 48 | typedef unsigned long long new_addr_type;
 49 | 
 50 | const unsigned MAX_MEMORY_ACCESS_SIZE = 128;
 51 | typedef std::bitset<MAX_MEMORY_ACCESS_SIZE> mem_access_byte_mask_t;
 52 | 
 53 | const unsigned SECTOR_CHUNCK_SIZE = 4;
 54 | const unsigned SECTOR_SIZE = 32;
 55 | typedef std::bitset<SECTOR_CHUNCK_SIZE> mem_access_sector_mask_t;
 56 | 
 57 | enum _memory_op_t { no_memory_op = 0, memory_load, memory_store };
 58 | 
 59 | enum mem_operation_t { NOT_TEX, TEX };
 60 | typedef enum mem_operation_t mem_operation;
 61 | // NOTE(review): MEM_ACCESS_TYPE_TUP_DEF is never expanded in this header; the enum below is written out by hand — keep both lists in sync.
 62 | #define MEM_ACCESS_TYPE_TUP_DEF                                                \
 63 |   MA_TUP_BEGIN(mem_access_type)                                                \
 64 |   MA_TUP(GLOBAL_ACC_R), MA_TUP(LOCAL_ACC_R), MA_TUP(CONST_ACC_R),              \
 65 |       MA_TUP(TEXTURE_ACC_R), MA_TUP(GLOBAL_ACC_W), MA_TUP(LOCAL_ACC_W),        \
 66 |       MA_TUP(L1_WRBK_ACC), MA_TUP(L2_WRBK_ACC), MA_TUP(INST_ACC_R),            \
 67 |       MA_TUP(L1_WR_ALLOC_R), MA_TUP(L2_WR_ALLOC_R),                            \
 68 |       MA_TUP(NUM_MEM_ACCESS_TYPE) MA_TUP_END(mem_access_type)
 69 | 
 70 | #define MA_TUP_BEGIN(X) enum X {
 71 | #define MA_TUP(X) X
 72 | #define MA_TUP_END(X)                                                          \
 73 |   }                                                                            \
 74 |   ;
 75 | enum mem_access_type {
 76 |   GLOBAL_ACC_R,
 77 |   LOCAL_ACC_R,
 78 |   CONST_ACC_R,
 79 |   TEXTURE_ACC_R,
 80 |   GLOBAL_ACC_W,
 81 |   LOCAL_ACC_W,
 82 |   L1_WRBK_ACC,
 83 |   L2_WRBK_ACC,
 84 |   INST_ACC_R,
 85 |   L1_WR_ALLOC_R,
 86 |   L2_WR_ALLOC_R,
 87 |   NUM_MEM_ACCESS_TYPE
 88 | };
 89 | #undef MA_TUP_BEGIN
 90 | #undef MA_TUP
 91 | #undef MA_TUP_END
 92 | 
 93 | enum _memory_space_t {
 94 |   undefined_space = 0,
 95 |   reg_space,
 96 |   local_space,
 97 |   shared_space,
 98 |   sstarr_space,
 99 |   param_space_unclassified,
100 |   param_space_kernel,
101 |   param_space_local,
102 |   const_space,
103 |   tex_space,
104 |   surf_space,
105 |   global_space,
106 |   generic_space,
107 |   instruction_space
108 | };
109 | 
/// Cache operator attached to a memory instruction; `CACHE_UNDEFINED` is the
/// zero value. The blank lines group the enumerators as in the original
/// source. NOTE(review): the names presumably mirror the PTX load/store cache
/// operators -- confirm against the trace parser before relying on that.
enum cache_operator_type {
  CACHE_UNDEFINED,

  CACHE_ALL,
  CACHE_LAST_USE,
  CACHE_VOLATILE,
  CACHE_L1,

  CACHE_STREAMING,
  CACHE_GLOBAL,

  CACHE_WRITE_BACK,
  CACHE_WRITE_THROUGH
};
124 | 
125 | #define MAX_REG_OPERANDS 32
126 | 
127 | #define MAX_KERNELS_NUM 300
128 | 
129 | #ifdef USE_BOOST
130 | 
131 | #include <boost/mpi.hpp>
132 | #include <boost/serialization/map.hpp>
133 | #include <boost/serialization/vector.hpp>
134 | #endif
135 | 
136 | #ifdef USE_BOOST
137 | void simple_mpi_test(int argc, char **argv);
138 | #endif
139 | 
140 | extern int kernel_info_m_next_uid;
141 | 
142 | extern unsigned long long GLOBAL_HEAP_START;
143 | 
144 | extern unsigned long long SHARED_MEM_SIZE_MAX;
145 | 
146 | extern unsigned long long LOCAL_MEM_SIZE_MAX;
147 | 
148 | extern unsigned MAX_STREAMING_MULTIPROCESSORS;
149 | 
150 | extern unsigned MAX_THREAD_PER_SM;
151 | 
152 | extern unsigned MAX_WARP_PER_SM;
153 | extern unsigned long long TOTAL_LOCAL_MEM_PER_SM;
154 | extern unsigned long long TOTAL_SHARED_MEM;
155 | extern unsigned long long TOTAL_LOCAL_MEM;
156 | extern unsigned long long SHARED_GENERIC_START;
157 | extern unsigned long long LOCAL_GENERIC_START;
158 | extern unsigned long long STATIC_ALLOC_LIMIT;
159 | 
160 | #endif
161 | 


--------------------------------------------------------------------------------
/common/option_parser.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2009-2011, Tor M. Aamodt, Wilson W.L. Fung
 2 | // The University of British Columbia
 3 | // All rights reserved.
 4 | //
 5 | // Redistribution and use in source and binary forms, with or without
 6 | // modification, are permitted provided that the following conditions are met:
 7 | //
 8 | // Redistributions of source code must retain the above copyright notice, this
 9 | // list of conditions and the following disclaimer.
10 | // Redistributions in binary form must reproduce the above copyright notice,
11 | // this list of conditions and the following disclaimer in the documentation
12 | // and/or other materials provided with the distribution. Neither the name of
13 | // The University of British Columbia nor the names of its contributors may be
14 | // used to endorse or promote products derived from this software without
15 | // specific prior written permission.
16 | //
17 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 | // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 | // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
21 | // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 | // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 | // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 | // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 | // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 | // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 | // POSSIBILITY OF SUCH DAMAGE.
28 | 
29 | #pragma once
30 | 
31 | #include <stdio.h>
32 | #include <stdlib.h>
33 | #include <string>
34 | 
35 | typedef class OptionParser *option_parser_t;
36 | 
/// Data-type tag of a registered option; passed as the `type` argument of
/// option_parser_register() alongside the untyped `void *variable`.
enum option_dtype {
  OPT_INT32,
  OPT_UINT32,
  OPT_INT64,
  OPT_UINT64,
  OPT_BOOL,
  OPT_FLOAT,
  OPT_DOUBLE,
  OPT_CHAR,
  OPT_CSTR
};
48 | 
49 | option_parser_t option_parser_create();
50 | void option_parser_destroy(option_parser_t opp);
51 | 
52 | void option_parser_register(option_parser_t opp, const char *name,
53 |                             enum option_dtype type, void *variable,
54 |                             const char *desc, const char *defaultvalue);
55 | 
56 | void option_parser_cmdline(option_parser_t opp, int argc, const char *argv[]);
57 | 
58 | void option_parser_cfgfile(option_parser_t opp, const char *filename);
59 | 
60 | void option_parser_delimited_string(option_parser_t opp,
61 |                                     const char *inputstring,
62 |                                     const char *delimiters);
63 | 
64 | void option_parser_print(option_parser_t opp, FILE *fout);
65 | void option_parser_print_limited(option_parser_t opp, FILE *fout, int limited,
66 |                                  std::string pattern1, std::string pattern2);
67 | 


--------------------------------------------------------------------------------
/hw-component/IBuffer.cc:
--------------------------------------------------------------------------------
 1 | #include "IBuffer.h"
 2 | 
 3 | #define PRINT_AT for(unsigned i=0; i<40; ++i) std::cout << "@"; std::cout << std::endl;
 4 | 
 5 | IBuffer::IBuffer(const unsigned smid, const unsigned num_warps)
 6 |   : m_smid(smid), m_num_warps(num_warps) {
 7 |   m_ibuffer.resize(num_warps);
 8 | }
 9 | 
10 | void IBuffer::print_ibuffer() const {
11 |   for (unsigned i = 0; i < m_num_warps; i++) {
12 |     std::cout << "warp - " << i << ": ";
13 |     for (auto it = m_ibuffer[i].begin(); it != m_ibuffer[i].end(); it++) {
14 |       std::cout << "(" << it->pc << ", " << it->wid << ", " << it->kid
15 |                 << "), ";
16 |     }
17 |     std::cout << std::endl;
18 |   }
19 | }
20 | 
21 | void IBuffer::print_ibuffer(const unsigned gwarp_start, const unsigned gwarp_end) const {
22 | PRINT_AT;
23 |   for (unsigned i = gwarp_start; i < gwarp_end; i++) {
24 |     std::cout << "    Ibuffer (pc, wid, kid) warp - " << i << ": ";
25 |     for (auto it = m_ibuffer[i].begin(); it != m_ibuffer[i].end(); it++) {
26 |       std::cout << "(" << it->pc << ", " << it->wid << ", " << it->kid
27 |                 << "), ";
28 |     }
29 |     std::cout << std::endl;
30 |   }
31 | PRINT_AT;
32 | }
33 | 
34 | void IBuffer::print_ibuffer(const unsigned gwarp_id) const {
35 |   std::cout << "    Ibuffer (pc, wid, kid) warp - " << gwarp_id << ": ";
36 |   for (auto it = m_ibuffer[gwarp_id].begin(); it != m_ibuffer[gwarp_id].end();
37 |        it++) {
38 |     std::cout << "(" << it->pc << ", " << it->wid << ", " << it->kid << "), ";
39 |   }
40 |   std::cout << std::endl;
41 | }
42 | 


--------------------------------------------------------------------------------
/hw-component/IBuffer.h:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <utility>
 3 | #include <vector>
 4 | 
 5 | #ifndef IBUFFER_H
 6 | #define IBUFFER_H
 7 | 
 8 | struct ibuffer_entry {
 9 |   ibuffer_entry(const unsigned pc, const unsigned wid,
10 |                 const unsigned kid, const unsigned uid)
11 |     : pc(pc), wid(wid), kid(kid), uid(uid) {}
12 | 
13 |   unsigned pc, wid, kid, uid;
14 | };
15 | 
16 | class IBuffer {
17 | public:
18 |   IBuffer(const unsigned smid, const unsigned num_warps);
19 | 
20 |   /// The `allKernelsWarpID` uniquely identifies each warp across all
21 |   /// kernels within the application.  For an application with multiple
22 |   /// kernels (e.g., 100 kernels), each containing warps numbered from
23 |   /// 0 to 10, the `allKernelsWarpID` ranges from 0 to 1000, which pro-
24 |   /// vides a global unique identifier for every warp.
25 |   inline bool is_empty(const unsigned allKernelsWarpID) const {
26 |     return m_ibuffer[allKernelsWarpID].empty();
27 |   }
28 | 
29 |   inline bool has_free_slot(const unsigned allKernelsWarpID) const {
30 |     return m_ibuffer[allKernelsWarpID].size() < 2;
31 |   }
32 | 
33 |   inline bool is_not_empty(const unsigned allKernelsWarpID) const {
34 |     return !is_empty(allKernelsWarpID);
35 |   }
36 | 
37 |   void push_back(const unsigned allKernelsWarpID, ibuffer_entry entry) {
38 |     m_ibuffer[allKernelsWarpID].push_back(entry);
39 |   }
40 | 
41 |   /// TODO: Using double-ended queues instead of vectors may speed up
42 |   /// the `pop_front` function, because std::d eque supports efficient
43 |   /// header deletion operations.
44 |   ibuffer_entry pop_front(const unsigned allKernelsWarpID) {
45 |     ibuffer_entry entry = std::move(m_ibuffer[allKernelsWarpID].front());
46 |     m_ibuffer[allKernelsWarpID].erase(
47 |         m_ibuffer[allKernelsWarpID].begin());
48 |     return entry;
49 |   }
50 | 
51 |   /// TODO: Merging `pop_front` and `front` to reduce the overhead of
52 |   /// duplicate moves.
53 |   inline const ibuffer_entry& front(const unsigned allKernelsWarpID) const {
54 |     return m_ibuffer[allKernelsWarpID].front();
55 |   }
56 | 
57 |   /// Return the size of `m_ibuffer`.
58 |   inline std::size_t size() const { return m_ibuffer.size(); }
59 | 
60 |   void print_ibuffer() const;
61 |   void print_ibuffer(const unsigned gwarp_id) const;
62 |   void print_ibuffer(const unsigned gwarp_start, const unsigned gwarp_end) const;
63 | 
64 | private:
65 |   unsigned m_smid;
66 |   std::vector<std::vector<ibuffer_entry>> m_ibuffer;
67 |   unsigned m_num_warps;
68 | };
69 | 
70 | #endif
71 | 


--------------------------------------------------------------------------------
/hw-component/RegBankAlloc.cc:
--------------------------------------------------------------------------------
 1 | #include "RegBankAlloc.h"
 2 | 
 3 | regBankAlloc::regBankAlloc(
 4 |     const unsigned smid,
 5 |     const unsigned num_banks,
 6 |     const unsigned num_warp_scheds,
 7 |     const unsigned bank_warp_shift,
 8 |     const unsigned num_banks_per_sched) 
 9 |   : m_smid(smid), m_num_banks(num_banks),
10 |     m_num_warp_scheds(num_warp_scheds),
11 |     m_bank_warp_shift(bank_warp_shift),
12 |     m_num_banks_per_sched(num_banks_per_sched),
13 |     m_sub_core_model(num_warp_scheds > 1) {
14 |   isNumBankPowerOfTwo = (m_num_banks & (m_num_banks - 1)) == 0;
15 |   m_bank_state.resize(num_banks, FREE);
16 | }
17 | 
18 | unsigned regBankAlloc::register_bank(const unsigned regnum,
19 |                                      const unsigned wid,
20 |                                      const unsigned sched_id) const {
21 |   unsigned bank = regnum;
22 |   if (m_bank_warp_shift) bank += wid;
23 |   if (m_sub_core_model) {
24 |     unsigned bank_num = (bank % m_num_banks_per_sched) + sched_id * m_num_banks_per_sched;
25 |     assert(bank_num < m_num_banks);
26 |     return bank_num;
27 |   } else {
28 |     // Use the `isPowerOfTwo` variable to decide whether to use bitwise
29 |     // operations for optimization.
30 |     return isNumBankPowerOfTwo ? bank & (m_num_banks - 1) : bank % m_num_banks;
31 |   }
32 | }
33 | 
34 | const RegBankState& regBankAlloc::getBankState(
35 |   const unsigned regnum, const unsigned wid,
36 |   const unsigned sched_id) const {
37 |   unsigned bank_id = register_bank(regnum, wid, sched_id);
38 |   return getBankState(bank_id);
39 | }
40 | 
41 | void regBankAlloc::setBankState(const unsigned regnum,
42 |                                 const unsigned wid,
43 |                                 const unsigned sched_id,
44 |                                 const RegBankState state) noexcept{
45 |   unsigned bank_id = register_bank(regnum, wid, sched_id);
46 |   setBankState(bank_id, state);
47 | }
48 | 
49 | void regBankAlloc::releaseBankState(const unsigned regnum,
50 |                                     const unsigned wid,
51 |                                     const unsigned sched_id) noexcept {
52 |   setBankState(regnum, wid, sched_id, FREE);
53 | }
54 | 
55 | void regBankAlloc::printBankState() const {
56 |   printf("Register Bank State (smid=%u): \n", m_smid);
57 |   for (unsigned i = 0; i < m_num_banks; ++i) {
58 |     printf("  bank %2u: %d\n", i, m_bank_state[i]);
59 |   }
60 | }
61 | 


--------------------------------------------------------------------------------
/hw-component/RegBankAlloc.h:
--------------------------------------------------------------------------------
 1 | #include "stdio.h"
 2 | #include <assert.h>
 3 | #include <vector>
 4 | 
 5 | #ifndef REG_BANK_ALLOC_H
 6 | #define REG_BANK_ALLOC_H
 7 | 
/// State of a single register bank. `FREE` is pinned to 0 and is the value
/// banks are reset to on release; `RegBankStateNUM` is the trailing
/// enumerator and therefore equals the number of real states.
enum RegBankState {
  FREE = 0,
  ON_READING,
  ON_WRITING,
  RegBankStateNUM,
};
14 | 
15 | class regBankAlloc {
16 | public:
17 |   regBankAlloc(const unsigned smid,
18 |                const unsigned num_banks,
19 |                const unsigned num_warp_scheds,
20 |                const unsigned bank_warp_shift,
21 |                const unsigned num_banks_per_sched);
22 | 
23 |   unsigned register_bank(const unsigned regnum, const unsigned wid,
24 |                          const unsigned sched_id) const;
25 | 
26 |   inline const RegBankState& getBankState(const unsigned bank_id) const {
27 |     return m_bank_state[bank_id];
28 |   }
29 | 
30 |   const RegBankState& getBankState(const unsigned regnum, const unsigned wid,
31 |                                    const unsigned sched_id) const;
32 | 
33 |   inline void setBankState(const unsigned bank_id, 
34 |                            const RegBankState state) noexcept {
35 |     m_bank_state[bank_id] = state;
36 |   };
37 | 
38 |   void setBankState(const unsigned regnum, const unsigned wid,
39 |                     const unsigned sched_id, const RegBankState state) noexcept;
40 |   
41 |   inline void releaseBankState(const unsigned bank_id) noexcept {
42 |     setBankState(bank_id, FREE);
43 |   }
44 |   
45 |   void releaseBankState(const unsigned regnum,
46 |                         const unsigned wid,
47 |                         const unsigned sched_id) noexcept;
48 | 
49 |   /// TODO: Using `std::fill_n(m_bank_state.begin(), m_num_banks, FREE);`
50 |   /// to replace the loop can cause additional overhead due to the in-
51 |   /// ability to take advantage of inline functions, as well as the in-
52 |   /// ternal iterator performing bounds checks.
53 |   inline void releaseAllBankStates() noexcept {
54 |     for (unsigned i = 0; i < m_num_banks; ++i) {
55 |       setBankState(i, FREE);
56 |     }
57 |   };
58 | 
59 |   void printBankState() const;
60 | 
61 | private:
62 |   unsigned m_smid;
63 |   unsigned m_num_banks;
64 |   unsigned m_num_warp_scheds;
65 |   unsigned m_bank_warp_shift;
66 |   unsigned m_num_banks_per_sched;
67 |   bool m_sub_core_model;
68 |   std::vector<RegBankState> m_bank_state;
69 | 
70 |   // When `m_num_banks` is a power of 2, a shift operation can be
71 |   // used instead of a modulo operation (%) to improve efficiency.
72 |   bool isNumBankPowerOfTwo;
73 | };
74 | 
75 | #endif
76 | 


--------------------------------------------------------------------------------
/hw-component/Scoreboard.cc:
--------------------------------------------------------------------------------
 1 | #include "Scoreboard.h"
 2 | 
 3 | Scoreboard::Scoreboard(const unsigned smid, 
 4 |                        const unsigned n_warps)
 5 |   : m_smid(smid) {
 6 |   reg_table.resize(n_warps);
 7 |   longopregs.resize(n_warps);
 8 | 
 9 |   /// TODO: For `std::vector<std::unordered_set<int>> reg_table;`, we need
10 |   /// to determine in advance how many registers will be inserted, then use
11 |   /// `reg_table[wid].reserve(size)` to pre-allocate memory. This should
12 |   /// improve performance by reducing the number of dynamic memory allocs
13 |   /// that occur when inserting an element.
14 | }
15 | 
16 | void Scoreboard::reserveRegister(const unsigned wid, 
17 |                                  const int regnum) noexcept {
18 |   auto [iter, inserted] = reg_table[wid].insert(regnum);
19 |   if (!inserted) {
20 |     printf("Error: trying to reserve an already reserved register (sid=%u, "
21 |            "wid=%u, regnum=%d).\n", m_smid, wid, regnum);
22 |     abort();
23 |   }
24 | }
25 | 
26 | const bool Scoreboard::islongop(const unsigned wid, const int regnum) const {
27 |   if (regnum == -1) return false;
28 |   else return longopregs[wid].find(regnum) != longopregs[wid].end();
29 | }
30 | 
31 | void Scoreboard::reserveRegisters(const unsigned wid, std::vector<int> &regnums,
32 |                                   bool is_load) noexcept {
33 |   std::unordered_set<int> prev_regs;
34 |   for (auto &regnum : regnums) {
35 |     if (regnum > 0 && prev_regs.insert(regnum).second) {
36 |       reserveRegister(wid, regnum);
37 |     }
38 |   }
39 | 
40 |   if (is_load)
41 |     for (auto &regnum : regnums)
42 |       if (regnum > 0) longopregs[wid].insert(regnum);
43 | }
44 | 
45 | void Scoreboard::releaseRegisters(const unsigned wid,
46 |                                   std::vector<int> &regnums) noexcept {
47 |   for (auto &regnum : regnums)
48 |     releaseRegister(wid, regnum);
49 | }
50 | 
51 | bool Scoreboard::checkCollision(const unsigned wid, std::vector<int> &regnums,
52 |                                 const int pred, const int ar1, const int ar2) const {
53 |   if (pred > 0 && reg_table[wid].find(pred) != reg_table[wid].end()) return true;
54 |   if (ar1 > 0 && reg_table[wid].find(ar1) != reg_table[wid].end()) return true;
55 |   if (ar2 > 0 && reg_table[wid].find(ar2) != reg_table[wid].end()) return true;
56 |   for (auto &reg : regnums)
57 |     if (reg > 0 && reg_table[wid].find(reg) != reg_table[wid].end())
58 |       return true;
59 | 
60 |   return false;
61 | }
62 | 
63 | void Scoreboard::printContents() const {
64 |   printf("    Scoreboard contents (sid=%u): \n", m_smid);
65 |   for (unsigned i = 0; i < reg_table.size(); i++) {
66 |     if (reg_table[i].size() == 0)
67 |       continue;
68 |     printContents(i);
69 |   }
70 | }
71 | 
72 | void Scoreboard::printContents(unsigned i) const {
73 |   printf("  wid = %2u: ", i);
74 |   std::unordered_set<int>::const_iterator it;
75 |   for (it = reg_table[i].begin(); it != reg_table[i].end(); it++)
76 |     printf("R%d ", *it);
77 |   printf("\n");
78 | }
79 | 


--------------------------------------------------------------------------------
/hw-component/Scoreboard.h:
--------------------------------------------------------------------------------
 1 | #include <algorithm>
 2 | #include <iostream>
 3 | #include <set>
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <vector>
 7 | #include <unordered_set> 
 8 | #include <cassert>
 9 | 
10 | #ifndef SCOREBOARD_H
11 | #define SCOREBOARD_H
12 | 
/// Per-SM scoreboard: for every warp it tracks the registers with a pending
/// write (`reg_table`) and, among those, the ones reserved by a load
/// (`longopregs`). Warp indices are not bounds-checked.
class Scoreboard {
public:
  Scoreboard(const unsigned smid, const unsigned n_warps);

  /// Reserves every positive register in `regnums` for warp `wid`; with
  /// `is_load` set they are also recorded as long-operation registers.
  void reserveRegisters(const unsigned wid, std::vector<int> &regnums,
                        bool is_load) noexcept;

  /// Releases every register in `regnums` for warp `wid`.
  void releaseRegisters(const unsigned wid, std::vector<int> &regnums) noexcept;

  /// Releases `regnum` for warp `wid`; the sentinel -1 is ignored.
  /// The register is removed from the long-operation set as well, so that
  /// `islongop` stops reporting it once the pending write has completed
  /// (previously the entry was left behind for the rest of the run).
  inline void releaseRegister(const unsigned wid, const int regnum) noexcept {
    if (regnum == -1) return;
    reg_table[wid].erase(regnum);
    longopregs[wid].erase(regnum);
  }

  /// True when any positive register among `regnums`, `pred`, `ar1`, `ar2`
  /// is currently reserved for warp `wid`.
  bool checkCollision(const unsigned wid, std::vector<int> &regnums, const int pred,
                      const int ar1, const int ar2) const;

  /// True while warp `wid` has at least one reserved register.
  /// TODO: Maybe don't need this again.
  inline bool pendingWrites(const unsigned wid) const {
    return !reg_table[wid].empty();
  }

  /// True when `regnum` is recorded as a long-operation register of `wid`.
  /// TODO: Maybe don't need this again.
  const bool islongop(const unsigned wid, const int regnum) const;

  /// Number of registers currently reserved for warp `wid`.
  inline unsigned regs_size(const unsigned wid) const {
    return static_cast<unsigned>(reg_table[wid].size());
  }

  void printContents() const;
  void printContents(unsigned i) const;

private:
  void reserveRegister(const unsigned wid, const int regnum) noexcept;

  int get_sid() const { return m_smid; }

  unsigned m_smid;

  // Indexed by warp id: registers with a pending write.
  std::vector<std::unordered_set<int>> reg_table;
  // Indexed by warp id: registers reserved by a load (see reserveRegisters).
  std::vector<std::unordered_set<int>> longopregs;
};
54 | 
55 | #endif
56 | 


--------------------------------------------------------------------------------
/parda/.gitignore:
--------------------------------------------------------------------------------
1 | *.hist
2 | *.o
3 | *.x
4 | 


--------------------------------------------------------------------------------
/parda/README.md:
--------------------------------------------------------------------------------
  1 | Reuse distance is a well established approach to characterizing data cache locality based on the stack histogram model. 
This analysis has so far been restricted to offline use due to its high cost, often several orders of magnitude larger than the execution time of the analyzed code. Parda is the first parallel algorithm to compute accurate reuse distances by analysis of memory address traces. The algorithm uses a tunable parameter that enables faster analysis when the maximum needed reuse distance is limited by a cache size upper bound.
  3 | 
This program is a Parda implementation that works on file input. The OpenMP implementation of Parda is mainly in `parda_omp.c` and `parda_omp.h`.
  5 | 
  6 | ## Instructions to run file input Parda. 
  7 | 
  8 | ### A. Setup and compile
  9 | 
`Step 0:` Parda uses GLib, a standard Linux library. On an Ubuntu system, install it with the following command.
 11 | 
 12 | ```shell
 13 | sudo apt-get install glib
 14 | ```
 15 | 
 16 | `Step 1:` Download sample trace files from project git web page. 
 17 | 
 18 | `normal_137979.trace` is text file and 
 19 | `binary_137979.trace` is the binary file.
These two files record the trace data of the `ls` command.
 21 | 
 22 | `Step 2:`
 23 | 
 24 | ```shell
 25 | cd /path/to/parda
 26 | ```
 27 | 
The program has currently only been tested with `gcc` and `icc`.
Edit the first three lines of the makefile.
If the machine has `mpicc`, set `MPI=1` to enable MPI parallelism.
Otherwise, set `OMP=1`. To use only the sequential algorithm, comment out both `OMP` and `MPI`.
 32 | 
 33 | ```makefile
 34 | DEBUG = 1
 35 | OMP = 1
 36 | MPI = 1
 37 | ```
 38 | 
 39 | ```shell
 40 | make
 41 | ```
 42 | 
 43 | ### B. Execution instructions
 44 | 
 45 | ```shell
 46 | ./parda.x --help to see how to run with different flags and run with sequential algorithm. 
 47 | ```
 48 | 
 49 | #### Execution arguments:
 50 | 
 51 | ```makefile
 52 | --input: the input trace file name.
 53 | --lines: the total number of lines in the input trace file. 
 54 | --enable-omp: enable program to parallelly run with OpenMP threads.
 55 | --enable-mpi: enable program to parallelly run with MPI.
--enable-seperate: Separate the input file to prepare for running with parallelization.
 57 | ```
 58 | 
 59 | #### 1) Sequential execution:
 60 | 
 61 | ```shell
 62 | ./parda.x --input=normal_137979.trace --lines=137979 > seq.hist 
 63 | ```
 64 | 
 65 | #### 2) Run parda with OpenMP `--enable-omp` flag. 
 66 | 
Before running with OpenMP, the trace file must be separated into as many parts as there are threads. For example, to run with 4 threads:
 68 | 
 69 | ```shell
 70 | ./parda.x --enable-seperate --input=normal_137979.trace --lines=137979 --threads=4
 71 | ```
 72 | 
This produces 4 separated trace files:
 74 | 
 75 | `4_normal_137979.trace_p0.txt`  `4_normal_137979.trace_p1.txt`
 76 | `4_normal_137979.trace_p2.txt`  `4_normal_137979.trace_p3.txt`
 77 | 
 78 | ```shell
 79 | ./parda.x --enable-omp --input=normal_137979.trace --lines=137979 --threads=4 > omp.re 
 80 | ```
 81 | 
 82 | #### 3) Run parda with MPI
 83 | 
 84 | ```shell
 85 | mpirun -np 4 ./parda.x --input=normal_137979.trace --lines=137979 --enable-mpi
 86 | ```
 87 | 
 88 | ### Parda
 89 | 
 90 | Parda is free software: you can redistribute it and/or modify
 91 | it under the terms of the GNU General Public License as published by
 92 | the Free Software Foundation, either version 3 of the License, or
 93 | (at your option) any later version.
 94 | 
 95 | Parda is distributed in the hope that it will be useful,
 96 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 97 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 98 | GNU General Public License for more details.
 99 | 
100 | ### Author: 
101 | Qingpeng Niu
102 | 
103 | ### Contact: 
104 | niuqingpeng at gmail.com
105 | 
### Documentation
107 | 
108 | #### Related publications:
109 | 
110 | ```
111 | PARDA: A Fast Parallel Reuse Distance Analysis Algorithm.
112 | Qingpeng Niu, James Dinan, Qingda Lu and P. Sadayappan.
113 | IEEE IPDPS (IPDPS'12), May 2012, Shanghai, China.
114 | ```


--------------------------------------------------------------------------------
/parda/main.c:
--------------------------------------------------------------------------------
 1 | #include "parda.h"
 2 | #ifdef enable_mpi
 3 | #include "parda_mpi.h"
 4 | #endif
 5 | #ifdef enable_omp
 6 | #include "parda_omp.h"
 7 | #endif
 8 | #include "process_args.h"
 9 | #include "seperate.h"
10 | 
11 | int main(int argc, char **argv) {
12 |   process_args(argc, argv);
13 |   if (is_seperate == 1) {
14 |     parda_seperate_file(inputFileName, threads, lines);
15 |   } else if (is_omp == 0 && is_mpi == 0) {
16 |     DEBUG(printf("This is seq stackdist\n");)
17 |     classical_tree_based_stackdist(inputFileName, lines);
18 |   } else if (is_omp == 1 && is_mpi == 0) {
19 |     DEBUG(printf("This is omp stackdist\n");)
20 | #ifdef enable_omp
21 |     parda_omp_stackdist(inputFileName, lines, threads);
22 | #else
23 |     printf("openmp is not enabled, try to define enable_omp and add OMP "
24 |            "variable in Makefile\n");
25 |     abort();
26 | #endif
27 |   } else if (is_omp == 0 && is_mpi == 1) {
28 |     DEBUG(printf("This is mpistackdist\n");)
29 | #ifdef enable_mpi
30 |     parda_mpi_stackdist(inputFileName, lines, threads, argc, argv);
31 | #else
32 |     printf("mpi is not enabled, try to define enable_omp and add MPI variable "
33 |            "in Makefile\n");
34 |     abort();
35 | #endif
36 |   } else if (is_omp == 1 && is_mpi == 1) {
37 |     DEBUG(printf("This is hybrid stackdist\n");)
38 | #if defined(enable_omp) && defined(enable_mpi)
39 |     parda_hybrid_stackdist(inputFileName, lines, threads, argc, argv);
40 | #else
41 |     printf("hybridis not enabled, try to define enable_omp and enable_mpi and "
42 |            "add MPI and OMP variable in Makefile\n");
43 |     abort();
44 | #endif
45 |   }
46 |   return 0;
47 | }
48 | 


--------------------------------------------------------------------------------
/parda/makefile:
--------------------------------------------------------------------------------
 1 | #DEBUG = 1
 2 | #OMP = 1
 3 | #MPI = 0
 4 | 
 5 | BASE=g++
 6 | #ifeq (icc, $(findstring icc,$(shell mpicc -show)))
 7 | #BASE=icc
 8 | #endif
 9 | 
10 | #ifdef MPI
11 | #CC=mpicc
12 | #CFLAGS+=-Denable_mpi
13 | #else
14 | CC=$(BASE)
15 | #endif
16 | 
17 | CFLAGS += -Wall -std=c++11 -Wpointer-arith
18 | ifdef DEBUG
19 | CFLAGS+= -g -O0
20 | else
21 | CFLAGS+= -O3
22 | endif
23 | CFLAGS += $(shell pkg-config --cflags glib-2.0)
24 | LIBS    = $(shell pkg-config --libs glib-2.0 --libs gthread-2.0)
25 | OBJS+= main.o splay.o parda.o parda_print.o narray.o process_args.o seperate.o
26 | HEADERS= splay.h parda.h narray.h process_args.h seperate.h
27 | 
28 | ifdef OMP
29 | OBJS+= parda_omp.o
30 | HEADERS+= parda_omp.h
31 | CFLAGS+=-Denable_omp
32 | ifeq ($(BASE),icc)
33 | CFLAGS+=-openmp
34 | else
35 | CFLAGS+=-fopenmp
36 | endif
37 | endif
38 | 
39 | ifeq ($(CC),mpicc)
40 | OBJS+= parda_mpi.o
41 | HEADERS+= parda_mpi.h
42 | CFLAGS+= -Denable_mpi
43 | endif
44 | 
45 | ifeq ($(BASE),icc)
46 | CFLAGS+=-limf
47 | endif
48 | 
49 | SOURCES=$(subst .o,.c, $(OBJS) )
50 | EXE=parda.x
51 | .PHONY: all clean gnuplots run
52 | all: $(EXE)
53 | 
54 | $(EXE): $(OBJS)
55 | 	$(CC) $(CFLAGS) -o $@ $+ $(LIBS)
56 | 	cp -f parda.x ../ls
57 | $(OBJS):$(HEADERS) makefile
58 | %.d: %.c
59 | 	set -e; rm -f $@; \
60 | 	$(CC) -M $(CPPFLAGS) $< > $@.$$$$; \
61 |         sed 's,\($*\)\.o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \
62 |         rm -f $@.$$$$
# NOTE(review): make variable names are case-sensitive and only `SOURCES`
# (upper case) is defined in this makefile, so `$(sources:.c=.d)` expands to
# nothing and this `include` is a no-op; the %.d dependency rule is therefore
# never triggered. Switching to `$(SOURCES:.c=.d)` would activate it, but the
# rule passes only $(CPPFLAGS) to the compiler while the glib include flags
# live in $(CFLAGS), so that change needs to be tested before it is made.
include $(sources:.c=.d)
64 | clean:
65 | 	rm -f $(EXE) *.o 
66 | run:
67 | 


--------------------------------------------------------------------------------
/parda/narray.c:
--------------------------------------------------------------------------------
 1 | #include "narray.h"
 2 | 
 3 | narray_t *narray_new(unsigned element_size, unsigned capacity) {
 4 |   narray_t *na = (narray_t *)malloc(sizeof(narray_t));
 5 |   na->element_size = element_size;
 6 |   na->len = 0;
 7 |   na->capacity = capacity * element_size;
 8 |   na->data = calloc(capacity, element_size);
 9 |   return na;
10 | }
11 | 
12 | void narray_append_val(narray_t *na, const void *value) {
13 |   if (na->len == na->capacity) {
14 |     unsigned new_capacity = na->capacity + na->capacity + 10 * na->element_size;
15 |     void *ndata = calloc(new_capacity, 1);
16 |     memcpy(ndata, na->data, na->len);
17 |     free(na->data);
18 |     na->data = ndata;
19 |     na->capacity = new_capacity;
20 |   }
21 |   memcpy((char *)na->data + na->len, value, na->element_size);
22 |   na->len += na->element_size;
23 | }
24 | 
25 | void narray_free(narray_t *na) {
26 |   free(na->data);
27 |   free(na);
28 | }
29 | 
30 | void narray_print(narray_t *na, void (*show_element)(void *, int, FILE *),
31 |                   FILE *fp) {
32 |   mdebug(fprintf(fp, "enter narray_print len=%u\n", na->len);) unsigned len =
33 |       narray_get_len(na);
34 |   unsigned i;
35 |   for (i = 0; i < len; i++) {
36 |     show_element(na->data, i, fp);
37 |     mdebug(printf("%s ", ((HKEY *)ga->data)[i]);)
38 |   }
39 | }
40 | 
41 | narray_t *narray_heaparray_new(void *data, const unsigned len,
42 |                                const unsigned element_size) {
43 |   narray_t *na = (narray_t *)malloc(sizeof(narray_t));
44 |   na->data = data;
45 |   na->len = len;
46 |   na->capacity = len;
47 |   na->element_size = element_size;
48 |   return na;
49 | }
50 | 


--------------------------------------------------------------------------------
/parda/narray.h:
--------------------------------------------------------------------------------
 1 | #ifndef _NARRAY_H
 2 | #define _NARRAY_H
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string.h>
 7 | 
 8 | #ifdef enable_mdebugging
 9 | #define mdebug(cmd) cmd
10 | #else
11 | #define mdebug(cmd)
12 | #endif
13 | 
/* Growable array of fixed-size elements stored back to back in `data`. */
typedef struct narray_s {
  void *data; /* element storage */
  /* For arrays created by narray_new, `len` (bytes in use) and `capacity`
   * (bytes allocated) are byte counts: narray_append_val advances `len` by
   * `element_size`, and narray_get_len divides `len` by it.
   * `element_size` is the size of one element in bytes. */
  unsigned len, capacity, element_size;
} narray_t;
18 | 
19 | narray_t *narray_heaparray_new(void *data, unsigned len, unsigned element_size);
20 | narray_t *narray_new(unsigned element_size, unsigned capacity);
21 | void narray_append_val(narray_t *na, const void *value);
22 | void narray_free(narray_t *na);
23 | void narray_print(narray_t *na, void (*show_element)(void *, int, FILE *),
24 |                   FILE *fp);
25 | 
26 | static inline unsigned narray_get_len(const narray_t *na) {
27 |   return na->len / na->element_size;
28 | }
29 | 
30 | #endif
31 | 


--------------------------------------------------------------------------------
/parda/parda.h:
--------------------------------------------------------------------------------
  1 | #ifndef _PARDA_H
  2 | #define _PARDA_H
  3 | 
  4 | #include "narray.h"
  5 | #include "process_args.h"
  6 | #include "splay.h"
  7 | 
  8 | #include <assert.h>
  9 | #include <glib.h>
 10 | #include <libgen.h>
 11 | #include <stdio.h>
 12 | #include <stdlib.h>
 13 | #include <string.h>
 14 | #include <sys/time.h>
 15 | #include <unistd.h>
 16 | 
 17 | #ifdef enable_mpi
 18 | #ifdef enable_omp
 19 | #define enable_hybrid
 20 | #endif
 21 | #endif
 22 | 
 23 | #define enable_timing
 24 | #ifdef enable_timing
 25 | #define PTIME(cmd) cmd
 26 | #else
 27 | #define PTIME(cmd)
 28 | #endif
 29 | 
 30 | #ifdef enable_debugging
 31 | #define DEBUG(cmd) cmd
 32 | #else
 33 | #define DEBUG(cmd)
 34 | #endif
 35 | 
 36 | #ifdef enable_profiling
 37 | #define PROF(cmd) cmd
 38 | #else
 39 | #define PROF(cmd)
 40 | #endif
 41 | 
 42 | #define DEFAULT_NBUCKETS 1000000
 43 | #define B_OVFL nbuckets
 44 | #define B_INF nbuckets + 1
 45 | #define SLEN 20
 46 | 
 47 | extern int nbuckets;
 48 | #ifdef ENABLE_PROFILING
 49 | extern char pfile[30];
 50 | extern FILE *pid_fp;
 51 | #endif
 52 | 
 53 | typedef char HKEY[SLEN];
 54 | 
 55 | typedef struct end_keytime_s {
 56 |   narray_t *gkeys;
 57 |   narray_t *gtimes;
 58 | } end_keytime_t;
 59 | 
 60 | typedef struct processor_info_s {
 61 |   int pid, psize;
 62 |   long tstart, tlen, tend, sum;
 63 | } processor_info_t;
 64 | 
 65 | typedef struct program_data_s {
 66 |   GHashTable *gh;
 67 |   narray_t *ga;
 68 |   end_keytime_t ekt;
 69 |   Tree *root;
 70 |   unsigned int *histogram;
 71 | } program_data_t;
 72 | 
 73 | void classical_tree_based_stackdist(char *inputFileName, long lines);
 74 | 
 75 | gboolean compare_strings(gconstpointer a, gconstpointer b);
 76 | void iterator(gpointer key, gpointer value, gpointer ekt);
 77 | 
 78 | program_data_t parda_init(void);
 79 | void parda_input_with_filename(char *inFileName, program_data_t *pdt,
 80 |                                long begin, long end);
 81 | void parda_input_with_textfilepointer(FILE *fp, program_data_t *pdt, long begin,
 82 |                                       long end);
 83 | void parda_input_with_binaryfilepointer(FILE *fp, program_data_t *pdt,
 84 |                                         long begin, long end);
 85 | void parda_free(program_data_t *pdt);
 86 | end_keytime_t parda_generate_end(const program_data_t *pdt);
 87 | processor_info_t parda_get_processor_info(int pid, int psize, long sum);
 88 | void parda_get_abfront(program_data_t *pdt_a, const narray_t *gb,
 89 |                        const processor_info_t *pit_a);
 90 | int parda_get_abend(program_data_t *pdt_b, const end_keytime_t *ekt_a);
 91 | program_data_t parda_merge(program_data_t *pdt_a, program_data_t *pdt_b,
 92 |                            const processor_info_t *pit_b);
 93 | 
 94 | void parda_print_front(const program_data_t *pdt);
 95 | void parda_print_end(const end_keytime_t *ekt);
 96 | void parda_print_tree(const program_data_t *pdt);
 97 | void parda_print_hash(const program_data_t *pdt);
 98 | void parda_print(const program_data_t *pdt);
 99 | void print_iterator(gpointer key, gpointer value, gpointer ekt);
100 | void parda_print_histogram(const unsigned *histogram);
101 | void parda_fprintf_histogram(const unsigned *histogram, FILE *file);
102 | float parda_fprintf_histogram_r(const unsigned *histogram, FILE *file,
103 |                                 bool print);
104 | 
105 | int parda_findopt(char *option, char **value, int *argc, char ***argv);
106 | void parda_process(char *input, T tim, program_data_t *pdt);
107 | 
108 | void show_hkey(void *data, int i, FILE *fp);
109 | void show_T(void *data, int i, FILE *fp);
110 | 
111 | double rtclock(void);
112 | 
113 | static inline T parda_low(int pid, int psize, T sum) {
114 |   return (((long long)(pid)) * (sum) / (psize));
115 | }
116 | 
117 | static inline T parda_high(int pid, int psize, T sum) {
118 |   return parda_low(pid + 1, psize, sum) - 1;
119 | }
120 | 
121 | static inline T parda_size(int pid, int psize, T sum) {
122 |   return (parda_low(pid + 1, psize, sum)) - (parda_low(pid, psize, sum));
123 | }
124 | 
125 | static inline T parda_owner(T index, int psize, T sum) {
126 |   return (((long long)psize) * (index + 1) - 1) / sum;
127 | }
128 | 
/*
 * Build the per-worker trace file name "<psize>_<filename>_p<pid>.txt".
 *
 * The result is heap-allocated and sized to fit, so arbitrarily long
 * `filename` values are safe. The caller owns the string and releases it
 * with free(). Returns NULL if formatting or allocation fails.
 */
static inline char *parda_generate_pfilename(char filename[], int pid,
                                             int psize) {
  /* First pass only measures the formatted length. */
  const int needed = snprintf(NULL, 0, "%d_%s_p%d.txt", psize, filename, pid);
  if (needed < 0)
    return NULL;

  char *pfilename = (char *)malloc((size_t)needed + 1);
  if (pfilename == NULL)
    return NULL;

  snprintf(pfilename, (size_t)needed + 1, "%d_%s_p%d.txt", psize, filename,
           pid);
  return pfilename;
}
135 | 
136 | static inline void process_one_access(char *input, program_data_t *pdt,
137 |                                       const long tim) {
138 |   int distance;
139 |   int *lookup;
140 |   lookup = (T *)g_hash_table_lookup(pdt->gh, input);
141 | 
142 |   if (lookup == NULL) {
143 |     char *data = strdup(input);
144 |     pdt->root = insert(tim, pdt->root);
145 |     long *p_data;
146 |     narray_append_val(pdt->ga, input);
147 |     if (!(p_data = (long *)malloc(sizeof(long)))) {
148 |       printf("no memory for p_data\n");
149 |       assert(0);
150 |       exit(-1);
151 |     }
152 |     *p_data = tim;
153 |     g_hash_table_insert(pdt->gh, data, p_data);
154 |   }
155 | 
156 |   else {
157 |     char *data = strdup(input);
158 |     pdt->root = insert((*lookup), pdt->root);
159 |     distance = node_size(pdt->root->right);
160 |     pdt->root = delete_(*lookup, pdt->root);
161 |     pdt->root = insert(tim, pdt->root);
162 |     int *p_data;
163 |     if (!(p_data = (int *)malloc(sizeof(int)))) {
164 |       printf("no memory for p_data\n");
165 |       assert(0);
166 |       exit(-1);
167 |     }
168 |     *p_data = tim;
169 |     g_hash_table_replace(pdt->gh, data, p_data);
170 | 
171 |     if (distance > nbuckets)
172 |       pdt->histogram[B_OVFL] += 1;
173 |     else
174 |       pdt->histogram[distance] += 1;
175 |   }
176 | }
177 | 
178 | static inline int process_one_access_and_get_distance(char *input,
179 |                                                       program_data_t *pdt,
180 |                                                       const long tim) {
181 |   int distance;
182 |   int *lookup;
183 |   lookup = (T *)g_hash_table_lookup(pdt->gh, input);
184 | 
185 |   if (lookup == NULL) {
186 |     char *data = strdup(input);
187 |     pdt->root = insert(tim, pdt->root);
188 |     long *p_data;
189 |     narray_append_val(pdt->ga, input);
190 |     if (!(p_data = (long *)malloc(sizeof(long)))) {
191 |       printf("no memory for p_data\n");
192 |       assert(0);
193 |       exit(-1);
194 |     }
195 |     *p_data = tim;
196 |     g_hash_table_insert(pdt->gh, data, p_data);
197 | 
198 |     return B_INF;
199 |   }
200 | 
201 |   else {
202 |     char *data = strdup(input);
203 |     pdt->root = insert((*lookup), pdt->root);
204 |     distance = node_size(pdt->root->right);
205 |     pdt->root = delete_(*lookup, pdt->root);
206 |     pdt->root = insert(tim, pdt->root);
207 |     int *p_data;
208 |     if (!(p_data = (int *)malloc(sizeof(int)))) {
209 |       printf("no memory for p_data\n");
210 |       assert(0);
211 |       exit(-1);
212 |     }
213 |     *p_data = tim;
214 |     g_hash_table_replace(pdt->gh, data, p_data);
215 | 
216 |     if (distance > nbuckets)
217 |       pdt->histogram[B_OVFL] += 1;
218 |     else
219 |       pdt->histogram[distance] += 1;
220 | 
221 |     if (distance > nbuckets)
222 |       return B_OVFL;
223 |     else
224 |       return distance;
225 |   }
226 | }
227 | #endif
228 | 


--------------------------------------------------------------------------------
/parda/parda_mpi.c:
--------------------------------------------------------------------------------
  1 | #include "parda.h"
  2 | #ifdef enable_omp
  3 | #include "parda_omp.h"
  4 | #endif
  5 | #include "parda_mpi.h"
  6 | 
  7 | narray_t *parda_recv_array(int source, int *tag, unsigned element_size) {
  8 |   narray_t *ga;
  9 |   MPI_Status status;
 10 |   unsigned blen;
 11 |   void *bdata;
 12 |   MPI_Recv(&blen, 1, MPI_UNSIGNED, source, (*tag)++, MPI_COMM_WORLD, &status);
 13 |   bdata = (char *)calloc(blen, 1);
 14 |   MPI_Recv(bdata, blen, MPI_CHAR, source, (*tag)++, MPI_COMM_WORLD, &status);
 15 |   ga = narray_heaparray_new(bdata, blen, element_size);
 16 |   return ga;
 17 | }
 18 | 
 19 | void parda_send_array(narray_t *ga, int dest, int *tag) {
 20 |   MPI_Send(&ga->len, 1, MPI_UNSIGNED, dest, (*tag)++, MPI_COMM_WORLD);
 21 |   MPI_Send(ga->data, ga->len, MPI_CHAR, dest, (*tag)++, MPI_COMM_WORLD);
 22 | }
 23 | 
 24 | unsigned *parda_mpi_merge(program_data_t *pdt, processor_info_t *pit) {
 25 |   int i, len;
 26 |   int psize = pit->psize;
 27 |   int pid = pit->pid;
 28 |   int var, tag = 1;
 29 |   for (var = pid, len = 1; var % 2 == 1; var = (var >> 1), len = (len << 1)) {
 30 |     end_keytime_t ekt_a;
 31 |     int dest = pid - len;
 32 |     parda_send_array(pdt->ga, dest, &tag);
 33 |     ekt_a.gkeys = parda_recv_array(dest, &tag, sizeof(HKEY));
 34 |     ekt_a.gtimes = parda_recv_array(dest, &tag, sizeof(T));
 35 |     parda_get_abend(pdt, &ekt_a);
 36 |     narray_t *mga = parda_recv_array(dest, &tag, sizeof(HKEY));
 37 |     narray_free(pdt->ga);
 38 |     pdt->ga = mga;
 39 |   }
 40 |   if (pid + len < psize) {
 41 |     int source = pid + len;
 42 |     pdt->ekt = parda_generate_end(pdt);
 43 |     narray_t *gb = parda_recv_array(source, &tag, sizeof(HKEY));
 44 |     parda_send_array(pdt->ekt.gkeys, source, &tag);
 45 |     parda_send_array(pdt->ekt.gtimes, source, &tag);
 46 |     parda_get_abfront(pdt, gb, pit);
 47 |     parda_send_array(pdt->ga, source, &tag);
 48 |     narray_free(pdt->ekt.gkeys);
 49 |     narray_free(pdt->ekt.gtimes);
 50 |   } else if (pid == psize - 1) {
 51 |     pdt->histogram[B_INF] += narray_get_len(pdt->ga);
 52 |   }
 53 |   unsigned *global_his = (unsigned *)malloc(sizeof(unsigned) * (nbuckets + 2));
 54 | 
 55 |   for (i = 0; i < nbuckets + 2; i++) {
 56 |     global_his[i] = 0;
 57 |   }
 58 |   MPI_Reduce(pdt->histogram, global_his, nbuckets + 2, MPI_UNSIGNED, MPI_SUM, 0,
 59 |              MPI_COMM_WORLD);
 60 |   return global_his;
 61 | }
 62 | int parda_MPI_IO_binary_input(program_data_t *pdt, char filename[],
 63 |                               const processor_info_t *pit) {
 64 |   MPI_File thefile;
 65 |   MPI_Status status;
 66 |   MPI_File_open(MPI_COMM_WORLD, filename, MPI_MODE_RDONLY, MPI_INFO_NULL,
 67 |                 &thefile);
 68 |   MPI_File_set_view(thefile, pit->tstart * sizeof(void *), MPI_LONG, MPI_LONG,
 69 |                     "native", MPI_INFO_NULL);
 70 | #ifdef ENABLE_PROFILING
 71 |   double t3, t4;
 72 |   MPI_Barrier(MPI_COMM_WORLD);
 73 |   t3 = MPI_Wtime();
 74 | #endif
 75 |   GHashTable *gh = pdt->gh;
 76 |   Tree *root = pdt->root;
 77 |   narray_t *ga = pdt->ga;
 78 |   int bufsize = 10000;
 79 |   void **buf = (void **)malloc(bufsize * sizeof(void *));
 80 |   unsigned int *histogram = pdt->histogram;
 81 |   HKEY input;
 82 |   long tim, begin;
 83 |   int count, i;
 84 |   for (tim = begin = pit->tstart; begin <= pit->tend; begin += count) {
 85 |     MPI_File_read(thefile, buf, bufsize, MPI_LONG, &status);
 86 |     MPI_Get_count(&status, MPI_LONG, &count);
 87 |     if (begin + count > pit->tend + 1) {
 88 |       count = pit->tend + 1 - begin;
 89 |     }
 90 |     for (i = 0; i < count; i++) {
 91 |       sprintf(input, "%p", buf[i]);
 92 |       int distance;
 93 |       T *lookup;
 94 |       lookup = g_hash_table_lookup(gh, input);
 95 | 
 96 |       if (lookup == NULL) {
 97 |         char *data = strdup(input);
 98 |         root = insert(tim, root);
 99 |         T *p_data;
100 | 
101 |         narray_append_val(ga, input);
102 |         if (!(p_data = (T *)malloc(sizeof(T))))
103 |           return -1;
104 |         *p_data = tim;
105 |         g_hash_table_insert(gh, data, p_data);
106 |       }
107 | 
108 |       else {
109 |         root = insert((*lookup), root);
110 |         distance = node_size(root->right);
111 |         root = delete_(*lookup, root);
112 |         root = insert(tim, root);
113 |         int *p_data;
114 |         if (!(p_data = (int *)malloc(sizeof(int))))
115 |           return -1;
116 |         *p_data = tim;
117 |         g_hash_table_replace(gh, strdup(input), p_data);
118 | 
119 |         if (distance > nbuckets)
120 |           histogram[B_OVFL]++;
121 |         else
122 |           histogram[distance]++;
123 |       }
124 |       tim++;
125 |     }
126 |   }
127 |   printf("start from %ld to %ld\n", pit->tstart, tim);
128 | #ifdef ENABLE_PROFILING
129 |   t4 = MPI_Wtime();
130 |   int pid = pit->pid;
131 |   fprintf(pid_fp,
132 |           "parda input time with barrier = %.3lf sec for processor %d; \n",
133 |           t4 - t3, pid);
134 | #endif
135 |   pdt->root = root;
136 |   return 1;
137 | }
138 | 
139 | void parda_mpi_stackdist(char *inputFileName, long lines, int processors,
140 |                          int argc, char **argv) {
141 |   int pid, psize;
142 |   program_data_t pdt;
143 |   long psum;
144 |   processor_info_t pit;
145 |   MPI_Init(&argc, &argv);
146 |   process_args(argc, argv);
147 |   MPI_Comm_rank(MPI_COMM_WORLD, &pid);
148 |   MPI_Comm_size(MPI_COMM_WORLD, &psize);
149 |   psum = lines;
150 |   MPI_Bcast(&psum, 1, MPI_INT, 0, MPI_COMM_WORLD);
151 | #ifdef enable_timing
152 |   double ts, te, t_init, t_input, t_print, t_free;
153 |   ts = MPI_Wtime();
154 | #endif
155 |   pit = parda_get_processor_info(pid, psize, psum);
156 |   pdt = parda_init();
157 |   PTIME(MPI_Barrier(MPI_COMM_WORLD);)
158 |   PTIME(te = MPI_Wtime();)
159 |   PTIME(t_init = te - ts;)
160 |   parda_input_with_filename(parda_generate_pfilename(inputFileName, pid, psize),
161 |                             &pdt, pit.tstart, pit.tend);
162 |   unsigned *global_his = parda_mpi_merge(&pdt, &pit);
163 |   PTIME(MPI_Barrier(MPI_COMM_WORLD);)
164 |   PTIME(ts = MPI_Wtime();)
165 |   PTIME(t_input = ts - te;)
166 |   if (pid == 0) {
167 |     parda_print_histogram(global_his);
168 |   }
169 |   PTIME(te = MPI_Wtime();)
170 |   PTIME(t_print = te - ts;)
171 |   parda_free(&pdt);
172 |   free(global_his);
173 |   PTIME(ts = MPI_Wtime();)
174 |   PTIME(t_free = ts - te;)
175 | #ifdef enable_timing
176 |   if (pid == 0) {
177 |     printf("mpi\n");
178 |     printf("init time is %lf\n", t_init);
179 |     printf("input time is %lf\n", t_input);
180 |     printf("print time is %lf\n", t_print);
181 |     printf("free time is %lf\n", t_free);
182 |   }
183 | #endif
184 |   MPI_Finalize();
185 | }
186 | 
#if defined(enable_omp) && defined(enable_mpi)
/*
 * Hybrid MPI + OpenMP driver: each rank processes its slice of the trace
 * with `threads` OpenMP workers, the ranks are then merged and rank 0 prints
 * the global histogram.
 *
 * `lines` is the total trace length; rank 0's value is broadcast to all
 * ranks. `processors` is not used by this function.
 */
void parda_hybrid_stackdist(char *inputFileName, long lines, int processors,
                            int argc, char **argv) {
  int pid, psize;
  program_data_t pdt;
  long psum;
  processor_info_t pit;
  MPI_Init(&argc, &argv);
  process_args(argc, argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &pid);
  MPI_Comm_size(MPI_COMM_WORLD, &psize);
  DEBUG(if (pid == 0))
  DEBUG(printf("enter hybrid\n");)

  psum = lines;
  /* psum is a long, so it must be broadcast as MPI_LONG. */
  MPI_Bcast(&psum, 1, MPI_LONG, 0, MPI_COMM_WORLD);
  pit = parda_get_processor_info(pid, psize, psum);
  program_data_t *pdt_a = parda_omp_init(threads);

  pdt = parda_omp_input(inputFileName, pdt_a, pit.tstart, pit.tend, pid, psize);
  parda_omp_free(pdt_a, threads);

  unsigned *global_his = parda_mpi_merge(&pdt, &pit);

  if (pid == 0) {
    parda_print_histogram(global_his);
  }
  parda_free(&pdt);
  free(global_his);
  MPI_Finalize();
}
#endif
219 | 


--------------------------------------------------------------------------------
/parda/parda_mpi.h:
--------------------------------------------------------------------------------
 1 | #ifndef _PARDA_MPI_H
 2 | #define _PARDA_MPI_H
 3 | 
 4 | #include "parda.h"
 5 | #include <mpi.h>
 6 | 
 7 | narray_t *parda_recv_array(int source, int *tag, unsigned element_size);
 8 | void parda_send_array(narray_t *ga, int dest, int *tag);
 9 | unsigned *parda_mpi_merge(program_data_t *pdt, processor_info_t *pit);
10 | void parda_mpi_free(program_data_t *pdt, unsigned *global_his);
11 | int parda_MPI_IO_binary_input(program_data_t *pdt, char filename[],
12 |                               const processor_info_t *pit);
13 | void parda_mpi_stackdist(char *inputFileName, long lines, int processors,
14 |                          int argc, char **argv);
15 | #if defined(enable_omp) && defined(enable_mpi)
16 | void parda_hybrid_stackdist(char *inputFileName, long lines, int processors,
17 |                             int argc, char **argv);
18 | #endif
19 | #endif
20 | 


--------------------------------------------------------------------------------
/parda/parda_omp.c:
--------------------------------------------------------------------------------
  1 | #include "parda_omp.h"
  2 | 
  3 | processor_info_t parda_get_thread_info(long lines, long begin, int pid,
  4 |                                        int psize) {
  5 |   processor_info_t pit_c;
  6 |   pit_c.pid = pid;
  7 |   pit_c.psize = psize;
  8 |   pit_c.sum = lines;
  9 |   pit_c.tstart = parda_low(pit_c.pid, pit_c.psize, lines);
 10 |   pit_c.tstart += begin;
 11 |   pit_c.tend = parda_high(pit_c.pid, pit_c.psize, lines);
 12 |   pit_c.tend += begin;
 13 |   pit_c.tlen = parda_size(pit_c.pid, pit_c.psize, lines);
 14 |   return pit_c;
 15 | }
 16 | 
 17 | program_data_t *parda_omp_init(int nthreads) {
 18 |   g_thread_init(NULL);
 19 |   program_data_t *pdt_a =
 20 |       (program_data_t *)malloc(nthreads * sizeof(program_data_t));
 21 |   int i;
 22 |   for (i = 0; i < nthreads; i++)
 23 |     pdt_a[i] = parda_init();
 24 |   omp_set_num_threads(nthreads);
 25 |   return pdt_a;
 26 | }
 27 | 
 28 | void parda_omp_openfile(char inputFileName[], const int pid, const int nthreads,
 29 |                         const int psize, FILE *fpa[]) {
 30 |   int i;
 31 |   for (i = 0; i < nthreads; i++) {
 32 |     char *pfilename = parda_generate_pfilename(
 33 |         inputFileName, pid * nthreads + i, psize * nthreads);
 34 |     fpa[i] = fopen(pfilename, "r");
 35 |   }
 36 | }
 37 | 
 38 | program_data_t parda_omp_input_with_filename(char inputFileName[],
 39 |                                              program_data_t *pdt_a, long begin,
 40 |                                              long end, int pid, int psize) {
 41 |   int nthreads = threads;
 42 |   long lines = end + 1 - begin;
 43 |   FILE *fpa[8];
 44 |   parda_omp_openfile(inputFileName, pid, nthreads, psize, fpa);
 45 | #pragma omp parallel default(none)                                             \
 46 |     firstprivate(begin, pid, psize, nthreads, lines, is_binary)                \
 47 |         shared(pdt_a, fpa)
 48 |   {
 49 |     int i = omp_get_thread_num();
 50 |     FILE *fp = fpa[i];
 51 |     processor_info_t pit = parda_get_thread_info(lines, begin, i, nthreads);
 52 |     program_data_t pdt_c = pdt_a[i];
 53 |     if (!is_binary) {
 54 |       parda_input_with_textfilepointer(fp, &pdt_c, pit.tstart, pit.tend);
 55 |     } else {
 56 |       parda_input_with_binaryfilepointer(fp, &pdt_c, pit.tstart, pit.tend);
 57 |     }
 58 |     pdt_a[i] = pdt_c;
 59 |     int tid = i;
 60 |     int var, len;
 61 |     int mlen = nthreads >> 1;
 62 |     for (var = tid, len = 1; len <= mlen; len = (len << 1)) {
 63 |       if (var & 1) {
 64 |         program_data_t pdt_A = pdt_a[tid - len];
 65 |         program_data_t pdt_B = pdt_a[tid];
 66 |         pdt_a[tid] = parda_merge(&pdt_A, &pdt_B, &pit);
 67 |         var >>= 1;
 68 |       }
 69 | #pragma omp barrier
 70 |     }
 71 |   }
 72 |   program_data_t pdt_c = pdt_a[nthreads - 1];
 73 |   return pdt_c;
 74 | }
 75 | 
 76 | program_data_t parda_omp_input(char inputFileName[], program_data_t *pdt_a,
 77 |                                long begin, long end, int pid, int psize) {
 78 |   int nthreads = threads;
 79 | 
 80 |   long lines = end + 1 - begin;
 81 |   processor_info_t pit_a[8];
 82 |   int syn[8 << 6];
 83 |   memset(syn, 0, sizeof(syn));
 84 |   int i;
 85 | #pragma omp parallel default(none) private(i)                                  \
 86 |     firstprivate(begin, pid, psize, nthreads, lines)                           \
 87 |         shared(pdt_a, pit_a, syn, inputFileName)
 88 |   {
 89 |     DEBUG(printf("enter parallel for\n");)
 90 |     __sync_synchronize();
 91 | #pragma omp for
 92 |     for (i = 0; i < nthreads; i++) {
 93 |       printf("i=%d executed by thread=%d\n", i, omp_get_thread_num());
 94 |       pit_a[i] = parda_get_thread_info(lines, begin, i, nthreads);
 95 |       parda_input_with_filename(parda_generate_pfilename(inputFileName,
 96 |                                                          pid * nthreads + i,
 97 |                                                          psize * nthreads),
 98 |                                 &pdt_a[i], pit_a[i].tstart, pit_a[i].tend);
 99 | #ifdef enable_debugging
100 |       printf("after input in for\n");
101 | #endif
102 |       int tid = i;
103 |       int var, len;
104 |       for (var = tid, len = 1; var % 2 == 1;
105 |            var = (var >> 1), len = (len << 1)) {
106 |         DEBUG(printf("before while in for %d and %d\n", tid - len, tid);)
107 |         while (syn[(tid - len) << 8] == 0) {
108 | #pragma omp flush(syn)
109 |         }
110 |         DEBUG(printf("after while in for and will merge %d and %d\n", tid - len,
111 |                      tid);)
112 |         pdt_a[tid] = parda_merge(&pdt_a[tid - len], &pdt_a[tid], &pit_a[tid]);
113 |         DEBUG(printf("after merged %d and %d\n", tid - len, tid);)
114 |       }
115 |       syn[tid << 8]++;
116 | #pragma omp flush(syn)
117 |     }
118 |   }
119 |   program_data_t pdt = pdt_a[nthreads - 1];
120 |   return pdt;
121 | }
122 | 
123 | void parda_omp_free(program_data_t *pdt_a, int psize) {
124 |   int i;
125 | #pragma omp parallel private(i) shared(pdt_a, psize)
126 |   {
127 | #pragma omp for
128 |     for (i = 0; i < psize - 1; i++) {
129 |       g_hash_table_destroy(pdt_a[i].gh);
130 |     }
131 |   }
132 |   free(pdt_a);
133 | }
134 | 
135 | void parda_omp_stackdist(char *inputFileName, long lines, int threads) {
136 | #ifdef enable_timing
137 |   double ts, te, t_init, t_input, t_print, t_free;
138 |   ts = rtclock();
139 | #endif
140 |   program_data_t *pdt_a = parda_omp_init(threads);
141 |   PTIME(te = rtclock();)
142 |   PTIME(t_init = te - ts;)
143 |   DEBUG(printf("after omp init\n");)
144 |   program_data_t pdt_c =
145 |       parda_omp_input_with_filename(inputFileName, pdt_a, 0, lines - 1, 0, 1);
146 | 
147 |   DEBUG(printf("after omp input\n");)
148 |   program_data_t *pdt = &pdt_c;
149 |   pdt->histogram[B_INF] += narray_get_len(pdt->ga);
150 |   PTIME(ts = rtclock();)
151 |   PTIME(t_input = ts - te;)
152 |   parda_print_histogram(pdt->histogram);
153 |   PTIME(te = rtclock();)
154 |   PTIME(t_print = te - ts;)
155 |   parda_omp_free(pdt_a, threads);
156 |   parda_free(pdt);
157 |   PTIME(ts = rtclock();)
158 |   PTIME(t_free = ts - te;)
159 | #ifdef enable_timing
160 |   printf("omp\n");
161 |   printf("init time is %lf\n", t_init);
162 |   printf("input time is %lf\n", t_input);
163 |   printf("print time is %lf\n", t_print);
164 |   printf("free time is %lf\n", t_free);
165 | #endif
166 | }
167 | 


--------------------------------------------------------------------------------
/parda/parda_omp.h:
--------------------------------------------------------------------------------
 1 | #ifndef _PARDA_OMP_H
 2 | #define _PARDA_OMP_H
 3 | 
 4 | #include "parda.h"
 5 | #include <omp.h>
 6 | 
 7 | processor_info_t parda_get_thread_info(long lines, long begin, int pid,
 8 |                                        int psize);
 9 | program_data_t *parda_omp_init(int psize);
10 | program_data_t parda_omp_input(char inputFileName[], program_data_t *pdt_a,
11 |                                long begin, long end, int pid, int psize);
12 | void parda_omp_free(program_data_t *pdt_a, int psize);
13 | void parda_omp_stackdist(char *inputFileName, long lines, int threads);
14 | #endif
15 | 


--------------------------------------------------------------------------------
/parda/parda_print.c:
--------------------------------------------------------------------------------
  1 | #include "parda.h"
  2 | 
  3 | void parda_print_front(const program_data_t *pdt) {
  4 |   narray_t *ga = pdt->ga;
  5 |   unsigned i;
  6 |   unsigned len = narray_get_len(ga);
  7 |   printf("< ");
  8 |   for (i = 0; i < len; i++) {
  9 |     printf("%s ", ((HKEY *)ga->data)[i]);
 10 |   }
 11 |   printf(">\n");
 12 | }
 13 | 
 14 | void parda_print_end(const end_keytime_t *ekt) {
 15 |   narray_t *gkeys = ekt->gkeys;
 16 |   narray_t *gtimes = ekt->gtimes;
 17 | 
 18 |   unsigned len = narray_get_len(gkeys);
 19 | 
 20 |   unsigned i;
 21 |   printf("[ ");
 22 |   for (i = 0; i < len; i++) {
 23 |     printf("(%s:%d) ", ((HKEY *)(gkeys->data))[i], ((T *)(gtimes->data))[i]);
 24 |   }
 25 |   printf("]\n");
 26 | }
 27 | 
 28 | void parda_print_tree(const program_data_t *pdt) {
 29 |   Tree *root = pdt->root;
 30 |   printtree(root, 0);
 31 | }
 32 | 
 33 | void print_iterator(gpointer key, gpointer value, gpointer ekt) {
 34 |   printf("(%s:%d) ", (char *)key, *(T *)value);
 35 | }
 36 | 
 37 | void parda_print_hash(const program_data_t *pdt) {
 38 |   printf("[ ");
 39 |   g_hash_table_foreach(pdt->gh, (GHFunc)print_iterator, NULL);
 40 |   printf("]\n");
 41 | }
 42 | 
 43 | void parda_print(const program_data_t *pdt) {
 44 |   parda_print_front(pdt);
 45 |   parda_print_tree(pdt);
 46 |   parda_print_hash(pdt);
 47 | }
 48 | 
 49 | void parda_print_histogram(const unsigned *histogram) {
 50 |   int last_bucket;
 51 |   int i;
 52 |   unsigned long long sum = 0;
 53 |   unsigned long long cum = 0;
 54 | 
 55 |   last_bucket = nbuckets - 1;
 56 |   while (histogram[last_bucket] == 0)
 57 |     last_bucket--;
 58 | 
 59 |   for (i = 0; i <= last_bucket; i++)
 60 |     sum += histogram[i];
 61 |   sum += histogram[B_OVFL];
 62 |   sum += histogram[B_INF];
 63 | 
 64 |   printf("# Dist\t     Refs\t   Refs(%%)\t  Cum_Ref\tCum_Ref(%%)\n");
 65 | 
 66 |   for (i = 0; i <= last_bucket; i++) {
 67 |     cum += histogram[i];
 68 |     if (histogram[i])
 69 |       printf("%6d\t%9u\t%0.8lf\t%9llu\t%0.8lf\n", i, histogram[i],
 70 |              histogram[i] / (double)sum, cum, cum / (double)sum);
 71 |   }
 72 | 
 73 |   cum += histogram[B_OVFL];
 74 |   printf("#OVFL \t%9u\t%0.8f\t%9llu\t%0.8lf\n", histogram[B_OVFL],
 75 |          histogram[B_OVFL] / (double)sum, cum, cum / (double)sum);
 76 |   cum += histogram[B_INF];
 77 |   printf("#INF  \t%9u\t%0.8f\t%9llu\t%0.8lf\n", histogram[B_INF],
 78 |          histogram[B_INF] / (double)sum, cum, cum / (double)sum);
 79 | }
 80 | 
 81 | void parda_fprintf_histogram(const unsigned *histogram, FILE *file) {
 82 |   int last_bucket;
 83 |   int i;
 84 |   unsigned long long sum = 0;
 85 |   unsigned long long cum = 0;
 86 | 
 87 |   last_bucket = nbuckets - 1;
 88 |   while (histogram[last_bucket] == 0)
 89 |     last_bucket--;
 90 | 
 91 |   for (i = 0; i <= last_bucket; i++)
 92 |     sum += histogram[i];
 93 |   sum += histogram[B_OVFL];
 94 |   sum += histogram[B_INF];
 95 | 
 96 |   fprintf(file, "# Dist\t     Refs\t   Refs(%%)\t  Cum_Ref\tCum_Ref(%%)\n");
 97 | 
 98 |   for (i = 0; i <= last_bucket; i++) {
 99 |     cum += histogram[i];
100 |     if (histogram[i])
101 |       fprintf(file, "%6d\t%9u\t%0.8lf\t%9llu\t%0.8lf\n", i, histogram[i],
102 |               histogram[i] / (double)sum, cum, cum / (double)sum);
103 |   }
104 | 
105 |   cum += histogram[B_OVFL];
106 |   fprintf(file, "#OVFL \t%9u\t%0.8f\t%9llu\t%0.8lf\n", histogram[B_OVFL],
107 |           histogram[B_OVFL] / (double)sum, cum, cum / (double)sum);
108 |   cum += histogram[B_INF];
109 |   fprintf(file, "#INF  \t%9u\t%0.8f\t%9llu\t%0.8lf\n", histogram[B_INF],
110 |           histogram[B_INF] / (double)sum, cum, cum / (double)sum);
111 | }
112 | 
113 | float parda_fprintf_histogram_r(const unsigned *histogram, FILE *file,
114 |                                 bool print = true) {
115 |   int last_bucket;
116 |   int i;
117 |   unsigned long long sum = 0;
118 |   unsigned long long cum = 0;
119 | 
120 |   last_bucket = nbuckets - 1;
121 |   while (histogram[last_bucket] == 0)
122 |     last_bucket--;
123 | 
124 |   for (i = 0; i <= last_bucket; i++)
125 |     sum += histogram[i];
126 |   sum += histogram[B_OVFL];
127 |   sum += histogram[B_INF];
128 | 
129 |   if (print)
130 |     fprintf(file, "# Dist\t     Refs\t   Refs(%%)\t  Cum_Ref\tCum_Ref(%%)\n");
131 | 
132 |   for (i = 0; i <= last_bucket; i++) {
133 |     cum += histogram[i];
134 |     if (histogram[i] && print)
135 |       fprintf(file, "%6d\t%9u\t%0.8lf\t%9llu\t%0.8lf\n", i, histogram[i],
136 |               histogram[i] / (double)sum, cum, cum / (double)sum);
137 |   }
138 | 
139 |   cum += histogram[B_OVFL];
140 |   if (print)
141 |     fprintf(file, "#OVFL \t%9u\t%0.8f\t%9llu\t%0.8lf\n", histogram[B_OVFL],
142 |             histogram[B_OVFL] / (double)sum, cum, cum / (double)sum);
143 | 
144 |   float hit_rate = cum / (double)sum;
145 | 
146 |   cum += histogram[B_INF];
147 |   if (print)
148 |     fprintf(file, "#INF  \t%9u\t%0.8f\t%9llu\t%0.8lf\n", histogram[B_INF],
149 |             histogram[B_INF] / (double)sum, cum, cum / (double)sum);
150 | 
151 |   return hit_rate;
152 | }


--------------------------------------------------------------------------------
/parda/process_args.c:
--------------------------------------------------------------------------------
  1 | #include "process_args.h"
  2 | 
  3 | int is_omp = 0;
  4 | int is_mpi = 0;
  5 | int is_seperate = 0;
  6 | int is_binary = 0;
  7 | int threads = 1;
  8 | char inputFileName[200] = "d.in";
  9 | long lines = -1;
 10 | int buffersize = 10000;
 11 | 
 12 | /*
 13 |  * Parse the parda command line into the file-scope option globals
 14 |  * (is_omp, is_mpi, is_seperate, is_binary, inputFileName, lines, threads).
 15 |  *
 16 |  * is_omp, is_mpi and is_binary are reset to 0 before parsing. Recognised
 17 |  * options: --enable-omp/-o, --enable-mpi/-m, --enable-seperate,
 18 |  * --fileformat=binary|text, --input=FILE, --lines=N, --threads=N, --help.
 19 |  * --help prints usage examples and exits with status 0. The process is
 20 |  * aborted on a bad --fileformat value, on an --input name that does not fit
 21 |  * in inputFileName, on any option character without a case below (the
 22 |  * optstring accepts "-b ARG" but nothing handles it), and when --lines was
 23 |  * not given.
 24 |  *
 25 |  * Returns 0 on success.
 26 |  */
 27 | int process_args(int argc, char **argv) {
 28 |   int c;
 29 |   is_omp = is_mpi = is_binary = 0;
 30 |   while (1) {
 31 |     static struct option long_options[] = {
 32 | 
 33 |         {"enable-omp", no_argument, 0, 'o'},
 34 |         {"enable-mpi", no_argument, 0, 'm'},
 35 | 
 36 |         {"fileformat", required_argument, 0, 'f'},
 37 |         {"input", required_argument, 0, 'i'},
 38 |         {"lines", required_argument, 0, 'l'},
 39 |         {"threads", required_argument, 0, 't'},
 40 |         {"enable-seperate", no_argument, 0, 's'},
 41 |         {"help", no_argument, 0, 'h'},
 42 |         {0, 0, 0, 0}};
 43 | 
 44 |     int option_index = 0;
 45 | 
 46 |     c = getopt_long(argc, argv, "omb:", long_options, &option_index);
 47 | 
 48 |     if (c == -1)
 49 |       break;
 50 | 
 51 |     switch (c) {
 52 |     case 0:
 53 |       /* Only reached for long options that store through a flag pointer. */
 54 |       if (long_options[option_index].flag != 0)
 55 |         break;
 56 |       printf("option %s", long_options[option_index].name);
 57 |       if (optarg)
 58 |         printf(" with arg %s", optarg);
 59 |       printf("\n");
 60 |       break;
 61 | 
 62 |     case 's':
 63 |       is_seperate = 1;
 64 |       break;
 65 |     case 'o':
 66 |       is_omp = 1;
 67 |       break;
 68 | 
 69 |     case 'm':
 70 |       is_mpi = 1;
 71 |       break;
 72 | 
 73 |     case 'f':
 74 |       if (!strcmp(optarg, "binary"))
 75 |         is_binary = 1;
 76 |       else if (!strcmp(optarg, "text"))
 77 |         is_binary = 0;
 78 |       else
 79 |         printf("wrong value for fileformat. Try help\n"), abort();
 80 |       break;
 81 | 
 82 |     case 'i':
 83 |       /* inputFileName is a fixed-size buffer: refuse names that do not fit
 84 |        * instead of letting strcpy write past its end. */
 85 |       if (strlen(optarg) >= sizeof(inputFileName))
 86 |         printf("input file name is too long (max %lu characters)\n",
 87 |                (unsigned long)(sizeof(inputFileName) - 1)),
 88 |             abort();
 89 |       strcpy(inputFileName, optarg);
 90 | 
 91 |       break;
 92 |     case 'l':
 93 |       lines = atol(optarg);
 94 |       break;
 95 |     case 't':
 96 |       threads = atol(optarg);
 97 |       break;
 98 |     case 'h':
 99 |       printf("case 1: seperate file\n");
100 |       printf("./parda.x --enable-seperate --input=normal_137979.trace "
101 |              "--lines=137979 --threads=4\n");
102 |       printf("case 2: run with sequential algorithm\n");
103 |       printf("./parda.x --input=normal_343684.trace --lines=343684\n");
104 |       printf("case 3: run with OpenMp flag\n");
105 |       printf("./parda.x --input=normal_343684.trace --lines=343684 "
106 |              "--enable-omp --threads=4\n");
107 |       printf("case 4: run with binary file input\n");
108 |       printf("./parda.x --fileformat=binary --input=binary_167024.trace "
109 |              "--lines=167024 > binary.re\n");
110 |       exit(0);
111 |       break;
112 |     case '?':
113 |       /* getopt_long has already reported the unknown option. */
114 |       break;
115 | 
116 |     default:
117 |       abort();
118 |     }
119 |   }
120 | 
121 |   /* Echo any leftover positional arguments; they are otherwise ignored. */
122 |   if (optind < argc) {
123 |     printf("non-option ARGV-elements: ");
124 |     while (optind < argc)
125 |       printf("%s ", argv[optind++]);
126 |     putchar('\n');
127 |   }
128 |   if (lines == -1)
129 |     printf("total lines number must be provided\n"), abort();
130 |   return 0;
131 | }
110 | 


--------------------------------------------------------------------------------
/parda/process_args.h:
--------------------------------------------------------------------------------
 1 | #ifndef _PROCESS_ARGS_H
 2 | #define _PROCESS_ARGS_H
 3 | 
 4 | #include <fcntl.h>
 5 | #include <getopt.h>
 6 | #include <stdio.h>
 7 | #include <stdlib.h>
 8 | #include <string.h>
 9 | #include <sys/stat.h>
10 | #include <sys/types.h>
11 | 
12 | extern int is_omp;      /* 1 when --enable-omp / -o was given */
13 | extern int is_mpi;      /* 1 when --enable-mpi / -m was given */
14 | extern int is_seperate; /* 1 when --enable-seperate was given */
15 | extern int is_binary;   /* 1 for --fileformat=binary, 0 for text */
16 | extern char inputFileName[200]; /* --input value, defaults to "d.in" */
17 | extern long lines;      /* --lines value, -1 until provided */
18 | extern int threads;     /* --threads value, defaults to 1 */
19 | extern int buffersize;  /* defaults to 10000; not set from the command line */
20 | /* Fills the globals above from argv; returns 0, aborts when --lines is missing. */
21 | int process_args(int argc, char **argv);
22 | #endif
23 | 


--------------------------------------------------------------------------------
/parda/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Run parda.x on the 137979-line sample trace and save its stdout to seq.hist.
3 | ./parda.x --input=traces/normal_137979.trace --lines=137979 > seq.hist
4 | 


--------------------------------------------------------------------------------
/parda/seperate.c:
--------------------------------------------------------------------------------
 1 | #include "seperate.h"
 2 | 
 3 | /*
 4 |  * Count the whitespace-separated tokens in the text file `filename`.
 5 |  *
 6 |  * Returns the token count, or -1 when the file cannot be opened or the count
 7 |  * would not fit in a long.
 8 |  */
 9 | long get_file_lines(char filename[]) {
10 |   /* Largest long, spelled without relying on <limits.h> being included. */
11 |   const long max_lines = (long)(~0UL >> 1);
12 |   FILE *fp = fopen(filename, "r");
13 |   long sum = 0;
14 |   if (fp == NULL) {
15 |     printf("cannot open %s\n", filename);
16 |     return -1;
17 |   }
18 |   /* "%*s" consumes one token without storing it, so a token of any length
19 |    * is counted once and no fixed-size buffer can be overrun. fscanf returns
20 |    * 0 per skipped token and EOF once the input is exhausted. */
21 |   while (fscanf(fp, "%*s") != EOF) {
22 |     if (sum == max_lines) {
23 |       printf("Trace length is out of 32 bit integer type\n");
24 |       fclose(fp);
25 |       return -1;
26 |     }
27 |     sum++;
28 |   }
29 |   fclose(fp);
30 |   return sum;
31 | }
17 | 
18 | /* Non-zero when `ch` is one of the characters fscanf's %s treats as a separator. */
19 | static int seperate_is_blank(int ch) {
20 |   return ch != '\0' && strchr(" \t\n\v\f\r", ch) != NULL;
21 | }
22 | 
23 | /*
24 |  * Split the text trace `filename` into `processor_number` part files.
25 |  *
26 |  * Part i receives the tokens numbered parda_low(i, ...) .. parda_high(i, ...)
27 |  * (inclusive), one per output line; part file names come from
28 |  * parda_generate_pfilename. `lines` is the total number of tokens expected.
29 |  *
30 |  * Returns `lines`, or -1 when a file cannot be opened or the trace ends
31 |  * before `lines` tokens were read.
32 |  */
33 | long seperate_textfile(char filename[], int processor_number, long lines) {
34 |   FILE *fp = fopen(filename, "r");
35 |   char input[20];
36 |   long sum = lines;
37 |   int i;
38 |   long tstart, tend;
39 |   long tim;
40 |   if (fp == NULL) {
41 |     printf("cannot open %s\n", filename);
42 |     return -1;
43 |   }
44 |   for (i = 0; i < processor_number; ++i) {
45 |     FILE *fw =
46 |         fopen(parda_generate_pfilename(filename, i, processor_number), "w");
47 |     if (fw == NULL) {
48 |       printf("cannot create output file for part %d of %s\n", i, filename);
49 |       fclose(fp);
50 |       return -1;
51 |     }
52 |     tstart = parda_low(i, processor_number, sum);
53 |     tend = parda_high(i, processor_number, sum);
54 |     for (tim = tstart; tim <= tend; ++tim) {
55 |       /* The read must not live inside assert(): with NDEBUG it would vanish.
56 |        * The width keeps fscanf inside the 20-byte buffer. */
57 |       int got = fscanf(fp, "%19s", input);
58 |       int ch;
59 |       assert(got != EOF);
60 |       if (got != 1) {
61 |         printf("trace %s ended before line %ld\n", filename, tim + 1);
62 |         fclose(fw);
63 |         fclose(fp);
64 |         return -1;
65 |       }
66 |       fprintf(fw, "%s", input);
67 |       /* A non-blank character right after the 19 stored ones means the token
68 |        * is 20+ characters long: warn, then stream the rest of it so the
69 |        * output line is still the complete token. */
70 |       ch = fgetc(fp);
71 |       if (ch != EOF && !seperate_is_blank(ch)) {
72 |         printf("line %ld length is larger than SLEN, please make sure all line "
73 |                "less than SLEN\n",
74 |                tim + 1);
75 |         do {
76 |           fputc(ch, fw);
77 |           ch = fgetc(fp);
78 |         } while (ch != EOF && !seperate_is_blank(ch));
79 |       }
80 |       fputc('\n', fw);
81 |     }
82 |     fclose(fw);
83 |   }
84 |   fclose(fp);
85 |   return sum;
86 | }
45 | 
46 | /*
47 |  * Split the binary trace `filename` into `processor_number` part files.
48 |  *
49 |  * Records are sizeof(void *) bytes each. Part i receives the records numbered
50 |  * parda_low(i, ...) .. parda_high(i, ...) (inclusive), copied through a
51 |  * buffer of `buffersize` records. `lines` is the total record count expected.
52 |  *
53 |  * Returns `lines`, or -1 when a file cannot be opened or the buffer cannot be
54 |  * allocated.
55 |  */
56 | long seperate_binaryfile(char filename[], int processor_number, long lines) {
57 |   FILE *fp = fopen(filename, "rb");
58 |   long sum = lines;
59 |   int i;
60 |   long tstart, tend;
61 |   long t, count;
62 |   void **buffer;
63 |   if (fp == NULL) {
64 |     printf("cannot open %s\n", filename);
65 |     return -1;
66 |   }
67 |   buffer = (void **)malloc(buffersize * sizeof(void *));
68 |   if (buffer == NULL) {
69 |     printf("Ran out of space\n");
70 |     fclose(fp);
71 |     return -1;
72 |   }
73 |   for (i = 0; i < processor_number; ++i) {
74 |     FILE *fw =
75 |         fopen(parda_generate_pfilename(filename, i, processor_number), "wb");
76 |     if (fw == NULL) {
77 |       printf("cannot create output file for part %d of %s\n", i, filename);
78 |       free(buffer);
79 |       fclose(fp);
80 |       return -1;
81 |     }
82 |     tstart = parda_low(i, processor_number, sum);
83 |     tend = parda_high(i, processor_number, sum);
84 |     for (t = tstart; t <= tend; t += count) {
85 |       count = min(tend + 1 - t, buffersize);
86 |       count = fread(buffer, sizeof(void *), count, fp);
87 |       /* Nothing read (short input or read error): t would never advance,
88 |        * so stop this part instead of looping forever. */
89 |       if (count == 0)
90 |         break;
91 |       fwrite(buffer, sizeof(void *), count, fw);
92 |     }
93 |     fclose(fw);
94 |   }
95 |   free(buffer); /* the copy buffer was previously leaked */
96 |   fclose(fp);
97 |   return sum;
98 | }
68 | 
69 | /*
70 |  * Split a trace into `processor_number` part files and record its length.
71 |  *
72 |  * When `lines` is -1 the length is counted with get_file_lines. The split is
73 |  * done by seperate_binaryfile when is_binary is set, otherwise by
74 |  * seperate_textfile. Afterwards the length is written to the side file
75 |  * "<inputFileName>_lines_<lines>.txt".
76 |  *
77 |  * Returns the trace length, or a negative value when counting failed.
78 |  */
79 | long parda_seperate_file(char inputFileName[], int processor_number,
80 |                          long lines) {
81 |   if (lines == -1) {
82 |     lines = get_file_lines(inputFileName);
83 |     if (lines < 0)
84 |       return lines; /* counting failed; there is nothing sensible to split */
85 |   }
86 |   int psize = processor_number;
87 |   if (!is_binary)
88 |     seperate_textfile(inputFileName, psize, lines);
89 |   else
90 |     seperate_binaryfile(inputFileName, psize, lines);
91 |   /* Size the name from the actual input: "_lines_" (7) + a long in decimal
92 |    * (at most 20 characters) + ".txt" (4) + NUL fits in 40 extra bytes. The
93 |    * former fixed 50-byte buffer overflowed for longer input names. */
94 |   size_t name_size = strlen(inputFileName) + 40;
95 |   char *linesFile = (char *)malloc(name_size);
96 |   if (linesFile == NULL) {
97 |     printf("Ran out of space\n");
98 |     return lines;
99 |   }
100 |   snprintf(linesFile, name_size, "%s_lines_%ld.txt", inputFileName, lines);
101 |   FILE *tfile = fopen(linesFile, "w");
102 |   if (tfile == NULL) {
103 |     printf("cannot create %s\n", linesFile);
104 |   } else {
105 |     fprintf(tfile, "%ld", lines);
106 |     fclose(tfile);
107 |   }
108 |   free(linesFile);
109 |   return lines;
110 | }
85 | 


--------------------------------------------------------------------------------
/parda/seperate.h:
--------------------------------------------------------------------------------
 1 | #ifndef _SEPERATE_H
 2 | #define _SEPERATE_H
 3 | 
 4 | #include "parda.h"
 5 | 
 6 | #include <fcntl.h>
 7 | #include <getopt.h>
 8 | #include <stdio.h>
 9 | #include <stdlib.h>
10 | #include <string.h>
11 | #include <sys/stat.h>
12 | #include <sys/types.h>
13 | #include <unistd.h>
14 | 
15 | #ifndef min /* only defined when no min macro exists yet */
16 | #define min(a, b) (((a) < (b)) ? (a) : (b))
17 | #endif /* note: min evaluates the argument it returns twice */
18 | /* Trace-splitting helpers; `lines` is the total number of trace entries. */
19 | long get_file_lines(char filename[]); /* counts whitespace-separated tokens */
20 | long seperate_textfile(char filename[], int processor_number, long lines); /* text split; returns lines */
21 | long seperate_binaryfile(char inputFileName[], int processor_number,
22 |                          long lines); /* binary split; returns lines */
23 | long parda_seperate_file(char inputFileName[], int processor_number,
24 |                          long lines); /* picks text/binary via is_binary; returns lines */
25 | #endif
26 | 


--------------------------------------------------------------------------------
/parda/splay.c:
--------------------------------------------------------------------------------
  1 | #include "splay.h"
  2 | 
  3 | Tree *splay(T i, Tree *t)
  4 | /* Top-down splay of t around key i, keeping every node's `size` up to date. Returns the new root: the node holding i if present, otherwise the last node reached while searching; NULL when t is NULL. */
  5 | {
  6 |   Tree N, *l, *r, *y; /* N: stack dummy that anchors the two side trees */
  7 |   T comp, l_size, r_size;
  8 |   if (t == NULL)
  9 |     return t;
 10 |   N.left = N.right = NULL;
 11 |   l = r = &N; /* l / r: current tail of the left / right side tree */
 12 |   l_size = r_size = 0; /* node counts moved into each side tree so far */
 13 | 
 14 |   for (;;) {
 15 |     comp = compare(i, t->key);
 16 |     if (comp < 0) {
 17 |       if (t->left == NULL)
 18 |         break;
 19 |       if (compare(i, t->left->key) < 0) { /* two lefts in a row: rotate right first */
 20 |         y = t->left;
 21 |         t->left = y->right;
 22 |         y->right = t;
 23 |         t->size = node_size(t->left) + node_size(t->right) + 1; /* t was demoted: recompute */
 24 |         t = y;
 25 |         if (t->left == NULL)
 26 |           break;
 27 |       }
 28 |       r->left = t; /* link t (and its right subtree) onto the right side tree */
 29 |       r = t;
 30 |       t = t->left;
 31 |       r_size += 1 + node_size(r->right);
 32 |     } else if (comp > 0) {
 33 |       if (t->right == NULL)
 34 |         break;
 35 |       if (compare(i, t->right->key) > 0) { /* two rights in a row: rotate left first */
 36 |         y = t->right;
 37 |         t->right = y->left;
 38 |         y->left = t;
 39 |         t->size = node_size(t->left) + node_size(t->right) + 1; /* t was demoted: recompute */
 40 |         t = y;
 41 |         if (t->right == NULL)
 42 |           break;
 43 |       }
 44 |       l->right = t; /* link t (and its left subtree) onto the left side tree */
 45 |       l = t;
 46 |       t = t->right;
 47 |       l_size += 1 + node_size(l->left);
 48 |     } else {
 49 |       break; /* key found: t becomes the root */
 50 |     }
 51 |   }
 52 |   l_size += node_size(t->left); /* t's subtrees will be appended to the side trees below */
 53 |   r_size += node_size(t->right);
 54 |   t->size = l_size + r_size + 1;
 55 |   /* Cut both chains so the two size-repair walks below terminate. */
 56 |   l->right = r->left = NULL;
 57 |   /* Walk each side tree's spine top-down, giving every spine node the count of nodes at or below it. */
 58 |   for (y = N.right; y != NULL; y = y->right) {
 59 |     y->size = l_size;
 60 |     l_size -= 1 + node_size(y->left);
 61 |   }
 62 |   for (y = N.left; y != NULL; y = y->left) {
 63 |     y->size = r_size;
 64 |     r_size -= 1 + node_size(y->right);
 65 |   }
 66 |   /* Reassemble: t's old subtrees go to the chain tails, the side trees become t's children. */
 67 |   l->right = t->left;
 68 |   r->left = t->right;
 69 |   t->left = N.right;
 70 |   t->right = N.left;
 71 | 
 72 |   return t;
 73 | }
 74 | 
 75 | /*
 76 |  * Insert key i into tree t and return the new root.
 77 |  * If i is already present, no node is added and the splayed tree is
 78 |  * returned. Exits the process when memory runs out.
 79 |  */
 80 | Tree *insert(T i, Tree *t) {
 81 |   Tree *node;
 82 | 
 83 |   if (t != NULL) {
 84 |     t = splay(i, t);
 85 |     if (compare(i, t->key) == 0)
 86 |       return t;
 87 |   }
 88 | 
 89 |   node = (Tree *)malloc(sizeof(Tree));
 90 |   if (node == NULL) {
 91 |     printf("Ran out of space\n");
 92 |     exit(1);
 93 |   }
 94 |   node->key = i;
 95 |   node->left = NULL;
 96 |   node->right = NULL;
 97 | 
 98 |   if (t != NULL) {
 99 |     if (compare(i, t->key) < 0) {
100 |       /* New root sits just left of the old root. */
101 |       node->left = t->left;
102 |       node->right = t;
103 |       t->left = NULL;
104 |       t->size = 1 + node_size(t->right);
105 |     } else {
106 |       /* New root sits just right of the old root. */
107 |       node->right = t->right;
108 |       node->left = t;
109 |       t->right = NULL;
110 |       t->size = 1 + node_size(t->left);
111 |     }
112 |   }
113 |   node->size = 1 + node_size(node->left) + node_size(node->right);
114 |   return node;
115 | }
107 | 
108 | /*
109 |  * Remove key i from tree t and return the new root.
110 |  * When i is absent the tree is only splayed; the result may be NULL when
111 |  * the last node was removed or t was NULL.
112 |  */
113 | Tree *delete_(T i, Tree *t) {
114 |   Tree *rest;
115 |   T old_size;
116 | 
117 |   if (t == NULL)
118 |     return NULL;
119 | 
120 |   old_size = t->size;
121 |   t = splay(i, t);
122 |   if (compare(i, t->key) != 0)
123 |     return t; /* key not in the tree */
124 | 
125 |   if (t->left != NULL) {
126 |     /* Splaying the left subtree around i brings up a node with no right child. */
127 |     rest = splay(i, t->left);
128 |     rest->right = t->right;
129 |   } else {
130 |     rest = t->right;
131 |   }
132 |   free(t);
133 |   if (rest != NULL)
134 |     rest->size = old_size - 1;
135 |   return rest;
136 | }
133 | 
134 | /*
135 |  * Return the node with zero-based in-order rank r in tree t, or NULL when
136 |  * r is negative or not smaller than the tree's size. Does not splay.
137 |  */
138 | Tree *find_rank(T r, Tree *t) {
139 |   if (r < 0 || r >= node_size(t))
140 |     return NULL;
141 | 
142 |   while (1) {
143 |     T left_count = node_size(t->left);
144 |     if (r == left_count)
145 |       return t;
146 |     if (r < left_count) {
147 |       t = t->left;
148 |     } else {
149 |       /* Skip the left subtree and this node. */
150 |       r = r - left_count - 1;
151 |       t = t->right;
152 |     }
153 |   }
154 | }
151 | /* Free every node of tree t (right subtree, then left subtree, then t). */
152 | void freetree(Tree *t) {
153 |   if (t != NULL) {
154 |     freetree(t->right);
155 |     freetree(t->left);
156 |     free(t);
157 |   }
158 | }
158 | /*
159 |  * Print tree t sideways to stdout: right subtree first, then "key(size)"
160 |  * indented by two spaces per depth level d, then the left subtree.
161 |  */
162 | void printtree(Tree *t, int d) {
163 |   int pad;
164 | 
165 |   if (t != NULL) {
166 |     printtree(t->right, d + 1);
167 |     for (pad = d; pad > 0; --pad)
168 |       printf("  ");
169 |     printf("%d(%d)\n", t->key, t->size);
170 |     printtree(t->left, d + 1);
171 |   }
172 | }
169 | 


--------------------------------------------------------------------------------
/parda/splay.h:
--------------------------------------------------------------------------------
 1 | #ifndef _splay_h
 2 | #define _splay_h
 3 | #include <stdio.h>
 4 | #include <stdlib.h>
 5 | 
 6 | typedef struct tree_node Tree;
 7 | typedef int T; /* type of both keys and subtree sizes */
 8 | struct tree_node {
 9 |   Tree *left, *right; /* children; NULL when absent */
10 |   T key;
11 |   T size; /* number of nodes in the subtree rooted here, this node included */
12 | };
13 | 
14 | /* Three-way key comparison: negative, zero or positive as i is less than,
15 |  * equal to or greater than j. Built from two comparisons instead of
16 |  * (i) - (j), so keys that are far apart cannot overflow. Each argument is
17 |  * evaluated twice. */
18 | #define compare(i, j) (((i) > (j)) - ((i) < (j)))
15 | 
16 | #define node_size(x) (((x) == NULL) ? 0 : ((x)->size))
17 | /* node_size treats a NULL subtree as size 0. Functions returning Tree * return the (possibly new) root unless noted. */
18 | Tree *splay(T i, Tree *t);     /* move i, or the last node on its search path, to the root */
19 | Tree *insert(T i, Tree *t);    /* add i unless already present */
20 | Tree *delete_(T i, Tree *t);   /* remove i if present */
21 | Tree *find_rank(T r, Tree *t); /* node with zero-based rank r, or NULL if out of range */
22 | void printtree(Tree *t, int d); /* print "key(size)" lines, indented by depth d */
23 | void freetree(Tree *t);        /* free every node */
24 | #endif
25 | 


--------------------------------------------------------------------------------
/sass-split/.gitignore:
--------------------------------------------------------------------------------
1 | process_sass_dir
2 | process_sass_dir.o
3 | 


--------------------------------------------------------------------------------
/sass-split/Makefile:
--------------------------------------------------------------------------------
 1 | CXX=g++
 2 | CXXFLAGS=-std=c++11 -Wall -Wextra -O2
 3 | LDFLAGS=
 4 | OBJ_FILES=process_sass_dir.o
 5 | TARGET=process_sass_dir
 6 | # "all" is the first target, so a bare `make` builds $(TARGET).
 7 | all: $(TARGET)
 8 | 
 9 | $(TARGET): $(OBJ_FILES)
10 | 	$(CXX) $(LDFLAGS) -o $@ $^
11 | 
12 | %.o: %.cpp
13 | 	$(CXX) $(CXXFLAGS) -c -o $@ $<
14 | 
15 | clean:
16 | 	rm -f $(OBJ_FILES) $(TARGET)
17 | 
18 | .PHONY: all clean
19 | 


--------------------------------------------------------------------------------
/sass-split/sass-split.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | SEARCH_ROOT="../apps/OursTracesCollection"
 4 | 
 5 | CMD="./process_sass_dir --dir"
 6 | 
 7 | find "$SEARCH_ROOT" -type d -name "sass_traces" | while read dir; do
 8 |     echo "Processing directory: $dir"
 9 |     $CMD "$dir"
10 |     echo "Current Time: $(date)"
11 | done
12 | 


--------------------------------------------------------------------------------
/trace-driven/entry.h:
--------------------------------------------------------------------------------
 1 | #ifndef ENTRY_H
 2 | #define ENTRY_H
 3 | 
 4 | // One slot of the instruction fetch buffer: identifies an instruction by
 5 | // (pc, wid, kid, uid) and carries its latency / initial-interval counters.
 6 | struct inst_fetch_buffer_entry {
 7 |   // Empty slot: not valid, latency and initial_interval hold (unsigned)-1.
 8 |   inst_fetch_buffer_entry() {
 9 |     pc = 0;
10 |     wid = 0;
11 |     kid = 0;
12 |     uid = 0;
13 |     m_valid = false;
14 |     latency = static_cast<unsigned>(-1);
15 |     initial_interval = static_cast<unsigned>(-1);
16 |     initial_interval_dec_counter = 0;
17 |   }
18 | 
19 |   // Valid slot for the given instruction; latency holds (unsigned)-1 until
20 |   // set_latency is called, the initial interval starts at 0.
21 |   inst_fetch_buffer_entry(unsigned _pc, unsigned _wid,
22 |                           unsigned _kid, unsigned _uid) {
23 |     pc = _pc;
24 |     wid = _wid;
25 |     kid = _kid;
26 |     uid = _uid;
27 |     m_valid = true;
28 |     latency = static_cast<unsigned>(-1);
29 |     initial_interval = 0;
30 |     initial_interval_dec_counter = 0;
31 |   }
32 | 
33 |   // inst_fetch_buffer_entry(inst_fetch_buffer_entry&& other) noexcept 
34 |   //   : pc(other.pc), wid(other.wid), kid(other.kid), uid(other.uid),
35 |   //     m_valid(other.m_valid), latency(other.latency),
36 |   //     initial_interval(other.initial_interval), 
37 |   //     initial_interval_dec_counter(other.initial_interval_dec_counter) {
38 |   //   other.m_valid = false;
39 |   // }
40 | 
41 |   void set_latency(unsigned _latency) { latency = _latency; }
42 | 
43 |   // Sets the interval and restarts its countdown from the same value.
44 |   void set_initial_interval(unsigned _initial_interval) {
45 |     initial_interval_dec_counter = _initial_interval;
46 |     initial_interval = _initial_interval;
47 |   }
48 | 
49 |   unsigned pc;
50 |   unsigned wid;
51 |   unsigned kid;
52 |   unsigned uid;
53 |   bool m_valid;
54 |   unsigned latency;
55 |   unsigned initial_interval;
56 |   unsigned initial_interval_dec_counter;
57 | };
39 | 
40 | // Identifies a warp by kernel id, thread-block id and warp id.
41 | struct curr_instn_id_per_warp_entry {
42 |   // All three ids start at 0.
43 |   curr_instn_id_per_warp_entry() : kid(0), block_id(0), warp_id(0) {}
44 | 
45 |   curr_instn_id_per_warp_entry(unsigned _kid, unsigned _block_id,
46 |                                unsigned _warp_id)
47 |       : kid(_kid), block_id(_block_id), warp_id(_warp_id) {}
48 | 
49 |   unsigned kid;
50 |   unsigned block_id;
51 |   unsigned warp_id;
52 | };
56 | 
57 | bool operator<(const curr_instn_id_per_warp_entry &lhs,
58 |                const curr_instn_id_per_warp_entry &rhs);
59 | 
60 | #endif


--------------------------------------------------------------------------------
/trace-driven/hw-stt.cc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ConvolutedDog/HyFiSS/e1447e8826c35a23169a63b6fc748a6fb6d5da9c/trace-driven/hw-stt.cc


--------------------------------------------------------------------------------
/trace-driven/hw-stt.h:
--------------------------------------------------------------------------------
1 | 
2 | #include "inst-stt.h"
3 | 
4 | #ifndef HW_STT_H
5 | #define HW_STT_H
6 | 
7 | #endif


--------------------------------------------------------------------------------
/trace-driven/inst-stt.cc:
--------------------------------------------------------------------------------
 1 | #include <bits/stdc++.h>
 2 | #include <fstream>
 3 | #include <iostream>
 4 | #include <math.h>
 5 | #include <sstream>
 6 | #include <stdio.h>
 7 | #include <string>
 8 | #include <time.h>
 9 | #include <vector>
10 | 
11 | #include "../ISA-Def/accelwattch_component_mapping.h"
12 | #include "../ISA-Def/ampere_opcode.h"
13 | #include "../ISA-Def/kepler_opcode.h"
14 | #include "../ISA-Def/pascal_opcode.h"
15 | #include "../ISA-Def/trace_opcode.h"
16 | #include "../ISA-Def/turing_opcode.h"
17 | #include "../ISA-Def/volta_opcode.h"
18 | #include "inst-stt.h"
19 | 
20 | // A new instruction state has none of its stage flags set.
21 | inst_stt::inst_stt()
22 |     : fetch_stage(false), wr_bk_stage(false), warp_exit_stage(false) {}
28 | 


--------------------------------------------------------------------------------
/trace-driven/inst-stt.h:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <bitset>
 3 | #include <list>
 4 | #include <map>
 5 | #include <stdio.h>
 6 | #include <stdlib.h>
 7 | #include <string.h>
 8 | #include <string>
 9 | 
10 | #ifndef INST_STT_H
11 | #define INST_STT_H
12 | 
13 | typedef bool inst_stage_t;
14 | 
15 | class inst_stt { // per-instruction stage flags (inst_stage_t is bool)
16 | public:
17 |   inst_stt(); // clears all three flags
18 | 
19 | private:
20 |   inst_stage_t fetch_stage;
21 | // NOTE(review): no accessors are declared here, so the flags are only reachable from members
22 |   inst_stage_t wr_bk_stage;
23 |   inst_stage_t warp_exit_stage;
24 | };
25 | 
26 | class mem_stat_t { // currently declares nothing but a constructor
27 | public:
28 |   mem_stat_t(); // NOTE(review): no definition in inst-stt.cc — confirm one exists before instantiating
29 | };
30 | 
31 | struct SM_computation_instance {}; // empty placeholder type
32 | 
33 | #endif


--------------------------------------------------------------------------------
/trace-driven/kernel-info.cc:
--------------------------------------------------------------------------------
 1 | #include "kernel-info.h"
 2 | 
 3 | // Records the launch geometry and takes the next id from
 4 | // kernel_info_m_next_uid (post-incremented, so ids are handed out in order).
 5 | kernel_info_t::kernel_info_t(dim3 gridDim, dim3 blockDim)
 6 |     : m_uid(kernel_info_m_next_uid++), m_grid_dim(gridDim),
 7 |       m_block_dim(blockDim) {}
11 | 
12 | // Binds a kernel to its parser and trace header and selects the opcode table
13 | // that matches the header's binary version (Ampere, Volta, Pascal, Kepler or
14 | // Turing). An unsupported version is fatal: the message is printed and the
15 | // process exits with a non-zero status so callers and scripts can tell the
16 | // run failed.
17 | trace_kernel_info_t::trace_kernel_info_t(dim3 gridDim, dim3 blockDim,
18 |                                          trace_parser *parser,
19 | 
20 |                                          kernel_trace_t *kernel_trace_info)
21 |     : kernel_info_t(gridDim, blockDim) {
22 |   m_parser = parser;
23 |   m_kernel_trace_info = kernel_trace_info;
24 | 
25 |   const unsigned version = kernel_trace_info->binary_verion;
26 | 
27 |   if (version == AMPERE_RTX_BINART_VERSION ||
28 |       version == AMPERE_A100_BINART_VERSION)
29 |     OpcodeMap = &Ampere_OpcodeMap;
30 |   else if (version == VOLTA_BINART_VERSION)
31 |     OpcodeMap = &Volta_OpcodeMap;
32 |   else if (version == PASCAL_TITANX_BINART_VERSION ||
33 |            version == PASCAL_P100_BINART_VERSION)
34 |     OpcodeMap = &Pascal_OpcodeMap;
35 |   else if (version == KEPLER_BINART_VERSION)
36 |     OpcodeMap = &Kepler_OpcodeMap;
37 |   else if (version == TURING_BINART_VERSION)
38 |     OpcodeMap = &Turing_OpcodeMap;
39 |   else {
40 |     // binary_verion is unsigned, so it is printed with %u.
41 |     printf("unsupported binary version: %u\n", version);
42 |     fflush(stdout);
43 |     exit(1);
44 |   }
45 | }
39 | 
40 | // Forwards to the parser and hands back its reference to the memory
41 | // instructions of one thread block of one kernel.
42 | std::vector<mem_instn> &
43 | trace_kernel_info_t::get_one_kernel_one_threadblock_traces(unsigned kernel_id,
44 |                                                            unsigned block_id) {
45 |   std::vector<mem_instn> &block_mem_instns =
46 |       m_parser->get_one_kernel_one_threadblcok_mem_instns(kernel_id, block_id);
47 |   return block_mem_instns;
48 | }
46 | 
47 | // Asks the parser for the next thread block's per-warp instruction traces,
48 | // passing along the trace version, line-info flag and input stream stored in
49 | // this kernel's trace header.
50 | std::vector<std::vector<inst_trace_t> *>
51 | trace_kernel_info_t::get_next_threadblock_traces(
52 |     std::string kernel_name, unsigned kernel_id,
53 |     unsigned num_warps_per_thread_block) {
54 |   kernel_trace_t *header = m_kernel_trace_info;
55 |   return m_parser->get_next_threadblock_traces(
56 |       header->trace_verion, header->enable_lineinfo, header->ifs, kernel_name,
57 |       kernel_id, num_warps_per_thread_block);
58 | }


--------------------------------------------------------------------------------
/trace-driven/kernel-info.h:
--------------------------------------------------------------------------------
 1 | #include <cstdio>
 2 | #include <string>
 3 | #include <vector>
 4 | 
 5 | #include "../common/vector_types.h"
 6 | #include "../trace-parser/trace-parser.h"
 7 | #include "mem-access.h"
 8 | 
 9 | #ifndef KERNEL_INFO_H
10 | #define KERNEL_INFO_H
11 | 
12 | // Launch geometry (grid and block dimensions) of one kernel plus the id the
13 | // constructor assigns to it.
14 | class kernel_info_t {
15 | public:
16 |   kernel_info_t(dim3 gridDim, dim3 blockDim);
17 |   ~kernel_info_t(){};
18 | 
19 |   // Number of thread blocks in the grid. Every factor is widened to size_t
20 |   // before multiplying, so the product cannot wrap in the narrower type of
21 |   // the dim3 fields.
22 |   size_t num_blocks() const {
23 |     return static_cast<size_t>(m_grid_dim.x) *
24 |            static_cast<size_t>(m_grid_dim.y) *
25 |            static_cast<size_t>(m_grid_dim.z);
26 |   }
27 | 
28 |   // Number of threads in one thread block, computed in size_t as above.
29 |   size_t threads_per_cta() const {
30 |     return static_cast<size_t>(m_block_dim.x) *
31 |            static_cast<size_t>(m_block_dim.y) *
32 |            static_cast<size_t>(m_block_dim.z);
33 |   }
34 | 
35 |   dim3 get_grid_dim() const { return m_grid_dim; }
36 | 
37 |   dim3 get_cta_dim() const { return m_block_dim; }
38 | 
39 |   unsigned get_uid() const { return m_uid; }
40 | 
41 |   unsigned m_uid; // assigned once by the constructor
42 | 
43 |   dim3 m_grid_dim;
44 |   dim3 m_block_dim;
45 | };
36 | 
37 | class trace_kernel_info_t : public kernel_info_t { // kernel_info_t bound to a trace parser and a trace header
38 | public:
39 |   trace_kernel_info_t(dim3 gridDim, dim3 blockDim, trace_parser *parser,
40 | // the constructor also picks OpcodeMap from kernel_trace_info->binary_verion
41 |                       kernel_trace_t *kernel_trace_info);
42 |   ~trace_kernel_info_t() { delete m_kernel_trace_info; }; // NOTE(review): the header is deleted here but copying is not disabled — a copy would delete it twice
43 |   std::vector<std::vector<inst_trace_t> *>
44 |   get_next_threadblock_traces(std::string kernel_name, unsigned kernel_id,
45 |                               unsigned num_warps_per_thread_block);
46 |   std::vector<mem_instn> &
47 |   get_one_kernel_one_threadblock_traces(unsigned kernel_id, unsigned block_id);
48 | // Both trace getters forward to m_parser.
49 |   unsigned long get_cuda_stream_id() {
50 |     return m_kernel_trace_info->cuda_stream_id;
51 |   }
52 | 
53 |   kernel_trace_t *get_trace_info() { return m_kernel_trace_info; } // still owned by this object
54 | 
55 | private:
56 |   const std::unordered_map<std::string, OpcodeChar> *OpcodeMap; // points at one of the per-architecture tables
57 |   trace_parser *m_parser; // never deleted by this class
58 |   kernel_trace_t *m_kernel_trace_info; // deleted in the destructor
59 | };
60 | 
61 | #endif


--------------------------------------------------------------------------------
/trace-driven/kernel-trace.cc:
--------------------------------------------------------------------------------
1 | #include "kernel-trace.h"
2 | 
3 | kernel_trace_t::kernel_trace_t() {
4 |   kernel_name = "Empty";
5 |   shmem_base_addr = 0;
6 |   local_base_addr = 0;
7 |   binary_verion = 0;
8 |   trace_verion = 0;
9 | }


--------------------------------------------------------------------------------
/trace-driven/kernel-trace.h:
--------------------------------------------------------------------------------
 1 | #include <fstream>
 2 | #include <string>
 3 | 
 4 | #include "../common/common_def.h"
 5 | 
 6 | #ifndef KERNEL_TRACE_H
 7 | #define KERNEL_TRACE_H
 8 | 
 9 | struct kernel_trace_t { // header data of one traced kernel
10 |   kernel_trace_t();
11 | // NOTE(review): the constructor in kernel-trace.cc sets only kernel_name, the two base addresses and the two version fields
12 |   std::string kernel_name; // "Empty" until filled in
13 |   unsigned kernel_id;
14 |   unsigned grid_dim_x;
15 |   unsigned grid_dim_y;
16 |   unsigned grid_dim_z;
17 |   unsigned tb_dim_x;
18 |   unsigned tb_dim_y;
19 |   unsigned tb_dim_z;
20 |   unsigned shmem;
21 |   unsigned nregs;
22 |   unsigned long cuda_stream_id; // returned by trace_kernel_info_t::get_cuda_stream_id
23 |   unsigned binary_verion; // compared with the *_BINART_VERSION constants to pick an opcode table
24 |   unsigned enable_lineinfo; // forwarded to trace_parser::get_next_threadblock_traces
25 |   unsigned trace_verion; // forwarded to trace_parser::get_next_threadblock_traces
26 |   std::string nvbit_verion;
27 |   unsigned long long shmem_base_addr;
28 |   unsigned long long local_base_addr;
29 | 
30 | #ifdef ENABLE_SAMPLING_POINT
31 |   unsigned sampling_point; // only present when ENABLE_SAMPLING_POINT is defined
32 | #endif
33 | 
34 |   std::ifstream *ifs; // forwarded to trace_parser::get_next_threadblock_traces
35 | };
36 | 
37 | #endif
38 | 


--------------------------------------------------------------------------------
/trace-driven/mem-access.cc:
--------------------------------------------------------------------------------
 1 | #include "mem-access.h"
 2 | 
 3 | // Builds an access from type, address, size and direction; the three masks
 4 | // are default-constructed and m_uid is left untouched.
 5 | mem_access_t::mem_access_t(mem_access_type type, new_addr_type address,
 6 |                            unsigned size, bool wr)
 7 |     : m_addr(address), m_write(wr), m_req_size(size), m_type(type) {}
14 | 
15 | // Builds an access with explicit warp, byte and sector masks; m_uid is left
16 | // untouched.
17 | mem_access_t::mem_access_t(mem_access_type type, new_addr_type address,
18 |                            unsigned size, bool wr,
19 |                            const active_mask_t &active_mask,
20 |                            const mem_access_byte_mask_t &byte_mask,
21 |                            const mem_access_sector_mask_t &sector_mask)
22 |     : m_addr(address), m_write(wr), m_req_size(size), m_type(type),
23 |       m_warp_mask(active_mask), m_byte_mask(byte_mask),
24 |       m_sector_mask(sector_mask) {}
27 | 
28 | // Writes "addr=0x<hex>, <store|load >, size=<n>, <TYPE>" to fp, where TYPE is
29 | // an 8-character label for the access type ("unknown " for unlisted types).
30 | // No newline is written.
31 | void mem_access_t::print(FILE *fp) const {
32 |   const char *type_label;
33 |   switch (m_type) {
34 |   case GLOBAL_ACC_R:
35 |     type_label = "GLOBAL_R";
36 |     break;
37 |   case LOCAL_ACC_R:
38 |     type_label = "LOCAL_R ";
39 |     break;
40 |   case CONST_ACC_R:
41 |     type_label = "CONST   ";
42 |     break;
43 |   case TEXTURE_ACC_R:
44 |     type_label = "TEXTURE ";
45 |     break;
46 |   case GLOBAL_ACC_W:
47 |     type_label = "GLOBAL_W";
48 |     break;
49 |   case LOCAL_ACC_W:
50 |     type_label = "LOCAL_W ";
51 |     break;
52 |   case L2_WRBK_ACC:
53 |     type_label = "L2_WRBK ";
54 |     break;
55 |   case INST_ACC_R:
56 |     type_label = "INST    ";
57 |     break;
58 |   case L1_WRBK_ACC:
59 |     type_label = "L1_WRBK ";
60 |     break;
61 |   default:
62 |     type_label = "unknown ";
63 |     break;
64 |   }
65 | 
66 |   const char *direction = m_write ? "store" : "load ";
67 |   fprintf(fp, "addr=0x%llx, %s, size=%u, ", m_addr, direction, m_req_size);
68 |   fprintf(fp, "%s", type_label);
69 | }


--------------------------------------------------------------------------------
/trace-driven/mem-access.h:
--------------------------------------------------------------------------------
 1 | #include "../common/common_def.h"
 2 | 
 3 | #ifndef MEM_ACCESS_H
 4 | #define MEM_ACCESS_H
 5 | 
 6 | class mem_access_t { // one memory access: type, address, size, direction and masks
 7 | public:
 8 |   mem_access_t() {} // leaves the scalar members uninitialized
 9 | 
10 |   mem_access_t(mem_access_type type, new_addr_type address, unsigned size,
11 |                bool wr);
12 | // The overload below additionally takes the warp, byte and sector masks.
13 |   mem_access_t(mem_access_type type, new_addr_type address, unsigned size,
14 |                bool wr, const active_mask_t &active_mask,
15 |                const mem_access_byte_mask_t &byte_mask,
16 |                const mem_access_sector_mask_t &sector_mask);
17 | 
18 |   new_addr_type get_addr() const { return m_addr; }
19 | 
20 |   void set_addr(new_addr_type addr) { m_addr = addr; }
21 | 
22 |   unsigned get_size() const { return m_req_size; }
23 | 
24 |   const active_mask_t &get_warp_mask() const { return m_warp_mask; }
25 | 
26 |   bool is_write() const { return m_write; }
27 | 
28 |   enum mem_access_type get_type() const { return m_type; }
29 | 
30 |   mem_access_byte_mask_t get_byte_mask() const { return m_byte_mask; } // returned by value
31 | 
32 |   mem_access_sector_mask_t get_sector_mask() const { return m_sector_mask; } // returned by value
33 | 
34 |   void print(FILE *fp) const; // writes address, direction, size and type label to fp
35 | 
36 | private:
37 |   unsigned m_uid; // NOTE(review): not set by either constructor in mem-access.cc and has no accessor here
38 | 
39 |   new_addr_type m_addr;
40 | 
41 |   bool m_write; // true for a store (print shows "store"), false for a load
42 | 
43 |   unsigned m_req_size; // reported by get_size()
44 | 
45 |   mem_access_type m_type;
46 | 
47 |   active_mask_t m_warp_mask;
48 | 
49 |   mem_access_byte_mask_t m_byte_mask;
50 | 
51 |   mem_access_sector_mask_t m_sector_mask;
52 | };
53 | 
54 | #endif


--------------------------------------------------------------------------------
/trace-driven/trace-warp-inst.cc:
--------------------------------------------------------------------------------
  1 | #include "trace-warp-inst.h"
  2 | 
  3 | // Maps an operation category (and, for ALU_OP, its special-op class) to the
  4 | // operand type: FP_OP, INT_OP, or UN_OP when nothing matches.
  5 | inline types_of_operands get_oprnd_type(op_type op, special_ops sp_op) {
  6 |   types_of_operands result = UN_OP;
  7 |   switch (op) {
  8 |   case SP_OP:
  9 |   case SFU_OP:
 10 |   case SPECIALIZED_UNIT_2_OP:
 11 |   case SPECIALIZED_UNIT_3_OP:
 12 |   case DP_OP:
 13 |   case LOAD_OP:
 14 |   case STORE_OP:
 15 |     result = FP_OP;
 16 |     break;
 17 |   case INTP_OP:
 18 |   case SPECIALIZED_UNIT_4_OP:
 19 |     result = INT_OP;
 20 |     break;
 21 |   case ALU_OP:
 22 |     // An ALU op with any other special-op class keeps UN_OP.
 23 |     if ((sp_op == FP__OP) || (sp_op == TEX__OP) || (sp_op == OTHER_OP)) {
 24 |       result = FP_OP;
 25 |     } else if (sp_op == INT__OP) {
 26 |       result = INT_OP;
 27 |     }
 28 |     break;
 29 |   default:
 30 |     break;
 31 |   }
 32 |   return result;
 33 | }
 25 | 
 26 | bool trace_warp_inst_t::parse_from_trace_struct(
 27 |     const _inst_trace_t *trace,
 28 |     const std::unordered_map<std::string, OpcodeChar> *OpcodeMap,
 29 |     unsigned gwarp_id) {
 30 | 
 31 |   active_mask_t active_mask = trace->mask;
 32 |   set_active(active_mask);
 33 | 
 34 |   m_decoded = true;
 35 |   pc = (address_type)trace->m_pc;
 36 |   m_gwarp_id = gwarp_id;
 37 | 
 38 |   isize = 16;
 39 |   for (unsigned i = 0; i < MAX_OUTPUT_VALUES; i++) {
 40 |     out[i] = 0;
 41 |   }
 42 |   for (unsigned i = 0; i < MAX_INPUT_VALUES; i++) {
 43 |     in[i] = 0;
 44 |   }
 45 | 
 46 |   is_vectorin = false;
 47 |   is_vectorout = false;
 48 |   ar1 = -1;
 49 |   ar2 = -1;
 50 |   memory_op = no_memory_op;
 51 |   data_size = 0;
 52 |   op = ALU_OP;
 53 |   sp_op = OTHER_OP;
 54 |   mem_op = NOT_TEX;
 55 |   const_cache_operand = 0;
 56 |   oprnd_type = UN_OP;
 57 | 
 58 |   const std::vector<std::string> &opcode_tokens =
 59 |       trace->get_opcode_tokens_directly();
 60 |   std::string opcode1 = opcode_tokens[0];
 61 | 
 62 |   std::unordered_map<std::string, OpcodeChar>::const_iterator it =
 63 |       OpcodeMap->find(opcode1);
 64 | 
 65 |   if (it != OpcodeMap->end()) {
 66 | 
 67 |     m_opcode = it->second.opcode;
 68 |     op = (op_type)(it->second.opcode_category);
 69 |     const std::unordered_map<unsigned, unsigned> *OpcPowerMap = &OpcodePowerMap;
 70 | 
 71 |     std::unordered_map<unsigned, unsigned>::const_iterator it2 =
 72 |         OpcPowerMap->find(m_opcode);
 73 |     if (it2 != OpcPowerMap->end())
 74 |       sp_op = (special_ops)(it2->second);
 75 |     oprnd_type = get_oprnd_type(op, sp_op);
 76 |   } else {
 77 |     std::cout << "ERROR:  undefined instruction : " << trace->opcode
 78 |               << " Opcode: " << opcode1 << std::endl;
 79 |     assert(0 && "undefined instruction");
 80 |   }
 81 | 
 82 |   std::string opcode = trace->opcode;
 83 |   if (opcode1 == "MUFU") {
 84 | 
 85 |     if ((opcode.find("MUFU.SIN") != std::string::npos) ||
 86 |         (opcode.find("MUFU.COS") != std::string::npos))
 87 |       sp_op = FP_SIN_OP;
 88 |     if ((opcode.find("MUFU.EX2") != std::string::npos) ||
 89 |         (opcode.find("MUFU.RCP") != std::string::npos))
 90 |       sp_op = FP_EXP_OP;
 91 |     if (opcode.find("MUFU.RSQ") != std::string::npos)
 92 |       sp_op = FP_SQRT_OP;
 93 |     if (opcode.find("MUFU.LG2") != std::string::npos)
 94 |       sp_op = FP_LG_OP;
 95 |   }
 96 | 
 97 |   if (opcode1 == "IMAD") {
 98 | 
 99 |     if ((opcode.find("IMAD.MOV") != std::string::npos) ||
100 |         (opcode.find("IMAD.IADD") != std::string::npos))
101 |       sp_op = INT__OP;
102 |   }
103 | 
104 |   num_regs = trace->reg_srcs_num + trace->reg_dsts_num;
105 |   num_operands = num_regs;
106 |   outcount = trace->reg_dsts_num;
107 |   for (unsigned m = 0; m < trace->reg_dsts_num; ++m) {
108 |     out[m] = trace->reg_dest[m];
109 |     arch_reg.dst[m] = trace->reg_dest[m];
110 |   }
111 | 
112 |   incount = trace->reg_srcs_num;
113 |   for (unsigned m = 0; m < trace->reg_srcs_num; ++m) {
114 |     in[m] = trace->reg_src[m];
115 |     arch_reg.src[m] = trace->reg_src[m];
116 |   }
117 | 
118 |   if (trace->memadd_info != NULL) {
119 |     data_size = trace->memadd_info->width;
120 |   }
121 | 
122 |   switch (m_opcode) {
123 |   case OP_LDC:
124 |     data_size = 4;
125 |     memory_op = memory_load;
126 |     const_cache_operand = 1;
127 | 
128 |     break;
129 |   case OP_LDG:
130 |   case OP_LDL:
131 |     assert(data_size > 0);
132 |     memory_op = memory_load;
133 | 
134 |     break;
135 |   case OP_STG:
136 |   case OP_STL:
137 |     assert(data_size > 0);
138 |     memory_op = memory_store;
139 | 
140 |     break;
141 |   case OP_ATOMG:
142 |   case OP_RED:
143 |   case OP_ATOM:
144 |     assert(data_size > 0);
145 |     memory_op = memory_load;
146 |     op = LOAD_OP;
147 | 
148 |     m_isatomic = true;
149 |     should_do_atomic = true;
150 | 
151 |     break;
152 |   case OP_LDS:
153 |     assert(data_size > 0);
154 |     memory_op = memory_load;
155 | 
156 |     break;
157 |   case OP_STS:
158 |     assert(data_size > 0);
159 |     memory_op = memory_store;
160 | 
161 |     break;
162 |   case OP_ATOMS:
163 |     assert(data_size > 0);
164 |     m_isatomic = true;
165 |     memory_op = memory_load;
166 | 
167 |     should_do_atomic = true;
168 |     break;
169 |   case OP_LDSM:
170 |     assert(data_size > 0);
171 | 
172 |     break;
173 |   case OP_ST:
174 |   case OP_LD:
175 |     assert(data_size > 0);
176 |     if (m_opcode == OP_LD)
177 |       memory_op = memory_load;
178 |     else
179 |       memory_op = memory_store;
180 | 
181 |     break;
182 |   case OP_BAR:
183 | 
184 |     break;
185 |   case OP_HADD2:
186 |   case OP_HADD2_32I:
187 |   case OP_HFMA2:
188 |   case OP_HFMA2_32I:
189 |   case OP_HMUL2_32I:
190 |   case OP_HSET2:
191 |   case OP_HSETP2:;
192 |     break;
193 |   default:
194 |     break;
195 |   }
196 | 
197 |   if (!trace->pred_str.empty()) {
198 |     size_t pos_P = trace->pred_str.find('P');
199 |     if (pos_P != std::string::npos) {
200 |       size_t pos_space = trace->pred_str.find(' ', pos_P);
201 |       size_t count = (pos_space != std::string::npos) ? pos_space - pos_P - 1
202 |                                                       : std::string::npos;
203 |       std::string num_str = trace->pred_str.substr(pos_P + 1, count);
204 |       pred = std::stoul(num_str);
205 |     }
206 |   }
207 | 
208 |   m_empty = false;
209 | 
210 |   return true;
211 | }
212 | 
213 | inline void trace_warp_inst_t::set_active(const active_mask_t &active) {
214 |   m_warp_active_mask = active;
215 | }


--------------------------------------------------------------------------------
/trace-driven/trace-warp-inst.h:
--------------------------------------------------------------------------------
  1 | #include <list>
  2 | #include <map>
  3 | #include <regex>
  4 | #include <string.h>
  5 | #include <string>
  6 | #include <unordered_map>
  7 | #include <vector>
  8 | 
  9 | #include "../ISA-Def/accelwattch_component_mapping.h"
 10 | #include "../ISA-Def/trace_opcode.h"
 11 | #include "../common/common_def.h"
 12 | #include "../common/vector_types.h"
 13 | #include "../trace-parser/inst-trace.h"
 14 | 
 15 | #include "kernel-trace.h"
 16 | #include "mem-access.h"
 17 | 
 18 | #ifndef TRACE_WARP_INST_H
 19 | #define TRACE_WARP_INST_H
 20 | 
 21 | types_of_operands get_oprnd_type(op_type op, special_ops sp_op);
 22 | 
 23 | class trace_warp_inst_t {
 24 | public:
 25 |   trace_warp_inst_t() {
 26 |     m_opcode = 0;
 27 |     m_uid = 0;
 28 |     m_empty = true;
 29 |     m_isatomic = false;
 30 | 
 31 |     m_decoded = false;
 32 |     pc = (address_type)-1;
 33 |     isize = 0;
 34 | 
 35 |     num_operands = 0;
 36 |     num_regs = 0;
 37 | 
 38 |     memset(out, 0, sizeof(unsigned));
 39 |     outcount = 0;
 40 |     memset(in, 0, sizeof(unsigned));
 41 |     incount = 0;
 42 | 
 43 |     is_vectorin = false;
 44 |     is_vectorout = false;
 45 | 
 46 |     pred = -1;
 47 |     ar1 = -1;
 48 |     ar2 = -1;
 49 | 
 50 |     for (unsigned i = 0; i < MAX_REG_OPERANDS; i++) {
 51 |       arch_reg.src[i] = -1;
 52 |       arch_reg.dst[i] = -1;
 53 |     }
 54 | 
 55 |     memory_op = no_memory_op;
 56 |     data_size = 0;
 57 | 
 58 |     op = NO_OP;
 59 |     sp_op = OTHER_OP;
 60 |     mem_op = NOT_TEX;
 61 | 
 62 |     const_cache_operand = 0;
 63 | 
 64 |     oprnd_type = UN_OP;
 65 | 
 66 |     m_is_printf = false;
 67 |     should_do_atomic = false;
 68 | 
 69 |     m_gwarp_id = 0;
 70 |     m_warp_id = 0;
 71 |     m_dynamic_warp_id = 0;
 72 | 
 73 |     space = memory_space_t();
 74 |     cache_op = CACHE_UNDEFINED;
 75 |   }
 76 | 
 77 |   bool parse_from_trace_struct(
 78 |       const _inst_trace_t *trace,
 79 |       const std::unordered_map<std::string, OpcodeChar> *OpcodeMap,
 80 |       unsigned gwarp_id);
 81 | 
 82 |   inline void set_active(const active_mask_t &active);
 83 | 
 84 |   unsigned get_opcode() const { return m_opcode; }
 85 |   unsigned get_uid() const { return m_uid; }
 86 |   bool isempty() const { return m_empty; }
 87 |   bool isatomic() const { return m_isatomic; }
 88 |   bool isdecoded() const { return m_decoded; }
 89 |   address_type get_pc() const { return pc; }
 90 |   unsigned get_isize() const { return isize; }
 91 |   unsigned get_outcount() const { return outcount; }
 92 |   unsigned get_incount() const { return incount; }
 93 |   unsigned get_in(unsigned i) const {
 94 |     assert(i < incount);
 95 |     return in[i];
 96 |   }
 97 |   unsigned get_out(unsigned i) const {
 98 |     assert(i < outcount);
 99 |     return out[i];
100 |   }
101 |   bool get_is_vectorin() const { return is_vectorin; }
102 |   bool get_is_vectorout() const { return is_vectorout; }
103 |   int get_pred() const { return pred; }
104 |   int get_ar1() const { return ar1; }
105 |   int get_ar2() const { return ar2; }
106 |   int get_arch_reg_dst(unsigned i) const {
107 |     assert(i < outcount);
108 |     return arch_reg.dst[i];
109 |   }
110 |   /// Determines whether all result registers are written back, and
111 |   /// the value of the register is set to -1 after being written back.
112 |   const bool allArchRegDstWriteBack() const {
113 |     // Another implementation logic:
114 |     //   bool all_write_back = true;
115 |     //   for (unsigned i = 0; i < outcount; ++i) {
116 |     //     if (trace_warp_inst.get_arch_reg_dst(i) != -1) {
117 |     //       all_write_back = false;
118 |     //       break;
119 |     //     }
120 |     //   }
121 |     //   return all_write_back;
122 |     return std::all_of(
123 |       std::begin(arch_reg.dst), std::end(arch_reg.dst), 
124 |       [&](int dstRegValue){ return dstRegValue == -1; });
125 |   }
126 |   int get_arch_reg_src(unsigned i) const {
127 |     assert(i < incount);
128 |     return arch_reg.src[i];
129 |   }
130 |   void set_arch_reg_dst(unsigned i, int reg) {
131 |     assert(i < outcount);
132 |     arch_reg.dst[i] = reg;
133 |   }
134 |   void set_arch_reg_src(unsigned i, int reg) {
135 |     assert(i < incount);
136 |     arch_reg.src[i] = reg;
137 |   }
138 |   _memory_op_t get_memory_op() const { return memory_op; }
139 |   unsigned get_num_operands() const { return num_operands; }
140 |   unsigned get_num_regs() const { return num_regs; }
141 |   unsigned get_data_size() const { return data_size; }
142 |   op_type get_op() const { return op; }
143 |   special_ops get_sp_op() const { return sp_op; }
144 |   mem_operation get_mem_op() const { return mem_op; }
145 |   bool get_const_cache_operand() const { return const_cache_operand; }
146 |   types_of_operands get_oprnd_type_() const { return oprnd_type; }
147 |   bool get_should_do_atomic() const { return should_do_atomic; }
148 |   bool get_is_printf() const { return m_is_printf; }
149 |   unsigned get_gwarp_id() const { return m_gwarp_id; }
150 |   unsigned get_warp_id() const { return m_warp_id; }
151 |   unsigned get_dynamic_warp_id() const { return m_dynamic_warp_id; }
152 |   active_mask_t get_active_mask() const { return m_warp_active_mask; }
153 |   active_mask_t &get_active_mask_ref() { return m_warp_active_mask; }
154 |   unsigned get_activate_count() const { return m_warp_active_mask.count(); }
155 | 
156 | private:
157 |   unsigned m_opcode;
158 |   unsigned m_uid;
159 |   bool m_empty;
160 |   bool m_isatomic;
161 | 
162 |   bool m_decoded = false;
163 |   address_type pc = (address_type)-1;
164 |   unsigned isize;
165 | 
166 |   unsigned out[8];
167 | 
168 |   unsigned outcount;
169 | 
170 |   unsigned in[24];
171 | 
172 |   unsigned incount;
173 | 
174 |   bool is_vectorin;
175 |   bool is_vectorout;
176 | 
177 |   int pred;
178 |   int ar1, ar2;
179 | 
180 |   struct {
181 |     int dst[MAX_REG_OPERANDS];
182 |     int src[MAX_REG_OPERANDS];
183 |   } arch_reg;
184 | 
185 |   _memory_op_t memory_op;
186 | 
187 |   unsigned num_operands;
188 |   unsigned num_regs;
189 | 
190 |   unsigned data_size;
191 | 
192 |   op_type op;
193 |   special_ops sp_op;
194 |   mem_operation mem_op;
195 | 
196 |   bool const_cache_operand;
197 | 
198 |   types_of_operands oprnd_type;
199 | 
200 |   bool should_do_atomic;
201 |   bool m_is_printf;
202 | 
203 |   unsigned m_gwarp_id;
204 |   unsigned m_warp_id;
205 | 
206 |   unsigned m_dynamic_warp_id;
207 | 
208 |   active_mask_t m_warp_active_mask;
209 | 
210 |   memory_space_t space;
211 |   cache_operator_type cache_op;
212 | };
213 | 
214 | #endif


--------------------------------------------------------------------------------
/trace-parser/inst-memadd-info.cc:
--------------------------------------------------------------------------------
 1 | #include "inst-memadd-info.h"
 2 | 
 3 | void inst_memadd_info_t::base_stride_decompress(
 4 |     unsigned long long base_address, int stride,
 5 |     const std::bitset<WARP_SIZE> &mask) {
 6 |   bool first_bit1_found = false;
 7 |   bool last_bit1_found = false;
 8 |   unsigned long long addra = base_address;
 9 |   for (int s = 0; s < WARP_SIZE; s++) {
10 |     if (mask.test(s) && !first_bit1_found) {
11 |       first_bit1_found = true;
12 |       addrs[s] = base_address;
13 |     } else if (first_bit1_found && !last_bit1_found) {
14 |       if (mask.test(s)) {
15 |         addra += stride;
16 |         addrs[s] = addra;
17 |       } else
18 |         last_bit1_found = true;
19 |     } else
20 |       addrs[s] = 0;
21 |   }
22 |   empty = false;
23 | }
24 | 
25 | void inst_memadd_info_t::base_delta_decompress(
26 |     unsigned long long base_address, const std::vector<long long> &deltas,
27 |     const std::bitset<WARP_SIZE> &mask) {
28 |   bool first_bit1_found = false;
29 |   long long last_address = 0;
30 |   unsigned delta_index = 0;
31 |   for (int s = 0; s < 32; s++) {
32 |     if (mask.test(s) && !first_bit1_found) {
33 |       addrs[s] = base_address;
34 |       first_bit1_found = true;
35 |       last_address = base_address;
36 |     } else if (mask.test(s) && first_bit1_found) {
37 |       assert(delta_index < deltas.size());
38 |       addrs[s] = last_address + deltas[delta_index++];
39 |       last_address = addrs[s];
40 |     } else
41 |       addrs[s] = 0;
42 |   }
43 |   empty = false;
44 | }


--------------------------------------------------------------------------------
/trace-parser/inst-memadd-info.h:
--------------------------------------------------------------------------------
 1 | #include <bitset>
 2 | 
 3 | #include "../common/common_def.h"
 4 | #include "../common/vector_types.h"
 5 | 
 6 | #ifndef INST_MEMADD_INFO_H
 7 | #define INST_MEMADD_INFO_H
 8 | 
 9 | class inst_memadd_info_t {
10 | public:
11 |   uint64_t addrs[WARP_SIZE];
12 |   int32_t width = 0;
13 |   bool empty = true;
14 | 
15 |   void base_stride_decompress(unsigned long long base_address, int stride,
16 |                               const std::bitset<WARP_SIZE> &mask);
17 |   void base_delta_decompress(unsigned long long base_address,
18 |                              const std::vector<long long> &deltas,
19 |                              const std::bitset<WARP_SIZE> &mask);
20 | };
21 | 
22 | #endif


--------------------------------------------------------------------------------
/trace-parser/inst-trace.h:
--------------------------------------------------------------------------------
  1 | #include <iomanip>
  2 | #include <sstream>
  3 | #include <string>
  4 | #include <vector>
  5 | 
  6 | #include "../ISA-Def/trace_opcode.h"
  7 | #include "../ISA-Def/volta_opcode.h"
  8 | #include "../common/common_def.h"
  9 | #include "../hw-parser/hw-parser.h"
 10 | #include "inst-memadd-info.h"
 11 | #include "memory-space.h"
 12 | #include "sass-inst.h"
 13 | 
 14 | #ifndef INST_TRACE_H
 15 | #define INST_TRACE_H
 16 | 
/// Identifiers of the functional units an instruction can be assigned to.
/// Values are consecutive starting at 0, so NUM_FUNC_UNITS equals the number
/// of enumerators that precede it.
enum FUNC_UNITS_NAME {

  NON_UNIT = 0,     // no functional unit
  SP_UNIT,          // NOTE(review): presumably single-precision unit — confirm
  SFU_UNIT,         // NOTE(review): presumably special-function unit — confirm
  INT_UNIT,         // NOTE(review): presumably integer unit — confirm
  DP_UNIT,          // NOTE(review): presumably double-precision unit — confirm
  TENSOR_CORE_UNIT, // tensor core unit
  LDST_UNIT,        // NOTE(review): presumably load/store unit — confirm
  SPEC_UNIT_1,      // specialized unit slots 1..3
  SPEC_UNIT_2,
  SPEC_UNIT_3,
  NUM_FUNC_UNITS    // count sentinel, not a real unit
};
 31 | 
/// One instruction record of a trace file. Only declarations are visible
/// here; the member functions are defined elsewhere.
struct inst_trace_t {
  inst_trace_t();
  inst_trace_t(const inst_trace_t &b);

  unsigned line_num;
  unsigned m_pc;
  unsigned mask;
  // Number of valid entries in reg_dest (capacity MAX_DST).
  unsigned reg_dsts_num;
  unsigned reg_dest[MAX_DST];
  std::string opcode;
  // Number of valid entries in reg_src (capacity MAX_SRC).
  unsigned reg_srcs_num;
  unsigned reg_src[MAX_SRC];
  // NOTE(review): raw pointer — ownership presumably handled by the
  // constructors/destructor defined elsewhere; confirm.
  inst_memadd_info_t *memadd_info;

  /// Parses one trace line into this record.
  bool parse_from_string(std::string trace, unsigned tracer_version,
                         unsigned enable_lineinfo, std::string kernel_name,
                         unsigned kernel_id);

  /// Reports whether the tokenised opcode contains `param`.
  bool check_opcode_contain(const std::vector<std::string> &opcode,
                            std::string param) const;

  /// Derives the access data width from the tokenised opcode.
  unsigned
  get_datawidth_from_opcode(const std::vector<std::string> &opcode) const;

  /// Returns the opcode split into tokens.
  std::vector<std::string> get_opcode_tokens() const;

  ~inst_trace_t();
};
 60 | 
 61 | struct _inst_trace_t {
 62 | 
 63 |   _inst_trace_t(unsigned _kernel_id, unsigned _pc, std::string _instn_str) {
 64 |     kernel_id = _kernel_id;
 65 |     m_pc = _pc;
 66 |     instn_str = _instn_str;
 67 | 
 68 |     for (unsigned it = 0; it < MAX_DST; it++) {
 69 |       reg_dest_is_pred[it] = false;
 70 |     }
 71 | 
 72 |     memadd_info = NULL;
 73 |     parse_from_string(_instn_str, _kernel_id);
 74 | 
 75 |     opcode_tokens = get_opcode_tokens();
 76 |     memadd_info->width = get_datawidth_from_opcode(opcode_tokens);
 77 |     m_valid = true;
 78 |     mask = 0x0;
 79 |   };
 80 | 
 81 |   _inst_trace_t(unsigned _kernel_id, unsigned _pc, std::string _instn_str,
 82 |                 hw_config *hw_cfg) {
 83 |     kernel_id = _kernel_id;
 84 |     m_pc = _pc;
 85 |     instn_str = _instn_str;
 86 | 
 87 |     for (unsigned it = 0; it < MAX_DST; it++) {
 88 |       reg_dest_is_pred[it] = false;
 89 |     }
 90 | 
 91 |     memadd_info = NULL;
 92 |     parse_from_string(_instn_str, _kernel_id);
 93 | 
 94 |     opcode_tokens = get_opcode_tokens();
 95 |     memadd_info->width = get_datawidth_from_opcode(opcode_tokens);
 96 |     this->hw_cfg = hw_cfg;
 97 | 
 98 |     parse_opcode_latency_info();
 99 |     m_valid = true;
100 |     mask = 0x0;
101 |   };
102 | 
103 |   bool m_valid = false;
104 | 
105 |   unsigned kernel_id;
106 |   unsigned m_pc;
107 |   unsigned mask = 0x0;
108 |   unsigned reg_dsts_num;
109 |   int reg_dest[MAX_DST];
110 |   bool reg_dest_is_pred[MAX_DST];
111 |   std::string opcode;
112 | 
113 |   unsigned reg_srcs_num;
114 |   int reg_src[MAX_SRC];
115 |   inst_memadd_info_t *memadd_info;
116 |   std::string instn_str;
117 | 
118 |   std::vector<std::string> opcode_tokens;
119 | 
120 |   std::string pred_str = "";
121 | 
122 |   unsigned initiation_interval;
123 |   unsigned latency;
124 |   enum FUNC_UNITS_NAME func_unit;
125 |   hw_config *hw_cfg;
126 | 
127 |   bool parse_from_string(std::string trace, unsigned kernel_id);
128 | 
129 |   bool check_opcode_contain(const std::vector<std::string> &opcode,
130 |                             std::string param) const;
131 | 
132 |   unsigned
133 |   get_datawidth_from_opcode(const std::vector<std::string> &opcode) const;
134 | 
135 |   std::vector<std::string> get_opcode_tokens() const;
136 | 
137 |   inline std::vector<std::string> get_opcode_tokens_directly() const {
138 |     return opcode_tokens;
139 |   }
140 | 
141 |   void parse_opcode_latency_info();
142 | 
143 |   unsigned get_latency() const;
144 |   unsigned get_initiation_interval() const;
145 |   enum FUNC_UNITS_NAME get_func_unit() const;
146 | 
147 |   ~_inst_trace_t();
148 | };
149 | 
150 | #endif


--------------------------------------------------------------------------------
/trace-parser/memory-space.cc:
--------------------------------------------------------------------------------
 1 | #include "memory-space.h"
 2 | 
 3 | memory_space_t::memory_space_t() {
 4 |   m_type = undefined_space;
 5 |   m_bank = 0;
 6 | }
 7 | 
 8 | memory_space_t::memory_space_t(const enum _memory_space_t &from) {
 9 |   m_type = from;
10 |   m_bank = 0;
11 | }
12 | 
13 | bool memory_space_t::operator==(const memory_space_t &x) const {
14 |   return (m_bank == x.m_bank) && (m_type == x.m_type);
15 | }
16 | 
17 | bool memory_space_t::operator!=(const memory_space_t &x) const {
18 |   return !(*this == x);
19 | }
20 | 
21 | bool memory_space_t::operator<(const memory_space_t &x) const {
22 |   if (m_type < x.m_type)
23 |     return true;
24 |   else if (m_type > x.m_type)
25 |     return false;
26 |   else if (m_bank < x.m_bank)
27 |     return true;
28 |   return false;
29 | }
30 | 
31 | enum _memory_space_t memory_space_t::get_type() const { return m_type; }
32 | 
33 | void memory_space_t::set_type(enum _memory_space_t t) { m_type = t; }
34 | 
35 | unsigned memory_space_t::get_bank() const { return m_bank; }
36 | 
37 | void memory_space_t::set_bank(unsigned b) { m_bank = b; }
38 | 
39 | bool memory_space_t::is_const() const {
40 |   return (m_type == const_space) || (m_type == param_space_kernel);
41 | }
42 | 
43 | bool memory_space_t::is_local() const {
44 |   return (m_type == local_space) || (m_type == param_space_local);
45 | }
46 | 
47 | bool memory_space_t::is_global() const { return (m_type == global_space); }
48 | 


--------------------------------------------------------------------------------
/trace-parser/memory-space.h:
--------------------------------------------------------------------------------
 1 | #include "../common/common_def.h"
 2 | #include "../common/vector_types.h"
 3 | 
 4 | #ifndef MEMORY_SPACE_H
 5 | #define MEMORY_SPACE_H
 6 | 
/// A memory space identified by a type (enum _memory_space_t) and a bank
/// number.
class memory_space_t {
public:
  /// Default: type `undefined_space`, bank 0.
  memory_space_t();

  /// Space of type `from` on bank 0.
  memory_space_t(const enum _memory_space_t &from);

  /// Equality compares both bank and type.
  bool operator==(const memory_space_t &x) const;
  bool operator!=(const memory_space_t &x) const;
  /// Strict ordering by type, then by bank.
  bool operator<(const memory_space_t &x) const;
  enum _memory_space_t get_type() const;
  void set_type(enum _memory_space_t t);
  unsigned get_bank() const;
  void set_bank(unsigned b);
  /// True for const_space and param_space_kernel.
  bool is_const() const;
  /// True for local_space and param_space_local.
  bool is_local() const;
  /// True for global_space.
  bool is_global() const;

private:
  enum _memory_space_t m_type;
  unsigned m_bank;
};
28 | 
29 | #endif


--------------------------------------------------------------------------------
/trace-parser/sass-inst.cc:
--------------------------------------------------------------------------------
 1 | #include "sass-inst.h"
 2 | 
// Lookup table from program counter to its SASS instruction record; queried
// by find_sass_inst_by_pc().
std::map<unsigned, sass_inst_t> pc_to_sassStr;
// NOTE(review): presumably the PCs of instructions that were already read;
// it is filled outside this file — confirm.
std::vector<int> have_readed_insn_pcs;

// NOTE(review): not declared in sass-inst.h and not referenced in this file;
// confirm where (or whether) this flag is used.
bool have_print_sass_during_this_execution = false;
 7 | 
 8 | sass_inst_t find_sass_inst_by_pc(unsigned pc) {
 9 |   std::map<unsigned, sass_inst_t>::iterator iter;
10 |   iter = pc_to_sassStr.find(pc);
11 |   if (iter != pc_to_sassStr.end()) {
12 |     return iter->second;
13 |   } else {
14 |     std::cout << "Can't find sass inst by pc: " << std::hex << pc << std::endl;
15 |     sass_inst_t null_ = sass_inst_t();
16 |     return null_;
17 |   }
18 | }
19 | 


--------------------------------------------------------------------------------
/trace-parser/sass-inst.h:
--------------------------------------------------------------------------------
 1 | #include <bitset>
 2 | #include <iostream>
 3 | #include <map>
 4 | #include <string>
 5 | #include <vector>
 6 | 
 7 | #include "../ISA-Def/trace_opcode.h"
 8 | #include "../common/common_def.h"
 9 | #include "../common/vector_types.h"
10 | 
11 | #ifndef SASS_INST_H
12 | #define SASS_INST_H
13 | 
/// One SASS instruction record, stored in pc_to_sassStr keyed by program
/// counter. Only m_empty has a default initialiser; the other scalar members
/// are set by whoever fills the record.
struct sass_inst_t {
  // Instruction text and the kernel it belongs to.
  std::string insnStr;
  std::string kernel_name;
  unsigned kernel_id;

  unsigned line_num;
  unsigned m_pc;
  unsigned mask;
  // Number of valid entries in reg_dest (capacity MAX_DST).
  unsigned reg_dsts_num;
  unsigned reg_dest[MAX_DST];
  std::string opcode;
  // Number of valid entries in reg_src (capacity MAX_SRC).
  unsigned reg_srcs_num;
  unsigned reg_src[MAX_SRC];

  // Source location attached to the instruction.
  std::string m_source_file;
  unsigned m_source_line;

  // The returned pointer refers to m_source_file's buffer and is only valid
  // while this object (and that string) is alive and unmodified.
  const char *source_file() const { return m_source_file.c_str(); }
  unsigned source_line() const { return m_source_line; }

  // True for a record that has not been filled; this is what a failed
  // find_sass_inst_by_pc() lookup returns.
  bool m_empty = true;
};
36 | 
37 | extern std::map<unsigned, sass_inst_t> pc_to_sassStr;
38 | extern std::vector<int> have_readed_insn_pcs;
39 | 
40 | sass_inst_t find_sass_inst_by_pc(unsigned pc);
41 | 
42 | #endif


--------------------------------------------------------------------------------
/trace-parser/sass-split.py:
--------------------------------------------------------------------------------
 1 | #!/bin/python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | import os
 5 | import argparse
 6 | 
 7 | parser = argparse.ArgumentParser(description='Process sass dir.')
 8 | 
 9 | parser.add_argument('--dir', type=str, required=True,
10 |                     help='The directory of sass files')
11 | 
12 | args = parser.parse_args()
13 | 
14 | sass_dir = args.dir
15 | sass_dir = os.path.abspath(sass_dir)
16 | 
17 | files = os.listdir(sass_dir)
18 | sass_files = [os.path.join(sass_dir, file) for file in files if (file.endswith(".sass") and not file.endswith(".split.sass"))]
19 | 
20 | f_open = {}
21 | warp_content = {}
22 | 
23 | for sass_file in sass_files:
24 |     print("Processing ", sass_file)
25 |     content = open(sass_file, "r").read().split(" ")
26 |     kernel_id = int(sass_file.split("/")[-1].split("_")[1].split(".sass")[0])
27 | 
28 |     for i in range(int(len(content)/3)):
29 |         gwarp_id = int(content[i*3 + 2], 16)
30 |         entry = (kernel_id, gwarp_id)
31 |         
32 |         # Use dictionaries to accumulate content instead of writing files directly
33 |         if entry not in warp_content:
34 |             warp_content[entry] = []
35 |         warp_content[entry].append(content[i*3] + " " + content[i*3 + 1])
36 | 
37 | for (kernel_id, gwarp_id), lines in warp_content.items():
38 |     file_path = os.path.join(sass_dir, "kernel_" + str(kernel_id) + "_gwarp_id_" + str(gwarp_id) + ".split.sass")
39 |     with open(file_path, "w") as file:
40 |         file.write("\n".join(lines))
41 | 


--------------------------------------------------------------------------------
/tracing-tool/.gitignore:
--------------------------------------------------------------------------------
1 | inject_funcs.o
2 | tracer.so
3 | tracer.o
4 | 


--------------------------------------------------------------------------------
/tracing-tool/Makefile:
--------------------------------------------------------------------------------
 1 | NVCC=/usr/local/cuda/bin/nvcc -ccbin=$(CXX) -D_FORCE_INLINES --compiler-options "-pipe"
 2 | ARCH=70 75 80
 3 | 
 4 | NVCC_VER_REQ=11
 5 | NVCC_VER=$(shell $(NVCC) --version | grep release | cut -f2 -d, | cut -f3 -d' ')
 6 | NVCC_VER_CHECK=$(shell echo "${NVCC_VER} >= $(NVCC_VER_REQ)" | bc)
 7 | 
 8 | ifeq ($(NVCC_VER_CHECK),0)
 9 | $(error ERROR: nvcc version >= $(NVCC_VER_REQ) required to compile an nvbit tool! Instrumented applications can still use lower versions of nvcc.)
10 | endif
11 | 
12 | NVBIT_PATH=nvbit
13 | INCLUDES=-I$(NVBIT_PATH)
14 | 
15 | LIBS=-L$(NVBIT_PATH) -lnvbit
16 | NVCC_PATH=-L $(subst bin/nvcc,lib64,$(shell which nvcc | tr -s /))
17 | 
18 | SOURCES=$(wildcard *.cu)
19 | 
20 | OBJECTS=$(SOURCES:.cu=.o)
21 | 
22 | $(foreach sm,$(ARCH),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
23 | 
24 | mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST)))
25 | current_dir := $(notdir $(patsubst %/,%,$(dir $(mkfile_path))))
26 | 
27 | NVBIT_TOOL=tracer.so
28 | 
29 | all: $(NVBIT_TOOL)
30 | 
31 | $(NVBIT_TOOL): $(OBJECTS) $(NVBIT_PATH)/libnvbit.a
32 | 	$(NVCC) $(GENCODE_FLAGS) -O3 $(OBJECTS) $(LIBS) $(NVCC_PATH) -lcuda -lcudart_static -shared -o $@
33 | 
34 | %.o: %.cu
35 | 	$(NVCC) -dc -c -std=c++11 $(INCLUDES) -Xptxas -cloning=no -Xcompiler -Wall $(GENCODE_FLAGS) -O3 -Xcompiler -fPIC $< -o $@
36 | 
37 | inject_funcs.o: inject_funcs.cu
38 | 	$(NVCC) $(INCLUDES) -maxrregcount=24 -Xptxas -astoolspatch --keep-device-functions $(GENCODE_FLAGS) -Xcompiler -Wall -Xcompiler -fPIC -c $< -o $@
39 | 
40 | clean:
41 | 	rm -f *.so *.o


--------------------------------------------------------------------------------
/tracing-tool/README.md:
--------------------------------------------------------------------------------
 1 | ## Tracing Tool
 2 | 
 3 | The ***tracing-tool*** is used to extract the memory and compute traces. This tool uses and extends NVBit (NVIDIA Binary Instrumentation Tool), which is a research prototype of a dynamic binary instrumentation library for NVIDIA GPUs. The license and agreement of NVBit can be found in the original [NVBit repo](https://github.com/NVlabs/NVBit) ("This software contains source code provided by NVIDIA Corporation").
 4 | 
 5 | NVBit does not require application source code; any pre-compiled GPU application should work regardless of which compiler (or version) was used (e.g. nvcc, pgicc, etc.).
 6 | 
 7 | ## Usage
 8 | 
 9 | *  Setup the `MAX_KERNELS` variable in `tracer.cu` to define the limit on the number of kernels you want to instrument in the application. The `MAX_KERNELS` variable we used for collecting traces is 300.
10 | 
11 | * For standalone building and running of the ***tracing-tool***, please see below: 
12 | 
13 |   #### 1. Building the tool
14 |   
15 |   * Setup `ARCH` and `NVCC` variable in the Makefile. For the Volta architecture, you need to set:
16 |     ```shell
17 |     NVCC=/usr/local/cuda/bin/nvcc -ccbin=$(CXX) -D_FORCE_INLINES --compiler-options "-pipe"
18 |     ARCH=70
19 |     ```
20 |     It is important to note that this tool is not sensitive to CUDA versions, so your default version should be fine.
21 |   * Compile the ***tracing-tool***:
22 |     ```
23 |     make clean && make
24 |     ```
25 | 
26 |   #### 2. Extracting the traces
27 |   
28 |   ```
29 |   LD_PRELOAD=/path/to/tracing-tool/tracer.so /path/to/app [parameters of app] 
30 |   ```
31 |   
32 |   The above command outputs two folders, ***memory_traces*** and ***sass_traces***, each of which contains the application's kernel traces. It also outputs a ***configs*** file, which has information about the kernels executed inside the application. 
33 | 


--------------------------------------------------------------------------------
/tracing-tool/common.h:
--------------------------------------------------------------------------------
 1 | #include <stdint.h>
 2 | 
/* Record produced on the device for one executed warp instruction and pushed
 * through the channel to the host (see instrument_inst in inject_funcs.cu). */
typedef struct {
  /* 1 when the guard predicate value passed to the device function is 0,
   * otherwise 0. */
  int pred_inst;
  /* Number of active threads minus the number of threads whose predicate
   * is set. */
  int pred_off_threads;
  int pred_num;
  /* Location of the warp: SM id, CTA id (x, y, z), warp id within the thread
   * block. */
  int sm_id;
  int cta_id_x;
  int cta_id_y;
  int cta_id_z;
  int warp_id;
  int opcode_id;
  int pc;
  int is_mem_inst;
  int mref_id;
  /* Per-lane addresses; mem_addrs1 is filled only for memory instructions,
   * mem_addrs2 only when additionally mref_id == 2. */
  uint64_t mem_addrs1[32];
  uint64_t mem_addrs2[32];
  /* Destination operand and up to five source operands, each with a type
   * code. */
  int dst_oprnd;
  int dst_oprnd_type;
  int src_oprnds[5];
  int src_oprnds_type[5];
  /* Value of the %clock64 register when the record was created. */
  uint64_t curr_clk;
  /* Global warp id within all thread blocks of one kernel. */
  int gwarp_id;
  bool isPredNeg;
  bool isPredUniform;
  /* Ballot of the active lanes and of the lanes whose predicate is set. */
  uint32_t active_mask;
  uint32_t predicate_mask;
  /* NOTE(review): selects stride vs. delta address encoding; the meaning of
   * true/false is not established in this header — confirm. */
  bool stride_or_delta;
} inst_access_t;
30 | 
31 | #define cta_addresses_size_width 10000
32 | #define cta_addresses_size_depth 10000


--------------------------------------------------------------------------------
/tracing-tool/inject_funcs.cu:
--------------------------------------------------------------------------------
 1 | #include <stdint.h>
 2 | #include <stdio.h>
 3 | 
 4 | #include "common.h"
 5 | #include "utils/channel.hpp"
 6 | #include "utils/utils.h"
 7 | 
 8 | #include "nvbit_reg_rw.h"
 9 | 
/* Device function injected before every instrumented instruction.
 *
 * Gathers one inst_access_t record per warp (location of the warp, opcode,
 * operands, predicate and active-lane information and, for memory
 * instructions, the per-lane addresses) and lets the first active lane push
 * it onto the device channel.
 *
 * pred           guard predicate value of the calling thread
 * pc, opcode_id  program counter and opcode identifier of the instruction
 * is_mem_inst    non-zero for memory instructions
 * addr1, addr2   memory addresses of the calling thread (addr2 is only
 *                collected when mref_id == 2)
 * dst_oprnd*, src_oprnd1..5*  operand values and their type codes
 * pred_num, isPredNeg, isPredUniform  predicate description, copied as-is
 * pchannel_dev   address of the ChannelDev to push the record to
 */
extern "C" __device__ __noinline__ void
instrument_inst(int pred, int pc, int opcode_id, int is_mem_inst,
                uint64_t addr1, int mref_id, uint64_t addr2, int dst_oprnd,
                int dst_oprnd_type, int src_oprnd1, int src_oprnd1_type,
                int src_oprnd2, int src_oprnd2_type, int src_oprnd3,
                int src_oprnd3_type, int src_oprnd4, int src_oprnd4_type,
                int src_oprnd5, int src_oprnd5_type, int pred_num,
                int isPredNeg, int isPredUniform, uint64_t pchannel_dev) {

  inst_access_t ia;

  /* TODO: some instructions about using %clock64 */
  /* volatile keeps the compiler from moving or eliminating the clock read. */
  uint64_t current_clk;
  asm volatile("mov.u64 %0, %clock64;" : "=l"(current_clk));
  ia.curr_clk = current_clk;

  /* pred_inst is 1 exactly when the guard predicate is false. */
  ia.pred_inst = pred ? 0 : 1;
  ia.pred_num = pred_num;

  ia.sm_id = get_smid();
  int4 cta = get_ctaid();
  ia.cta_id_x = cta.x;
  ia.cta_id_y = cta.y;
  ia.cta_id_z = cta.z;
  /* warp id within a thread block. */
  ia.warp_id = get_warpid();
  /* global warp id within all thread blocks of one kernel. */
  ia.gwarp_id = get_global_warp_id();
  ia.opcode_id = opcode_id;
  ia.pc = pc;
  ia.is_mem_inst = is_mem_inst;
  ia.mref_id = mref_id;
  ia.isPredNeg = isPredNeg;
  ia.isPredUniform = isPredUniform;
  /* Never computed on the device; give it a defined value so the host does
   * not receive an indeterminate flag. */
  ia.stride_or_delta = false;

  // ia.pred_reg_value = nvbit_read_pred_reg();

  const uint32_t active_mask = __ballot_sync(__activemask(), 1);
  const int laneid = get_laneid();
  const int first_laneid = __ffs(active_mask) - 1;
  /* Vote over the same set of lanes as active_mask instead of sampling
   * __activemask() a second time, so both masks describe one lane set. */
  const uint32_t predicate_mask = __ballot_sync(active_mask, pred);
  const int active_threads = __popc(active_mask);

  /* active threads that are not predicated off per instruction executed */
  ia.pred_off_threads = active_threads - __popc(predicate_mask);

  if (is_mem_inst) {
    /* collect memory address information from other threads; slots of lanes
     * that are not in active_mask carry no meaningful address */
    for (int i = 0; i < 32; i++) {
      ia.mem_addrs1[i] = __shfl_sync(active_mask, addr1, i);
      if (mref_id == 2)
        ia.mem_addrs2[i] = __shfl_sync(active_mask, addr2, i);
    }
  }

  /* Judge if the addr is strid-mode or delta-mode. START */
  /* Judge if the addr is strid-mode or delta-mode. END */

  ia.dst_oprnd = dst_oprnd;
  ia.dst_oprnd_type = dst_oprnd_type;

  ia.active_mask = active_mask;
  ia.predicate_mask = predicate_mask;

  ia.src_oprnds[0] = src_oprnd1;
  ia.src_oprnds_type[0] = src_oprnd1_type;
  ia.src_oprnds[1] = src_oprnd2;
  ia.src_oprnds_type[1] = src_oprnd2_type;
  ia.src_oprnds[2] = src_oprnd3;
  ia.src_oprnds_type[2] = src_oprnd3_type;
  ia.src_oprnds[3] = src_oprnd4;
  ia.src_oprnds_type[3] = src_oprnd4_type;
  ia.src_oprnds[4] = src_oprnd5;
  ia.src_oprnds_type[4] = src_oprnd5_type;

  /* first active lane pushes information on the channel */
  if (first_laneid == laneid) {
    ChannelDev *channel_dev = (ChannelDev *)pchannel_dev;
    channel_dev->push(&ia, sizeof(inst_access_t));
  }
}


--------------------------------------------------------------------------------
/tracing-tool/nvbit/instr_types.h:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  2 |  *
  3 |  * Redistribution and use in source and binary forms, with or without
  4 |  * modification, are permitted provided that the following conditions
  5 |  * are met:
  6 |  *  * Redistributions of source code must retain the above copyright
  7 |  *    notice, this list of conditions and the following disclaimer.
  8 |  *  * Redistributions in binary form must reproduce the above copyright
  9 |  *    notice, this list of conditions and the following disclaimer in the
 10 |  *    documentation and/or other materials provided with the distribution.
 11 |  *  * Neither the name of NVIDIA CORPORATION nor the names of its
 12 |  *    contributors may be used to endorse or promote products derived
 13 |  *    from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 16 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 18 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 19 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 20 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 21 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 22 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 23 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 25 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  */
 27 | 
 28 | #pragma once
 29 | 
 30 | #include <inttypes.h>
 31 | #include <unordered_map>
 32 | 
 33 | namespace InstrType {
 34 | /* Operand vocabulary: register sentinels, enums + matching name tables, operand_t. */
 35 | /* all supported arch have at most 255 general purpose registers */
 36 | constexpr const int RZ = 255;
 37 | /* the always true predicate is indicated as "7" on all the archs */
 38 | constexpr const int PT = 7;
 39 | /* the entire predicate register is encoded as "8" */
 40 | constexpr const int PR = 8;
 41 | constexpr const int URZ = 63;  // presumably the uniform-register counterpart of RZ -- TODO confirm
 42 | constexpr const int UPT = 7;  // uniform predicate true
 43 | constexpr const int UPR = 8;  // entire uniform predicate register
 44 | constexpr const int MAX_CHARS = 256;  // size of the char arrays in operand_t (reg.prop, generic.array)
 45 | 
 46 | // max memory references per instruction: loads and stores have 1, LDGSTS has 2
 47 | constexpr const int MAX_NUM_MREF_PER_INSTR = 2;
 48 | 
 49 | enum class MemorySpace {
 50 |     NONE,
 51 |     LOCAL,             // local memory operation
 52 |     GENERIC,           // generic memory operation
 53 |     GLOBAL,            // global memory operation
 54 |     SHARED,            // shared memory operation
 55 |     CONSTANT,          // constant memory operation
 56 |     GLOBAL_TO_SHARED,  // read from global memory then write to shared memory
 57 | };
 58 | constexpr const char* MemorySpaceStr[] = {  // names listed in MemorySpace enumerator order
 59 |     "NONE", "LOCAL", "GENERIC", "GLOBAL", "SHARED", "CONSTANT",
 60 |     "GLOBAL_TO_SHARED",
 61 | };
 62 | /* Kinds of operand; operand_t::type holds one of these. */
 63 | enum class OperandType {
 64 |     IMM_UINT64,
 65 |     IMM_DOUBLE,
 66 |     REG,
 67 |     PRED,
 68 |     UREG,
 69 |     UPRED,
 70 |     CBANK,
 71 |     MREF,
 72 |     GENERIC
 73 | };
 74 | /* names listed in OperandType enumerator order */
 75 | constexpr const char* OperandTypeStr[] = {
 76 |     "IMM_UINT64", "IMM_DOUBLE", "REG",  "PRED",   "UREG",
 77 |     "UPRED",      "CBANK",      "MREF", "GENERIC"};
 78 | /* Modifier type stored in operand_t::u.mref.ra_mod. */
 79 | enum class RegModifierType {
 80 |     /* stride modifiers */
 81 |     X1,
 82 |     X4,
 83 |     X8,
 84 |     X16,
 85 |     /* size modifiers */
 86 |     U32,
 87 |     U64,
 88 |     NO_MOD
 89 | };
 90 | constexpr const char* RegModifierTypeStr[] = {  // enumerator order; the U64 entry is spelled "64"
 91 |     "X1", "X4", "X8", "X16", "U32", /* no U */ "64", "NO_MOD"};
 92 | /* One instruction operand: common fields followed by a union `u` of per-kind details. */
 93 | typedef struct {
 94 |     /* operand type */
 95 |     OperandType type;
 96 |     /* operand string */
 97 |     std::string str;
 98 |     /* is negative */
 99 |     bool is_neg;
100 |     /* is not */
101 |     bool is_not;
102 |     /* is absolute */
103 |     bool is_abs;
104 |     /* operand size in bytes */
105 |     int nbytes;
106 |     /* per-kind payload; presumably the member matching `type` is the valid one -- TODO confirm */
107 |     union {
108 |         struct {
109 |             uint64_t value;
110 |         } imm_uint64;
111 | 
112 |         struct {
113 |             double value;
114 |         } imm_double;
115 | 
116 |         struct {
117 |             int num;
118 |             /* register properties .XXX */
119 |             char prop[MAX_CHARS];
120 |         } reg;
121 | 
122 |         struct {
123 |             int num;
124 |         } pred;
125 | 
126 |         struct {
127 |             int id;
128 |             bool has_imm_offset;
129 |             int imm_offset;
130 |             bool has_reg_offset;
131 |             int reg_offset;
132 |         } cbank;
133 | 
134 |         struct {
135 |             bool has_ra;
136 |             int ra_num;
137 |             RegModifierType ra_mod;
138 |             bool has_ur;
139 |             int ur_num;
140 |             bool has_imm;
141 |             int imm;
142 |         } mref;
143 | 
144 |         struct {
145 |             char array[MAX_CHARS];
146 |         } generic;
147 | 
148 |     } u;
149 | } operand_t;
150 | };  // end of namespace InstrType
151 | 


--------------------------------------------------------------------------------
/tracing-tool/nvbit/libnvbit.a:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ConvolutedDog/HyFiSS/e1447e8826c35a23169a63b6fc748a6fb6d5da9c/tracing-tool/nvbit/libnvbit.a


--------------------------------------------------------------------------------
/tracing-tool/nvbit/nvbit_reg_rw.h:
--------------------------------------------------------------------------------
 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 2 |  *
 3 |  * Redistribution and use in source and binary forms, with or without
 4 |  * modification, are permitted provided that the following conditions
 5 |  * are met:
 6 |  *  * Redistributions of source code must retain the above copyright
 7 |  *    notice, this list of conditions and the following disclaimer.
 8 |  *  * Redistributions in binary form must reproduce the above copyright
 9 |  *    notice, this list of conditions and the following disclaimer in the
10 |  *    documentation and/or other materials provided with the distribution.
11 |  *  * Neither the name of NVIDIA CORPORATION nor the names of its
12 |  *    contributors may be used to endorse or promote products derived
13 |  *    from this software without specific prior written permission.
14 |  *
15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |  */
27 | 
28 | /* This file needs to be included once in your nvbit tool, it provides hooks to
29 |  * the nvbit core library to properly load this tool.
30 |  * Do not modify!!!  */
31 | #pragma once
32 | #include <stdio.h>
33 | #include <cassert>
34 | #include <stdint.h>
35 | 
36 | __managed__ volatile int32_t __nvbit_var = 0;  // managed, volatile counter that every function below adds to
37 | 
38 | /* parameters need to be used in the function to prevent compiler optimizing
39 |  * them away. */
40 | /* NVBit hook (do not modify): adds 0..1023 to __nvbit_var, asserts it equals reg_num, returns it. */
41 | extern "C" __device__ __noinline__ int32_t nvbit_read_reg(uint64_t reg_num) {
42 | #pragma unroll
43 |     for (int i = 0; i < 1024; i++) __nvbit_var += i;
44 |     assert(__nvbit_var == reg_num);  // reads reg_num so the parameter stays in use
45 |     return __nvbit_var;  // NOTE(review): body looks like a placeholder; presumably NVBit supplies the real value -- confirm
46 | }
47 | /* NVBit hook (do not modify): adds 0..1023 to __nvbit_var, then asserts it equals reg_num + reg_val. */
48 | extern "C" __device__ __noinline__ void nvbit_write_reg(uint64_t reg_num,
49 |                                                         int32_t reg_val) {
50 | #pragma unroll
51 |     for (int i = 0; i < 1024; i++) __nvbit_var += i;
52 |     assert(__nvbit_var == reg_num + reg_val);  // reads both parameters so they stay in use
53 | }
54 | /* NVBit hook (do not modify): adds 0..511 to __nvbit_var, asserts it equals reg_num, returns it. */
55 | extern "C" __device__ __noinline__ int32_t nvbit_read_ureg(uint64_t reg_num) {
56 | #pragma unroll
57 |     for (int i = 0; i < 512; i++) __nvbit_var += i;
58 |     assert(__nvbit_var == reg_num);  // reads reg_num so the parameter stays in use
59 |     return __nvbit_var;  // NOTE(review): body looks like a placeholder; presumably NVBit supplies the real value -- confirm
60 | }
61 | /* NVBit hook (do not modify): adds 0..511 to __nvbit_var, then asserts it equals reg_num + reg_val. */
62 | extern "C" __device__ __noinline__ void nvbit_write_ureg(uint64_t reg_num,
63 |                                                         int32_t reg_val) {
64 | #pragma unroll
65 |     for (int i = 0; i < 512; i++) __nvbit_var += i;
66 |     assert(__nvbit_var == reg_num + reg_val);  // reads both parameters so they stay in use
67 | }
68 | /* NVBit hook (do not modify): adds 0..31 to __nvbit_var and returns the resulting value. */
69 | extern "C" __device__ __noinline__ int32_t nvbit_read_pred_reg() {
70 | #pragma unroll
71 |     for (int i = 0; i < 32; i++) __nvbit_var += i;
72 |     return __nvbit_var;  // NOTE(review): body looks like a placeholder; presumably NVBit supplies the real value -- confirm
73 | }
74 | /* NVBit hook (do not modify): adds reg_val to __nvbit_var 32 times; returns nothing. */
75 | extern "C" __device__ __noinline__ void nvbit_write_pred_reg(int32_t reg_val) {
76 | #pragma unroll
77 |     for (int i = 0; i < 32; i++) __nvbit_var += reg_val;  // reads reg_val so the parameter stays in use
78 | }
79 | /* NVBit hook (do not modify): adds 0..31 to __nvbit_var and returns the resulting value. */
80 | extern "C" __device__ __noinline__ int32_t nvbit_read_upred_reg() {
81 | #pragma unroll
82 |     for (int i = 0; i < 32; i++) __nvbit_var += i;
83 |     return __nvbit_var;  // NOTE(review): body looks like a placeholder; presumably NVBit supplies the real value -- confirm
84 | }
85 | /* NVBit hook (do not modify): adds reg_val to __nvbit_var 32 times; returns nothing. */
86 | extern "C" __device__ __noinline__ void nvbit_write_upred_reg(int32_t reg_val) {
87 | #pragma unroll
88 |     for (int i = 0; i < 32; i++) __nvbit_var += reg_val;  // reads reg_val so the parameter stays in use
89 | }
90 | 


--------------------------------------------------------------------------------
/tracing-tool/nvbit/nvbit_tool.h:
--------------------------------------------------------------------------------
 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 2 |  *
 3 |  * Redistribution and use in source and binary forms, with or without
 4 |  * modification, are permitted provided that the following conditions
 5 |  * are met:
 6 |  *  * Redistributions of source code must retain the above copyright
 7 |  *    notice, this list of conditions and the following disclaimer.
 8 |  *  * Redistributions in binary form must reproduce the above copyright
 9 |  *    notice, this list of conditions and the following disclaimer in the
10 |  *    documentation and/or other materials provided with the distribution.
11 |  *  * Neither the name of NVIDIA CORPORATION nor the names of its
12 |  *    contributors may be used to endorse or promote products derived
13 |  *    from this software without specific prior written permission.
14 |  *
15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |  */
27 | 
28 | /* This file needs to be included once in your nvbit tool, it provides hooks to
29 |  * the nvbit core library to properly load this tool.
30 |  * Do not modify!!!  */
31 | 
32 | #pragma once
33 | #include <stdio.h>
34 | #include <cassert>
35 | #include <stdint.h>
36 | /* Sign-extend the low 32 bits of x to 64 bits: cast to int64_t, shift left 32, then shift right 32 (signed shift). */
37 | #define SIGN_EXTEND64(x) ((((int64_t)(x)) << 32) >> 32)
38 | 
39 | /* generic address generation code: returns (ra term) + (ru term) + imm as a 64-bit address */
40 | extern "C" __device__ __noinline__ uint64_t
41 | gen_mref_addr(uint32_t ra_high, int is_ra64, uint32_t ra_low, int ra_stride,
42 |               uint32_t ru_high, int is_ru64, uint32_t ru_low, int32_t imm,
43 |               uint32_t mref_idx /* unused */) {
44 |     int64_t base_addr = 0;
45 |     /* ra term: ra_low is scaled by ra_stride; ra_high supplies the upper 32 bits when is_ra64 is set */
46 |     if (is_ra64) {
47 |         base_addr +=
48 |             (((uint64_t)ra_high) << 32) | ((uint64_t)ra_low * ra_stride);
49 |     } else {
50 |         base_addr += SIGN_EXTEND64(ra_low * ra_stride);  // 32-bit product, sign-extended to 64 bits
51 |     }
52 |     /* ru term: same scheme as the ra term, but without a stride */
53 |     if (is_ru64) {
54 |         base_addr += (((uint64_t)ru_high) << 32) | ((uint64_t)ru_low);
55 |     } else {
56 |         base_addr += SIGN_EXTEND64(ru_low);
57 |     }
58 |     /* add the signed immediate last */
59 |     uint64_t addr = base_addr + imm;
60 | #if 0
61 |     printf(
62 |         "ra_high %d - is_ra64 %d - ra_low %d - ra_stride %d - ru_high %d - "
63 |         "is_ru64 %d - ru_low %d - imm %d base_addr %lx addr %lx\n",
64 |         ra_high, is_ra64, ra_low, ra_stride, ru_high, is_ru64, ru_low, imm,
65 |         base_addr, addr);
66 | #endif
67 |     return addr;
68 | }
69 | /* Kernel launched once, with var == 0, by nvbit_at_context_init_hook below. */
70 | __global__ void load_module_nvbit_kernel(int var) {
71 |     printf("");
72 |     if (var) {  // references gen_mref_addr; NOTE(review): presumably only so it is kept/loaded -- confirm
73 |         int tmp = gen_mref_addr(var, var, var, var, var, var, var, var, var);
74 |         printf("%d\n", tmp);
75 |     }
76 | }
77 | extern "C" void __nvbit_start();  // declared only; no definition appears in this header
78 | /* Context-init hook: calls __nvbit_start(), launches load_module_nvbit_kernel<<<1, 1>>>(0), then synchronizes. */
79 | extern "C" void nvbit_at_context_init_hook() {
80 |     __nvbit_start();
81 |     load_module_nvbit_kernel<<<1, 1>>>(0);
82 |     cudaDeviceSynchronize();
83 |     assert(cudaGetLastError() == cudaSuccess);  // NOTE(review): this check is compiled out when NDEBUG is defined
84 | }
85 | 


--------------------------------------------------------------------------------
/tracing-tool/nvbit/utils/utils.h:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  2 |  *
  3 |  * Redistribution and use in source and binary forms, with or without
  4 |  * modification, are permitted provided that the following conditions
  5 |  * are met:
  6 |  *  * Redistributions of source code must retain the above copyright
  7 |  *    notice, this list of conditions and the following disclaimer.
  8 |  *  * Redistributions in binary form must reproduce the above copyright
  9 |  *    notice, this list of conditions and the following disclaimer in the
 10 |  *    documentation and/or other materials provided with the distribution.
 11 |  *  * Neither the name of NVIDIA CORPORATION nor the names of its
 12 |  *    contributors may be used to endorse or promote products derived
 13 |  *    from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 16 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 18 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 19 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 20 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 21 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 22 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 23 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 25 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  */
 27 | 
 28 | #pragma once
 29 | #include <unistd.h>
 30 | #include <new>
 31 | #undef CEILING  // drop any earlier definition before redefining it
 32 | #define CEILING(x, y) (((x) + (y)-1) / (y))  // (x + y - 1) / y: integer divide rounding up for positive x, y; y is evaluated twice
 33 | /* Runs `call`, then reads cudaGetLastError(); on error prints the call text, file, line and error string to stderr and _exit()s. */
 34 | #define CUDA_SAFECALL(call)                                                 \
 35 |     {                                                                       \
 36 |         call;                                                               \
 37 |         cudaError err = cudaGetLastError();                                 \
 38 |         if (cudaSuccess != err) {                                           \
 39 |             fprintf(                                                        \
 40 |                 stderr,                                                     \
 41 |                 "Cuda error in function '%s' file '%s' in line %i : %s.\n", \
 42 |                 #call, __FILE__, __LINE__, cudaGetErrorString(err));        \
 43 |             fflush(stderr);                                                 \
 44 |             _exit(EXIT_FAILURE);                                            \
 45 |         }                                                                   \
 46 |     }  // expands to a bare brace block (not do/while(0)); this header does not include <stdio.h>/<stdlib.h> itself
 47 | 
 48 | /*********************************************************************
 49 |  *
 50 |  *                   Device level utility functions
 51 |  *
 52 |  **********************************************************************/
 53 | 
 54 | // Get the SM id: returns the current value of the PTX special register %smid.
 55 | __device__ __forceinline__ unsigned int get_smid(void) {
 56 |     unsigned int ret;
 57 |     asm volatile("mov.u32 %0, %smid;" : "=r"(ret));  // volatile: re-read on every call, as get_laneid does
 58 |     return ret;
 59 | }
 60 | 
 61 | // Get the warp id (PTX %warpid: a warp number within the thread block, not across the grid)
 62 | __device__ __forceinline__ unsigned int get_warpid(void) {
 63 |     unsigned int ret;
 64 |     asm volatile("mov.u32 %0, %warpid;" : "=r"(ret));  // volatile: re-read on every call, as get_laneid does
 65 |     return ret;
 66 | }
 67 | 
 68 | // Get the lane id of the calling thread within its warp (PTX %laneid)
 69 | __device__ __forceinline__ unsigned int get_laneid(void) {
 70 |     unsigned int lane;
 71 |     asm volatile("mov.u32 %0, %laneid;" : "=r"(lane));
 72 |     return lane;
 73 | }
 74 | 
 75 | // Get a global warp id: (linear block index) * (warps per block) + (warp index
 76 | // of this thread inside its block). Each intermediate is narrowed to int.
 77 | __device__ __forceinline__ int get_global_warp_id() {
 78 |     // Linear index of this block in the grid (x varies fastest, then y, then z).
 79 |     const int cta_linear =
 80 |         blockIdx.x + gridDim.x * (blockIdx.y + gridDim.y * blockIdx.z);
 81 | 
 82 |     // Linear index of this thread in its block, using the same ordering.
 83 |     const int tid_linear =
 84 |         threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
 85 | 
 86 |     // Number of 32-thread warps per block, rounded up.
 87 |     const int warps_per_cta = CEILING(blockDim.x * blockDim.y * blockDim.z, 32);
 88 | 
 89 |     return cta_linear * warps_per_cta + tid_linear / 32;
 90 | }
 91 | 
 92 | // Get a thread's CTA ID: x/y/z are read from PTX %ctaid, w is returned as 0.
 93 | __device__ __forceinline__ int4 get_ctaid(void) {
 94 |     int4 ret = make_int4(0, 0, 0, 0);  // only x/y/z are filled below; keep .w defined
 95 |     asm("mov.u32 %0, %ctaid.x;" : "=r"(ret.x));
 96 |     asm("mov.u32 %0, %ctaid.y;" : "=r"(ret.y));
 97 |     asm("mov.u32 %0, %ctaid.z;" : "=r"(ret.z));
 98 |     return ret;
 99 | }
100 | 
101 | // Get the number of CTAs per grid: x/y/z are read from PTX %nctaid, w is returned as 0.
102 | __device__ __forceinline__ int4 get_nctaid(void) {
103 |     int4 ret = make_int4(0, 0, 0, 0);  // only x/y/z are filled below; keep .w defined
104 |     asm("mov.u32 %0, %nctaid.x;" : "=r"(ret.x));
105 |     asm("mov.u32 %0, %nctaid.y;" : "=r"(ret.y));
106 |     asm("mov.u32 %0, %nctaid.z;" : "=r"(ret.z));
107 |     return ret;
108 | }
109 | 
110 | // Device level sleep: busy-waits until clock64() has advanced by at least clock_count
111 | __device__ __forceinline__ void csleep(uint64_t clock_count) {
112 |     if (!clock_count) return;  // nothing to wait for
113 |     const clock_t begin = clock64();
114 |     clock_t elapsed;
115 |     do {  // clock_count > 0 here, so at least one clock read is always needed
116 |         elapsed = clock64() - begin;
117 |     } while (elapsed < clock_count);
118 | }
119 | // Base class whose operator new/delete place objects in CUDA managed memory.
120 | class Managed {
121 |   public:
122 |     // Allocates len bytes with cudaMallocManaged; throws std::bad_alloc on failure
123 |     // instead of handing back an uninitialized pointer.
124 |     void *operator new(size_t len) { return managed_alloc(len); }
125 | 
126 |     // Releases storage obtained from operator new (cudaFree ignores a null pointer).
127 |     void operator delete(void *ptr) { cudaFree(ptr); }
128 | 
129 |     // Array forms: same behavior as the scalar forms above.
130 |     void *operator new[](size_t len) { return managed_alloc(len); }
131 |     void operator delete[](void *ptr) { cudaFree(ptr); }
132 |   private:
133 |     static void *managed_alloc(size_t len) {  // shared by both operator new forms
134 |         void *ptr = nullptr;
135 |         // cudaMallocManaged rejects a size of 0, so request at least one byte.
136 |         if (cudaMallocManaged(&ptr, len ? len : 1) != cudaSuccess) throw std::bad_alloc();
137 |         return ptr;
138 |     }
139 | };
139 | 


--------------------------------------------------------------------------------