├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── branch ├── bimodal.bpred ├── gshare.bpred ├── hashed_perceptron.bpred └── perceptron.bpred ├── build_champsim.sh ├── cvp_tracer ├── README.md └── cvp2champsim.cc ├── download.sh ├── download_links ├── get_stats.py ├── inc ├── block.h ├── cache.h ├── champsim.h ├── dram_controller.h ├── instruction.h ├── kpcp.h ├── memory_class.h ├── ooo_cpu.h ├── set.h ├── spp_dev.h └── uncore.h ├── ml_prefetch_sim.py ├── model.py ├── prefetcher ├── bo.h ├── bo.llc_pref ├── from_file.llc_pref ├── ip_stride.l2c_pref ├── kpcp.l2c_pref ├── kpcp_util.cc ├── next_line.l1d_pref ├── next_line.l1i_pref ├── next_line.l2c_pref ├── next_line.llc_pref ├── no.l1d_pref ├── no.l1i_pref ├── no.l2c_pref ├── no.llc_pref ├── spp_dev.l2c_pref └── trace.llc_pref ├── replacement ├── base_replacement.cc ├── drrip.llc_repl ├── lru.llc_repl ├── ship.llc_repl └── srrip.llc_repl ├── run_4core.sh ├── run_champsim.sh ├── scripts ├── download_dpc3_traces.sh ├── dpc3_max_simpoint.txt ├── multiworkload.cc └── seeds.txt ├── src ├── block.cc ├── cache.cc ├── dram_controller.cc ├── main.cc ├── ooo_cpu.cc └── uncore.cc └── tracer ├── champsim_tracer.cpp ├── clean_tracer.sh ├── make_tracer.sh ├── makefile └── makefile.rules /.gitignore: -------------------------------------------------------------------------------- 1 | prefetcher/l1i_prefetcher.cc 2 | prefetcher/l1d_prefetcher.cc 3 | prefetcher/l2c_prefetcher.cc 4 | prefetcher/llc_prefetcher.cc 5 | branch/branch_predictor.cc 6 | replacement/llc_replacement.cc 7 | 8 | inc/champsim.h.bak 9 | 10 | bin/ 11 | obj/ 12 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | app = champsim 2 | 3 | srcExt = cc 4 | srcDir = src branch replacement prefetcher 5 | objDir = obj 6 | binDir = bin 7 | inc = inc 8 | 9 | debug = 1 10 | 11 | CFlags = -Wall -O3 -std=c++11 12 | LDFlags = 13 | libs = 14 | libDir 
= 15 | 16 | 17 | #************************ DO NOT EDIT BELOW THIS LINE! ************************ 18 | 19 | ifeq ($(debug),1) 20 | debug=-g 21 | else 22 | debug= 23 | endif 24 | inc := $(addprefix -I,$(inc)) 25 | libs := $(addprefix -l,$(libs)) 26 | libDir := $(addprefix -L,$(libDir)) 27 | CFlags += -c $(debug) $(inc) $(libDir) $(libs) 28 | sources := $(shell find $(srcDir) -name '*.$(srcExt)') 29 | srcDirs := $(shell find . -name '*.$(srcExt)' -exec dirname {} \; | uniq) 30 | objects := $(patsubst %.$(srcExt),$(objDir)/%.o,$(sources)) 31 | 32 | ifeq ($(srcExt),cc) 33 | CC = $(CXX) 34 | else 35 | CFlags += -std=gnu99 36 | endif 37 | 38 | .phony: all clean distclean 39 | 40 | 41 | all: $(binDir)/$(app) 42 | 43 | $(binDir)/$(app): buildrepo $(objects) 44 | @mkdir -p `dirname $@` 45 | @echo "Linking $@..." 46 | @$(CC) $(objects) $(LDFlags) -o $@ 47 | 48 | $(objDir)/%.o: %.$(srcExt) 49 | @echo "Generating dependencies for $<..." 50 | @$(call make-depend,$<,$@,$(subst .o,.d,$@)) 51 | @echo "Compiling $<..." 52 | @$(CC) $(CFlags) $< -o $@ 53 | 54 | clean: 55 | $(RM) -r $(objDir) 56 | 57 | distclean: clean 58 | $(RM) -r $(binDir)/$(app) 59 | 60 | buildrepo: 61 | @$(call make-repo) 62 | 63 | define make-repo 64 | for dir in $(srcDirs); \ 65 | do \ 66 | mkdir -p $(objDir)/$$dir; \ 67 | done 68 | endef 69 | 70 | 71 | # usage: $(call make-depend,source-file,object-file,depend-file) 72 | define make-depend 73 | $(CC) -MM \ 74 | -MF $3 \ 75 | -MP \ 76 | -MT $2 \ 77 | $(CFlags) \ 78 | $1 79 | endef 80 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Modified ChampSim for ML Prefetching Competition 2 | 3 | We will use ChampSim to evaluate the effectiveness of your ML prefetchers. 
Your 4 | prefetching models will be trained using the Load Traces that we provide (details below), 5 | and they will generate an Output File with a list of prefetches that will be fed back into 6 | ChampSim to compute coverage, accuracy and instructions per cycle (IPC). 7 | 8 | ## Traces: 9 | 10 | The traces can be found at [this link](https://utexas.box.com/s/2k54kp8zvrqdfaa8cdhfquvcxwh7yn85). 11 | Alternatively, the `download.sh` file can be used to download all of the files to 12 | avoid bulk download restrictions from Box. You can also use the information found 13 | in the `download_links` file to download the data in another fashion. 14 | 15 | There are two types of traces that can be found here: 16 | - Load traces under the folder LoadTraces that you will use to train your ML models. The 17 | load trace is a series of a program's LLC accesses, and the trace format is as follows: 18 | ``` 19 | Unique Instr Id, Cycle Count, Load Address, Instruction Pointer of the Load, LLC hit/miss 20 | ``` 21 | The load traces are plain text CSV. 22 | 23 | - Execution traces under the folder ChampSimTraces that ChampSim will need to 24 | compute IPC. You do not need these traces to train your models; they are 25 | only provided to facilitate an evaluation using IPCs. Note that you should not 26 | unzip the execution traces, as ChampSim expects them to be in the zipped format. 27 | 28 | ## Output File 29 | 30 | For a given Load Trace, your code should generate an output file that contains one 31 | prefetch per line. Each line should consist of two space-separated integral 32 | values, the unique instruction ID for which you want to issue a prefetch and the 33 | load address you want to prefetch. The unique instruction ID corresponds to 34 | the ID of the triggering load in the input Load Trace. You can include up to two 35 | prefetches per load listed in the Load Trace. You can choose not to prefetch 36 | for a load.
Note that the prefetches should be in the order that they occur in the trace. 37 | Should you exceed the maximum number of prefetches per load, the first two will 38 | be kept and the remaining excess prefetches for that load will be discarded. 39 | 40 | For example, consider a Load Trace as follows: 41 | ``` 42 | 3659 cycle1 A ip1 1 43 | 5433 cycle2 B ip2 0 44 | 6928 cycle3 C ip3 0 45 | ``` 46 | 47 | Your output file could look something like this: 48 | ``` 49 | 3659 A+1 # Issue first prefetch for Instruction 3659 50 | 3659 A+2 # Issue second prefetch for Instruction 3659 51 | 5433 B+8 # Issue only one prefetch for Instruction 5433 52 | ``` 53 | 54 | ## Your Code: 55 | 56 | Your code should have two modes of functioning: 57 | 58 | 1. Taking in a Training Load Trace that your model trains on 59 | 2. Taking in a Test Load Trace for which your model will produce predictions in 60 | the format explained above. 61 | 62 | ## Building, Running, and Evaluating 63 | 64 | This has been rolled into one script `ml_prefetch_sim.py`. Below there are some 65 | common use cases highlighted, but more information can be found for each of the 66 | subcommands by running: 67 | 68 | ``` 69 | ./ml_prefetch_sim.py help subcommand 70 | ``` 71 | 72 | where subcommand is any of `build|run|eval` 73 | 74 | ### Building 75 | 76 | The following command will compile two ChampSim binaries: (1) A ChampSim binary 77 | that reads your ML model's output from a file and uses that as a prefetcher, 78 | and (2) A ChampSim binary with no prefetching that is to be used as a baseline 79 | 80 | ``` 81 | ./ml_prefetch_sim.py build 82 | ``` 83 | 84 | ### Training 85 | 86 | ``` 87 | ./ml_prefetch_sim.py train path_to_load_trace --model save_path --num-prefetch-warmup-instructions num_in_millions 88 | ``` 89 | 90 | To use the above, you need to modify the `model.py` file with your model. The 91 | prefetch warm-up instructions specify how many to include in the training set.
92 | The remainder of the instructions are the evaluation set. 93 | 94 | ### Generating the Prefetch File 95 | 96 | ``` 97 | ./ml_prefetch_sim.py generate path_to_load_trace path_to_output_prefetch_file --model save_path --num-prefetch-warmup-instructions num_in_millions 98 | ``` 99 | 100 | To use the above, you need to modify the `model.py` file with your model. The 101 | prefetch warm-up instructions specify how many to include in the training set. 102 | The remainder of the instructions are the evaluation set. 103 | 104 | ### Running 105 | 106 | To run the baseline ChampSim binaries on an execution trace: 107 | 108 | ``` 109 | ./ml_prefetch_sim.py run path_to_champsim_trace_here 110 | ``` 111 | 112 | To additionally run the ChampSim binary with your prefetcher: 113 | 114 | ``` 115 | ./ml_prefetch_sim.py run path_to_champsim_trace_here --prefetch path_to_prefetcher_file 116 | ``` 117 | 118 | To run the ChampSim binary with your prefetcher only: 119 | 120 | ``` 121 | ./ml_prefetch_sim.py run path_to_trace_here --prefetch path_to_prefetcher_file --no-base 122 | ``` 123 | 124 | ### Evaluation 125 | 126 | To evaluate the performance of ML prefetcher (and compare it against the baseline 127 | of no prefetcher, Best Offset, SISB, and SISB Best Offset), run: 128 | 129 | ``` 130 | ./ml_prefetch_sim.py eval 131 | ``` 132 | 133 | ## Competition Judging 134 | 135 | To test how submissions generalize, our test set evaluation will have two components: 136 | 137 | - Undisclosed execution samples for the training traces: You can submit a 138 | pre-trained model for each benchmark in the training set, and we will 139 | evaluate it on a different sample of the same benchmark 140 | 141 | - Undisclosed benchmarks: We will train and test your model on unseen 142 | benchmarks using the training routines that you provide 143 | 144 | ## Changes made to ChampSim for the competition: 145 | 146 | - Add LLC prefetcher (from\_file) to load ML model prefetch predictions into ChampSim 147 | - 
Modify the LLC prefetcher to provide unique instruction IDs and cycle counts 148 | - Remove same-page restriction in src/cache.cc for more irregular prefetching 149 | opportunity 150 | - Add ml\_prefetch\_sim.py to handle all of the building, running, and evaluation. 151 | 152 | --- 153 | 154 |

155 |

ChampSim

156 |

ChampSim is a trace-based simulator for microarchitecture studies. You can sign up for the public mailing list by sending an empty email to champsim+subscribe@googlegroups.com. Traces for the 3rd Data Prefetching Championship (DPC-3) can be found here (https://dpc3.compas.cs.stonybrook.edu/?SW_IS). A set of traces used for the 2nd Cache Replacement Championship (CRC-2) can be found at this link (http://bit.ly/2t2nkUj).

157 |

158 | 159 | # Clone ChampSim repository 160 | ``` 161 | git clone https://github.com/ChampSim/ChampSim.git 162 | ``` 163 | 164 | # Compile 165 | 166 | ChampSim takes seven parameters: Branch predictor, L1I prefetcher, L1D prefetcher, L2C prefetcher, LLC prefetcher, LLC replacement policy, and the number of cores. 167 | For example, `./build_champsim.sh bimodal no no no no lru 1` builds a single-core processor with bimodal branch predictor, no L1I/L1D/L2C/LLC prefetchers, and the baseline LRU replacement policy for the LLC. 168 | ``` 169 | $ ./build_champsim.sh bimodal no no no no lru 1 170 | 171 | $ ./build_champsim.sh ${BRANCH} ${L1I_PREFETCHER} ${L1D_PREFETCHER} ${L2C_PREFETCHER} ${LLC_PREFETCHER} ${LLC_REPLACEMENT} ${NUM_CORE} 172 | ``` 173 | 174 | # Download DPC-3 trace 175 | 176 | Professor Daniel Jimenez at Texas A&M University kindly provided traces for DPC-3. Use the following script to download these traces (~20GB size and max simpoint only). 177 | ``` 178 | $ cd scripts 179 | 180 | $ ./download_dpc3_traces.sh 181 | ``` 182 | 183 | # Run simulation 184 | 185 | Execute `run_champsim.sh` with proper input arguments. The default `TRACE_DIR` in `run_champsim.sh` is set to `$PWD/dpc3_traces`.
186 | 187 | * Single-core simulation: Run simulation with `run_champsim.sh` script. 188 | 189 | ``` 190 | Usage: ./run_champsim.sh [BINARY] [N_WARM] [N_SIM] [TRACE] [OPTION] 191 | $ ./run_champsim.sh bimodal-no-no-no-no-lru-1core 1 10 400.perlbench-41B.champsimtrace.xz 192 | 193 | ${BINARY}: ChampSim binary compiled by "build_champsim.sh" (bimodal-no-no-no-no-lru-1core) 194 | ${N_WARM}: number of instructions for warmup (1 million) 195 | ${N_SIM}: number of instructions for detailed simulation (10 million) 196 | ${TRACE}: trace name (400.perlbench-41B.champsimtrace.xz) 197 | ${OPTION}: extra option for "-low_bandwidth" (src/main.cc) 198 | ``` 199 | Simulation results will be stored under "results_${N_SIM}M" as a form of "${TRACE}-${BINARY}-${OPTION}.txt".
200 | 201 | * Multi-core simulation: Run simulation with `run_4core.sh` script.
202 | ``` 203 | Usage: ./run_4core.sh [BINARY] [N_WARM] [N_SIM] [N_MIX] [TRACE0] [TRACE1] [TRACE2] [TRACE3] [OPTION] 204 | $ ./run_4core.sh bimodal-no-no-no-no-lru-4core 1 10 0 400.perlbench-41B.champsimtrace.xz \\ 205 | 401.bzip2-38B.champsimtrace.xz 403.gcc-17B.champsimtrace.xz 410.bwaves-945B.champsimtrace.xz 206 | ``` 207 | Note that we need to specify multiple trace files for `run_4core.sh`. `N_MIX` is used to represent a unique ID for mixed multi-programmed workloads. 208 | 209 | 210 | # Add your own branch predictor, data prefetchers, and replacement policy 211 | **Copy an empty template** 212 | ``` 213 | $ cp branch/branch_predictor.cc branch/mybranch.bpred 214 | $ cp prefetcher/l1d_prefetcher.cc prefetcher/mypref.l1d_pref 215 | $ cp prefetcher/l2c_prefetcher.cc prefetcher/mypref.l2c_pref 216 | $ cp prefetcher/llc_prefetcher.cc prefetcher/mypref.llc_pref 217 | $ cp replacement/llc_replacement.cc replacement/myrepl.llc_repl 218 | ``` 219 | 220 | **Work on your algorithms with your favorite text editor** 221 | ``` 222 | $ vim branch/mybranch.bpred 223 | $ vim prefetcher/mypref.l1d_pref 224 | $ vim prefetcher/mypref.l2c_pref 225 | $ vim prefetcher/mypref.llc_pref 226 | $ vim replacement/myrepl.llc_repl 227 | ``` 228 | 229 | **Compile and test** (the build script requires all seven arguments; `no` selects the empty L1I prefetcher) 230 | ``` 231 | $ ./build_champsim.sh mybranch no mypref mypref mypref myrepl 1 232 | $ ./run_champsim.sh mybranch-no-mypref-mypref-mypref-myrepl-1core 1 10 bzip2_183B 233 | ``` 234 | 235 | # How to create traces 236 | 237 | We have included only 4 sample traces, taken from SPEC CPU 2006. These 238 | traces are short (10 million instructions), and do not necessarily cover the range of behaviors your 239 | replacement algorithm will likely see in the full competition trace list (not 240 | included). We STRONGLY recommend creating your own traces, covering 241 | a wide variety of program types and behaviors. 242 | 243 | The included Pin Tool champsim_tracer.cpp can be used to generate new traces.
244 | We used Pin 3.2 (pin-3.2-81205-gcc-linux), and it may require 245 | installing libdwarf.so, libelf.so, or other libraries, if you do not already 246 | have them. Please refer to the Pin documentation (https://software.intel.com/sites/landingpage/pintool/docs/81205/Pin/html/) 247 | for working with Pin 3.2. 248 | 249 | Get this version of Pin: 250 | ``` 251 | wget http://software.intel.com/sites/landingpage/pintool/downloads/pin-3.2-81205-gcc-linux.tar.gz 252 | ``` 253 | 254 | **Note on compatibility**: If you are using newer linux kernels/Ubuntu versions (eg. 20.04LTS), you might run into issues (such as [[1](https://github.com/ChampSim/ChampSim/issues/102)],[[2](https://stackoverflow.com/questions/55698095/intel-pin-tools-32-bit-processsectionheaders-560-assertion-failed)],[[3](https://stackoverflow.com/questions/43589174/pin-tool-segmentation-fault-for-ubuntu-17-04)]) with the PIN3.2. ChampSim tracer works fine with newer PIN tool versions that can be downloaded from [here](https://software.intel.com/content/www/us/en/develop/articles/pin-a-binary-instrumentation-tool-downloads.html). PIN3.17 is [confirmed](https://github.com/ChampSim/ChampSim/issues/102) to work with Ubuntu 20.04.1 LTS. 255 | 256 | Once downloaded, open tracer/make_tracer.sh and change PIN_ROOT to Pin's location. 257 | Run ./make_tracer.sh to generate champsim_tracer.so. 258 | 259 | **Use the Pin tool like this** 260 | ``` 261 | pin -t obj-intel64/champsim_tracer.so -- 262 | ``` 263 | 264 | The tracer has three options you can set: 265 | ``` 266 | -o 267 | Specify the output file for your trace. 268 | The default is default_trace.champsim 269 | 270 | -s 271 | Specify the number of instructions to skip in the program before tracing begins. 272 | The default value is 0. 273 | 274 | -t 275 | The number of instructions to trace, after -s instructions have been skipped. 276 | The default value is 1,000,000. 
277 | ``` 278 | For example, you could trace 200,000 instructions of the program ls, after 279 | skipping the first 100,000 instructions, with this command: 280 | ``` 281 | pin -t obj/champsim_tracer.so -o traces/ls_trace.champsim -s 100000 -t 200000 -- ls 282 | ``` 283 | Traces created with the champsim_tracer.so are approximately 64 bytes per instruction, 284 | but they generally compress down to less than a byte per instruction using xz compression. 285 | 286 | # Evaluate Simulation 287 | 288 | ChampSim measures the IPC (Instructions Per Cycle) value as a performance metric.
289 | There are some other useful metrics printed out at the end of simulation.
290 | 291 | Good luck and be a champion!
292 | -------------------------------------------------------------------------------- /branch/bimodal.bpred: -------------------------------------------------------------------------------- 1 | #include "ooo_cpu.h" 2 | 3 | #define BIMODAL_TABLE_SIZE 16384 4 | #define BIMODAL_PRIME 16381 5 | #define MAX_COUNTER 3 6 | int bimodal_table[NUM_CPUS][BIMODAL_TABLE_SIZE]; 7 | 8 | void O3_CPU::initialize_branch_predictor() 9 | { 10 | cout << "CPU " << cpu << " Bimodal branch predictor" << endl; 11 | 12 | for(int i = 0; i < BIMODAL_TABLE_SIZE; i++) 13 | bimodal_table[cpu][i] = 0; 14 | } 15 | 16 | uint8_t O3_CPU::predict_branch(uint64_t ip) 17 | { 18 | uint32_t hash = ip % BIMODAL_PRIME; 19 | uint8_t prediction = (bimodal_table[cpu][hash] >= ((MAX_COUNTER + 1)/2)) ? 1 : 0; 20 | 21 | return prediction; 22 | } 23 | 24 | void O3_CPU::last_branch_result(uint64_t ip, uint8_t taken) 25 | { 26 | uint32_t hash = ip % BIMODAL_PRIME; 27 | 28 | if (taken && (bimodal_table[cpu][hash] < MAX_COUNTER)) 29 | bimodal_table[cpu][hash]++; 30 | else if ((taken == 0) && (bimodal_table[cpu][hash] > 0)) 31 | bimodal_table[cpu][hash]--; 32 | } 33 | -------------------------------------------------------------------------------- /branch/gshare.bpred: -------------------------------------------------------------------------------- 1 | #include "ooo_cpu.h" 2 | 3 | #define GLOBAL_HISTORY_LENGTH 14 4 | #define GLOBAL_HISTORY_MASK (1 << GLOBAL_HISTORY_LENGTH) - 1 5 | int branch_history_vector[NUM_CPUS]; 6 | 7 | #define GS_HISTORY_TABLE_SIZE 16384 8 | int gs_history_table[NUM_CPUS][GS_HISTORY_TABLE_SIZE]; 9 | int my_last_prediction[NUM_CPUS]; 10 | 11 | void O3_CPU::initialize_branch_predictor() 12 | { 13 | cout << "CPU " << cpu << " GSHARE branch predictor" << endl; 14 | 15 | branch_history_vector[cpu] = 0; 16 | my_last_prediction[cpu] = 0; 17 | 18 | for(int i=0; i>GLOBAL_HISTORY_LENGTH)^(ip>>(GLOBAL_HISTORY_LENGTH*2))^bh_vector; 25 | hash = hash%GS_HISTORY_TABLE_SIZE; 26 | 27 | //printf("%d\n", hash); 28 
| 29 | return hash; 30 | } 31 | 32 | uint8_t O3_CPU::predict_branch(uint64_t ip) 33 | { 34 | int prediction = 1; 35 | 36 | int gs_hash = gs_table_hash(ip, branch_history_vector[cpu]); 37 | 38 | if(gs_history_table[cpu][gs_hash] >= 2) 39 | prediction = 1; 40 | else 41 | prediction = 0; 42 | 43 | my_last_prediction[cpu] = prediction; 44 | 45 | return prediction; 46 | } 47 | 48 | void O3_CPU::last_branch_result(uint64_t ip, uint8_t taken) 49 | { 50 | int gs_hash = gs_table_hash(ip, branch_history_vector[cpu]); 51 | 52 | if(taken == 1) { 53 | if(gs_history_table[cpu][gs_hash] < 3) 54 | gs_history_table[cpu][gs_hash]++; 55 | } else { 56 | if(gs_history_table[cpu][gs_hash] > 0) 57 | gs_history_table[cpu][gs_hash]--; 58 | } 59 | 60 | // update branch history vector 61 | branch_history_vector[cpu] <<= 1; 62 | branch_history_vector[cpu] &= GLOBAL_HISTORY_MASK; 63 | branch_history_vector[cpu] |= taken; 64 | } 65 | -------------------------------------------------------------------------------- /branch/hashed_perceptron.bpred: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | This code implements a hashed perceptron branch predictor using geometric 4 | history lengths and dynamic threshold setting. 5 | 6 | It was written by Daniel A. Jiménez in March 2019. To the extent allowed by 7 | law, the author abdicates all rights to this work and places it in the public 8 | domain. 9 | 10 | The original perceptron branch predictor is from Jiménez and Lin, "Dynamic 11 | Branch Prediction with Perceptrons," HPCA 2001. 12 | 13 | The idea of using multiple independently indexed tables of perceptron weights 14 | is from Jiménez, "Fast Path-Based Neural Branch Prediction," MICRO 2003 and 15 | later expanded in "Piecewise Linear Branch Prediction" from ISCA 2005. 16 | 17 | The idea of using hashes of branch history to reduce the number of independent 18 | tables is documented in three contemporaneous papers: 19 | 20 | 1. 
Seznec, "Revisiting the Perceptron Predictor," IRISA technical report, 2004. 21 | 22 | 2. Tarjan and Skadron, "Revisiting the Perceptron Predictor Again," UVA 23 | technical report, 2004, expanded and published in ACM TACO 2005 as "Merging 24 | path and gshare indexing in perceptron branch prediction"; introduces the term 25 | "hashed perceptron." 26 | 27 | 3. Loh and Jiménez, "Reducing the Power and Complexity of Path-Based Neural 28 | Branch Prediction," WCED 2005. 29 | 30 | The ideas of using "geometric history lengths" i.e. hashing into tables with 31 | histories of exponentially increasing length, as well as dynamically adjusting 32 | the theta parameter, are from Seznec, "The O-GEHL Branch Predictor," from CBP 33 | 2004, expanded later as "Analysis of the O-GEometric History Length Branch 34 | Predictor" in ISCA 2005. 35 | 36 | This code uses these ideas, but prefers simplicity over absolute accuracy (I 37 | wrote it in about an hour and later spent more time on this comment block than 38 | I did on the code). These papers and subsequent papers by Jiménez and other 39 | authors significantly improve the accuracy of perceptron-based predictors but 40 | involve tricks and analysis beyond the needs of a tool like ChampSim that 41 | targets cache optimizations. If you want accuracy at any cost, see the winners 42 | of the latest branch prediction contest, CBP 2016 as of this writing, but 43 | prepare to have your face melted off by the complexity of the code you find 44 | there. If you are a student being asked to code a good branch predictor for 45 | your computer architecture class, don't copy this code; there are much better 46 | sources for you to plagiarize. 
47 | 48 | */ 49 | 50 | #include 51 | #include 52 | #include 53 | #include 54 | 55 | #include "ooo_cpu.h" 56 | 57 | // this many tables 58 | 59 | #define NTABLES 16 60 | 61 | // maximum history length 62 | 63 | #define MAXHIST 232 64 | 65 | // minimum history length (for table 1; table 0 is biases) 66 | 67 | #define MINHIST 3 68 | 69 | // speed for dynamic threshold setting 70 | 71 | #define SPEED 18 72 | 73 | // geometric global history lengths 74 | 75 | int history_lengths[NTABLES] = { 0, 3, 4, 6, 8, 10, 14, 19, 26, 36, 49, 67, 91, 125, 170, MAXHIST }; 76 | 77 | // 12-bit indices for the tables 78 | 79 | #define LOG_TABLE_SIZE 12 80 | #define TABLE_SIZE (1<= 1; 174 | } 175 | 176 | void O3_CPU::last_branch_result(uint64_t pc, uint8_t taken) { 177 | 178 | // was this prediction correct? 179 | 180 | bool correct = taken == (yout[cpu] >= 1); 181 | 182 | // insert this branch outcome into the global history 183 | 184 | bool b = taken; 185 | for (int i=0; i -128) (*c)--; 217 | } 218 | } 219 | 220 | // dynamic threshold setting from Seznec's O-GEHL paper 221 | 222 | if (!correct) { 223 | 224 | // increase theta after enough mispredictions 225 | 226 | tc[cpu]++; 227 | if (tc[cpu] >= SPEED) { 228 | theta[cpu]++; 229 | tc[cpu] = 0; 230 | } 231 | } else if (a < theta[cpu]) { 232 | 233 | // decrease theta after enough weak but correct predictions 234 | 235 | tc[cpu]--; 236 | if (tc[cpu] <= -SPEED) { 237 | theta[cpu]--; 238 | tc[cpu] = 0; 239 | } 240 | } 241 | } 242 | } 243 | -------------------------------------------------------------------------------- /branch/perceptron.bpred: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2001 University of Texas at Austin 3 | * 4 | * Daniel A. 
Jimenez 5 | * Calvin Lin 6 | * 7 | * Permission is hereby granted, free of charge, to any person 8 | * obtaining a copy of this software (the "Software"), to deal in 9 | * the Software without restriction, including without limitation 10 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 11 | * and/or sell copies of the Software, and to permit persons to whom the 12 | * Software is furnished to do so, subject to the following conditions: 13 | * 14 | * The above copyright notice and this permission notice shall be 15 | * included in all copies or substantial portions of the Software. 16 | * 17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | * NONINFRINGEMENT. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT 21 | * AUSTIN BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 22 | * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF 23 | * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 24 | * THE SOFTWARE. 25 | * 26 | * This file implements the simulated perceptron branch predictor from: 27 | * 28 | * Jimenez, D. A. & Lin, C., Dynamic branch prediction with perceptrons, 29 | * Proceedings of the Seventh International Symposium on High Performance 30 | * Computer Architecture (HPCA), Monterrey, NL, Mexico 2001 31 | * 32 | * The #define's here specify a perceptron predictor with a history 33 | * length of 24, 163 perceptrons, and 8-bit weights. This represents 34 | * a hardware budget of (24+1)*8*163 = 32600 bits, or about 4K bytes, 35 | * which is comparable to the hardware budget of the Alpha 21264 hybrid 36 | * branch predictor. 37 | * 38 | * There are three important functions defined in this file: 39 | * 40 | * 1. void initialize_perceptron_predictor (void); 41 | * Initialize the perceptron predictor 42 | * 43 | * 2. 
perceptron_state *perceptron_dir_lookup (unsigned int); 44 | * Get a branch prediction, given a branch address. This function returns a 45 | * pointer to a 'perceptron_state' struct, which contains the prediction, the 46 | * perceptron output, and other information necessary for using and updating 47 | * the predictor. The first member of a 'perceptron_state' struct is a char 48 | * that is assigned 3 if the branch is predicted taken, 0 otherwise; this way, 49 | * a pointer to 'perceptron_state' can be cast to (char *) and passed around 50 | * SimpleScalar as though it were a pointer to a pattern history table entry. 51 | * 52 | * 3. void perceptron_update (perceptron_state *, int); 53 | * Update the branch predictor using the 'perceptron_state' pointer 54 | * returned by perceptron_dir_lookup() and an int that is 1 if the branch 55 | * was taken, 0 otherwise. 56 | */ 57 | 58 | #include "ooo_cpu.h" 59 | 60 | /* history length for the global history shift register */ 61 | 62 | #define PERCEPTRON_HISTORY 24 63 | 64 | /* number of perceptrons */ 65 | 66 | #define NUM_PERCEPTRONS 163 67 | 68 | /* number of bits per weight */ 69 | 70 | #define PERCEPTRON_BITS 8 71 | 72 | /* maximum and minimum weight values */ 73 | 74 | #define MAX_WEIGHT ((1<<(PERCEPTRON_BITS-1))-1) 75 | #define MIN_WEIGHT (-(MAX_WEIGHT+1)) 76 | 77 | /* threshold for training */ 78 | 79 | #define THETA ((int) (1.93 * PERCEPTRON_HISTORY + 14)) 80 | 81 | /* size of buffer for keeping 'perceptron_state' for update */ 82 | 83 | #define NUM_UPDATE_ENTRIES 100 84 | 85 | /* perceptron data structure */ 86 | 87 | typedef struct { 88 | int 89 | /* just a vector of integers */ 90 | 91 | weights[PERCEPTRON_HISTORY+1]; 92 | } perceptron; 93 | 94 | /* 'perceptron_state' - stores the branch prediction and keeps information 95 | * such as output and history needed for updating the perceptron predictor 96 | */ 97 | typedef struct { 98 | char 99 | /* this char emulates a pattern history table entry 100 | * with a value 
of 0 for "predict not taken" or 3 for 101 | * "predict taken," so a perceptron_state pointer can 102 | * be passed around SimpleScalar's branch prediction 103 | * infrastructure without changing too much stuff. 104 | */ 105 | dummy_counter; 106 | 107 | int 108 | /* prediction: 1 for taken, 0 for not taken */ 109 | 110 | prediction, 111 | 112 | /* perceptron output */ 113 | 114 | output; 115 | 116 | unsigned long long int 117 | /* value of the history register yielding this prediction */ 118 | 119 | history; 120 | 121 | perceptron 122 | /* pointer to the perceptron yielding this prediction */ 123 | 124 | *perc; 125 | } perceptron_state; 126 | 127 | perceptron 128 | /* table of perceptrons */ 129 | 130 | perceptrons[NUM_CPUS][NUM_PERCEPTRONS]; 131 | 132 | perceptron_state 133 | /* state for updating perceptron predictor */ 134 | 135 | perceptron_state_buf[NUM_CPUS][NUM_UPDATE_ENTRIES]; 136 | 137 | int 138 | /* index of the next "free" perceptron_state */ 139 | 140 | perceptron_state_buf_ctr[NUM_CPUS]; 141 | 142 | unsigned long long int 143 | 144 | /* speculative global history - updated by predictor */ 145 | 146 | spec_global_history[NUM_CPUS], 147 | 148 | /* real global history - updated when the predictor is updated */ 149 | 150 | global_history[NUM_CPUS]; 151 | 152 | perceptron_state *u[NUM_CPUS]; 153 | 154 | /* initialize a single perceptron */ 155 | void initialize_perceptron (perceptron *p) { 156 | int i; 157 | 158 | for (i=0; i<=PERCEPTRON_HISTORY; i++) p->weights[i] = 0; 159 | } 160 | 161 | void O3_CPU::initialize_branch_predictor() 162 | { 163 | spec_global_history[cpu] = 0; 164 | global_history[cpu] = 0; 165 | perceptron_state_buf_ctr[cpu] = 0; 166 | for (int i=0; i= NUM_UPDATE_ENTRIES) 190 | perceptron_state_buf_ctr[cpu] = 0; 191 | 192 | /* hash the address to get an index into the table of perceptrons */ 193 | 194 | index = address % NUM_PERCEPTRONS; 195 | 196 | /* get pointers to that perceptron and its weights */ 197 | 198 | p = 
&perceptrons[cpu][index]; 199 | w = &p->weights[0]; 200 | 201 | /* initialize the output to the bias weight, and bump the pointer 202 | * to the weights 203 | */ 204 | 205 | output = *w++; 206 | 207 | /* find the (rest of the) dot product of the history register 208 | * and the perceptron weights. note that, instead of actually 209 | * doing the expensive multiplies, we simply add a weight when the 210 | * corresponding branch in the history register is taken, or 211 | * subtract a weight when the branch is not taken. this also lets 212 | * us use binary instead of bipolar logic to represent the history 213 | * register 214 | */ 215 | for (mask=1,i=0; ioutput = output; 225 | u[cpu]->perc = p; 226 | u[cpu]->history = spec_global_history[cpu]; 227 | u[cpu]->prediction = output >= 0; 228 | u[cpu]->dummy_counter = u[cpu]->prediction ? 3 : 0; 229 | 230 | /* update the speculative global history register */ 231 | 232 | spec_global_history[cpu] <<= 1; 233 | spec_global_history[cpu] |= u[cpu]->prediction; 234 | return u[cpu]->prediction; 235 | } 236 | 237 | void O3_CPU::last_branch_result(uint64_t ip, uint8_t taken) 238 | { 239 | int 240 | i, 241 | y, 242 | *w; 243 | 244 | unsigned long long int 245 | mask, 246 | history; 247 | 248 | /* update the real global history shift register */ 249 | 250 | global_history[cpu] <<= 1; 251 | global_history[cpu] |= taken; 252 | 253 | /* if this branch was mispredicted, restore the speculative 254 | * history to the last known real history 255 | */ 256 | 257 | if (u[cpu]->prediction != taken) spec_global_history[cpu] = global_history[cpu]; 258 | 259 | /* if the output of the perceptron predictor is outside of 260 | * the range [-THETA,THETA] *and* the prediction was correct, 261 | * then we don't need to adjust the weights 262 | */ 263 | 264 | if (u[cpu]->output > THETA) 265 | y = 1; 266 | else if (u[cpu]->output < -THETA) 267 | y = 0; 268 | else 269 | y = 2; 270 | if (y == 1 && taken) return; 271 | if (y == 0 && !taken) return; 272 | 
273 | /* w is a pointer to the first weight (the bias weight) */ 274 | 275 | w = &u[cpu]->perc->weights[0]; 276 | 277 | /* if the branch was taken, increment the bias weight, 278 | * else decrement it, with saturating arithmetic 279 | */ 280 | 281 | if (taken) 282 | (*w)++; 283 | else 284 | (*w)--; 285 | if (*w > MAX_WEIGHT) *w = MAX_WEIGHT; 286 | if (*w < MIN_WEIGHT) *w = MIN_WEIGHT; 287 | 288 | /* now w points to the next weight */ 289 | 290 | w++; 291 | 292 | /* get the history that led to this prediction */ 293 | 294 | history = u[cpu]->history; 295 | 296 | /* for each weight and corresponding bit in the history register... */ 297 | 298 | for (mask=1,i=0; i !!x is 1 iff x is not zero, in this case history is positively correlated with branch outcome 306 | (*w)++; 307 | if (*w > MAX_WEIGHT) *w = MAX_WEIGHT; 308 | } else { 309 | (*w)--; 310 | if (*w < MIN_WEIGHT) *w = MIN_WEIGHT; 311 | } 312 | } 313 | } 314 | -------------------------------------------------------------------------------- /build_champsim.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$#" -ne 7 ]; then 4 | echo "Illegal number of parameters" 5 | echo "Usage: ./build_champsim.sh [branch_pred] [l1d_pref] [l2c_pref] [llc_pref] [llc_repl] [num_core]" 6 | exit 1 7 | fi 8 | 9 | # ChampSim configuration 10 | BRANCH=$1 # branch/*.bpred 11 | L1I_PREFETCHER=$2 # prefetcher/*.l1i_pref 12 | L1D_PREFETCHER=$3 # prefetcher/*.l1d_pref 13 | L2C_PREFETCHER=$4 # prefetcher/*.l2c_pref 14 | LLC_PREFETCHER=$5 # prefetcher/*.llc_pref 15 | LLC_REPLACEMENT=$6 # replacement/*.llc_repl 16 | NUM_CORE=$7 # tested up to 8-core system 17 | 18 | ############## Some useful macros ############### 19 | BOLD=$(tput bold) 20 | NORMAL=$(tput sgr0) 21 | ################################################# 22 | 23 | # Sanity check 24 | if [ ! 
-f ./branch/${BRANCH}.bpred ]; then 25 | echo "[ERROR] Cannot find branch predictor" 26 | echo "[ERROR] Possible branch predictors from branch/*.bpred " 27 | find branch -name "*.bpred" 28 | exit 1 29 | fi 30 | 31 | if [ ! -f ./prefetcher/${L1I_PREFETCHER}.l1i_pref ]; then 32 | echo "[ERROR] Cannot find L1I prefetcher" 33 | echo "[ERROR] Possible L1I prefetchers from prefetcher/*.l1i_pref " 34 | find prefetcher -name "*.l1i_pref" 35 | exit 1 36 | fi 37 | 38 | if [ ! -f ./prefetcher/${L1D_PREFETCHER}.l1d_pref ]; then 39 | echo "[ERROR] Cannot find L1D prefetcher" 40 | echo "[ERROR] Possible L1D prefetchers from prefetcher/*.l1d_pref " 41 | find prefetcher -name "*.l1d_pref" 42 | exit 1 43 | fi 44 | 45 | if [ ! -f ./prefetcher/${L2C_PREFETCHER}.l2c_pref ]; then 46 | echo "[ERROR] Cannot find L2C prefetcher" 47 | echo "[ERROR] Possible L2C prefetchers from prefetcher/*.l2c_pref " 48 | find prefetcher -name "*.l2c_pref" 49 | exit 1 50 | fi 51 | 52 | if [ ! -f ./prefetcher/${LLC_PREFETCHER}.llc_pref ]; then 53 | echo "[ERROR] Cannot find LLC prefetcher" 54 | echo "[ERROR] Possible LLC prefetchers from prefetcher/*.llc_pref " 55 | find prefetcher -name "*.llc_pref" 56 | exit 1 57 | fi 58 | 59 | if [ ! -f ./replacement/${LLC_REPLACEMENT}.llc_repl ]; then 60 | echo "[ERROR] Cannot find LLC replacement policy" 61 | echo "[ERROR] Possible LLC replacement policy from replacement/*.llc_repl" 62 | find replacement -name "*.llc_repl" 63 | exit 1 64 | fi 65 | 66 | # Check num_core 67 | re='^[0-9]+$' 68 | if ! [[ $NUM_CORE =~ $re ]] ; then 69 | echo "[ERROR]: num_core is NOT a number" >&2; 70 | exit 1 71 | fi 72 | 73 | # Check for multi-core 74 | if [ "$NUM_CORE" -gt "1" ]; then 75 | echo "Building multi-core ChampSim..." 
76 | sed -i.bak 's/\/NUM_CPUS '${NUM_CORE}'/g' inc/champsim.h 77 | # sed -i.bak 's/\/DRAM_CHANNELS 2/g' inc/champsim.h 78 | # sed -i.bak 's/\/DRAM_CHANNELS_LOG2 1/g' inc/champsim.h 79 | else 80 | if [ "$NUM_CORE" -lt "1" ]; then 81 | echo "Number of core: $NUM_CORE must be greater or equal than 1" 82 | exit 1 83 | else 84 | echo "Building single-core ChampSim..." 85 | fi 86 | fi 87 | echo 88 | 89 | # Change prefetchers and replacement policy 90 | cp branch/${BRANCH}.bpred branch/branch_predictor.cc 91 | cp prefetcher/${L1I_PREFETCHER}.l1i_pref prefetcher/l1i_prefetcher.cc 92 | cp prefetcher/${L1D_PREFETCHER}.l1d_pref prefetcher/l1d_prefetcher.cc 93 | cp prefetcher/${L2C_PREFETCHER}.l2c_pref prefetcher/l2c_prefetcher.cc 94 | cp prefetcher/${LLC_PREFETCHER}.llc_pref prefetcher/llc_prefetcher.cc 95 | cp replacement/${LLC_REPLACEMENT}.llc_repl replacement/llc_replacement.cc 96 | 97 | # Build 98 | mkdir -p bin 99 | rm -f bin/champsim 100 | make clean 101 | make 102 | 103 | # Sanity check 104 | echo "" 105 | if [ ! -f bin/champsim ]; then 106 | echo "${BOLD}ChampSim build FAILED!" 
107 | echo "" 108 | exit 1 109 | fi 110 | 111 | echo "${BOLD}ChampSim is successfully built" 112 | echo "Branch Predictor: ${BRANCH}" 113 | echo "L1I Prefetcher: ${L1I_PREFETCHER}" 114 | echo "L1D Prefetcher: ${L1D_PREFETCHER}" 115 | echo "L2C Prefetcher: ${L2C_PREFETCHER}" 116 | echo "LLC Prefetcher: ${LLC_PREFETCHER}" 117 | echo "LLC Replacement: ${LLC_REPLACEMENT}" 118 | echo "Cores: ${NUM_CORE}" 119 | BINARY_NAME="${BRANCH}-${L1I_PREFETCHER}-${L1D_PREFETCHER}-${L2C_PREFETCHER}-${LLC_PREFETCHER}-${LLC_REPLACEMENT}-${NUM_CORE}core" 120 | echo "Binary: bin/${BINARY_NAME}" 121 | echo "" 122 | mv bin/champsim bin/${BINARY_NAME} 123 | 124 | 125 | # Restore to the default configuration 126 | sed -i.bak 's/\/NUM_CPUS 1/g' inc/champsim.h 127 | #sed -i.bak 's/\/DRAM_CHANNELS 1/g' inc/champsim.h 128 | #sed -i.bak 's/\/DRAM_CHANNELS_LOG2 0/g' inc/champsim.h 129 | 130 | cp branch/bimodal.bpred branch/branch_predictor.cc 131 | cp prefetcher/no.l1i_pref prefetcher/l1i_prefetcher.cc 132 | cp prefetcher/no.l1d_pref prefetcher/l1d_prefetcher.cc 133 | cp prefetcher/no.l2c_pref prefetcher/l2c_prefetcher.cc 134 | cp prefetcher/no.llc_pref prefetcher/llc_prefetcher.cc 135 | cp replacement/lru.llc_repl replacement/llc_replacement.cc 136 | -------------------------------------------------------------------------------- /cvp_tracer/README.md: -------------------------------------------------------------------------------- 1 | The cvp2champsim tracer comes as is with no guarantee that it covers every conversion case. 2 | 3 | The tracer is used to convert the traces from the 2nd Championship Value 4 | Prediction (CVP) to a ChampSim-friendly format. 
5 | 6 | CVP-1 Site: https://www.microarch.org/cvp1/ 7 | CVP-2 Site: https://www.microarch.org/cvp1/cvp2/rules.html 8 | 9 | To use the tracer first compile it using g++: 10 | 11 | g++ cvp2champsim.cc -o cvp_tracer 12 | 13 | To convert a trace execute: 14 | 15 | ./cvp_tracer TRACE_NAME.gz 16 | 17 | The ChampSim trace will be sent to standard output so to keep and compress the 18 | output trace run: 19 | 20 | ./cvp_tracer TRACE_NAME.gz | gzip > NEW_TRACE.champsim.gz 21 | 22 | Adding the "-v" flag will print the dissassembly of the CVP trace to standard 23 | error output as well as the ChampSim format to standard output. 24 | -------------------------------------------------------------------------------- /download.sh: -------------------------------------------------------------------------------- 1 | while read -r line; do 2 | arr=($line) 3 | mkdir -p $(dirname ${arr[0]}) 4 | done < download_links 5 | 6 | while read -r line; do 7 | arr=($line) 8 | echo Downloading ${arr[0]} from ${arr[1]} 9 | if ! 
[[ -e "${arr[0]}" ]]; then 10 | curl -L -o ${arr[0]} ${arr[1]} 11 | echo Downloading ${arr[0]} Done 12 | else 13 | echo ${arr[0]} File already exists 14 | fi 15 | done < download_links 16 | -------------------------------------------------------------------------------- /get_stats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('results_file', help='Path to ChampSim results file') 9 | parser.add_argument('--cache-level', default='LLC', choices=('L2', 'LLC'), help='Cache level to compute stats for (default: %(default)s)') 10 | parser.add_argument('--base', default=None, help='Path to ChampSim base settings results file with no prefetcher for more accurate statistics') 11 | 12 | return parser.parse_args() 13 | 14 | 15 | def read_file(path, cache_level): 16 | if path is None: 17 | return None 18 | 19 | expected_keys = ('ipc', 'total_miss', 'useful', 'useless', 'load_miss', 'rfo_miss', 'kilo_inst') 20 | data = {} 21 | with open(path, 'r') as f: 22 | for line in f: 23 | if 'Finished CPU' in line: 24 | data['ipc'] = float(line.split()[9]) 25 | data['kilo_inst'] = int(line.split()[4]) / 1000 26 | if cache_level not in line: 27 | continue 28 | line = line.strip() 29 | if 'LOAD' in line: 30 | data['load_miss'] = int(line.split()[-1]) 31 | elif 'RFO' in line: 32 | data['rfo_miss'] = int(line.split()[-1]) 33 | elif 'TOTAL' in line: 34 | data['total_miss'] = int(line.split()[-1]) 35 | elif 'USEFUL' in line: 36 | data['useful'] = int(line.split()[-3]) 37 | data['useless'] = int(line.split()[-1]) 38 | 39 | if not all(key in data for key in expected_keys): 40 | return None 41 | 42 | return data 43 | 44 | def main(args=None): 45 | print(args) 46 | results = read_file(args.results_file, args.cache_level) 47 | useful, useless, ipc, load_miss, rfo_miss, kilo_inst = ( 48 | results['useful'], 
results['useless'], results['ipc'], results['load_miss'], results['rfo_miss'], results['kilo_inst'] 49 | ) 50 | results_total_miss = load_miss + rfo_miss + useful 51 | total_miss = results_total_miss 52 | 53 | results_mpki = (load_miss + rfo_miss) / kilo_inst 54 | 55 | base = read_file(args.base, args.cache_level) 56 | if base is not None: 57 | base_total_miss, base_ipc = base['total_miss'], base['ipc'] 58 | base_mpki = base_total_miss / kilo_inst 59 | 60 | if useful + useless == 0: 61 | print('Accuracy: N/A [All prefetches were merged and were not useful or useless]') 62 | else: 63 | print('Accuracy:', useful / (useful + useless) * 100, '%') 64 | if total_miss == 0: 65 | print('Coverage: N/A [No misses. Did you run this simulation for long enough?]') 66 | else: 67 | print('Coverage:', useful / total_miss * 100, '%') 68 | print('MPKI:', results_mpki) 69 | if base is not None: 70 | print('MPKI Improvement:', (base_mpki - results_mpki) / base_mpki * 100, '%') 71 | print('IPC:', ipc) 72 | if base is not None: 73 | print('IPC Improvement:', (ipc - base_ipc) / base_ipc * 100, '%') 74 | 75 | if __name__ == '__main__': 76 | main(args=get_args()) 77 | -------------------------------------------------------------------------------- /inc/block.h: -------------------------------------------------------------------------------- 1 | #ifndef BLOCK_H 2 | #define BLOCK_H 3 | 4 | #include "champsim.h" 5 | #include "instruction.h" 6 | #include "set.h" 7 | 8 | // CACHE BLOCK 9 | class BLOCK { 10 | public: 11 | uint8_t valid, 12 | prefetch, 13 | dirty, 14 | used; 15 | 16 | int delta, 17 | depth, 18 | signature, 19 | confidence; 20 | 21 | uint64_t address, 22 | full_addr, 23 | tag, 24 | data, 25 | ip, 26 | cpu, 27 | instr_id; 28 | 29 | // replacement state 30 | uint32_t lru; 31 | 32 | BLOCK() { 33 | valid = 0; 34 | prefetch = 0; 35 | dirty = 0; 36 | used = 0; 37 | 38 | delta = 0; 39 | depth = 0; 40 | signature = 0; 41 | confidence = 0; 42 | 43 | address = 0; 44 | full_addr = 0; 45 | 
tag = 0; 46 | data = 0; 47 | cpu = 0; 48 | instr_id = 0; 49 | 50 | lru = 0; 51 | }; 52 | }; 53 | 54 | // DRAM CACHE BLOCK 55 | class DRAM_ARRAY { 56 | public: 57 | BLOCK **block; 58 | 59 | DRAM_ARRAY() { 60 | block = NULL; 61 | }; 62 | }; 63 | 64 | // message packet 65 | class PACKET { 66 | public: 67 | uint8_t instruction, 68 | is_data, 69 | fill_l1i, 70 | fill_l1d, 71 | tlb_access, 72 | scheduled, 73 | translated, 74 | fetched, 75 | prefetched, 76 | drc_tag_read; 77 | 78 | int fill_level, 79 | pf_origin_level, 80 | rob_signal, 81 | rob_index, 82 | producer, 83 | delta, 84 | depth, 85 | signature, 86 | confidence; 87 | 88 | uint32_t pf_metadata; 89 | 90 | uint8_t is_producer, 91 | //rob_index_depend_on_me[ROB_SIZE], 92 | //lq_index_depend_on_me[ROB_SIZE], 93 | //sq_index_depend_on_me[ROB_SIZE], 94 | instr_merged, 95 | load_merged, 96 | store_merged, 97 | returned, 98 | asid[2], 99 | type; 100 | 101 | fastset 102 | rob_index_depend_on_me, 103 | lq_index_depend_on_me, 104 | sq_index_depend_on_me; 105 | 106 | uint32_t cpu, data_index, lq_index, sq_index; 107 | 108 | uint64_t address, 109 | full_addr, 110 | instruction_pa, 111 | data_pa, 112 | data, 113 | instr_id, 114 | ip, 115 | event_cycle, 116 | cycle_enqueued; 117 | 118 | PACKET() { 119 | instruction = 0; 120 | is_data = 1; 121 | fill_l1i = 0; 122 | fill_l1d = 0; 123 | tlb_access = 0; 124 | scheduled = 0; 125 | translated = 0; 126 | fetched = 0; 127 | prefetched = 0; 128 | drc_tag_read = 0; 129 | 130 | returned = 0; 131 | asid[0] = UINT8_MAX; 132 | asid[1] = UINT8_MAX; 133 | type = 0; 134 | 135 | fill_level = -1; 136 | rob_signal = -1; 137 | rob_index = -1; 138 | producer = -1; 139 | delta = 0; 140 | depth = 0; 141 | signature = 0; 142 | confidence = 0; 143 | 144 | #if 0 145 | for (uint32_t i=0; i 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | 
// USEFUL MACROS 23 | //#define DEBUG_PRINT 24 | #define SANITY_CHECK 25 | #define LLC_BYPASS 26 | #define DRC_BYPASS 27 | #define NO_CRC2_COMPILE 28 | 29 | #ifdef DEBUG_PRINT 30 | #define DP(x) x 31 | #else 32 | #define DP(x) 33 | #endif 34 | 35 | // CPU 36 | #define NUM_CPUS 1 37 | #define CPU_FREQ 4000 38 | #define DRAM_IO_FREQ 3200 39 | #define PAGE_SIZE 4096 40 | #define LOG2_PAGE_SIZE 12 41 | 42 | // CACHE 43 | #define BLOCK_SIZE 64 44 | #define LOG2_BLOCK_SIZE 6 45 | #define MAX_READ_PER_CYCLE 8 46 | #define MAX_FILL_PER_CYCLE 1 47 | 48 | #define INFLIGHT 1 49 | #define COMPLETED 2 50 | 51 | #define FILL_L1 1 52 | #define FILL_L2 2 53 | #define FILL_LLC 4 54 | #define FILL_DRC 8 55 | #define FILL_DRAM 16 56 | 57 | // DRAM 58 | #define DRAM_CHANNELS 1 // default: assuming one DIMM per one channel 4GB * 1 => 4GB off-chip memory 59 | #define LOG2_DRAM_CHANNELS 0 60 | #define DRAM_RANKS 1 // 512MB * 8 ranks => 4GB per DIMM 61 | #define LOG2_DRAM_RANKS 0 62 | #define DRAM_BANKS 8 // 64MB * 8 banks => 512MB per rank 63 | #define LOG2_DRAM_BANKS 3 64 | #define DRAM_ROWS 65536 // 2KB * 32K rows => 64MB per bank 65 | #define LOG2_DRAM_ROWS 16 66 | #define DRAM_COLUMNS 128 // 64B * 32 column chunks (Assuming 1B DRAM cell * 8 chips * 8 transactions = 64B size of column chunks) => 2KB per row 67 | #define LOG2_DRAM_COLUMNS 7 68 | #define DRAM_ROW_SIZE (BLOCK_SIZE*DRAM_COLUMNS/1024) 69 | 70 | #define DRAM_SIZE (DRAM_CHANNELS*DRAM_RANKS*DRAM_BANKS*DRAM_ROWS*DRAM_ROW_SIZE/1024) 71 | #define DRAM_PAGES ((DRAM_SIZE<<10)>>2) 72 | //#define DRAM_PAGES 10 73 | 74 | using namespace std; 75 | 76 | extern uint8_t warmup_complete[NUM_CPUS], 77 | simulation_complete[NUM_CPUS], 78 | all_warmup_complete, 79 | all_simulation_complete, 80 | MAX_INSTR_DESTINATIONS, 81 | knob_cloudsuite, 82 | knob_low_bandwidth, 83 | prefetch_warmup_complete; 84 | 85 | extern uint64_t current_core_cycle[NUM_CPUS], 86 | stall_cycle[NUM_CPUS], 87 | last_drc_read_mode, 88 | last_drc_write_mode, 89 | 
drc_blocks; 90 | 91 | extern queue page_queue; 92 | extern map page_table, inverse_table, recent_page, unique_cl[NUM_CPUS]; 93 | extern uint64_t previous_ppage, num_adjacent_page, num_cl[NUM_CPUS], allocated_pages, num_page[NUM_CPUS], minor_fault[NUM_CPUS], major_fault[NUM_CPUS]; 94 | 95 | void print_stats(); 96 | uint64_t rotl64 (uint64_t n, unsigned int c), 97 | rotr64 (uint64_t n, unsigned int c), 98 | va_to_pa(uint32_t cpu, uint64_t instr_id, uint64_t va, uint64_t unique_vpage, uint8_t is_code); 99 | bool check_ppage(uint32_t cpu, uint64_t ppage); 100 | // log base 2 function from efectiu 101 | int lg2(int n); 102 | 103 | // smart random number generator 104 | class RANDOM { 105 | public: 106 | std::random_device rd; 107 | std::mt19937_64 engine{rd()}; 108 | std::uniform_int_distribution dist{0, 0xFFFFFFFFF}; // used to generate random physical page numbers 109 | 110 | RANDOM (uint64_t seed) { 111 | engine.seed(seed); 112 | } 113 | 114 | uint64_t draw_rand() { 115 | return dist(engine); 116 | }; 117 | }; 118 | extern uint64_t champsim_seed; 119 | #endif 120 | -------------------------------------------------------------------------------- /inc/dram_controller.h: -------------------------------------------------------------------------------- 1 | #ifndef DRAM_H 2 | #define DRAM_H 3 | 4 | #include "memory_class.h" 5 | 6 | // DRAM configuration 7 | #define DRAM_CHANNEL_WIDTH 8 // 8B 8 | #define DRAM_WQ_SIZE 64 9 | #define DRAM_RQ_SIZE 64 10 | 11 | #define tRP_DRAM_NANOSECONDS 12.5 12 | #define tRCD_DRAM_NANOSECONDS 12.5 13 | #define tCAS_DRAM_NANOSECONDS 12.5 14 | 15 | // the data bus must wait this amount of time when switching between reads and writes, and vice versa 16 | #define DRAM_DBUS_TURN_AROUND_TIME ((15*CPU_FREQ)/2000) // 7.5 ns 17 | extern uint32_t DRAM_MTPS, DRAM_DBUS_RETURN_TIME; 18 | 19 | // these values control when to send out a burst of writes 20 | #define DRAM_WRITE_HIGH_WM ((DRAM_WQ_SIZE*7)>>3) // 7/8th 21 | #define DRAM_WRITE_LOW_WM 
((DRAM_WQ_SIZE*3)>>2) // 6/8th 22 | #define MIN_DRAM_WRITES_PER_SWITCH (DRAM_WQ_SIZE*1/4) 23 | 24 | // DRAM 25 | class MEMORY_CONTROLLER : public MEMORY { 26 | public: 27 | const string NAME; 28 | 29 | DRAM_ARRAY dram_array[DRAM_CHANNELS][DRAM_RANKS][DRAM_BANKS]; 30 | uint64_t dbus_cycle_available[DRAM_CHANNELS], dbus_cycle_congested[DRAM_CHANNELS], dbus_congested[NUM_TYPES+1][NUM_TYPES+1]; 31 | uint64_t bank_cycle_available[DRAM_CHANNELS][DRAM_RANKS][DRAM_BANKS]; 32 | uint8_t do_write, write_mode[DRAM_CHANNELS]; 33 | uint32_t processed_writes, scheduled_reads[DRAM_CHANNELS], scheduled_writes[DRAM_CHANNELS]; 34 | int fill_level; 35 | 36 | BANK_REQUEST bank_request[DRAM_CHANNELS][DRAM_RANKS][DRAM_BANKS]; 37 | 38 | // queues 39 | PACKET_QUEUE WQ[DRAM_CHANNELS], RQ[DRAM_CHANNELS]; 40 | 41 | // constructor 42 | MEMORY_CONTROLLER(string v1) : NAME (v1) { 43 | for (uint32_t i=0; i> LOG2_PAGE_SIZE; 142 | int tag = curr_page & 0xFFFF, 143 | hit = 0, match = -1, 144 | L2_ST_idx = curr_page % L2_ST_PRIME, 145 | curr_block = (addr >> LOG2_BLOCK_SIZE) & 0x3F; 146 | SIGNATURE_TABLE *table = L2_ST[cpu][L2_ST_idx]; 147 | int delta_buffer = 0, sig_buffer = 0; 148 | 149 | for (match=0; match> LOG2_BLOCK_SIZE, curr_page, curr_block, table[match].signature, delta_buffer)); 170 | } 171 | else { 172 | hit = 1; 173 | table[match].first_hit = 0; 174 | 175 | if (delta_buffer) { 176 | // This is non-speculative information tracked from actual L2 cache demand 177 | // Now, the old signature will be associated with current delta 178 | L2_PT_update(cpu, sig_buffer, delta_buffer); 179 | } 180 | else 181 | break; 182 | 183 | if (warmup_complete[cpu]) 184 | L2_PF_DEBUG(printf("ST_hit cpu: %d cl_addr: %lx page: %lx block: %d old_sig: %x delta: %d\n", 185 | cpu, addr >> LOG2_BLOCK_SIZE, curr_page, curr_block, sig_buffer, delta_buffer)); 186 | 187 | // Update signature 188 | int new_signature = get_new_signature(sig_buffer, delta_buffer); 189 | table[match].signature = new_signature; 190 | 
l2_sig_dist[cpu][table[match].signature]++; 191 | } 192 | 193 | // Update last_block 194 | table[match].last_block = curr_block; 195 | L2_ST_hit[cpu]++; L2_ST_access[cpu]++; 196 | break; 197 | } 198 | } 199 | 200 | if (match == L2_ST_WAY) { 201 | for (match=0; match> LOG2_BLOCK_SIZE, curr_page, curr_block)); 213 | break; 214 | } 215 | } 216 | } 217 | 218 | if (match == L2_ST_WAY) { // Miss 219 | // Search for LRU victim 220 | for (match=0; match> LOG2_BLOCK_SIZE, curr_page, curr_block, table[match].lru)); 239 | L2_ST_miss[cpu]++; L2_ST_access[cpu]++; 240 | 241 | #ifdef L2_GHR_ON 242 | // Check GHR 243 | int ghr_max = 0, ghr_idx = -1, spec_block = 0, spec_sig = 0; 244 | for (int i=0; i= 64) 247 | spec_block -= 64; 248 | else if (spec_block < 0) 249 | spec_block += 64; 250 | if ((spec_block == curr_block) && (ghr_max <= L2_GHR[cpu][i].path_conf)) { 251 | ghr_max = L2_GHR[cpu][i].path_conf; 252 | ghr_idx = i; 253 | spec_sig = get_new_signature(L2_GHR[cpu][i].signature, L2_GHR[cpu][i].oop_delta); 254 | if (warmup_complete[cpu]) 255 | L2_PF_DEBUG(printf("cpu: %d OOP_match L2_GHR[%d] signature: %x path_conf: %d last_block: %d oop_delta: %d spec_block: %d == curr_block: %d spec_sig: %x\n", 256 | cpu, i, L2_GHR[cpu][i].signature, L2_GHR[cpu][i].path_conf, L2_GHR[cpu][i].last_block, 257 | L2_GHR[cpu][i].oop_delta, spec_block, curr_block, spec_sig)); 258 | } 259 | else { 260 | if (warmup_complete[cpu]) 261 | L2_PF_DEBUG(printf("cpu: %d OOP_unmatch L2_GHR[%d] signature: %x path_conf: %d last_block: %d oop_delta: %d spec_block: %d != curr_block: %d spec_sig: %x\n", 262 | cpu, i, L2_GHR[cpu][i].signature, L2_GHR[cpu][i].path_conf, L2_GHR[cpu][i].last_block, 263 | L2_GHR[cpu][i].oop_delta, spec_block, curr_block, spec_sig)); 264 | } 265 | } 266 | 267 | if (ghr_idx >= 0) { 268 | // Speculatively update first page 269 | spec_sig = get_new_signature(L2_GHR[cpu][ghr_idx].signature, L2_GHR[cpu][ghr_idx].oop_delta); 270 | 271 | hit = 1; 272 | table[match].signature = spec_sig; 273 | 
if (warmup_complete[cpu]) 274 | L2_PF_DEBUG(printf("cpu: %d spec_update page: %x sig: %3x delta: %3d curr_block: %2d last_block[NA]: %2d\n", 275 | cpu, tag, spec_sig, L2_GHR[cpu][ghr_idx].oop_delta, curr_block, L2_GHR[cpu][ghr_idx].last_block)); 276 | } 277 | #endif 278 | } 279 | 280 | // Update LRU 281 | int position = table[match].lru; 282 | for (int i=0; i> LOG2_PAGE_SIZE; 297 | int tag = curr_page & 0xFFFF, 298 | match = -1, 299 | L2_ST_idx = curr_page % L2_ST_PRIME; 300 | 301 | SIGNATURE_TABLE *table = L2_ST[cpu][L2_ST_idx]; 302 | 303 | for (match=0; match> LOG2_BLOCK_SIZE, curr_page, (addr >> LOG2_BLOCK_SIZE) & 0x3F, table[match].signature, table[match].last_block)); 308 | return match; 309 | } 310 | } 311 | 312 | if (warmup_complete[cpu]) 313 | L2_PF_DEBUG(printf("ST_check not found cpu: %d cl_addr: %lx page: %lx block: %ld\n", cpu, addr >> LOG2_BLOCK_SIZE, curr_page, (addr >> LOG2_BLOCK_SIZE) & 0x3F)); 314 | return -1; 315 | } 316 | 317 | void L2_PT_update(uint32_t cpu, int signature, int delta) 318 | { 319 | int L2_PT_idx = signature % L2_PT_PRIME; 320 | PATTERN_TABLE *table = L2_PT[cpu][L2_PT_idx]; 321 | 322 | // Update L2_PT 323 | // Update metadata 324 | table[0].c_sig++; 325 | 326 | if (table[0].c_sig == (CSIG_MAX)) 327 | { 328 | table[0].c_sig = CSIG_MAX >> 1; 329 | for (int i = 0; i> 1; 331 | if (warmup_complete[cpu]) 332 | L2_PF_DEBUG(printf("PT_sig: %4x cpu: %d c_sig saturated sig_total: %d => %d\n", L2_PT_idx, cpu, CSIG_MAX, table[0].c_sig)); 333 | } 334 | 335 | int match; 336 | for (match=0; match 0) 425 | { 426 | dynamic_fill_thrs[cpu]--; 427 | fill_down++; 428 | conf_level[dynamic_fill_thrs[cpu]]++; 429 | 430 | printf("FILL_THRESHOLD goes down %d => %d at cycle: %ld\n", dynamic_fill_thrs[cpu]+1, dynamic_fill_thrs[cpu], ooo_cpu[cpu].current_cycle); 431 | } 432 | 433 | conf_counter[cpu] = 0; 434 | } 435 | 436 | l2pf_was_useful++; 437 | } 438 | else 439 | { 440 | if (conf_counter[cpu] > 0) 441 | conf_counter[cpu]--; 442 | 443 | l2pf_was_useless++; 
444 | } 445 | 446 | l2pf_match++; 447 | } 448 | 449 | break; 450 | } 451 | } 452 | l2pf_signal++; 453 | 454 | return; 455 | */ 456 | } 457 | 458 | #endif 459 | -------------------------------------------------------------------------------- /inc/memory_class.h: -------------------------------------------------------------------------------- 1 | #ifndef MEMORY_CLASS_H 2 | #define MEMORY_CLASS_H 3 | 4 | #include "champsim.h" 5 | #include "block.h" 6 | 7 | // CACHE ACCESS TYPE 8 | #define LOAD 0 9 | #define RFO 1 10 | #define PREFETCH 2 11 | #define WRITEBACK 3 12 | #define NUM_TYPES 4 13 | 14 | extern uint32_t tRP, // Row Precharge (RP) latency 15 | tRCD, // Row address to Column address (RCD) latency 16 | tCAS; // Column Address Strobe (CAS) latency 17 | 18 | extern uint64_t l2pf_access; 19 | 20 | class MEMORY { 21 | public: 22 | // memory interface 23 | MEMORY *upper_level_icache[NUM_CPUS], *upper_level_dcache[NUM_CPUS], *lower_level, *extra_interface; 24 | 25 | // empty queues 26 | PACKET_QUEUE WQ{"EMPTY", 1}, RQ{"EMPTY", 1}, PQ{"EMPTY", 1}, MSHR{"EMPTY", 1}; 27 | 28 | // functions 29 | virtual int add_rq(PACKET *packet) = 0; 30 | virtual int add_wq(PACKET *packet) = 0; 31 | virtual int add_pq(PACKET *packet) = 0; 32 | virtual void return_data(PACKET *packet) = 0; 33 | virtual void operate() = 0; 34 | virtual void increment_WQ_FULL(uint64_t address) = 0; 35 | virtual uint32_t get_occupancy(uint8_t queue_type, uint64_t address) = 0; 36 | virtual uint32_t get_size(uint8_t queue_type, uint64_t address) = 0; 37 | 38 | // stats 39 | uint64_t ACCESS[NUM_TYPES], HIT[NUM_TYPES], MISS[NUM_TYPES], MSHR_MERGED[NUM_TYPES], STALL[NUM_TYPES]; 40 | 41 | MEMORY() { 42 | for (uint32_t i=0; i 10 | #include 11 | #include 12 | #include 13 | 14 | #define TYPE unsigned short int 15 | //#define MAX_SIZE ROB_SIZE 16 | // sethpugsley - changed this from ROB_SIZE to allow for non-power-of-2 ROB sizes, like real CPUs have 17 | // but MAX_SIZE here still requires a power-of-2 number 18 | 
#define MAX_SIZE 512 19 | 20 | // tuned empirically 21 | 22 | #define SMALL_SIZE 13 23 | #define SMALLER_SIZE 6 24 | 25 | class fastset { 26 | union { 27 | // values for a small set 28 | TYPE 29 | values[SMALL_SIZE]; 30 | 31 | // the bits representing the set 32 | unsigned long long int 33 | bits[MAX_SIZE/64]; 34 | } data; 35 | 36 | int 37 | card; // cardinality of small set 38 | 39 | // set a bit in the bits 40 | 41 | void setbit (TYPE x) { 42 | int word = x >> 6; 43 | int bit = x & 63; 44 | data.bits[word] |= 1ull << bit; 45 | } 46 | 47 | // get one of the bits 48 | 49 | bool getbit (TYPE x) { 50 | int word = x >> 6; 51 | int bit = x & 63; 52 | return (data.bits[word] >> bit) & 1; 53 | } 54 | 55 | // insert an item into a small set 56 | 57 | void insert_small (TYPE x) { 58 | int i; 59 | for (i=0; i x) break; 63 | } 64 | // x belongs in i; move everything from v[i] through v[n-1] 65 | // to v[i+1] through v[n] 66 | for (int j=card-1; j>=i; j--) data.values[j+1] = data.values[j]; 67 | // the loop seems a little faster than memmove 68 | //memmove (&data.values[i+1], &data.values[i], (sizeof (TYPE) * (card-i))); 69 | data.values[i] = x; 70 | card++; 71 | } 72 | 73 | 74 | // do a linear search in a small set 75 | 76 | bool search_small_linear (TYPE x) { 77 | for (int i=0; i x) return false; 80 | if (y == x) return true; 81 | } 82 | return false; 83 | } 84 | 85 | 86 | // search a small set, specializing for the set size 87 | 88 | bool search_small (TYPE x) { 89 | 90 | // no elements? we're done. 
91 | 92 | if (!card) return false; 93 | 94 | // below a certain size linear search is faster 95 | 96 | if (card < SMALLER_SIZE) return search_small_linear (x); 97 | 98 | // do a binary search for the item 99 | 100 | int begin = 0; 101 | int end = card-1; 102 | int middle = end/2; 103 | for (;;) { 104 | TYPE y = data.values[middle]; 105 | if (x < y) { 106 | end = middle-1; 107 | } else if (x > y) { 108 | begin = middle+1; 109 | } else return true; 110 | if (end < begin) break; 111 | middle = (begin + end) / 2; 112 | // assert (middle < card && middle >= 0); 113 | } 114 | return false; 115 | } 116 | 117 | // convert a small set into a bitset 118 | 119 | void smalltobit (void) { 120 | 121 | // we have to use a temporary array to hold the small set contents 122 | // because the small set and bitset occupy the same memory 123 | 124 | TYPE tmp[SMALL_SIZE]; 125 | memcpy (tmp, data.values, sizeof (TYPE) * card); 126 | memset (data.bits, 0, sizeof (data.bits)); 127 | for (int i=0; i= SMALL_SIZE); 208 | } 209 | 210 | // lim is the next multiple of 64 211 | 212 | int lim = ((n | 63) + 1) / 64; 213 | 214 | // bitwise OR the other bits into this set 215 | for (int i=0; i> 6) + 1) << 6)) 88 | prefetches.append((instr_id, ((load_addr >> 6) + 2) << 6)) 89 | 90 | return prefetches 91 | 92 | ''' 93 | # Example PyTorch Model 94 | import torch 95 | import torch.nn as nn 96 | 97 | class PytorchMLModel(nn.Module): 98 | 99 | def __init__(self): 100 | super().__init__() 101 | # Initialize your neural network here 102 | # For example 103 | self.embedding = nn.Embedding(...) 104 | self.fc = nn.Linear(...) 105 | 106 | def forward(self, x): 107 | # Forward pass for your model here 108 | # For example 109 | return self.relu(self.fc(self.embedding(x))) 110 | 111 | class TerribleMLModel(MLPrefetchModel): 112 | """ 113 | This class effectively functions as a wrapper around the above custom 114 | pytorch nn.Module. 
You can approach this in another way so long as the the 115 | load/save/train/generate functions behave as described above. 116 | 117 | Disclaimer: It's terrible since the below criterion assumes a gold Y label 118 | for the prefetches, which we don't really have. In any case, the below 119 | structure more or less shows how one would use a ML framework with this 120 | script. Happy coding / researching! :) 121 | """ 122 | 123 | def __init__(self): 124 | self.model = PytorchMLModel() 125 | 126 | def load(self, path): 127 | self.model = torch.load_state_dict(torch.load(path)) 128 | 129 | def save(self, path): 130 | torch.save(self.model.state_dict(), path) 131 | 132 | def train(self, data): 133 | # Just standard run-time here 134 | self.model.train() 135 | criterion = nn.CrossEntropyLoss() 136 | optimizer = nn.optim.Adam(self.model.parameters()) 137 | scheduler = nn.optim.lr_scheduler.StepLR(optimizer, step_size=0.1) 138 | for epoch in range(20): 139 | # Assuming batch(...) is a generator over the data 140 | for i, (x, y) in enumerate(batch(data)): 141 | y_pred = self.model(x) 142 | loss = criterion(y_pred, y) 143 | 144 | if i % 100 == 0: 145 | print('Loss:', loss.item()) 146 | 147 | optimizer.zero_grad() 148 | loss.backward() 149 | optimizer.step() 150 | scheduler.step() 151 | 152 | def generate(self, data): 153 | self.model.eval() 154 | prefetches = [] 155 | for i, (x, _) in enumerate(batch(data, random=False)): 156 | y_pred = self.model(x) 157 | 158 | for xi, yi in zip(x, y_pred): 159 | # Where instr_id is a function that extracts the unique instr_id 160 | prefetches.append((instr_id(xi), yi)) 161 | 162 | return prefetches 163 | ''' 164 | 165 | # Replace this if you create your own model 166 | Model = NextLineModel 167 | -------------------------------------------------------------------------------- /prefetcher/bo.h: -------------------------------------------------------------------------------- 1 | #ifndef __BO_H 2 | #define __BO_H 3 | 4 | 
//###################################################################################### 5 | // BEST OFFSET PREFETCHER 6 | //###################################################################################### 7 | 8 | #include 9 | #include 10 | #include "cache.h" 11 | #include 12 | 13 | using namespace std; 14 | 15 | // Submission ID: 3 16 | 17 | // Paper title: A Best-Offset Prefetcher 18 | 19 | // Author: Pierre Michaud 20 | 21 | // (Modified to be a LLC prefetcher by Akanksha Jain) 22 | // Prefetch Throttling is disabled since MSHR info is not available 23 | //###################################################################################### 24 | // PREFETCHER PARAMETERS 25 | //###################################################################################### 26 | 27 | // Because prefetch cannot cross 4KB-page boundaries, there is no need to consider offsets 28 | // greater than 63. However, with pages larger than 4KB, it would be beneficial to consider 29 | // larger offsets. 30 | 31 | #define NOFFSETS 46 32 | int OFFSET[NOFFSETS] = {1,-1,2,-2,3,-3,4,-4,5,-5,6,-6,7,-7,8,-8,9,-9,10,-10,11,-11,12,-12,13,-13,14,-14,15,-15,16,-16,18,-18,20,-20,24,-24,30,-30,32,-32,36,-36,40,-40}; 33 | #define DEFAULT_OFFSET 1 34 | #define SCORE_MAX 31 35 | #define ROUND_MAX 100 36 | #define RRINDEX 6 37 | #define RRTAG 12 38 | #define DELAYQSIZE 15 39 | #define DELAY 60 40 | #define TIME_BITS 12 41 | //#define LLC_RATE_MAX 255 42 | //#define GAUGE_MAX 8191 43 | //#define MSHR_THRESHOLD_MAX (LLC_MSHR_SIZE-4) 44 | //#define MSHR_THRESHOLD_MIN 2 45 | #define LOW_SCORE 20 46 | //#define BAD_SCORE ((knob_small_llc)? 10 : 1) 47 | #define BAD_SCORE 10 48 | //#define BANDWIDTH ((knob_low_bandwidth)? 
64 : 16) 49 | //###################################################################################### 50 | // PREFETCHER STATE 51 | //###################################################################################### 52 | 53 | int prefetch_offset; // 7 bits (6-bit value + 1 sign bit) 54 | 55 | // Recent Requests (RR) table: 2 banks, 64 entries per bank, RRTAG bits per entry 56 | int recent_request[2][1<> 6) == 0) 99 | 100 | #define INCREMENT(x,n) {x++; if (x==(n)) x=0;} 101 | 102 | #define TRUNCATE(x,nbits) (((x) & ((1<<(nbits))-1))) 103 | 104 | typedef long long t_addr; 105 | 106 | 107 | 108 | //###################################################################################### 109 | // RECENT REQUESTS TABLE (RR) 110 | //###################################################################################### 111 | 112 | void rr_init() 113 | { 114 | int i; 115 | for (i=0; i<(1<>RRINDEX,RRTAG); 125 | } 126 | 127 | 128 | int rr_index_left(t_addr lineaddr) 129 | { 130 | return TRUNCATE(lineaddr^(lineaddr>>RRINDEX),RRINDEX); 131 | } 132 | 133 | 134 | int rr_index_right(t_addr lineaddr) 135 | { 136 | return TRUNCATE(lineaddr^(lineaddr>>(2*RRINDEX)),RRINDEX); 137 | } 138 | 139 | 140 | void rr_insert_left(t_addr lineaddr) 141 | { 142 | int i = rr_index_left(lineaddr); 143 | recent_request[0][i] = rr_tag(lineaddr); 144 | } 145 | 146 | 147 | void rr_insert_right(t_addr lineaddr) 148 | { 149 | int i = rr_index_right(lineaddr); 150 | recent_request[1][i] = rr_tag(lineaddr); 151 | } 152 | 153 | 154 | int rr_hit(t_addr lineaddr) 155 | { 156 | int i = rr_index_left(lineaddr); 157 | int j = rr_index_right(lineaddr); 158 | int tag = rr_tag(lineaddr); 159 | return (recent_request[0][i] == tag) || (recent_request[1][j] == tag); 160 | } 161 | 162 | 163 | 164 | //###################################################################################### 165 | // DELAY QUEUE (DQ) 166 | //###################################################################################### 167 | 168 | 
// Without the delay queue, the prefetcher would always try to select an offset value 169 | // large enough for having timely prefetches. However, sometimes, a small offset yields 170 | // late prefetches but greater prefetch accuracy and better performance. The delay queue 171 | // is an imperfect solution to this problem. 172 | 173 | // This implementation of the delay queue is specific to the DPC2 simulator, as the DPC2 174 | // prefetcher can act only at certain clock cycles. In a real processor, the delay queue 175 | // implementation can be simpler. 176 | 177 | 178 | void dq_init() 179 | { 180 | int i; 181 | for (i=0; i= issuecycle) { 219 | return (cycle < issuecycle) || (cycle >= readycycle); 220 | } else { 221 | return (cycle < issuecycle) && (cycle >= readycycle); 222 | } 223 | } 224 | 225 | 226 | void dq_pop() 227 | { 228 | // dequeue the entries that are ready to be dequeued, 229 | // and do a write in the "left" bank of the RR table for each of them 230 | int i; 231 | for (i=0; i LOW_SCORE) || (pt.llc_rate > (2*BANDWIDTH))) { 269 | // prefetch accuracy not too bad, or low bandwidth requirement 270 | // ==> maximum prefetch aggressiveness 271 | pt.mshr_threshold = MSHR_THRESHOLD_MAX; 272 | } else if (pt.llc_rate < BANDWIDTH) { 273 | // LLC access rate exceeds memory bandwidth, implying that there are some LLC hits. 274 | // If there are more LLC misses than hits, perhaps memory bandwidth saturates. 275 | // If there are more LLC hits than misses, the MSHR is probably not stressed. 276 | // So we set the MSHR threshold low. 277 | pt.mshr_threshold = MSHR_THRESHOLD_MIN; 278 | } else { 279 | // in-between situation: we set the MSHR threshold proportionally to the (inverse) LLC rate 280 | pt.mshr_threshold = MSHR_THRESHOLD_MIN + (MSHR_THRESHOLD_MAX-MSHR_THRESHOLD_MIN) * (double) (pt.llc_rate - BANDWIDTH) / BANDWIDTH; 281 | } 282 | } 283 | 284 | 285 | // The pt_llc_access function estimates the average time between consecutive LLC accesses. 
286 | // It is called on every LLC access. 287 | 288 | void pt_llc_access() 289 | { 290 | // update the gauge 291 | int cycle = TRUNCATE(current_core_cycle[0],TIME_BITS); 292 | int dt = TRUNCATE(cycle - pt.last_cycle,TIME_BITS); 293 | pt.last_cycle = cycle; 294 | pt.llc_rate_gauge += dt - pt.llc_rate; 295 | 296 | // if the gauge reaches its upper limit, increment the rate counter 297 | // if the gauge reaches its lower limit, decrement the rate counter 298 | // otherwise leave the rate counter unchanged 299 | if (pt.llc_rate_gauge > GAUGE_MAX) { 300 | pt.llc_rate_gauge = GAUGE_MAX; 301 | if (pt.llc_rate < LLC_RATE_MAX) { 302 | pt.llc_rate++; 303 | pt_update_mshr_threshold(); 304 | } 305 | } else if (pt.llc_rate_gauge < 0) { 306 | pt.llc_rate_gauge = 0; 307 | if (pt.llc_rate > 0) { 308 | pt.llc_rate--; 309 | pt_update_mshr_threshold(); 310 | } 311 | } 312 | } 313 | */ 314 | 315 | //###################################################################################### 316 | // OFFSETS SCORES (OS) 317 | //###################################################################################### 318 | 319 | // A method for determining the best offset value 320 | 321 | void os_reset() 322 | { 323 | int i; 324 | for (i=0; i increment the score 345 | os.score[os.p]++; 346 | if (os.score[os.p] >= os.max_score) { 347 | os.max_score = os.score[os.p]; 348 | os.best_offset = testoffset; 349 | } 350 | } 351 | 352 | if (os.p == (NOFFSETS-1)) { 353 | // one round finished 354 | os.round++; 355 | 356 | if ((os.max_score == SCORE_MAX) || (os.round == ROUND_MAX)) { 357 | // learning phase is finished, update the prefetch offset 358 | prefetch_offset = (os.best_offset != 0)? 
os.best_offset : DEFAULT_OFFSET; 359 | pt.prefetch_score = os.max_score; 360 | //pt_update_mshr_threshold(); 361 | 362 | if (os.max_score <= BAD_SCORE) { 363 | // prefetch accuracy is likely to be very low ==> turn the prefetch off 364 | prefetch_offset = 0; 365 | } 366 | // new learning phase starts 367 | os_reset(); 368 | return; 369 | } 370 | } 371 | INCREMENT(os.p,NOFFSETS); // prepare to test the next offset 372 | } 373 | 374 | 375 | //###################################################################################### 376 | // DPC2 INTERFACE 377 | //###################################################################################### 378 | 379 | 380 | void bo_prefetcher_initialize() { 381 | prefetch_offset = DEFAULT_OFFSET; 382 | rr_init(); 383 | os_reset(); 384 | dq_init(); 385 | //pt_init(); 386 | int i,j; 387 | for (i=0; i& prefetch_candidates) 395 | { 396 | t_addr lineaddr = addr >> LOGLINE; 397 | 398 | int s = set; 399 | int w = way; 400 | int llc_hit = (w < LLC_WAY); 401 | int prefetched = 0; 402 | assert(prefetch_candidates.size() == 0); 403 | 404 | if (llc_hit) { 405 | // read the prefetch bit, and reset it 406 | prefetched = prefetch_bit[s][w]; 407 | prefetch_bit[s][w] = 0; 408 | } 409 | else { 410 | //pt_llc_access(); 411 | } 412 | 413 | dq_pop(); 414 | 415 | //int prefetch_issued = 0; 416 | 417 | if (! llc_hit || prefetched ) { 418 | os_learn_best_offset(lineaddr); 419 | 420 | int offset = prefetch_offset; 421 | if (offset == 0) { 422 | // The prefetcher is currently turned off. 423 | // Just push the line address into the delay queue for best-offset learning. 424 | dq_push(lineaddr); 425 | //prefetch_issued = 0; 426 | } 427 | /*else if (! 
SAMEPAGE(lineaddr,lineaddr+offset)) { 428 | // crossing the page boundary, no prefetch request issued 429 | prefetch_issued = 0; 430 | }*/ 431 | else 432 | { 433 | dq_push(lineaddr); 434 | for(uint32_t i=1; i<=degree; i++) 435 | if (pt.prefetch_score > LOW_SCORE) 436 | prefetch_candidates.push_back((lineaddr+i*offset)<prefetch_line(ip ,lineaddr<> LOGLINE; 456 | 457 | // write the prefetch bit 458 | int s = set; 459 | int w = way; 460 | prefetch_bit[s][w] = prefetch; 461 | 462 | // write the "right" bank of the RR table 463 | t_addr baselineaddr; 464 | if (prefetch || (prefetch_offset == 0)) { 465 | baselineaddr = lineaddr - prefetch_offset; 466 | if (SAMEPAGE(lineaddr,baselineaddr)) { 467 | rr_insert_right(baselineaddr); 468 | } 469 | } 470 | } 471 | 472 | 473 | void bo_prefetcher_final_stats() { 474 | } 475 | 476 | #endif // __BO_H 477 | -------------------------------------------------------------------------------- /prefetcher/bo.llc_pref: -------------------------------------------------------------------------------- 1 | #include "bo.h" 2 | #define DEGREE 2 3 | 4 | void CACHE::llc_prefetcher_initialize() 5 | { 6 | bo_prefetcher_initialize(); 7 | } 8 | 9 | uint32_t CACHE::llc_prefetcher_operate(uint64_t addr, uint64_t ip, uint8_t cache_hit, uint8_t type, uint32_t metadata_in, uint64_t instr_id, uint64_t curr_cycle) 10 | { 11 | if(instr_id == 0) 12 | return metadata_in; 13 | 14 | vector bo_candidates; 15 | bo_prefetcher_operate(addr, ip, cache_hit, type, get_set(addr), get_way(addr, get_set(addr)), DEGREE, bo_candidates); 16 | for(uint32_t i=0; i 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #define MAX_PREFETCH_DEGREE 2 11 | 12 | unordered_map> prefetches; 13 | 14 | void CACHE::llc_prefetcher_initialize() 15 | { 16 | cout << "CPU " << cpu << " LLC from_file prefetcher" << endl; 17 | 18 | uint64_t line_no = 0; 19 | uint64_t instr_id, addr; 20 | 21 | while(cin >> dec >> instr_id >> hex >> addr) { 22 | auto itr = prefetches.find(instr_id); 23 | if 
(itr == prefetches.end()) { 24 | prefetches[instr_id] = vector(); 25 | prefetches[instr_id].push_back(addr); 26 | } else { 27 | if (prefetches[instr_id].size() < MAX_PREFETCH_DEGREE) { 28 | prefetches[instr_id].push_back(addr); 29 | } else { 30 | cerr << "Exceeded max prefetch degree of " << MAX_PREFETCH_DEGREE << " on line " << line_no << " for instr_id " << instr_id << endl; 31 | } 32 | } 33 | line_no++; 34 | } 35 | 36 | } 37 | 38 | uint32_t CACHE::llc_prefetcher_operate(uint64_t addr, uint64_t ip, uint8_t cache_hit, uint8_t type, uint32_t metadata_in, uint64_t instr_id, uint64_t curr_cycle) 39 | { 40 | if(instr_id == 0) return metadata_in; //No prefetches for instructions with id 0 (prefetches and code misses) 41 | 42 | auto itr = prefetches.find(instr_id); 43 | if (itr != prefetches.end()) { 44 | for(auto prefetch_addr : itr->second) { 45 | // cout << "Prefetch " << hex << prefetch_addr << dec << " for instr_id " << instr_id << endl; 46 | prefetch_line(ip, addr, prefetch_addr, FILL_LLC, 0); 47 | } 48 | } 49 | 50 | return metadata_in; 51 | } 52 | 53 | uint32_t CACHE::llc_prefetcher_cache_fill(uint64_t addr, uint32_t set, uint32_t way, uint8_t prefetch, uint64_t evicted_addr, uint32_t metadata_in) 54 | { 55 | return metadata_in; 56 | } 57 | 58 | void CACHE::llc_prefetcher_final_stats() 59 | { 60 | cout << "CPU " << cpu << " LLC from file prefetcher final stats" << endl; 61 | } 62 | -------------------------------------------------------------------------------- /prefetcher/ip_stride.l2c_pref: -------------------------------------------------------------------------------- 1 | // 2 | // From Data Prefetching Championship Simulator 2 3 | // Seth Pugsley, seth.h.pugsley@intel.com 4 | // 5 | 6 | /* 7 | 8 | This file describes an Instruction Pointer-based (Program Counter-based) stride prefetcher. 9 | The prefetcher detects stride patterns coming from the same IP, and then 10 | prefetches additional cache lines. 
11 | 12 | Prefetches are issued into the L2 or LLC depending on L2 MSHR occupancy. 13 | 14 | */ 15 | 16 | #include "cache.h" 17 | 18 | #define IP_TRACKER_COUNT 1024 19 | #define PREFETCH_DEGREE 3 20 | 21 | class IP_TRACKER { 22 | public: 23 | // the IP we're tracking 24 | uint64_t ip; 25 | 26 | // the last address accessed by this IP 27 | uint64_t last_cl_addr; 28 | 29 | // the stride between the last two addresses accessed by this IP 30 | int64_t last_stride; 31 | 32 | // use LRU to evict old IP trackers 33 | uint32_t lru; 34 | 35 | IP_TRACKER () { 36 | ip = 0; 37 | last_cl_addr = 0; 38 | last_stride = 0; 39 | lru = 0; 40 | }; 41 | }; 42 | 43 | IP_TRACKER trackers[IP_TRACKER_COUNT]; 44 | 45 | void CACHE::l2c_prefetcher_initialize() 46 | { 47 | cout << "CPU " << cpu << " L2C IP-based stride prefetcher" << endl; 48 | for (int i=0; i> LOG2_BLOCK_SIZE; 56 | 57 | int index = -1; 58 | for (index=0; index trackers[index].last_cl_addr) 96 | stride = cl_addr - trackers[index].last_cl_addr; 97 | else { 98 | stride = trackers[index].last_cl_addr - cl_addr; 99 | stride *= -1; 100 | } 101 | 102 | //cout << "[IP_STRIDE] HIT index: " << index << " lru: " << trackers[index].lru << " ip: " << hex << ip << " cl_addr: " << cl_addr << dec << " stride: " << stride << endl; 103 | 104 | // don't do anything if we somehow saw the same address twice in a row 105 | if (stride == 0) 106 | return metadata_in; 107 | 108 | // only do any prefetching if there's a pattern of seeing the same 109 | // stride more than once 110 | if (stride == trackers[index].last_stride) { 111 | 112 | // do some prefetching 113 | for (int i=0; i> LOG2_PAGE_SIZE) != (addr >> LOG2_PAGE_SIZE)) 119 | break; 120 | 121 | // check the MSHR occupancy to decide if we're going to prefetch to the L2 or LLC 122 | if (MSHR.occupancy < (MSHR.SIZE>>1)) 123 | prefetch_line(ip, addr, pf_address, FILL_L2, 0); 124 | else 125 | prefetch_line(ip, addr, pf_address, FILL_LLC, 0); 126 | } 127 | } 128 | 129 | trackers[index].last_cl_addr 
= cl_addr; 130 | trackers[index].last_stride = stride; 131 | 132 | for (int i=0; i> LOG2_PAGE_SIZE; 38 | int tag = curr_page & 0xFFFF, 39 | hit = 0, match = -1, 40 | L2_ST_idx = curr_page % L2_ST_PRIME, 41 | curr_block = (addr >> LOG2_BLOCK_SIZE) & 0x3F; 42 | SIGNATURE_TABLE *table = L2_ST[cpu][L2_ST_idx]; 43 | int delta_buffer = 0, sig_buffer = 0; 44 | 45 | for (match=0; match> LOG2_BLOCK_SIZE, curr_page, curr_block, table[match].signature, delta_buffer)); 66 | } 67 | else { 68 | hit = 1; 69 | table[match].first_hit = 0; 70 | 71 | if (delta_buffer) { 72 | // This is non-speculative information tracked from actual L2 cache demand 73 | // Now, the old signature will be associated with current delta 74 | L2_PT_update(cpu, sig_buffer, delta_buffer); 75 | } 76 | else 77 | break; 78 | 79 | if (warmup_complete[cpu]) 80 | L2_PF_DEBUG(printf("ST_hit cpu: %d cl_addr: %lx page: %lx block: %d old_sig: %x delta: %d\n", 81 | cpu, addr >> LOG2_BLOCK_SIZE, curr_page, curr_block, sig_buffer, delta_buffer)); 82 | 83 | // Update signature 84 | int new_signature = get_new_signature(sig_buffer, delta_buffer); 85 | table[match].signature = new_signature; 86 | l2_sig_dist[cpu][table[match].signature]++; 87 | } 88 | 89 | // Update last_block 90 | table[match].last_block = curr_block; 91 | L2_ST_hit[cpu]++; L2_ST_access[cpu]++; 92 | break; 93 | } 94 | } 95 | 96 | if (match == L2_ST_WAY) { 97 | for (match=0; match> LOG2_BLOCK_SIZE, curr_page, curr_block)); 109 | break; 110 | } 111 | } 112 | } 113 | 114 | if (match == L2_ST_WAY) { // Miss 115 | // Search for LRU victim 116 | for (match=0; match> LOG2_BLOCK_SIZE, curr_page, curr_block, table[match].lru)); 135 | L2_ST_miss[cpu]++; L2_ST_access[cpu]++; 136 | 137 | #ifdef L2_GHR_ON 138 | // Check GHR 139 | int ghr_max = 0, ghr_idx = -1, spec_block = 0, spec_sig = 0; 140 | for (int i=0; i= 64) 143 | spec_block -= 64; 144 | else if (spec_block < 0) 145 | spec_block += 64; 146 | if ((spec_block == curr_block) && (ghr_max <= 
L2_GHR[cpu][i].path_conf)) { 147 | ghr_max = L2_GHR[cpu][i].path_conf; 148 | ghr_idx = i; 149 | spec_sig = get_new_signature(L2_GHR[cpu][i].signature, L2_GHR[cpu][i].oop_delta); 150 | if (warmup_complete[cpu]) 151 | L2_PF_DEBUG(printf("cpu: %d OOP_match L2_GHR[%d] signature: %x path_conf: %d last_block: %d oop_delta: %d spec_block: %d == curr_block: %d spec_sig: %x\n", 152 | cpu, i, L2_GHR[cpu][i].signature, L2_GHR[cpu][i].path_conf, L2_GHR[cpu][i].last_block, 153 | L2_GHR[cpu][i].oop_delta, spec_block, curr_block, spec_sig)); 154 | } 155 | else { 156 | if (warmup_complete[cpu]) 157 | L2_PF_DEBUG(printf("cpu: %d OOP_unmatch L2_GHR[%d] signature: %x path_conf: %d last_block: %d oop_delta: %d spec_block: %d != curr_block: %d spec_sig: %x\n", 158 | cpu, i, L2_GHR[cpu][i].signature, L2_GHR[cpu][i].path_conf, L2_GHR[cpu][i].last_block, 159 | L2_GHR[cpu][i].oop_delta, spec_block, curr_block, spec_sig)); 160 | } 161 | } 162 | 163 | if (ghr_idx >= 0) { 164 | // Speculatively update first page 165 | spec_sig = get_new_signature(L2_GHR[cpu][ghr_idx].signature, L2_GHR[cpu][ghr_idx].oop_delta); 166 | 167 | hit = 1; 168 | table[match].signature = spec_sig; 169 | if (warmup_complete[cpu]) 170 | L2_PF_DEBUG(printf("cpu: %d spec_update page: %x sig: %3x delta: %3d curr_block: %2d last_block[NA]: %2d\n", 171 | cpu, tag, spec_sig, L2_GHR[cpu][ghr_idx].oop_delta, curr_block, L2_GHR[cpu][ghr_idx].last_block)); 172 | } 173 | #endif 174 | } 175 | 176 | // Update LRU 177 | int position = table[match].lru; 178 | for (int i=0; i> LOG2_PAGE_SIZE; 193 | int tag = curr_page & 0xFFFF, 194 | match = -1, 195 | L2_ST_idx = curr_page % L2_ST_PRIME; 196 | 197 | SIGNATURE_TABLE *table = L2_ST[cpu][L2_ST_idx]; 198 | 199 | for (match=0; match> LOG2_BLOCK_SIZE, curr_page, (addr >> LOG2_BLOCK_SIZE) & 0x3F, table[match].signature, table[match].last_block)); 204 | return match; 205 | } 206 | } 207 | 208 | if (warmup_complete[cpu]) 209 | L2_PF_DEBUG(printf("ST_check not found cpu: %d cl_addr: %lx page: %lx 
block: %ld\n", cpu, addr >> LOG2_BLOCK_SIZE, curr_page, (addr >> LOG2_BLOCK_SIZE) & 0x3F)); 210 | return -1; 211 | } 212 | 213 | void L2_PT_update(uint32_t cpu, int signature, int delta) 214 | { 215 | int L2_PT_idx = signature % L2_PT_PRIME; 216 | PATTERN_TABLE *table = L2_PT[cpu][L2_PT_idx]; 217 | 218 | // Update L2_PT 219 | // Update metadata 220 | table[0].c_sig++; 221 | 222 | if (table[0].c_sig == (CSIG_MAX)) 223 | { 224 | table[0].c_sig = CSIG_MAX >> 1; 225 | for (int i = 0; i> 1; 227 | if (warmup_complete[cpu]) 228 | L2_PF_DEBUG(printf("PT_sig: %4x cpu: %d c_sig saturated sig_total: %d => %d\n", L2_PT_idx, cpu, CSIG_MAX, table[0].c_sig)); 229 | } 230 | 231 | int match; 232 | for (match=0; match 0) 322 | { 323 | dynamic_fill_thrs[cpu]--; 324 | fill_down++; 325 | conf_level[dynamic_fill_thrs[cpu]]++; 326 | 327 | printf("FILL_THRESHOLD goes down %d => %d at cycle: %ld\n", dynamic_fill_thrs[cpu]+1, dynamic_fill_thrs[cpu], ooo_cpu[cpu].current_cycle); 328 | } 329 | 330 | conf_counter[cpu] = 0; 331 | } 332 | 333 | l2pf_was_useful++; 334 | } 335 | else 336 | { 337 | if (conf_counter[cpu] > 0) 338 | conf_counter[cpu]--; 339 | 340 | l2pf_was_useless++; 341 | } 342 | 343 | l2pf_match++; 344 | } 345 | 346 | break; 347 | } 348 | } 349 | l2pf_signal++; 350 | 351 | return; 352 | */ 353 | //} 354 | -------------------------------------------------------------------------------- /prefetcher/next_line.l1d_pref: -------------------------------------------------------------------------------- 1 | #include "cache.h" 2 | 3 | void CACHE::l1d_prefetcher_initialize() 4 | { 5 | cout << "CPU " << cpu << " L1D next line prefetcher" << endl; 6 | } 7 | 8 | void CACHE::l1d_prefetcher_operate(uint64_t addr, uint64_t ip, uint8_t cache_hit, uint8_t type) 9 | { 10 | uint64_t pf_addr = ((addr>>LOG2_BLOCK_SIZE)+1) << LOG2_BLOCK_SIZE; 11 | 12 | DP ( if (warmup_complete[cpu]) { 13 | cout << "[" << NAME << "] " << __func__ << hex << " base_cl: " << (addr>>LOG2_BLOCK_SIZE); 14 | cout << " pf_cl: " 
<< (pf_addr>>LOG2_BLOCK_SIZE) << " ip: " << ip << " cache_hit: " << +cache_hit << " type: " << +type << endl; }); 15 | 16 | prefetch_line(ip, addr, pf_addr, FILL_L1, 0); 17 | } 18 | 19 | void CACHE::l1d_prefetcher_cache_fill(uint64_t addr, uint32_t set, uint32_t way, uint8_t prefetch, uint64_t evicted_addr, uint32_t metadata_in) 20 | { 21 | 22 | } 23 | 24 | void CACHE::l1d_prefetcher_final_stats() 25 | { 26 | cout << "CPU " << cpu << " L1D next line prefetcher final stats" << endl; 27 | } 28 | -------------------------------------------------------------------------------- /prefetcher/next_line.l1i_pref: -------------------------------------------------------------------------------- 1 | #include "ooo_cpu.h" 2 | 3 | void O3_CPU::l1i_prefetcher_initialize() 4 | { 5 | cout << "CPU " << cpu << " L1I next line prefetcher" << endl; 6 | } 7 | 8 | void O3_CPU::l1i_prefetcher_branch_operate(uint64_t ip, uint8_t branch_type, uint64_t branch_target) 9 | { 10 | 11 | } 12 | 13 | void O3_CPU::l1i_prefetcher_cache_operate(uint64_t v_addr, uint8_t cache_hit, uint8_t prefetch_hit) 14 | { 15 | //cout << "access v_addr: 0x" << hex << v_addr << dec << endl; 16 | 17 | if((cache_hit == 0) && (L1I.MSHR.occupancy < (L1I.MSHR.SIZE>>1))) 18 | { 19 | uint64_t pf_addr = v_addr + (1<>LOG2_BLOCK_SIZE)+1) << LOG2_BLOCK_SIZE; 11 | 12 | DP ( if (warmup_complete[cpu]) { 13 | cout << "[" << NAME << "] " << __func__ << hex << " base_cl: " << (addr>>LOG2_BLOCK_SIZE); 14 | cout << " pf_cl: " << (pf_addr>>LOG2_BLOCK_SIZE) << " ip: " << ip << " cache_hit: " << +cache_hit << " type: " << +type << endl; }); 15 | 16 | prefetch_line(ip, addr, pf_addr, FILL_L2, 0); 17 | 18 | return metadata_in; 19 | } 20 | 21 | uint32_t CACHE::l2c_prefetcher_cache_fill(uint64_t addr, uint32_t set, uint32_t way, uint8_t prefetch, uint64_t evicted_addr, uint32_t metadata_in) 22 | { 23 | return metadata_in; 24 | } 25 | 26 | void CACHE::l2c_prefetcher_final_stats() 27 | { 28 | cout << "CPU " << cpu << " L2C next line prefetcher 
final stats" << endl; 29 | } 30 | -------------------------------------------------------------------------------- /prefetcher/next_line.llc_pref: -------------------------------------------------------------------------------- 1 | #include "cache.h" 2 | 3 | void CACHE::llc_prefetcher_initialize() 4 | { 5 | cout << "LLC Next Line Prefetcher" << endl; 6 | } 7 | 8 | uint32_t CACHE::llc_prefetcher_operate(uint64_t addr, uint64_t ip, uint8_t cache_hit, uint8_t type, uint32_t metadata_in, uint64_t instr_id, uint64_t curr_cycle) 9 | { 10 | uint64_t pf_addr = ((addr>>LOG2_BLOCK_SIZE)+1) << LOG2_BLOCK_SIZE; 11 | prefetch_line(ip, addr, pf_addr, FILL_LLC, 0); 12 | 13 | return metadata_in; 14 | } 15 | 16 | uint32_t CACHE::llc_prefetcher_cache_fill(uint64_t addr, uint32_t set, uint32_t way, uint8_t prefetch, uint64_t evicted_addr, uint32_t metadata_in) 17 | { 18 | return metadata_in; 19 | } 20 | 21 | void CACHE::llc_prefetcher_final_stats() 22 | { 23 | cout << "LLC Next Line Prefetcher Final Stats: none" << endl; 24 | } 25 | -------------------------------------------------------------------------------- /prefetcher/no.l1d_pref: -------------------------------------------------------------------------------- 1 | #include "cache.h" 2 | 3 | void CACHE::l1d_prefetcher_initialize() 4 | { 5 | 6 | } 7 | 8 | void CACHE::l1d_prefetcher_operate(uint64_t addr, uint64_t ip, uint8_t cache_hit, uint8_t type) 9 | { 10 | 11 | } 12 | 13 | void CACHE::l1d_prefetcher_cache_fill(uint64_t addr, uint32_t set, uint32_t way, uint8_t prefetch, uint64_t evicted_addr, uint32_t metadata_in) 14 | { 15 | 16 | } 17 | 18 | void CACHE::l1d_prefetcher_final_stats() 19 | { 20 | 21 | } 22 | -------------------------------------------------------------------------------- /prefetcher/no.l1i_pref: -------------------------------------------------------------------------------- 1 | #include "ooo_cpu.h" 2 | 3 | void O3_CPU::l1i_prefetcher_initialize() 4 | { 5 | 6 | } 7 | 8 | void 
O3_CPU::l1i_prefetcher_branch_operate(uint64_t ip, uint8_t branch_type, uint64_t branch_target) 9 | { 10 | 11 | } 12 | 13 | void O3_CPU::l1i_prefetcher_cache_operate(uint64_t v_addr, uint8_t cache_hit, uint8_t prefetch_hit) 14 | { 15 | 16 | } 17 | 18 | void O3_CPU::l1i_prefetcher_cycle_operate() 19 | { 20 | 21 | } 22 | 23 | void O3_CPU::l1i_prefetcher_cache_fill(uint64_t v_addr, uint32_t set, uint32_t way, uint8_t prefetch, uint64_t evicted_v_addr) 24 | { 25 | 26 | } 27 | 28 | void O3_CPU::l1i_prefetcher_final_stats() 29 | { 30 | 31 | } 32 | -------------------------------------------------------------------------------- /prefetcher/no.l2c_pref: -------------------------------------------------------------------------------- 1 | #include "cache.h" 2 | 3 | void CACHE::l2c_prefetcher_initialize() 4 | { 5 | 6 | } 7 | 8 | uint32_t CACHE::l2c_prefetcher_operate(uint64_t addr, uint64_t ip, uint8_t cache_hit, uint8_t type, uint32_t metadata_in) 9 | { 10 | return metadata_in; 11 | } 12 | 13 | uint32_t CACHE::l2c_prefetcher_cache_fill(uint64_t addr, uint32_t set, uint32_t way, uint8_t prefetch, uint64_t evicted_addr, uint32_t metadata_in) 14 | { 15 | return metadata_in; 16 | } 17 | 18 | void CACHE::l2c_prefetcher_final_stats() 19 | { 20 | 21 | } 22 | -------------------------------------------------------------------------------- /prefetcher/no.llc_pref: -------------------------------------------------------------------------------- 1 | #include "cache.h" 2 | 3 | void CACHE::llc_prefetcher_initialize() 4 | { 5 | 6 | } 7 | 8 | uint32_t CACHE::llc_prefetcher_operate(uint64_t addr, uint64_t ip, uint8_t cache_hit, uint8_t type, uint32_t metadata_in, uint64_t instr_id, uint64_t curr_cycle) 9 | { 10 | return metadata_in; 11 | } 12 | 13 | uint32_t CACHE::llc_prefetcher_cache_fill(uint64_t addr, uint32_t set, uint32_t way, uint8_t prefetch, uint64_t evicted_addr, uint32_t metadata_in) 14 | { 15 | return metadata_in; 16 | } 17 | 18 | void CACHE::llc_prefetcher_final_stats() 19 | { 
20 | 21 | } 22 | -------------------------------------------------------------------------------- /prefetcher/trace.llc_pref: -------------------------------------------------------------------------------- 1 | #include "cache.h" 2 | 3 | void CACHE::llc_prefetcher_initialize() 4 | { 5 | 6 | } 7 | 8 | uint32_t CACHE::llc_prefetcher_operate(uint64_t addr, uint64_t ip, uint8_t cache_hit, uint8_t type, uint32_t metadata_in, uint64_t instr_id, uint64_t curr_cycle) 9 | { 10 | assert(type != PREFETCH); // The instr_id is currently set to 0 for all prefetches 11 | if(instr_id == 0) return metadata_in; //The instr_id is also set to 0 for I-cache misses and TLB misses, and we do not want to capture these right now 12 | 13 | cout << instr_id << ", " << curr_cycle << ", " << hex << addr << ", " << ip << dec << ", " << (int)cache_hit << endl; 14 | return metadata_in; 15 | } 16 | 17 | uint32_t CACHE::llc_prefetcher_cache_fill(uint64_t addr, uint32_t set, uint32_t way, uint8_t prefetch, uint64_t evicted_addr, uint32_t metadata_in) 18 | { 19 | return metadata_in; 20 | } 21 | 22 | void CACHE::llc_prefetcher_final_stats() 23 | { 24 | 25 | } 26 | -------------------------------------------------------------------------------- /replacement/base_replacement.cc: -------------------------------------------------------------------------------- 1 | #include "cache.h" 2 | 3 | uint32_t CACHE::find_victim(uint32_t cpu, uint64_t instr_id, uint32_t set, const BLOCK *current_set, uint64_t ip, uint64_t full_addr, uint32_t type) 4 | { 5 | // baseline LRU replacement policy for other caches 6 | return lru_victim(cpu, instr_id, set, current_set, ip, full_addr, type); 7 | } 8 | 9 | void CACHE::update_replacement_state(uint32_t cpu, uint32_t set, uint32_t way, uint64_t full_addr, uint64_t ip, uint64_t victim_addr, uint32_t type, uint8_t hit) 10 | { 11 | if (type == WRITEBACK) { 12 | if (hit) // writeback hit does not update LRU state 13 | return; 14 | } 15 | 16 | return lru_update(set, way); 17 | } 18 
| 19 | uint32_t CACHE::lru_victim(uint32_t cpu, uint64_t instr_id, uint32_t set, const BLOCK *current_set, uint64_t ip, uint64_t full_addr, uint32_t type) 20 | { 21 | uint32_t way = 0; 22 | 23 | // fill invalid line first 24 | for (way=0; way>LOG2_BLOCK_SIZE) << " victim address: " << block[set][way].address << " data: " << block[set][way].data; 30 | cout << dec << " lru: " << block[set][way].lru << endl; }); 31 | 32 | break; 33 | } 34 | } 35 | 36 | // LRU victim 37 | if (way == NUM_WAY) { 38 | for (way=0; way>LOG2_BLOCK_SIZE) << " victim address: " << block[set][way].address << " data: " << block[set][way].data; 44 | cout << dec << " lru: " << block[set][way].lru << endl; }); 45 | 46 | break; 47 | } 48 | } 49 | } 50 | 51 | if (way == NUM_WAY) { 52 | cerr << "[" << NAME << "] " << __func__ << " no victim! set: " << set << endl; 53 | assert(0); 54 | } 55 | 56 | return way; 57 | } 58 | 59 | void CACHE::lru_update(uint32_t set, uint32_t way) 60 | { 61 | // update lru replacement state 62 | for (uint32_t i=0; i PSEL_THRS) { // follow BIP 84 | rrpv[set][way] = maxRRPV; 85 | 86 | bip_counter++; 87 | if (bip_counter == BIP_MAX) 88 | bip_counter = 0; 89 | if (bip_counter == 0) 90 | rrpv[set][way] = maxRRPV-1; 91 | } else // follow SRRIP 92 | rrpv[set][way] = maxRRPV-1; 93 | 94 | } else if (leader == 0) { // leader 0: BIP 95 | if (PSEL[cpu] > 0) PSEL[cpu]--; 96 | rrpv[set][way] = maxRRPV; 97 | 98 | bip_counter++; 99 | if (bip_counter == BIP_MAX) bip_counter = 0; 100 | if (bip_counter == 0) rrpv[set][way] = maxRRPV-1; 101 | 102 | } else if (leader == 1) { // leader 1: SRRIP 103 | if (PSEL[cpu] < PSEL_MAX) PSEL[cpu]++; 104 | rrpv[set][way] = maxRRPV-1; 105 | 106 | } else // WE SHOULD NOT REACH HERE 107 | assert(0); 108 | } 109 | 110 | // find replacement victim 111 | uint32_t CACHE::llc_find_victim(uint32_t cpu, uint64_t instr_id, uint32_t set, const BLOCK *current_set, uint64_t ip, uint64_t full_addr, uint32_t type) 112 | { 113 | // look for the maxRRPV line 114 | while (1) 
115 | { 116 | for (int i=0; i 3 | #include 4 | 5 | #define maxRRPV 3 6 | #define SHCT_SIZE 16384 7 | #define SHCT_PRIME 16381 8 | #define SAMPLER_SET (256*NUM_CPUS) 9 | #define SAMPLER_WAY LLC_WAY 10 | #define SHCT_MAX 7 11 | 12 | uint32_t rrpv[LLC_SET][LLC_WAY]; 13 | 14 | // sampler structure 15 | class SAMPLER_class 16 | { 17 | public: 18 | uint8_t valid, 19 | type, 20 | used; 21 | 22 | uint64_t tag, cl_addr, ip; 23 | 24 | uint32_t lru; 25 | 26 | SAMPLER_class() { 27 | valid = 0; 28 | type = 0; 29 | used = 0; 30 | 31 | tag = 0; 32 | cl_addr = 0; 33 | ip = 0; 34 | 35 | lru = 0; 36 | }; 37 | }; 38 | 39 | // sampler 40 | uint32_t rand_sets[SAMPLER_SET]; 41 | SAMPLER_class sampler[SAMPLER_SET][SAMPLER_WAY]; 42 | 43 | // prediction table structure 44 | class SHCT_class { 45 | public: 46 | uint32_t counter; 47 | 48 | SHCT_class() { 49 | counter = 0; 50 | }; 51 | }; 52 | SHCT_class SHCT[NUM_CPUS][SHCT_SIZE]; 53 | 54 | // initialize replacement state 55 | void CACHE::llc_initialize_replacement() 56 | { 57 | cout << "Initialize SHIP state" << endl; 58 | 59 | for (int i=0; i 0) 123 | SHCT[cpu][SHCT_idx].counter--; 124 | 125 | /* 126 | if (draw_transition) 127 | printf("cycle: %lu SHCT: %d ip: 0x%llX SAMPLER_HIT cl_addr: 0x%llX page: 0x%llX block: %ld set: %d\n", 128 | ooo_cpu[cpu].current_cycle, SHCT[cpu][SHCT_idx].dead, s_set[match].ip, address>>6, address>>12, (address>>6) & 0x3F, s_idx); 129 | */ 130 | 131 | //s_set[match].ip = ip; // SHIP does not update ip on sampler hit 132 | s_set[match].type = type; 133 | s_set[match].used = 1; 134 | //D(printf("sampler hit cpu: %d set: %d way: %d tag: %x ip: %lx type: %d lru: %d\n", 135 | // cpu, rand_sets[s_idx], match, tag, ip, type, s_set[match].lru)); 136 | 137 | break; 138 | } 139 | } 140 | 141 | // check invalid 142 | if (match == SAMPLER_WAY) 143 | { 144 | for (match=0; match>6, address>>12, (address>>6) & 0x3F, s_idx); 178 | */ 179 | } 180 | 181 | s_set[match].tag = tag; 182 | s_set[match].ip = ip; 183 | s_set[match].type 
= type; 184 | s_set[match].used = 0; 185 | 186 | //D(printf("sampler miss cpu: %d set: %d way: %d tag: %x ip: %lx type: %d lru: %d\n", 187 | // cpu, rand_sets[s_idx], match, tag, ip, type, s_set[match].lru)); 188 | break; 189 | } 190 | } 191 | } 192 | 193 | // update LRU state 194 | uint32_t curr_position = s_set[match].lru; 195 | for (int i=0; i= SHCT_PRIME) 271 | assert(0); 272 | 273 | rrpv[set][way] = maxRRPV-1; 274 | if (SHCT[cpu][SHCT_idx].counter == SHCT_MAX) 275 | rrpv[set][way] = maxRRPV; 276 | } 277 | } 278 | 279 | // use this function to print out your own stats at the end of simulation 280 | void CACHE::llc_replacement_final_stats() 281 | { 282 | 283 | } 284 | -------------------------------------------------------------------------------- /replacement/srrip.llc_repl: -------------------------------------------------------------------------------- 1 | #include "cache.h" 2 | 3 | #define maxRRPV 3 4 | uint32_t rrpv[LLC_SET][LLC_WAY]; 5 | 6 | // initialize replacement state 7 | void CACHE::llc_initialize_replacement() 8 | { 9 | cout << "Initialize SRRIP state" << endl; 10 | 11 | for (int i=0; i&2; 34 | exit 1 35 | fi 36 | 37 | re='^[0-9]+$' 38 | if ! [[ $N_SIM =~ $re ]] || [ -z $N_SIM ] ; then 39 | echo "[ERROR]: Number of simulation instructions is NOT a number" >&2; 40 | exit 1 41 | fi 42 | 43 | if [ ! -f "$TRACE_DIR/$TRACE0" ] ; then 44 | echo "[ERROR] Cannot find a trace0 file: $TRACE_DIR/$TRACE0" 45 | exit 1 46 | fi 47 | 48 | if [ ! -f "$TRACE_DIR/$TRACE1" ] ; then 49 | echo "[ERROR] Cannot find a trace1 file: $TRACE_DIR/$TRACE1" 50 | exit 1 51 | fi 52 | 53 | if [ ! -f "$TRACE_DIR/$TRACE2" ] ; then 54 | echo "[ERROR] Cannot find a trace2 file: $TRACE_DIR/$TRACE2" 55 | exit 1 56 | fi 57 | 58 | if [ ! 
-f "$TRACE_DIR/$TRACE3" ] ; then 59 | echo "[ERROR] Cannot find a trace3 file: $TRACE_DIR/$TRACE3" 60 | exit 1 61 | fi 62 | 63 | mkdir -p results_4core_${N_SIM}M 64 | (./bin/${BINARY} -warmup_instructions ${N_WARM}000000 -simulation_instructions ${N_SIM}000000 ${OPTION} -traces ${TRACE_DIR}/${TRACE0} ${TRACE_DIR}/${TRACE1} ${TRACE_DIR}/${TRACE2} ${TRACE_DIR}/${TRACE3}) &> results_4core_${N_SIM}M/mix${N_MIX}-${BINARY}${OPTION}.txt 65 | -------------------------------------------------------------------------------- /run_champsim.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$#" -lt 4 ]; then 4 | echo "Illegal number of parameters" 5 | echo "Usage: ./run_champsim.sh [BINARY] [N_WARM] [N_SIM] [TRACE] [OPTION]" 6 | exit 1 7 | fi 8 | 9 | TRACE_DIR=$PWD/dpc3_traces 10 | BINARY=${1} 11 | N_WARM=${2} 12 | N_SIM=${3} 13 | TRACE=${4} 14 | OPTION=${5} 15 | 16 | # Sanity check 17 | if [ -z $TRACE_DIR ] || [ ! -d "$TRACE_DIR" ] ; then 18 | echo "[ERROR] Cannot find a trace directory: $TRACE_DIR" 19 | exit 1 20 | fi 21 | 22 | if [ ! -f "bin/$BINARY" ] ; then 23 | echo "[ERROR] Cannot find a ChampSim binary: bin/$BINARY" 24 | exit 1 25 | fi 26 | 27 | re='^[0-9]+$' 28 | if ! [[ $N_WARM =~ $re ]] || [ -z $N_WARM ] ; then 29 | echo "[ERROR]: Number of warmup instructions is NOT a number" >&2; 30 | exit 1 31 | fi 32 | 33 | re='^[0-9]+$' 34 | if ! [[ $N_SIM =~ $re ]] || [ -z $N_SIM ] ; then 35 | echo "[ERROR]: Number of simulation instructions is NOT a number" >&2; 36 | exit 1 37 | fi 38 | 39 | if [ ! 
-f "$TRACE_DIR/$TRACE" ] ; then 40 | echo "[ERROR] Cannot find a trace file: $TRACE_DIR/$TRACE" 41 | exit 1 42 | fi 43 | 44 | mkdir -p results_${N_SIM}M 45 | (./bin/${BINARY} -warmup_instructions ${N_WARM}000000 -simulation_instructions ${N_SIM}000000 ${OPTION} -traces ${TRACE_DIR}/${TRACE}) &> results_${N_SIM}M/${TRACE}-${BINARY}${OPTION}.txt 46 | -------------------------------------------------------------------------------- /scripts/download_dpc3_traces.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p $PWD/../dpc3_traces 4 | while read LINE 5 | do 6 | wget -P $PWD/../dpc3_traces -c http://hpca23.cse.tamu.edu/champsim-traces/speccpu/$LINE 7 | done < dpc3_max_simpoint.txt 8 | -------------------------------------------------------------------------------- /scripts/dpc3_max_simpoint.txt: -------------------------------------------------------------------------------- 1 | 600.perlbench_s-210B.champsimtrace.xz 2 | 602.gcc_s-734B.champsimtrace.xz 3 | 603.bwaves_s-3699B.champsimtrace.xz 4 | 605.mcf_s-665B.champsimtrace.xz 5 | 607.cactuBSSN_s-2421B.champsimtrace.xz 6 | 619.lbm_s-4268B.champsimtrace.xz 7 | 620.omnetpp_s-874B.champsimtrace.xz 8 | 621.wrf_s-575B.champsimtrace.xz 9 | 623.xalancbmk_s-700B.champsimtrace.xz 10 | 625.x264_s-18B.champsimtrace.xz 11 | 627.cam4_s-573B.champsimtrace.xz 12 | 628.pop2_s-17B.champsimtrace.xz 13 | 631.deepsjeng_s-928B.champsimtrace.xz 14 | 638.imagick_s-10316B.champsimtrace.xz 15 | 641.leela_s-800B.champsimtrace.xz 16 | 644.nab_s-5853B.champsimtrace.xz 17 | 648.exchange2_s-1699B.champsimtrace.xz 18 | 649.fotonik3d_s-1176B.champsimtrace.xz 19 | 654.roms_s-842B.champsimtrace.xz 20 | 657.xz_s-3167B.champsimtrace.xz 21 | -------------------------------------------------------------------------------- /scripts/multiworkload.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | 
#define NUM_MIX 100 7 | #define NUM_CPUS 4 8 | #define NUM_TRACE 20 9 | 10 | using namespace std; 11 | default_random_engine generator; 12 | 13 | /* Prints NUM_MIX lines; each line lists NUM_CPUS values drawn from distribution(generator), redrawing until a value differs from those already chosen for that line. */ int main() 14 | { 15 | int benchmark[NUM_MIX][NUM_CPUS]; 16 | for (int i=0; i distribution(rand_min, rand_max); 27 | 28 | int temp_rand; 29 | bool do_again = false; 30 | 31 | for (int i = 0; i < NUM_MIX; i++) { 32 | //printf("MIX%2d: ", i+1); 33 | for (int j = 0; j < NUM_CPUS; j++) { 34 | do { 35 | do_again = false; 36 | temp_rand = distribution(generator); // Generate random integer flat in [rand_min, rand_max] 37 | for (int k = 0; k < j; k++) { 38 | if (temp_rand == benchmark[i][k]) { 39 | do_again = true; 40 | break; 41 | } 42 | } 43 | } while (do_again); 44 | 45 | benchmark[i][j] = temp_rand; 46 | printf("%d ", benchmark[i][j]); 47 | } 48 | printf("\n"); 49 | } 50 | 51 | return 0; 52 | } 53 | -------------------------------------------------------------------------------- /scripts/seeds.txt: -------------------------------------------------------------------------------- 1 | 473.astar-s0 854 2 | 473.astar-s1 801 3 | 473.astar-s2 851 4 | 410.bwaves-s0 1017 5 | 410.bwaves-s1 1017 6 | 410.bwaves-s2 922 7 | 459.GemsFDTD-s0 1001 8 | 459.GemsFDTD-s1 1001 9 | 459.GemsFDTD-s2 1005 10 | 470.lbm-s0 673 11 | 470.lbm-s1 585 12 | 437.leslie3d-s0 1151 13 | 437.leslie3d-s1 1158 14 | 437.leslie3d-s2 1059 15 | 462.libquantum-s0 1447 16 | 462.libquantum-s1 1459 17 | 462.libquantum-s2 1414 18 | 429.mcf-s0 629 19 | 429.mcf-s1 622 20 | 429.mcf-s2 577 21 | 433.milc-s0 735 22 | 433.milc-s1 739 23 | 433.milc-s2 741 24 | 471.omnetpp-s0 1036 25 | 471.omnetpp-s1 1083 26 | 471.omnetpp-s2 984 27 | 450.soplex-s0 979 28 | 450.soplex-s1 982 29 | 450.soplex-s2 936 30 | 482.sphinx3-s0 1086 31 | 482.sphinx3-s1 1079 32 | 482.sphinx3-s2 1041 33 | 602.gcc-s0 272 34 | 602.gcc-s1 270 35 | 602.gcc-s2 275 36 | 602.gcc-s3 224 37 | 605.mcf-s0 267 38 | 605.mcf-s1 273 39 | 605.mcf-s2 273 40 | 605.mcf-s3 273 41 | 605.mcf-s4 223 42 | 605.mcf-s5 226 43 | 605.mcf-s6
227 44 | 605.mcf-s7 227 45 | 605.mcf-s8 232 46 | 607.cactuBSSN-s0 267 47 | 607.cactuBSSN-s1 279 48 | 607.cactuBSSN-s2 266 49 | 607.cactuBSSN-s3 276 50 | 619.lbm-s0 279 51 | 619.lbm-s1 280 52 | 619.lbm-s2 280 53 | 619.lbm-s3 278 54 | 620.omnetpp-s0 216 55 | 620.omnetpp-s1 229 56 | 621.wrf-s0 227 57 | 621.wrf-s1 280 58 | 621.wrf-s2 277 59 | 621.wrf-s3 267 60 | 623.xalancbmk-s0 163 61 | 623.xalancbmk-s1 222 62 | 623.xalancbmk-s2 214 63 | 623.xalancbmk-s3 220 64 | 623.xalancbmk-s4 226 65 | 623.xalancbmk-s5 217 66 | 649.fotonik3d-s0 324 67 | 649.fotonik3d-s1 273 68 | 649.fotonik3d-s2 115 69 | 649.fotonik3d-s3 277 70 | 649.fotonik3d-s4 275 71 | 654.roms-s0 266 72 | 654.roms-s1 262 73 | 654.roms-s2 266 74 | 654.roms-s3 271 75 | 654.roms-s4 269 76 | 654.roms-s5 224 77 | 654.roms-s6 225 78 | 654.roms-s7 220 79 | 654.roms-s8 224 80 | bc-0 48 81 | bc-12 99 82 | bc-3 51 83 | bc-5 53 84 | bfs-10 97 85 | bfs-14 101 86 | bfs-3 51 87 | bfs-8 56 88 | cc-13 100 89 | cc-14 101 90 | cc-5 53 91 | cc-6 54 92 | pr-10 97 93 | pr-14 101 94 | pr-3 51 95 | pr-5 53 96 | sssp-10 97 97 | sssp-14 101 98 | sssp-3 51 99 | sssp-5 53 100 | -------------------------------------------------------------------------------- /src/block.cc: -------------------------------------------------------------------------------- 1 | #include "block.h" 2 | 3 | int PACKET_QUEUE::check_queue(PACKET *packet) 4 | { 5 | if ((head == tail) && occupancy == 0) 6 | return -1; 7 | 8 | if (head < tail) { 9 | for (uint32_t i=head; ifull_addr) { 12 | DP (if (warmup_complete[packet->cpu]) { 13 | cout << "[" << NAME << "] " << __func__ << " cpu: " << packet->cpu << " instr_id: " << packet->instr_id << " same address: " << hex << packet->address; 14 | cout << " full_addr: " << packet->full_addr << dec << " by instr_id: " << entry[i].instr_id << " index: " << i; 15 | cout << " cycle " << packet->event_cycle << endl; }); 16 | return i; 17 | } 18 | } 19 | else { 20 | if (entry[i].address == packet->address) { 21 | DP (if 
(warmup_complete[packet->cpu]) { 22 | cout << "[" << NAME << "] " << __func__ << " cpu: " << packet->cpu << " instr_id: " << packet->instr_id << " same address: " << hex << packet->address; 23 | cout << " full_addr: " << packet->full_addr << dec << " by instr_id: " << entry[i].instr_id << " index: " << i; 24 | cout << " cycle " << packet->event_cycle << endl; }); 25 | return i; 26 | } 27 | } 28 | } 29 | } 30 | else { 31 | for (uint32_t i=head; ifull_addr) { 34 | DP (if (warmup_complete[packet->cpu]) { 35 | cout << "[" << NAME << "] " << __func__ << " cpu: " << packet->cpu << " instr_id: " << packet->instr_id << " same address: " << hex << packet->address; 36 | cout << " full_addr: " << packet->full_addr << dec << " by instr_id: " << entry[i].instr_id << " index: " << i; 37 | cout << " cycle " << packet->event_cycle << endl; }); 38 | return i; 39 | } 40 | } 41 | else { 42 | if (entry[i].address == packet->address) { 43 | DP (if (warmup_complete[packet->cpu]) { 44 | cout << "[" << NAME << "] " << __func__ << " cpu: " << packet->cpu << " instr_id: " << packet->instr_id << " same address: " << hex << packet->address; 45 | cout << " full_addr: " << packet->full_addr << dec << " by instr_id: " << entry[i].instr_id << " index: " << i; 46 | cout << " cycle " << packet->event_cycle << endl; }); 47 | return i; 48 | } 49 | } 50 | } 51 | for (uint32_t i=0; ifull_addr) { 54 | DP (if (warmup_complete[packet->cpu]) { 55 | cout << "[" << NAME << "] " << __func__ << " cpu: " << packet->cpu << " instr_id: " << packet->instr_id << " same address: " << hex << packet->address; 56 | cout << " full_addr: " << packet->full_addr << dec << " by instr_id: " << entry[i].instr_id << " index: " << i; 57 | cout << " cycle " << packet->event_cycle << endl; }); 58 | return i; 59 | } 60 | } 61 | else { 62 | if (entry[i].address == packet->address) { 63 | DP (if (warmup_complete[packet->cpu]) { 64 | cout << "[" << NAME << "] " << __func__ << " cpu: " << packet->cpu << " instr_id: " << 
packet->instr_id << " same address: " << hex << packet->address; 65 | cout << " full_addr: " << packet->full_addr << dec << " by instr_id: " << entry[i].instr_id << " index: " << i; 66 | cout << " cycle " << packet->event_cycle << endl; }); 67 | return i; 68 | } 69 | } 70 | } 71 | } 72 | 73 | return -1; 74 | } 75 | 76 | void PACKET_QUEUE::add_queue(PACKET *packet) 77 | { 78 | #ifdef SANITY_CHECK 79 | if (occupancy && (head == tail)) 80 | assert(0); 81 | #endif 82 | 83 | // add entry 84 | entry[tail] = *packet; 85 | 86 | DP ( if (warmup_complete[packet->cpu]) { 87 | cout << "[" << NAME << "] " << __func__ << " cpu: " << packet->cpu << " instr_id: " << packet->instr_id; 88 | cout << " address: " << hex << entry[tail].address << " full_addr: " << entry[tail].full_addr << dec; 89 | cout << " head: " << head << " tail: " << tail << " occupancy: " << occupancy << " event_cycle: " << entry[tail].event_cycle << endl; }); 90 | 91 | occupancy++; 92 | tail++; 93 | if (tail >= SIZE) 94 | tail = 0; 95 | } 96 | 97 | void PACKET_QUEUE::remove_queue(PACKET *packet) 98 | { 99 | #ifdef SANITY_CHECK 100 | if ((occupancy == 0) && (head == tail)) 101 | assert(0); 102 | #endif 103 | 104 | DP ( if (warmup_complete[packet->cpu]) { 105 | cout << "[" << NAME << "] " << __func__ << " cpu: " << packet->cpu << " instr_id: " << packet->instr_id; 106 | cout << " address: " << hex << packet->address << " full_addr: " << packet->full_addr << dec << " fill_level: " << packet->fill_level; 107 | cout << " head: " << head << " tail: " << tail << " occupancy: " << occupancy << " event_cycle: " << packet->event_cycle << endl; }); 108 | 109 | // reset entry 110 | PACKET empty_packet; 111 | *packet = empty_packet; 112 | 113 | occupancy--; 114 | head++; 115 | if (head >= SIZE) 116 | head = 0; 117 | } 118 | -------------------------------------------------------------------------------- /src/uncore.cc: -------------------------------------------------------------------------------- 1 | #include "uncore.h" 2 
| 3 | // uncore 4 | UNCORE uncore; 5 | 6 | // constructor 7 | UNCORE::UNCORE() { 8 | 9 | } 10 | -------------------------------------------------------------------------------- /tracer/champsim_tracer.cpp: -------------------------------------------------------------------------------- 1 | 2 | /*! @file 3 | * This is an example of the PIN tool that demonstrates some basic PIN APIs 4 | * and could serve as the starting point for developing your first PIN tool 5 | */ 6 | 7 | #include "pin.H" 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #define NUM_INSTR_DESTINATIONS 2 15 | #define NUM_INSTR_SOURCES 4 16 | 17 | using namespace std; 18 | 19 | typedef struct trace_instr_format { 20 | unsigned long long int ip; // instruction pointer (program counter) value 21 | 22 | unsigned char is_branch; // is this branch 23 | unsigned char branch_taken; // if so, is this taken 24 | 25 | unsigned char destination_registers[NUM_INSTR_DESTINATIONS]; // output registers 26 | unsigned char source_registers[NUM_INSTR_SOURCES]; // input registers 27 | 28 | unsigned long long int destination_memory[NUM_INSTR_DESTINATIONS]; // output memory 29 | unsigned long long int source_memory[NUM_INSTR_SOURCES]; // input memory 30 | } trace_instr_format_t; 31 | 32 | /* ================================================================== */ 33 | // Global variables 34 | /* ================================================================== */ 35 | 36 | UINT64 instrCount = 0; 37 | 38 | FILE* out; 39 | 40 | bool output_file_closed = false; 41 | bool tracing_on = false; 42 | 43 | trace_instr_format_t curr_instr; 44 | 45 | /* ===================================================================== */ 46 | // Command line switches 47 | /* ===================================================================== */ 48 | KNOB KnobOutputFile(KNOB_MODE_WRITEONCE, "pintool", "o", "champsim.trace", 49 | "specify file name for Champsim tracer output"); 50 | 51 | KNOB 
KnobSkipInstructions(KNOB_MODE_WRITEONCE, "pintool", "s", "0", 52 | "How many instructions to skip before tracing begins"); 53 | 54 | KNOB KnobTraceInstructions(KNOB_MODE_WRITEONCE, "pintool", "t", "1000000", 55 | "How many instructions to trace"); 56 | 57 | /* ===================================================================== */ 58 | // Utilities 59 | /* ===================================================================== */ 60 | 61 | /*! 62 | * Print out help message. 63 | */ 64 | INT32 Usage() 65 | { 66 | cerr << "This tool creates a register and memory access trace" << endl 67 | << "Specify the output trace file with -o" << endl 68 | << "Specify the number of instructions to skip before tracing with -s" << endl 69 | << "Specify the number of instructions to trace with -t" << endl << endl; 70 | 71 | cerr << KNOB_BASE::StringKnobSummary() << endl; 72 | 73 | return -1; 74 | } 75 | 76 | /* ===================================================================== */ 77 | // Analysis routines 78 | /* ===================================================================== */ 79 | 80 | void BeginInstruction(VOID *ip, UINT32 op_code, VOID *opstring) 81 | { 82 | instrCount++; 83 | //printf("[%p %u %s ", ip, opcode, (char*)opstring); 84 | 85 | if(instrCount > KnobSkipInstructions.Value()) 86 | { 87 | tracing_on = true; 88 | 89 | if(instrCount > (KnobTraceInstructions.Value()+KnobSkipInstructions.Value())) 90 | tracing_on = false; 91 | } 92 | 93 | if(!tracing_on) 94 | return; 95 | 96 | // reset the current instruction 97 | curr_instr.ip = (unsigned long long int)ip; 98 | 99 | curr_instr.is_branch = 0; 100 | curr_instr.branch_taken = 0; 101 | 102 | for(int i=0; i KnobSkipInstructions.Value()) 122 | { 123 | tracing_on = true; 124 | 125 | if(instrCount <= (KnobTraceInstructions.Value()+KnobSkipInstructions.Value())) 126 | { 127 | // keep tracing 128 | fwrite(&curr_instr, sizeof(trace_instr_format_t), 1, out); 129 | } 130 | else 131 | { 132 | tracing_on = false; 133 | // close 
down the file, we're done tracing 134 | if(!output_file_closed) 135 | { 136 | fclose(out); 137 | output_file_closed = true; 138 | } 139 | 140 | exit(0); 141 | } 142 | } 143 | } 144 | 145 | void BranchOrNot(UINT32 taken) 146 | { 147 | //printf("[%d] ", taken); 148 | 149 | curr_instr.is_branch = 1; 150 | if(taken != 0) 151 | { 152 | curr_instr.branch_taken = 1; 153 | } 154 | } 155 | 156 | void RegRead(UINT32 i, UINT32 index) 157 | { 158 | if(!tracing_on) return; 159 | 160 | REG r = (REG)i; 161 | 162 | /* 163 | if(r == 26) 164 | { 165 | // 26 is the IP, which is read and written by branches 166 | return; 167 | } 168 | */ 169 | 170 | //cout << r << " " << REG_StringShort((REG)r) << " " ; 171 | //cout << REG_StringShort((REG)r) << " " ; 172 | 173 | //printf("%d ", (int)r); 174 | 175 | // check to see if this register is already in the list 176 | int already_found = 0; 177 | for(int i=0; i "; 213 | //cout << "<" << REG_StringShort((REG)r) << "> "; 214 | 215 | //printf("<%d> ", (int)r); 216 | 217 | int already_found = 0; 218 | for(int i=0; i -- ... 395 | */ 396 | int main(int argc, char *argv[]) 397 | { 398 | // Initialize PIN library. Print help message if -h(elp) is specified 399 | // in the command line or the command line is invalid 400 | if( PIN_Init(argc,argv) ) 401 | return Usage(); 402 | 403 | const char* fileName = KnobOutputFile.Value().c_str(); 404 | 405 | out = fopen(fileName, "ab"); 406 | if (!out) 407 | { 408 | cout << "Couldn't open output trace file. Exiting." 
<< endl; 409 | exit(1); 410 | } 411 | 412 | // Register function to be called to instrument instructions 413 | INS_AddInstrumentFunction(Instruction, 0); 414 | 415 | // Register function to be called when the application exits 416 | PIN_AddFiniFunction(Fini, 0); 417 | 418 | //cerr << "===============================================" << endl; 419 | //cerr << "This application is instrumented by the Champsim Trace Generator" << endl; 420 | //cerr << "Trace saved in " << KnobOutputFile.Value() << endl; 421 | //cerr << "===============================================" << endl; 422 | 423 | // Start the program, never returns 424 | PIN_StartProgram(); 425 | 426 | return 0; 427 | } 428 | 429 | /* ===================================================================== */ 430 | /* eof */ 431 | /* ===================================================================== */ 432 | -------------------------------------------------------------------------------- /tracer/clean_tracer.sh: -------------------------------------------------------------------------------- 1 | export PIN_ROOT=/home/grads/c/cienlux/task/pin-3.2-81205-gcc-linux 2 | make clean 3 | -------------------------------------------------------------------------------- /tracer/make_tracer.sh: -------------------------------------------------------------------------------- 1 | export PIN_ROOT=/your/pin/directory/ 2 | mkdir -p obj-intel64 3 | make obj-intel64/champsim_tracer.so 4 | -------------------------------------------------------------------------------- /tracer/makefile: -------------------------------------------------------------------------------- 1 | ############################################################## 2 | # 3 | # DO NOT EDIT THIS FILE! 4 | # 5 | ############################################################## 6 | 7 | # If the tool is built out of the kit, PIN_ROOT must be specified in the make invocation and point to the kit root. 
8 | ifdef PIN_ROOT 9 | CONFIG_ROOT := $(PIN_ROOT)/source/tools/Config 10 | else 11 | CONFIG_ROOT := ../Config 12 | endif 13 | include $(CONFIG_ROOT)/makefile.config 14 | include makefile.rules 15 | include $(TOOLS_ROOT)/Config/makefile.default.rules 16 | 17 | ############################################################## 18 | # 19 | # DO NOT EDIT THIS FILE! 20 | # 21 | ############################################################## 22 | -------------------------------------------------------------------------------- /tracer/makefile.rules: -------------------------------------------------------------------------------- 1 | ############################################################## 2 | # 3 | # This file includes all the test targets as well as all the 4 | # non-default build rules and test recipes. 5 | # 6 | ############################################################## 7 | 8 | 9 | ############################################################## 10 | # 11 | # Test targets 12 | # 13 | ############################################################## 14 | 15 | ###### Place all generic definitions here ###### 16 | 17 | # This defines tests which run tools of the same name. This is simply for convenience to avoid 18 | # defining the test name twice (once in TOOL_ROOTS and again in TEST_ROOTS). 19 | # Tests defined here should not be defined in TOOL_ROOTS and TEST_ROOTS. 20 | TEST_TOOL_ROOTS := champsim_tracer 21 | 22 | # This defines the tests to be run that were not already defined in TEST_TOOL_ROOTS. 23 | TEST_ROOTS := 24 | 25 | # This defines a list of tests that should run in the "short" sanity. Tests in this list must also 26 | # appear either in the TEST_TOOL_ROOTS or the TEST_ROOTS list. 27 | # If the entire directory should be tested in sanity, assign TEST_TOOL_ROOTS and TEST_ROOTS to the 28 | # SANITY_SUBSET variable in the tests section below (see example in makefile.rules.tmpl).
29 | SANITY_SUBSET := 30 | 31 | # This defines the tools which will be run during the tests, and were not already defined in 32 | # TEST_TOOL_ROOTS. 33 | TOOL_ROOTS := 34 | 35 | # This defines the static analysis tools which will be run during the tests. They should not 36 | # be defined in TEST_TOOL_ROOTS. If a test with the same name exists, it should be defined in 37 | # TEST_ROOTS. 38 | # Note: Static analysis tools are in fact executables linked with the Pin Static Analysis Library. 39 | # This library provides a subset of the Pin APIs which allows the tool to perform static analysis 40 | # of an application or dll. Pin itself is not used when this tool runs. 41 | SA_TOOL_ROOTS := 42 | 43 | # This defines all the applications that will be run during the tests. 44 | APP_ROOTS := 45 | 46 | # This defines any additional object files that need to be compiled. 47 | OBJECT_ROOTS := 48 | 49 | # This defines any additional dlls (shared objects), other than the pintools, that need to be compiled. 50 | DLL_ROOTS := 51 | 52 | # This defines any static libraries (archives), that need to be built. 53 | LIB_ROOTS := 54 | 55 | 56 | ############################################################## 57 | # 58 | # Test recipes 59 | # 60 | ############################################################## 61 | 62 | # This section contains recipes for tests other than the default. 63 | # See makefile.default.rules for the default test rules. 64 | # All tests in this section should adhere to the naming convention: <testname>.test 65 | 66 | 67 | ############################################################## 68 | # 69 | # Build rules 70 | # 71 | ############################################################## 72 | 73 | # This section contains the build rules for all binaries that have special build rules. 74 | # See makefile.default.rules for the default build rules. 75 | --------------------------------------------------------------------------------