├── .gitignore
├── LICENSE
├── Makefile
├── README.md
├── branch
    ├── bimodal.bpred
    ├── gshare.bpred
    ├── hashed_perceptron.bpred
    └── perceptron.bpred
├── build_champsim.sh
├── cvp_tracer
    ├── README.md
    └── cvp2champsim.cc
├── download.sh
├── download_links
├── get_stats.py
├── inc
    ├── block.h
    ├── cache.h
    ├── champsim.h
    ├── dram_controller.h
    ├── instruction.h
    ├── kpcp.h
    ├── memory_class.h
    ├── ooo_cpu.h
    ├── set.h
    ├── spp_dev.h
    └── uncore.h
├── ml_prefetch_sim.py
├── model.py
├── prefetcher
    ├── bo.h
    ├── bo.llc_pref
    ├── from_file.llc_pref
    ├── ip_stride.l2c_pref
    ├── kpcp.l2c_pref
    ├── kpcp_util.cc
    ├── next_line.l1d_pref
    ├── next_line.l1i_pref
    ├── next_line.l2c_pref
    ├── next_line.llc_pref
    ├── no.l1d_pref
    ├── no.l1i_pref
    ├── no.l2c_pref
    ├── no.llc_pref
    ├── spp_dev.l2c_pref
    └── trace.llc_pref
├── replacement
    ├── base_replacement.cc
    ├── drrip.llc_repl
    ├── lru.llc_repl
    ├── ship.llc_repl
    └── srrip.llc_repl
├── run_4core.sh
├── run_champsim.sh
├── scripts
    ├── download_dpc3_traces.sh
    ├── dpc3_max_simpoint.txt
    ├── multiworkload.cc
    └── seeds.txt
├── src
    ├── block.cc
    ├── cache.cc
    ├── dram_controller.cc
    ├── main.cc
    ├── ooo_cpu.cc
    └── uncore.cc
└── tracer
    ├── champsim_tracer.cpp
    ├── clean_tracer.sh
    ├── make_tracer.sh
    ├── makefile
    └── makefile.rules


/.gitignore:
--------------------------------------------------------------------------------
 1 | prefetcher/l1i_prefetcher.cc
 2 | prefetcher/l1d_prefetcher.cc
 3 | prefetcher/l2c_prefetcher.cc
 4 | prefetcher/llc_prefetcher.cc
 5 | branch/branch_predictor.cc
 6 | replacement/llc_replacement.cc
 7 | 
 8 | inc/champsim.h.bak
 9 | 
10 | bin/
11 | obj/
12 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | app = champsim
 2 | 
 3 | srcExt = cc
 4 | srcDir = src branch replacement prefetcher
 5 | objDir = obj
 6 | binDir = bin
 7 | inc = inc
 8 | 
 9 | debug = 1
10 | 
11 | CFlags = -Wall -O3 -std=c++11
12 | LDFlags =
13 | libs =
14 | libDir =
15 | 
16 | 
17 | #************************ DO NOT EDIT BELOW THIS LINE! ************************
18 | 
# Translate the debug switch into a compiler flag.  A separate variable is
# used on purpose: a command-line `make debug=0` (or debug=1) overrides any
# assignment to `debug` made in this file, so reusing `debug` for the flag
# would leak a literal 0 or 1 into the compiler command line.
ifeq ($(strip $(debug)),1)
  debugFlag := -g
else
  debugFlag :=
endif

# Turn the plain names from the configuration section into -I / -l / -L
# options and append everything needed for a compile-only invocation.
inc := $(addprefix -I,$(inc))
libs := $(addprefix -l,$(libs))
libDir := $(addprefix -L,$(libDir))
CFlags += -c $(debugFlag) $(inc) $(libDir) $(libs)

# All sources below $(srcDir) and the object file each one maps to.
sources := $(shell find $(srcDir) -name '*.$(srcExt)')
# Every directory below . that holds a source file.  $(sort) removes
# duplicates no matter in which order find reports them (uniq only drops
# adjacent repeats).
srcDirs := $(sort $(shell find . -name '*.$(srcExt)' -exec dirname {} \;))
objects := $(patsubst %.$(srcExt),$(objDir)/%.o,$(sources))

# .cc sources are built with the C++ driver; any other extension is treated
# as C.  The bodies are indented with spaces so they can never be mistaken
# for recipe lines.
ifeq ($(srcExt),cc)
  CC = $(CXX)
else
  CFlags += -std=gnu99
endif
37 | 
# Targets that name actions rather than files.  (The special target is
# spelled in upper case; a lower-case ".phony" is just an ordinary target.)
.PHONY: all clean distclean buildrepo

# Default goal: build the simulator binary.
all: $(binDir)/$(app)

# Link all objects into the final executable.  The binary depends only on
# real files, so it is relinked only when an object actually changed.
# Library search paths and libraries are passed here because they only take
# effect at link time.
$(binDir)/$(app): $(objects)
	@mkdir -p $(@D)
	@echo "Linking $@..."
	@$(CC) $(objects) $(LDFlags) $(libDir) $(libs) -o $@

# Compile one source file.  The output directory is created on demand, and a
# .d file listing the headers the source includes is written next to the
# object ($(@:.o=.d) swaps only the trailing .o of the target name).
$(objDir)/%.o: %.$(srcExt)
	@mkdir -p $(@D)
	@echo "Generating dependencies for $<..."
	@$(call make-depend,$<,$@,$(@:.o=.d))
	@echo "Compiling $<..."
	@$(CC) $(CFlags) $< -o $@

# Remove all object and dependency files.
clean:
	$(RM) -r $(objDir)

# Additionally remove the linked binary.
distclean: clean
	$(RM) -r $(binDir)/$(app)

# Pre-create the object directory tree (kept for callers that invoke it
# directly; the compile rule no longer relies on it).
buildrepo:
	@$(call make-repo)

# Pull in the generated header dependencies so that editing a header
# rebuilds the objects that include it.  The leading '-' keeps the first
# build quiet, when no .d file exists yet.  This stays below the rules so an
# included file can never become the default goal.
-include $(objects:.o=.d)
62 | 
# Shell snippet that creates, below $(objDir), one directory for every entry
# of $(srcDirs).
define make-repo
   for srcSubdir in $(srcDirs); do mkdir -p $(objDir)/$$srcSubdir; done
endef


# usage: $(call make-depend,source-file,object-file,depend-file)
# Runs the compiler in dependency-only mode (-MM) for source-file and writes
# the resulting rule, with object-file as its target (-MT), into depend-file
# (-MF).  -MP additionally emits an empty rule per header.
define make-depend
  $(CC) -MM -MP -MT $2 -MF $3 $(CFlags) $1
endef
80 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Modified ChampSim for ML Prefetching Competition
  2 | 
  3 | We will use ChampSim to evaluate the effectiveness of your ML prefetchers.  Your
  4 | prefetching models will be trained using the Load Traces that we provide (details below), 
  5 | and they will generate an Output File with a list of prefetches that will be fed back into 
  6 | ChampSim to compute coverage, accuracy and instructions per cycle (IPC).
  7 | 
  8 | ## Traces:
  9 | 
 10 | The traces can be found at [this link](https://utexas.box.com/s/2k54kp8zvrqdfaa8cdhfquvcxwh7yn85).
 11 | Alternatively, the `download.sh` file can be used to download all of the files to
 12 | avoid bulk download restrictions from Box. You can also use the information found
 13 | in the `download_links` file to download the data in another fashion.
 14 | 
 15 | There are two types of traces that can be found here:
 16 | - Load traces under the folder LoadTraces that you will use to train your ML models.  The 
 17 |   load trace is a series of program's LLC accesses, and the trace format is as follows: 
 18 | ```
 19 | Unique Instr Id, Cycle Count, Load Address, Instruction Pointer of the Load, LLC hit/miss
 20 | ```
 21 |   The load traces are plain text CSV.
 22 | 
 23 | - Execution traces under the folder ChampSimTraces that ChampSim will need to
 24 |   compute IPC.  You do not need these traces to train your models, they are
 25 |   only provided to facilitate an evaluation using IPCs.  Note that you should not
 26 |   unzip the execution traces, as ChampSim expects them to be in the zipped format.
 27 | 
 28 | ## Output File
 29 | 
 30 | For a given Load Trace, your code should generate an output file that contains one
 31 | prefetch per line.  Each line should consist of two space-separated integral
 32 | values, the unique instruction ID for which you want to issue a prefetch and the
 33 | load address you want to prefetch.  The unique instruction ID corresponds to
 34 | the ID of the triggering load in the input Load Trace.  You can include up to two 
 35 | prefetches per load listed in the Load Trace.  You can choose not to prefetch
 36 | for a load.  Note that the prefetches should be in the order that they occur in the trace.
 37 | Should you exceed the maximum number of prefetches per load, the first two will
 38 | be kept and the remaining excess prefetches for that load will be discarded.
 39 | 
 40 | For example, consider a Load Trace as follows:
 41 | ```
 42 | 3659 cycle1 A ip1 1
 43 | 5433 cycle2 B ip2 0
 44 | 6928 cycle3 C ip3 0
 45 | ```
 46 | 
 47 | Your output file could look something like this:
 48 | ```
 49 | 3659 A+1    # Issue first prefetch for Instruction 3659
 50 | 3659 A+2    # Issue second prefetch for Instruction 3659
 51 | 5433 B+8    # Issue only one prefetch for Instruction 5433
 52 | ```
 53 | 
 54 | ## Your Code:
 55 | 
 56 | Your code should have two modes of functioning:
 57 | 
 58 | 1. Taking in a Training Load Trace that your model trains on
 59 | 2. Taking in a Test Load Trace for which your model will produce predictions in
 60 |    the format explained above.
 61 | 
 62 | ## Building, Running, and Evaluating
 63 | 
 64 | This has been rolled into one script `ml_prefetch_sim.py`. Below there are some
 65 | common use cases highlighted, but more information can be found for each of the
 66 | subcommands by running:
 67 | 
 68 | ```
 69 | ./ml_prefetch_sim.py help subcommand
 70 | ```
 71 | 
 72 | where subcommand is any of `build|run|eval`
 73 | 
 74 | ### Building
 75 | 
 76 | The following command will compile two ChampSim binaries: (1) A ChampSim binary
 77 | that reads your ML model's output from a file and uses that as a prefetcher,
 78 | and (2) A ChampSim binary with no prefetching that is to be used as a baseline
 79 | 
 80 | ```
 81 | ./ml_prefetch_sim.py build
 82 | ```
 83 | 
 84 | ### Training
 85 | 
 86 | ```
 87 | ./ml_prefetch_sim.py train path_to_load_trace --model save_path --num-prefetch-warmup-instructions num_in_millions
 88 | ```
 89 | 
 90 | To use the above, you need to modify the `model.py` file with your model. The
 91 | prefetch warm-up instructions specify how many to include in the training set.
 92 | The remainder of the instructions are the evaluation set.
 93 | 
 94 | ### Generating the Prefetch File
 95 | 
 96 | ```
 97 | ./ml_prefetch_sim.py generate path_to_load_trace path_to_output_prefetch_file --model save_path --num-prefetch-warmup-instructions num_in_millions
 98 | ```
 99 | 
100 | To use the above, you need to modify the `model.py` file with your model. The
101 | prefetch warm-up instructions specify how many to include in the training set.
102 | The remainder of the instructions are the evaluation set.
103 | 
104 | ### Running
105 | 
106 | To run the baseline ChampSim binaries on an execution trace:
107 | 
108 | ```
109 | ./ml_prefetch_sim.py run path_to_champsim_trace_here
110 | ```
111 | 
112 | To additionally run the ChampSim binary with your prefetcher:
113 | 
114 | ```
115 | ./ml_prefetch_sim.py run path_to_champsim_trace_here --prefetch path_to_prefetcher_file
116 | ```
117 | 
118 | To run the ChampSim binary with your prefetcher only:
119 | 
120 | ```
121 | ./ml_prefetch_sim.py run path_to_trace_here --prefetch path_to_prefetcher_file --no-base
122 | ```
123 | 
124 | ### Evaluation
125 | 
126 | To evaluate the performance of ML prefetcher (and compare it against the baseline
127 | of no prefetcher, Best Offset, SISB, and SISB Best Offset), run:
128 | 
129 | ```
130 | ./ml_prefetch_sim.py eval
131 | ```
132 | 
133 | ## Competition Judging
134 | 
135 | To test how submissions generalize, our test set evaluation will have two components:
136 | 
137 | - Undisclosed execution samples for the training traces: You can submit a
138 |   pre-trained model for each benchmark in the training set, and we will
139 |   evaluate it on a different sample of the same benchmark
140 | 
141 | - Undisclosed benchmarks: We will train and test your model on unseen
142 |   benchmarks using the training routines that you provide
143 | 
144 | ## Changes made to ChampSim for the competition:
145 | 
146 | - Add LLC prefetcher (from\_file) to load ML model prefetch predictions into ChampSim
147 | - Modify the LLC prefetcher to provide unique instruction IDs and cycle counts
148 | - Remove same-page restriction in src/cache.cc for more irregular prefetching
149 |   opportunity
150 | - Add ml\_prefetch\_sim.py to handle all of the building, running, and evaluation.
151 | 
152 | ---
153 | 
154 | <p align="center">
155 |   <h1 align="center"> ChampSim </h1>
156 |   <p> ChampSim is a trace-based simulator for a microarchitecture study. You can sign up to the public mailing list by sending an empty mail to champsim+subscribe@googlegroups.com. Traces for the 3rd Data Prefetching Championship (DPC-3) can be found here (https://dpc3.compas.cs.stonybrook.edu/?SW_IS). A set of traces used for the 2nd Cache Replacement Championship (CRC-2) can be found at this link (http://bit.ly/2t2nkUj). </p>
157 | </p>
158 | 
159 | # Clone ChampSim repository
160 | ```
161 | git clone https://github.com/ChampSim/ChampSim.git
162 | ```
163 | 
164 | # Compile
165 | 
166 | ChampSim takes seven parameters: Branch predictor, L1I prefetcher, L1D prefetcher, L2C prefetcher, LLC prefetcher, LLC replacement policy, and the number of cores. 
167 | For example, `./build_champsim.sh bimodal no no no no lru 1` builds a single-core processor with bimodal branch predictor, no L1I/L1D/L2C/LLC prefetchers, and the baseline LRU replacement policy for the LLC.
168 | ```
169 | $ ./build_champsim.sh bimodal no no no no lru 1
170 | 
171 | $ ./build_champsim.sh ${BRANCH} ${L1I_PREFETCHER} ${L1D_PREFETCHER} ${L2C_PREFETCHER} ${LLC_PREFETCHER} ${LLC_REPLACEMENT} ${NUM_CORE}
172 | ```
173 | 
174 | # Download DPC-3 trace
175 | 
176 | Professor Daniel Jimenez at Texas A&M University kindly provided traces for DPC-3. Use the following script to download these traces (~20GB size and max simpoint only).
177 | ```
178 | $ cd scripts
179 | 
180 | $ ./download_dpc3_traces.sh
181 | ```
182 | 
183 | # Run simulation
184 | 
185 | Execute `run_champsim.sh` with proper input arguments. The default `TRACE_DIR` in `run_champsim.sh` is set to `$PWD/dpc3_traces`. <br>
186 | 
187 | * Single-core simulation: Run simulation with `run_champsim.sh` script.
188 | 
189 | ```
190 | Usage: ./run_champsim.sh [BINARY] [N_WARM] [N_SIM] [TRACE] [OPTION]
191 | $ ./run_champsim.sh bimodal-no-no-no-no-lru-1core 1 10 400.perlbench-41B.champsimtrace.xz
192 | 
193 | ${BINARY}: ChampSim binary compiled by "build_champsim.sh" (bimodal-no-no-no-no-lru-1core)
194 | ${N_WARM}: number of instructions for warmup (1 million)
195 | ${N_SIM}:  number of instructions for detailed simulation (10 million)
196 | ${TRACE}: trace name (400.perlbench-41B.champsimtrace.xz)
197 | ${OPTION}: extra option for "-low_bandwidth" (src/main.cc)
198 | ```
199 | Simulation results will be stored under "results_${N_SIM}M" as a form of "${TRACE}-${BINARY}-${OPTION}.txt".<br> 
200 | 
201 | * Multi-core simulation: Run simulation with `run_4core.sh` script. <br>
202 | ```
203 | Usage: ./run_4core.sh [BINARY] [N_WARM] [N_SIM] [N_MIX] [TRACE0] [TRACE1] [TRACE2] [TRACE3] [OPTION]
204 | $ ./run_4core.sh bimodal-no-no-no-no-lru-4core 1 10 0 400.perlbench-41B.champsimtrace.xz \
205 |   401.bzip2-38B.champsimtrace.xz 403.gcc-17B.champsimtrace.xz 410.bwaves-945B.champsimtrace.xz
206 | ```
207 | Note that we need to specify multiple trace files for `run_4core.sh`. `N_MIX` is used to represent a unique ID for mixed multi-programmed workloads. 
208 | 
209 | 
210 | # Add your own branch predictor, data prefetchers, and replacement policy
211 | **Copy an empty template**
212 | ```
213 | $ cp branch/branch_predictor.cc branch/mybranch.bpred
214 | $ cp prefetcher/l1d_prefetcher.cc prefetcher/mypref.l1d_pref
215 | $ cp prefetcher/l2c_prefetcher.cc prefetcher/mypref.l2c_pref
216 | $ cp prefetcher/llc_prefetcher.cc prefetcher/mypref.llc_pref
217 | $ cp replacement/llc_replacement.cc replacement/myrepl.llc_repl
218 | ```
219 | 
220 | **Work on your algorithms with your favorite text editor**
221 | ```
222 | $ vim branch/mybranch.bpred
223 | $ vim prefetcher/mypref.l1d_pref
224 | $ vim prefetcher/mypref.l2c_pref
225 | $ vim prefetcher/mypref.llc_pref
226 | $ vim replacement/myrepl.llc_repl
227 | ```
228 | 
229 | **Compile and test**
230 | ```
231 | $ ./build_champsim.sh mybranch no mypref mypref mypref myrepl 1
232 | $ ./run_champsim.sh mybranch-no-mypref-mypref-mypref-myrepl-1core 1 10 bzip2_183B
233 | ```
234 | 
235 | # How to create traces
236 | 
237 | We have included only 4 sample traces, taken from SPEC CPU 2006. These 
238 | traces are short (10 million instructions), and do not necessarily cover the range of behaviors your 
239 | replacement algorithm will likely see in the full competition trace list (not
240 | included).  We STRONGLY recommend creating your own traces, covering
241 | a wide variety of program types and behaviors.
242 | 
243 | The included Pin Tool champsim_tracer.cpp can be used to generate new traces.
244 | We used Pin 3.2 (pin-3.2-81205-gcc-linux), and it may require 
245 | installing libdwarf.so, libelf.so, or other libraries, if you do not already 
246 | have them. Please refer to the Pin documentation (https://software.intel.com/sites/landingpage/pintool/docs/81205/Pin/html/)
247 | for working with Pin 3.2.
248 | 
249 | Get this version of Pin:
250 | ```
251 | wget http://software.intel.com/sites/landingpage/pintool/downloads/pin-3.2-81205-gcc-linux.tar.gz
252 | ```
253 | 
254 | **Note on compatibility**: If you are using newer linux kernels/Ubuntu versions (eg. 20.04LTS), you might run into issues (such as [[1](https://github.com/ChampSim/ChampSim/issues/102)],[[2](https://stackoverflow.com/questions/55698095/intel-pin-tools-32-bit-processsectionheaders-560-assertion-failed)],[[3](https://stackoverflow.com/questions/43589174/pin-tool-segmentation-fault-for-ubuntu-17-04)]) with the PIN3.2. ChampSim tracer works fine with newer PIN tool versions that can be downloaded from [here](https://software.intel.com/content/www/us/en/develop/articles/pin-a-binary-instrumentation-tool-downloads.html). PIN3.17 is [confirmed](https://github.com/ChampSim/ChampSim/issues/102) to work with Ubuntu 20.04.1 LTS.
255 | 
256 | Once downloaded, open tracer/make_tracer.sh and change PIN_ROOT to Pin's location.
257 | Run ./make_tracer.sh to generate champsim_tracer.so.
258 | 
259 | **Use the Pin tool like this**
260 | ```
261 | pin -t obj-intel64/champsim_tracer.so -- <your program here>
262 | ```
263 | 
264 | The tracer has three options you can set:
265 | ```
266 | -o
267 | Specify the output file for your trace.
268 | The default is default_trace.champsim
269 | 
270 | -s <number>
271 | Specify the number of instructions to skip in the program before tracing begins.
272 | The default value is 0.
273 | 
274 | -t <number>
275 | The number of instructions to trace, after -s instructions have been skipped.
276 | The default value is 1,000,000.
277 | ```
278 | For example, you could trace 200,000 instructions of the program ls, after
279 | skipping the first 100,000 instructions, with this command:
280 | ```
281 | pin -t obj/champsim_tracer.so -o traces/ls_trace.champsim -s 100000 -t 200000 -- ls
282 | ```
283 | Traces created with the champsim_tracer.so are approximately 64 bytes per instruction,
284 | but they generally compress down to less than a byte per instruction using xz compression.
285 | 
286 | # Evaluate Simulation
287 | 
288 | ChampSim measures the IPC (Instructions Per Cycle) value as a performance metric. <br>
289 | There are some other useful metrics printed out at the end of simulation. <br>
290 | 
291 | Good luck and be a champion! <br>
292 | 


--------------------------------------------------------------------------------
/branch/bimodal.bpred:
--------------------------------------------------------------------------------
 1 | #include "ooo_cpu.h"
 2 | 
 3 | #define BIMODAL_TABLE_SIZE 16384
 4 | #define BIMODAL_PRIME 16381
 5 | #define MAX_COUNTER 3
 6 | int bimodal_table[NUM_CPUS][BIMODAL_TABLE_SIZE];
 7 | 
 8 | void O3_CPU::initialize_branch_predictor()
 9 | {
10 |     cout << "CPU " << cpu << " Bimodal branch predictor" << endl;
11 | 
12 |     for(int i = 0; i < BIMODAL_TABLE_SIZE; i++)
13 |         bimodal_table[cpu][i] = 0;
14 | }
15 | 
16 | uint8_t O3_CPU::predict_branch(uint64_t ip)
17 | {
18 |     uint32_t hash = ip % BIMODAL_PRIME;
19 |     uint8_t prediction = (bimodal_table[cpu][hash] >= ((MAX_COUNTER + 1)/2)) ? 1 : 0;
20 | 
21 |     return prediction;
22 | }
23 | 
24 | void O3_CPU::last_branch_result(uint64_t ip, uint8_t taken)
25 | {
26 |     uint32_t hash = ip % BIMODAL_PRIME;
27 | 
28 |     if (taken && (bimodal_table[cpu][hash] < MAX_COUNTER))
29 |         bimodal_table[cpu][hash]++;
30 |     else if ((taken == 0) && (bimodal_table[cpu][hash] > 0))
31 |         bimodal_table[cpu][hash]--;
32 | }
33 | 


--------------------------------------------------------------------------------
/branch/gshare.bpred:
--------------------------------------------------------------------------------
 1 | #include "ooo_cpu.h"
 2 | 
 3 | #define GLOBAL_HISTORY_LENGTH 14
 4 | #define GLOBAL_HISTORY_MASK (1 << GLOBAL_HISTORY_LENGTH) - 1
 5 | int branch_history_vector[NUM_CPUS];
 6 | 
 7 | #define GS_HISTORY_TABLE_SIZE 16384
 8 | int gs_history_table[NUM_CPUS][GS_HISTORY_TABLE_SIZE];
 9 | int my_last_prediction[NUM_CPUS];
10 | 
11 | void O3_CPU::initialize_branch_predictor()
12 | {
13 |     cout << "CPU " << cpu << " GSHARE branch predictor" << endl;
14 | 
15 |     branch_history_vector[cpu] = 0;
16 |     my_last_prediction[cpu] = 0;
17 | 
18 |     for(int i=0; i<GS_HISTORY_TABLE_SIZE; i++)
19 |         gs_history_table[cpu][i] = 2; // 2 is slightly taken
20 | }
21 | 
22 | unsigned int gs_table_hash(uint64_t ip, int bh_vector)
23 | {
24 |     unsigned int hash = ip^(ip>>GLOBAL_HISTORY_LENGTH)^(ip>>(GLOBAL_HISTORY_LENGTH*2))^bh_vector;
25 |     hash = hash%GS_HISTORY_TABLE_SIZE;
26 | 
27 |     //printf("%d\n", hash);
28 | 
29 |     return hash;
30 | }
31 | 
32 | uint8_t O3_CPU::predict_branch(uint64_t ip)
33 | {
34 |     int prediction = 1;
35 | 
36 |     int gs_hash = gs_table_hash(ip, branch_history_vector[cpu]);
37 | 
38 |     if(gs_history_table[cpu][gs_hash] >= 2)
39 |         prediction = 1;
40 |     else
41 |         prediction = 0;
42 | 
43 |     my_last_prediction[cpu] = prediction;
44 | 
45 |     return prediction;
46 | }
47 | 
48 | void O3_CPU::last_branch_result(uint64_t ip, uint8_t taken)
49 | {
50 |     int gs_hash = gs_table_hash(ip, branch_history_vector[cpu]);
51 | 
52 |     if(taken == 1) {
53 |         if(gs_history_table[cpu][gs_hash] < 3)
54 |             gs_history_table[cpu][gs_hash]++;
55 |     } else {
56 |         if(gs_history_table[cpu][gs_hash] > 0)
57 |             gs_history_table[cpu][gs_hash]--;
58 |     }
59 | 
60 |     // update branch history vector
61 |     branch_history_vector[cpu] <<= 1;
62 |     branch_history_vector[cpu] &= GLOBAL_HISTORY_MASK;
63 |     branch_history_vector[cpu] |= taken;
64 | }
65 | 


--------------------------------------------------------------------------------
/branch/hashed_perceptron.bpred:
--------------------------------------------------------------------------------
  1 | /*
  2 | 
  3 | This code implements a hashed perceptron branch predictor using geometric
  4 | history lengths and dynamic threshold setting.
  5 | 
  6 | It was written by Daniel A. Jiménez in March 2019. To the extent allowed by
  7 | law, the author abdicates all rights to this work and places it in the public
  8 | domain.
  9 | 
 10 | The original perceptron branch predictor is from Jiménez and Lin, "Dynamic
 11 | Branch Prediction with Perceptrons," HPCA 2001.
 12 | 
 13 | The idea of using multiple independently indexed tables of perceptron weights
 14 | is from Jiménez, "Fast Path-Based Neural Branch Prediction," MICRO 2003 and
 15 | later expanded in "Piecewise Linear Branch Prediction" from ISCA 2005.
 16 | 
 17 | The idea of using hashes of branch history to reduce the number of independent
 18 | tables is documented in three contemporaneous papers:
 19 | 
 20 | 1. Seznec, "Revisiting the Perceptron Predictor," IRISA technical report, 2004.
 21 | 
 22 | 2. Tarjan and Skadron, "Revisiting the Perceptron Predictor Again," UVA
 23 | technical report, 2004, expanded and published in ACM TACO 2005 as "Merging
 24 | path and gshare indexing in perceptron branch prediction"; introduces the term
 25 | "hashed perceptron."
 26 | 
 27 | 3. Loh and Jiménez, "Reducing the Power and Complexity of Path-Based Neural
 28 | Branch Prediction," WCED 2005.
 29 | 
 30 | The ideas of using "geometric history lengths" i.e. hashing into tables with
 31 | histories of exponentially increasing length, as well as dynamically adjusting
 32 | the theta parameter, are from Seznec, "The O-GEHL Branch Predictor," from CBP
 33 | 2004, expanded later as "Analysis of the O-GEometric History Length Branch
 34 | Predictor" in ISCA 2005.
 35 | 
 36 | This code uses these ideas, but prefers simplicity over absolute accuracy (I
 37 | wrote it in about an hour and later spent more time on this comment block than
 38 | I did on the code). These papers and subsequent papers by Jiménez and other
 39 | authors significantly improve the accuracy of perceptron-based predictors but
 40 | involve tricks and analysis beyond the needs of a tool like ChampSim that
 41 | targets cache optimizations. If you want accuracy at any cost, see the winners
 42 | of the latest branch prediction contest, CBP 2016 as of this writing, but
 43 | prepare to have your face melted off by the complexity of the code you find
 44 | there. If you are a student being asked to code a good branch predictor for
 45 | your computer architecture class, don't copy this code; there are much better
 46 | sources for you to plagiarize.
 47 | 
 48 | */
 49 | 
 50 | #include <stdio.h>
 51 | #include <string.h>
 52 | #include <math.h>
 53 | #include <stdlib.h>
 54 | 
 55 | #include "ooo_cpu.h"
 56 | 
 57 | // this many tables
 58 | 
 59 | #define NTABLES	16
 60 | 
 61 | // maximum history length
 62 | 
 63 | #define MAXHIST	232
 64 | 
 65 | // minimum history length (for table 1; table 0 is biases)
 66 | 
 67 | #define MINHIST	3
 68 | 
 69 | // speed for dynamic threshold setting
 70 | 
 71 | #define SPEED	18
 72 | 
 73 | // geometric global history lengths
 74 | 
 75 | int history_lengths[NTABLES] = { 0, 3, 4, 6, 8, 10, 14, 19, 26, 36, 49, 67, 91, 125, 170, MAXHIST };
 76 | 
 77 | // 12-bit indices for the tables
 78 | 
 79 | #define LOG_TABLE_SIZE	12
 80 | #define TABLE_SIZE	(1<<LOG_TABLE_SIZE)
 81 | 
 82 | // this many 12-bit words will be kept in the global history
 83 | 
 84 | #define NGHIST_WORDS	(MAXHIST/LOG_TABLE_SIZE+1)
 85 | 
 86 | // tables of 8-bit weights
 87 | 
 88 | int tables[NUM_CPUS][NTABLES][TABLE_SIZE];
 89 | 
 90 | // words that store the global history
 91 | 
 92 | unsigned int ghist_words[NUM_CPUS][NGHIST_WORDS];
 93 | 
 94 | // remember the indices into the tables from prediction to update
 95 | 
 96 | unsigned int indices[NUM_CPUS][NTABLES];
 97 | 
 98 | // initialize theta to something reasonable, 
 99 | int 
100 | 	theta[NUM_CPUS], 
101 | 
102 | // initialize counter for threshold setting algorithm
103 | 	tc[NUM_CPUS], 	
104 | 
105 | // perceptron sum
106 | 	yout[NUM_CPUS];
107 | 
108 | void O3_CPU::initialize_branch_predictor () {
109 | 	// zero out the weights tables
110 | 
111 | 	memset (tables, 0, sizeof (tables));
112 | 
113 | 	// zero out the global history
114 | 
115 | 	memset (ghist_words, 0, sizeof (ghist_words));
116 | 
117 | 	// make a reasonable theta
118 | 
119 | 	for (int i=0; i<NUM_CPUS; i++) theta[i] = 10;
120 | }
121 | 
122 | uint8_t O3_CPU::predict_branch(uint64_t pc) {
123 | 
124 | 	// initialize perceptron sum
125 | 
126 | 	yout[cpu] = 0;
127 | 
128 | 	// for each table...
129 | 
130 | 	for (int i=0; i<NTABLES; i++) {
131 | 
132 | 		// n is the history length for this table
133 | 
134 | 		int n = history_lengths[i];
135 | 
136 | 		// hash global history bits 0..n-1 into x by XORing the words from the ghist_words array
137 | 
138 | 		unsigned int x = 0;
139 | 
140 | 		// most of the words are 12 bits long
141 | 
142 | 		int most_words = n / LOG_TABLE_SIZE;
143 | 
144 | 		// the last word is fewer than 12 bits
145 | 
146 | 		int last_word = n % LOG_TABLE_SIZE;
147 | 
148 | 		// XOR up to the next-to-the-last word
149 | 
150 | 		int j;
151 | 		for (j=0; j<most_words; j++) x ^= ghist_words[cpu][j];
152 | 
153 | 		// XOR in the last word
154 | 
155 | 		x ^= ghist_words[cpu][j] & ((1<<last_word)-1);
156 | 
157 | 		// XOR in the PC to spread accesses around (like gshare)
158 | 
159 | 		x ^= pc;
160 | 
161 | 		// stay within the table size
162 | 
163 | 		x &= TABLE_SIZE-1;
164 | 
165 | 		// remember this index for update
166 | 
167 | 		indices[cpu][i] = x;
168 | 
169 | 		// add the selected weight to the perceptron sum
170 | 
171 | 		yout[cpu] += tables[cpu][i][x];
172 | 	}
173 | 	return yout[cpu] >= 1;
174 | }
175 | 
176 | void O3_CPU::last_branch_result(uint64_t pc, uint8_t taken) {
177 | 
178 | 	// was this prediction correct?
179 | 
180 | 	bool correct = taken == (yout[cpu] >= 1);
181 | 
182 | 	// insert this branch outcome into the global history
183 | 
184 | 	bool b = taken;
185 | 	for (int i=0; i<NGHIST_WORDS; i++) {
186 | 
187 | 		// shift b into the lsb of the current word
188 | 
189 | 		ghist_words[cpu][i] <<= 1;
190 | 		ghist_words[cpu][i] |= b;
191 | 
192 | 		// get b as the previous msb of the current word
193 | 
194 | 		b = !!(ghist_words[cpu][i] & TABLE_SIZE);
195 | 		ghist_words[cpu][i] &= TABLE_SIZE-1;
196 | 	}
197 | 
198 | 	// get the magnitude of yout
199 | 
200 | 	int a = (yout[cpu] < 0) ? -yout[cpu] : yout[cpu];
201 | 
202 | 	// perceptron learning rule: train if misprediction or weak correct prediction
203 | 
204 | 	if (!correct || a < theta[cpu]) {
205 | 		// update weights
206 | 		for (int i=0; i<NTABLES; i++) {
207 | 			// which weight did we use to compute yout?
208 | 
209 | 			int *c = &tables[cpu][i][indices[cpu][i]];
210 | 
211 | 			// increment if taken, decrement if not, saturating at 127/-128
212 | 
213 | 			if (taken) {
214 | 				if (*c < 127) (*c)++;
215 | 			} else {
216 | 				if (*c > -128) (*c)--;
217 | 			}
218 | 		}
219 | 
220 | 		// dynamic threshold setting from Seznec's O-GEHL paper
221 | 
222 | 		if (!correct) {
223 | 
224 | 			// increase theta after enough mispredictions
225 | 
226 | 			tc[cpu]++;
227 | 			if (tc[cpu] >= SPEED) {
228 | 				theta[cpu]++;
229 | 				tc[cpu] = 0;
230 | 			}
231 | 		} else if (a < theta[cpu]) {
232 | 
233 | 			// decrease theta after enough weak but correct predictions
234 | 
235 | 			tc[cpu]--;
236 | 			if (tc[cpu] <= -SPEED) {
237 | 				theta[cpu]--;
238 | 				tc[cpu] = 0;
239 | 			}
240 | 		}
241 | 	}
242 | }
243 | 


--------------------------------------------------------------------------------
/branch/perceptron.bpred:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2001 University of Texas at Austin
  3 |  *
  4 |  * Daniel A. Jimenez
  5 |  * Calvin Lin
  6 |  *
  7 |  * Permission is hereby granted, free of charge, to any person 
  8 |  * obtaining a copy of this software (the "Software"), to deal in
  9 |  * the Software without restriction, including without limitation 
 10 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
 11 |  * and/or sell copies of the Software, and to permit persons to whom the 
 12 |  * Software is furnished to do so, subject to the following conditions:
 13 |  *
 14 |  * The above copyright notice and this permission notice shall be
 15 |  * included in all copies or substantial portions of the Software.
 16 |  *
 17 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 18 |  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 19 |  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 20 |  * NONINFRINGEMENT.  IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT
 21 |  * AUSTIN BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
 22 |  * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
 23 |  * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 24 |  * THE SOFTWARE.
 25 |  *
 26 |  * This file implements the simulated perceptron branch predictor from:
 27 |  *
 28 |  * Jimenez, D. A. & Lin, C., Dynamic branch prediction with perceptrons,
 29 |  * Proceedings of the Seventh International Symposium on High Performance
 30 |  * Computer Architecture (HPCA), Monterrey, NL, Mexico 2001
 31 |  *
 32 |  * The #define's here specify a perceptron predictor with a history
 33 |  * length of 24, 163 perceptrons, and  8-bit weights.  This represents
 34 |  * a hardware budget of (24+1)*8*163 = 32600 bits, or about 4K bytes,
 35 |  * which is comparable to the hardware budget of the Alpha 21264 hybrid
 36 |  * branch predictor.
 37 |  *
 * There are three important functions defined in this file.  The names
 * in parentheses are the ones used by earlier versions of this comment
 * (presumably from the SimpleScalar version of this code); here they are
 * member functions of O3_CPU:
 *
 * 1. void O3_CPU::initialize_branch_predictor()
 *    (was: void initialize_perceptron_predictor (void);)
 * Initialize the perceptron predictor
 *
 * 2. uint8_t O3_CPU::predict_branch(uint64_t ip)
 *    (was: perceptron_state *perceptron_dir_lookup (unsigned int);)
 * Get a branch prediction, given a branch address.  This function returns
 * 1 if the branch is predicted taken, 0 otherwise, and records a
 * 'perceptron_state' struct, which contains the prediction, the
 * perceptron output, and other information necessary for using and updating
 * the predictor.  The first member of a 'perceptron_state' struct is a char
 * that is assigned 3 if the branch is predicted taken, 0 otherwise; this way,
 * a pointer to 'perceptron_state' can be cast to (char *) and passed around
 * SimpleScalar as though it were a pointer to a pattern history table entry.
 *
 * 3. void O3_CPU::last_branch_result(uint64_t ip, uint8_t taken)
 *    (was: void perceptron_update (perceptron_state *, int);)
 * Update the branch predictor using the 'perceptron_state' recorded by
 * the most recent predict_branch() call on the same CPU and 'taken',
 * which is 1 if the branch was taken, 0 otherwise.
 56 |  */
 57 | 
 58 | #include "ooo_cpu.h"
 59 | 
/* history length for the global history shift register; every
 * perceptron holds PERCEPTRON_HISTORY+1 weights (one bias weight plus
 * one weight per history bit)
 */

#define PERCEPTRON_HISTORY	24

/* number of perceptrons in each per-CPU table; the branch address is
 * reduced modulo this value to pick a perceptron
 */

#define NUM_PERCEPTRONS		163

/* number of bits per weight */

#define PERCEPTRON_BITS		8

/* maximum and minimum weight values: the range of a signed
 * PERCEPTRON_BITS-bit integer (127 and -128 for 8 bits).  weights are
 * clamped to this range whenever they are trained
 */

#define MAX_WEIGHT		((1<<(PERCEPTRON_BITS-1))-1)
#define MIN_WEIGHT		(-(MAX_WEIGHT+1))

/* threshold for training: the weights are left untouched when the
 * prediction was correct and the perceptron output lies outside
 * [-THETA,THETA].  with a history length of 24 this is (int) 60.32 = 60
 */

#define THETA			((int) (1.93 * PERCEPTRON_HISTORY + 14))

/* size of buffer for keeping 'perceptron_state' for update; the buffer
 * is used circularly (the index wraps back to 0 at this value)
 */

#define NUM_UPDATE_ENTRIES	100
 84 | 
 85 | /* perceptron data structure */
 86 | 
 87 | typedef struct {
 88 | 	int	
 89 | 		/* just a vector of integers */
 90 | 
 91 | 		weights[PERCEPTRON_HISTORY+1];
 92 | } perceptron;
 93 | 
 94 | /* 'perceptron_state' - stores the branch prediction and keeps information
 95 |  * such as output and history needed for updating the perceptron predictor
 96 |  */
 97 | typedef struct {
 98 | 	char	
 99 | 		/* this char emulates a pattern history	table entry
100 | 		 * with a value of 0 for "predict not taken" or 3 for 
101 | 		 * "predict taken," so a perceptron_state pointer can 
102 | 		 * be passed around SimpleScalar's branch prediction 
103 | 		 * infrastructure without changing too much stuff.
104 | 		 */
105 | 		dummy_counter;
106 | 
107 | 	int
108 | 		/* prediction: 1 for taken, 0 for not taken */
109 | 
110 | 		prediction,
111 | 
112 | 		/* perceptron output */
113 | 
114 | 		output;
115 | 
116 | 	unsigned long long int 
117 | 		/* value of the history register yielding this prediction */
118 | 
119 | 		history;
120 | 
121 | 	perceptron
122 | 		/* pointer to the perceptron yielding this prediction */
123 | 
124 | 		*perc;
125 | } perceptron_state;
126 | 
127 | perceptron 
128 | 	/* table of perceptrons */
129 | 
130 | 	perceptrons[NUM_CPUS][NUM_PERCEPTRONS];
131 | 
132 | perceptron_state 
133 | 	/* state for updating perceptron predictor */
134 | 
135 | 	perceptron_state_buf[NUM_CPUS][NUM_UPDATE_ENTRIES];
136 | 
137 | int 
138 | 	/* index of the next "free" perceptron_state */
139 | 
140 | 	perceptron_state_buf_ctr[NUM_CPUS];
141 | 
142 | unsigned long long int
143 | 
144 | 	/* speculative global history - updated by predictor */
145 | 
146 | 	spec_global_history[NUM_CPUS],
147 | 
148 | 	/* real global history - updated when the predictor is updated */
149 | 
150 | 	global_history[NUM_CPUS];
151 | 
152 | perceptron_state *u[NUM_CPUS];
153 | 
154 | /* initialize a single perceptron */
155 | void initialize_perceptron (perceptron *p) {
156 |     int	i;
157 | 
158 |     for (i=0; i<=PERCEPTRON_HISTORY; i++) p->weights[i] = 0;
159 | }
160 | 
161 | void O3_CPU::initialize_branch_predictor()
162 | {
163 |     spec_global_history[cpu] = 0;
164 |     global_history[cpu] = 0;
165 |     perceptron_state_buf_ctr[cpu] = 0;
166 |     for (int i=0; i<NUM_PERCEPTRONS; i++)
167 |         initialize_perceptron (&perceptrons[cpu][i]);
168 | }
169 | 
170 | uint8_t O3_CPU::predict_branch(uint64_t ip)
171 | {
172 |     uint64_t address = ip;
173 | 
174 |     int	
175 |         index,
176 |         i,
177 |         output,
178 |         *w;
179 |     unsigned long long int 
180 |         mask;
181 |     perceptron 
182 |         *p;
183 | 
184 |     /* get a pointer to the next "free" perceptron_state,
185 |      * bumping up the pointer (and possibly letting it wrap around) 
186 |      */
187 | 
188 |     u[cpu] = &perceptron_state_buf[cpu][perceptron_state_buf_ctr[cpu]++];
189 |     if (perceptron_state_buf_ctr[cpu] >= NUM_UPDATE_ENTRIES)
190 |         perceptron_state_buf_ctr[cpu] = 0;
191 | 
192 |     /* hash the address to get an index into the table of perceptrons */
193 | 
194 |     index = address % NUM_PERCEPTRONS;
195 | 
196 |     /* get pointers to that perceptron and its weights */
197 | 
198 |     p = &perceptrons[cpu][index];
199 |     w = &p->weights[0];
200 | 
201 |     /* initialize the output to the bias weight, and bump the pointer
202 |      * to the weights
203 |      */
204 | 
205 |     output = *w++;
206 | 
207 |     /* find the (rest of the) dot product of the history register
208 |      * and the perceptron weights.  note that, instead of actually
209 |      * doing the expensive multiplies, we simply add a weight when the
210 |      * corresponding branch in the history register is taken, or
211 |      * subtract a weight when the branch is not taken.  this also lets
212 |      * us use binary instead of bipolar logic to represent the history
213 |      * register
214 |      */
215 |     for (mask=1,i=0; i<PERCEPTRON_HISTORY; i++,mask<<=1,w++) {
216 |         if (spec_global_history[cpu] & mask)
217 |             output += *w;
218 |         else
219 |             output += -*w;
220 |     }
221 | 
222 |     /* record the various values needed to update the predictor */
223 | 
224 |     u[cpu]->output = output;
225 |     u[cpu]->perc = p;
226 |     u[cpu]->history = spec_global_history[cpu];
227 |     u[cpu]->prediction = output >= 0;
228 |     u[cpu]->dummy_counter = u[cpu]->prediction ? 3 : 0;
229 | 
230 |     /* update the speculative global history register */
231 | 
232 |     spec_global_history[cpu] <<= 1;
233 |     spec_global_history[cpu] |= u[cpu]->prediction;
234 |     return u[cpu]->prediction;
235 | }
236 | 
237 | void O3_CPU::last_branch_result(uint64_t ip, uint8_t taken)
238 | {
239 |     int	
240 |         i,
241 |         y, 
242 |         *w;
243 | 
244 |     unsigned long long int
245 |         mask, 
246 |         history;
247 | 
248 |     /* update the real global history shift register */
249 | 
250 |     global_history[cpu] <<= 1;
251 |     global_history[cpu] |= taken;
252 | 
253 |     /* if this branch was mispredicted, restore the speculative
254 |      * history to the last known real history
255 |      */
256 | 
257 |     if (u[cpu]->prediction != taken) spec_global_history[cpu] = global_history[cpu];
258 | 
259 |     /* if the output of the perceptron predictor is outside of
260 |      * the range [-THETA,THETA] *and* the prediction was correct,
261 |      * then we don't need to adjust the weights
262 |      */
263 | 
264 |     if (u[cpu]->output > THETA)
265 |         y = 1;
266 |     else if (u[cpu]->output < -THETA)
267 |         y = 0;
268 |     else
269 |         y = 2;
270 |     if (y == 1 && taken) return;
271 |     if (y == 0 && !taken) return;
272 | 
273 |     /* w is a pointer to the first weight (the bias weight) */
274 | 
275 |     w = &u[cpu]->perc->weights[0];
276 | 
277 |     /* if the branch was taken, increment the bias weight,
278 |      * else decrement it, with saturating arithmetic
279 |      */
280 | 
281 |     if (taken)
282 |         (*w)++;
283 |     else
284 |         (*w)--;
285 |     if (*w > MAX_WEIGHT) *w = MAX_WEIGHT;
286 |     if (*w < MIN_WEIGHT) *w = MIN_WEIGHT;
287 | 
288 |     /* now w points to the next weight */
289 | 
290 |     w++;
291 | 
292 |     /* get the history that led to this prediction */
293 | 
294 |     history = u[cpu]->history;
295 | 
296 |     /* for each weight and corresponding bit in the history register... */
297 | 
298 |     for (mask=1,i=0; i<PERCEPTRON_HISTORY; i++,mask<<=1,w++) {
299 | 
300 |         /* if the i'th bit in the history positively correlates
301 |          * with this branch outcome, increment the corresponding 
302 |          * weight, else decrement it, with saturating arithmetic
303 |          */
304 | 
305 |         if (!!(history & mask) == taken) { // a common trick to conver to boolean => !!x is 1 iff x is not zero, in this case history is positively correlated with branch outcome
306 |             (*w)++;
307 |             if (*w > MAX_WEIGHT) *w = MAX_WEIGHT;
308 |         } else {
309 |             (*w)--;
310 |             if (*w < MIN_WEIGHT) *w = MIN_WEIGHT;
311 |         }
312 |     }
313 | }
314 | 


--------------------------------------------------------------------------------
/build_champsim.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | if [ "$#" -ne 7 ]; then
  4 |     echo "Illegal number of parameters"
  5 |     echo "Usage: ./build_champsim.sh [branch_pred] [l1d_pref] [l2c_pref] [llc_pref] [llc_repl] [num_core]"
  6 |     exit 1
  7 | fi
  8 | 
  9 | # ChampSim configuration
 10 | BRANCH=$1           # branch/*.bpred
 11 | L1I_PREFETCHER=$2   # prefetcher/*.l1i_pref
 12 | L1D_PREFETCHER=$3   # prefetcher/*.l1d_pref
 13 | L2C_PREFETCHER=$4   # prefetcher/*.l2c_pref
 14 | LLC_PREFETCHER=$5   # prefetcher/*.llc_pref
 15 | LLC_REPLACEMENT=$6  # replacement/*.llc_repl
 16 | NUM_CORE=$7         # tested up to 8-core system
 17 | 
 18 | ############## Some useful macros ###############
 19 | BOLD=$(tput bold)
 20 | NORMAL=$(tput sgr0)
 21 | #################################################
 22 | 
 23 | # Sanity check
 24 | if [ ! -f ./branch/${BRANCH}.bpred ]; then
 25 |     echo "[ERROR] Cannot find branch predictor"
 26 | 	echo "[ERROR] Possible branch predictors from branch/*.bpred "
 27 |     find branch -name "*.bpred"
 28 |     exit 1
 29 | fi
 30 | 
 31 | if [ ! -f ./prefetcher/${L1I_PREFETCHER}.l1i_pref ]; then
 32 |     echo "[ERROR] Cannot find L1I prefetcher"
 33 | 	echo "[ERROR] Possible L1I prefetchers from prefetcher/*.l1i_pref "
 34 |     find prefetcher -name "*.l1i_pref"
 35 |     exit 1
 36 | fi
 37 | 
 38 | if [ ! -f ./prefetcher/${L1D_PREFETCHER}.l1d_pref ]; then
 39 |     echo "[ERROR] Cannot find L1D prefetcher"
 40 | 	echo "[ERROR] Possible L1D prefetchers from prefetcher/*.l1d_pref "
 41 |     find prefetcher -name "*.l1d_pref"
 42 |     exit 1
 43 | fi
 44 | 
 45 | if [ ! -f ./prefetcher/${L2C_PREFETCHER}.l2c_pref ]; then
 46 |     echo "[ERROR] Cannot find L2C prefetcher"
 47 | 	echo "[ERROR] Possible L2C prefetchers from prefetcher/*.l2c_pref "
 48 |     find prefetcher -name "*.l2c_pref"
 49 |     exit 1
 50 | fi
 51 | 
 52 | if [ ! -f ./prefetcher/${LLC_PREFETCHER}.llc_pref ]; then
 53 |     echo "[ERROR] Cannot find LLC prefetcher"
 54 | 	echo "[ERROR] Possible LLC prefetchers from prefetcher/*.llc_pref "
 55 |     find prefetcher -name "*.llc_pref"
 56 |     exit 1
 57 | fi
 58 | 
 59 | if [ ! -f ./replacement/${LLC_REPLACEMENT}.llc_repl ]; then
 60 |     echo "[ERROR] Cannot find LLC replacement policy"
 61 | 	echo "[ERROR] Possible LLC replacement policy from replacement/*.llc_repl"
 62 |     find replacement -name "*.llc_repl"
 63 |     exit 1
 64 | fi
 65 | 
 66 | # Check num_core
 67 | re='^[0-9]+$'
 68 | if ! [[ $NUM_CORE =~ $re ]] ; then
 69 |     echo "[ERROR]: num_core is NOT a number" >&2;
 70 |     exit 1
 71 | fi
 72 | 
 73 | # Check for multi-core
 74 | if [ "$NUM_CORE" -gt "1" ]; then
 75 |     echo "Building multi-core ChampSim..."
 76 |     sed -i.bak 's/\<NUM_CPUS 1\>/NUM_CPUS '${NUM_CORE}'/g' inc/champsim.h
 77 | #	sed -i.bak 's/\<DRAM_CHANNELS 1\>/DRAM_CHANNELS 2/g' inc/champsim.h
 78 | #	sed -i.bak 's/\<DRAM_CHANNELS_LOG2 0\>/DRAM_CHANNELS_LOG2 1/g' inc/champsim.h
 79 | else
 80 |     if [ "$NUM_CORE" -lt "1" ]; then
 81 |         echo "Number of core: $NUM_CORE must be greater or equal than 1"
 82 |         exit 1
 83 |     else
 84 |         echo "Building single-core ChampSim..."
 85 |     fi
 86 | fi
 87 | echo
 88 | 
 89 | # Change prefetchers and replacement policy
 90 | cp branch/${BRANCH}.bpred branch/branch_predictor.cc
 91 | cp prefetcher/${L1I_PREFETCHER}.l1i_pref prefetcher/l1i_prefetcher.cc
 92 | cp prefetcher/${L1D_PREFETCHER}.l1d_pref prefetcher/l1d_prefetcher.cc
 93 | cp prefetcher/${L2C_PREFETCHER}.l2c_pref prefetcher/l2c_prefetcher.cc
 94 | cp prefetcher/${LLC_PREFETCHER}.llc_pref prefetcher/llc_prefetcher.cc
 95 | cp replacement/${LLC_REPLACEMENT}.llc_repl replacement/llc_replacement.cc
 96 | 
 97 | # Build
 98 | mkdir -p bin
 99 | rm -f bin/champsim
100 | make clean
101 | make
102 | 
103 | # Sanity check
104 | echo ""
105 | if [ ! -f bin/champsim ]; then
106 |     echo "${BOLD}ChampSim build FAILED!"
107 |     echo ""
108 |     exit 1
109 | fi
110 | 
111 | echo "${BOLD}ChampSim is successfully built"
112 | echo "Branch Predictor: ${BRANCH}"
113 | echo "L1I Prefetcher: ${L1I_PREFETCHER}"
114 | echo "L1D Prefetcher: ${L1D_PREFETCHER}"
115 | echo "L2C Prefetcher: ${L2C_PREFETCHER}"
116 | echo "LLC Prefetcher: ${LLC_PREFETCHER}"
117 | echo "LLC Replacement: ${LLC_REPLACEMENT}"
118 | echo "Cores: ${NUM_CORE}"
119 | BINARY_NAME="${BRANCH}-${L1I_PREFETCHER}-${L1D_PREFETCHER}-${L2C_PREFETCHER}-${LLC_PREFETCHER}-${LLC_REPLACEMENT}-${NUM_CORE}core"
120 | echo "Binary: bin/${BINARY_NAME}"
121 | echo ""
122 | mv bin/champsim bin/${BINARY_NAME}
123 | 
124 | 
125 | # Restore to the default configuration
126 | sed -i.bak 's/\<NUM_CPUS '${NUM_CORE}'\>/NUM_CPUS 1/g' inc/champsim.h
127 | #sed -i.bak 's/\<DRAM_CHANNELS 2\>/DRAM_CHANNELS 1/g' inc/champsim.h
128 | #sed -i.bak 's/\<DRAM_CHANNELS_LOG2 1\>/DRAM_CHANNELS_LOG2 0/g' inc/champsim.h
129 | 
130 | cp branch/bimodal.bpred branch/branch_predictor.cc
131 | cp prefetcher/no.l1i_pref prefetcher/l1i_prefetcher.cc
132 | cp prefetcher/no.l1d_pref prefetcher/l1d_prefetcher.cc
133 | cp prefetcher/no.l2c_pref prefetcher/l2c_prefetcher.cc
134 | cp prefetcher/no.llc_pref prefetcher/llc_prefetcher.cc
135 | cp replacement/lru.llc_repl replacement/llc_replacement.cc
136 | 


--------------------------------------------------------------------------------
/cvp_tracer/README.md:
--------------------------------------------------------------------------------
 1 | The cvp2champsim tracer comes as is with no guarantee that it covers every conversion case.
 2 | 
 3 | The tracer is used to convert the traces from the 2nd Championship Value 
 4 | Prediction (CVP) to a ChampSim-friendly format. 
 5 | 
 6 | CVP-1 Site: https://www.microarch.org/cvp1/
 7 | CVP-2 Site: https://www.microarch.org/cvp1/cvp2/rules.html
 8 | 
 9 | To use the tracer first compile it using g++:
10 | 
11 | g++ cvp2champsim.cc -o cvp_tracer
12 | 
13 | To convert a trace execute:
14 | 
15 | ./cvp_tracer TRACE_NAME.gz
16 | 
17 | The ChampSim trace will be sent to standard output so to keep and compress the 
18 | output trace run:
19 | 
20 | ./cvp_tracer TRACE_NAME.gz | gzip > NEW_TRACE.champsim.gz
21 | 
Adding the "-v" flag additionally prints the disassembly of the CVP trace to
standard error; the ChampSim-format trace is still written to standard output.
24 | 


--------------------------------------------------------------------------------
/download.sh:
--------------------------------------------------------------------------------
 1 | while read -r line; do
 2 |     arr=($line)
 3 |     mkdir -p $(dirname ${arr[0]})
 4 | done < download_links
 5 | 
 6 | while read -r line; do
 7 |     arr=($line)
 8 |     echo Downloading ${arr[0]} from ${arr[1]}
 9 |     if ! [[ -e "${arr[0]}" ]]; then
10 |         curl -L -o ${arr[0]} ${arr[1]}
11 |         echo Downloading ${arr[0]} Done
12 |     else
13 |         echo ${arr[0]} File already exists
14 |     fi
15 | done < download_links
16 | 


--------------------------------------------------------------------------------
/get_stats.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import argparse
 4 | 
 5 | 
 6 | def get_args():
 7 |     parser = argparse.ArgumentParser()
 8 |     parser.add_argument('results_file', help='Path to ChampSim results file')
 9 |     parser.add_argument('--cache-level', default='LLC', choices=('L2', 'LLC'), help='Cache level to compute stats for (default: %(default)s)')
10 |     parser.add_argument('--base', default=None, help='Path to ChampSim base settings results file with no prefetcher for more accurate statistics')
11 | 
12 |     return parser.parse_args()
13 | 
14 | 
def read_file(path, cache_level):
    """Extract the statistics of one cache level from a ChampSim results file.

    Args:
        path: Results file to read, or None.
        cache_level: Text a line must contain to be treated as a statistics
            line of the wanted cache (e.g. 'LLC').

    Returns:
        None when ``path`` is None or when any expected statistic was not
        found; otherwise a dict with the keys 'ipc', 'kilo_inst',
        'load_miss', 'rfo_miss', 'total_miss', 'useful' and 'useless'.
        When a statistic occurs several times the last occurrence wins.
    """
    if path is None:
        return None

    required = ('ipc', 'total_miss', 'useful', 'useless', 'load_miss', 'rfo_miss', 'kilo_inst')
    stats = {}

    with open(path, 'r') as handle:
        for raw in handle:
            # Summary line: instruction count is field 4, IPC is field 9.
            if 'Finished CPU' in raw:
                fields = raw.split()
                stats['ipc'] = float(fields[9])
                stats['kilo_inst'] = int(fields[4]) / 1000

            if cache_level not in raw:
                continue

            text = raw.strip()
            fields = text.split()
            if 'LOAD' in text:
                stats['load_miss'] = int(fields[-1])
            elif 'RFO' in text:
                stats['rfo_miss'] = int(fields[-1])
            elif 'TOTAL' in text:
                stats['total_miss'] = int(fields[-1])
            elif 'USEFUL' in text:
                # "... USEFUL: <n> USELESS: <m>"
                stats['useful'] = int(fields[-3])
                stats['useless'] = int(fields[-1])

    if any(key not in stats for key in required):
        return None

    return stats
43 | 
def main(args=None):
    """Print prefetcher accuracy, coverage, MPKI and IPC for one results file.

    Args:
        args: Parsed arguments as returned by ``get_args()`` (needs
            ``results_file``, ``cache_level`` and ``base``).  When None,
            the command line is parsed.

    Raises:
        SystemExit: if the statistics of the requested cache level cannot
            be found in the results file.
    """
    import sys

    if args is None:
        args = get_args()

    print(args)
    results = read_file(args.results_file, args.cache_level)
    if results is None:
        # read_file() returns None when an expected statistic is missing.
        raise SystemExit(
            'Could not find all {} statistics in {}'.format(args.cache_level, args.results_file)
        )

    useful, useless, ipc, load_miss, rfo_miss, kilo_inst = (
        results['useful'], results['useless'], results['ipc'], results['load_miss'], results['rfo_miss'], results['kilo_inst']
    )
    demand_miss = load_miss + rfo_miss
    # Misses that would have happened without prefetching: the remaining
    # demand misses plus those removed by useful prefetches.
    total_miss = demand_miss + useful

    results_mpki = demand_miss / kilo_inst if kilo_inst else None

    base = read_file(args.base, args.cache_level)
    base_mpki = None
    base_ipc = None
    if base is not None:
        base_ipc = base['ipc']
        # The baseline MPKI is relative to the baseline run's own
        # instruction count.
        if base['kilo_inst']:
            base_mpki = base['total_miss'] / base['kilo_inst']
    elif args.base is not None:
        print('Warning: could not read baseline statistics from', args.base, file=sys.stderr)

    if useful + useless == 0:
        print('Accuracy: N/A [All prefetches were merged and were not useful or useless]')
    else:
        print('Accuracy:', useful / (useful + useless) * 100, '%')
    if total_miss == 0:
        print('Coverage: N/A [No misses. Did you run this simulation for long enough?]')
    else:
        print('Coverage:', useful / total_miss * 100, '%')
    if results_mpki is None:
        print('MPKI: N/A [No instructions were simulated]')
    else:
        print('MPKI:', results_mpki)
    if base is not None:
        if results_mpki is None or not base_mpki:
            print('MPKI Improvement: N/A [Baseline MPKI is zero or unavailable]')
        else:
            print('MPKI Improvement:', (base_mpki - results_mpki) / base_mpki * 100, '%')
    print('IPC:', ipc)
    if base is not None:
        if not base_ipc:
            print('IPC Improvement: N/A [Baseline IPC is zero]')
        else:
            print('IPC Improvement:', (ipc - base_ipc) / base_ipc * 100, '%')

if __name__ == '__main__':
    main(args=get_args())
77 | 


--------------------------------------------------------------------------------
/inc/block.h:
--------------------------------------------------------------------------------
  1 | #ifndef BLOCK_H
  2 | #define BLOCK_H
  3 | 
  4 | #include "champsim.h"
  5 | #include "instruction.h"
  6 | #include "set.h"
  7 | 
  8 | // CACHE BLOCK
  9 | class BLOCK {
 10 |   public:
 11 |     uint8_t valid,
 12 |             prefetch,
 13 |             dirty,
 14 |             used;
 15 | 
 16 |     int delta,
 17 |         depth,
 18 |         signature,
 19 |         confidence;
 20 | 
 21 |     uint64_t address,
 22 |              full_addr,
 23 |              tag,
 24 |              data,
 25 |              ip,
 26 |              cpu,
 27 |              instr_id;
 28 | 
 29 |     // replacement state
 30 |     uint32_t lru;
 31 | 
 32 |     BLOCK() {
 33 |         valid = 0;
 34 |         prefetch = 0;
 35 |         dirty = 0;
 36 |         used = 0;
 37 | 
 38 |         delta = 0;
 39 |         depth = 0;
 40 |         signature = 0;
 41 |         confidence = 0;
 42 | 
 43 |         address = 0;
 44 |         full_addr = 0;
 45 |         tag = 0;
 46 |         data = 0;
 47 |         cpu = 0;
 48 |         instr_id = 0;
 49 | 
 50 |         lru = 0;
 51 |     };
 52 | };
 53 | 
 54 | // DRAM CACHE BLOCK
 55 | class DRAM_ARRAY {
 56 |   public:
 57 |     BLOCK **block;
 58 | 
 59 |     DRAM_ARRAY() {
 60 |         block = NULL;
 61 |     };
 62 | };
 63 | 
 64 | // message packet
 65 | class PACKET {
 66 |   public:
 67 |     uint8_t instruction, 
 68 |             is_data,
 69 |             fill_l1i,
 70 |             fill_l1d,
 71 |             tlb_access,
 72 |             scheduled,
 73 |             translated,
 74 |             fetched,
 75 |             prefetched,
 76 |             drc_tag_read;
 77 | 
 78 |     int fill_level, 
 79 |         pf_origin_level,
 80 |         rob_signal, 
 81 |         rob_index, 
 82 |         producer,
 83 |         delta,
 84 |         depth,
 85 |         signature,
 86 |         confidence;
 87 | 
 88 |     uint32_t pf_metadata;
 89 | 
 90 |     uint8_t  is_producer, 
 91 |              //rob_index_depend_on_me[ROB_SIZE], 
 92 |              //lq_index_depend_on_me[ROB_SIZE], 
 93 |              //sq_index_depend_on_me[ROB_SIZE], 
 94 |              instr_merged,
 95 |              load_merged, 
 96 |              store_merged,
 97 |              returned,
 98 |              asid[2],
 99 |              type;
100 | 
101 |     fastset
102 |              rob_index_depend_on_me, 
103 |              lq_index_depend_on_me, 
104 |              sq_index_depend_on_me;
105 | 
106 |     uint32_t cpu, data_index, lq_index, sq_index;
107 | 
108 |     uint64_t address, 
109 |              full_addr, 
110 |              instruction_pa,
111 |              data_pa,
112 |              data,
113 |              instr_id,
114 |              ip, 
115 |              event_cycle,
116 |              cycle_enqueued;
117 | 
118 |     PACKET() {
119 |         instruction = 0;
120 |         is_data = 1;
121 | 	fill_l1i = 0;
122 | 	fill_l1d = 0;
123 |         tlb_access = 0;
124 |         scheduled = 0;
125 |         translated = 0;
126 |         fetched = 0;
127 |         prefetched = 0;
128 |         drc_tag_read = 0;
129 | 
130 |         returned = 0;
131 |         asid[0] = UINT8_MAX;
132 |         asid[1] = UINT8_MAX;
133 |         type = 0;
134 | 
135 |         fill_level = -1; 
136 |         rob_signal = -1;
137 |         rob_index = -1;
138 |         producer = -1;
139 |         delta = 0;
140 |         depth = 0;
141 |         signature = 0;
142 |         confidence = 0;
143 | 
144 | #if 0
145 |         for (uint32_t i=0; i<ROB_SIZE; i++) {
146 |             rob_index_depend_on_me[i] = 0;
147 |             lq_index_depend_on_me[i] = 0;
148 |             sq_index_depend_on_me[i] = 0;
149 |         }
150 | #endif
151 |         is_producer = 0;
152 |         instr_merged = 0;
153 |         load_merged = 0;
154 |         store_merged = 0;
155 | 
156 |         cpu = NUM_CPUS;
157 |         data_index = 0;
158 |         lq_index = 0;
159 |         sq_index = 0;
160 | 
161 |         address = 0;
162 |         full_addr = 0;
163 |         instruction_pa = 0;
164 |         data = 0;
165 |         instr_id = 0;
166 |         ip = 0;
167 |         event_cycle = UINT64_MAX;
168 | 	cycle_enqueued = 0;
169 |     };
170 | };
171 | 
172 | // packet queue
173 | class PACKET_QUEUE {
174 |   public:
175 |     string NAME;
176 |     uint32_t SIZE;
177 | 
178 |     uint8_t  is_RQ, 
179 |              is_WQ,
180 |              write_mode;
181 | 
182 |     uint32_t cpu, 
183 |              head, 
184 |              tail, 
185 |              occupancy, 
186 |              num_returned, 
187 |              next_fill_index, 
188 |              next_schedule_index, 
189 |              next_process_index;
190 | 
191 |     uint64_t next_fill_cycle, 
192 |              next_schedule_cycle, 
193 |              next_process_cycle,
194 |              ACCESS,
195 |              FORWARD,
196 |              MERGED,
197 |              TO_CACHE,
198 |              ROW_BUFFER_HIT,
199 |              ROW_BUFFER_MISS,
200 |              FULL;
201 | 
202 |     PACKET *entry, processed_packet[2*MAX_READ_PER_CYCLE];
203 | 
204 |     // constructor
205 |     PACKET_QUEUE(string v1, uint32_t v2) : NAME(v1), SIZE(v2) {
206 |         is_RQ = 0;
207 |         is_WQ = 0;
208 |         write_mode = 0;
209 | 
210 |         cpu = 0; 
211 |         head = 0;
212 |         tail = 0;
213 |         occupancy = 0;
214 |         num_returned = 0;
215 |         next_fill_index = 0;
216 |         next_schedule_index = 0;
217 |         next_process_index = 0;
218 | 
219 |         next_fill_cycle = UINT64_MAX;
220 |         next_schedule_cycle = UINT64_MAX;
221 |         next_process_cycle = UINT64_MAX;
222 | 
223 |         ACCESS = 0;
224 |         FORWARD = 0;
225 |         MERGED = 0;
226 |         TO_CACHE = 0;
227 |         ROW_BUFFER_HIT = 0;
228 |         ROW_BUFFER_MISS = 0;
229 |         FULL = 0;
230 | 
231 |         entry = new PACKET[SIZE]; 
232 |     };
233 | 
234 |     PACKET_QUEUE() {
235 |         is_RQ = 0;
236 |         is_WQ = 0;
237 | 
238 |         cpu = 0; 
239 |         head = 0;
240 |         tail = 0;
241 |         occupancy = 0;
242 |         num_returned = 0;
243 |         next_fill_index = 0;
244 |         next_schedule_index = 0;
245 |         next_process_index = 0;
246 | 
247 |         next_fill_cycle = UINT64_MAX;
248 |         next_schedule_cycle = UINT64_MAX;
249 |         next_process_cycle = UINT64_MAX;
250 | 
251 |         ACCESS = 0;
252 |         FORWARD = 0;
253 |         MERGED = 0;
254 |         TO_CACHE = 0;
255 |         ROW_BUFFER_HIT = 0;
256 |         ROW_BUFFER_MISS = 0;
257 |         FULL = 0;
258 | 
259 |         //entry = new PACKET[SIZE]; 
260 |     };
261 | 
262 |     // destructor
263 |     ~PACKET_QUEUE() {
264 |         delete[] entry;
265 |     };
266 | 
267 |     // functions
268 |     int check_queue(PACKET* packet);
269 |     void add_queue(PACKET* packet),
270 |          remove_queue(PACKET* packet);
271 | };
272 | 
273 | // reorder buffer
274 | class CORE_BUFFER {
275 |   public:
276 |     const string NAME;
277 |     const uint32_t SIZE;
278 |     uint32_t cpu, 
279 |              head, 
280 |              tail,
281 |              occupancy,
282 |              last_read, last_fetch, last_scheduled, 
283 |              inorder_fetch[2],
284 |              next_fetch[2],
285 |              next_schedule;
286 |     uint64_t event_cycle,
287 |              fetch_event_cycle,
288 |              schedule_event_cycle,
289 |              execute_event_cycle,
290 |              lsq_event_cycle,
291 |              retire_event_cycle;
292 | 
293 |     ooo_model_instr *entry;
294 | 
295 |     // constructor
296 |     CORE_BUFFER(string v1, uint32_t v2) : NAME(v1), SIZE(v2) {
297 |         head = 0;
298 |         tail = 0;
299 |         occupancy = 0;
300 | 
301 |         last_read = SIZE-1;
302 |         last_fetch = SIZE-1;
303 |         last_scheduled = 0;
304 | 
305 |         inorder_fetch[0] = 0;
306 |         inorder_fetch[1] = 0;
307 |         next_fetch[0] = 0;
308 |         next_fetch[1] = 0;
309 |         next_schedule = 0;
310 | 
311 |         event_cycle = 0;
312 |         fetch_event_cycle = UINT64_MAX;
313 |         schedule_event_cycle = UINT64_MAX;
314 |         execute_event_cycle = UINT64_MAX;
315 |         lsq_event_cycle = UINT64_MAX;
316 |         retire_event_cycle = UINT64_MAX;
317 | 
318 |         entry = new ooo_model_instr[SIZE];
319 |     };
320 | 
321 |     // destructor
322 |     ~CORE_BUFFER() {
323 |         delete[] entry;
324 |     };
325 | };
326 | 
327 | // load/store queue 
328 | class LSQ_ENTRY {
329 |   public:
330 |     uint64_t instr_id,
331 |              producer_id,
332 |              virtual_address,
333 |              physical_address,
334 |              ip,
335 |              event_cycle;
336 | 
337 |     uint32_t rob_index, data_index, sq_index;
338 | 
339 |     uint8_t translated,
340 |             fetched,
341 |             asid[2];
342 | // forwarding_depend_on_me[ROB_SIZE];
343 |     fastset
344 | 		forwarding_depend_on_me;
345 | 
346 |     // constructor
347 |     LSQ_ENTRY() {
348 |         instr_id = 0;
349 |         producer_id = UINT64_MAX;
350 |         virtual_address = 0;
351 |         physical_address = 0;
352 |         ip = 0;
353 |         event_cycle = 0;
354 | 
355 |         rob_index = 0;
356 |         data_index = 0;
357 |         sq_index = UINT32_MAX;
358 | 
359 |         translated = 0;
360 |         fetched = 0;
361 |         asid[0] = UINT8_MAX;
362 |         asid[1] = UINT8_MAX;
363 | 
364 | #if 0
365 |         for (uint32_t i=0; i<ROB_SIZE; i++)
366 |             forwarding_depend_on_me[i] = 0;
367 | #endif
368 |     };
369 | };
370 | 
371 | class LOAD_STORE_QUEUE {
372 |   public:
373 |     const string NAME;
374 |     const uint32_t SIZE;
375 |     uint32_t occupancy, head, tail;
376 | 
377 |     LSQ_ENTRY *entry;
378 | 
379 |     // constructor
380 |     LOAD_STORE_QUEUE(string v1, uint32_t v2) : NAME(v1), SIZE(v2) {
381 |         occupancy = 0;
382 |         head = 0;
383 |         tail = 0;
384 | 
385 |         entry = new LSQ_ENTRY[SIZE];
386 |     };
387 | 
388 |     // destructor
389 |     ~LOAD_STORE_QUEUE() {
390 |         delete[] entry;
391 |     };
392 | };
393 | #endif
394 | 


--------------------------------------------------------------------------------
/inc/cache.h:
--------------------------------------------------------------------------------
  1 | #ifndef CACHE_H
  2 | #define CACHE_H
  3 | 
  4 | #include "memory_class.h"
  5 | 
// PAGE
// Latencies defined in another translation unit (only declared here).
extern uint32_t PAGE_TABLE_LATENCY, SWAP_LATENCY;

// CACHE TYPE
// Numeric identifiers for the kinds of cache/TLB in the hierarchy.
// NOTE(review): presumably the values stored in CACHE::cache_type -
// confirm against the code that assigns it.
#define IS_ITLB 0
#define IS_DTLB 1
#define IS_STLB 2
#define IS_L1I  3
#define IS_L1D  4
#define IS_L2C  5
#define IS_LLC  6

// For every structure below the macro suffixes mean:
//   _SET       number of sets
//   _WAY       number of ways
//   _RQ_SIZE   read queue entries
//   _WQ_SIZE   write queue entries
//   _PQ_SIZE   prefetch queue entries (0 for the TLBs)
//   _MSHR_SIZE MSHR entries
//   _LATENCY   latency in cycles

// INSTRUCTION TLB
#define ITLB_SET 16
#define ITLB_WAY 4
#define ITLB_RQ_SIZE 16
#define ITLB_WQ_SIZE 16
#define ITLB_PQ_SIZE 0
#define ITLB_MSHR_SIZE 8
#define ITLB_LATENCY 1

// DATA TLB
#define DTLB_SET 16
#define DTLB_WAY 4
#define DTLB_RQ_SIZE 16
#define DTLB_WQ_SIZE 16
#define DTLB_PQ_SIZE 0
#define DTLB_MSHR_SIZE 8
#define DTLB_LATENCY 1

// SECOND LEVEL TLB
#define STLB_SET 128
#define STLB_WAY 12
#define STLB_RQ_SIZE 32
#define STLB_WQ_SIZE 32
#define STLB_PQ_SIZE 0
#define STLB_MSHR_SIZE 16
#define STLB_LATENCY 8

// L1 INSTRUCTION CACHE
#define L1I_SET 64
#define L1I_WAY 8
#define L1I_RQ_SIZE 64
#define L1I_WQ_SIZE 64 
#define L1I_PQ_SIZE 32
#define L1I_MSHR_SIZE 8
#define L1I_LATENCY 4

// L1 DATA CACHE
#define L1D_SET 64
#define L1D_WAY 12
#define L1D_RQ_SIZE 64
#define L1D_WQ_SIZE 64 
#define L1D_PQ_SIZE 8
#define L1D_MSHR_SIZE 16
#define L1D_LATENCY 5 

// L2 CACHE
#define L2C_SET 1024
#define L2C_WAY 8
#define L2C_RQ_SIZE 32
#define L2C_WQ_SIZE 32
#define L2C_PQ_SIZE 16
#define L2C_MSHR_SIZE 32
#define L2C_LATENCY 10  // 4/5 (L1I or L1D) + 10 = 14/15 cycles
 71 | 
 72 | // LAST LEVEL CACHE
 73 | #define LLC_SET NUM_CPUS*2048
 74 | #define LLC_WAY 16
 75 | #define LLC_RQ_SIZE NUM_CPUS*L2C_MSHR_SIZE //48
 76 | #define LLC_WQ_SIZE NUM_CPUS*L2C_MSHR_SIZE //48
 77 | #define LLC_PQ_SIZE NUM_CPUS*32
 78 | #define LLC_MSHR_SIZE NUM_CPUS*64
 79 | #define LLC_LATENCY 20  // 4/5 (L1I or L1D) + 10 + 20 = 34/35 cycles
 80 | 
 81 | class CACHE : public MEMORY { // one cache level: geometry, request queues, hit/miss statistics, replacement and prefetcher hooks
 82 |   public:
 83 |     uint32_t cpu;
 84 |     const string NAME;
 85 |     const uint32_t NUM_SET, NUM_WAY, NUM_LINE, WQ_SIZE, RQ_SIZE, PQ_SIZE, MSHR_SIZE;
 86 |     uint32_t LATENCY;
 87 |     BLOCK **block; // NUM_SET rows of NUM_WAY blocks, allocated in the constructor
 88 |     int fill_level;
 89 |     uint32_t MAX_READ, MAX_FILL;
 90 |     uint32_t reads_available_this_cycle;
 91 |     uint8_t cache_type;
 92 | 
 93 |     // prefetch stats
 94 |     uint64_t pf_requested,
 95 |              pf_issued,
 96 |              pf_useful,
 97 |              pf_useless,
 98 |              pf_fill;
 99 | 
100 |     // queues (constructed from NAME and the *_SIZE members, which are declared above them)
101 |     PACKET_QUEUE WQ{NAME + "_WQ", WQ_SIZE}, // write queue
102 |                  RQ{NAME + "_RQ", RQ_SIZE}, // read queue
103 |                  PQ{NAME + "_PQ", PQ_SIZE}, // prefetch queue
104 |                  MSHR{NAME + "_MSHR", MSHR_SIZE}, // MSHR
105 |                  PROCESSED{NAME + "_PROCESSED", ROB_SIZE}; // processed queue
106 | 
107 |     uint64_t sim_access[NUM_CPUS][NUM_TYPES], // counters indexed by [cpu][access type]
108 |              sim_hit[NUM_CPUS][NUM_TYPES],
109 |              sim_miss[NUM_CPUS][NUM_TYPES],
110 |              roi_access[NUM_CPUS][NUM_TYPES],
111 |              roi_hit[NUM_CPUS][NUM_TYPES],
112 |              roi_miss[NUM_CPUS][NUM_TYPES];
113 | 
114 |     uint64_t total_miss_latency;
115 | 
116 |     // constructor: v1 = name, v2 = sets, v3 = ways, v4 = lines, v5..v8 = WQ/RQ/PQ/MSHR sizes
117 |     CACHE(string v1, uint32_t v2, int v3, uint32_t v4, uint32_t v5, uint32_t v6, uint32_t v7, uint32_t v8)
118 |         : NAME(v1), NUM_SET(v2), NUM_WAY(v3), NUM_LINE(v4), WQ_SIZE(v5), RQ_SIZE(v6), PQ_SIZE(v7), MSHR_SIZE(v8) {
119 |         // members outside the initializer list all start from a defined value (cpu, cache_type, reads_available_this_cycle and the L1I hooks used to be left indeterminate)
120 |         LATENCY = 0; cpu = 0; cache_type = 0; reads_available_this_cycle = 0;
121 |         l1i_prefetcher_cache_operate = NULL; l1i_prefetcher_cache_fill = NULL;
122 |         // cache block
123 |         block = new BLOCK* [NUM_SET];
124 |         for (uint32_t i=0; i<NUM_SET; i++) {
125 |             block[i] = new BLOCK[NUM_WAY];
126 | 
127 |             for (uint32_t j=0; j<NUM_WAY; j++) {
128 |                 block[i][j].lru = j; // each way starts with a distinct LRU position
129 |             }
130 |         }
131 | 
132 |         for (uint32_t i=0; i<NUM_CPUS; i++) {
133 |             upper_level_icache[i] = NULL;
134 |             upper_level_dcache[i] = NULL;
135 | 
136 |             for (uint32_t j=0; j<NUM_TYPES; j++) {
137 |                 sim_access[i][j] = 0;
138 |                 sim_hit[i][j] = 0;
139 |                 sim_miss[i][j] = 0;
140 |                 roi_access[i][j] = 0;
141 |                 roi_hit[i][j] = 0;
142 |                 roi_miss[i][j] = 0;
143 |             }
144 |         }
145 | 
146 |         total_miss_latency = 0;
147 | 
148 |         lower_level = NULL;
149 |         extra_interface = NULL;
150 |         fill_level = -1;
151 |         MAX_READ = 1;
152 |         MAX_FILL = 1;
153 | 
154 |         pf_requested = 0;
155 |         pf_issued = 0;
156 |         pf_useful = 0;
157 |         pf_useless = 0;
158 |         pf_fill = 0;
159 |     };
160 | 
161 |     // destructor: releases the block array allocated by the constructor
162 |     ~CACHE() { // NOTE(review): no copy constructor/assignment is declared, so copying a CACHE would delete block twice
163 |         for (uint32_t i=0; i<NUM_SET; i++)
164 |             delete[] block[i];
165 |         delete[] block;
166 |     };
167 | 
168 |     // functions
169 |     int  add_rq(PACKET *packet),
170 |          add_wq(PACKET *packet),
171 |          add_pq(PACKET *packet);
172 | 
173 |     void return_data(PACKET *packet),
174 |          operate(),
175 |          increment_WQ_FULL(uint64_t address);
176 | 
177 |     uint32_t get_occupancy(uint8_t queue_type, uint64_t address),
178 |              get_size(uint8_t queue_type, uint64_t address);
179 | 
180 |     int  check_hit(PACKET *packet),
181 |          invalidate_entry(uint64_t inval_addr),
182 |          check_mshr(PACKET *packet),
183 |          prefetch_line(uint64_t ip, uint64_t base_addr, uint64_t pf_addr, int prefetch_fill_level, uint32_t prefetch_metadata),
184 |          kpc_prefetch_line(uint64_t base_addr, uint64_t pf_addr, int prefetch_fill_level, int delta, int depth, int signature, int confidence, uint32_t prefetch_metadata);
185 | 
186 |     void handle_fill(),
187 |          handle_writeback(),
188 |          handle_read(),
189 |          handle_prefetch();
190 | 
191 |     void add_mshr(PACKET *packet),
192 |          update_fill_cycle(),
193 |          llc_initialize_replacement(),
194 |          update_replacement_state(uint32_t cpu, uint32_t set, uint32_t way, uint64_t full_addr, uint64_t ip, uint64_t victim_addr, uint32_t type, uint8_t hit),
195 |          llc_update_replacement_state(uint32_t cpu, uint32_t set, uint32_t way, uint64_t full_addr, uint64_t ip, uint64_t victim_addr, uint32_t type, uint8_t hit),
196 |          lru_update(uint32_t set, uint32_t way),
197 |          fill_cache(uint32_t set, uint32_t way, PACKET *packet),
198 |          replacement_final_stats(),
199 |          llc_replacement_final_stats(),
200 |          //prefetcher_initialize(),
201 |          l1d_prefetcher_initialize(),
202 |          l2c_prefetcher_initialize(),
203 |          llc_prefetcher_initialize(),
204 |          prefetcher_operate(uint64_t addr, uint64_t ip, uint8_t cache_hit, uint8_t type),
205 |          l1d_prefetcher_operate(uint64_t addr, uint64_t ip, uint8_t cache_hit, uint8_t type),
206 |          prefetcher_cache_fill(uint64_t addr, uint32_t set, uint32_t way, uint8_t prefetch, uint64_t evicted_addr),
207 |          l1d_prefetcher_cache_fill(uint64_t addr, uint32_t set, uint32_t way, uint8_t prefetch, uint64_t evicted_addr, uint32_t metadata_in),
208 |          //prefetcher_final_stats(),
209 |          l1d_prefetcher_final_stats(),
210 |          l2c_prefetcher_final_stats(),
211 |          llc_prefetcher_final_stats();
212 |     void (*l1i_prefetcher_cache_operate)(uint32_t, uint64_t, uint8_t, uint8_t); // L1I hooks are function pointers; NULL until assigned
213 |     void (*l1i_prefetcher_cache_fill)(uint32_t, uint64_t, uint32_t, uint32_t, uint8_t, uint64_t);
214 | 
215 |     uint32_t l2c_prefetcher_operate(uint64_t addr, uint64_t ip, uint8_t cache_hit, uint8_t type, uint32_t metadata_in),
216 |          llc_prefetcher_operate(uint64_t addr, uint64_t ip, uint8_t cache_hit, uint8_t type, uint32_t metadata_in, uint64_t instr_id, uint64_t curr_cycle),
217 |          l2c_prefetcher_cache_fill(uint64_t addr, uint32_t set, uint32_t way, uint8_t prefetch, uint64_t evicted_addr, uint32_t metadata_in),
218 |          llc_prefetcher_cache_fill(uint64_t addr, uint32_t set, uint32_t way, uint8_t prefetch, uint64_t evicted_addr, uint32_t metadata_in);
219 | 
220 |     uint32_t get_set(uint64_t address),
221 |              get_way(uint64_t address, uint32_t set),
222 |              find_victim(uint32_t cpu, uint64_t instr_id, uint32_t set, const BLOCK *current_set, uint64_t ip, uint64_t full_addr, uint32_t type),
223 |              llc_find_victim(uint32_t cpu, uint64_t instr_id, uint32_t set, const BLOCK *current_set, uint64_t ip, uint64_t full_addr, uint32_t type),
224 |              lru_victim(uint32_t cpu, uint64_t instr_id, uint32_t set, const BLOCK *current_set, uint64_t ip, uint64_t full_addr, uint32_t type);
225 | };
226 | 
227 | #endif
228 | 


--------------------------------------------------------------------------------
/inc/champsim.h:
--------------------------------------------------------------------------------
  1 | #ifndef CHAMPSIM_H
  2 | #define CHAMPSIM_H
  3 | 
  4 | #include <stdio.h>
  5 | #include <stdint.h>
  6 | #include <stdlib.h>
  7 | #include <unistd.h>
  8 | #include <string.h>
  9 | #include <limits.h>
 10 | #include <time.h>
 11 | #include <assert.h>
 12 | #include <signal.h>
 13 | #include <sys/types.h>
 14 | 
 15 | #include <iostream>
 16 | #include <queue>
 17 | #include <map>
 18 | #include <random>
 19 | #include <string>
 20 | #include <iomanip>
 21 | 
 22 | // USEFUL MACROS
 23 | //#define DEBUG_PRINT
 24 | #define SANITY_CHECK
 25 | #define LLC_BYPASS
 26 | #define DRC_BYPASS
 27 | #define NO_CRC2_COMPILE
 28 | 
 29 | #ifdef DEBUG_PRINT
 30 | #define DP(x) x
 31 | #else
 32 | #define DP(x)
 33 | #endif
 34 | 
 35 | // CPU
 36 | #define NUM_CPUS 1
 37 | #define CPU_FREQ 4000
 38 | #define DRAM_IO_FREQ 3200
 39 | #define PAGE_SIZE 4096
 40 | #define LOG2_PAGE_SIZE 12
 41 | 
 42 | // CACHE
 43 | #define BLOCK_SIZE 64
 44 | #define LOG2_BLOCK_SIZE 6
 45 | #define MAX_READ_PER_CYCLE 8
 46 | #define MAX_FILL_PER_CYCLE 1
 47 | 
 48 | #define INFLIGHT 1
 49 | #define COMPLETED 2
 50 | 
 51 | #define FILL_L1    1
 52 | #define FILL_L2    2
 53 | #define FILL_LLC   4
 54 | #define FILL_DRC   8
 55 | #define FILL_DRAM 16
 56 | 
 57 | // DRAM geometry; with the values below: 8KB row * 64K rows * 8 banks * 1 rank * 1 channel = 4GB
 58 | #define DRAM_CHANNELS 1      // default: assuming one DIMM per one channel 4GB * 1 => 4GB off-chip memory
 59 | #define LOG2_DRAM_CHANNELS 0
 60 | #define DRAM_RANKS 1         // 4GB * 1 rank => 4GB per DIMM
 61 | #define LOG2_DRAM_RANKS 0
 62 | #define DRAM_BANKS 8         // 512MB * 8 banks => 4GB per rank
 63 | #define LOG2_DRAM_BANKS 3
 64 | #define DRAM_ROWS 65536      // 8KB * 64K rows => 512MB per bank
 65 | #define LOG2_DRAM_ROWS 16
 66 | #define DRAM_COLUMNS 128      // 64B * 128 column chunks (Assuming 1B DRAM cell * 8 chips * 8 transactions = 64B size of column chunks) => 8KB per row
 67 | #define LOG2_DRAM_COLUMNS 7
 68 | #define DRAM_ROW_SIZE (BLOCK_SIZE*DRAM_COLUMNS/1024) // row size in KB (64*128/1024 = 8)
 69 | 
 70 | #define DRAM_SIZE (DRAM_CHANNELS*DRAM_RANKS*DRAM_BANKS*DRAM_ROWS*DRAM_ROW_SIZE/1024) // total capacity in MB (4096 with the values above)
 71 | #define DRAM_PAGES ((DRAM_SIZE<<10)>>2) // MB -> KB (<<10), then KB -> 4KB pages (>>2); the 4KB divisor is hard-coded rather than derived from PAGE_SIZE
 72 | //#define DRAM_PAGES 10
 73 | 
 74 | using namespace std;
 75 | 
 76 | extern uint8_t warmup_complete[NUM_CPUS], 
 77 |                simulation_complete[NUM_CPUS], 
 78 |                all_warmup_complete, 
 79 |                all_simulation_complete,
 80 |                MAX_INSTR_DESTINATIONS,
 81 |                knob_cloudsuite,
 82 |                knob_low_bandwidth,
 83 |                prefetch_warmup_complete;
 84 | 
 85 | extern uint64_t current_core_cycle[NUM_CPUS], 
 86 |                 stall_cycle[NUM_CPUS], 
 87 |                 last_drc_read_mode, 
 88 |                 last_drc_write_mode,
 89 |                 drc_blocks;
 90 | 
 91 | extern queue <uint64_t> page_queue;
 92 | extern map <uint64_t, uint64_t> page_table, inverse_table, recent_page, unique_cl[NUM_CPUS];
 93 | extern uint64_t previous_ppage, num_adjacent_page, num_cl[NUM_CPUS], allocated_pages, num_page[NUM_CPUS], minor_fault[NUM_CPUS], major_fault[NUM_CPUS];
 94 | 
 95 | void print_stats();
 96 | uint64_t rotl64 (uint64_t n, unsigned int c),
 97 |          rotr64 (uint64_t n, unsigned int c),
 98 |   va_to_pa(uint32_t cpu, uint64_t instr_id, uint64_t va, uint64_t unique_vpage, uint8_t is_code);
 99 | bool check_ppage(uint32_t cpu, uint64_t ppage);
100 | // log base 2 function from efectiu
101 | int lg2(int n);
102 | 
103 | // smart random number generator
104 | class RANDOM { // seedable 64-bit Mersenne Twister drawing uniformly from [0, 0xFFFFFFFFF]
105 |   public:
106 |     std::random_device rd;
107 |     std::mt19937_64 engine{rd()}; // provisional seed from rd; the constructor reseeds with the caller's value
108 |     std::uniform_int_distribution<uint64_t> dist{0, 0xFFFFFFFFF}; // used to generate random physical page numbers
109 | 
110 |     RANDOM (uint64_t seed) { engine.seed(seed); }
111 | 
112 |     uint64_t draw_rand() {
113 |         // advance the engine and map its output onto dist's range
114 |         const uint64_t value = dist(engine);
115 |         return value;
116 |     }
117 | };
118 | extern uint64_t champsim_seed;
119 | #endif
120 | 


--------------------------------------------------------------------------------
/inc/dram_controller.h:
--------------------------------------------------------------------------------
  1 | #ifndef DRAM_H
  2 | #define DRAM_H
  3 | 
  4 | #include "memory_class.h"
  5 | 
  6 | // DRAM configuration
  7 | #define DRAM_CHANNEL_WIDTH 8 // 8B
  8 | #define DRAM_WQ_SIZE 64
  9 | #define DRAM_RQ_SIZE 64
 10 | 
 11 | #define tRP_DRAM_NANOSECONDS  12.5
 12 | #define tRCD_DRAM_NANOSECONDS 12.5
 13 | #define tCAS_DRAM_NANOSECONDS 12.5
 14 | 
 15 | // the data bus must wait this amount of time when switching between reads and writes, and vice versa
 16 | #define DRAM_DBUS_TURN_AROUND_TIME ((15*CPU_FREQ)/2000) // 7.5 ns 
 17 | extern uint32_t DRAM_MTPS, DRAM_DBUS_RETURN_TIME;
 18 | 
 19 | // these values control when to send out a burst of writes
 20 | #define DRAM_WRITE_HIGH_WM    ((DRAM_WQ_SIZE*7)>>3) // 7/8th
 21 | #define DRAM_WRITE_LOW_WM     ((DRAM_WQ_SIZE*3)>>2) // 6/8th
 22 | #define MIN_DRAM_WRITES_PER_SWITCH (DRAM_WQ_SIZE*1/4)
 23 | 
 24 | // DRAM
 25 | class MEMORY_CONTROLLER : public MEMORY { // DRAM model: per-channel queues plus bus/bank timing state
 26 |   public:
 27 |     const string NAME;
 28 | 
 29 |     DRAM_ARRAY dram_array[DRAM_CHANNELS][DRAM_RANKS][DRAM_BANKS];
 30 |     uint64_t dbus_cycle_available[DRAM_CHANNELS], dbus_cycle_congested[DRAM_CHANNELS], dbus_congested[NUM_TYPES+1][NUM_TYPES+1];
 31 |     uint64_t bank_cycle_available[DRAM_CHANNELS][DRAM_RANKS][DRAM_BANKS];
 32 |     uint8_t  do_write, write_mode[DRAM_CHANNELS];
 33 |     uint32_t processed_writes, scheduled_reads[DRAM_CHANNELS], scheduled_writes[DRAM_CHANNELS];
 34 |     int fill_level;
 35 | 
 36 |     BANK_REQUEST bank_request[DRAM_CHANNELS][DRAM_RANKS][DRAM_BANKS];
 37 | 
 38 |     // one write queue and one read queue per channel
 39 |     PACKET_QUEUE WQ[DRAM_CHANNELS], RQ[DRAM_CHANNELS];
 40 | 
 41 |     // constructor: v1 becomes NAME; all counters start at 0 and every channel gets its queues sized
 42 |     MEMORY_CONTROLLER(string v1) : NAME (v1) {
 43 |         do_write = 0;
 44 |         processed_writes = 0;
 45 |         fill_level = FILL_DRAM;
 46 | 
 47 |         for (uint32_t row=0; row<NUM_TYPES+1; row++)
 48 |             for (uint32_t col=0; col<NUM_TYPES+1; col++)
 49 |                 dbus_congested[row][col] = 0;
 50 | 
 51 |         for (uint32_t ch=0; ch<DRAM_CHANNELS; ch++) {
 52 |             // per-channel bus and scheduling state
 53 |             write_mode[ch] = 0;
 54 |             scheduled_reads[ch] = scheduled_writes[ch] = 0;
 55 |             dbus_cycle_available[ch] = dbus_cycle_congested[ch] = 0;
 56 | 
 57 |             for (uint32_t rk=0; rk<DRAM_RANKS; rk++)
 58 |                 for (uint32_t bk=0; bk<DRAM_BANKS; bk++)
 59 |                     bank_cycle_available[ch][rk][bk] = 0;
 60 | 
 61 |             // name, capacity and backing storage of this channel's write queue, then its read queue
 62 |             PACKET_QUEUE &wq = WQ[ch];
 63 |             wq.NAME = "DRAM_WQ" + to_string(ch);
 64 |             wq.SIZE = DRAM_WQ_SIZE;
 65 |             wq.entry = new PACKET [DRAM_WQ_SIZE];
 66 | 
 67 |             PACKET_QUEUE &rq = RQ[ch];
 68 |             rq.NAME = "DRAM_RQ" + to_string(ch);
 69 |             rq.SIZE = DRAM_RQ_SIZE;
 70 |             rq.entry = new PACKET [DRAM_RQ_SIZE];
 71 |         }
 72 |     };
 73 | 
 74 |     // destructor (empty)
 75 |     ~MEMORY_CONTROLLER() {
 76 | 
 77 |     };
 78 | 
 79 |     // functions
 80 |     int  add_rq(PACKET *packet),
 81 |          add_wq(PACKET *packet),
 82 |          add_pq(PACKET *packet);
 83 | 
 84 |     void return_data(PACKET *packet),
 85 |          operate(),
 86 |          increment_WQ_FULL(uint64_t address);
 87 | 
 88 |     uint32_t get_occupancy(uint8_t queue_type, uint64_t address),
 89 |              get_size(uint8_t queue_type, uint64_t address);
 90 | 
 91 |     void schedule(PACKET_QUEUE *queue), process(PACKET_QUEUE *queue),
 92 |          update_schedule_cycle(PACKET_QUEUE *queue),
 93 |          update_process_cycle(PACKET_QUEUE *queue),
 94 |          reset_remain_requests(PACKET_QUEUE *queue, uint32_t channel);
 95 | 
 96 |     uint32_t dram_get_channel(uint64_t address),
 97 |              dram_get_rank   (uint64_t address),
 98 |              dram_get_bank   (uint64_t address),
 99 |              dram_get_row    (uint64_t address),
100 |              dram_get_column (uint64_t address),
101 |              drc_check_hit (uint64_t address, uint32_t cpu, uint32_t channel, uint32_t rank, uint32_t bank, uint32_t row);
102 | 
103 |     uint64_t get_bank_earliest_cycle();
104 | 
105 |     int check_dram_queue(PACKET_QUEUE *queue, PACKET *packet);
106 | };
107 | 
108 | #endif
109 | 


--------------------------------------------------------------------------------
/inc/instruction.h:
--------------------------------------------------------------------------------
  1 | #ifndef INSTRUCTION_H
  2 | #define INSTRUCTION_H
  3 | 
  4 | 
  5 | // instruction format
  6 | #define ROB_SIZE 352
  7 | #define LQ_SIZE 128
  8 | #define SQ_SIZE 72
  9 | #define NUM_INSTR_DESTINATIONS_SPARC 4
 10 | #define NUM_INSTR_DESTINATIONS 2
 11 | #define NUM_INSTR_SOURCES 4
 12 | 
 13 | // special registers that help us identify branches
 14 | #define REG_STACK_POINTER 6
 15 | #define REG_FLAGS 25
 16 | #define REG_INSTRUCTION_POINTER 26
 17 | 
 18 | // branch types
 19 | #define NOT_BRANCH           0
 20 | #define BRANCH_DIRECT_JUMP   1
 21 | #define BRANCH_INDIRECT      2
 22 | #define BRANCH_CONDITIONAL   3
 23 | #define BRANCH_DIRECT_CALL   4
 24 | #define BRANCH_INDIRECT_CALL 5
 25 | #define BRANCH_RETURN        6
 26 | #define BRANCH_OTHER         7
 27 | 
 28 | #include "set.h"
 29 | 
 30 | class input_instr { // instruction description: ip, branch flags, register and memory operands
 31 |   public:
 32 | 
 33 |     // instruction pointer or PC (Program Counter)
 34 |     uint64_t ip;
 35 | 
 36 |     // branch info
 37 |     uint8_t is_branch;
 38 |     uint8_t branch_taken;
 39 | 
 40 |     uint8_t destination_registers[NUM_INSTR_DESTINATIONS]; // output registers
 41 |     uint8_t source_registers[NUM_INSTR_SOURCES]; // input registers
 42 | 
 43 |     uint64_t destination_memory[NUM_INSTR_DESTINATIONS]; // output memory
 44 |     uint64_t source_memory[NUM_INSTR_SOURCES]; // input memory
 45 | 
 46 |     input_instr()
 47 |         : ip(0),
 48 |           is_branch(0),
 49 |           branch_taken(0)
 50 |     {
 51 |         // clear every destination (output) operand slot
 52 |         for (uint32_t dst=0; dst<NUM_INSTR_DESTINATIONS; dst++) {
 53 |             destination_memory[dst] = 0;
 54 |             destination_registers[dst] = 0;
 55 |         }
 56 | 
 57 |         // clear every source (input) operand slot
 58 |         for (uint32_t src=0; src<NUM_INSTR_SOURCES; src++)
 59 |             source_memory[src] = source_registers[src] = 0;
 60 |     };
 61 | };
 62 | 
 63 | class cloudsuite_instr { // like input_instr, but with NUM_INSTR_DESTINATIONS_SPARC destination slots and an asid pair
 64 |   public:
 65 | 
 66 |     // instruction pointer or PC (Program Counter)
 67 |     uint64_t ip;
 68 | 
 69 |     // branch info
 70 |     uint8_t is_branch;
 71 |     uint8_t branch_taken;
 72 | 
 73 |     uint8_t destination_registers[NUM_INSTR_DESTINATIONS_SPARC]; // output registers
 74 |     uint8_t source_registers[NUM_INSTR_SOURCES]; // input registers
 75 | 
 76 |     uint64_t destination_memory[NUM_INSTR_DESTINATIONS_SPARC]; // output memory
 77 |     uint64_t source_memory[NUM_INSTR_SOURCES]; // input memory
 78 | 
 79 |     uint8_t asid[2];
 80 | 
 81 |     cloudsuite_instr()
 82 |         : ip(0),
 83 |           is_branch(0),
 84 |           branch_taken(0)
 85 |     {
 86 |         // both asid bytes start at UINT8_MAX
 87 |         asid[0] = asid[1] = UINT8_MAX;
 88 | 
 89 |         // clear every destination (output) operand slot
 90 |         for (uint32_t dst=0; dst<NUM_INSTR_DESTINATIONS_SPARC; dst++) {
 91 |             destination_memory[dst] = 0;
 92 |             destination_registers[dst] = 0;
 93 |         }
 94 | 
 95 |         // clear every source (input) operand slot
 96 |         for (uint32_t src=0; src<NUM_INSTR_SOURCES; src++)
 97 |             source_memory[src] = source_registers[src] = 0;
 98 |     };
 99 | };
100 | 
101 | class ooo_model_instr { // per-instruction state: ids and cycle stamps, status flags, operands, addresses and dependency sets
102 |   public:
103 |     uint64_t instr_id,
104 |              ip,
105 |              fetch_producer,
106 |              producer_id,
107 |              translated_cycle,
108 |              fetched_cycle,
109 |              execute_begin_cycle,
110 |              retired_cycle,
111 |              event_cycle;
112 | 
113 |     uint8_t is_branch,
114 |             is_memory,
115 |             branch_taken,
116 |             branch_mispredicted,
117 |             branch_prediction_made,
118 |             translated,
119 |             data_translated,
120 |             source_added[NUM_INSTR_SOURCES],
121 |             destination_added[NUM_INSTR_DESTINATIONS_SPARC],
122 |             is_producer,
123 |             is_consumer,
124 |             reg_RAW_producer,
125 |             reg_ready,
126 |             mem_ready,
127 |             asid[2],
128 |             reg_RAW_checked[NUM_INSTR_SOURCES];
129 | 
130 |     uint8_t branch_type;
131 |     uint64_t branch_target;
132 | 
133 |     uint32_t fetched, scheduled;
134 |     int num_reg_ops, num_mem_ops, num_reg_dependent;
135 | 
136 |     // executed bit is set after all dependencies are eliminated and this instr is chosen on a cycle, according to EXEC_WIDTH
137 |     int executed;
138 | 
139 |     uint8_t destination_registers[NUM_INSTR_DESTINATIONS_SPARC]; // output registers
140 | 
141 |     uint8_t source_registers[NUM_INSTR_SOURCES]; // input registers 
142 | 
143 |     // these are instruction ids of other instructions in the window
144 |     //int64_t registers_instrs_i_depend_on[NUM_INSTR_SOURCES];
145 |     // these are indices of instructions in the window that depend on me
146 |     //uint8_t registers_instrs_depend_on_me[ROB_SIZE], registers_index_depend_on_me[ROB_SIZE][NUM_INSTR_SOURCES];
147 |     fastset
148 |         registers_instrs_depend_on_me, registers_index_depend_on_me[NUM_INSTR_SOURCES];
149 | 
150 | 
151 |     // memory addresses that may cause dependencies between instructions
152 |     uint64_t instruction_pa, data_pa, virtual_address, physical_address;
153 |     uint64_t destination_memory[NUM_INSTR_DESTINATIONS_SPARC]; // output memory
154 |     uint64_t source_memory[NUM_INSTR_SOURCES]; // input memory
155 |     //int source_memory_outstanding[NUM_INSTR_SOURCES];  // a value of 2 here means the load hasn't been issued yet, 1 means it has been issued, but not returned yet, and 0 means it has returned
156 | 
157 |     // keep around a record of what the original virtual addresses were
158 |     uint64_t destination_virtual_address[NUM_INSTR_DESTINATIONS_SPARC];
159 |     uint64_t source_virtual_address[NUM_INSTR_SOURCES];
160 | 
161 |     // these are instruction ids of other instructions in the window
162 |     //uint32_t memory_instrs_i_depend_on[NUM_INSTR_SOURCES];
163 | 
164 |     // these are indices of instructions in the ROB that depend on me
165 |     //uint8_t memory_instrs_depend_on_me[ROB_SIZE];
166 |     fastset memory_instrs_depend_on_me;
167 | 
168 |     uint32_t lq_index[NUM_INSTR_SOURCES],
169 |              sq_index[NUM_INSTR_DESTINATIONS_SPARC],
170 |              forwarding_index[NUM_INSTR_DESTINATIONS_SPARC];
171 | 
172 |     ooo_model_instr() { // zero everything, except asid (UINT8_MAX) and the lq/sq indices (UINT32_MAX)
173 |         instr_id = 0;
174 |         ip = 0;
175 |         fetch_producer = 0;
176 |         producer_id = 0;
177 |         translated_cycle = 0;
178 |         fetched_cycle = 0;
179 |         execute_begin_cycle = 0;
180 |         retired_cycle = 0;
181 |         event_cycle = 0;
182 | 
183 |         is_branch = 0;
184 |         is_memory = 0;
185 |         branch_taken = 0;
186 |         branch_mispredicted = 0;
187 |         branch_prediction_made = 0;
188 |         translated = 0;
189 |         data_translated = 0;
190 |         is_producer = 0;
191 |         is_consumer = 0;
192 |         reg_RAW_producer = 0;
193 |         fetched = 0;
194 |         scheduled = 0;
195 |         executed = 0;
196 |         reg_ready = 0;
197 |         mem_ready = 0;
198 |         asid[0] = UINT8_MAX;
199 |         asid[1] = UINT8_MAX;
200 | 
201 |         branch_type = NOT_BRANCH;
202 |         branch_target = 0;
203 | 
204 |         instruction_pa = 0;
205 |         data_pa = 0;
206 |         virtual_address = 0;
207 |         physical_address = 0;
208 | 
209 |         num_reg_ops = 0;
210 |         num_mem_ops = 0;
211 |         num_reg_dependent = 0;
212 | 
213 |         for (uint32_t i=0; i<NUM_INSTR_SOURCES; i++) {
214 |             source_registers[i] = 0;
215 |             source_memory[i] = 0;
216 |             source_virtual_address[i] = 0;
217 |             source_added[i] = 0;
218 |             lq_index[i] = UINT32_MAX;
219 |             reg_RAW_checked[i] = 0;
220 |         }
221 | 
222 |         for (uint32_t i=0; i<NUM_INSTR_DESTINATIONS_SPARC; i++) {
223 |             destination_memory[i] = 0;
224 |             destination_registers[i] = 0;
225 |             destination_virtual_address[i] = 0;
226 |             destination_added[i] = 0;
227 |             sq_index[i] = UINT32_MAX;
228 |             forwarding_index[i] = 0;
229 |         }
230 | 
231 | #if 0
232 |         for (uint32_t i=0; i<ROB_SIZE; i++) {
233 |             registers_instrs_depend_on_me[i] = 0;
234 |             memory_instrs_depend_on_me[i] = 0;
235 | 
236 |             for (uint32_t j=0; j<NUM_INSTR_SOURCES; j++)
237 |                 registers_index_depend_on_me[i][j] = 0;
238 |         }
239 | #endif
240 |     };
241 | 
242 |   void print_instr() // dump id, ip, branch flags, then source/destination registers and memory addresses to cout
243 |   {
244 |     cout << "*** " << instr_id << " ***" << endl;
245 |     cout << hex << "0x" << (uint64_t)ip << dec << endl;
246 |     cout << (uint32_t)is_branch << " " << (uint32_t)branch_taken << endl;
247 |     for(uint32_t i=0; i<NUM_INSTR_SOURCES; i++)
248 |       {
249 |         cout << (uint32_t)source_registers[i] << " "; // cast so the uint8_t prints as a number, not a character
250 |       }
251 |     cout << endl;
252 |     for(uint32_t i=0; i<NUM_INSTR_SOURCES; i++)
253 |       {
254 |         cout << hex << "0x" << (uint64_t)source_memory[i] << dec << " "; // full 64-bit address (was truncated to 32 bits)
255 |       }
256 |     cout << endl;
257 |     for(uint32_t i=0; i<NUM_INSTR_DESTINATIONS; i++) // NOTE(review): prints only the first NUM_INSTR_DESTINATIONS of the NUM_INSTR_DESTINATIONS_SPARC slots - confirm intended
258 |       {
259 |         cout << (uint32_t)destination_registers[i] << " ";
260 |       }
261 |     cout << endl;
262 |     for(uint32_t i=0; i<NUM_INSTR_DESTINATIONS; i++)
263 |       {
264 |         cout << hex << "0x" << (uint64_t)destination_memory[i] << dec << " "; // full 64-bit address (was truncated to 32 bits)
265 |       }
266 |     cout << endl;
267 | 
268 |     cout << endl;
269 |   }
270 | };
271 | 
272 | #endif
273 | 


--------------------------------------------------------------------------------
/inc/kpcp.h:
--------------------------------------------------------------------------------
  1 | #ifndef KPCP_H
  2 | #define KPCP_H
  3 | 
  4 | #include "cache.h"
  5 | 
  6 | // L2 SPP
  7 | //#define L2_PF_DEBUG_PRINT
  8 | #ifdef L2_PF_DEBUG_PRINT
  9 | #define L2_PF_DEBUG(x) x
 10 | #else
 11 | #define L2_PF_DEBUG(x)
 12 | #endif
 13 | 
 14 | #define L2_ST_SET 1
 15 | #define L2_ST_WAY 256
 16 | #define L2_ST_PRIME 1
 17 | #define L2_PT_SET 512
 18 | #define L2_PT_WAY 4
 19 | #define L2_PT_PRIME 509
 20 | #define CDELTA_MAX 16
 21 | #define CSIG_MAX 16
 22 | #define L2_GHR_TRACK 8
 23 | //#define L2_GHR_ON
 24 | #define SIG_SHIFT  3
 25 | #define SIG_LENGTH 12
 26 | #define SIG_MASK ((1 << SIG_LENGTH) - 1)
 27 | 
 28 | #define BAD_MAX 7
 29 | 
 30 | class SIGNATURE_TABLE { // one signature-table entry; every field starts at 0
 31 |   public:
 32 | 
 33 |     int valid,
 34 |         tag,
 35 |         last_block,
 36 |         signature,
 37 |         lru,
 38 |         l2_pf[64],
 39 |         used[64],
 40 |         delta[64],
 41 |         depth[64],
 42 |         dirty[64],
 43 |         first_hit;
 44 | 
 45 |     SIGNATURE_TABLE()
 46 |         : valid(0),
 47 |           tag(0),
 48 |           last_block(0),
 49 |           signature(0),
 50 |           lru(0),
 51 |           first_hit(0)
 52 |     {
 53 |         // zero the five 64-entry per-slot arrays
 54 |         for (uint32_t slot=0; slot<64; slot++) {
 55 |             dirty[slot] = 0;
 56 |             depth[slot] = 0;
 57 |             delta[slot] = 0;
 58 |             used[slot] = 0;
 59 |             l2_pf[slot] = 0;
 60 |         }
 61 |     };
 62 | };
 63 | 
 64 | class PATTERN_TABLE { // one pattern-table entry: delta, c_delta and c_sig, all starting at 0
 65 |   public:
 66 |     int delta,
 67 |         c_delta,
 68 |         c_sig;
 69 | 
 70 |     PATTERN_TABLE()
 71 |         : delta(0),
 72 |           c_delta(0),
 73 |           c_sig(0)
 74 |     { };
 75 | };
 76 | 
 77 | class GLOBAL_HISTORY_REGISTER { // one global-history entry; every field starts at 0
 78 |   public:
 79 |     int signature,
 80 |         path_conf,
 81 |         last_block,
 82 |         oop_delta,
 83 |         lru;
 84 | 
 85 |     GLOBAL_HISTORY_REGISTER()
 86 |         : signature(0), path_conf(0),
 87 |           last_block(0), oop_delta(0),
 88 |           lru(0) // previously left uninitialized by the constructor
 89 |     {
 90 |     };
 91 | };
 92 | 
 93 | extern SIGNATURE_TABLE L2_ST[NUM_CPUS][L2_ST_SET][L2_ST_WAY];
 94 | extern PATTERN_TABLE L2_PT[NUM_CPUS][L2_PT_SET][L2_PT_WAY];
 95 | extern GLOBAL_HISTORY_REGISTER L2_GHR[NUM_CPUS][L2_GHR_TRACK];
 96 | extern int L2_ST_access[NUM_CPUS], L2_ST_hit[NUM_CPUS], L2_ST_invalid[NUM_CPUS], L2_ST_miss[NUM_CPUS];
 97 | extern int L2_PT_access[NUM_CPUS], L2_PT_hit[NUM_CPUS], L2_PT_invalid[NUM_CPUS], L2_PT_miss[NUM_CPUS];
 98 | extern int l2_sig_dist[NUM_CPUS][1<<SIG_LENGTH];
 99 | 
100 | unsigned int get_new_signature(unsigned int old_signature, int curr_delta);
101 | int L2_ST_update(uint32_t cpu, uint64_t addr);
102 | int L2_ST_check(uint32_t cpu, uint64_t addr);
103 | void L2_PT_update(uint32_t cpu, int signature, int delta);
104 | void notify_sampler(uint32_t cpu, int64_t address, int dirty, int useful);
105 | 
106 | //#include "cache.h"
107 | //#include "kpcp.h"
108 | 
109 | SIGNATURE_TABLE L2_ST[NUM_CPUS][L2_ST_SET][L2_ST_WAY]; // definition (storage) of the signature table declared extern above
110 | PATTERN_TABLE L2_PT[NUM_CPUS][L2_PT_SET][L2_PT_WAY]; // definition of the pattern table
111 | GLOBAL_HISTORY_REGISTER L2_GHR[NUM_CPUS][L2_GHR_TRACK]; // definition of the global history entries
112 | // NOTE(review): these are definitions inside a header, so including it from more than one translation unit would produce duplicate symbols
113 | int L2_ST_access[NUM_CPUS], L2_ST_hit[NUM_CPUS], L2_ST_invalid[NUM_CPUS], L2_ST_miss[NUM_CPUS]; // per-CPU signature-table event counters
114 | int L2_PT_access[NUM_CPUS], L2_PT_hit[NUM_CPUS], L2_PT_invalid[NUM_CPUS], L2_PT_miss[NUM_CPUS]; // per-CPU pattern-table event counters
115 | int l2_sig_dist[NUM_CPUS][1<<SIG_LENGTH]; // per-CPU count of how often each signature value was assigned
116 | 
117 | unsigned int get_new_signature(unsigned int old_signature, int curr_delta)
118 | {
119 |     // a zero delta leaves the signature unchanged
120 |     if (!curr_delta) return old_signature;
121 | 
122 |     // negative deltas are encoded as 64 + |delta|; positive deltas are used as-is
123 |     const int encoded_delta = (curr_delta < 0) ? (64 + (-curr_delta)) : curr_delta;
124 | 
125 |     // shift the old signature, mix in the delta, and keep only the bits covered by SIG_MASK
126 |     const unsigned int shifted = old_signature << SIG_SHIFT;
127 |     const unsigned int mixed = (shifted ^ encoded_delta) & SIG_MASK;
128 |     if (mixed != 0)
129 |         return mixed;
130 | 
131 |     // the mix collapsed to 0: fall back to the encoded delta, or to the old signature if that is 0 as well
132 |     if (encoded_delta != 0)
133 |         return encoded_delta;
134 | 
135 |     return old_signature;
136 | }
137 | 
138 | // Update signature table
139 | int L2_ST_update(uint32_t cpu, uint64_t addr)
140 | {
141 |     uint64_t curr_page = addr >> LOG2_PAGE_SIZE;
142 |     int tag = curr_page & 0xFFFF,
143 |         hit = 0, match = -1,
144 |         L2_ST_idx = curr_page % L2_ST_PRIME,
145 |         curr_block = (addr >> LOG2_BLOCK_SIZE) & 0x3F;
146 |     SIGNATURE_TABLE *table = L2_ST[cpu][L2_ST_idx];
147 |     int delta_buffer = 0, sig_buffer = 0;
148 | 
149 |     for (match=0; match<L2_ST_WAY; match++) {
150 |         if (table[match].valid && (table[match].tag == tag)) { // Hit 
151 |             delta_buffer = curr_block - table[match].last_block; // Buffer current delta
152 |             sig_buffer = table[match].signature; // Buffer old signature
153 | 
154 |             if (table[match].signature == 0) { // First hit in L2_ST
155 |                 // We cannot associate delta pattern with signature when we see "the first hit in L2_ST"
156 |                 // At this point, all we know about this page is "the first accessed offset"
157 |                 // We don't have any delta information that can be a part of signature
158 |                 // In other words, the first offset does not update PT
159 | 
160 |                 int sig_delta = curr_block - table[match].last_block;
161 |                 if (sig_delta < 0)
162 |                     sig_delta = 64 + (curr_block - table[match].last_block)*(-1);
163 |                 table[match].signature = sig_delta & SIG_MASK; // This is the first signature
164 |                 table[match].first_hit = 1;
165 |                 l2_sig_dist[cpu][table[match].signature]++;
166 | 
167 |                 if (warmup_complete[cpu])
168 |                 L2_PF_DEBUG(printf("ST_hit_first cpu: %d cl_addr: %lx page: %lx block: %d init_sig: %x delta: %d\n", 
169 |                             cpu, addr >> LOG2_BLOCK_SIZE, curr_page, curr_block, table[match].signature, delta_buffer));
170 |             }
171 |             else {
172 |                 hit = 1;
173 |                 table[match].first_hit = 0;
174 | 
175 |                 if (delta_buffer) {
176 |                     // This is non-speculative information tracked from actual L2 cache demand
177 |                     // Now, the old signature will be associated with current delta
178 |                     L2_PT_update(cpu, sig_buffer, delta_buffer);
179 |                 }
180 |                 else
181 |                     break;
182 | 
183 |                 if (warmup_complete[cpu])
184 |                 L2_PF_DEBUG(printf("ST_hit cpu: %d cl_addr: %lx page: %lx block: %d old_sig: %x delta: %d\n", 
185 |                             cpu, addr >> LOG2_BLOCK_SIZE, curr_page, curr_block, sig_buffer, delta_buffer));
186 | 
187 |                 // Update signature
188 |                 int new_signature = get_new_signature(sig_buffer, delta_buffer);
189 |                 table[match].signature = new_signature;
190 |                 l2_sig_dist[cpu][table[match].signature]++;
191 |             }
192 | 
193 |             // Update last_block
194 |             table[match].last_block = curr_block;
195 |             L2_ST_hit[cpu]++; L2_ST_access[cpu]++;
196 |             break;
197 |         }
198 |     }
199 | 
200 |     if (match == L2_ST_WAY) {
201 |         for (match=0; match<L2_ST_WAY; match++) {
202 |             if (table[match].valid == 0) { // Invalid
203 |                 // Update metadata
204 |                 table[match].valid = 1;
205 |                 table[match].tag = tag;
206 |                 table[match].signature = 0;
207 |                 table[match].first_hit = 0;
208 |                 table[match].last_block = curr_block;
209 |                 L2_ST_invalid[cpu]++; L2_ST_access[cpu]++;
210 | 
211 |                 if (warmup_complete[cpu])
212 |                 L2_PF_DEBUG(printf("ST_invalid cpu: %d cl_addr: %lx page: %lx block: %d\n", cpu, addr >> LOG2_BLOCK_SIZE, curr_page, curr_block));
213 |                 break;
214 |             }
215 |         }
216 |     }
217 | 
218 |     if (match == L2_ST_WAY) { // Miss
219 |         // Search for LRU victim
220 |         for (match=0; match<L2_ST_WAY; match++) {
221 |             if (table[match].lru == (L2_ST_WAY-1))
222 |                 break;
223 |         }
224 | 
225 |         // Update metadata
226 |         table[match].valid = 1;
227 |         table[match].tag = tag;
228 |         table[match].signature = 0;
229 |         table[match].first_hit = 0;
230 |         table[match].last_block = curr_block;
231 |         
232 |         for (int i=0; i<64; i++) {
233 |             table[match].l2_pf[i] = 0;
234 |             table[match].used[i] = 0;
235 |         }
236 | 
237 |         if (warmup_complete[cpu])
238 |         L2_PF_DEBUG(printf("ST_miss cpu: %d cl_addr: %lx page: %lx block: %d lru: %d\n", cpu, addr >> LOG2_BLOCK_SIZE, curr_page, curr_block, table[match].lru));
239 |         L2_ST_miss[cpu]++; L2_ST_access[cpu]++;
240 | 
241 |         #ifdef L2_GHR_ON
242 |         // Check GHR
243 |         int ghr_max = 0, ghr_idx = -1, spec_block = 0, spec_sig = 0;
244 |         for (int i=0; i<L2_GHR_TRACK; i++) {
245 |             spec_block = L2_GHR[cpu][i].last_block + L2_GHR[cpu][i].oop_delta;
246 |             if (spec_block >= 64)
247 |                 spec_block -= 64;
248 |             else if (spec_block < 0)
249 |                 spec_block += 64;
250 |             if ((spec_block == curr_block) && (ghr_max <= L2_GHR[cpu][i].path_conf)) {
251 |                 ghr_max = L2_GHR[cpu][i].path_conf;
252 |                 ghr_idx = i;
253 |                 spec_sig = get_new_signature(L2_GHR[cpu][i].signature, L2_GHR[cpu][i].oop_delta);
254 |                 if (warmup_complete[cpu])
255 |                 L2_PF_DEBUG(printf("cpu: %d OOP_match  L2_GHR[%d]  signature: %x  path_conf: %d  last_block: %d  oop_delta: %d  spec_block: %d == curr_block: %d  spec_sig: %x\n",
256 |                           cpu, i, L2_GHR[cpu][i].signature, L2_GHR[cpu][i].path_conf, L2_GHR[cpu][i].last_block, 
257 |                           L2_GHR[cpu][i].oop_delta, spec_block, curr_block, spec_sig));
258 |             }
259 |             else {
260 |                 if (warmup_complete[cpu])
261 |                 L2_PF_DEBUG(printf("cpu: %d OOP_unmatch  L2_GHR[%d]  signature: %x  path_conf: %d  last_block: %d  oop_delta: %d  spec_block: %d != curr_block: %d  spec_sig: %x\n",
262 |                           cpu, i, L2_GHR[cpu][i].signature, L2_GHR[cpu][i].path_conf, L2_GHR[cpu][i].last_block, 
263 |                           L2_GHR[cpu][i].oop_delta, spec_block, curr_block, spec_sig));
264 |             }
265 |         }
266 | 
267 |         if (ghr_idx >= 0) {
268 |             // Speculatively update first page
269 |             spec_sig = get_new_signature(L2_GHR[cpu][ghr_idx].signature, L2_GHR[cpu][ghr_idx].oop_delta);
270 | 
271 |             hit = 1;
272 |             table[match].signature = spec_sig;
273 |             if (warmup_complete[cpu])
274 |             L2_PF_DEBUG(printf("cpu: %d spec_update  page: %x  sig: %3x  delta: %3d  curr_block: %2d  last_block[NA]: %2d\n", 
275 |                       cpu, tag, spec_sig, L2_GHR[cpu][ghr_idx].oop_delta, curr_block, L2_GHR[cpu][ghr_idx].last_block));
276 |         }
277 |         #endif
278 |     }
279 | 
280 |     // Update LRU
281 |     int position = table[match].lru;
282 |     for (int i=0; i<L2_ST_WAY; i++) {
283 |         if (table[i].lru < position)
284 |             table[i].lru++;
285 |     }
286 |     table[match].lru = 0;
287 | 
288 |     if (hit)
289 |         return match;
290 |     else
291 |         return -1;
292 | }
293 | 
294 | int L2_ST_check(uint32_t cpu, uint64_t addr) // Look up addr's page in cpu's signature table without modifying it; returns the matching way, or -1 if absent
295 | {
296 |     uint64_t curr_page = addr >> LOG2_PAGE_SIZE;
297 |     int tag = curr_page & 0xFFFF, // tag: low 16 bits of the page number
298 |         match = -1,
299 |         L2_ST_idx = curr_page % L2_ST_PRIME; // set index: page number modulo L2_ST_PRIME
300 | 
301 |     SIGNATURE_TABLE *table = L2_ST[cpu][L2_ST_idx];
302 | 
303 |     for (match=0; match<L2_ST_WAY; match++) { // scan every way of the selected set
304 |         if (table[match].valid && (table[match].tag == tag)) {
305 |             if (warmup_complete[cpu]) // NOTE(review): L2_PF_DEBUG is defined outside this view; if it can expand to nothing, the ';' after the call is what keeps this if from capturing the return below - confirm
306 |             L2_PF_DEBUG(printf("ST_check found cpu: %d cl_addr: %lx page: %lx block: %ld old_sig: %x last_block: %d\n", 
307 |                         cpu, addr >> LOG2_BLOCK_SIZE, curr_page, (addr >> LOG2_BLOCK_SIZE) & 0x3F, table[match].signature, table[match].last_block));
308 |             return match; // hit: index of the matching way
309 |         }
310 |     }
311 | 
312 |     if (warmup_complete[cpu])
313 |     L2_PF_DEBUG(printf("ST_check not found cpu: %d cl_addr: %lx page: %lx block: %ld\n", cpu, addr >> LOG2_BLOCK_SIZE, curr_page, (addr >> LOG2_BLOCK_SIZE) & 0x3F));
314 |     return -1; // no valid way carries this tag
315 | }
316 | 
317 | void L2_PT_update(uint32_t cpu, int signature, int delta) // Record in cpu's pattern table that `delta` was observed after `signature`; updates counters and hit/invalid/miss stats, returns nothing
318 | {
319 |     int L2_PT_idx = signature % L2_PT_PRIME; // set index: signature modulo L2_PT_PRIME
320 |     PATTERN_TABLE *table = L2_PT[cpu][L2_PT_idx];
321 | 
322 |     // Count one more occurrence of this signature; the per-set counter c_sig is kept in way 0
323 |     // When c_sig reaches CSIG_MAX, it is reset to CSIG_MAX/2 and every c_delta in the set is halved
324 |     table[0].c_sig++;
325 | 
326 |     if (table[0].c_sig == (CSIG_MAX))
327 |     {
328 |         table[0].c_sig = CSIG_MAX >> 1;
329 |         for (int i = 0; i<L2_PT_WAY; i++)
330 |             table[i].c_delta = table[i].c_delta >> 1;
331 |         if (warmup_complete[cpu])
332 |         L2_PF_DEBUG(printf("PT_sig: %4x cpu: %d c_sig saturated sig_total: %d => %d\n", L2_PT_idx, cpu, CSIG_MAX, table[0].c_sig));
333 |     }
334 | 
335 |     int match; // way index; equals L2_PT_WAY after a loop that found nothing
336 |     for (match=0; match<L2_PT_WAY; match++) 
337 |     {
338 |         if (table[match].delta == delta) // Hit 
339 |         {
340 |             table[match].c_delta++;
341 | 
342 |             if (warmup_complete[cpu])
343 |             L2_PF_DEBUG(printf("PT_sig: %4x cpu: %d update_hit delta[%d]: %2d (%d / %d)\n", 
344 |                         signature, cpu, match, table[match].delta, table[match].c_delta, table[0].c_sig));
345 |             L2_PT_hit[cpu]++; L2_PT_access[cpu]++;
346 |             break;
347 |         }
348 |     }
349 | 
350 |     if (match == L2_PT_WAY) // no way holds this delta yet: look for an unused way
351 |     {
352 |         for (match=0; match<L2_PT_WAY; match++)
353 |         {
354 |             if (table[match].delta == 0) // Invalid (a stored delta of 0 marks an unused way)
355 |             {
356 |                 // Update metadata
357 |                 table[match].delta = delta;
358 |                 table[match].c_delta = 0;
359 | 
360 |                 if (warmup_complete[cpu])
361 |                 L2_PF_DEBUG(printf("PT_sig: %4x cpu: %d update_invalid delta[%d]: %2d (%d / %d)\n", 
362 |                             signature, cpu, match, table[match].delta, table[match].c_delta, table[0].c_sig));
363 |                 L2_PT_invalid[cpu]++; L2_PT_access[cpu]++;
364 |                 break;
365 |             }
366 |         }
367 |     }
368 | 
369 |     if (match == L2_PT_WAY) // Miss
370 |     {
371 |         // Search for the lowest counter; the first way wins ties
372 |         int min_idx = 0; // seeded from way 0 so the victim index is valid even when every c_delta >= CDELTA_MAX
373 |         int min_val = table[0].c_delta;
374 |         for (match=1; match<L2_PT_WAY; match++)
375 |         {
376 |             if (table[match].c_delta < min_val)
377 |             {
378 |                 min_idx = match;
379 |                 min_val = table[match].c_delta;
380 |             }
381 |         }
382 |         match = min_idx;
383 | 
384 |         // Update metadata
385 |         table[match].delta = delta;
386 |         table[match].c_delta = 0;
387 | 
388 |         if (warmup_complete[cpu])
389 |         L2_PF_DEBUG(printf("PT_sig: %4x cpu: %d update_miss delta[%d]: %2d (%d / %d)\n", 
390 |                     signature, cpu, match, table[match].delta, table[match].c_delta, table[0].c_sig));
391 |         L2_PT_miss[cpu]++; L2_PT_access[cpu]++;
392 |     }
393 | }
394 | 
395 | // TODO: this function should be moved to the replacement policy file
396 | // Check sampler 
397 | void notify_sampler(uint32_t cpu, int64_t address, int dirty, int useful) // Currently a no-op: the whole body below is commented out, so all four arguments are ignored
398 | {
399 |     /*
400 |     int set = llc_get_set(address);
401 |     int s_idx = is_it_sampled(set);
402 | 
403 |     if (s_idx == -1)
404 |         return;
405 | 
406 |     SAMPLER_T *s_set = sampler[s_idx];
407 |     int tag = (int) address / (64*LLC_SETS); 
408 |     int match = -1;
409 | 
410 |     // Check hit
411 |     for (match=0; match<SAMPLER_WAY; match++)
412 |     {
413 |         if (s_set[match].valid && (s_set[match].tag == tag))
414 |         {
415 |             if (s_set[match].l2pf)
416 |             {
417 |                 if (useful)
418 |                 {
419 |                     if (conf_counter[cpu] < MAX_CC)
420 |                         conf_counter[cpu]++;
421 | 
422 |                     if (conf_counter[cpu] == MAX_CC)
423 |                     {
424 |                         if (dynamic_fill_thrs[cpu] > 0)
425 |                         {
426 |                             dynamic_fill_thrs[cpu]--;
427 |                             fill_down++;
428 |                             conf_level[dynamic_fill_thrs[cpu]]++;
429 | 
430 |                             printf("FILL_THRESHOLD goes down %d => %d at cycle: %ld\n", dynamic_fill_thrs[cpu]+1, dynamic_fill_thrs[cpu], ooo_cpu[cpu].current_cycle);
431 |                         }
432 |                             
433 |                         conf_counter[cpu] = 0;
434 |                     }
435 | 
436 |                     l2pf_was_useful++;
437 |                 }
438 |                 else
439 |                 {
440 |                     if (conf_counter[cpu] > 0)
441 |                         conf_counter[cpu]--;
442 | 
443 |                     l2pf_was_useless++;
444 |                 }
445 | 
446 |                 l2pf_match++;
447 |             }
448 |             
449 |             break;
450 |         }
451 |     }
452 |     l2pf_signal++;
453 | 
454 |     return;
455 |     */
456 | }
457 | 
458 | #endif
459 | 


--------------------------------------------------------------------------------
/inc/memory_class.h:
--------------------------------------------------------------------------------
 1 | #ifndef MEMORY_CLASS_H
 2 | #define MEMORY_CLASS_H
 3 | 
 4 | #include "champsim.h"
 5 | #include "block.h"
 6 | 
 7 | // CACHE ACCESS TYPE
 8 | #define LOAD      0
 9 | #define RFO       1
10 | #define PREFETCH  2
11 | #define WRITEBACK 3
12 | #define NUM_TYPES 4 // count of the access types above; sizes the per-type stat arrays in MEMORY
13 | 
14 | extern uint32_t tRP,  // Row Precharge (RP) latency
15 |                 tRCD, // Row address to Column address (RCD) latency
16 |                 tCAS; // Column Address Strobe (CAS) latency
17 | 
18 | extern uint64_t l2pf_access; // defined in another translation unit
19 | 
20 | class MEMORY { // Abstract base class: neighbour pointers, placeholder queues, pure-virtual queue operations and per-access-type statistics
21 |   public:
22 |     // memory interface (NOTE(review): these pointers are not assigned by the constructor below)
23 |     MEMORY *upper_level_icache[NUM_CPUS], *upper_level_dcache[NUM_CPUS], *lower_level, *extra_interface;
24 | 
25 |     // empty queues, each constructed with the name "EMPTY" and the argument 1 (presumably the queue size - confirm against PACKET_QUEUE)
26 |     PACKET_QUEUE WQ{"EMPTY", 1}, RQ{"EMPTY", 1}, PQ{"EMPTY", 1}, MSHR{"EMPTY", 1};
27 | 
28 |     // functions: all pure virtual, so every concrete subclass must implement them
29 |     virtual int  add_rq(PACKET *packet) = 0;
30 |     virtual int  add_wq(PACKET *packet) = 0;
31 |     virtual int  add_pq(PACKET *packet) = 0;
32 |     virtual void return_data(PACKET *packet) = 0;
33 |     virtual void operate() = 0;
34 |     virtual void increment_WQ_FULL(uint64_t address) = 0;
35 |     virtual uint32_t get_occupancy(uint8_t queue_type, uint64_t address) = 0;
36 |     virtual uint32_t get_size(uint8_t queue_type, uint64_t address) = 0;
37 | 
38 |     // stats: one counter per access type (NUM_TYPES entries each)
39 |     uint64_t ACCESS[NUM_TYPES], HIT[NUM_TYPES], MISS[NUM_TYPES], MSHR_MERGED[NUM_TYPES], STALL[NUM_TYPES];
40 | 
41 |     MEMORY() { // zeroes every statistics counter; nothing else is initialized here
42 |         for (uint32_t i=0; i<NUM_TYPES; i++) {
43 |             ACCESS[i] = 0;
44 |             HIT[i] = 0;
45 |             MISS[i] = 0;
46 |             MSHR_MERGED[i] = 0;
47 |             STALL[i] = 0;
48 |         }
49 |     }
50 | };
51 | 
52 | class BANK_REQUEST { // Plain record of per-bank request state; every field is public and reset by the constructor
53 |   public:
54 |     uint64_t cycle_available,
55 |              address,
56 |              full_addr;
57 | 
58 |     uint32_t open_row;
59 | 
60 |     uint8_t working,
61 |             working_type,
62 |             row_buffer_hit,
63 |             drc_hit,
64 |             is_write,
65 |             is_read;
66 | 
67 |     int request_index;
68 | 
69 |     BANK_REQUEST() { // everything starts at 0 except open_row and request_index
70 |         cycle_available = 0;
71 |         address = 0;
72 |         full_addr = 0;
73 | 
74 |         open_row = UINT32_MAX; // presumably a "no row open" sentinel - confirm against the DRAM controller
75 | 
76 |         working = 0;
77 |         working_type = 0;
78 |         row_buffer_hit = 0;
79 |         drc_hit = 0;
80 |         is_write = 0;
81 |         is_read = 0;
82 | 
83 |         request_index = -1; // presumably "no request attached" - confirm against the DRAM controller
84 |     };
85 | };
86 | 
87 | #endif
88 | 


--------------------------------------------------------------------------------
/inc/ooo_cpu.h:
--------------------------------------------------------------------------------
  1 | #ifndef OOO_CPU_H
  2 | #define OOO_CPU_H
  3 | 
  4 | #include "cache.h"
  5 | 
  6 | #ifdef CRC2_COMPILE
  7 | #define STAT_PRINTING_PERIOD 1000000
  8 | #else
  9 | #define STAT_PRINTING_PERIOD 10000000 // 10x the CRC2_COMPILE value; O3_CPU's constructor seeds next_print_instruction with this
 10 | #endif
 11 | #define DEADLOCK_CYCLE 1000000
 12 | 
 13 | using namespace std; // NOTE(review): a using-directive in a header applies to every file that includes it
 14 | 
 15 | // CORE PROCESSOR
 16 | #define FETCH_WIDTH 6
 17 | #define DECODE_WIDTH 6
 18 | #define EXEC_WIDTH 6
 19 | #define LQ_WIDTH 2
 20 | #define SQ_WIDTH 2
 21 | #define RETIRE_WIDTH 4
 22 | #define SCHEDULER_SIZE 128
 23 | #define BRANCH_MISPREDICT_PENALTY 1
 24 | //#define SCHEDULING_LATENCY 0
 25 | //#define EXEC_LATENCY 0
 26 | //#define DECODE_LATENCY 2
 27 | 
 28 | #define STA_SIZE (ROB_SIZE*NUM_INSTR_DESTINATIONS_SPARC) // capacity of O3_CPU::STA
 29 | 
 30 | extern uint32_t SCHEDULING_LATENCY, EXEC_LATENCY, DECODE_LATENCY; // variables defined elsewhere; the macros of the same names above are commented out
 31 | 
 32 | // cpu
 33 | class O3_CPU { // State of one simulated core: trace input, pipeline buffers, ready queues, branch bookkeeping, and its TLBs/caches; member functions are only declared here
 34 |   public:
 35 |     uint32_t cpu;
 36 | 
 37 |     // trace (NOTE(review): trace_string and gunzip_command are not initialized by the constructor)
 38 |     FILE *trace_file;
 39 |     char trace_string[1024];
 40 |     char gunzip_command[1024];
 41 | 
 42 |     // instruction
 43 |     input_instr next_instr;
 44 |     input_instr current_instr;
 45 |     cloudsuite_instr current_cloudsuite_instr;
 46 |     uint64_t instr_unique_id, completed_executions, 
 47 |              begin_sim_cycle, begin_sim_instr, 
 48 |              last_sim_cycle, last_sim_instr,
 49 |              finish_sim_cycle, finish_sim_instr,
 50 |              warmup_instructions, simulation_instructions, instrs_to_read_this_cycle, instrs_to_fetch_this_cycle,
 51 |              next_print_instruction, num_retired;
 52 |     uint32_t inflight_reg_executions, inflight_mem_executions, num_searched;
 53 |     uint32_t next_ITLB_fetch;
 54 | 
 55 |     // reorder buffer, load/store queue, register file
 56 |     CORE_BUFFER IFETCH_BUFFER{"IFETCH_BUFFER", FETCH_WIDTH*2};
 57 |     CORE_BUFFER DECODE_BUFFER{"DECODE_BUFFER", DECODE_WIDTH*3};
 58 |     CORE_BUFFER ROB{"ROB", ROB_SIZE};
 59 |     LOAD_STORE_QUEUE LQ{"LQ", LQ_SIZE}, SQ{"SQ", SQ_SIZE};
 60 | 
 61 |     // store array, this structure is required to properly handle store instructions
 62 |     uint64_t STA[STA_SIZE], STA_head, STA_tail; 
 63 | 
 64 |     // Ready-To-Execute (two arrays of ROB_SIZE entries, each with its own head and tail)
 65 |     uint32_t RTE0[ROB_SIZE], RTE0_head, RTE0_tail, 
 66 |              RTE1[ROB_SIZE], RTE1_head, RTE1_tail;  
 67 | 
 68 |     // Ready-To-Load (two arrays of LQ_SIZE entries)
 69 |     uint32_t RTL0[LQ_SIZE], RTL0_head, RTL0_tail, 
 70 |              RTL1[LQ_SIZE], RTL1_head, RTL1_tail;  
 71 | 
 72 |     // Ready-To-Store (two arrays of SQ_SIZE entries)
 73 |     uint32_t RTS0[SQ_SIZE], RTS0_head, RTS0_tail,
 74 |              RTS1[SQ_SIZE], RTS1_head, RTS1_tail;
 75 | 
 76 |     // branch
 77 |     int branch_mispredict_stall_fetch; // flag that says that we should stall because a branch prediction was wrong
 78 |     int mispredicted_branch_iw_index; // index in the instruction window of the mispredicted branch.  fetch resumes after the instruction at this index executes
 79 |     uint8_t  fetch_stall;
 80 |     uint64_t fetch_resume_cycle;
 81 |     uint64_t num_branch, branch_mispredictions;
 82 |     uint64_t total_rob_occupancy_at_branch_mispredict;
 83 |   uint64_t total_branch_types[8];
 84 | 
 85 |     // TLBs and caches; each is built from its own *_SET, *_WAY, *_WQ_SIZE, *_RQ_SIZE, *_PQ_SIZE and *_MSHR_SIZE constants, with SET*WAY as the third argument
 86 |     CACHE ITLB{"ITLB", ITLB_SET, ITLB_WAY, ITLB_SET*ITLB_WAY, ITLB_WQ_SIZE, ITLB_RQ_SIZE, ITLB_PQ_SIZE, ITLB_MSHR_SIZE},
 87 |           DTLB{"DTLB", DTLB_SET, DTLB_WAY, DTLB_SET*DTLB_WAY, DTLB_WQ_SIZE, DTLB_RQ_SIZE, DTLB_PQ_SIZE, DTLB_MSHR_SIZE},
 88 |           STLB{"STLB", STLB_SET, STLB_WAY, STLB_SET*STLB_WAY, STLB_WQ_SIZE, STLB_RQ_SIZE, STLB_PQ_SIZE, STLB_MSHR_SIZE},
 89 |           L1I{"L1I", L1I_SET, L1I_WAY, L1I_SET*L1I_WAY, L1I_WQ_SIZE, L1I_RQ_SIZE, L1I_PQ_SIZE, L1I_MSHR_SIZE},
 90 |           L1D{"L1D", L1D_SET, L1D_WAY, L1D_SET*L1D_WAY, L1D_WQ_SIZE, L1D_RQ_SIZE, L1D_PQ_SIZE, L1D_MSHR_SIZE},
 91 |           L2C{"L2C", L2C_SET, L2C_WAY, L2C_SET*L2C_WAY, L2C_WQ_SIZE, L2C_RQ_SIZE, L2C_PQ_SIZE, L2C_MSHR_SIZE};
 92 | 
 93 |   // trace cache for previously decoded instructions (NOTE(review): no member is declared under this heading)
 94 |   
 95 |     // constructor: zeroes the counters and flags and fills each ready/store array with its "empty" marker
 96 |     O3_CPU() {
 97 |         cpu = 0;
 98 | 
 99 |         // trace
100 |         trace_file = NULL;
101 | 
102 |         // instruction
103 |         instr_unique_id = 0;
104 |         completed_executions = 0;
105 |         begin_sim_cycle = 0;
106 |         begin_sim_instr = 0;
107 |         last_sim_cycle = 0;
108 |         last_sim_instr = 0;
109 |         finish_sim_cycle = 0;
110 |         finish_sim_instr = 0;
111 |         warmup_instructions = 0;
112 |         simulation_instructions = 0;
113 |         instrs_to_read_this_cycle = 0;
114 |         instrs_to_fetch_this_cycle = 0;
115 | 
116 |         next_print_instruction = STAT_PRINTING_PERIOD;
117 |         num_retired = 0;
118 | 
119 |         inflight_reg_executions = 0;
120 |         inflight_mem_executions = 0;
121 |         num_searched = 0;
122 | 
123 |         next_ITLB_fetch = 0;
124 | 
125 |         // branch
126 |         branch_mispredict_stall_fetch = 0;
127 |         mispredicted_branch_iw_index = 0;
128 |         fetch_stall = 0;
129 | 	fetch_resume_cycle = 0;
130 |         num_branch = 0;
131 |         branch_mispredictions = 0; // NOTE(review): total_rob_occupancy_at_branch_mispredict is not assigned anywhere in this constructor
132 | 	for(uint32_t i=0; i<8; i++)
133 | 	  {
134 | 	    total_branch_types[i] = 0;
135 | 	  }
136 | 	
137 |         for (uint32_t i=0; i<STA_SIZE; i++)
138 | 	  STA[i] = UINT64_MAX; // every store-array slot starts at UINT64_MAX
139 |         STA_head = 0;
140 |         STA_tail = 0;
141 | 
142 |         for (uint32_t i=0; i<ROB_SIZE; i++) {
143 | 	  RTE0[i] = ROB_SIZE; // ROB_SIZE is one past the largest ROB index, so it cannot name a real entry
144 | 	  RTE1[i] = ROB_SIZE;
145 |         }
146 |         RTE0_head = 0;
147 |         RTE1_head = 0;
148 |         RTE0_tail = 0;
149 |         RTE1_tail = 0;
150 | 
151 |         for (uint32_t i=0; i<LQ_SIZE; i++) {
152 | 	  RTL0[i] = LQ_SIZE; // same out-of-range marker scheme, using LQ_SIZE
153 | 	  RTL1[i] = LQ_SIZE;
154 |         }
155 |         RTL0_head = 0;
156 |         RTL1_head = 0;
157 |         RTL0_tail = 0;
158 |         RTL1_tail = 0;
159 | 
160 |         for (uint32_t i=0; i<SQ_SIZE; i++) {
161 | 	  RTS0[i] = SQ_SIZE; // same out-of-range marker scheme, using SQ_SIZE
162 | 	  RTS1[i] = SQ_SIZE;
163 |         }
164 |         RTS0_head = 0;
165 |         RTS1_head = 0;
166 |         RTS0_tail = 0;
167 |         RTS1_tail = 0;
168 |     }
169 | 
170 |     // functions (declarations only; the definitions are not in this header)
171 |     void read_from_trace(),
172 |          fetch_instruction(),
173 |          decode_and_dispatch(),
174 |          schedule_instruction(),
175 |          execute_instruction(),
176 |          schedule_memory_instruction(),
177 |          execute_memory_instruction(),
178 |          do_scheduling(uint32_t rob_index),  
179 |          reg_dependency(uint32_t rob_index),
180 |          do_execution(uint32_t rob_index),
181 |          do_memory_scheduling(uint32_t rob_index),
182 |          operate_lsq(),
183 |          complete_execution(uint32_t rob_index),
184 |          reg_RAW_dependency(uint32_t prior, uint32_t current, uint32_t source_index),
185 |          reg_RAW_release(uint32_t rob_index),
186 |          mem_RAW_dependency(uint32_t prior, uint32_t current, uint32_t data_index, uint32_t lq_index),
187 |          handle_o3_fetch(PACKET *current_packet, uint32_t cache_type),
188 |          handle_merged_translation(PACKET *provider),
189 |          handle_merged_load(PACKET *provider),
190 |          release_load_queue(uint32_t lq_index),
191 |          complete_instr_fetch(PACKET_QUEUE *queue, uint8_t is_it_tlb),
192 |          complete_data_fetch(PACKET_QUEUE *queue, uint8_t is_it_tlb);
193 | 
194 |     void initialize_core();
195 |     void add_load_queue(uint32_t rob_index, uint32_t data_index),
196 |          add_store_queue(uint32_t rob_index, uint32_t data_index),
197 |          execute_store(uint32_t rob_index, uint32_t sq_index, uint32_t data_index);
198 |     int  execute_load(uint32_t rob_index, uint32_t sq_index, uint32_t data_index); // NOTE(review): second parameter is named sq_index on a load; confirm against the definition
199 |     void check_dependency(int prior, int current);
200 |     void operate_cache();
201 |     void update_rob();
202 |     void retire_rob();
203 | 
204 |     uint32_t  add_to_rob(ooo_model_instr *arch_instr),
205 |               check_rob(uint64_t instr_id);
206 | 
207 |     uint32_t add_to_ifetch_buffer(ooo_model_instr *arch_instr);
208 |     uint32_t add_to_decode_buffer(ooo_model_instr *arch_instr);
209 | 
210 |     uint32_t check_and_add_lsq(uint32_t rob_index);
211 | 
212 |     // branch predictor
213 |     uint8_t predict_branch(uint64_t ip);
214 |     void    initialize_branch_predictor(),
215 |             last_branch_result(uint64_t ip, uint8_t taken);
216 | 
217 |   // code prefetching
218 |   void l1i_prefetcher_initialize();
219 |   void l1i_prefetcher_branch_operate(uint64_t ip, uint8_t branch_type, uint64_t branch_target);
220 |   void l1i_prefetcher_cache_operate(uint64_t v_addr, uint8_t cache_hit, uint8_t prefetch_hit);
221 |   void l1i_prefetcher_cycle_operate();
222 |   void l1i_prefetcher_cache_fill(uint64_t v_addr, uint32_t set, uint32_t way, uint8_t prefetch, uint64_t evicted_v_addr);
223 |   void l1i_prefetcher_final_stats();
224 |   int prefetch_code_line(uint64_t pf_v_addr); 
225 | };
226 | 
227 | extern O3_CPU ooo_cpu[NUM_CPUS];
228 | 
229 | #endif
230 | 


--------------------------------------------------------------------------------
/inc/set.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * This file defines a specialized bitset data structure that uses 64 bit
  3 |  * words to store bits in a set, but does something special for small
  4 |  * sets to make it faster.
  5 |  */
  6 | 
  7 | #ifndef __SET_H
  8 | #define __SET_H
  9 | #include <stdio.h>
 10 | #include <stdlib.h>
 11 | #include <assert.h>
 12 | #include <string.h>
 13 | 
 14 | #define TYPE	unsigned short int // element type stored in a fastset
 15 | //#define MAX_SIZE	ROB_SIZE
 16 | // sethpugsley - changed this from ROB_SIZE to allow for non-power-of-2 ROB sizes, like real CPUs have
 17 | // but MAX_SIZE here still requires a power-of-2 number
 18 | #define MAX_SIZE	512 // number of bit positions in the bitset form (bits[MAX_SIZE/64])
 19 | 
 20 | // tuned empirically
 21 | 
 22 | #define SMALL_SIZE	13 // a set with fewer members than this is kept as a sorted value array
 23 | #define SMALLER_SIZE	6 // below this cardinality search_small uses a linear scan instead of binary search
 24 | 
 25 | class fastset { // Set of TYPE values: a sorted array while card < SMALL_SIZE, a MAX_SIZE-bit bitmap once card reaches SMALL_SIZE
 26 | 	union {
 27 | 		// values for a small set
 28 | 		TYPE 
 29 | 			values[SMALL_SIZE];
 30 | 
 31 | 		// the bits representing the set
 32 | 		unsigned long long int 
 33 | 			bits[MAX_SIZE/64];
 34 | 	} data;
 35 | 
 36 | 	int
 37 | 		card;		// cardinality of small set; stays at SMALL_SIZE once the set is a bitmap
 38 | 
 39 | 	// set a bit in the bits
 40 | 
 41 | 	void setbit (TYPE x) {
 42 | 		int word = x >> 6;
 43 | 		int bit = x & 63;
 44 | 		data.bits[word] |= 1ull << bit;
 45 | 	}
 46 | 
 47 | 	// get one of the bits
 48 | 
 49 | 	bool getbit (TYPE x) {
 50 | 		int word = x >> 6;
 51 | 		int bit = x & 63;
 52 | 		return (data.bits[word] >> bit) & 1;
 53 | 	}
 54 | 
 55 | 	// insert an item into a small set, keeping values[] sorted ascending and free of duplicates
 56 | 
 57 | 	void insert_small (TYPE x) {
 58 | 		int i;
 59 | 		for (i=0; i<card; i++) {
 60 | 			TYPE y = data.values[i];
 61 | 			if (y == x) return;
 62 | 			if (y > x) break;
 63 | 		}
 64 | 		// x belongs in i; move everything from v[i] through v[n-1]
 65 | 		// to v[i+1] through v[n]
 66 | 		for (int j=card-1; j>=i; j--) data.values[j+1] = data.values[j];
 67 | 		// the loop seems a little faster than memmove
 68 | 		//memmove (&data.values[i+1], &data.values[i], (sizeof (TYPE) * (card-i)));
 69 | 		data.values[i] = x;
 70 | 		card++;
 71 | 	}
 72 | 
 73 | 
 74 | 	// do a linear search in a small set; stops early because values[] is sorted
 75 | 
 76 | 	bool search_small_linear (TYPE x) {
 77 | 		for (int i=0; i<card; i++) {
 78 | 			TYPE y = data.values[i];
 79 | 			if (y > x) return false;
 80 | 			if (y == x) return true;
 81 | 		}
 82 | 		return false;
 83 | 	}
 84 | 
 85 | 
 86 | 	// search a small set, specializing for the set size
 87 | 
 88 | 	bool search_small (TYPE x) {
 89 | 
 90 | 		// no elements? we're done.
 91 | 
 92 | 		if (!card) return false;
 93 | 
 94 | 		// below a certain size linear search is faster
 95 | 
 96 | 		if (card < SMALLER_SIZE) return search_small_linear (x);
 97 | 
 98 | 		// do a binary search for the item
 99 | 
100 | 		int begin = 0;
101 | 		int end = card-1;
102 | 		int middle = end/2;
103 | 		for (;;) {
104 | 			TYPE y = data.values[middle];
105 | 			if (x < y) {
106 | 				end = middle-1;
107 | 			} else if (x > y) {
108 | 				begin = middle+1;
109 | 			} else return true;
110 | 			if (end < begin) break;
111 | 			middle = (begin + end) / 2;
112 | 			// assert (middle < card && middle >= 0);
113 | 		}
114 | 		return false;
115 | 	}
116 | 
117 | 	// convert a small set into a bitset
118 | 
119 | 	void smalltobit (void) {
120 | 
121 | 		// we have to use a temporary array to hold the small set contents
122 | 		// because the small set and bitset occupy the same memory 
123 | 	
124 | 		TYPE tmp[SMALL_SIZE];
125 | 		memcpy (tmp, data.values, sizeof (TYPE) * card);
126 | 		memset (data.bits, 0, sizeof (data.bits));
127 | 		for (int i=0; i<card; i++) setbit (tmp[i]);
128 | 	}
129 | 
130 | public:
131 | 
132 | 	// constructor: starts as an empty small set (the union contents are left uninitialized)
133 | 
134 | 	fastset (void) { card = 0; }
135 | 
136 | 	// destructor
137 | 
138 | 	~fastset (void) { }
139 | 
140 | 	// insert a value into the set
141 | 
142 | 	void insert (TYPE x) {
143 | 		//assert (x < MAX_SIZE);  (NOTE(review): with this check disabled, x >= MAX_SIZE would index past bits[] in bitmap form)
144 | 
145 | 		// if the set is empty...
146 | 		if (!card) {
147 | 			// now it has a single value
148 | 
149 | 			data.values[card++] = x;
150 | 
151 | 			// and we're done
152 | 
153 | 			return;
154 | 		} 
155 | 
156 | 		// if the set is small
157 | 
158 | 		if (card < SMALL_SIZE) {
159 | 			insert_small (x);
160 | 			if (card == SMALL_SIZE) smalltobit (); // the array just filled up: switch to the bitmap form
161 | 		} else
162 | 
163 | 		// set the value (bitmap form; card is not changed)
164 | 		setbit (x);
165 | 	}
166 | 
167 | 	// search the set for a value
168 | 
169 | 	bool search (TYPE x) {
170 | 		//assert (x < MAX_SIZE);
171 | 
172 | 		// empty?
173 | 		if (!card) return false;
174 | 
175 | 		// singleton?
176 | 		if (card == 1) return data.values[0] == x;
177 | 
178 | 		// small?
179 | 		if (card < SMALL_SIZE) return search_small (x);
180 | 
181 | 		// none of those; extract the bit
182 | 
183 | 		return getbit (x);
184 | 	}
185 | 
186 | 	// this set becomes the union of itself and the other set
187 | 	// (call it "join" because "union" is a C++ keyword)
188 | 
189 | 	void join (fastset & other, int n) {
190 | 
191 | 		// special rules for special sets
192 | 
193 | 		if (!other.card) return;
194 | 
195 | 		if (other.card < SMALL_SIZE) {
196 | 			// not too many values in other; just insert them one by one
197 | 
198 | 			for (int i=0; i<other.card; i++) insert (other.data.values[i]);
199 | 			return;
200 | 		} else if (card < SMALL_SIZE) {
201 | 			// here, we know that other is not small, so we
202 | 			// know we're going to end up with this as a bit
203 | 			// set, so just make it a bit set now and fall 
204 | 			// through to the bitwise ORing
205 | 			smalltobit ();
206 | 			card = SMALL_SIZE; // fake
207 | 			assert (other.card >= SMALL_SIZE);
208 | 		}
209 | 
210 | 		// lim is the number of 64-bit words needed to cover bit indices 0..n
211 | 
212 | 		int lim = ((n | 63) + 1) / 64;
213 | 
214 | 		// bitwise OR the other bits into this set
215 | 		for (int i=0; i<lim; i++) data.bits[i] |= other.data.bits[i];
216 | 	}
217 | 
218 | 	// expand the entire set into the array v, returning the cardinality
219 | 
220 | 	int expand (TYPE v[], int n) {
221 | 		if (!card) return 0;
222 | 
223 | 		// a small set can just be copied (all card values, regardless of n)
224 | 
225 | 		if (card < SMALL_SIZE) {
226 | 			for (int i=0; i<card; i++) v[i] = data.values[i];
227 | 			return card;
228 | 		}
229 | 
230 | 		// go through the bit array looking for elements (only bit indices below n are examined)
231 | 
232 | 		int k = 0;
233 | 		TYPE i;
234 | 		for (i=0; i<n; i+=64) {
235 | 
236 | 			// if this 64 bit subset is not empty, copy it into v
237 | 
238 | 			if (data.bits[i/64]) {
239 | 				for (TYPE j=0; j<64; j++) {
240 | 					TYPE l = i + j;
241 | 					if (l < n) {
242 | 						if (getbit (l)) v[k++] = l;
243 | 					} else break;
244 | 				}
245 | 			}
246 | 		}
247 | 		return k;
248 | 	}
249 | };
250 | 
251 | // this little macro iterates over either the whole set or just the single member: it declares expand_<i>[n+1] and card_<i>, fills them with (a).expand(), then opens a for loop in which i takes each member in turn (the loop's step reads one slot past the last member, hence the n+1)
252 | 
253 | #define ITERATE_SET(i,a,n) \
254 | 	TYPE expand_##i[n+1]; \
255 | 	int card_##i = (a).expand (expand_##i, n); \
256 | 	for (int count_##i=0, i=expand_##i[0]; count_##i<card_##i; i=expand_##i[++count_##i])
257 | 
258 | #endif
259 | 


--------------------------------------------------------------------------------
/inc/spp_dev.h:
--------------------------------------------------------------------------------
  1 | #ifndef SPP_H
  2 | #define SPP_H
  3 | 
  4 | // SPP functional knobs (feature switches; the code that tests them is outside this header)
  5 | #define LOOKAHEAD_ON
  6 | #define FILTER_ON
  7 | #define GHR_ON
  8 | #define SPP_SANITY_CHECK
  9 | 
 10 | //#define SPP_DEBUG_PRINT
 11 | #ifdef SPP_DEBUG_PRINT
 12 | #define SPP_DP(x) x // debug printing enabled: the wrapped statement is compiled in
 13 | #else
 14 | #define SPP_DP(x) // debug printing disabled: the wrapped statement is dropped
 15 | #endif
 16 | 
 17 | // Signature table parameters
 18 | #define ST_SET 1
 19 | #define ST_WAY 256
 20 | #define ST_TAG_BIT 16
 21 | #define ST_TAG_MASK ((1 << ST_TAG_BIT) - 1) // low ST_TAG_BIT bits set
 22 | #define SIG_SHIFT 3
 23 | #define SIG_BIT 12
 24 | #define SIG_MASK ((1 << SIG_BIT) - 1) // low SIG_BIT bits set
 25 | #define SIG_DELTA_BIT 7
 26 | 
 27 | // Pattern table parameters
 28 | #define PT_SET 512
 29 | #define PT_WAY 4
 30 | #define C_SIG_BIT 4
 31 | #define C_DELTA_BIT 4
 32 | #define C_SIG_MAX ((1 << C_SIG_BIT) - 1) // largest value representable in C_SIG_BIT bits
 33 | #define C_DELTA_MAX ((1 << C_DELTA_BIT) - 1) // largest value representable in C_DELTA_BIT bits
 34 | 
 35 | // Prefetch filter parameters
 36 | #define QUOTIENT_BIT  10
 37 | #define REMAINDER_BIT 6
 38 | #define HASH_BIT (QUOTIENT_BIT + REMAINDER_BIT + 1)
 39 | #define FILTER_SET (1 << QUOTIENT_BIT) // one filter entry per quotient value
 40 | #define FILL_THRESHOLD 90
 41 | #define PF_THRESHOLD 25
 42 | 
 43 | // Global register parameters
 44 | #define GLOBAL_COUNTER_BIT 10
 45 | #define GLOBAL_COUNTER_MAX ((1 << GLOBAL_COUNTER_BIT) - 1) 
 46 | #define MAX_GHR_ENTRY 8
 47 | 
 48 | enum FILTER_REQUEST {SPP_L2C_PREFETCH, SPP_LLC_PREFETCH, L2C_DEMAND, L2C_EVICT}; // Request type for prefetch filter
 49 | uint64_t get_hash(uint64_t key); // declared here, defined elsewhere
 50 | 
 51 | class SIGNATURE_TABLE { // Per-entry state held in parallel [ST_SET][ST_WAY] arrays: valid bit, tag, last offset, signature, LRU position
 52 |   public:
 53 |     bool     valid[ST_SET][ST_WAY];
 54 |     uint32_t tag[ST_SET][ST_WAY],
 55 |              last_offset[ST_SET][ST_WAY],
 56 |              sig[ST_SET][ST_WAY],
 57 |              lru[ST_SET][ST_WAY];
 58 | 
 59 |     SIGNATURE_TABLE() { // prints the table configuration to cout, then clears every entry
 60 |         cout << "Initialize SIGNATURE TABLE" << endl;
 61 |         cout << "ST_SET: " << ST_SET << endl;
 62 |         cout << "ST_WAY: " << ST_WAY << endl;
 63 |         cout << "ST_TAG_BIT: " << ST_TAG_BIT << endl;
 64 |         cout << "ST_TAG_MASK: " << hex << ST_TAG_MASK << dec << endl;
 65 | 
 66 |         for (uint32_t set = 0; set < ST_SET; set++)
 67 |             for (uint32_t way = 0; way < ST_WAY; way++) {
 68 |                 valid[set][way] = 0;
 69 |                 tag[set][way] = 0;
 70 |                 last_offset[set][way] = 0;
 71 |                 sig[set][way] = 0;
 72 |                 lru[set][way] = way; // each way starts with a distinct LRU position equal to its index
 73 |             }
 74 |     };
 75 | 
 76 |     void read_and_update_sig(uint64_t page, uint32_t page_offset, uint32_t &last_sig, uint32_t &curr_sig, int32_t &delta); // defined elsewhere; last_sig, curr_sig and delta are reference parameters
 77 | };
 78 | 
 79 | class PATTERN_TABLE {
 80 |   public:
 81 |     int      delta[PT_SET][PT_WAY];
 82 |     uint32_t c_delta[PT_SET][PT_WAY],
 83 |              c_sig[PT_SET];
 84 | 
 85 |     PATTERN_TABLE() {
 86 |         cout << endl << "Initialize PATTERN TABLE" << endl;
 87 |         cout << "PT_SET: " << PT_SET << endl;
 88 |         cout << "PT_WAY: " << PT_WAY << endl;
 89 |         cout << "SIG_DELTA_BIT: " << SIG_DELTA_BIT << endl;
 90 |         cout << "C_SIG_BIT: " << C_SIG_BIT << endl;
 91 |         cout << "C_DELTA_BIT: " << C_DELTA_BIT << endl;
 92 | 
 93 |         for (uint32_t set = 0; set < PT_SET; set++) {
 94 |             for (uint32_t way = 0; way < PT_WAY; way++) {
 95 |                 delta[set][way] = 0;
 96 |                 c_delta[set][way] = 0;
 97 |             }
 98 |             c_sig[set] = 0;
 99 |         }
100 |     }
101 | 
102 |     void update_pattern(uint32_t last_sig, int curr_delta),
103 |          read_pattern(uint32_t curr_sig, int *prefetch_delta, uint32_t *confidence_q, uint32_t &lookahead_way, uint32_t &lookahead_conf, uint32_t &pf_q_tail, uint32_t &depth);
104 | };
105 | 
106 | class PREFETCH_FILTER {
107 |   public:
108 |     uint64_t remainder_tag[FILTER_SET];
109 |     bool     valid[FILTER_SET],  // Consider this as "prefetched"
110 |              useful[FILTER_SET]; // Consider this as "used"
111 | 
112 |     PREFETCH_FILTER() {
113 |         cout << endl << "Initialize PREFETCH FILTER" << endl;
114 |         cout << "FILTER_SET: " << FILTER_SET << endl;
115 | 
116 |         for (uint32_t set = 0; set < FILTER_SET; set++) {
117 |             remainder_tag[set] = 0;
118 |             valid[set] = 0;
119 |             useful[set] = 0;
120 |         }
121 | 
122 |     }
123 | 
124 |     bool     check(uint64_t pf_addr, FILTER_REQUEST filter_request);
125 | };
126 | 
127 | class GLOBAL_REGISTER {
128 |   public:
129 |     // Global counters to calculate global prefetching accuracy
130 |     uint64_t pf_useful,
131 |              pf_issued,
132 |              global_accuracy; // Alpha value in Section III. Equation 3
133 | 
134 |     // Global History Register (GHR) entries
135 |     uint8_t  valid[MAX_GHR_ENTRY];
136 |     uint32_t sig[MAX_GHR_ENTRY],
137 |              confidence[MAX_GHR_ENTRY],
138 |              offset[MAX_GHR_ENTRY];
139 |     int      delta[MAX_GHR_ENTRY];
140 | 
141 |     GLOBAL_REGISTER() {
142 |         pf_useful = 0;
143 |         pf_issued = 0;
144 |         global_accuracy = 0;
145 | 
146 |         for (uint32_t i = 0; i < MAX_GHR_ENTRY; i++) {
147 |             valid[i] = 0;
148 |             sig[i] = 0;
149 |             confidence[i] = 0;
150 |             offset[i] = 0;
151 |             delta[i] = 0;
152 |         }
153 |     }
154 | 
155 |     void update_entry(uint32_t pf_sig, uint32_t pf_confidence, uint32_t pf_offset, int pf_delta);
156 |     uint32_t check_entry(uint32_t page_offset);
157 | };
158 | 
159 | #endif
160 | 


--------------------------------------------------------------------------------
/inc/uncore.h:
--------------------------------------------------------------------------------
 1 | #ifndef UNCORE_H
 2 | #define UNCORE_H
 3 | 
 4 | #include "champsim.h"
 5 | #include "cache.h"
 6 | #include "dram_controller.h"
 7 | //#include "drc_controller.h"
 8 | 
 9 | //#define DRC_MSHR_SIZE 48
10 | 
11 | // uncore
// Groups one CACHE object named "LLC" and one MEMORY_CONTROLLER object named
// "DRAM". A single global instance, `uncore`, is declared extern right after
// this class.
class UNCORE {
  public:

    // LLC
    // NOTE(review): argument meanings (sets, ways, total lines, queue and
    // MSHR sizes) are presumed from the macro names — confirm against the
    // CACHE constructor in cache.h.
    CACHE LLC{"LLC", LLC_SET, LLC_WAY, LLC_SET*LLC_WAY, LLC_WQ_SIZE, LLC_RQ_SIZE, LLC_PQ_SIZE, LLC_MSHR_SIZE};

    // DRAM
    MEMORY_CONTROLLER DRAM{"DRAM"}; 

    // Constructor is only declared here; it is defined outside this header.
    UNCORE(); 
};
23 | 
24 | extern UNCORE uncore;
25 | 
26 | #endif
27 | 


--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
  1 | from abc import ABC, abstractmethod
  2 | 
  3 | class MLPrefetchModel(object):
  4 |     '''
  5 |     Abstract base class for your models. For HW-based approaches such as the
  6 |     NextLineModel below, you can directly add your prediction code. For ML
  7 |     models, you may want to use it as a wrapper, but alternative approaches
  8 |     are fine so long as the behavior described below is respected.
  9 |     '''
 10 | 
 11 |     @abstractmethod
 12 |     def load(self, path):
 13 |         '''
 14 |         Loads your model from the filepath path
 15 |         '''
 16 |         pass
 17 | 
 18 |     @abstractmethod
 19 |     def save(self, path):
 20 |         '''
 21 |         Saves your model to the filepath path
 22 |         '''
 23 |         pass
 24 | 
 25 |     @abstractmethod
 26 |     def train(self, data):
 27 |         '''
 28 |         Train your model here. No return value. The data parameter is in the
 29 |         same format as the load traces. Namely,
 30 |         Unique Instr Id, Cycle Count, Load Address, Instruction Pointer of the Load, LLC hit/miss
 31 |         '''
 32 |         pass
 33 | 
 34 |     @abstractmethod
 35 |     def generate(self, data):
 36 |         '''
 37 |         Generate your prefetches here. Remember to limit yourself to 2 prefetches
 38 |         for each instruction ID and to not look into the future :).
 39 | 
 40 |         The return format for this will be a list of tuples containing the
 41 |         unique instruction ID and the prefetch. For example,
 42 |         [
 43 |             (A, A1),
 44 |             (A, A2),
 45 |             (C, C1),
 46 |             ...
 47 |         ]
 48 | 
 49 |         where A, B, and C are the unique instruction IDs and A1, A2 and C1 are
 50 |         the prefetch addresses.
 51 |         '''
 52 |         pass
 53 | 
 54 | class NextLineModel(MLPrefetchModel):
 55 | 
 56 |     def load(self, path):
 57 |         # Load your pytorch / tensorflow model from the given filepath
 58 |         print('Loading ' + path + ' for NextLineModel')
 59 | 
 60 |     def save(self, path):
 61 |         # Save your model to a file
 62 |         print('Saving ' + path + ' for NextLineModel')
 63 | 
 64 |     def train(self, data):
 65 |         '''
 66 |         Train your model here using the data
 67 | 
 68 |         The data is the same format given in the load traces. Namely:
 69 |         Unique Instr Id, Cycle Count, Load Address, Instruction Pointer of the Load, LLC hit/miss
 70 |         '''
 71 |         print('Training NextLineModel')
 72 | 
 73 |     def generate(self, data):
 74 |         '''
 75 |         Generate the prefetches for the prefetch file for ChampSim here
 76 | 
 77 |         As a reminder, no looking ahead in the data and no more than 2
 78 |         prefetches per unique instruction ID
 79 | 
 80 |         The return format for this function is a list of (instr_id, pf_addr)
 81 |         tuples as shown below
 82 |         '''
 83 |         print('Generating for NextLineModel')
 84 |         prefetches = []
 85 |         for (instr_id, cycle_count, load_addr, load_ip, llc_hit) in data:
 86 |             # Prefetch the next two blocks
 87 |             prefetches.append((instr_id, ((load_addr >> 6) + 1) << 6))
 88 |             prefetches.append((instr_id, ((load_addr >> 6) + 2) << 6))
 89 | 
 90 |         return prefetches
 91 | 
 92 | '''
 93 | # Example PyTorch Model
 94 | import torch
 95 | import torch.nn as nn
 96 | 
 97 | class PytorchMLModel(nn.Module):
 98 | 
 99 |     def __init__(self):
100 |         super().__init__()
101 |         # Initialize your neural network here
102 |         # For example
103 |         self.embedding = nn.Embedding(...)
104 |         self.fc = nn.Linear(...)
105 | 
106 |     def forward(self, x):
107 |         # Forward pass for your model here
108 |         # For example
109 |         return self.relu(self.fc(self.embedding(x)))
110 | 
111 | class TerribleMLModel(MLPrefetchModel):
112 |     """
113 |     This class effectively functions as a wrapper around the above custom
 114 |     pytorch nn.Module. You can approach this in another way so long as the
115 |     load/save/train/generate functions behave as described above.
116 | 
117 |     Disclaimer: It's terrible since the below criterion assumes a gold Y label
118 |     for the prefetches, which we don't really have. In any case, the below
119 |     structure more or less shows how one would use a ML framework with this
120 |     script. Happy coding / researching! :)
121 |     """
122 | 
123 |     def __init__(self):
124 |         self.model = PytorchMLModel()
125 |     
126 |     def load(self, path):
 127 |         self.model.load_state_dict(torch.load(path))
128 | 
129 |     def save(self, path):
130 |         torch.save(self.model.state_dict(), path)
131 | 
132 |     def train(self, data):
133 |         # Just standard run-time here
134 |         self.model.train()
135 |         criterion = nn.CrossEntropyLoss()
 136 |         optimizer = torch.optim.Adam(self.model.parameters())
 137 |         scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=0.1)
138 |         for epoch in range(20):
139 |             # Assuming batch(...) is a generator over the data
140 |             for i, (x, y) in enumerate(batch(data)):
141 |                 y_pred = self.model(x)
142 |                 loss = criterion(y_pred, y)
143 | 
144 |                 if i % 100 == 0:
145 |                     print('Loss:', loss.item())
146 | 
147 |                 optimizer.zero_grad()
148 |                 loss.backward()
149 |                 optimizer.step()
150 |             scheduler.step()
151 | 
152 |     def generate(self, data):
153 |         self.model.eval()
154 |         prefetches = []
155 |         for i, (x, _) in enumerate(batch(data, random=False)):
156 |             y_pred = self.model(x)
157 |             
158 |             for xi, yi in zip(x, y_pred):
159 |                 # Where instr_id is a function that extracts the unique instr_id
160 |                 prefetches.append((instr_id(xi), yi))
161 | 
162 |         return prefetches
163 | '''
164 | 
# Replace this if you create your own model
# Module-level alias for the model class to use.
# NOTE(review): presumably imported by ml_prefetch_sim.py — verify.
Model = NextLineModel
167 | 


--------------------------------------------------------------------------------
/prefetcher/bo.h:
--------------------------------------------------------------------------------
  1 | #ifndef __BO_H
  2 | #define __BO_H
  3 | 
  4 | //######################################################################################
  5 | //                             BEST OFFSET PREFETCHER
  6 | //######################################################################################
  7 | 
  8 | #include <stdio.h>
  9 | #include <stdlib.h>
 10 | #include "cache.h"
 11 | #include <map>
 12 | 
 13 | using namespace std;
 14 | 
 15 | // Submission ID: 3
 16 | 
 17 | // Paper title: A Best-Offset Prefetcher
 18 | 
 19 | // Author: Pierre Michaud
 20 | 
 21 | // (Modified to be a LLC prefetcher by Akanksha Jain)
 22 | // Prefetch Throttling is disabled since MSH info is not available
 23 | //######################################################################################
 24 | //                             PREFETCHER PARAMETERS 
 25 | //######################################################################################
 26 | 
 27 | // Because prefetch cannot cross 4KB-page boundaries, there is no need to consider offsets
 28 | // greater than 63. However, with pages larger than 4KB, it would be beneficial to consider
 29 | // larger offsets.
 30 | 
// Number of candidate offsets; size of OFFSET[] and of os.score[].
#define NOFFSETS 46
// Candidate offsets, in cache lines (they are subtracted from a line address
// during learning). Positive and negative values alternate.
int OFFSET[NOFFSETS] = {1,-1,2,-2,3,-3,4,-4,5,-5,6,-6,7,-7,8,-8,9,-9,10,-10,11,-11,12,-12,13,-13,14,-14,15,-15,16,-16,18,-18,20,-20,24,-24,30,-30,32,-32,36,-36,40,-40};
// Offset used at initialization and whenever learning ends with best_offset == 0.
#define DEFAULT_OFFSET 1
// A learning phase ends as soon as one offset reaches SCORE_MAX ...
#define SCORE_MAX 31
// ... or after ROUND_MAX complete passes over OFFSET[].
#define ROUND_MAX 100
// log2 of the number of entries in each Recent Requests (RR) bank.
#define RRINDEX 6
// Number of tag bits stored per RR entry.
#define RRTAG 12
// Capacity of the delay queue.
#define DELAYQSIZE 15
// Delay, in cycles, before a delay-queue entry is considered ready.
#define DELAY 60
// Width of the truncated cycle counter stored in the delay queue.
#define TIME_BITS 12
//#define LLC_RATE_MAX 255
//#define GAUGE_MAX 8191
//#define MSHR_THRESHOLD_MAX (LLC_MSHR_SIZE-4)
//#define MSHR_THRESHOLD_MIN 2
// Prefetch candidates are only produced while pt.prefetch_score > LOW_SCORE.
#define LOW_SCORE 20
//#define BAD_SCORE ((knob_small_llc)? 10 : 1)
// A learning phase ending with max_score <= BAD_SCORE turns prefetching off.
#define BAD_SCORE 10
//#define BANDWIDTH ((knob_low_bandwidth)? 64 : 16)
 50 | //                               PREFETCHER STATE
 51 | //######################################################################################
 52 | 
// NOTE(review): the bit counts quoted in the comments below come from the
// original L2 hardware budget. Every field here is a plain int, and
// prefetch_bit is sized by LLC_SET x LLC_WAY, so the totals are informational.

// Currently selected offset in cache lines; 0 means prefetching is turned off.
int prefetch_offset;   // 7 bits (6-bit value + 1 sign bit)

// Recent Requests (RR) table: 2 banks, 64 entries per bank, RRTAG bits per entry
int recent_request[2][1<<RRINDEX]; // 2x64x12 = 1536 bits

// 1 prefetch flag per LLC block (the "256x8 = 2048 bits" budget refers to the original L2 design)
int prefetch_bit[LLC_SET][LLC_WAY]; 


// Learning state for the best-offset search.
struct offsets_scores {
  int score[NOFFSETS];    // log2 SCORE_MAX = 5 bits per entry
  int max_score;          // log2 SCORE_MAX = 5 bits
  int best_offset;        // 7 bits (6-bit value + 1 sign bit)
  int round;              // log2 ROUND_MAX = 7 bits
  int p;                  // log2 NOFFSETS = 6 bits; index of the offset tested next
} os;                     // 46x5+5+7+7+6 = 255 bits


// Circular delay queue; entries leave through head and enter at tail.
struct delay_queue {
  int lineaddr[DELAYQSIZE]; // RRINDEX+RRTAG = 18 bits
  int cycle[DELAYQSIZE];    // TIME_BITS = 12 bits
  int valid[DELAYQSIZE];    // 1 bit 
  int tail;                 // log2 DELAYQSIZE = 4 bits
  int head;                 // log2 DELAYQSIZE = 4 bits
} dq;                       // 15x(18+12+1)+4+4 = 473 bits


// Throttling state. In this file only prefetch_score is read or written by
// live code; the other fields are referenced solely by the commented-out
// throttling functions.
struct prefetch_throttle {
  int mshr_threshold;     // log2 L2C_MSHR_SIZE = 4 bits
  int prefetch_score;     // log2 SCORE_MAX = 5 bits
  int llc_rate;           // log2 LLC_RATE_MAX = 8 bits
  int llc_rate_gauge;     // log2 GAUGE_MAX = 13 bits
  int last_cycle;         // TIME_BITS = 12 bits
} pt;                     // 4+5+8+13+12 = 42 bits

// Total prefetcher state: 7 + 1536 + 2048 + 255 + 473 + 42 = 4361 bits 
 89 | 
 90 | 
 91 | 
 92 | //######################################################################################
 93 | //                            SOME MACROS & DEFINITIONS
 94 | //######################################################################################
 95 | 
 96 | #define LOGLINE 6
 97 | 
 98 | #define SAMEPAGE(lineaddr1,lineaddr2) ((((lineaddr1) ^ (lineaddr2)) >> 6) == 0)
 99 | 
100 | #define INCREMENT(x,n) {x++; if (x==(n)) x=0;}
101 | 
102 | #define TRUNCATE(x,nbits) (((x) & ((1<<(nbits))-1)))
103 | 
104 | typedef long long t_addr;
105 | 
106 | 
107 | 
108 | //######################################################################################
109 | //                            RECENT REQUESTS TABLE (RR)
110 | //######################################################################################
111 | 
112 | void rr_init()
113 | {
114 |   int i;
115 |   for (i=0; i<(1<<RRINDEX); i++) {
116 |     recent_request[0][i] = 0;
117 |     recent_request[1][i] = 0;
118 |   }
119 | }
120 | 
121 | 
122 | int rr_tag(t_addr lineaddr)
123 | {
124 |   return TRUNCATE(lineaddr>>RRINDEX,RRTAG);
125 | }
126 | 
127 | 
128 | int rr_index_left(t_addr lineaddr)
129 | {
130 |   return TRUNCATE(lineaddr^(lineaddr>>RRINDEX),RRINDEX);
131 | }
132 | 
133 | 
134 | int rr_index_right(t_addr lineaddr)
135 | {
136 |   return TRUNCATE(lineaddr^(lineaddr>>(2*RRINDEX)),RRINDEX);
137 | }
138 | 
139 | 
140 | void rr_insert_left(t_addr lineaddr)
141 | {
142 |   int i = rr_index_left(lineaddr);
143 |   recent_request[0][i] = rr_tag(lineaddr);
144 | }
145 | 
146 | 
147 | void rr_insert_right(t_addr lineaddr)
148 | {
149 |   int i = rr_index_right(lineaddr);
150 |   recent_request[1][i] = rr_tag(lineaddr);
151 | }
152 | 
153 | 
154 | int rr_hit(t_addr lineaddr)
155 | {
156 |   int i = rr_index_left(lineaddr);
157 |   int j = rr_index_right(lineaddr);
158 |   int tag = rr_tag(lineaddr);
159 |   return (recent_request[0][i] == tag) || (recent_request[1][j] == tag);
160 | }
161 | 
162 | 
163 | 
164 | //######################################################################################
165 | //                               DELAY QUEUE (DQ)
166 | //######################################################################################
167 | 
168 | // Without the delay queue, the prefetcher would always try to select an offset value
169 | // large enough for having timely prefetches. However, sometimes, a small offset yields
170 | // late prefetches but greater prefetch accuracy and better performance. The delay queue
171 | // is an imperfect solution to this problem.
172 | 
173 | // This implementation of the delay queue is specific to the DPC2 simulator, as the DPC2
174 | // prefetcher can act only at certain clock cycles. In a real processor, the delay queue
175 | // implementation can be simpler.
176 | 
177 | 
178 | void dq_init()
179 | {
180 |   int i;
181 |   for (i=0; i<DELAYQSIZE; i++) {
182 |     dq.lineaddr[i] = 0;
183 |     dq.cycle[i] = 0;
184 |     dq.valid[i] = 0;
185 |   }
186 |   dq.tail = 0;
187 |   dq.head = 0;
188 | }
189 | 
190 | 
191 | void dq_push(t_addr lineaddr)
192 | {
193 |   // enqueue one line address
194 |   if (dq.valid[dq.tail]) {
195 |     // delay queue is full
196 |     // dequeue the oldest entry and write the "left" bank of the RR table
197 |     rr_insert_left(dq.lineaddr[dq.head]);
198 |     INCREMENT(dq.head,DELAYQSIZE);
199 |   }
200 |   dq.lineaddr[dq.tail] = TRUNCATE(lineaddr,RRINDEX+RRTAG);
201 |   dq.cycle[dq.tail] = TRUNCATE(current_core_cycle[0],TIME_BITS);
202 |   dq.valid[dq.tail] = 1;
203 |   INCREMENT(dq.tail,DELAYQSIZE);
204 | }
205 | 
206 | 
207 | int dq_ready()
208 | {
209 |   // tells whether or not the oldest entry is ready to be dequeued
210 |   if (! dq.valid[dq.head]) {
211 |     // delay queue is empty
212 |     return 0;
213 |   }
214 |     // TODO: Change to per-core cycle
215 |   int cycle = TRUNCATE(current_core_cycle[0],TIME_BITS);
216 |   int issuecycle = dq.cycle[dq.head];
217 |   int readycycle = TRUNCATE(issuecycle+DELAY,TIME_BITS);
218 |   if (readycycle >= issuecycle) {
219 |     return (cycle < issuecycle) || (cycle >= readycycle);
220 |   } else {
221 |     return (cycle < issuecycle) && (cycle >= readycycle);
222 |   }
223 | }
224 | 
225 | 
226 | void dq_pop()
227 | {
228 |   // dequeue the entries that are ready to be dequeued,
229 |   // and do a write in the "left" bank of the RR table for each of them
230 |   int i;
231 |   for (i=0; i<DELAYQSIZE; i++) {
232 |     if (! dq_ready()) {
233 |       break;
234 |     }
235 |     rr_insert_left(dq.lineaddr[dq.head]);
236 |     dq.valid[dq.head] = 0;
237 |     INCREMENT(dq.head,DELAYQSIZE);
238 |   }
239 | }
240 | 
241 | 
242 | 
243 | //######################################################################################
244 | //                               PREFETCH THROTTLE (PT)
245 | //######################################################################################
246 | 
247 | // The following prefetch throttling method is specific to the DPC2 simulator, as other
248 | // parts of the microarchitecture (requests schedulers, cache replacement policy,
249 | // LLC hit/miss information,...) can be neither modified nor observed. Consequently,
250 | // we ignore hardware implementation considerations here.
251 | 
252 | /*
253 | void pt_init()
254 | {
255 |   pt.mshr_threshold = MSHR_THRESHOLD_MAX;
256 |   pt.prefetch_score = SCORE_MAX;
257 |   pt.llc_rate = 0;
258 |   pt.llc_rate_gauge = GAUGE_MAX/2;
259 |   pt.last_cycle = 0;
260 | }
261 | 
262 | 
263 | // The pt_update_mshr_threshold function is for adjusting the MSHR threshold
264 | // (a prefetch request is dropped when the MSHR occupancy exceeds the threshold)
265 | 
266 | void pt_update_mshr_threshold()
267 | {
268 |   if ((pt.prefetch_score > LOW_SCORE) || (pt.llc_rate > (2*BANDWIDTH))) {
269 |     // prefetch accuracy not too bad, or low bandwidth requirement
270 |     // ==> maximum prefetch aggressiveness
271 |     pt.mshr_threshold = MSHR_THRESHOLD_MAX;
272 |   } else if (pt.llc_rate < BANDWIDTH) {
273 |     // LLC access rate exceeds memory bandwidth, implying that there are some LLC hits.
274 |     // If there are more LLC misses than hits, perhaps memory bandwidth saturates.
275 |     // If there are more LLC hits than misses, the MSHR is probably not stressed.
276 |     // So we set the MSHR threshold low.
277 |     pt.mshr_threshold = MSHR_THRESHOLD_MIN;
278 |   } else {
279 |     // in-between situation: we set the MSHR threshold proportionally to the (inverse) LLC rate
280 |     pt.mshr_threshold = MSHR_THRESHOLD_MIN + (MSHR_THRESHOLD_MAX-MSHR_THRESHOLD_MIN) * (double) (pt.llc_rate - BANDWIDTH) / BANDWIDTH;
281 |   }
282 | }
283 | 
284 | 
285 | // The pt_llc_access function estimates the average time between consecutive LLC accesses.
286 | // It is called on every LLC access.
287 | 
288 | void pt_llc_access()
289 | {
290 |   // update the gauge
291 |   int cycle = TRUNCATE(current_core_cycle[0],TIME_BITS);
292 |   int dt = TRUNCATE(cycle - pt.last_cycle,TIME_BITS);
293 |   pt.last_cycle = cycle;
294 |   pt.llc_rate_gauge += dt - pt.llc_rate;
295 | 
296 |   // if the gauge reaches its upper limit, increment the rate counter
297 |   // if the gauge reaches its lower limit, decrement the rate counter
298 |   // otherwise leave the rate counter unchanged
299 |   if (pt.llc_rate_gauge > GAUGE_MAX) {
300 |     pt.llc_rate_gauge = GAUGE_MAX;
301 |     if (pt.llc_rate < LLC_RATE_MAX) {
302 |       pt.llc_rate++;
303 |       pt_update_mshr_threshold();
304 |     }
305 |   } else if (pt.llc_rate_gauge < 0) {
306 |     pt.llc_rate_gauge = 0;
307 |     if (pt.llc_rate > 0) {
308 |       pt.llc_rate--;
309 |       pt_update_mshr_threshold();
310 |     }
311 |   }
312 | }
313 | */
314 | 
315 | //######################################################################################
316 | //                               OFFSETS SCORES (OS)
317 | //######################################################################################
318 | 
319 | // A method for determining the best offset value
320 | 
321 | void os_reset()
322 | {
323 |   int i;
324 |   for (i=0; i<NOFFSETS; i++) {
325 |     os.score[i] = 0;
326 |   }
327 |   os.max_score = 0;
328 |   os.best_offset = 0;
329 |   os.round = 0;
330 |   os.p = 0;
331 | }
332 | 
333 | 
334 | // The os_learn_best_offset function tests one offset at a time, trying to determine
335 | // if the current line would have been successfully prefetched with that offset
336 | 
337 | void os_learn_best_offset(t_addr lineaddr)
338 | {
339 |   int testoffset = OFFSET[os.p];
340 |   t_addr testlineaddr = lineaddr - testoffset;
341 | 
342 |   if (SAMEPAGE(lineaddr,testlineaddr) && rr_hit(testlineaddr)) {
343 |     // the current line would likely have been prefetched successfully with that offset
344 |     // ==> increment the score 
345 |     os.score[os.p]++;
346 |     if (os.score[os.p] >= os.max_score) {
347 |       os.max_score = os.score[os.p];
348 |       os.best_offset = testoffset;
349 |     }
350 |   }
351 | 
352 |   if (os.p == (NOFFSETS-1)) {
353 |     // one round finished
354 |     os.round++;
355 | 
356 |     if ((os.max_score == SCORE_MAX) || (os.round == ROUND_MAX)) {
357 |       // learning phase is finished, update the prefetch offset
358 |       prefetch_offset = (os.best_offset != 0)? os.best_offset : DEFAULT_OFFSET;
359 |       pt.prefetch_score = os.max_score;
360 |       //pt_update_mshr_threshold();
361 | 
362 |       if (os.max_score <= BAD_SCORE) {
363 |         // prefetch accuracy is likely to be very low ==> turn the prefetch off 
364 |         prefetch_offset = 0;
365 |       }
366 |       // new learning phase starts
367 |       os_reset();
368 |       return;
369 |     }
370 |   }
371 |   INCREMENT(os.p,NOFFSETS); // prepare to test the next offset
372 | }
373 | 
374 | 
375 | //######################################################################################
376 | //                               DPC2 INTERFACE
377 | //######################################################################################
378 | 
379 | 
380 | void bo_prefetcher_initialize() {
381 |     prefetch_offset = DEFAULT_OFFSET;
382 |     rr_init();
383 |     os_reset();
384 |     dq_init();
385 |     //pt_init();
386 |     int i,j;
387 |     for (i=0; i<LLC_SET; i++) {
388 |         for (j=0; j<LLC_WAY; j++) {
389 |             prefetch_bit[i][j] = 0;
390 |         }
391 |     }
392 | }
393 | 
394 | void bo_prefetcher_operate(uint64_t addr, uint64_t ip, uint8_t cache_hit, uint8_t type, int set, int way, uint32_t degree, vector<uint64_t>& prefetch_candidates)
395 | {
396 |     t_addr lineaddr = addr >> LOGLINE;
397 | 
398 |     int s = set;
399 |     int w = way;
400 |     int llc_hit = (w < LLC_WAY);
401 |     int prefetched = 0;
402 |     assert(prefetch_candidates.size() == 0);
403 | 
404 |     if (llc_hit) {
405 |         // read the prefetch bit, and reset it
406 |         prefetched = prefetch_bit[s][w];
407 |         prefetch_bit[s][w] = 0;
408 |     } 
409 |     else {
410 |         //pt_llc_access();
411 |     }
412 | 
413 |     dq_pop();
414 | 
415 |     //int prefetch_issued = 0;
416 | 
417 |     if (! llc_hit || prefetched ) {
418 |         os_learn_best_offset(lineaddr);
419 | 
420 |         int offset = prefetch_offset;
421 |         if (offset == 0) {
422 |             // The prefetcher is currently turned off.
423 |             // Just push the line address into the delay queue for best-offset learning.
424 |             dq_push(lineaddr);
425 |             //prefetch_issued = 0; 
426 |         }
427 |         /*else if (! SAMEPAGE(lineaddr,lineaddr+offset)) {
428 |             // crossing the page boundary, no prefetch request issued
429 |             prefetch_issued = 0; 
430 |         }*/
431 |         else
432 |         {
433 |             dq_push(lineaddr);
434 |             for(uint32_t i=1; i<=degree; i++)
435 |                 if (pt.prefetch_score > LOW_SCORE)
436 |                     prefetch_candidates.push_back((lineaddr+i*offset)<<LOGLINE);
437 |                     //prefetch_issued += cache->prefetch_line(ip ,lineaddr<<LOGLINE,(lineaddr+i*offset)<<LOGLINE, FILL_LLC, 0);
438 |         }
439 |     }
440 | }
441 | 
442 | 
443 | void bo_prefetcher_cache_fill(uint64_t addr, uint32_t set, uint32_t way, uint8_t prefetch, uint64_t evicted_addr)
444 | {
445 |     
446 |     // In this version of the DPC2 simulator, the "prefetch" boolean passed
447 |     // as input here is not reset whenever a demand request hits in the L2
448 |     // MSHR on an in-flight prefetch request. Fortunately, this is the information
449 |     // we need for updating the RR table for best-offset learning.
450 |     // However, the prefetch bit stored in the L2 is not completely accurate
451 |     // (though hopefully this does not impact performance too much).
452 |     // In a real hardware implementation of the BO prefetcher, we would distinguish
453 |     // "prefetched" and "demand-requested", which are independent informations.
454 | 
455 |     t_addr lineaddr = addr >> LOGLINE;
456 | 
457 |     // write the prefetch bit 
458 |     int s = set;
459 |     int w = way;
460 |     prefetch_bit[s][w] = prefetch;
461 | 
462 |     // write the "right" bank of the RR table
463 |     t_addr baselineaddr;
464 |     if (prefetch || (prefetch_offset == 0)) {
465 |         baselineaddr = lineaddr - prefetch_offset;
466 |         if (SAMEPAGE(lineaddr,baselineaddr)) {
467 |             rr_insert_right(baselineaddr);
468 |         }
469 |     }
470 | }
471 | 
472 | 
// Final-statistics hook of the BO prefetcher; currently does nothing.
void bo_prefetcher_final_stats() {
}
475 | 
476 | #endif // __BO_H
477 | 


--------------------------------------------------------------------------------
/prefetcher/bo.llc_pref:
--------------------------------------------------------------------------------
 1 | #include "bo.h" 
 2 | #define DEGREE 2
 3 | 
// LLC prefetcher init hook: delegates to the Best-Offset prefetcher's initializer.
void CACHE::llc_prefetcher_initialize() 
{
	bo_prefetcher_initialize();
}
 8 | 
 9 | uint32_t CACHE::llc_prefetcher_operate(uint64_t addr, uint64_t ip, uint8_t cache_hit, uint8_t type, uint32_t metadata_in, uint64_t instr_id, uint64_t curr_cycle)
10 | {
11 |     if(instr_id == 0)
12 |         return metadata_in;
13 | 
14 |     vector<uint64_t> bo_candidates;
15 |     bo_prefetcher_operate(addr, ip, cache_hit, type, get_set(addr), get_way(addr, get_set(addr)), DEGREE, bo_candidates);
16 |     for(uint32_t i=0; i<bo_candidates.size(); i++)
17 |         prefetch_line(ip, addr, bo_candidates[i], FILL_LLC, 0);
18 |     return metadata_in;
19 | }
20 | 
// LLC fill hook: forwards the fill to the Best-Offset prefetcher and returns
// metadata_in unchanged.
uint32_t CACHE::llc_prefetcher_cache_fill(uint64_t addr, uint32_t set, uint32_t way, uint8_t prefetch, uint64_t evicted_addr, uint32_t metadata_in)
{
    bo_prefetcher_cache_fill(addr, set, way, prefetch, evicted_addr);
    return metadata_in;
}
26 | 
// LLC final-statistics hook: delegates to the Best-Offset prefetcher.
void CACHE::llc_prefetcher_final_stats()
{
	bo_prefetcher_final_stats();
}
31 | 


--------------------------------------------------------------------------------
/prefetcher/from_file.llc_pref:
--------------------------------------------------------------------------------
 1 | #include "ooo_cpu.h"
 2 | #include "cache.h"
 3 | 
 4 | #include <cstdlib>
 5 | #include <fstream>
 6 | #include <limits>
 7 | #include <unordered_map>
 8 | #include <vector>
 9 | 
// Maximum number of prefetch addresses kept per instruction ID; additional
// entries read at initialization are reported on stderr and dropped.
#define MAX_PREFETCH_DEGREE 2

// Prefetch addresses keyed by instruction ID. Filled once in
// llc_prefetcher_initialize() and looked up in llc_prefetcher_operate().
unordered_map<uint64_t, vector<uint64_t>> prefetches;
13 | 
14 | void CACHE::llc_prefetcher_initialize() 
15 | {
16 |     cout << "CPU " << cpu << " LLC from_file prefetcher" << endl;
17 | 
18 |     uint64_t line_no = 0;
19 |     uint64_t instr_id, addr;
20 | 
21 |     while(cin >> dec >> instr_id >> hex >> addr) {
22 |         auto itr = prefetches.find(instr_id);
23 |         if (itr == prefetches.end()) {
24 |             prefetches[instr_id] = vector<uint64_t>();
25 |             prefetches[instr_id].push_back(addr);
26 |         } else {
27 |             if (prefetches[instr_id].size() < MAX_PREFETCH_DEGREE) {
28 |                 prefetches[instr_id].push_back(addr);
29 |             } else {
30 |                 cerr << "Exceeded max prefetch degree of " << MAX_PREFETCH_DEGREE << " on line " << line_no << " for instr_id " << instr_id << endl;
31 |             }
32 |         }
33 |         line_no++;
34 |     }
35 | 
36 | }
37 | 
38 | uint32_t CACHE::llc_prefetcher_operate(uint64_t addr, uint64_t ip, uint8_t cache_hit, uint8_t type, uint32_t metadata_in, uint64_t instr_id, uint64_t curr_cycle)
39 | {
40 |     if(instr_id == 0)  return metadata_in; //No prefetches for instructions with id 0 (prefetches and code misses)
41 | 
42 |     auto itr = prefetches.find(instr_id);
43 |     if (itr != prefetches.end()) {
44 |         for(auto prefetch_addr : itr->second) {
45 |             // cout << "Prefetch " << hex << prefetch_addr << dec << " for instr_id " << instr_id << endl;
46 |             prefetch_line(ip, addr, prefetch_addr, FILL_LLC, 0);
47 |         }
48 |     }
49 | 
50 |     return metadata_in;
51 | }
52 | 
// LLC fill hook: the file-driven prefetcher keeps no fill state, so the
// metadata is returned unchanged and all other arguments are ignored.
uint32_t CACHE::llc_prefetcher_cache_fill(uint64_t addr, uint32_t set, uint32_t way, uint8_t prefetch, uint64_t evicted_addr, uint32_t metadata_in)
{
  return metadata_in;
}
57 | 
// LLC final-statistics hook: prints a one-line banner; no counters are reported.
void CACHE::llc_prefetcher_final_stats()
{
    cout << "CPU " << cpu << " LLC from file prefetcher final stats" << endl;
}
62 | 


--------------------------------------------------------------------------------
/prefetcher/ip_stride.l2c_pref:
--------------------------------------------------------------------------------
  1 | //
  2 | // From Data Prefetching Championship Simulator 2
  3 | // Seth Pugsley, seth.h.pugsley@intel.com
  4 | //
  5 | 
  6 | /*
  7 | 
  8 |   This file describes an Instruction Pointer-based (Program Counter-based) stride prefetcher.  
  9 |   The prefetcher detects stride patterns coming from the same IP, and then 
 10 |   prefetches additional cache lines.
 11 | 
 12 |   Prefetches are issued into the L2 or LLC depending on L2 MSHR occupancy.
 13 | 
 14 |  */
 15 | 
 16 | #include "cache.h"
 17 | 
// Number of entries in the IP tracker table.
#define IP_TRACKER_COUNT 1024
// Maximum number of prefetches issued each time a repeated stride is seen.
#define PREFETCH_DEGREE 3
 20 | 
 21 | class IP_TRACKER {
 22 |   public:
 23 |     // the IP we're tracking
 24 |     uint64_t ip;
 25 | 
 26 |     // the last address accessed by this IP
 27 |     uint64_t last_cl_addr;
 28 | 
 29 |     // the stride between the last two addresses accessed by this IP
 30 |     int64_t last_stride;
 31 | 
 32 |     // use LRU to evict old IP trackers
 33 |     uint32_t lru;
 34 | 
 35 |     IP_TRACKER () {
 36 |         ip = 0;
 37 |         last_cl_addr = 0;
 38 |         last_stride = 0;
 39 |         lru = 0;
 40 |     };
 41 | };
 42 | 
// File-scope tracker table, searched linearly by l2c_prefetcher_operate().
IP_TRACKER trackers[IP_TRACKER_COUNT];
 44 | 
 45 | void CACHE::l2c_prefetcher_initialize() 
 46 | {
 47 |     cout << "CPU " << cpu << " L2C IP-based stride prefetcher" << endl;
 48 |     for (int i=0; i<IP_TRACKER_COUNT; i++)
 49 |         trackers[i].lru = i;
 50 | }
 51 | 
 52 | uint32_t CACHE::l2c_prefetcher_operate(uint64_t addr, uint64_t ip, uint8_t cache_hit, uint8_t type, uint32_t metadata_in)
 53 | {
 54 |     // check for a tracker hit
 55 |     uint64_t cl_addr = addr >> LOG2_BLOCK_SIZE;
 56 | 
 57 |     int index = -1;
 58 |     for (index=0; index<IP_TRACKER_COUNT; index++) {
 59 |         if (trackers[index].ip == ip)
 60 |             break;
 61 |     }
 62 | 
 63 |     // this is a new IP that doesn't have a tracker yet, so allocate one
 64 |     if (index == IP_TRACKER_COUNT) {
 65 | 
 66 |         for (index=0; index<IP_TRACKER_COUNT; index++) {
 67 |             if (trackers[index].lru == (IP_TRACKER_COUNT-1))
 68 |                 break;
 69 |         }
 70 | 
 71 |         trackers[index].ip = ip;
 72 |         trackers[index].last_cl_addr = cl_addr;
 73 |         trackers[index].last_stride = 0;
 74 | 
 75 |         //cout << "[IP_STRIDE] MISS index: " << index << " lru: " << trackers[index].lru << " ip: " << hex << ip << " cl_addr: " << cl_addr << dec << endl;
 76 | 
 77 |         for (int i=0; i<IP_TRACKER_COUNT; i++) {
 78 |             if (trackers[i].lru < trackers[index].lru)
 79 |                 trackers[i].lru++;
 80 |         }
 81 |         trackers[index].lru = 0;
 82 | 
 83 |         return metadata_in;
 84 |     }
 85 | 
 86 |     // sanity check
 87 |     // at this point we should know a matching tracker index
 88 |     if (index == -1)
 89 |         assert(0);
 90 | 
 91 |     // calculate the stride between the current address and the last address
 92 |     // this bit appears overly complicated because we're calculating
 93 |     // differences between unsigned address variables
 94 |     int64_t stride = 0;
 95 |     if (cl_addr > trackers[index].last_cl_addr)
 96 |         stride = cl_addr - trackers[index].last_cl_addr;
 97 |     else {
 98 |         stride = trackers[index].last_cl_addr - cl_addr;
 99 |         stride *= -1;
100 |     }
101 | 
102 |     //cout << "[IP_STRIDE] HIT  index: " << index << " lru: " << trackers[index].lru << " ip: " << hex << ip << " cl_addr: " << cl_addr << dec << " stride: " << stride << endl;
103 | 
104 |     // don't do anything if we somehow saw the same address twice in a row
105 |     if (stride == 0)
106 |         return metadata_in;
107 | 
108 |     // only do any prefetching if there's a pattern of seeing the same
109 |     // stride more than once
110 |     if (stride == trackers[index].last_stride) {
111 | 
112 |         // do some prefetching
113 |         for (int i=0; i<PREFETCH_DEGREE; i++) {
114 |             uint64_t pf_address = (cl_addr + (stride*(i+1))) << LOG2_BLOCK_SIZE;
115 | 
116 |             // only issue a prefetch if the prefetch address is in the same 4 KB page 
117 |             // as the current demand access address
118 |             if ((pf_address >> LOG2_PAGE_SIZE) != (addr >> LOG2_PAGE_SIZE))
119 |                 break;
120 | 
121 |             // check the MSHR occupancy to decide if we're going to prefetch to the L2 or LLC
122 |             if (MSHR.occupancy < (MSHR.SIZE>>1))
123 | 	      prefetch_line(ip, addr, pf_address, FILL_L2, 0);
124 |             else
125 | 	      prefetch_line(ip, addr, pf_address, FILL_LLC, 0);
126 |         }
127 |     }
128 | 
129 |     trackers[index].last_cl_addr = cl_addr;
130 |     trackers[index].last_stride = stride;
131 | 
132 |     for (int i=0; i<IP_TRACKER_COUNT; i++) {
133 |         if (trackers[i].lru < trackers[index].lru)
134 |             trackers[i].lru++;
135 |     }
136 |     trackers[index].lru = 0;
137 | 
138 |     return metadata_in;
139 | }
140 | 
// Fill hook: the IP-stride prefetcher does not react to fills; metadata_in is
// returned unchanged.
uint32_t CACHE::l2c_prefetcher_cache_fill(uint64_t addr, uint32_t set, uint32_t way, uint8_t prefetch, uint64_t evicted_addr, uint32_t metadata_in)
{
  return metadata_in;
}
145 | 
// Print the end-of-run banner; no statistics are collected by this prefetcher.
void CACHE::l2c_prefetcher_final_stats()
{
    cout << "CPU " << cpu << " L2C PC-based stride prefetcher final stats" << endl;
}
150 | 


--------------------------------------------------------------------------------
/prefetcher/kpcp_util.cc:
--------------------------------------------------------------------------------
  1 | /*
  2 | //#include "cache.h"
  3 | #include "kpcp.h"
  4 | 
  5 | SIGNATURE_TABLE L2_ST[NUM_CPUS][L2_ST_SET][L2_ST_WAY];
  6 | PATTERN_TABLE L2_PT[NUM_CPUS][L2_PT_SET][L2_PT_WAY];
  7 | GLOBAL_HISTORY_REGISTER L2_GHR[NUM_CPUS][L2_GHR_TRACK];
  8 | 
  9 | int L2_ST_access[NUM_CPUS], L2_ST_hit[NUM_CPUS], L2_ST_invalid[NUM_CPUS], L2_ST_miss[NUM_CPUS];
 10 | int L2_PT_access[NUM_CPUS], L2_PT_hit[NUM_CPUS], L2_PT_invalid[NUM_CPUS], L2_PT_miss[NUM_CPUS];
 11 | int l2_sig_dist[NUM_CPUS][1<<SIG_LENGTH];
 12 | 
 13 | unsigned int get_new_signature(unsigned int old_signature, int curr_delta)
 14 | {
 15 |     if (curr_delta == 0)
 16 |         return old_signature;
 17 | 
 18 |     unsigned int new_signature = 0;
 19 |     int sig_delta = curr_delta;
 20 |     if (sig_delta < 0)
 21 |         sig_delta = 64 + curr_delta*(-1);
 22 |     new_signature = ((old_signature << SIG_SHIFT) ^ sig_delta) & SIG_MASK;
 23 |     if (new_signature == 0)
 24 |     {
 25 |         //printf("old_signature: %x  SIG_SHIFT: %d  sig_delta: %d  SIG_LENGTH: %d\n", old_signature, SIG_SHIFT, sig_delta, SIG_LENGTH);
 26 |         if (sig_delta)
 27 |             return sig_delta;
 28 |         else
 29 |             return old_signature;
 30 |     }
 31 |     return new_signature;
 32 | }
 33 | 
 34 | // Update signature table
 35 | int L2_ST_update(uint32_t cpu, uint64_t addr)
 36 | {
 37 |     uint64_t curr_page = addr >> LOG2_PAGE_SIZE;
 38 |     int tag = curr_page & 0xFFFF,
 39 |         hit = 0, match = -1,
 40 |         L2_ST_idx = curr_page % L2_ST_PRIME,
 41 |         curr_block = (addr >> LOG2_BLOCK_SIZE) & 0x3F;
 42 |     SIGNATURE_TABLE *table = L2_ST[cpu][L2_ST_idx];
 43 |     int delta_buffer = 0, sig_buffer = 0;
 44 | 
 45 |     for (match=0; match<L2_ST_WAY; match++) {
 46 |         if (table[match].valid && (table[match].tag == tag)) { // Hit 
 47 | 			delta_buffer = curr_block - table[match].last_block; // Buffer current delta
 48 |             sig_buffer = table[match].signature; // Buffer old signature
 49 | 
 50 |             if (table[match].signature == 0) { // First hit in L2_ST
 51 |                 // We cannot associate delta pattern with signature when we see "the first hit in L2_ST"
 52 |                 // At this point, all we know about this page is "the first accessed offset"
 53 |                 // We don't have any delta information that can be a part of signature
 54 |                 // In other words, the first offset does not update PT
 55 | 
 56 |                 int sig_delta = curr_block - table[match].last_block;
 57 |                 if (sig_delta < 0)
 58 |                     sig_delta = 64 + (curr_block - table[match].last_block)*(-1);
 59 |                 table[match].signature = sig_delta & SIG_MASK; // This is the first signature
 60 |                 table[match].first_hit = 1;
 61 |                 l2_sig_dist[cpu][table[match].signature]++;
 62 | 
 63 |                 if (warmup_complete[cpu])
 64 |                 L2_PF_DEBUG(printf("ST_hit_first cpu: %d cl_addr: %lx page: %lx block: %d init_sig: %x delta: %d\n", 
 65 |                             cpu, addr >> LOG2_BLOCK_SIZE, curr_page, curr_block, table[match].signature, delta_buffer));
 66 |             }
 67 |             else {
 68 |                 hit = 1;
 69 |                 table[match].first_hit = 0;
 70 | 
 71 |                 if (delta_buffer) {
 72 |                     // This is non-speculative information tracked from actual L2 cache demand
 73 |                     // Now, the old signature will be associated with current delta
 74 |                     L2_PT_update(cpu, sig_buffer, delta_buffer);
 75 |                 }
 76 |                 else
 77 |                     break;
 78 | 
 79 |                 if (warmup_complete[cpu])
 80 |                 L2_PF_DEBUG(printf("ST_hit cpu: %d cl_addr: %lx page: %lx block: %d old_sig: %x delta: %d\n", 
 81 |                             cpu, addr >> LOG2_BLOCK_SIZE, curr_page, curr_block, sig_buffer, delta_buffer));
 82 | 
 83 |                 // Update signature
 84 |                 int new_signature = get_new_signature(sig_buffer, delta_buffer);
 85 |                 table[match].signature = new_signature;
 86 |                 l2_sig_dist[cpu][table[match].signature]++;
 87 |             }
 88 | 
 89 | 			// Update last_block
 90 | 			table[match].last_block = curr_block;
 91 |             L2_ST_hit[cpu]++; L2_ST_access[cpu]++;
 92 |             break;
 93 |         }
 94 |     }
 95 | 
 96 |     if (match == L2_ST_WAY) {
 97 |         for (match=0; match<L2_ST_WAY; match++) {
 98 |             if (table[match].valid == 0) { // Invalid
 99 |                 // Update metadata
100 |                 table[match].valid = 1;
101 |                 table[match].tag = tag;
102 |                 table[match].signature = 0;
103 |                 table[match].first_hit = 0;
104 |                 table[match].last_block = curr_block;
105 |                 L2_ST_invalid[cpu]++; L2_ST_access[cpu]++;
106 | 
107 |                 if (warmup_complete[cpu])
108 |                 L2_PF_DEBUG(printf("ST_invalid cpu: %d cl_addr: %lx page: %lx block: %d\n", cpu, addr >> LOG2_BLOCK_SIZE, curr_page, curr_block));
109 |                 break;
110 |             }
111 |         }
112 |     }
113 | 
114 |     if (match == L2_ST_WAY) { // Miss
115 |         // Search for LRU victim
116 |         for (match=0; match<L2_ST_WAY; match++) {
117 |             if (table[match].lru == (L2_ST_WAY-1))
118 |                 break;
119 |         }
120 | 
121 |         // Update metadata
122 |         table[match].valid = 1;
123 |         table[match].tag = tag;
124 |         table[match].signature = 0;
125 |         table[match].first_hit = 0;
126 |         table[match].last_block = curr_block;
127 |         
128 |         for (int i=0; i<64; i++) {
129 |             table[match].l2_pf[i] = 0;
130 |             table[match].used[i] = 0;
131 |         }
132 | 
133 |         if (warmup_complete[cpu])
134 |         L2_PF_DEBUG(printf("ST_miss cpu: %d cl_addr: %lx page: %lx block: %d lru: %d\n", cpu, addr >> LOG2_BLOCK_SIZE, curr_page, curr_block, table[match].lru));
135 |         L2_ST_miss[cpu]++; L2_ST_access[cpu]++;
136 | 
137 |         #ifdef L2_GHR_ON
138 |         // Check GHR
139 |         int ghr_max = 0, ghr_idx = -1, spec_block = 0, spec_sig = 0;
140 |         for (int i=0; i<L2_GHR_TRACK; i++) {
141 |             spec_block = L2_GHR[cpu][i].last_block + L2_GHR[cpu][i].oop_delta;
142 |             if (spec_block >= 64)
143 |                 spec_block -= 64;
144 |             else if (spec_block < 0)
145 |                 spec_block += 64;
146 |             if ((spec_block == curr_block) && (ghr_max <= L2_GHR[cpu][i].path_conf)) {
147 |                 ghr_max = L2_GHR[cpu][i].path_conf;
148 |                 ghr_idx = i;
149 |                 spec_sig = get_new_signature(L2_GHR[cpu][i].signature, L2_GHR[cpu][i].oop_delta);
150 |                 if (warmup_complete[cpu])
151 |                 L2_PF_DEBUG(printf("cpu: %d OOP_match  L2_GHR[%d]  signature: %x  path_conf: %d  last_block: %d  oop_delta: %d  spec_block: %d == curr_block: %d  spec_sig: %x\n",
152 |                           cpu, i, L2_GHR[cpu][i].signature, L2_GHR[cpu][i].path_conf, L2_GHR[cpu][i].last_block, 
153 |                           L2_GHR[cpu][i].oop_delta, spec_block, curr_block, spec_sig));
154 |             }
155 |             else {
156 |                 if (warmup_complete[cpu])
157 |                 L2_PF_DEBUG(printf("cpu: %d OOP_unmatch  L2_GHR[%d]  signature: %x  path_conf: %d  last_block: %d  oop_delta: %d  spec_block: %d != curr_block: %d  spec_sig: %x\n",
158 |                           cpu, i, L2_GHR[cpu][i].signature, L2_GHR[cpu][i].path_conf, L2_GHR[cpu][i].last_block, 
159 |                           L2_GHR[cpu][i].oop_delta, spec_block, curr_block, spec_sig));
160 |             }
161 |         }
162 | 
163 |         if (ghr_idx >= 0) {
164 |             // Speculatively update first page
165 |             spec_sig = get_new_signature(L2_GHR[cpu][ghr_idx].signature, L2_GHR[cpu][ghr_idx].oop_delta);
166 | 
167 |             hit = 1;
168 |             table[match].signature = spec_sig;
169 |             if (warmup_complete[cpu])
170 |             L2_PF_DEBUG(printf("cpu: %d spec_update  page: %x  sig: %3x  delta: %3d  curr_block: %2d  last_block[NA]: %2d\n", 
171 |                       cpu, tag, spec_sig, L2_GHR[cpu][ghr_idx].oop_delta, curr_block, L2_GHR[cpu][ghr_idx].last_block));
172 |         }
173 |         #endif
174 |     }
175 | 
176 |     // Update LRU
177 |     int position = table[match].lru;
178 |     for (int i=0; i<L2_ST_WAY; i++) {
179 |         if (table[i].lru < position)
180 |             table[i].lru++;
181 |     }
182 |     table[match].lru = 0;
183 | 
184 |     if (hit)
185 |         return match;
186 |     else
187 |         return -1;
188 | }
189 | 
190 | int L2_ST_check(uint32_t cpu, uint64_t addr)
191 | {
192 |     uint64_t curr_page = addr >> LOG2_PAGE_SIZE;
193 |     int tag = curr_page & 0xFFFF,
194 |         match = -1,
195 |         L2_ST_idx = curr_page % L2_ST_PRIME;
196 | 
197 |     SIGNATURE_TABLE *table = L2_ST[cpu][L2_ST_idx];
198 | 
199 |     for (match=0; match<L2_ST_WAY; match++) {
200 |         if (table[match].valid && (table[match].tag == tag)) {
201 |             if (warmup_complete[cpu])
202 |             L2_PF_DEBUG(printf("ST_check found cpu: %d cl_addr: %lx page: %lx block: %ld old_sig: %x last_block: %d\n", 
203 |                         cpu, addr >> LOG2_BLOCK_SIZE, curr_page, (addr >> LOG2_BLOCK_SIZE) & 0x3F, table[match].signature, table[match].last_block));
204 |             return match;
205 |         }
206 |     }
207 | 
208 |     if (warmup_complete[cpu])
209 |     L2_PF_DEBUG(printf("ST_check not found cpu: %d cl_addr: %lx page: %lx block: %ld\n", cpu, addr >> LOG2_BLOCK_SIZE, curr_page, (addr >> LOG2_BLOCK_SIZE) & 0x3F));
210 |     return -1;
211 | }
212 | 
213 | void L2_PT_update(uint32_t cpu, int signature, int delta)
214 | {
215 |     int L2_PT_idx = signature % L2_PT_PRIME;
216 |     PATTERN_TABLE *table = L2_PT[cpu][L2_PT_idx];
217 | 
218 |     // Update L2_PT
219 |     // Update metadata
220 |     table[0].c_sig++;
221 | 
222 |     if (table[0].c_sig == (CSIG_MAX))
223 |     {
224 |         table[0].c_sig = CSIG_MAX >> 1;
225 |         for (int i = 0; i<L2_PT_WAY; i++)
226 |             table[i].c_delta = table[i].c_delta >> 1;
227 |         if (warmup_complete[cpu])
228 |         L2_PF_DEBUG(printf("PT_sig: %4x cpu: %d c_sig saturated sig_total: %d => %d\n", L2_PT_idx, cpu, CSIG_MAX, table[0].c_sig));
229 |     }
230 | 
231 |     int match;
232 |     for (match=0; match<L2_PT_WAY; match++) 
233 |     {
234 |         if (table[match].delta == delta) // Hit 
235 |         {
236 |             table[match].c_delta++;
237 | 
238 |             if (warmup_complete[cpu])
239 |             L2_PF_DEBUG(printf("PT_sig: %4x cpu: %d update_hit delta[%d]: %2d (%d / %d)\n", 
240 |                         signature, cpu, match, table[match].delta, table[match].c_delta, table[0].c_sig));
241 |             L2_PT_hit[cpu]++; L2_PT_access[cpu]++;
242 |             break;
243 |         }
244 |     }
245 | 
246 |     if (match == L2_PT_WAY)
247 |     {
248 |         for (match=0; match<L2_PT_WAY; match++)
249 |         {
250 |             if (table[match].delta == 0) // Invalid
251 |             {
252 |                 // Update metadata
253 |                 table[match].delta = delta;
254 |                 table[match].c_delta = 0;
255 | 
256 |                 if (warmup_complete[cpu])
257 |                 L2_PF_DEBUG(printf("PT_sig: %4x cpu: %d update_invalid delta[%d]: %2d (%d / %d)\n", 
258 |                             signature, cpu, match, table[match].delta, table[match].c_delta, table[0].c_sig));
259 |                 L2_PT_invalid[cpu]++; L2_PT_access[cpu]++;
260 |                 break;
261 |             }
262 |         }
263 |     }
264 | 
265 |     if (match == L2_PT_WAY) // Miss
266 |     {
267 |         // Search for the lowest counter
268 |         int min_idx = -1;
269 |         int min_val = CDELTA_MAX;
270 |         for (match=0; match<L2_PT_WAY; match++)
271 |         {
272 |             if (table[match].c_delta < min_val)
273 |             {
274 |                 min_idx = match;
275 |                 min_val = table[match].c_delta;
276 |             }
277 |         }
278 |         match = min_idx;
279 | 
280 |         // Update metadata
281 |         table[match].delta = delta;
282 |         table[match].c_delta = 0;
283 | 
284 |         if (warmup_complete[cpu])
285 |         L2_PF_DEBUG(printf("PT_sig: %4x cpu: %d update_miss delta[%d]: %2d (%d / %d)\n", 
286 |                     signature, cpu, match, table[match].delta, table[match].c_delta, table[0].c_sig));
287 |         L2_PT_miss[cpu]++; L2_PT_access[cpu]++;
288 |     }
289 | }
290 | */
291 | 
// TODO: this function should be moved to the replacement policy file
293 | // Check sampler 
294 | //void notify_sampler(uint32_t cpu, int64_t address, int dirty, int useful)
295 | //{
296 |     /*
297 |     int set = llc_get_set(address);
298 |     int s_idx = is_it_sampled(set);
299 | 
300 |     if (s_idx == -1)
301 |         return;
302 | 
303 |     SAMPLER_T *s_set = sampler[s_idx];
304 |     int tag = (int) address / (64*LLC_SETS); 
305 |     int match = -1;
306 | 
307 |     // Check hit
308 |     for (match=0; match<SAMPLER_WAY; match++)
309 |     {
310 |         if (s_set[match].valid && (s_set[match].tag == tag))
311 |         {
312 |             if (s_set[match].l2pf)
313 |             {
314 |                 if (useful)
315 |                 {
316 |                     if (conf_counter[cpu] < MAX_CC)
317 |                         conf_counter[cpu]++;
318 | 
319 |                     if (conf_counter[cpu] == MAX_CC)
320 |                     {
321 |                         if (dynamic_fill_thrs[cpu] > 0)
322 |                         {
323 |                             dynamic_fill_thrs[cpu]--;
324 |                             fill_down++;
325 |                             conf_level[dynamic_fill_thrs[cpu]]++;
326 | 
327 |                             printf("FILL_THRESHOLD goes down %d => %d at cycle: %ld\n", dynamic_fill_thrs[cpu]+1, dynamic_fill_thrs[cpu], ooo_cpu[cpu].current_cycle);
328 |                         }
329 |                             
330 |                         conf_counter[cpu] = 0;
331 |                     }
332 | 
333 |                     l2pf_was_useful++;
334 |                 }
335 |                 else
336 |                 {
337 |                     if (conf_counter[cpu] > 0)
338 |                         conf_counter[cpu]--;
339 | 
340 |                     l2pf_was_useless++;
341 |                 }
342 | 
343 |                 l2pf_match++;
344 |             }
345 |             
346 |             break;
347 |         }
348 |     }
349 |     l2pf_signal++;
350 | 
351 |     return;
352 |     */
353 | //}
354 | 


--------------------------------------------------------------------------------
/prefetcher/next_line.l1d_pref:
--------------------------------------------------------------------------------
 1 | #include "cache.h"
 2 | 
// Announce the L1D next-line prefetcher; it has no state to set up.
void CACHE::l1d_prefetcher_initialize() 
{
    cout << "CPU " << cpu << " L1D next line prefetcher" << endl;
}
 7 | 
 8 | void CACHE::l1d_prefetcher_operate(uint64_t addr, uint64_t ip, uint8_t cache_hit, uint8_t type)
 9 | {
10 |     uint64_t pf_addr = ((addr>>LOG2_BLOCK_SIZE)+1) << LOG2_BLOCK_SIZE;
11 | 
12 |     DP ( if (warmup_complete[cpu]) {
13 |     cout << "[" << NAME << "] " << __func__ << hex << " base_cl: " << (addr>>LOG2_BLOCK_SIZE);
14 |     cout << " pf_cl: " << (pf_addr>>LOG2_BLOCK_SIZE) << " ip: " << ip << " cache_hit: " << +cache_hit << " type: " << +type << endl; });
15 | 
16 |     prefetch_line(ip, addr, pf_addr, FILL_L1, 0);
17 | }
18 | 
// Fill hook: intentionally empty, the next-line prefetcher ignores fills.
void CACHE::l1d_prefetcher_cache_fill(uint64_t addr, uint32_t set, uint32_t way, uint8_t prefetch, uint64_t evicted_addr, uint32_t metadata_in)
{

}
23 | 
// Print the end-of-run banner; no statistics are collected.
void CACHE::l1d_prefetcher_final_stats()
{
    cout << "CPU " << cpu << " L1D next line prefetcher final stats" << endl;
}
28 | 


--------------------------------------------------------------------------------
/prefetcher/next_line.l1i_pref:
--------------------------------------------------------------------------------
 1 | #include "ooo_cpu.h"
 2 | 
// Announce the L1I next-line prefetcher; it has no state to set up.
void O3_CPU::l1i_prefetcher_initialize() 
{
  cout << "CPU " << cpu << " L1I next line prefetcher" << endl;
}
 7 | 
// Branch hook: intentionally empty, this prefetcher does not use branch
// information.
void O3_CPU::l1i_prefetcher_branch_operate(uint64_t ip, uint8_t branch_type, uint64_t branch_target)
{
  
}
12 | 
13 | void O3_CPU::l1i_prefetcher_cache_operate(uint64_t v_addr, uint8_t cache_hit, uint8_t prefetch_hit)
14 | {
15 |   //cout << "access v_addr: 0x" << hex << v_addr << dec << endl;
16 |   
17 |   if((cache_hit == 0) && (L1I.MSHR.occupancy < (L1I.MSHR.SIZE>>1)))
18 |     {
19 |       uint64_t pf_addr = v_addr + (1<<LOG2_BLOCK_SIZE);
20 |       prefetch_code_line(pf_addr);
21 |     }
22 | }
23 | 
// Per-cycle hook: intentionally empty.
void O3_CPU::l1i_prefetcher_cycle_operate()
{

}
28 | 
// Fill hook: does nothing; the commented-out line below is a debug trace of
// the fill and evicted addresses.
void O3_CPU::l1i_prefetcher_cache_fill(uint64_t v_addr, uint32_t set, uint32_t way, uint8_t prefetch, uint64_t evicted_v_addr)
{
  //cout << hex << "fill: 0x" << v_addr << dec << " " << set << " " << way << " " << (uint32_t)prefetch << " " << hex << "evict: 0x" << evicted_v_addr << dec << endl;
}
33 | 
// Print the end-of-run banner; no statistics are collected.
void O3_CPU::l1i_prefetcher_final_stats()
{
  cout << "CPU " << cpu << " L1I next line prefetcher final stats" << endl;
}
38 | 


--------------------------------------------------------------------------------
/prefetcher/next_line.l2c_pref:
--------------------------------------------------------------------------------
 1 | #include "cache.h"
 2 | 
// Announce the L2C next-line prefetcher; it has no state to set up.
void CACHE::l2c_prefetcher_initialize() 
{
    cout << "CPU " << cpu << " L2C next line prefetcher" << endl;
}
 7 | 
 8 | uint32_t CACHE::l2c_prefetcher_operate(uint64_t addr, uint64_t ip, uint8_t cache_hit, uint8_t type, uint32_t metadata_in)
 9 | {
10 |     uint64_t pf_addr = ((addr>>LOG2_BLOCK_SIZE)+1) << LOG2_BLOCK_SIZE;
11 | 
12 |     DP ( if (warmup_complete[cpu]) {
13 |     cout << "[" << NAME << "] " << __func__ << hex << " base_cl: " << (addr>>LOG2_BLOCK_SIZE);
14 |     cout << " pf_cl: " << (pf_addr>>LOG2_BLOCK_SIZE) << " ip: " << ip << " cache_hit: " << +cache_hit << " type: " << +type << endl; });
15 | 
16 |     prefetch_line(ip, addr, pf_addr, FILL_L2, 0);
17 | 
18 |     return metadata_in;
19 | }
20 | 
// Fill hook: nothing to do; metadata_in is returned unchanged.
uint32_t CACHE::l2c_prefetcher_cache_fill(uint64_t addr, uint32_t set, uint32_t way, uint8_t prefetch, uint64_t evicted_addr, uint32_t metadata_in)
{
  return metadata_in;
}
25 | 
// Print the end-of-run banner; no statistics are collected.
void CACHE::l2c_prefetcher_final_stats()
{
    cout << "CPU " << cpu << " L2C next line prefetcher final stats" << endl;
}
30 | 


--------------------------------------------------------------------------------
/prefetcher/next_line.llc_pref:
--------------------------------------------------------------------------------
 1 | #include "cache.h"
 2 | 
// Announce the LLC next-line prefetcher; it has no state to set up.
void CACHE::llc_prefetcher_initialize() 
{
    cout << "LLC Next Line Prefetcher" << endl;
}
 7 | 
 8 | uint32_t CACHE::llc_prefetcher_operate(uint64_t addr, uint64_t ip, uint8_t cache_hit, uint8_t type, uint32_t metadata_in, uint64_t instr_id, uint64_t curr_cycle)
 9 | {
10 |   uint64_t pf_addr = ((addr>>LOG2_BLOCK_SIZE)+1) << LOG2_BLOCK_SIZE;
11 |   prefetch_line(ip, addr, pf_addr, FILL_LLC, 0);
12 | 
13 |   return metadata_in;
14 | }
15 | 
// Fill hook: nothing to do; metadata_in is returned unchanged.
uint32_t CACHE::llc_prefetcher_cache_fill(uint64_t addr, uint32_t set, uint32_t way, uint8_t prefetch, uint64_t evicted_addr, uint32_t metadata_in)
{
  return metadata_in;
}
20 | 
// Print the end-of-run banner; no statistics are collected.
void CACHE::llc_prefetcher_final_stats()
{
  cout << "LLC Next Line Prefetcher Final Stats: none" << endl;
}
25 | 


--------------------------------------------------------------------------------
/prefetcher/no.l1d_pref:
--------------------------------------------------------------------------------
 1 | #include "cache.h"
 2 | 
// No-op: the "no" L1D prefetcher has nothing to initialize.
void CACHE::l1d_prefetcher_initialize() 
{

}
 7 | 
// No-op: the "no" L1D prefetcher never issues a prefetch.
void CACHE::l1d_prefetcher_operate(uint64_t addr, uint64_t ip, uint8_t cache_hit, uint8_t type)
{

}
12 | 
// No-op fill hook.
void CACHE::l1d_prefetcher_cache_fill(uint64_t addr, uint32_t set, uint32_t way, uint8_t prefetch, uint64_t evicted_addr, uint32_t metadata_in)
{

}
17 | 
// No-op: there are no statistics to report.
void CACHE::l1d_prefetcher_final_stats()
{

}
22 | 


--------------------------------------------------------------------------------
/prefetcher/no.l1i_pref:
--------------------------------------------------------------------------------
 1 | #include "ooo_cpu.h"
 2 | 
// No-op: the "no" L1I prefetcher has nothing to initialize.
void O3_CPU::l1i_prefetcher_initialize() 
{

}
 7 | 
// No-op branch hook.
void O3_CPU::l1i_prefetcher_branch_operate(uint64_t ip, uint8_t branch_type, uint64_t branch_target)
{

}
12 | 
// No-op: the "no" L1I prefetcher never issues a prefetch.
void O3_CPU::l1i_prefetcher_cache_operate(uint64_t v_addr, uint8_t cache_hit, uint8_t prefetch_hit)
{

}
17 | 
// No-op per-cycle hook.
void O3_CPU::l1i_prefetcher_cycle_operate()
{

}
22 | 
// No-op fill hook.
void O3_CPU::l1i_prefetcher_cache_fill(uint64_t v_addr, uint32_t set, uint32_t way, uint8_t prefetch, uint64_t evicted_v_addr)
{

}
27 | 
// No-op: there are no statistics to report.
void O3_CPU::l1i_prefetcher_final_stats()
{

}
32 | 


--------------------------------------------------------------------------------
/prefetcher/no.l2c_pref:
--------------------------------------------------------------------------------
 1 | #include "cache.h"
 2 | 
// No-op: the "no" L2C prefetcher has nothing to initialize.
void CACHE::l2c_prefetcher_initialize() 
{

}
 7 | 
// Never issues a prefetch; metadata_in is returned unchanged.
uint32_t CACHE::l2c_prefetcher_operate(uint64_t addr, uint64_t ip, uint8_t cache_hit, uint8_t type, uint32_t metadata_in)
{
  return metadata_in;
}
12 | 
// Fill hook: nothing to do; metadata_in is returned unchanged.
uint32_t CACHE::l2c_prefetcher_cache_fill(uint64_t addr, uint32_t set, uint32_t way, uint8_t prefetch, uint64_t evicted_addr, uint32_t metadata_in)
{
  return metadata_in;
}
17 | 
// No-op: there are no statistics to report.
void CACHE::l2c_prefetcher_final_stats()
{

}
22 | 


--------------------------------------------------------------------------------
/prefetcher/no.llc_pref:
--------------------------------------------------------------------------------
 1 | #include "cache.h"
 2 | 
// No-op: the "no" LLC prefetcher has nothing to initialize.
void CACHE::llc_prefetcher_initialize() 
{

}
 7 | 
// Never issues a prefetch; metadata_in is returned unchanged.
uint32_t CACHE::llc_prefetcher_operate(uint64_t addr, uint64_t ip, uint8_t cache_hit, uint8_t type, uint32_t metadata_in, uint64_t instr_id, uint64_t curr_cycle)
{
  return metadata_in;
}
12 | 
// Fill hook: nothing to do; metadata_in is returned unchanged.
uint32_t CACHE::llc_prefetcher_cache_fill(uint64_t addr, uint32_t set, uint32_t way, uint8_t prefetch, uint64_t evicted_addr, uint32_t metadata_in)
{
  return metadata_in;
}
17 | 
// No-op: there are no statistics to report.
void CACHE::llc_prefetcher_final_stats()
{

}
22 | 


--------------------------------------------------------------------------------
/prefetcher/trace.llc_pref:
--------------------------------------------------------------------------------
 1 | #include "cache.h"
 2 | 
// No-op: the trace "prefetcher" keeps no state.
void CACHE::llc_prefetcher_initialize() 
{

}
 7 | 
 8 | uint32_t CACHE::llc_prefetcher_operate(uint64_t addr, uint64_t ip, uint8_t cache_hit, uint8_t type, uint32_t metadata_in, uint64_t instr_id, uint64_t curr_cycle)
 9 | {
10 |   assert(type != PREFETCH); // The instr_id is currently set to 0 for all prefetches
11 |   if(instr_id == 0)  return metadata_in; //The instr_id is also set to 0 for I-cache misses and TLB misses, and we do not want to capture these right now
12 | 
13 |   cout << instr_id << ", " << curr_cycle << ", " << hex << addr << ", " << ip << dec << ", " << (int)cache_hit << endl;
14 |   return metadata_in;
15 | }
16 | 
// Fill hook: nothing is logged on fills; metadata_in is returned unchanged.
uint32_t CACHE::llc_prefetcher_cache_fill(uint64_t addr, uint32_t set, uint32_t way, uint8_t prefetch, uint64_t evicted_addr, uint32_t metadata_in)
{
  return metadata_in;
}
21 | 
// No-op: there are no statistics to report.
void CACHE::llc_prefetcher_final_stats()
{

}
26 | 


--------------------------------------------------------------------------------
/replacement/base_replacement.cc:
--------------------------------------------------------------------------------
  1 | #include "cache.h"
  2 | 
// Choose the way to replace in `set`.
// Simply forwards every argument to lru_victim() and returns its result.
uint32_t CACHE::find_victim(uint32_t cpu, uint64_t instr_id, uint32_t set, const BLOCK *current_set, uint64_t ip, uint64_t full_addr, uint32_t type)
{
    // baseline LRU replacement policy for other caches 
    return lru_victim(cpu, instr_id, set, current_set, ip, full_addr, type); 
}
  8 | 
  9 | void CACHE::update_replacement_state(uint32_t cpu, uint32_t set, uint32_t way, uint64_t full_addr, uint64_t ip, uint64_t victim_addr, uint32_t type, uint8_t hit)
 10 | {
 11 |     if (type == WRITEBACK) {
 12 |         if (hit) // wrietback hit does not update LRU state
 13 |             return;
 14 |     }
 15 | 
 16 |     return lru_update(set, way);
 17 | }
 18 | 
 19 | uint32_t CACHE::lru_victim(uint32_t cpu, uint64_t instr_id, uint32_t set, const BLOCK *current_set, uint64_t ip, uint64_t full_addr, uint32_t type)
 20 | {
 21 |     uint32_t way = 0;
 22 |     // returns the way to evict from `set`: the first invalid way, otherwise the way whose lru value is NUM_WAY-1
 23 |     // pass 1: prefer a way that does not hold a valid line
 24 |     for (way=0; way<NUM_WAY; way++) {
 25 |         if (block[set][way].valid == false) {
 26 | 
 27 |             DP ( if (warmup_complete[cpu]) {
 28 |             cout << "[" << NAME << "] " << __func__ << " instr_id: " << instr_id << " invalid set: " << set << " way: " << way;
 29 |             cout << hex << " address: " << (full_addr>>LOG2_BLOCK_SIZE) << " victim address: " << block[set][way].address << " data: " << block[set][way].data;
 30 |             cout << dec << " lru: " << block[set][way].lru << endl; });
 31 | 
 32 |             break;
 33 |         }
 34 |     }
 35 | 
 36 |     // pass 2: every way is valid (way ran to NUM_WAY), so take the line at the LRU position
 37 |     if (way == NUM_WAY) {
 38 |         for (way=0; way<NUM_WAY; way++) {
 39 |             if (block[set][way].lru == NUM_WAY-1) {
 40 | 
 41 |                 DP ( if (warmup_complete[cpu]) {
 42 |                 cout << "[" << NAME << "] " << __func__ << " instr_id: " << instr_id << " replace set: " << set << " way: " << way;
 43 |                 cout << hex << " address: " << (full_addr>>LOG2_BLOCK_SIZE) << " victim address: " << block[set][way].address << " data: " << block[set][way].data;
 44 |                 cout << dec << " lru: " << block[set][way].lru << endl; });
 45 | 
 46 |                 break;
 47 |             }
 48 |         }
 49 |     }
 50 |     // neither pass selected a way: report the set and abort
 51 |     if (way == NUM_WAY) {
 52 |         cerr << "[" << NAME << "] " << __func__ << " no victim! set: " << set << endl;
 53 |         assert(0);
 54 |     }
 55 | 
 56 |     return way;
 57 | }
 58 | 
 59 | void CACHE::lru_update(uint32_t set, uint32_t way)
 60 | {
 61 |     // age every line that was used more recently than the touched one
 62 |     for (uint32_t other=0; other != NUM_WAY; ++other) {
 63 |         if (block[set][other].lru >= block[set][way].lru)
 64 |             continue; // same age or older (including the touched line itself): unchanged
 65 |         block[set][other].lru++;
 66 |     }
 67 |     block[set][way].lru = 0; // the touched line becomes the MRU entry
 68 | }
 69 | 
 70 | void CACHE::replacement_final_stats()
 71 | {
 72 |     // empty body: the baseline replacement code prints no statistics here
 73 | }
 74 | 
 75 | #ifdef NO_CRC2_COMPILE
 76 | void InitReplacementState()
 77 | {
 78 |     // stub (compiled only when NO_CRC2_COMPILE is defined): no state is set up
 79 | }
 80 | 
 81 | uint32_t GetVictimInSet (uint32_t cpu, uint32_t set, const BLOCK *current_set, uint64_t PC, uint64_t paddr, uint32_t type)
 82 | {
 83 |     return 0; // stub: always names way 0
 84 | }
 85 | 
 86 | void UpdateReplacementState (uint32_t cpu, uint32_t set, uint32_t way, uint64_t paddr, uint64_t PC, uint64_t victim_addr, uint32_t type, uint8_t hit)
 87 | {
 88 |     // stub: nothing is updated
 89 | }
 90 | 
 91 | void PrintStats_Heartbeat()
 92 | {
 93 |     // stub: prints nothing
 94 | }
 95 | 
 96 | void PrintStats()
 97 | {
 98 |     // stub: prints nothing
 99 | }
100 | #endif
101 | 


--------------------------------------------------------------------------------
/replacement/drrip.llc_repl:
--------------------------------------------------------------------------------
  1 | #include "cache.h"
  2 | 
  3 | #define maxRRPV 3 // highest RRPV; lines at this value are the eviction candidates
  4 | #define NUM_POLICY 2 // leader policies per core: 0 = BIP, 1 = SRRIP
  5 | #define SDM_SIZE 32 // leader sets dedicated to each policy, per core
  6 | #define TOTAL_SDM_SETS (NUM_CPUS*NUM_POLICY*SDM_SIZE)
  7 | #define BIP_MAX 32 // bip_counter wraps here: one in BIP_MAX BIP fills is inserted at maxRRPV-1
  8 | #define PSEL_WIDTH 10 // width in bits of each PSEL counter
  9 | #define PSEL_MAX ((1<<PSEL_WIDTH)-1)
 10 | #define PSEL_THRS (PSEL_MAX/2) // follower sets use BIP while PSEL is above this value
 11 | // per-line RRPV values, the BIP fill counter shared by all sets, and one PSEL counter per core
 12 | uint32_t rrpv[LLC_SET][LLC_WAY],
 13 |          bip_counter = 0,
 14 |          PSEL[NUM_CPUS];
 15 | unsigned rand_sets[TOTAL_SDM_SETS]; // LLC set indices of the leader sets, grouped by core and then by policy
 16 | 
 17 | void CACHE::llc_initialize_replacement()
 18 | {
 19 |     cout << "Initialize DRRIP state" << endl;
 20 |     // every line starts at maxRRPV
 21 |     for(int i=0; i<LLC_SET; i++) {
 22 |         for(int j=0; j<LLC_WAY; j++)
 23 |             rrpv[i][j] = maxRRPV;
 24 |     }
 25 | 
 26 |     // pick TOTAL_SDM_SETS distinct LLC sets as leader sets, using the local LCG below (it always starts from rand_seed = 1)
 27 |     srand(time(NULL)); // seeds rand(); the selection below never calls rand()
 28 |     unsigned long rand_seed = 1;
 29 |     unsigned long max_rand = 1048576;
 30 |     uint32_t my_set = LLC_SET;
 31 |     int do_again = 0;
 32 |     for (int i=0; i<TOTAL_SDM_SETS; i++) {
 33 |         do { // redraw until the candidate differs from every earlier pick
 34 |             do_again = 0;
 35 |             rand_seed = rand_seed * 1103515245 + 12345;
 36 |             rand_sets[i] = ((unsigned) ((rand_seed/65536) % max_rand)) % my_set;
 37 |             printf("Assign rand_sets[%d]: %u  LLC: %u\n", i, rand_sets[i], my_set);
 38 |             for (int j=0; j<i; j++) {
 39 |                 if (rand_sets[i] == rand_sets[j]) {
 40 |                     do_again = 1;
 41 |                     break;
 42 |                 }
 43 |             }
 44 |         } while (do_again); // NOTE(review): cannot terminate if TOTAL_SDM_SETS exceeds LLC_SET
 45 |         printf("rand_sets[%d]: %d\n", i, rand_sets[i]);
 46 |     }
 47 |     // every core's PSEL counter starts at 0
 48 |     for (int i=0; i<NUM_CPUS; i++)
 49 |         PSEL[i] = 0;
 50 | }
 51 | 
 52 | int is_it_leader(uint32_t cpu, uint32_t set)
 53 | {
 54 |     // each core owns one contiguous slice of rand_sets: SDM_SIZE entries per policy
 55 |     const uint32_t base = cpu * NUM_POLICY * SDM_SIZE;
 56 | 
 57 |     for (uint32_t offset = 0; offset < NUM_POLICY * SDM_SIZE; offset++)
 58 |         if (set == rand_sets[base + offset])
 59 |             return (offset / SDM_SIZE); // index of the policy this set leads
 60 | 
 61 |     return -1; // not a leader set of this core
 62 | }
 63 | 
 64 | // called on every cache hit and cache fill
 65 | void CACHE::llc_update_replacement_state(uint32_t cpu, uint32_t set, uint32_t way, uint64_t full_addr, uint64_t ip, uint64_t victim_addr, uint32_t type, uint8_t hit)
 66 | {
 67 |     // writebacks bypass the policy logic below: hit or fill, the line is set to maxRRPV-1
 68 |     if (type == WRITEBACK) {
 69 |         rrpv[set][way] = maxRRPV-1;
 70 |         return;
 71 |     }
 72 | 
 73 | 	// cache hit
 74 | 	if (hit) { 
 75 | 		rrpv[set][way] = 0; // for cache hit, DRRIP always promotes a cache line to the MRU position
 76 | 		return;
 77 | 	}
 78 | 
 79 | 	// cache miss
 80 |     int leader = is_it_leader(cpu, set);
 81 |     // leader is -1 for a follower set, 0 for a BIP leader set, 1 for an SRRIP leader set
 82 |     if (leader == -1) { // follower sets
 83 |         if (PSEL[cpu] > PSEL_THRS) { // follow BIP
 84 |             rrpv[set][way] = maxRRPV;
 85 |             // bip_counter is one counter shared by every set and core; when it wraps to 0 the fill gets maxRRPV-1 instead
 86 |             bip_counter++;
 87 |             if (bip_counter == BIP_MAX)
 88 |                 bip_counter = 0;
 89 |             if (bip_counter == 0)
 90 |                 rrpv[set][way] = maxRRPV-1;
 91 |         } else // follow SRRIP
 92 |             rrpv[set][way] = maxRRPV-1;
 93 | 
 94 |     } else if (leader == 0) { // leader 0: BIP
 95 |         if (PSEL[cpu] > 0) PSEL[cpu]--; // a miss in a BIP leader set lowers PSEL (saturating at 0)
 96 |         rrpv[set][way] = maxRRPV;
 97 | 
 98 |         bip_counter++;
 99 |         if (bip_counter == BIP_MAX) bip_counter = 0;
100 |         if (bip_counter == 0) rrpv[set][way] = maxRRPV-1;
101 | 
102 | 	} else if (leader == 1) { // leader 1: SRRIP 
103 |         if (PSEL[cpu] < PSEL_MAX) PSEL[cpu]++; // a miss in an SRRIP leader set raises PSEL (saturating at PSEL_MAX)
104 |         rrpv[set][way] = maxRRPV-1;
105 | 
106 |     } else // WE SHOULD NOT REACH HERE
107 |         assert(0);
108 | }
109 | 
110 | // choose the way to evict from an LLC set
111 | uint32_t CACHE::llc_find_victim(uint32_t cpu, uint64_t instr_id, uint32_t set, const BLOCK *current_set, uint64_t ip, uint64_t full_addr, uint32_t type)
112 | {
113 |     // age the whole set until some way reaches maxRRPV, then evict the lowest such way
114 |     for (;;)
115 |     {
116 |         for (uint32_t way=0; way<LLC_WAY; way++) {
117 |             if (rrpv[set][way] == maxRRPV)
118 |                 return way;
119 |         }
120 |         for (uint32_t way=0; way<LLC_WAY; way++)
121 |             ++rrpv[set][way];
122 |     }
123 | 
124 |     // not reached: the loop above only exits by returning
125 |     assert(0);
126 |     return 0;
127 | }
128 | 
129 | // use this function to print out your own stats at the end of simulation
130 | void CACHE::llc_replacement_final_stats()
131 | {
132 |     // empty body: DRRIP prints no statistics here
133 | }
134 | 


--------------------------------------------------------------------------------
/replacement/lru.llc_repl:
--------------------------------------------------------------------------------
 1 | #include "cache.h"
 2 | 
 3 | // initialize replacement state
 4 | void CACHE::llc_initialize_replacement()
 5 | {
 6 |     // empty body: this file sets up no additional replacement state
 7 | }
 8 | 
 9 | // choose the way to evict from an LLC set
10 | uint32_t CACHE::llc_find_victim(uint32_t cpu, uint64_t instr_id, uint32_t set, const BLOCK *current_set, uint64_t ip, uint64_t full_addr, uint32_t type)
11 | {
12 |     const uint32_t victim_way = lru_victim(cpu, instr_id, set, current_set, ip, full_addr, type); // baseline LRU choice
13 |     return victim_way;
14 | }
15 | 
16 | // invoked for every LLC hit and every LLC fill
17 | void CACHE::llc_update_replacement_state(uint32_t cpu, uint32_t set, uint32_t way, uint64_t full_addr, uint64_t ip, uint64_t victim_addr, uint32_t type, uint8_t hit)
18 | {
19 |     // label the access; only the optional trace further down consumes the label
20 |     string TYPE_NAME;
21 |     if (type == LOAD)
22 |         TYPE_NAME = "LOAD";
23 |     else if (type == RFO)
24 |         TYPE_NAME = "RFO";
25 |     else if (type == PREFETCH)
26 |         TYPE_NAME = "PF";
27 |     else if (type == WRITEBACK)
28 |         TYPE_NAME = "WB";
29 |     else
30 |         assert(0); // any other access type is rejected
31 | 
32 |     TYPE_NAME += hit ? "_HIT" : "_MISS";
33 | 
34 |     // a writeback is expected to arrive without an instruction pointer
35 |     if (ip && (type == WRITEBACK))
36 |         assert(0);
37 | 
38 |     // uncomment the next two lines to trace the LLC accesses
39 |     // cout << "CPU: " << cpu << "  LLC " << setw(9) << TYPE_NAME << " set: " << setw(5) << set << " way: " << setw(2) << way;
40 |     // cout << hex << " paddr: " << setw(12) << full_addr << " ip: " << setw(8) << ip << " victim_addr: " << victim_addr << dec << endl;
41 | 
42 |     // baseline LRU: everything except a writeback hit promotes the line
43 |     const bool writeback_hit = hit && (type == WRITEBACK);
44 |     if (writeback_hit)
45 |         return;
46 | 
47 |     lru_update(set, way);
48 | }
49 | 
50 | void CACHE::llc_replacement_final_stats()
51 | {
52 |     // empty body: the LRU policy prints no statistics here
53 | }
54 | 


--------------------------------------------------------------------------------
/replacement/ship.llc_repl:
--------------------------------------------------------------------------------
  1 | #include "cache.h"
  2 | #include <cstdlib>
  3 | #include <ctime>
  4 | 
  5 | #define maxRRPV 3 // highest RRPV; lines at this value are the eviction candidates
  6 | #define SHCT_SIZE  16384 // entries in each core's SHCT
  7 | #define SHCT_PRIME 16381 // modulus applied to an ip to obtain its SHCT index
  8 | #define SAMPLER_SET (256*NUM_CPUS) // number of LLC sets tracked by the sampler
  9 | #define SAMPLER_WAY LLC_WAY
 10 | #define SHCT_MAX 7 // saturation value of an SHCT counter
 11 | 
 12 | uint32_t rrpv[LLC_SET][LLC_WAY];
 13 | 
 14 | // one way of a sampler set
 15 | class SAMPLER_class
 16 | {
 17 |   public:
 18 |     uint8_t valid, // non-zero once the entry holds a tag
 19 |             type,  // access type recorded by the latest sampler update
 20 |             used;  // set to 1 when the entry is hit again after insertion
 21 | 
 22 |     uint64_t tag, cl_addr, ip;
 23 | 
 24 |     uint32_t lru; // position in the sampler set's LRU order (0 = most recent)
 25 | 
 26 |     // every field starts out as zero
 27 |     SAMPLER_class()
 28 |         : valid(0),
 29 |           type(0),
 30 |           used(0),
 31 |           tag(0),
 32 |           cl_addr(0),
 33 |           ip(0),
 34 |           lru(0)
 35 |     {
 36 |     }
 37 | };
 38 | 
 39 | // sampler: rand_sets[i] is the LLC set index tracked by sampler set i (sampler[i])
 40 | uint32_t rand_sets[SAMPLER_SET];
 41 | SAMPLER_class sampler[SAMPLER_SET][SAMPLER_WAY];
 42 | 
 43 | // one entry of the prediction table
 44 | class SHCT_class {
 45 |   public:
 46 |     uint32_t counter; // saturating counter, moved only between 0 and SHCT_MAX
 47 | 
 48 |     SHCT_class() : counter(0) // counters start at zero
 49 |     {
 50 |     }
 51 | };
 52 | SHCT_class SHCT[NUM_CPUS][SHCT_SIZE];
 53 | 
 54 | // initialize replacement state
 55 | void CACHE::llc_initialize_replacement()
 56 | {
 57 |     cout << "Initialize SHIP state" << endl;
 58 |     // every line starts at maxRRPV
 59 |     for (int i=0; i<LLC_SET; i++) {
 60 |         for (int j=0; j<LLC_WAY; j++) {
 61 |             rrpv[i][j] = maxRRPV;
 62 |         }
 63 |     }
 64 | 
 65 |     // initialize sampler: way j of every sampler set starts at LRU position j
 66 |     for (int i=0; i<SAMPLER_SET; i++) {
 67 |         for (int j=0; j<SAMPLER_WAY; j++) {
 68 |             sampler[i][j].lru = j;
 69 |         }
 70 |     }
 71 | 
 72 |     // pick SAMPLER_SET distinct LLC sets to sample, using the local LCG below (it always starts from rand_seed = 1)
 73 |     srand(time(NULL)); // seeds rand(); the selection below never calls rand()
 74 |     unsigned long rand_seed = 1;
 75 |     unsigned long max_rand = 1048576;
 76 |     uint32_t my_set = LLC_SET;
 77 |     int do_again = 0;
 78 |     for (int i=0; i<SAMPLER_SET; i++)
 79 |     {
 80 |         do // redraw until the candidate differs from every earlier pick
 81 |         {
 82 |             do_again = 0;
 83 |             rand_seed = rand_seed * 1103515245 + 12345;
 84 |             rand_sets[i] = ((unsigned) ((rand_seed/65536) % max_rand)) % my_set;
 85 |             printf("Assign rand_sets[%d]: %u  LLC: %u\n", i, rand_sets[i], my_set);
 86 |             for (int j=0; j<i; j++) 
 87 |             {
 88 |                 if (rand_sets[i] == rand_sets[j]) 
 89 |                 {
 90 |                     do_again = 1;
 91 |                     break;
 92 |                 }
 93 |             }
 94 |         } while (do_again); // NOTE(review): cannot terminate if SAMPLER_SET exceeds LLC_SET
 95 |         printf("rand_sets[%d]: %d\n", i, rand_sets[i]);
 96 |     }
 97 | }
 98 | 
 99 | // map an LLC set to its sampler index; SAMPLER_SET is returned when the set is not sampled
100 | uint32_t is_it_sampled(uint32_t set)
101 | {
102 |     int idx = 0;
103 |     while ((idx < SAMPLER_SET) && (rand_sets[idx] != set))
104 |         idx++;
105 | 
106 |     return idx;
107 | }
108 | 
109 | // record one access in sampler set s_idx and train the SHCT counters of core `cpu` from the outcome
110 | void update_sampler(uint32_t cpu, uint32_t s_idx, uint64_t address, uint64_t ip, uint8_t type)
111 | {
112 |     SAMPLER_class *s_set = sampler[s_idx];
113 |     uint64_t tag = address / (64*LLC_SET); // NOTE(review): the 64 presumably is the cache block size in bytes - confirm
114 |     int match = -1;
115 | 
116 |     // step 1 - sampler hit: the ip stored with the entry has its SHCT counter decremented (not below 0)
117 |     for (match=0; match<SAMPLER_WAY; match++)
118 |     {
119 |         if (s_set[match].valid && (s_set[match].tag == tag))
120 |         {
121 |             uint32_t SHCT_idx = s_set[match].ip % SHCT_PRIME;
122 |             if (SHCT[cpu][SHCT_idx].counter > 0)
123 |                 SHCT[cpu][SHCT_idx].counter--;
124 | 
125 |             /*
126 |             if (draw_transition)
127 |                 printf("cycle: %lu SHCT: %d ip: 0x%llX SAMPLER_HIT cl_addr: 0x%llX page: 0x%llX block: %ld set: %d\n", 
128 |                 ooo_cpu[cpu].current_cycle, SHCT[cpu][SHCT_idx].dead, s_set[match].ip, address>>6, address>>12, (address>>6) & 0x3F, s_idx);
129 |             */
130 | 
131 |             //s_set[match].ip = ip; // SHIP does not update ip on sampler hit
132 |             s_set[match].type = type; 
133 |             s_set[match].used = 1;
134 |             //D(printf("sampler hit  cpu: %d  set: %d  way: %d  tag: %x  ip: %lx  type: %d  lru: %d\n",
135 |             //            cpu, rand_sets[s_idx], match, tag, ip, type, s_set[match].lru));
136 | 
137 |             break;
138 |         }
139 |     }
140 | 
141 |     // step 2 - no hit (match ran to SAMPLER_WAY): fill the first invalid way, if there is one
142 |     if (match == SAMPLER_WAY)
143 |     {
144 |         for (match=0; match<SAMPLER_WAY; match++)
145 |         {
146 |             if (s_set[match].valid == 0)
147 |             {
148 |                 s_set[match].valid = 1;
149 |                 s_set[match].tag = tag;
150 |                 s_set[match].ip = ip;
151 |                 s_set[match].type = type;
152 |                 s_set[match].used = 0;
153 | 
154 |                 //D(printf("sampler invalid  cpu: %d  set: %d  way: %d  tag: %x  ip: %lx  type: %d  lru: %d\n",
155 |                 //            cpu, rand_sets[s_idx], match, tag, ip, type, s_set[match].lru));
156 |                 break;
157 |             }
158 |         }
159 |     }
160 | 
161 |     // step 3 - no hit and no invalid way: replace the entry at the LRU position
162 |     if (match == SAMPLER_WAY)
163 |     {
164 |         for (match=0; match<SAMPLER_WAY; match++)
165 |         {
166 |             if (s_set[match].lru == (SAMPLER_WAY-1)) // Sampler uses LRU replacement
167 |             {
168 |                 if (s_set[match].used == 0) // evicted without ever being hit: raise its ip's counter (up to SHCT_MAX)
169 |                 {
170 |                     uint32_t SHCT_idx = s_set[match].ip % SHCT_PRIME;
171 |                     if (SHCT[cpu][SHCT_idx].counter < SHCT_MAX)
172 |                         SHCT[cpu][SHCT_idx].counter++;
173 | 
174 |                     /*
175 |                     if (draw_transition)
176 |                         printf("cycle: %lu SHCT: %d ip: 0x%llX SAMPLER_MISS cl_addr: 0x%llX page: 0x%llX block: %ld set: %d\n", 
177 |                         ooo_cpu[cpu].current_cycle, SHCT[cpu][SHCT_idx].dead, s_set[match].ip, address>>6, address>>12, (address>>6) & 0x3F, s_idx);
178 |                     */
179 |                 }
180 | 
181 |                 s_set[match].tag = tag;
182 |                 s_set[match].ip = ip;
183 |                 s_set[match].type = type;
184 |                 s_set[match].used = 0;
185 | 
186 |                 //D(printf("sampler miss  cpu: %d  set: %d  way: %d  tag: %x  ip: %lx  type: %d  lru: %d\n",
187 |                 //            cpu, rand_sets[s_idx], match, tag, ip, type, s_set[match].lru));
188 |                 break;
189 |             }
190 |         }
191 |     }
192 | 
193 |     // move the selected way to the MRU position; this indexes s_set[match], so one of the steps above must have picked a way
194 |     uint32_t curr_position = s_set[match].lru;
195 |     for (int i=0; i<SAMPLER_WAY; i++)
196 |     {
197 |         if (s_set[i].lru < curr_position)
198 |             s_set[i].lru++;
199 |     }
200 |     s_set[match].lru = 0;
201 | }
202 | 
203 | // choose the way to evict from an LLC set
204 | uint32_t CACHE::llc_find_victim(uint32_t cpu, uint64_t instr_id, uint32_t set, const BLOCK *current_set, uint64_t ip, uint64_t full_addr, uint32_t type)
205 | {
206 |     // age the whole set until some way reaches maxRRPV, then evict the lowest such way
207 |     for (;;)
208 |     {
209 |         for (uint32_t way=0; way<LLC_WAY; way++) {
210 |             if (rrpv[set][way] == maxRRPV)
211 |                 return way;
212 |         }
213 |         for (uint32_t way=0; way<LLC_WAY; way++)
214 |             ++rrpv[set][way];
215 |     }
216 | 
217 |     // not reached: the loop above only exits by returning
218 |     assert(0);
219 |     return 0;
220 | }
221 | 
222 | // called on every cache hit and cache fill
223 | void CACHE::llc_update_replacement_state(uint32_t cpu, uint32_t set, uint32_t way, uint64_t full_addr, uint64_t ip, uint64_t victim_addr, uint32_t type, uint8_t hit)
224 | {
225 |     string TYPE_NAME; // access label; only the commented-out trace below uses it
226 |     if (type == LOAD)
227 |         TYPE_NAME = "LOAD";
228 |     else if (type == RFO)
229 |         TYPE_NAME = "RFO";
230 |     else if (type == PREFETCH)
231 |         TYPE_NAME = "PF";
232 |     else if (type == WRITEBACK)
233 |         TYPE_NAME = "WB";
234 |     else
235 |         assert(0);
236 | 
237 |     if (hit)
238 |         TYPE_NAME += "_HIT";
239 |     else
240 |         TYPE_NAME += "_MISS";
241 | 
242 |     if ((type == WRITEBACK) && ip) // a writeback is expected to arrive without an instruction pointer
243 |         assert(0);
244 | 
245 |     //cout << "CPU: " << cpu << "  LLC " << setw(9) << TYPE_NAME << " set: " << setw(5) << set << " way: " << setw(2) << way;
246 |     //cout << hex << " paddr: " << setw(12) << full_addr << " ip: " << setw(8) << ip << " victim_addr: " << victim_addr << dec << endl;
247 |     
248 |     // handle writeback access: a hit changes nothing, a fill is inserted at maxRRPV-1; neither touches the sampler
249 |     if (type == WRITEBACK) {
250 |         if (hit)
251 |             return;
252 |         else {
253 |             rrpv[set][way] = maxRRPV-1;
254 |             return;
255 |         }
256 |     }
257 | 
258 |     // update sampler (only when this LLC set is one of the sampled sets)
259 |     uint32_t s_idx = is_it_sampled(set);
260 |     if (s_idx < SAMPLER_SET)
261 |         update_sampler(cpu, s_idx, full_addr, ip, type);
262 | 
263 |     if (hit)
264 |         rrpv[set][way] = 0;
265 |     else {
266 |         // SHIP prediction
267 |         uint32_t SHCT_idx = ip % SHCT_PRIME;
268 | 
269 |         // sanity check
270 |         if (SHCT_idx >= SHCT_PRIME)
271 |             assert(0);
272 | 
273 |         rrpv[set][way] = maxRRPV-1;
274 |         if (SHCT[cpu][SHCT_idx].counter == SHCT_MAX) // saturated counter for this ip: insert at maxRRPV instead
275 |             rrpv[set][way] = maxRRPV;
276 |     }
277 | }
278 | 
279 | // use this function to print out your own stats at the end of simulation
280 | void CACHE::llc_replacement_final_stats()
281 | {
282 |     // empty body: SHIP prints no statistics here
283 | }
284 | 


--------------------------------------------------------------------------------
/replacement/srrip.llc_repl:
--------------------------------------------------------------------------------
 1 | #include "cache.h"
 2 | 
 3 | #define maxRRPV 3 // highest RRPV; lines at this value are the eviction candidates
 4 | uint32_t rrpv[LLC_SET][LLC_WAY]; // one RRPV per LLC line, indexed [set][way]
 5 | 
 6 | // reset the SRRIP replacement state
 7 | void CACHE::llc_initialize_replacement()
 8 | {
 9 |     cout << "Initialize SRRIP state" << endl;
10 | 
11 |     // every line of every set starts at maxRRPV
12 |     for (auto &set_rrpv : rrpv) {
13 |         for (auto &line_rrpv : set_rrpv)
14 |             line_rrpv = maxRRPV;
15 |     }
16 | }
17 | 
18 | // choose the way to evict from an LLC set
19 | uint32_t CACHE::llc_find_victim(uint32_t cpu, uint64_t instr_id, uint32_t set, const BLOCK *current_set, uint64_t ip, uint64_t full_addr, uint32_t type)
20 | {
21 |     // age the whole set until some way reaches maxRRPV, then evict the lowest such way
22 |     for (;;)
23 |     {
24 |         for (uint32_t way=0; way<LLC_WAY; way++) {
25 |             if (rrpv[set][way] == maxRRPV)
26 |                 return way;
27 |         }
28 |         for (uint32_t way=0; way<LLC_WAY; way++)
29 |             ++rrpv[set][way];
30 |     }
31 | 
32 |     // not reached: the loop above only exits by returning
33 |     assert(0);
34 |     return 0;
35 | }
36 | 
37 | // invoked for every LLC hit and every LLC fill
38 | void CACHE::llc_update_replacement_state(uint32_t cpu, uint32_t set, uint32_t way, uint64_t full_addr, uint64_t ip, uint64_t victim_addr, uint32_t type, uint8_t hit)
39 | {
40 |     // label the access; only the optional trace further down consumes the label
41 |     string TYPE_NAME;
42 |     if (type == LOAD)
43 |         TYPE_NAME = "LOAD";
44 |     else if (type == RFO)
45 |         TYPE_NAME = "RFO";
46 |     else if (type == PREFETCH)
47 |         TYPE_NAME = "PF";
48 |     else if (type == WRITEBACK)
49 |         TYPE_NAME = "WB";
50 |     else
51 |         assert(0); // any other access type is rejected
52 | 
53 |     TYPE_NAME += hit ? "_HIT" : "_MISS";
54 | 
55 |     // a writeback is expected to arrive without an instruction pointer
56 |     if (ip && (type == WRITEBACK))
57 |         assert(0);
58 | 
59 |     // uncomment the next two lines to trace the LLC accesses
60 |     // cout << "CPU: " << cpu << "  LLC " << setw(9) << TYPE_NAME << " set: " << setw(5) << set << " way: " << setw(2) << way;
61 |     // cout << hex << " paddr: " << setw(12) << full_addr << " ip: " << setw(8) << ip << " victim_addr: " << victim_addr << dec << endl;
62 | 
63 |     // SRRIP: a fill is inserted at maxRRPV-1, a hit resets the line to 0
64 |     if (!hit)
65 |         rrpv[set][way] = maxRRPV-1;
66 |     else
67 |         rrpv[set][way] = 0;
68 | }
69 | 
70 | // use this function to print out your own stats at the end of simulation
71 | void CACHE::llc_replacement_final_stats()
72 | {
73 |     // empty body: SRRIP prints no statistics here
74 | }
75 | 


--------------------------------------------------------------------------------
/run_4core.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Run one 4-core ChampSim mix from $PWD and store its output under results_4core_<N_SIM>M/.
 3 | if [ "$#" -lt 8 ] || [ "$#" -gt 9 ]; then
 4 |     echo "Illegal number of parameters"
 5 |     echo "Usage: ./run_4core.sh [BINARY] [N_WARM] [N_SIM] [N_MIX] [TRACE0] [TRACE1] [TRACE2] [TRACE3] [OPTION]"
 6 |     exit 1
 7 | fi
 8 | 
 9 | TRACE_DIR=$PWD/dpc3_traces
10 | BINARY=${1}
11 | N_WARM=${2}
12 | N_SIM=${3}
13 | N_MIX=${4}
14 | TRACE0=${5}
15 | TRACE1=${6}
16 | TRACE2=${7}
17 | TRACE3=${8}
18 | OPTION=${9}
19 | 
20 | # Sanity check
21 | if [ -z "$TRACE_DIR" ] || [ ! -d "$TRACE_DIR" ] ; then
22 |     echo "[ERROR] Cannot find a trace directory: $TRACE_DIR"
23 |     exit 1
24 | fi
25 | 
26 | if [ ! -f "bin/$BINARY" ] ; then
27 |     echo "[ERROR] Cannot find a ChampSim binary: bin/$BINARY"
28 |     exit 1
29 | fi
30 | 
31 | re='^[0-9]+$'
32 | if ! [[ $N_WARM =~ $re ]] || [ -z "$N_WARM" ] ; then
33 |     echo "[ERROR]: Number of warmup instructions is NOT a number" >&2;
34 |     exit 1
35 | fi
36 | 
37 | # N_WARM and N_SIM are counts in millions: "000000" is appended to both on the command line below
38 | if ! [[ $N_SIM =~ $re ]] || [ -z "$N_SIM" ] ; then
39 |     echo "[ERROR]: Number of simulation instructions is NOT a number" >&2;
40 |     exit 1
41 | fi
42 | 
43 | if [ ! -f "$TRACE_DIR/$TRACE0" ] ; then
44 |     echo "[ERROR] Cannot find a trace0 file: $TRACE_DIR/$TRACE0"
45 |     exit 1
46 | fi
47 | 
48 | if [ ! -f "$TRACE_DIR/$TRACE1" ] ; then
49 |     echo "[ERROR] Cannot find a trace1 file: $TRACE_DIR/$TRACE1"
50 |     exit 1
51 | fi
52 | 
53 | if [ ! -f "$TRACE_DIR/$TRACE2" ] ; then
54 |     echo "[ERROR] Cannot find a trace2 file: $TRACE_DIR/$TRACE2"
55 |     exit 1
56 | fi
57 | 
58 | if [ ! -f "$TRACE_DIR/$TRACE3" ] ; then
59 |     echo "[ERROR] Cannot find a trace3 file: $TRACE_DIR/$TRACE3"
60 |     exit 1
61 | fi
62 | # ${OPTION} stays unquoted in the command so it can expand to several arguments or to none; the output path is quoted
63 | mkdir -p "results_4core_${N_SIM}M"
64 | (./bin/"${BINARY}" -warmup_instructions "${N_WARM}000000" -simulation_instructions "${N_SIM}000000" ${OPTION} -traces "${TRACE_DIR}/${TRACE0}" "${TRACE_DIR}/${TRACE1}" "${TRACE_DIR}/${TRACE2}" "${TRACE_DIR}/${TRACE3}") &> "results_4core_${N_SIM}M/mix${N_MIX}-${BINARY}${OPTION}.txt"
65 | 


--------------------------------------------------------------------------------
/run_champsim.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Run one single-trace ChampSim simulation from $PWD and store its output under results_<N_SIM>M/.
 3 | if [ "$#" -lt 4 ]; then
 4 |     echo "Illegal number of parameters"
 5 |     echo "Usage: ./run_champsim.sh [BINARY] [N_WARM] [N_SIM] [TRACE] [OPTION]"
 6 |     exit 1
 7 | fi
 8 | 
 9 | TRACE_DIR=$PWD/dpc3_traces
10 | BINARY=${1}
11 | N_WARM=${2}
12 | N_SIM=${3}
13 | TRACE=${4}
14 | OPTION=${5}
15 | 
16 | # Sanity check
17 | if [ -z "$TRACE_DIR" ] || [ ! -d "$TRACE_DIR" ] ; then
18 |     echo "[ERROR] Cannot find a trace directory: $TRACE_DIR"
19 |     exit 1
20 | fi
21 | 
22 | if [ ! -f "bin/$BINARY" ] ; then
23 |     echo "[ERROR] Cannot find a ChampSim binary: bin/$BINARY"
24 |     exit 1
25 | fi
26 | 
27 | re='^[0-9]+$'
28 | if ! [[ $N_WARM =~ $re ]] || [ -z "$N_WARM" ] ; then
29 |     echo "[ERROR]: Number of warmup instructions is NOT a number" >&2;
30 |     exit 1
31 | fi
32 | 
33 | # N_WARM and N_SIM are counts in millions: "000000" is appended to both on the command line below
34 | if ! [[ $N_SIM =~ $re ]] || [ -z "$N_SIM" ] ; then
35 |     echo "[ERROR]: Number of simulation instructions is NOT a number" >&2;
36 |     exit 1
37 | fi
38 | 
39 | if [ ! -f "$TRACE_DIR/$TRACE" ] ; then
40 |     echo "[ERROR] Cannot find a trace file: $TRACE_DIR/$TRACE"
41 |     exit 1
42 | fi
43 | # ${OPTION} stays unquoted in the command so it can expand to several arguments or to none; the output path is quoted
44 | mkdir -p "results_${N_SIM}M"
45 | (./bin/"${BINARY}" -warmup_instructions "${N_WARM}000000" -simulation_instructions "${N_SIM}000000" ${OPTION} -traces "${TRACE_DIR}/${TRACE}") &> "results_${N_SIM}M/${TRACE}-${BINARY}${OPTION}.txt"
46 | 


--------------------------------------------------------------------------------
/scripts/download_dpc3_traces.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Run from the scripts/ directory: the list file and ../dpc3_traces are both resolved against $PWD.
3 | mkdir -p "$PWD/../dpc3_traces"
4 | while IFS= read -r LINE
5 | do
6 |     [ -z "$LINE" ] || wget -P "$PWD/../dpc3_traces" -c "http://hpca23.cse.tamu.edu/champsim-traces/speccpu/$LINE"
7 | done < dpc3_max_simpoint.txt
8 | 


--------------------------------------------------------------------------------
/scripts/dpc3_max_simpoint.txt:
--------------------------------------------------------------------------------
 1 | 600.perlbench_s-210B.champsimtrace.xz
 2 | 602.gcc_s-734B.champsimtrace.xz
 3 | 603.bwaves_s-3699B.champsimtrace.xz
 4 | 605.mcf_s-665B.champsimtrace.xz
 5 | 607.cactuBSSN_s-2421B.champsimtrace.xz
 6 | 619.lbm_s-4268B.champsimtrace.xz
 7 | 620.omnetpp_s-874B.champsimtrace.xz
 8 | 621.wrf_s-575B.champsimtrace.xz
 9 | 623.xalancbmk_s-700B.champsimtrace.xz
10 | 625.x264_s-18B.champsimtrace.xz
11 | 627.cam4_s-573B.champsimtrace.xz
12 | 628.pop2_s-17B.champsimtrace.xz
13 | 631.deepsjeng_s-928B.champsimtrace.xz
14 | 638.imagick_s-10316B.champsimtrace.xz
15 | 641.leela_s-800B.champsimtrace.xz
16 | 644.nab_s-5853B.champsimtrace.xz
17 | 648.exchange2_s-1699B.champsimtrace.xz
18 | 649.fotonik3d_s-1176B.champsimtrace.xz
19 | 654.roms_s-842B.champsimtrace.xz
20 | 657.xz_s-3167B.champsimtrace.xz
21 | 


--------------------------------------------------------------------------------
/scripts/multiworkload.cc:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <random>
 4 | #include <time.h>
 5 | 
 6 | #define NUM_MIX  100
 7 | #define NUM_CPUS 4
 8 | #define NUM_TRACE 20
 9 | 
10 | using namespace std;
11 | default_random_engine generator;
12 | 
13 | int main()
14 | {
15 |     int benchmark[NUM_MIX][NUM_CPUS];
16 |     for (int mix = 0; mix < NUM_MIX; mix++)
17 |         for (int cpu = 0; cpu < NUM_CPUS; cpu++)
18 |             benchmark[mix][cpu] = -1; // -1 marks a slot that has not been drawn yet
19 | 
20 |     // seed the engine from the current time
21 |     generator.seed(time(NULL));
22 | 
23 |     // draws are uniform over the inclusive range [1, NUM_TRACE]
24 |     const int lowest = 1;
25 |     const int highest = NUM_TRACE;
26 |     uniform_int_distribution<int> pick(lowest, highest);
27 | 
28 |     // print one line per mix, holding one distinct number per core
29 |     for (int mix = 0; mix < NUM_MIX; mix++) {
30 |         for (int cpu = 0; cpu < NUM_CPUS; cpu++) {
31 |             int candidate;
32 |             bool duplicate;
33 |             do {
34 |                 candidate = pick(generator);
35 |                 duplicate = false;
36 |                 // redraw when an earlier core of this mix already holds the number
37 |                 for (int prev = 0; prev < cpu; prev++) {
38 |                     if (benchmark[mix][prev] != candidate)
39 |                         continue;
40 |                     duplicate = true;
41 |                     break;
42 |                 }
43 |             } while (duplicate);
44 | 
45 |             benchmark[mix][cpu] = candidate;
46 |             printf("%d ", candidate);
47 |         }
48 |         printf("\n");
49 |     }
50 | 
51 |     return 0;
52 | }
53 | 


--------------------------------------------------------------------------------
/scripts/seeds.txt:
--------------------------------------------------------------------------------
  1 | 473.astar-s0 854
  2 | 473.astar-s1 801
  3 | 473.astar-s2 851
  4 | 410.bwaves-s0 1017
  5 | 410.bwaves-s1 1017
  6 | 410.bwaves-s2 922
  7 | 459.GemsFDTD-s0 1001
  8 | 459.GemsFDTD-s1 1001
  9 | 459.GemsFDTD-s2 1005
 10 | 470.lbm-s0 673
 11 | 470.lbm-s1 585
 12 | 437.leslie3d-s0 1151
 13 | 437.leslie3d-s1 1158
 14 | 437.leslie3d-s2 1059
 15 | 462.libquantum-s0 1447
 16 | 462.libquantum-s1 1459
 17 | 462.libquantum-s2 1414
 18 | 429.mcf-s0 629
 19 | 429.mcf-s1 622
 20 | 429.mcf-s2 577
 21 | 433.milc-s0 735
 22 | 433.milc-s1 739
 23 | 433.milc-s2 741
 24 | 471.omnetpp-s0 1036
 25 | 471.omnetpp-s1 1083
 26 | 471.omnetpp-s2 984
 27 | 450.soplex-s0 979
 28 | 450.soplex-s1 982
 29 | 450.soplex-s2 936
 30 | 482.sphinx3-s0 1086
 31 | 482.sphinx3-s1 1079
 32 | 482.sphinx3-s2 1041
 33 | 602.gcc-s0 272
 34 | 602.gcc-s1 270
 35 | 602.gcc-s2 275
 36 | 602.gcc-s3 224
 37 | 605.mcf-s0 267
 38 | 605.mcf-s1 273
 39 | 605.mcf-s2 273
 40 | 605.mcf-s3 273
 41 | 605.mcf-s4 223
 42 | 605.mcf-s5 226
 43 | 605.mcf-s6 227
 44 | 605.mcf-s7 227
 45 | 605.mcf-s8 232
 46 | 607.cactuBSSN-s0 267
 47 | 607.cactuBSSN-s1 279
 48 | 607.cactuBSSN-s2 266
 49 | 607.cactuBSSN-s3 276
 50 | 619.lbm-s0 279
 51 | 619.lbm-s1 280
 52 | 619.lbm-s2 280
 53 | 619.lbm-s3 278
 54 | 620.omnetpp-s0 216
 55 | 620.omnetpp-s1 229
 56 | 621.wrf-s0 227
 57 | 621.wrf-s1 280
 58 | 621.wrf-s2 277
 59 | 621.wrf-s3 267
 60 | 623.xalancbmk-s0 163
 61 | 623.xalancbmk-s1 222
 62 | 623.xalancbmk-s2 214
 63 | 623.xalancbmk-s3 220
 64 | 623.xalancbmk-s4 226
 65 | 623.xalancbmk-s5 217
 66 | 649.fotonik3d-s0 324
 67 | 649.fotonik3d-s1 273
 68 | 649.fotonik3d-s2 115
 69 | 649.fotonik3d-s3 277
 70 | 649.fotonik3d-s4 275
 71 | 654.roms-s0 266
 72 | 654.roms-s1 262
 73 | 654.roms-s2 266
 74 | 654.roms-s3 271
 75 | 654.roms-s4 269
 76 | 654.roms-s5 224
 77 | 654.roms-s6 225
 78 | 654.roms-s7 220
 79 | 654.roms-s8 224
 80 | bc-0 48
 81 | bc-12 99
 82 | bc-3 51
 83 | bc-5 53
 84 | bfs-10 97
 85 | bfs-14 101
 86 | bfs-3 51
 87 | bfs-8 56
 88 | cc-13 100
 89 | cc-14 101
 90 | cc-5 53
 91 | cc-6 54
 92 | pr-10 97
 93 | pr-14 101
 94 | pr-3 51
 95 | pr-5 53
 96 | sssp-10 97
 97 | sssp-14 101
 98 | sssp-3 51
 99 | sssp-5 53
100 | 


--------------------------------------------------------------------------------
/src/block.cc:
--------------------------------------------------------------------------------
  1 | #include "block.h"
  2 | 
  3 | // Search the occupied slots of this circular queue for an entry that targets
  4 | // the same address as *packet.
  5 | //
  6 | // The queue whose NAME is "L1D_WQ" is matched on full_addr; every other queue is
  7 | // matched on address. Slots are visited starting at head, wrapping from the end
  8 | // of the array back to index 0, and the index of the first match is returned.
  9 | // Returns -1 when the queue is empty or when no slot matches.
 10 | int PACKET_QUEUE::check_queue(PACKET *packet)
 11 | {
 12 |     if ((head == tail) && occupancy == 0)
 13 |         return -1;
 14 | 
 15 |     // Which field to compare depends only on the queue's name, so decide once
 16 |     // rather than comparing the string again for every slot.
 17 |     const bool match_full_addr = (NAME == "L1D_WQ");
 18 | 
 19 |     // head < tail: the occupied slots are [head, tail).
 20 |     // otherwise:   they are [head, SIZE) followed by [0, tail).
 21 |     const bool wrapped = !(head < tail);
 22 | 
 23 |     for (int pass = 0; pass < 2; pass++) {
 24 |         uint32_t first, limit;
 25 |         if (pass == 0) {
 26 |             first = head;
 27 |             limit = wrapped ? SIZE : tail;
 28 |         }
 29 |         else {
 30 |             first = 0;
 31 |             limit = wrapped ? tail : 0; // second range is empty when not wrapped
 32 |         }
 33 | 
 34 |         for (uint32_t i=first; i<limit; i++) {
 35 |             const bool same_target = match_full_addr ? (entry[i].full_addr == packet->full_addr)
 36 |                                                      : (entry[i].address == packet->address);
 37 |             if (!same_target)
 38 |                 continue;
 39 | 
 40 |             DP (if (warmup_complete[packet->cpu]) {
 41 |             cout << "[" << NAME << "] " << __func__ << " cpu: " << packet->cpu << " instr_id: " << packet->instr_id << " same address: " << hex << packet->address;
 42 |             cout << " full_addr: " << packet->full_addr << dec << " by instr_id: " << entry[i].instr_id << " index: " << i;
 43 |             cout << " cycle " << packet->event_cycle << endl; });
 44 |             return i;
 45 |         }
 46 |     }
 47 | 
 48 |     return -1;
 49 | }
 75 | 
 76 | // Copy *packet into the slot at tail, then count it and advance tail,
 77 | // wrapping tail back to 0 once it reaches SIZE.
 78 | void PACKET_QUEUE::add_queue(PACKET *packet)
 79 | {
 80 | #ifdef SANITY_CHECK
 81 |     // head meeting tail while entries are still counted is treated as fatal
 82 |     if ((head == tail) && (occupancy != 0))
 83 |         assert(0);
 84 | #endif
 85 | 
 86 |     // store the new entry in the tail slot
 87 |     PACKET &slot = entry[tail];
 88 |     slot = *packet;
 89 | 
 90 |     DP ( if (warmup_complete[packet->cpu]) {
 91 |     cout << "[" << NAME << "] " << __func__ << " cpu: " << packet->cpu << " instr_id: " << packet->instr_id;
 92 |     cout << " address: " << hex << entry[tail].address << " full_addr: " << entry[tail].full_addr << dec;
 93 |     cout << " head: " << head << " tail: " << tail << " occupancy: " << occupancy << " event_cycle: " << entry[tail].event_cycle << endl; });
 94 | 
 95 |     ++occupancy;
 96 |     if (++tail >= SIZE)
 97 |         tail = 0;
 98 | }
 96 | 
 97 | // Retire one entry: overwrite *packet with a default-constructed PACKET, then
 98 | // uncount it and advance head, wrapping head back to 0 once it reaches SIZE.
 99 | // NOTE(review): the slot is reached only through `packet`; presumably callers
100 | // pass the entry at head - confirm against the call sites.
101 | void PACKET_QUEUE::remove_queue(PACKET *packet)
102 | {
103 | #ifdef SANITY_CHECK
104 |     // removing from a queue with nothing counted is treated as fatal
105 |     if ((head == tail) && (occupancy == 0))
106 |         assert(0);
107 | #endif
108 | 
109 |     DP ( if (warmup_complete[packet->cpu]) {
110 |     cout << "[" << NAME << "] " << __func__ << " cpu: " << packet->cpu << " instr_id: " << packet->instr_id;
111 |     cout << " address: " << hex << packet->address << " full_addr: " << packet->full_addr << dec << " fill_level: " << packet->fill_level;
112 |     cout << " head: " << head << " tail: " << tail << " occupancy: " << occupancy << " event_cycle: " << packet->event_cycle << endl; });
113 | 
114 |     // wipe the caller's packet
115 |     PACKET blank;
116 |     *packet = blank;
117 | 
118 |     --occupancy;
119 |     if (++head >= SIZE)
120 |         head = 0;
121 | }
118 | 


--------------------------------------------------------------------------------
/src/uncore.cc:
--------------------------------------------------------------------------------
 1 | #include "uncore.h"
 2 | 
 3 | // uncore
 4 | UNCORE uncore;
 5 | 
 6 | // constructor (empty body)
 7 | UNCORE::UNCORE()
 8 | {
 9 | }
10 | 


--------------------------------------------------------------------------------
/tracer/champsim_tracer.cpp:
--------------------------------------------------------------------------------
  1 | 
  2 | /*! @file
  3 |  *  PIN tool that creates a register and memory access trace: for every traced
  4 |  *  instruction it records the instruction pointer, branch outcome, and the
  5 |  *  register and memory operands, and appends the record to a binary trace file.
  5 |  */
  6 | 
  7 | #include "pin.H"
  8 | #include <iostream>
  9 | #include <fstream>
 10 | #include <stdlib.h>
 11 | #include <string.h>
 12 | #include <string>
 13 | 
 14 | #define NUM_INSTR_DESTINATIONS 2
 15 | #define NUM_INSTR_SOURCES 4
 16 | 
 17 | using namespace std;
 18 | 
 19 | // One trace record. EndInstruction() writes this struct to the output file
 20 | // verbatim with fwrite(), so the field order, types and array sizes below
 21 | // determine the layout of each record in the trace.
 22 | struct trace_instr_format {
 23 |     unsigned long long ip;           // instruction pointer (program counter) value
 24 | 
 25 |     unsigned char is_branch;         // set to 1 by BranchOrNot() for branch instructions
 26 |     unsigned char branch_taken;      // set to 1 by BranchOrNot() when the branch is taken
 27 | 
 28 |     unsigned char destination_registers[NUM_INSTR_DESTINATIONS];  // output registers, 0 = unused slot
 29 |     unsigned char source_registers[NUM_INSTR_SOURCES];            // input registers, 0 = unused slot
 30 | 
 31 |     unsigned long long destination_memory[NUM_INSTR_DESTINATIONS]; // output memory addresses, 0 = unused slot
 32 |     unsigned long long source_memory[NUM_INSTR_SOURCES];           // input memory addresses, 0 = unused slot
 33 | };
 34 | typedef struct trace_instr_format trace_instr_format_t;
 31 | 
 32 | /* ================================================================== */
 33 | // Global variables 
 34 | /* ================================================================== */
 35 | 
 36 | UINT64 instrCount = 0; // instructions seen so far; incremented in BeginInstruction()
 37 | 
 38 | FILE* out; // trace output stream; opened in main(), closed in EndInstruction() or Fini()
 39 | 
 40 | bool output_file_closed = false; // set once `out` has been closed, so it is not closed twice
 41 | bool tracing_on = false; // updated in BeginInstruction()/EndInstruction() from instrCount and the -s/-t knobs
 42 | 
 43 | trace_instr_format_t curr_instr; // record being filled in for the instruction currently executing
 44 | 
 45 | /* ===================================================================== */
 46 | // Command line switches
 47 | /* ===================================================================== */
 48 | KNOB<string> KnobOutputFile(KNOB_MODE_WRITEONCE,  "pintool", "o", "champsim.trace", // -o: path passed to fopen() in main()
 49 |         "specify file name for Champsim tracer output");
 50 | 
 51 | KNOB<UINT64> KnobSkipInstructions(KNOB_MODE_WRITEONCE, "pintool", "s", "0", // -s: tracing starts once instrCount exceeds this
 52 |         "How many instructions to skip before tracing begins");
 53 | 
 54 | KNOB<UINT64> KnobTraceInstructions(KNOB_MODE_WRITEONCE, "pintool", "t", "1000000", // -t: tracing stops once instrCount exceeds skip + this
 55 |         "How many instructions to trace");
 56 | 
 57 | /* ===================================================================== */
 58 | // Utilities
 59 | /* ===================================================================== */
 60 | 
 61 | /*!
 62 |  *  Write the tool's usage text followed by the knob summary to stderr.
 63 |  *  @return -1 (main() returns this value when PIN_Init reports a failure)
 64 |  */
 65 | INT32 Usage()
 66 | {
 67 |     cerr << "This tool creates a register and memory access trace" << endl;
 68 |     cerr << "Specify the output trace file with -o" << endl;
 69 |     cerr << "Specify the number of instructions to skip before tracing with -s" << endl;
 70 |     cerr << "Specify the number of instructions to trace with -t" << endl << endl;
 71 | 
 72 |     const string knob_summary = KNOB_BASE::StringKnobSummary();
 73 |     cerr << knob_summary << endl;
 74 | 
 75 |     return -1;
 76 | }
 75 | 
 76 | /* ===================================================================== */
 77 | // Analysis routines
 78 | /* ===================================================================== */
 79 | 
 80 | // Analysis routine inserted first for every instruction (see Instruction()).
 81 | // Counts the instruction, refreshes tracing_on from the -s/-t knobs and, when
 82 | // tracing, resets curr_instr to an empty record for the instruction at `ip`.
 83 | // op_code and opstring are not used by this function.
 84 | void BeginInstruction(VOID *ip, UINT32 op_code, VOID *opstring)
 85 | {
 86 |     ++instrCount;
 87 | 
 88 |     // tracing_on is only touched once the skip count has been passed
 89 |     const UINT64 skip_count = KnobSkipInstructions.Value();
 90 |     if(instrCount > skip_count)
 91 |     {
 92 |         const UINT64 window_end = KnobTraceInstructions.Value()+skip_count;
 93 |         tracing_on = !(instrCount > window_end);
 94 |     }
 95 | 
 96 |     if(!tracing_on)
 97 |         return;
 98 | 
 99 |     // start a fresh record for this instruction
100 |     curr_instr.ip = (unsigned long long int)ip;
101 |     curr_instr.is_branch = 0;
102 |     curr_instr.branch_taken = 0;
103 | 
104 |     // zero marks an operand slot as unused
105 |     memset(curr_instr.destination_registers, 0, sizeof(curr_instr.destination_registers));
106 |     memset(curr_instr.destination_memory, 0, sizeof(curr_instr.destination_memory));
107 |     memset(curr_instr.source_registers, 0, sizeof(curr_instr.source_registers));
108 |     memset(curr_instr.source_memory, 0, sizeof(curr_instr.source_memory));
109 | }
114 | 
115 | // Analysis routine inserted last for every instruction (see Instruction()).
116 | //  - While instrCount has not passed the -s skip count: does nothing.
117 | //  - Inside the trace window: appends curr_instr to `out` as one raw record.
118 | //    If that write fails, reports it on stderr, closes the file and exits
119 | //    with status 1 instead of silently producing a truncated trace.
120 | //  - Past the trace window: closes the file and exits the process with status 0.
121 | void EndInstruction()
122 | {
123 |     if(instrCount <= KnobSkipInstructions.Value())
124 |         return;
125 | 
126 |     tracing_on = true;
127 | 
128 |     if(instrCount <= (KnobTraceInstructions.Value()+KnobSkipInstructions.Value()))
129 |     {
130 |         // keep tracing
131 |         if(fwrite(&curr_instr, sizeof(trace_instr_format_t), 1, out) != 1)
132 |         {
133 |             cerr << "Couldn't write to output trace file. Exiting." << endl;
134 |             if(!output_file_closed)
135 |             {
136 |                 fclose(out);
137 |                 output_file_closed = true;
138 |             }
139 |             exit(1);
140 |         }
141 |         return;
142 |     }
143 | 
144 |     tracing_on = false;
145 | 
146 |     // close down the file, we're done tracing
147 |     if(!output_file_closed)
148 |     {
149 |         fclose(out);
150 |         output_file_closed = true;
151 |     }
152 | 
153 |     exit(0);
154 | }
144 | 
145 | // Analysis routine for branch instructions: marks curr_instr as a branch and,
146 | // when `taken` is non-zero, as a taken branch.
147 | void BranchOrNot(UINT32 taken)
148 | {
149 |     curr_instr.is_branch = 1;
150 | 
151 |     if(taken)
152 |         curr_instr.branch_taken = 1;
153 | }
155 | 
156 | // Record `value` in the fixed-size operand list `slots` unless it is already
157 | // there. A slot holding 0 counts as unused: the value goes into the first such
158 | // slot. When the list has no unused slot the value is dropped.
159 | template <typename T, size_t N>
160 | static void record_unique(T (&slots)[N], T value)
161 | {
162 |     // check to see if this value is already in the list
163 |     for(size_t k=0; k<N; k++)
164 |     {
165 |         if(slots[k] == value)
166 |             return;
167 |     }
168 | 
169 |     // otherwise claim the first unused slot, if any
170 |     for(size_t k=0; k<N; k++)
171 |     {
172 |         if(slots[k] == 0)
173 |         {
174 |             slots[k] = value;
175 |             return;
176 |         }
177 |     }
178 | }
179 | 
180 | // Analysis routine: note that the current instruction reads register `i`.
181 | // The register number is stored truncated to unsigned char. `index` is unused.
182 | void RegRead(UINT32 i, UINT32 index)
183 | {
184 |     if(!tracing_on) return;
185 | 
186 |     REG r = (REG)i;
187 | 
188 |     record_unique(curr_instr.source_registers, (unsigned char)r);
189 | }
190 | 
191 | // Analysis routine: note that the current instruction writes register `i`.
192 | // The register number is stored truncated to unsigned char. `index` is unused.
193 | void RegWrite(REG i, UINT32 index)
194 | {
195 |     if(!tracing_on) return;
196 | 
197 |     REG r = (REG)i;
198 | 
199 |     record_unique(curr_instr.destination_registers, (unsigned char)r);
200 | }
201 | 
202 | // Analysis routine: note that the current instruction reads memory at `addr`.
203 | // `index` and `read_size` are unused.
204 | void MemoryRead(VOID* addr, UINT32 index, UINT32 read_size)
205 | {
206 |     if(!tracing_on) return;
207 | 
208 |     record_unique(curr_instr.source_memory, (unsigned long long int)addr);
209 | }
210 | 
211 | // Analysis routine: note that the current instruction writes memory at `addr`.
212 | // `index` is unused.
213 | void MemoryWrite(VOID* addr, UINT32 index)
214 | {
215 |     if(!tracing_on) return;
216 | 
217 |     record_unique(curr_instr.destination_memory, (unsigned long long int)addr);
218 | }
308 | 
309 | /* ===================================================================== */
310 | // Instrumentation callbacks
311 | /* ===================================================================== */
312 | 
313 | // Instrumentation callback, called for every instruction. Attaches the analysis
314 | // routines to `ins`, all at IPOINT_BEFORE, inserted in this order:
315 | // BeginInstruction, BranchOrNot (branches only), RegRead per read register,
316 | // RegWrite per written register, MemoryRead/MemoryWrite per memory operand,
317 | // and finally EndInstruction. `v` is unused.
318 | VOID Instruction(INS ins, VOID *v)
319 | {
320 |     // every instruction starts with BeginInstruction
321 |     const UINT32 opcode = INS_Opcode(ins);
322 |     INS_InsertCall(ins, IPOINT_BEFORE, (AFUNPTR)BeginInstruction, IARG_INST_PTR, IARG_UINT32, opcode, IARG_END);
323 | 
324 |     // branch outcome
325 |     if(INS_IsBranch(ins))
326 |     {
327 |         INS_InsertCall(ins, IPOINT_BEFORE, (AFUNPTR)BranchOrNot, IARG_BRANCH_TAKEN, IARG_END);
328 |     }
329 | 
330 |     // registers read by the instruction
331 |     const UINT32 numReadRegs = INS_MaxNumRRegs(ins);
332 |     for(UINT32 slot=0; slot<numReadRegs; ++slot)
333 |     {
334 |         const UINT32 reg = INS_RegR(ins, slot);
335 |         INS_InsertCall(ins, IPOINT_BEFORE, (AFUNPTR)RegRead,
336 |                 IARG_UINT32, reg, IARG_UINT32, slot,
337 |                 IARG_END);
338 |     }
339 | 
340 |     // registers written by the instruction
341 |     const UINT32 numWriteRegs = INS_MaxNumWRegs(ins);
342 |     for(UINT32 slot=0; slot<numWriteRegs; ++slot)
343 |     {
344 |         const UINT32 reg = INS_RegW(ins, slot);
345 |         INS_InsertCall(ins, IPOINT_BEFORE, (AFUNPTR)RegWrite,
346 |                 IARG_UINT32, reg, IARG_UINT32, slot,
347 |                 IARG_END);
348 |     }
349 | 
350 |     // memory operands; one operand may be both read and written
351 |     const UINT32 numMemOps = INS_MemoryOperandCount(ins);
352 |     for(UINT32 op=0; op<numMemOps; ++op)
353 |     {
354 |         if(INS_MemoryOperandIsRead(ins, op))
355 |         {
356 |             const UINT32 read_size = INS_MemoryReadSize(ins);
357 |             INS_InsertCall(ins, IPOINT_BEFORE, (AFUNPTR)MemoryRead,
358 |                     IARG_MEMORYOP_EA, op, IARG_UINT32, op, IARG_UINT32, read_size,
359 |                     IARG_END);
360 |         }
361 | 
362 |         if(INS_MemoryOperandIsWritten(ins, op))
363 |         {
364 |             INS_InsertCall(ins, IPOINT_BEFORE, (AFUNPTR)MemoryWrite,
365 |                     IARG_MEMORYOP_EA, op, IARG_UINT32, op,
366 |                     IARG_END);
367 |         }
368 |     }
369 | 
370 |     // every instruction ends with EndInstruction
371 |     INS_InsertCall(ins, IPOINT_BEFORE, (AFUNPTR)EndInstruction, IARG_END);
372 | }
371 | 
372 | /*!
373 |  * Close the trace file, unless it has already been closed.
374 |  * Registered in main() with PIN_AddFiniFunction.
375 |  * @param[in]   code            exit code of the application (unused)
376 |  * @param[in]   v               value specified by the tool in the 
377 |  *                              PIN_AddFiniFunction function call (unused)
378 |  */
379 | VOID Fini(INT32 code, VOID *v)
380 | {
381 |     if(output_file_closed)
382 |         return;
383 | 
384 |     fclose(out);
385 |     output_file_closed = true;
386 | }
388 | 
389 | /*!
390 |  * The main procedure of the tool.
391 |  * This function is called when the application image is loaded but not yet started.
392 |  * @param[in]   argc            total number of elements in the argv array
393 |  * @param[in]   argv            array of command line arguments, 
394 |  *                              including pin -t <toolname> -- ...
395 |  */
396 | int main(int argc, char *argv[])
397 | {
398 |     // Let PIN parse the command line; a non-zero result means the arguments
399 |     // were rejected, so print the usage text and return its value.
400 |     if( PIN_Init(argc,argv) )
401 |     {
402 |         return Usage();
403 |     }
404 | 
405 |     // Open the trace file named by -o in binary append mode.
406 |     out = fopen(KnobOutputFile.Value().c_str(), "ab");
407 |     if (out == NULL)
408 |     {
409 |         cout << "Couldn't open output trace file. Exiting." << endl;
410 |         exit(1);
411 |     }
412 | 
413 |     // Instrument every instruction, and close the file when the application exits
414 |     INS_AddInstrumentFunction(Instruction, 0);
415 |     PIN_AddFiniFunction(Fini, 0);
416 | 
417 |     // Start the program, never returns
418 |     PIN_StartProgram();
419 | 
420 |     return 0;
421 | }
428 | 
429 | /* ===================================================================== */
430 | /* eof */
431 | /* ===================================================================== */
432 | 


--------------------------------------------------------------------------------
/tracer/clean_tracer.sh:
--------------------------------------------------------------------------------
1 | # Clean the tracer build. PIN_ROOT is taken from the environment when it is
2 | # already set; otherwise the path below is used as the fallback.
3 | export PIN_ROOT="${PIN_ROOT:-/home/grads/c/cienlux/task/pin-3.2-81205-gcc-linux}"
4 | make clean
5 | 


--------------------------------------------------------------------------------
/tracer/make_tracer.sh:
--------------------------------------------------------------------------------
1 | # Build the tracer pintool. PIN_ROOT is taken from the environment when it is
2 | # already set; otherwise replace the placeholder below with your Pin directory.
3 | export PIN_ROOT="${PIN_ROOT:-/your/pin/directory/}"
4 | mkdir -p obj-intel64
5 | make obj-intel64/champsim_tracer.so
6 | 


--------------------------------------------------------------------------------
/tracer/makefile:
--------------------------------------------------------------------------------
 1 | ##############################################################
 2 | #
 3 | #                   DO NOT EDIT THIS FILE!
 4 | #
 5 | ##############################################################
 6 | # Includes, in order: makefile.config from CONFIG_ROOT, this directory's makefile.rules, and makefile.default.rules from TOOLS_ROOT.
 7 | # If the tool is built out of the kit, PIN_ROOT must be specified in the make invocation and point to the kit root.
 8 | ifdef PIN_ROOT
 9 | CONFIG_ROOT := $(PIN_ROOT)/source/tools/Config
10 | else
11 | CONFIG_ROOT := ../Config
12 | endif
13 | include $(CONFIG_ROOT)/makefile.config
14 | include makefile.rules
15 | include $(TOOLS_ROOT)/Config/makefile.default.rules
16 | 
17 | ##############################################################
18 | #
19 | #                   DO NOT EDIT THIS FILE!
20 | #
21 | ##############################################################
22 | 


--------------------------------------------------------------------------------
/tracer/makefile.rules:
--------------------------------------------------------------------------------
 1 | ##############################################################
 2 | #
 3 | # This file includes all the test targets as well as all the
 4 | # non-default build rules and test recipes.
 5 | #
 6 | ##############################################################
 7 | 
 8 | 
 9 | ##############################################################
10 | #
11 | # Test targets
12 | #
13 | ##############################################################
14 | 
15 | ###### Place all generic definitions here ######
16 | 
17 | # This defines tests which run tools of the same name.  This is simply for convenience to avoid
18 | # defining the test name twice (once in TOOL_ROOTS and again in TEST_ROOTS).
19 | # Tests defined here should not be defined in TOOL_ROOTS and TEST_ROOTS.
20 | TEST_TOOL_ROOTS := champsim_tracer
21 | 
22 | # This defines the tests to be run that were not already defined in TEST_TOOL_ROOTS.
23 | TEST_ROOTS :=
24 | 
25 | # This defines a list of tests that should run in the "short" sanity. Tests in this list must also
26 | # appear either in the TEST_TOOL_ROOTS or the TEST_ROOTS list.
27 | # If the entire directory should be tested in sanity, assign TEST_TOOL_ROOTS and TEST_ROOTS to the
28 | # SANITY_SUBSET variable in the tests section below (see example in makefile.rules.tmpl).
29 | SANITY_SUBSET :=
30 | 
31 | # This defines the tools which will be run during the tests, and were not already defined in
32 | # TEST_TOOL_ROOTS.
33 | TOOL_ROOTS :=
34 | 
35 | # This defines the static analysis tools which will be run during the tests. They should not
36 | # be defined in TEST_TOOL_ROOTS. If a test with the same name exists, it should be defined in
37 | # TEST_ROOTS.
38 | # Note: Static analysis tools are in fact executables linked with the Pin Static Analysis Library.
39 | # This library provides a subset of the Pin APIs which allows the tool to perform static analysis
40 | # of an application or dll. Pin itself is not used when this tool runs.
41 | SA_TOOL_ROOTS :=
42 | 
43 | # This defines all the applications that will be run during the tests.
44 | APP_ROOTS :=
45 | 
46 | # This defines any additional object files that need to be compiled.
47 | OBJECT_ROOTS :=
48 | 
49 | # This defines any additional dlls (shared objects), other than the pintools, that need to be compiled.
50 | DLL_ROOTS :=
51 | 
52 | # This defines any static libraries (archives), that need to be built.
53 | LIB_ROOTS :=
54 | 
55 | 
56 | ##############################################################
57 | #
58 | # Test recipes
59 | #
60 | ##############################################################
61 | 
62 | # This section contains recipes for tests other than the default.
63 | # See makefile.default.rules for the default test rules.
64 | # All tests in this section should adhere to the naming convention: <testname>.test
65 | 
66 | 
67 | ##############################################################
68 | #
69 | # Build rules
70 | #
71 | ##############################################################
72 | 
73 | # This section contains the build rules for all binaries that have special build rules.
74 | # See makefile.default.rules for the default build rules.
75 | 


--------------------------------------------------------------------------------