├── results_50M └── test.txt ├── tracer ├── clean_tracer.sh ├── make_tracer.sh ├── makefile ├── makefile.rules └── champsim_tracer.cpp ├── BTBX_artifact_results.xlsx ├── src ├── uncore.cc └── block.cc ├── scripts ├── download_dpc3_traces.sh ├── dpc3_max_simpoint.txt └── multiworkload.cc ├── .gitignore ├── prefetcher ├── no.l1d_pref ├── no.l2c_pref ├── no.llc_pref ├── next_line.llc_pref ├── no.l1i_pref ├── idealL1i.l1i_pref ├── next_line.l1d_pref ├── next_line.l2c_pref ├── next_line.l1i_pref ├── ip_stride.l2c_pref ├── fdip.l1i_pref └── ipc-2020-paper41-code ├── inc ├── uncore.h ├── memory_class.h ├── champsim.h ├── dram_controller.h ├── spp_dev.h ├── set.h ├── cache.h ├── instruction.h ├── block.h └── ooo_cpu.h ├── cvp_tracer └── README.md ├── branch ├── bimodal.bpred ├── gshare.bpred ├── hashed_perceptron.bpred └── perceptron.bpred ├── run_champsim.sh ├── replacement ├── lru.llc_repl ├── srrip.llc_repl ├── base_replacement.cc ├── drrip.llc_repl └── ship.llc_repl ├── launch ├── scripts │ └── createConfig.sh └── launch.sh ├── Makefile ├── run_4core.sh ├── collectStats └── getResults.sh ├── README.md └── btb ├── btb.cc ├── convBTB.btb └── BTBX.btb /results_50M/test.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tracer/clean_tracer.sh: -------------------------------------------------------------------------------- 1 | export PIN_ROOT=/home/grads/c/cienlux/task/pin-3.2-81205-gcc-linux 2 | make clean 3 | -------------------------------------------------------------------------------- /BTBX_artifact_results.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rakeshdhakla/ChampSim-master-BTBX/HEAD/BTBX_artifact_results.xlsx -------------------------------------------------------------------------------- /tracer/make_tracer.sh: 
-------------------------------------------------------------------------------- 1 | export PIN_ROOT=/your/pin/directory/ 2 | mkdir -p obj-intel64 3 | make obj-intel64/champsim_tracer.so 4 | -------------------------------------------------------------------------------- /src/uncore.cc: -------------------------------------------------------------------------------- 1 | #include "uncore.h" 2 | 3 | // uncore 4 | UNCORE uncore; 5 | 6 | // constructor 7 | UNCORE::UNCORE() { 8 | 9 | } 10 | -------------------------------------------------------------------------------- /scripts/download_dpc3_traces.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p $PWD/../dpc3_traces 4 | while read LINE 5 | do 6 | wget -P $PWD/../dpc3_traces -c http://hpca23.cse.tamu.edu/champsim-traces/speccpu/$LINE 7 | done < dpc3_max_simpoint.txt 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | prefetcher/l1i_prefetcher.cc 2 | prefetcher/l1d_prefetcher.cc 3 | prefetcher/l2c_prefetcher.cc 4 | prefetcher/llc_prefetcher.cc 5 | branch/branch_predictor.cc 6 | replacement/llc_replacement.cc 7 | 8 | inc/champsim.h.bak 9 | 10 | bin/ 11 | obj/ 12 | -------------------------------------------------------------------------------- /prefetcher/no.l1d_pref: -------------------------------------------------------------------------------- 1 | #include "cache.h" 2 | 3 | void CACHE::l1d_prefetcher_initialize() 4 | { 5 | 6 | } 7 | 8 | void CACHE::l1d_prefetcher_operate(uint64_t addr, uint64_t ip, uint8_t cache_hit, uint8_t type) 9 | { 10 | 11 | } 12 | 13 | void CACHE::l1d_prefetcher_cache_fill(uint64_t addr, uint32_t set, uint32_t way, uint8_t prefetch, uint64_t evicted_addr, uint32_t metadata_in) 14 | { 15 | 16 | } 17 | 18 | void CACHE::l1d_prefetcher_final_stats() 19 | { 20 | 21 | } 22 | 
-------------------------------------------------------------------------------- /inc/uncore.h: -------------------------------------------------------------------------------- 1 | #ifndef UNCORE_H 2 | #define UNCORE_H 3 | 4 | #include "champsim.h" 5 | #include "cache.h" 6 | #include "dram_controller.h" 7 | //#include "drc_controller.h" 8 | 9 | //#define DRC_MSHR_SIZE 48 10 | 11 | // uncore 12 | class UNCORE { 13 | public: 14 | 15 | // LLC 16 | CACHE LLC{"LLC", LLC_SET, LLC_WAY, LLC_SET*LLC_WAY, LLC_WQ_SIZE, LLC_RQ_SIZE, LLC_PQ_SIZE, LLC_MSHR_SIZE}; 17 | 18 | // DRAM 19 | MEMORY_CONTROLLER DRAM{"DRAM"}; 20 | 21 | UNCORE(); 22 | }; 23 | 24 | extern UNCORE uncore; 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /prefetcher/no.l2c_pref: -------------------------------------------------------------------------------- 1 | #include "cache.h" 2 | 3 | void CACHE::l2c_prefetcher_initialize() 4 | { 5 | 6 | } 7 | 8 | uint32_t CACHE::l2c_prefetcher_operate(uint64_t addr, uint64_t ip, uint8_t cache_hit, uint8_t type, uint32_t metadata_in) 9 | { 10 | return metadata_in; 11 | } 12 | 13 | uint32_t CACHE::l2c_prefetcher_cache_fill(uint64_t addr, uint32_t set, uint32_t way, uint8_t prefetch, uint64_t evicted_addr, uint32_t metadata_in) 14 | { 15 | return metadata_in; 16 | } 17 | 18 | void CACHE::l2c_prefetcher_final_stats() 19 | { 20 | 21 | } 22 | -------------------------------------------------------------------------------- /prefetcher/no.llc_pref: -------------------------------------------------------------------------------- 1 | #include "cache.h" 2 | 3 | void CACHE::llc_prefetcher_initialize() 4 | { 5 | 6 | } 7 | 8 | uint32_t CACHE::llc_prefetcher_operate(uint64_t addr, uint64_t ip, uint8_t cache_hit, uint8_t type, uint32_t metadata_in) 9 | { 10 | return metadata_in; 11 | } 12 | 13 | uint32_t CACHE::llc_prefetcher_cache_fill(uint64_t addr, uint32_t set, uint32_t way, uint8_t prefetch, uint64_t evicted_addr, uint32_t 
metadata_in) 14 | { 15 | return metadata_in; 16 | } 17 | 18 | void CACHE::llc_prefetcher_final_stats() 19 | { 20 | 21 | } 22 | -------------------------------------------------------------------------------- /tracer/makefile: -------------------------------------------------------------------------------- 1 | ############################################################## 2 | # 3 | # DO NOT EDIT THIS FILE! 4 | # 5 | ############################################################## 6 | 7 | # If the tool is built out of the kit, PIN_ROOT must be specified in the make invocation and point to the kit root. 8 | ifdef PIN_ROOT 9 | CONFIG_ROOT := $(PIN_ROOT)/source/tools/Config 10 | else 11 | CONFIG_ROOT := ../Config 12 | endif 13 | include $(CONFIG_ROOT)/makefile.config 14 | include makefile.rules 15 | include $(TOOLS_ROOT)/Config/makefile.default.rules 16 | 17 | ############################################################## 18 | # 19 | # DO NOT EDIT THIS FILE! 20 | # 21 | ############################################################## 22 | -------------------------------------------------------------------------------- /prefetcher/next_line.llc_pref: -------------------------------------------------------------------------------- 1 | #include "cache.h" 2 | 3 | void CACHE::llc_prefetcher_initialize() 4 | { 5 | cout << "LLC Next Line Prefetcher" << endl; 6 | } 7 | 8 | uint32_t CACHE::llc_prefetcher_operate(uint64_t addr, uint64_t ip, uint8_t cache_hit, uint8_t type, uint32_t metadata_in) 9 | { 10 | uint64_t pf_addr = ((addr>>LOG2_BLOCK_SIZE)+1) << LOG2_BLOCK_SIZE; 11 | prefetch_line(ip, addr, pf_addr, FILL_LLC, 0); 12 | 13 | return metadata_in; 14 | } 15 | 16 | uint32_t CACHE::llc_prefetcher_cache_fill(uint64_t addr, uint32_t set, uint32_t way, uint8_t prefetch, uint64_t evicted_addr, uint32_t metadata_in) 17 | { 18 | return metadata_in; 19 | } 20 | 21 | void CACHE::llc_prefetcher_final_stats() 22 | { 23 | cout << "LLC Next Line Prefetcher Final Stats: none" << endl; 24 | } 25 
| -------------------------------------------------------------------------------- /scripts/dpc3_max_simpoint.txt: -------------------------------------------------------------------------------- 1 | 600.perlbench_s-210B.champsimtrace.xz 2 | 602.gcc_s-734B.champsimtrace.xz 3 | 603.bwaves_s-3699B.champsimtrace.xz 4 | 605.mcf_s-665B.champsimtrace.xz 5 | 607.cactuBSSN_s-2421B.champsimtrace.xz 6 | 619.lbm_s-4268B.champsimtrace.xz 7 | 620.omnetpp_s-874B.champsimtrace.xz 8 | 621.wrf_s-575B.champsimtrace.xz 9 | 623.xalancbmk_s-700B.champsimtrace.xz 10 | 625.x264_s-18B.champsimtrace.xz 11 | 627.cam4_s-573B.champsimtrace.xz 12 | 628.pop2_s-17B.champsimtrace.xz 13 | 631.deepsjeng_s-928B.champsimtrace.xz 14 | 638.imagick_s-10316B.champsimtrace.xz 15 | 641.leela_s-800B.champsimtrace.xz 16 | 644.nab_s-5853B.champsimtrace.xz 17 | 648.exchange2_s-1699B.champsimtrace.xz 18 | 649.fotonik3d_s-1176B.champsimtrace.xz 19 | 654.roms_s-842B.champsimtrace.xz 20 | 657.xz_s-3167B.champsimtrace.xz 21 | -------------------------------------------------------------------------------- /prefetcher/no.l1i_pref: -------------------------------------------------------------------------------- 1 | #include "ooo_cpu.h" 2 | 3 | void O3_CPU::l1i_prefetcher_initialize() 4 | { 5 | 6 | } 7 | 8 | void O3_CPU::l1i_prefetcher_branch_operate(uint64_t ip, uint8_t branch_type, uint64_t branch_target) 9 | { 10 | 11 | } 12 | 13 | void O3_CPU::l1i_prefetcher_cache_operate(uint64_t v_addr, uint8_t cache_hit, uint8_t prefetch_hit) 14 | { 15 | 16 | } 17 | 18 | void O3_CPU::l1i_prefetcher_instruction_operate(uint64_t ip) 19 | { 20 | 21 | } 22 | 23 | void O3_CPU::l1i_prefetcher_instruction_spec_operate(uint64_t ip) 24 | { 25 | 26 | } 27 | 28 | void O3_CPU::l1i_prefetcher_cycle_operate() 29 | { 30 | 31 | } 32 | 33 | void O3_CPU::l1i_prefetcher_cache_fill(uint64_t v_addr, uint32_t set, uint32_t way, uint8_t prefetch, uint64_t evicted_v_addr) 34 | { 35 | 36 | } 37 | 38 | void O3_CPU::l1i_prefetcher_final_stats() 39 | { 40 
| 41 | } 42 | -------------------------------------------------------------------------------- /prefetcher/idealL1i.l1i_pref: -------------------------------------------------------------------------------- 1 | #include "ooo_cpu.h" 2 | 3 | void O3_CPU::l1i_prefetcher_initialize() 4 | { 5 | 6 | } 7 | 8 | void O3_CPU::l1i_prefetcher_branch_operate(uint64_t ip, uint8_t branch_type, uint64_t branch_target) 9 | { 10 | 11 | } 12 | 13 | void O3_CPU::l1i_prefetcher_cache_operate(uint64_t v_addr, uint8_t cache_hit, uint8_t prefetch_hit) 14 | { 15 | 16 | } 17 | 18 | void O3_CPU::l1i_prefetcher_instruction_operate(uint64_t ip) 19 | { 20 | 21 | } 22 | 23 | void O3_CPU::l1i_prefetcher_instruction_spec_operate(uint64_t ip) 24 | { 25 | 26 | } 27 | 28 | void O3_CPU::l1i_prefetcher_cycle_operate() 29 | { 30 | 31 | } 32 | 33 | void O3_CPU::l1i_prefetcher_cache_fill(uint64_t v_addr, uint32_t set, uint32_t way, uint8_t prefetch, uint64_t evicted_v_addr) 34 | { 35 | 36 | } 37 | 38 | void O3_CPU::l1i_prefetcher_final_stats() 39 | { 40 | 41 | } 42 | -------------------------------------------------------------------------------- /cvp_tracer/README.md: -------------------------------------------------------------------------------- 1 | The cvp2champsim tracer comes as is with no guarantee that it covers every conversion case. 2 | 3 | The tracer is used to convert the traces from the 2nd Championship Value 4 | Prediction (CVP) to a ChampSim-friendly format. 
5 | 6 | CVP-1 Site: https://www.microarch.org/cvp1/ 7 | CVP-2 Site: https://www.microarch.org/cvp1/cvp2/rules.html 8 | 9 | To use the tracer first compile it using g++: 10 | 11 | g++ cvp2champsim.cc -o cvp_tracer 12 | 13 | To convert a trace execute: 14 | 15 | ./cvp_tracer TRACE_NAME.gz 16 | 17 | The ChampSim trace will be sent to standard output so to keep and compress the 18 | output trace run: 19 | 20 | ./cvp_tracer TRACE_NAME.gz | gzip > NEW_TRACE.champsim.gz 21 | 22 | Adding the "-v" flag will print the dissassembly of the CVP trace to standard 23 | error output as well as the ChampSim format to standard output. 24 | -------------------------------------------------------------------------------- /branch/bimodal.bpred: -------------------------------------------------------------------------------- 1 | #include "ooo_cpu.h" 2 | 3 | #define BIMODAL_TABLE_SIZE 16384 4 | #define BIMODAL_PRIME 16381 5 | #define MAX_COUNTER 3 6 | int bimodal_table[NUM_CPUS][BIMODAL_TABLE_SIZE]; 7 | 8 | void O3_CPU::initialize_branch_predictor() 9 | { 10 | cout << "CPU " << cpu << " Bimodal branch predictor" << endl; 11 | 12 | for(int i = 0; i < BIMODAL_TABLE_SIZE; i++) 13 | bimodal_table[cpu][i] = 0; 14 | } 15 | 16 | uint8_t O3_CPU::predict_branch(uint64_t ip) 17 | { 18 | uint32_t hash = ip % BIMODAL_PRIME; 19 | uint8_t prediction = (bimodal_table[cpu][hash] >= ((MAX_COUNTER + 1)/2)) ? 
1 : 0; 20 | 21 | return prediction; 22 | } 23 | 24 | void O3_CPU::last_branch_result(uint64_t ip, uint8_t taken) 25 | { 26 | uint32_t hash = ip % BIMODAL_PRIME; 27 | 28 | if (taken && (bimodal_table[cpu][hash] < MAX_COUNTER)) 29 | bimodal_table[cpu][hash]++; 30 | else if ((taken == 0) && (bimodal_table[cpu][hash] > 0)) 31 | bimodal_table[cpu][hash]--; 32 | } 33 | -------------------------------------------------------------------------------- /prefetcher/next_line.l1d_pref: -------------------------------------------------------------------------------- 1 | #include "cache.h" 2 | 3 | void CACHE::l1d_prefetcher_initialize() 4 | { 5 | cout << "CPU " << cpu << " L1D next line prefetcher" << endl; 6 | } 7 | 8 | void CACHE::l1d_prefetcher_operate(uint64_t addr, uint64_t ip, uint8_t cache_hit, uint8_t type) 9 | { 10 | uint64_t pf_addr = ((addr>>LOG2_BLOCK_SIZE)+1) << LOG2_BLOCK_SIZE; 11 | 12 | DP ( if (warmup_complete[cpu]) { 13 | cout << "[" << NAME << "] " << __func__ << hex << " base_cl: " << (addr>>LOG2_BLOCK_SIZE); 14 | cout << " pf_cl: " << (pf_addr>>LOG2_BLOCK_SIZE) << " ip: " << ip << " cache_hit: " << +cache_hit << " type: " << +type << endl; }); 15 | 16 | prefetch_line(ip, addr, pf_addr, FILL_L1, 0); 17 | } 18 | 19 | void CACHE::l1d_prefetcher_cache_fill(uint64_t addr, uint32_t set, uint32_t way, uint8_t prefetch, uint64_t evicted_addr, uint32_t metadata_in) 20 | { 21 | 22 | } 23 | 24 | void CACHE::l1d_prefetcher_final_stats() 25 | { 26 | cout << "CPU " << cpu << " L1D next line prefetcher final stats" << endl; 27 | } 28 | -------------------------------------------------------------------------------- /prefetcher/next_line.l2c_pref: -------------------------------------------------------------------------------- 1 | #include "cache.h" 2 | 3 | void CACHE::l2c_prefetcher_initialize() 4 | { 5 | cout << "CPU " << cpu << " L2C next line prefetcher" << endl; 6 | } 7 | 8 | uint32_t CACHE::l2c_prefetcher_operate(uint64_t addr, uint64_t ip, uint8_t cache_hit, uint8_t 
type, uint32_t metadata_in) 9 | { 10 | uint64_t pf_addr = ((addr>>LOG2_BLOCK_SIZE)+1) << LOG2_BLOCK_SIZE; 11 | 12 | DP ( if (warmup_complete[cpu]) { 13 | cout << "[" << NAME << "] " << __func__ << hex << " base_cl: " << (addr>>LOG2_BLOCK_SIZE); 14 | cout << " pf_cl: " << (pf_addr>>LOG2_BLOCK_SIZE) << " ip: " << ip << " cache_hit: " << +cache_hit << " type: " << +type << endl; }); 15 | 16 | prefetch_line(ip, addr, pf_addr, FILL_L2, 0); 17 | 18 | return metadata_in; 19 | } 20 | 21 | uint32_t CACHE::l2c_prefetcher_cache_fill(uint64_t addr, uint32_t set, uint32_t way, uint8_t prefetch, uint64_t evicted_addr, uint32_t metadata_in) 22 | { 23 | return metadata_in; 24 | } 25 | 26 | void CACHE::l2c_prefetcher_final_stats() 27 | { 28 | cout << "CPU " << cpu << " L2C next line prefetcher final stats" << endl; 29 | } 30 | -------------------------------------------------------------------------------- /prefetcher/next_line.l1i_pref: -------------------------------------------------------------------------------- 1 | #include "ooo_cpu.h" 2 | 3 | void O3_CPU::l1i_prefetcher_initialize() 4 | { 5 | cout << "CPU " << cpu << " L1I next line prefetcher" << endl; 6 | } 7 | 8 | void O3_CPU::l1i_prefetcher_branch_operate(uint64_t ip, uint8_t branch_type, uint64_t branch_target) 9 | { 10 | 11 | } 12 | 13 | void O3_CPU::l1i_prefetcher_cache_operate(uint64_t v_addr, uint8_t cache_hit, uint8_t prefetch_hit) 14 | { 15 | //cout << "access v_addr: 0x" << hex << v_addr << dec << endl; 16 | 17 | if((cache_hit == 0) && (L1I.MSHR.occupancy < (L1I.MSHR.SIZE>>1))) 18 | { 19 | uint64_t pf_addr = v_addr + (1<&2; 30 | exit 1 31 | fi 32 | 33 | re='^[0-9]+$' 34 | if ! [[ $N_SIM =~ $re ]] || [ -z $N_SIM ] ; then 35 | echo "[ERROR]: Number of simulation instructions is NOT a number" >&2; 36 | exit 1 37 | fi 38 | 39 | if [ ! 
-f "$TRACE_DIR/$TRACE" ] ; then 40 | echo "[ERROR] Cannot find a trace file: $TRACE_DIR/$TRACE" 41 | exit 1 42 | fi 43 | 44 | mkdir -p results_${N_SIM}M 45 | (./bin/${BINARY} -warmup_instructions ${N_WARM}000000 -simulation_instructions ${N_SIM}000000 ${OPTION} -traces ${TRACE_DIR}/${TRACE}) &> results_${N_SIM}M/${TRACE}-${BINARY}${OPTION}.txt 46 | -------------------------------------------------------------------------------- /scripts/multiworkload.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #define NUM_MIX 100 7 | #define NUM_CPUS 4 8 | #define NUM_TRACE 20 9 | 10 | using namespace std; 11 | default_random_engine generator; 12 | 13 | int main() 14 | { 15 | int benchmark[NUM_MIX][NUM_CPUS]; 16 | for (int i=0; i distribution(rand_min, rand_max); 27 | 28 | int temp_rand; 29 | bool do_again = false; 30 | 31 | for (int i = 0; i < NUM_MIX; i++) { 32 | //printf("MIX%2d: ", i+1); 33 | for (int j = 0; j < NUM_CPUS; j++) { 34 | do { 35 | do_again = false; 36 | temp_rand = distribution(generator); // Generate random integer flat in [rand_min, rand_mix] 37 | for (int k = 0; k < j; k++) { 38 | if (temp_rand == benchmark[i][k]) { 39 | do_again = true; 40 | break; 41 | } 42 | } 43 | } while (do_again); 44 | 45 | benchmark[i][j] = temp_rand; 46 | printf("%d ", benchmark[i][j]); 47 | } 48 | printf("\n"); 49 | } 50 | 51 | return 0; 52 | } 53 | -------------------------------------------------------------------------------- /replacement/lru.llc_repl: -------------------------------------------------------------------------------- 1 | #include "cache.h" 2 | 3 | // initialize replacement state 4 | void CACHE::llc_initialize_replacement() 5 | { 6 | 7 | } 8 | 9 | // find replacement victim 10 | uint32_t CACHE::llc_find_victim(uint32_t cpu, uint64_t instr_id, uint32_t set, const BLOCK *current_set, uint64_t ip, uint64_t full_addr, uint32_t type) 11 | { 12 | // baseline LRU 13 | return 
lru_victim(cpu, instr_id, set, current_set, ip, full_addr, type); 14 | } 15 | 16 | // called on every cache hit and cache fill 17 | void CACHE::llc_update_replacement_state(uint32_t cpu, uint32_t set, uint32_t way, uint64_t full_addr, uint64_t ip, uint64_t victim_addr, uint32_t type, uint8_t hit) 18 | { 19 | string TYPE_NAME; 20 | if (type == LOAD) 21 | TYPE_NAME = "LOAD"; 22 | else if (type == RFO) 23 | TYPE_NAME = "RFO"; 24 | else if (type == PREFETCH) 25 | TYPE_NAME = "PF"; 26 | else if (type == WRITEBACK) 27 | TYPE_NAME = "WB"; 28 | else 29 | assert(0); 30 | 31 | if (hit) 32 | TYPE_NAME += "_HIT"; 33 | else 34 | TYPE_NAME += "_MISS"; 35 | 36 | if ((type == WRITEBACK) && ip) 37 | assert(0); 38 | 39 | // uncomment this line to see the LLC accesses 40 | // cout << "CPU: " << cpu << " LLC " << setw(9) << TYPE_NAME << " set: " << setw(5) << set << " way: " << setw(2) << way; 41 | // cout << hex << " paddr: " << setw(12) << paddr << " ip: " << setw(8) << ip << " victim_addr: " << victim_addr << dec << endl; 42 | 43 | // baseline LRU 44 | if (hit && (type == WRITEBACK)) // writeback hit does not update LRU state 45 | return; 46 | 47 | return lru_update(set, way); 48 | } 49 | 50 | void CACHE::llc_replacement_final_stats() 51 | { 52 | 53 | } 54 | -------------------------------------------------------------------------------- /launch/scripts/createConfig.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | declare -a bench 4 | declare -a prefetch 5 | declare -a btb 6 | 7 | PATH_TO_CHAMPSIM= 8 | 9 | bench[0]=client_001 10 | bench[1]=client_002 11 | bench[2]=client_003 12 | bench[3]=client_004 13 | bench[4]=client_005 14 | bench[5]=client_006 15 | bench[6]=client_007 16 | bench[7]=client_008 17 | bench[8]=server_001 18 | bench[9]=server_002 19 | bench[10]=server_003 20 | bench[11]=server_004 21 | bench[12]=server_009 22 | bench[13]=server_010 23 | bench[14]=server_011 24 | bench[15]=server_012 25 | 
bench[16]=server_013 26 | bench[17]=server_014 27 | bench[18]=server_015 28 | bench[19]=server_016 29 | bench[20]=server_017 30 | bench[21]=server_018 31 | bench[22]=server_019 32 | bench[23]=server_020 33 | bench[24]=server_021 34 | bench[25]=server_022 35 | bench[26]=server_023 36 | bench[27]=server_024 37 | bench[28]=server_025 38 | bench[29]=server_026 39 | bench[30]=server_027 40 | bench[31]=server_028 41 | bench[32]=server_029 42 | bench[33]=server_030 43 | bench[34]=server_031 44 | bench[35]=server_032 45 | bench[36]=server_033 46 | bench[37]=server_034 47 | bench[38]=server_035 48 | bench[39]=server_036 49 | bench[40]=server_037 50 | bench[41]=server_038 51 | bench[42]=server_039 52 | 53 | prefetch[0]=no 54 | prefetch[1]=fdip 55 | 56 | btb[0]=convBTB 57 | btb[1]=pdede 58 | btb[2]=BTBX 59 | 60 | 61 | for ((j=0;j<2;j=j+1)); do 62 | for ((i=0;i<43;i=i+1)); do 63 | for ((k=0;k<3;k=k+1)); do 64 | script_name="${bench[i]}_${prefetch[j]}_${btb[k]}.sh" 65 | echo "#!/bin/bash" > $script_name 66 | echo "cd ${PATH_TO_CHAMPSIM}" >> $script_name 67 | echo "./run_champsim.sh hashed_perceptron-${btb[k]}-${prefetch[j]}-next_line-spp_dev-no-lru-1core 50 50 ${bench[i]}.champsimtrace.xz" >> $script_name 68 | done 69 | done 70 | done 71 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | app = champsim 2 | 3 | srcExt = cc 4 | srcDir = src branch replacement prefetcher btb 5 | objDir = obj 6 | binDir = bin 7 | inc = inc 8 | 9 | debug = 1 10 | 11 | CFlags = -Wall -O3 -std=c++11 12 | LDFlags = 13 | libs = 14 | libDir = 15 | 16 | 17 | #************************ DO NOT EDIT BELOW THIS LINE! 
************************ 18 | 19 | ifeq ($(debug),1) 20 | debug=-g 21 | else 22 | debug= 23 | endif 24 | inc := $(addprefix -I,$(inc)) 25 | libs := $(addprefix -l,$(libs)) 26 | libDir := $(addprefix -L,$(libDir)) 27 | CFlags += -c $(debug) $(inc) $(libDir) $(libs) 28 | sources := $(shell find $(srcDir) -name '*.$(srcExt)') 29 | srcDirs := $(shell find . -name '*.$(srcExt)' -exec dirname {} \; | uniq) 30 | objects := $(patsubst %.$(srcExt),$(objDir)/%.o,$(sources)) 31 | 32 | ifeq ($(srcExt),cc) 33 | CC = $(CXX) 34 | else 35 | CFlags += -std=gnu99 36 | endif 37 | 38 | .phony: all clean distclean 39 | 40 | 41 | all: $(binDir)/$(app) 42 | 43 | $(binDir)/$(app): buildrepo $(objects) 44 | @mkdir -p `dirname $@` 45 | @echo "Linking $@..." 46 | @$(CC) $(objects) $(LDFlags) -o $@ 47 | 48 | $(objDir)/%.o: %.$(srcExt) 49 | @echo "Generating dependencies for $<..." 50 | @$(call make-depend,$<,$@,$(subst .o,.d,$@)) 51 | @echo "Compiling $<..." 52 | @$(CC) $(CFlags) $< -o $@ 53 | 54 | clean: 55 | $(RM) -r $(objDir) 56 | 57 | distclean: clean 58 | $(RM) -r $(binDir)/$(app) 59 | 60 | buildrepo: 61 | @$(call make-repo) 62 | 63 | define make-repo 64 | for dir in $(srcDirs); \ 65 | do \ 66 | mkdir -p $(objDir)/$$dir; \ 67 | done 68 | endef 69 | 70 | 71 | # usage: $(call make-depend,source-file,object-file,depend-file) 72 | define make-depend 73 | $(CC) -MM \ 74 | -MF $3 \ 75 | -MP \ 76 | -MT $2 \ 77 | $(CFlags) \ 78 | $1 79 | endef 80 | -------------------------------------------------------------------------------- /launch/launch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | declare -a bench 4 | declare -a prefetch 5 | declare -a btb 6 | 7 | bench[0]=client_001 8 | bench[1]=client_002 9 | bench[2]=client_003 10 | bench[3]=client_004 11 | bench[4]=client_005 12 | bench[5]=client_006 13 | bench[6]=client_007 14 | bench[7]=client_008 15 | bench[8]=server_001 16 | bench[9]=server_002 17 | bench[10]=server_003 18 | 
bench[11]=server_004 19 | bench[12]=server_009 20 | bench[13]=server_010 21 | bench[14]=server_011 22 | bench[15]=server_012 23 | bench[16]=server_013 24 | bench[17]=server_014 25 | bench[18]=server_015 26 | bench[19]=server_016 27 | bench[20]=server_017 28 | bench[21]=server_018 29 | bench[22]=server_019 30 | bench[23]=server_020 31 | bench[24]=server_021 32 | bench[25]=server_022 33 | bench[26]=server_023 34 | bench[27]=server_024 35 | bench[28]=server_025 36 | bench[29]=server_026 37 | bench[30]=server_027 38 | bench[31]=server_028 39 | bench[32]=server_029 40 | bench[33]=server_030 41 | bench[34]=server_031 42 | bench[35]=server_032 43 | bench[36]=server_033 44 | bench[37]=server_034 45 | bench[38]=server_035 46 | bench[39]=server_036 47 | bench[40]=server_037 48 | bench[41]=server_038 49 | bench[42]=server_039 50 | 51 | prefetch[0]=no 52 | prefetch[1]=fdip 53 | 54 | 55 | btb[0]=convBTB 56 | btb[1]=pdede 57 | btb[2]=BTBX 58 | 59 | for ((j=0;j<2;j=j+1)); do 60 | for ((i=0;i<43;i=i+1)); do 61 | for ((k=0;k<3;k=k+1)); do 62 | echo "sbatch --partition=CPUQ --account=share-ie-idi --nodes=1 --ntasks-per-node=1 --mem=3GB --time=0-0:30:00 --job-name=${bench[i]}_${prefetch[j]}_${btb[k]} scripts/${bench[i]}_${prefetch[j]}_${btb[k]}.sh" 63 | #sbatch --partition=CPUQ --account=share-ie-idi --nodes=1 --ntasks-per-node=1 --mem=3GB --time=0-0:30:00 --job-name=${bench[i]}_${prefetch[j]}_${btb[k]} scripts/${bench[i]}_${prefetch[j]}_${btb[k]}.sh 64 | 65 | done 66 | done 67 | done 68 | -------------------------------------------------------------------------------- /branch/gshare.bpred: -------------------------------------------------------------------------------- 1 | #include "ooo_cpu.h" 2 | 3 | #define GLOBAL_HISTORY_LENGTH 14 4 | #define GLOBAL_HISTORY_MASK (1 << GLOBAL_HISTORY_LENGTH) - 1 5 | int branch_history_vector[NUM_CPUS]; 6 | 7 | #define GS_HISTORY_TABLE_SIZE 16384 8 | int gs_history_table[NUM_CPUS][GS_HISTORY_TABLE_SIZE]; 9 | int my_last_prediction[NUM_CPUS]; 10 | 
11 | void O3_CPU::initialize_branch_predictor() 12 | { 13 | cout << "CPU " << cpu << " GSHARE branch predictor" << endl; 14 | 15 | branch_history_vector[cpu] = 0; 16 | my_last_prediction[cpu] = 0; 17 | 18 | for(int i=0; i>GLOBAL_HISTORY_LENGTH)^(ip>>(GLOBAL_HISTORY_LENGTH*2))^bh_vector; 25 | hash = hash%GS_HISTORY_TABLE_SIZE; 26 | 27 | //printf("%d\n", hash); 28 | 29 | return hash; 30 | } 31 | 32 | uint8_t O3_CPU::predict_branch(uint64_t ip) 33 | { 34 | int prediction = 1; 35 | 36 | int gs_hash = gs_table_hash(ip, branch_history_vector[cpu]); 37 | 38 | if(gs_history_table[cpu][gs_hash] >= 2) 39 | prediction = 1; 40 | else 41 | prediction = 0; 42 | 43 | my_last_prediction[cpu] = prediction; 44 | 45 | return prediction; 46 | } 47 | 48 | void O3_CPU::last_branch_result(uint64_t ip, uint8_t taken) 49 | { 50 | int gs_hash = gs_table_hash(ip, branch_history_vector[cpu]); 51 | 52 | if(taken == 1) { 53 | if(gs_history_table[cpu][gs_hash] < 3) 54 | gs_history_table[cpu][gs_hash]++; 55 | } else { 56 | if(gs_history_table[cpu][gs_hash] > 0) 57 | gs_history_table[cpu][gs_hash]--; 58 | } 59 | 60 | // update branch history vector 61 | branch_history_vector[cpu] <<= 1; 62 | branch_history_vector[cpu] &= GLOBAL_HISTORY_MASK; 63 | branch_history_vector[cpu] |= taken; 64 | } 65 | -------------------------------------------------------------------------------- /run_4core.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$#" -lt 8 ] || [ "$#" -gt 9 ]; then 4 | echo "Illegal number of parameters" 5 | echo "Usage: ./run_4core.sh [BINARY] [N_WARM] [N_SIM] [N_MIX] [TRACE0] [TRACE1] [TRACE2] [TRACE3] [OPTION]" 6 | exit 1 7 | fi 8 | 9 | TRACE_DIR=$PWD/dpc3_traces 10 | BINARY=${1} 11 | N_WARM=${2} 12 | N_SIM=${3} 13 | N_MIX=${4} 14 | TRACE0=${5} 15 | TRACE1=${6} 16 | TRACE2=${7} 17 | TRACE3=${8} 18 | OPTION=${9} 19 | 20 | # Sanity check 21 | if [ -z $TRACE_DIR ] || [ ! 
-d "$TRACE_DIR" ] ; then 22 | echo "[ERROR] Cannot find a trace directory: $TRACE_DIR" 23 | exit 1 24 | fi 25 | 26 | if [ ! -f "bin/$BINARY" ] ; then 27 | echo "[ERROR] Cannot find a ChampSim binary: bin/$BINARY" 28 | exit 1 29 | fi 30 | 31 | re='^[0-9]+$' 32 | if ! [[ $N_WARM =~ $re ]] || [ -z $N_WARM ] ; then 33 | echo "[ERROR]: Number of warmup instructions is NOT a number" >&2; 34 | exit 1 35 | fi 36 | 37 | re='^[0-9]+$' 38 | if ! [[ $N_SIM =~ $re ]] || [ -z $N_SIM ] ; then 39 | echo "[ERROR]: Number of simulation instructions is NOT a number" >&2; 40 | exit 1 41 | fi 42 | 43 | if [ ! -f "$TRACE_DIR/$TRACE0" ] ; then 44 | echo "[ERROR] Cannot find a trace0 file: $TRACE_DIR/$TRACE0" 45 | exit 1 46 | fi 47 | 48 | if [ ! -f "$TRACE_DIR/$TRACE1" ] ; then 49 | echo "[ERROR] Cannot find a trace1 file: $TRACE_DIR/$TRACE1" 50 | exit 1 51 | fi 52 | 53 | if [ ! -f "$TRACE_DIR/$TRACE2" ] ; then 54 | echo "[ERROR] Cannot find a trace2 file: $TRACE_DIR/$TRACE2" 55 | exit 1 56 | fi 57 | 58 | if [ ! 
-f "$TRACE_DIR/$TRACE3" ] ; then 59 | echo "[ERROR] Cannot find a trace3 file: $TRACE_DIR/$TRACE3" 60 | exit 1 61 | fi 62 | 63 | mkdir -p results_4core_${N_SIM}M 64 | (./bin/${BINARY} -warmup_instructions ${N_WARM}000000 -simulation_instructions ${N_SIM}000000 ${OPTION} -traces ${TRACE_DIR}/${TRACE0} ${TRACE_DIR}/${TRACE1} ${TRACE_DIR}/${TRACE2} ${TRACE_DIR}/${TRACE3}) &> results_4core_${N_SIM}M/mix${N_MIX}-${BINARY}${OPTION}.txt 65 | -------------------------------------------------------------------------------- /replacement/srrip.llc_repl: -------------------------------------------------------------------------------- 1 | #include "cache.h" 2 | 3 | #define maxRRPV 3 4 | uint32_t rrpv[LLC_SET][LLC_WAY]; 5 | 6 | // initialize replacement state 7 | void CACHE::llc_initialize_replacement() 8 | { 9 | cout << "Initialize SRRIP state" << endl; 10 | 11 | for (int i=0; i out_${bench[i]}_${prefetch[j]}_${btb[k]} 89 | sed "s/XXX/${bench[i]} ${runName[j]}_${btb[k]}/g" out_${bench[i]}_${prefetch[j]}_${btb[k]} > Sout_${bench[i]}_${prefetch[j]}_${btb[k]} 90 | done 91 | done 92 | done 93 | 94 | rm out_* 95 | cat Sout_* > all_res 96 | rm Sout_* 97 | -------------------------------------------------------------------------------- /tracer/makefile.rules: -------------------------------------------------------------------------------- 1 | ############################################################## 2 | # 3 | # This file includes all the test targets as well as all the 4 | # non-default build rules and test recipes. 5 | # 6 | ############################################################## 7 | 8 | 9 | ############################################################## 10 | # 11 | # Test targets 12 | # 13 | ############################################################## 14 | 15 | ###### Place all generic definitions here ###### 16 | 17 | # This defines tests which run tools of the same name. 
This is simply for convenience to avoid 18 | # defining the test name twice (once in TOOL_ROOTS and again in TEST_ROOTS). 19 | # Tests defined here should not be defined in TOOL_ROOTS and TEST_ROOTS. 20 | TEST_TOOL_ROOTS := MyPinTool 21 | 22 | # This defines the tests to be run that were not already defined in TEST_TOOL_ROOTS. 23 | TEST_ROOTS := 24 | 25 | # This defines a list of tests that should run in the "short" sanity. Tests in this list must also 26 | # appear either in the TEST_TOOL_ROOTS or the TEST_ROOTS list. 27 | # If the entire directory should be tested in sanity, assign TEST_TOOL_ROOTS and TEST_ROOTS to the 28 | # SANITY_SUBSET variable in the tests section below (see example in makefile.rules.tmpl). 29 | SANITY_SUBSET := 30 | 31 | # This defines the tools which will be run during the the tests, and were not already defined in 32 | # TEST_TOOL_ROOTS. 33 | TOOL_ROOTS := 34 | 35 | # This defines the static analysis tools which will be run during the the tests. They should not 36 | # be defined in TEST_TOOL_ROOTS. If a test with the same name exists, it should be defined in 37 | # TEST_ROOTS. 38 | # Note: Static analysis tools are in fact executables linked with the Pin Static Analysis Library. 39 | # This library provides a subset of the Pin APIs which allows the tool to perform static analysis 40 | # of an application or dll. Pin itself is not used when this tool runs. 41 | SA_TOOL_ROOTS := 42 | 43 | # This defines all the applications that will be run during the tests. 44 | APP_ROOTS := 45 | 46 | # This defines any additional object files that need to be compiled. 47 | OBJECT_ROOTS := 48 | 49 | # This defines any additional dlls (shared objects), other than the pintools, that need to be compiled. 50 | DLL_ROOTS := 51 | 52 | # This defines any static libraries (archives), that need to be built. 
53 | LIB_ROOTS := 54 | 55 | 56 | ############################################################## 57 | # 58 | # Test recipes 59 | # 60 | ############################################################## 61 | 62 | # This section contains recipes for tests other than the default. 63 | # See makefile.default.rules for the default test rules. 64 | # All tests in this section should adhere to the naming convention: .test 65 | 66 | 67 | ############################################################## 68 | # 69 | # Build rules 70 | # 71 | ############################################################## 72 | 73 | # This section contains the build rules for all binaries that have special build rules. 74 | # See makefile.default.rules for the default build rules. 75 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 |

ChampSim

3 |

ChampSim is a trace-based simulator for a microarchitecture study. You can sign up to the public mailing list by sending an empty mail to champsim+subscribe@googlegroups.com.

4 |

5 | 6 | 7 | # Compile 8 | 9 | Champsim needs to be compiled with three BTB designs (convBTB, pdede, and BTBX) and two instruction prefetchers (no, fdip). 10 | 11 | Important note on compilation: IFETCH_BUFFER needs to be 128 entries when compiling with “fdip” prefetcher and “FETCH_WIDTH*2” entries when compiling with “no” prefetcher. This is because of how instruction fetch is implemented in baseline Champsim. IFETCH_BUFFER size is defined in line 63 of //inc/ooo_cpu.h 12 | 13 | Use the following commands to compile the code: 14 | 15 | (First set “IFETCH_BUFFER” to “FETCH_WIDTH*2” in line 63 of //inc/ooo_cpu.h) 16 | 17 | ./build_champsim.sh hashed_perceptron convBTB no next_line spp_dev no lru 1 18 | 19 | ./build_champsim.sh hashed_perceptron pdede no next_line spp_dev no lru 1 20 | 21 | ./build_champsim.sh hashed_perceptron BTBX no next_line spp_dev no lru 1 22 | 23 | 24 | (Set “IFETCH_BUFFER” to “128” in line 63 of //inc/ooo_cpu.h) 25 | 26 | ./build_champsim.sh hashed_perceptron convBTB fdip next_line spp_dev no lru 1 27 | 28 | ./build_champsim.sh hashed_perceptron pdede fdip next_line spp_dev no lru 1 29 | 30 | ./build_champsim.sh hashed_perceptron BTBX fdip next_line spp_dev no lru 1 31 | 32 | 33 | # Download IPC-1 trace 34 | 35 | The traces can be downloaded from https://drive.google.com/file/d/1qs8t8-YWc7lLoYbjbH_d3lf1xdoYBznf/view?usp=sharing 36 | 37 | Once downloaded, place them in “/dpc3_traces/” directory. 38 | 39 | # Generating configuration files 40 | 41 | Go to directory //launch/scripts/. In script file createConfig.sh, point PATH_TO_CHAMPSIM to . Run this script (./createConfig.sh) to generate config files needed by Champsim. 42 | 43 | # Run simulation 44 | 45 | Running all workloads: Go to directory //launch/. In script file launch.sh, replace the line (line 64) with the command to run experiments on your cluster. A sample command is given that runs experiments on our cluster. 
Running this script (./launch.sh) will run simulations, and the stats will be stored in directory //results_50M/. 46 | 47 | Running a single workload: Use the following command (in directory //) to run simulation for a single workload: 48 | 49 | ./run_champsim.sh hashed_perceptron-BTBX-no-next_line-spp_dev-no-lru-1core 50 50 server_001.champsimtrace.xz 50 | 51 | This command will simulate server_001 workload with BTBX and no instruction prefetching. 52 | 53 | # Collecting results 54 | 55 | Go to directory //collectStats/. Run the script getResults.sh, and it will collect results from all workloads and save them in a file “all_res”. 56 | 57 | # Plotting results 58 | 59 | Download the “all_res” file. Open the provided excel file "BTBX_artifact_results.xlsx". Click on “Data” in MS-Excel top menu bar. Click on “Refresh All” in “Queries and Connections” ribbon, go to the folder where you stored “all_res” and double click on “all_res”. Now “Offset Distribution”, “MPKI”, and “Performance” sheets in the excel file should have plots for Figure 4, Figure 9, and Figure 10 respectively. 
60 | 61 | # BTBX-HPCA23 62 | -------------------------------------------------------------------------------- /replacement/base_replacement.cc: -------------------------------------------------------------------------------- 1 | #include "cache.h" 2 | 3 | uint32_t CACHE::find_victim(uint32_t cpu, uint64_t instr_id, uint32_t set, const BLOCK *current_set, uint64_t ip, uint64_t full_addr, uint32_t type) 4 | { 5 | // baseline LRU replacement policy for other caches 6 | return lru_victim(cpu, instr_id, set, current_set, ip, full_addr, type); 7 | } 8 | 9 | void CACHE::update_replacement_state(uint32_t cpu, uint32_t set, uint32_t way, uint64_t full_addr, uint64_t ip, uint64_t victim_addr, uint32_t type, uint8_t hit) 10 | { 11 | if (type == WRITEBACK) { 12 | if (hit) // wrietback hit does not update LRU state 13 | return; 14 | } 15 | 16 | return lru_update(set, way); 17 | } 18 | 19 | uint32_t CACHE::lru_victim(uint32_t cpu, uint64_t instr_id, uint32_t set, const BLOCK *current_set, uint64_t ip, uint64_t full_addr, uint32_t type) 20 | { 21 | uint32_t way = 0; 22 | 23 | // fill invalid line first 24 | for (way=0; way>LOG2_BLOCK_SIZE) << " victim address: " << block[set][way].address << " data: " << block[set][way].data; 30 | cout << dec << " lru: " << block[set][way].lru << endl; }); 31 | 32 | break; 33 | } 34 | } 35 | 36 | // LRU victim 37 | if (way == NUM_WAY) { 38 | for (way=0; way>LOG2_BLOCK_SIZE) << " victim address: " << block[set][way].address << " data: " << block[set][way].data; 44 | cout << dec << " lru: " << block[set][way].lru << endl; }); 45 | 46 | break; 47 | } 48 | } 49 | } 50 | 51 | if (way == NUM_WAY) { 52 | cerr << "[" << NAME << "] " << __func__ << " no victim! 
set: " << set << endl; 53 | assert(0); 54 | } 55 | 56 | return way; 57 | } 58 | 59 | void CACHE::lru_update(uint32_t set, uint32_t way) 60 | { 61 | // update lru replacement state 62 | for (uint32_t i=0; i 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | // USEFUL MACROS 25 | //#define DEBUG_PRINT 26 | #define SANITY_CHECK 27 | #define LLC_BYPASS 28 | #define DRC_BYPASS 29 | #define NO_CRC2_COMPILE 30 | 31 | #ifdef DEBUG_PRINT 32 | #define DP(x) x 33 | #else 34 | #define DP(x) 35 | #endif 36 | 37 | // CPU 38 | #define NUM_CPUS 1 39 | #define CPU_FREQ 4000 40 | #define DRAM_IO_FREQ 3200 41 | #define PAGE_SIZE 4096 42 | #define LOG2_PAGE_SIZE 12 43 | 44 | // CACHE 45 | #define BLOCK_SIZE 64 46 | #define LOG2_BLOCK_SIZE 6 47 | #define MAX_READ_PER_CYCLE 8 48 | #define MAX_FILL_PER_CYCLE 1 49 | 50 | #define INFLIGHT 1 51 | #define COMPLETED 2 52 | 53 | #define FILL_L1 1 54 | #define FILL_L2 2 55 | #define FILL_LLC 4 56 | #define FILL_DRC 8 57 | #define FILL_DRAM 16 58 | 59 | // DRAM 60 | #define DRAM_CHANNELS 1 // default: assuming one DIMM per one channel 4GB * 1 => 4GB off-chip memory 61 | #define LOG2_DRAM_CHANNELS 0 62 | #define DRAM_RANKS 1 // 512MB * 8 ranks => 4GB per DIMM 63 | #define LOG2_DRAM_RANKS 0 64 | #define DRAM_BANKS 8 // 64MB * 8 banks => 512MB per rank 65 | #define LOG2_DRAM_BANKS 3 66 | #define DRAM_ROWS 65536 // 2KB * 32K rows => 64MB per bank 67 | #define LOG2_DRAM_ROWS 16 68 | #define DRAM_COLUMNS 128 // 64B * 32 column chunks (Assuming 1B DRAM cell * 8 chips * 8 transactions = 64B size of column chunks) => 2KB per row 69 | #define LOG2_DRAM_COLUMNS 7 70 | #define DRAM_ROW_SIZE (BLOCK_SIZE*DRAM_COLUMNS/1024) 71 | 72 | #define DRAM_SIZE (DRAM_CHANNELS*DRAM_RANKS*DRAM_BANKS*DRAM_ROWS*DRAM_ROW_SIZE/1024) 73 | #define DRAM_PAGES ((DRAM_SIZE<<10)>>2) 74 | 
//#define DRAM_PAGES 10 75 | 76 | using namespace std; 77 | 78 | extern uint8_t warmup_complete[NUM_CPUS], 79 | simulation_complete[NUM_CPUS], 80 | all_warmup_complete, 81 | all_simulation_complete, 82 | MAX_INSTR_DESTINATIONS, 83 | knob_cloudsuite, 84 | knob_low_bandwidth; 85 | 86 | extern uint64_t current_core_cycle[NUM_CPUS], 87 | stall_cycle[NUM_CPUS], 88 | last_drc_read_mode, 89 | last_drc_write_mode, 90 | drc_blocks; 91 | 92 | extern queue page_queue; 93 | extern map page_table, inverse_table, recent_page, unique_cl[NUM_CPUS]; 94 | extern uint64_t previous_ppage, num_adjacent_page, num_cl[NUM_CPUS], allocated_pages, num_page[NUM_CPUS], minor_fault[NUM_CPUS], major_fault[NUM_CPUS]; 95 | 96 | void print_stats(); 97 | uint64_t rotl64 (uint64_t n, unsigned int c), 98 | rotr64 (uint64_t n, unsigned int c), 99 | va_to_pa(uint32_t cpu, uint64_t instr_id, uint64_t va, uint64_t unique_vpage, uint8_t is_code); 100 | 101 | // log base 2 function from efectiu 102 | int lg2(int n); 103 | 104 | // smart random number generator 105 | class RANDOM { 106 | public: 107 | std::random_device rd; 108 | std::mt19937_64 engine{rd()}; 109 | std::uniform_int_distribution dist{0, 0xFFFFFFFFF}; // used to generate random physical page numbers 110 | 111 | RANDOM (uint64_t seed) { 112 | engine.seed(seed); 113 | } 114 | 115 | uint64_t draw_rand() { 116 | return dist(engine); 117 | }; 118 | }; 119 | extern uint64_t champsim_seed; 120 | #endif 121 | -------------------------------------------------------------------------------- /inc/dram_controller.h: -------------------------------------------------------------------------------- 1 | #ifndef DRAM_H 2 | #define DRAM_H 3 | 4 | #include "memory_class.h" 5 | 6 | // DRAM configuration 7 | #define DRAM_CHANNEL_WIDTH 8 // 8B 8 | #define DRAM_WQ_SIZE 64 9 | #define DRAM_RQ_SIZE 64 10 | 11 | #define tRP_DRAM_NANOSECONDS 12.5 12 | #define tRCD_DRAM_NANOSECONDS 12.5 13 | #define tCAS_DRAM_NANOSECONDS 12.5 14 | 15 | // the data bus must wait this 
amount of time when switching between reads and writes, and vice versa 16 | #define DRAM_DBUS_TURN_AROUND_TIME ((15*CPU_FREQ)/2000) // 7.5 ns 17 | extern uint32_t DRAM_MTPS, DRAM_DBUS_RETURN_TIME; 18 | 19 | // these values control when to send out a burst of writes 20 | #define DRAM_WRITE_HIGH_WM ((DRAM_WQ_SIZE*7)>>3) // 7/8th 21 | #define DRAM_WRITE_LOW_WM ((DRAM_WQ_SIZE*3)>>2) // 6/8th 22 | #define MIN_DRAM_WRITES_PER_SWITCH (DRAM_WQ_SIZE*1/4) 23 | 24 | // DRAM 25 | class MEMORY_CONTROLLER : public MEMORY { 26 | public: 27 | const string NAME; 28 | 29 | DRAM_ARRAY dram_array[DRAM_CHANNELS][DRAM_RANKS][DRAM_BANKS]; 30 | uint64_t dbus_cycle_available[DRAM_CHANNELS], dbus_cycle_congested[DRAM_CHANNELS], dbus_congested[NUM_TYPES+1][NUM_TYPES+1]; 31 | uint64_t bank_cycle_available[DRAM_CHANNELS][DRAM_RANKS][DRAM_BANKS]; 32 | uint8_t do_write, write_mode[DRAM_CHANNELS]; 33 | uint32_t processed_writes, scheduled_reads[DRAM_CHANNELS], scheduled_writes[DRAM_CHANNELS]; 34 | int fill_level; 35 | 36 | BANK_REQUEST bank_request[DRAM_CHANNELS][DRAM_RANKS][DRAM_BANKS]; 37 | 38 | // queues 39 | PACKET_QUEUE WQ[DRAM_CHANNELS], RQ[DRAM_CHANNELS]; 40 | 41 | // constructor 42 | MEMORY_CONTROLLER(string v1) : NAME (v1) { 43 | for (uint32_t i=0; i PSEL_THRS) { // follow BIP 84 | rrpv[set][way] = maxRRPV; 85 | 86 | bip_counter++; 87 | if (bip_counter == BIP_MAX) 88 | bip_counter = 0; 89 | if (bip_counter == 0) 90 | rrpv[set][way] = maxRRPV-1; 91 | } else // follow SRRIP 92 | rrpv[set][way] = maxRRPV-1; 93 | 94 | } else if (leader == 0) { // leader 0: BIP 95 | if (PSEL[cpu] > 0) PSEL[cpu]--; 96 | rrpv[set][way] = maxRRPV; 97 | 98 | bip_counter++; 99 | if (bip_counter == BIP_MAX) bip_counter = 0; 100 | if (bip_counter == 0) rrpv[set][way] = maxRRPV-1; 101 | 102 | } else if (leader == 1) { // leader 1: SRRIP 103 | if (PSEL[cpu] < PSEL_MAX) PSEL[cpu]++; 104 | rrpv[set][way] = maxRRPV-1; 105 | 106 | } else // WE SHOULD NOT REACH HERE 107 | assert(0); 108 | } 109 | 110 | // find replacement 
victim 111 | uint32_t CACHE::llc_find_victim(uint32_t cpu, uint64_t instr_id, uint32_t set, const BLOCK *current_set, uint64_t ip, uint64_t full_addr, uint32_t type) 112 | { 113 | // look for the maxRRPV line 114 | while (1) 115 | { 116 | for (int i=0; i> LOG2_BLOCK_SIZE; 56 | 57 | int index = -1; 58 | for (index=0; index trackers[index].last_cl_addr) 96 | stride = cl_addr - trackers[index].last_cl_addr; 97 | else { 98 | stride = trackers[index].last_cl_addr - cl_addr; 99 | stride *= -1; 100 | } 101 | 102 | //cout << "[IP_STRIDE] HIT index: " << index << " lru: " << trackers[index].lru << " ip: " << hex << ip << " cl_addr: " << cl_addr << dec << " stride: " << stride << endl; 103 | 104 | // don't do anything if we somehow saw the same address twice in a row 105 | if (stride == 0) 106 | return metadata_in; 107 | 108 | // only do any prefetching if there's a pattern of seeing the same 109 | // stride more than once 110 | if (stride == trackers[index].last_stride) { 111 | 112 | // do some prefetching 113 | for (int i=0; i> LOG2_PAGE_SIZE) != (addr >> LOG2_PAGE_SIZE)) 119 | break; 120 | 121 | // check the MSHR occupancy to decide if we're going to prefetch to the L2 or LLC 122 | if (MSHR.occupancy < (MSHR.SIZE>>1)) 123 | prefetch_line(ip, addr, pf_address, FILL_L2, 0); 124 | else 125 | prefetch_line(ip, addr, pf_address, FILL_LLC, 0); 126 | } 127 | } 128 | 129 | trackers[index].last_cl_addr = cl_addr; 130 | trackers[index].last_stride = stride; 131 | 132 | for (int i=0; ifull_addr) { 12 | DP (if (warmup_complete[packet->cpu]) { 13 | cout << "[" << NAME << "] " << __func__ << " cpu: " << packet->cpu << " instr_id: " << packet->instr_id << " same address: " << hex << packet->address; 14 | cout << " full_addr: " << packet->full_addr << dec << " by instr_id: " << entry[i].instr_id << " index: " << i; 15 | cout << " cycle " << packet->event_cycle << endl; }); 16 | return i; 17 | } 18 | } 19 | else { 20 | if (entry[i].address == packet->address) { 21 | DP (if 
(warmup_complete[packet->cpu]) { 22 | cout << "[" << NAME << "] " << __func__ << " cpu: " << packet->cpu << " instr_id: " << packet->instr_id << " same address: " << hex << packet->address; 23 | cout << " full_addr: " << packet->full_addr << dec << " by instr_id: " << entry[i].instr_id << " index: " << i; 24 | cout << " cycle " << packet->event_cycle << endl; }); 25 | return i; 26 | } 27 | } 28 | } 29 | } 30 | else { 31 | for (uint32_t i=head; ifull_addr) { 34 | DP (if (warmup_complete[packet->cpu]) { 35 | cout << "[" << NAME << "] " << __func__ << " cpu: " << packet->cpu << " instr_id: " << packet->instr_id << " same address: " << hex << packet->address; 36 | cout << " full_addr: " << packet->full_addr << dec << " by instr_id: " << entry[i].instr_id << " index: " << i; 37 | cout << " cycle " << packet->event_cycle << endl; }); 38 | return i; 39 | } 40 | } 41 | else { 42 | if (entry[i].address == packet->address) { 43 | DP (if (warmup_complete[packet->cpu]) { 44 | cout << "[" << NAME << "] " << __func__ << " cpu: " << packet->cpu << " instr_id: " << packet->instr_id << " same address: " << hex << packet->address; 45 | cout << " full_addr: " << packet->full_addr << dec << " by instr_id: " << entry[i].instr_id << " index: " << i; 46 | cout << " cycle " << packet->event_cycle << endl; }); 47 | return i; 48 | } 49 | } 50 | } 51 | for (uint32_t i=0; ifull_addr) { 54 | DP (if (warmup_complete[packet->cpu]) { 55 | cout << "[" << NAME << "] " << __func__ << " cpu: " << packet->cpu << " instr_id: " << packet->instr_id << " same address: " << hex << packet->address; 56 | cout << " full_addr: " << packet->full_addr << dec << " by instr_id: " << entry[i].instr_id << " index: " << i; 57 | cout << " cycle " << packet->event_cycle << endl; }); 58 | return i; 59 | } 60 | } 61 | else { 62 | if (entry[i].address == packet->address) { 63 | DP (if (warmup_complete[packet->cpu]) { 64 | cout << "[" << NAME << "] " << __func__ << " cpu: " << packet->cpu << " instr_id: " << 
packet->instr_id << " same address: " << hex << packet->address; 65 | cout << " full_addr: " << packet->full_addr << dec << " by instr_id: " << entry[i].instr_id << " index: " << i; 66 | cout << " cycle " << packet->event_cycle << endl; }); 67 | return i; 68 | } 69 | } 70 | } 71 | } 72 | 73 | return -1; 74 | } 75 | 76 | void PACKET_QUEUE::add_queue(PACKET *packet) 77 | { 78 | #ifdef SANITY_CHECK 79 | if (occupancy && (head == tail)) 80 | assert(0); 81 | #endif 82 | 83 | // add entry 84 | entry[tail] = *packet; 85 | 86 | DP ( if (warmup_complete[packet->cpu]) { 87 | cout << "[" << NAME << "] " << __func__ << " cpu: " << packet->cpu << " instr_id: " << packet->instr_id; 88 | cout << " address: " << hex << entry[tail].address << " full_addr: " << entry[tail].full_addr << dec; 89 | cout << " head: " << head << " tail: " << tail << " occupancy: " << occupancy << " event_cycle: " << entry[tail].event_cycle << endl; }); 90 | 91 | occupancy++; 92 | tail++; 93 | if (tail >= SIZE) 94 | tail = 0; 95 | } 96 | 97 | void PACKET_QUEUE::remove_queue(PACKET *packet) 98 | { 99 | #ifdef SANITY_CHECK 100 | if ((occupancy == 0) && (head == tail)) 101 | assert(0); 102 | #endif 103 | 104 | DP ( if (warmup_complete[packet->cpu]) { 105 | cout << "[" << NAME << "] " << __func__ << " cpu: " << packet->cpu << " instr_id: " << packet->instr_id; 106 | cout << " address: " << hex << packet->address << " full_addr: " << packet->full_addr << dec << " fill_level: " << packet->fill_level; 107 | cout << " head: " << head << " tail: " << tail << " occupancy: " << occupancy << " event_cycle: " << packet->event_cycle << endl; }); 108 | 109 | // reset entry 110 | PACKET empty_packet; 111 | *packet = empty_packet; 112 | 113 | occupancy--; 114 | head++; 115 | if (head >= SIZE) 116 | head = 0; 117 | } 118 | -------------------------------------------------------------------------------- /prefetcher/fdip.l1i_pref: -------------------------------------------------------------------------------- 1 | #include 
"ooo_cpu.h" 2 | #include 3 | #include 4 | #include 5 | 6 | 7 | /**************************************** Basic Structures **************************************/ 8 | 9 | #define MAX_PFETCHQ_ENTRIES 48 10 | #define MAX_RECENT_PFETCH 10 11 | 12 | 13 | /**************************************** Compoenets for prefetching **************************************/ 14 | 15 | std::deque prefetch_queue; //Storage: 64-bits * 48 (queue size) = 384 bytes 16 | std::deque prefetch_queue_spec; //Storage: 64-bits * 48 (queue size) = 384 bytes 17 | std::deque recent_prefetches; //Storage: 64-bits * 10 (queue size) = 80 bytes 18 | uint64_t disp[66] = {0}; 19 | uint64_t lowBits[66] = {0}; 20 | 21 | /**************************************** Prefetcher Operation **************************************/ 22 | 23 | 24 | 25 | void O3_CPU::l1i_prefetcher_initialize() 26 | { 27 | 28 | } 29 | 30 | void O3_CPU::l1i_prefetcher_instruction_operate(uint64_t ip) 31 | { 32 | prefetch_queue_spec.clear(); 33 | uint64_t block_addr = ((ip >> LOG2_BLOCK_SIZE) << LOG2_BLOCK_SIZE); 34 | if (block_addr == 0) 35 | return; 36 | 37 | std::deque::iterator it = std::find(prefetch_queue.begin(), prefetch_queue.end(), block_addr); 38 | if (it == prefetch_queue.end()) { 39 | std::deque::iterator it1 = std::find(recent_prefetches.begin(), recent_prefetches.end(), block_addr); 40 | if (it1 == recent_prefetches.end()) { 41 | prefetch_queue.push_back(block_addr); 42 | } 43 | } 44 | } 45 | 46 | void O3_CPU::l1i_prefetcher_instruction_spec_operate(uint64_t ip) 47 | { 48 | uint64_t block_addr = ((ip >> LOG2_BLOCK_SIZE) << LOG2_BLOCK_SIZE); 49 | if (block_addr == 0) 50 | return; 51 | 52 | std::deque::iterator it = std::find(prefetch_queue.begin(), prefetch_queue.end(), block_addr); 53 | if (it == prefetch_queue.end()) { 54 | std::deque::iterator it1 = std::find(recent_prefetches.begin(), recent_prefetches.end(), block_addr); 55 | if (it1 == recent_prefetches.end()) { 56 | std::deque::iterator it2 = 
std::find(prefetch_queue_spec.begin(), prefetch_queue_spec.end(), block_addr); 57 | if (it2 == prefetch_queue_spec.end()) { 58 | prefetch_queue_spec.push_back(block_addr); 59 | } 60 | } 61 | } 62 | } 63 | 64 | void O3_CPU::l1i_prefetcher_branch_operate(uint64_t ip, uint8_t branch_type, uint64_t branch_target) 65 | { 66 | if (branch_target && branch_type != BRANCH_RETURN) { 67 | /*Find the number of bits needed to encode the target offset*/ 68 | uint64_t target_offset; 69 | if (branch_target > ip) { 70 | target_offset = branch_target - ip; 71 | } else { 72 | target_offset = ip - branch_target; 73 | } 74 | 75 | int num_bits = 0; 76 | if (target_offset) { 77 | num_bits = (int)(log2((double)target_offset)); 78 | /* The cast "(int)log2" rounds down to lower integer, however we want to round it to upper integer, so add 1 to "num_bits" 79 | * As an offset can be both positive and negative, we need to add 1 sign bit to "num_bits". 80 | * */ 81 | } 82 | 83 | //num_bits += 2; //Not needed if the distance/displacement is in 4 byte instructions instead of bytes 84 | 85 | /**********************************************************************************************************************/ 86 | uint64_t diff_bits = (branch_target >> 2) ^ (ip >> 2); 87 | int num_lower_bits = 0; 88 | while (diff_bits != 0) { 89 | diff_bits = diff_bits >> 1; 90 | num_lower_bits++; 91 | } 92 | //cout << "Target " << hex << branch_target << " ip " << ip << " num_bits " << dec << num_bits << " num_lower_bits " << num_lower_bits << endl; 93 | 94 | if ((num_bits - 3) > num_lower_bits) { 95 | cout << "Target " << hex << branch_target << " ip " << ip << " num_bits " << dec << num_bits << " num_lower_bits " << num_lower_bits << endl; 96 | cout << "This is wierd" << endl; 97 | assert(0); 98 | } 99 | 100 | /*********************************************************************************************************************/ 101 | //if (branch_type != BRANCH_RETURN) { 102 | disp[num_bits]++; 103 | 
lowBits[num_lower_bits]++; 104 | //} 105 | assert(num_bits >= 0 && num_bits < 66); 106 | 107 | } 108 | if (branch_type == BRANCH_RETURN) { 109 | disp[0]++; 110 | lowBits[0]++; 111 | } 112 | } 113 | 114 | void O3_CPU::l1i_prefetcher_cache_operate(uint64_t v_addr, uint8_t cache_hit, uint8_t prefetch_hit) 115 | { 116 | if((cache_hit == 0) && (L1I.MSHR.occupancy < (L1I.MSHR.SIZE>>1))) 117 | { 118 | uint64_t pf_addr = v_addr + (1<>1) && L1I.PQ.occupancy < L1I.PQ.SIZE) { 130 | prefetch_code_line(prefetch_queue.front()); 131 | recent_prefetches.push_back(prefetch_queue.front()); 132 | if (recent_prefetches.size() > MAX_RECENT_PFETCH) { 133 | recent_prefetches.pop_front(); 134 | } 135 | 136 | prefetch_queue.pop_front(); 137 | } 138 | } else if (prefetch_queue_spec.size()) { 139 | if (L1I.MSHR.occupancy < (L1I.MSHR.SIZE>>1) && L1I.PQ.occupancy < L1I.PQ.SIZE) { 140 | prefetch_code_line(prefetch_queue_spec.front()); 141 | recent_prefetches.push_back(prefetch_queue_spec.front()); 142 | if (recent_prefetches.size() > MAX_RECENT_PFETCH) { 143 | recent_prefetches.pop_front(); 144 | } 145 | 146 | prefetch_queue_spec.pop_front(); 147 | } 148 | } 149 | } 150 | 151 | void O3_CPU::l1i_prefetcher_cache_fill(uint64_t v_addr, uint32_t set, uint32_t way, uint8_t prefetch, uint64_t evicted_v_addr) 152 | { 153 | 154 | } 155 | 156 | void O3_CPU::l1i_prefetcher_final_stats() 157 | { 158 | for(int i = 0; i < 66; i++) { 159 | cout << "XXX disp-" << i << " " << disp[i] << endl; 160 | } 161 | 162 | for(int i = 0; i < 66; i++) { 163 | cout << "XXX diffBits-" << i << " " << lowBits[i] << endl; 164 | } 165 | } 166 | -------------------------------------------------------------------------------- /inc/set.h: -------------------------------------------------------------------------------- 1 | /* 2 | * This file defines a specalized bitset data structure that uses 64 bit 3 | * words to store bits in a set, but does something special for small 4 | * sets to make it faster. 
5 | */ 6 | 7 | #ifndef __SET_H 8 | #define __SET_H 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #define TYPE unsigned short int 15 | //#define MAX_SIZE ROB_SIZE 16 | // sethpugsley - changed this from ROB_SIZE to allow for non-power-of-2 ROB sizes, like real CPUs have 17 | // but MAX_SIZE here still requires a power-of-2 number 18 | #define MAX_SIZE 512 19 | 20 | // tuned empirically 21 | 22 | #define SMALL_SIZE 13 23 | #define SMALLER_SIZE 6 24 | 25 | class fastset { 26 | union { 27 | // values for a small set 28 | TYPE 29 | values[SMALL_SIZE]; 30 | 31 | // the bits representing the set 32 | unsigned long long int 33 | bits[MAX_SIZE/64]; 34 | } data; 35 | 36 | int 37 | card; // cardinality of small set 38 | 39 | // set a bit in the bits 40 | 41 | void setbit (TYPE x) { 42 | int word = x >> 6; 43 | int bit = x & 63; 44 | data.bits[word] |= 1ull << bit; 45 | } 46 | 47 | // get one of the bits 48 | 49 | bool getbit (TYPE x) { 50 | int word = x >> 6; 51 | int bit = x & 63; 52 | return (data.bits[word] >> bit) & 1; 53 | } 54 | 55 | // insert an item into a small set 56 | 57 | void insert_small (TYPE x) { 58 | int i; 59 | for (i=0; i x) break; 63 | } 64 | // x belongs in i; move everything from v[i] through v[n-1] 65 | // to v[i+1] through v[n] 66 | for (int j=card-1; j>=i; j--) data.values[j+1] = data.values[j]; 67 | // the loop seems a little faster than memmove 68 | //memmove (&data.values[i+1], &data.values[i], (sizeof (TYPE) * (card-i))); 69 | data.values[i] = x; 70 | card++; 71 | } 72 | 73 | 74 | // do a linear search in a small set 75 | 76 | bool search_small_linear (TYPE x) { 77 | for (int i=0; i x) return false; 80 | if (y == x) return true; 81 | } 82 | return false; 83 | } 84 | 85 | 86 | // search a small set, specializing for the set size 87 | 88 | bool search_small (TYPE x) { 89 | 90 | // no elements? we're done. 
91 | 92 | if (!card) return false; 93 | 94 | // below a certain size linear search is faster 95 | 96 | if (card < SMALLER_SIZE) return search_small_linear (x); 97 | 98 | // do a binary search for the item 99 | 100 | int begin = 0; 101 | int end = card-1; 102 | int middle = end/2; 103 | for (;;) { 104 | TYPE y = data.values[middle]; 105 | if (x < y) { 106 | end = middle-1; 107 | } else if (x > y) { 108 | begin = middle+1; 109 | } else return true; 110 | if (end < begin) break; 111 | middle = (begin + end) / 2; 112 | // assert (middle < card && middle >= 0); 113 | } 114 | return false; 115 | } 116 | 117 | // convert a small set into a bitset 118 | 119 | void smalltobit (void) { 120 | 121 | // we have to use a temporary array to hold the small set contents 122 | // because the small set and bitset occupy the same memory 123 | 124 | TYPE tmp[SMALL_SIZE]; 125 | memcpy (tmp, data.values, sizeof (TYPE) * card); 126 | memset (data.bits, 0, sizeof (data.bits)); 127 | for (int i=0; i= SMALL_SIZE); 208 | } 209 | 210 | // lim is the next multiple of 64 211 | 212 | int lim = ((n | 63) + 1) / 64; 213 | 214 | // bitwise OR the other bits into this set 215 | for (int i=0; i 51 | #include 52 | #include 53 | #include 54 | 55 | #include "ooo_cpu.h" 56 | 57 | // this many tables 58 | 59 | #define NTABLES 16 60 | 61 | // maximum history length 62 | 63 | #define MAXHIST 232 64 | 65 | // minimum history length (for table 1; table 0 is biases) 66 | 67 | #define MINHIST 3 68 | 69 | // speed for dynamic threshold setting 70 | 71 | #define SPEED 18 72 | 73 | // geometric global history lengths 74 | 75 | int history_lengths[NTABLES] = { 0, 3, 4, 6, 8, 10, 14, 19, 26, 36, 49, 67, 91, 125, 170, MAXHIST }; 76 | 77 | // 12-bit indices for the tables 78 | 79 | #define LOG_TABLE_SIZE 12 80 | #define TABLE_SIZE (1<= 1; 174 | } 175 | 176 | void O3_CPU::last_branch_result(uint64_t pc, uint8_t taken) { 177 | 178 | // was this prediction correct? 
179 | 180 | bool correct = taken == (yout[cpu] >= 1); 181 | 182 | // insert this branch outcome into the global history 183 | 184 | bool b = taken; 185 | for (int i=0; i -128) (*c)--; 217 | } 218 | } 219 | 220 | // dynamic threshold setting from Seznec's O-GEHL paper 221 | 222 | if (!correct) { 223 | 224 | // increase theta after enough mispredictions 225 | 226 | tc[cpu]++; 227 | if (tc[cpu] >= SPEED) { 228 | theta[cpu]++; 229 | tc[cpu] = 0; 230 | } 231 | } else if (a < theta[cpu]) { 232 | 233 | // decrease theta after enough weak but correct predictions 234 | 235 | tc[cpu]--; 236 | if (tc[cpu] <= -SPEED) { 237 | theta[cpu]--; 238 | tc[cpu] = 0; 239 | } 240 | } 241 | } 242 | } 243 | -------------------------------------------------------------------------------- /inc/cache.h: -------------------------------------------------------------------------------- 1 | #ifndef CACHE_H 2 | #define CACHE_H 3 | 4 | #include "memory_class.h" 5 | 6 | // PAGE 7 | extern uint32_t PAGE_TABLE_LATENCY, SWAP_LATENCY; 8 | 9 | // CACHE TYPE 10 | #define IS_ITLB 0 11 | #define IS_DTLB 1 12 | #define IS_STLB 2 13 | #define IS_L1I 3 14 | #define IS_L1D 4 15 | #define IS_L2C 5 16 | #define IS_LLC 6 17 | 18 | // INSTRUCTION TLB 19 | #define ITLB_SET 16 20 | #define ITLB_WAY 4 21 | #define ITLB_RQ_SIZE 16 22 | #define ITLB_WQ_SIZE 16 23 | #define ITLB_PQ_SIZE 0 24 | #define ITLB_MSHR_SIZE 8 25 | #define ITLB_LATENCY 1 26 | 27 | // DATA TLB 28 | #define DTLB_SET 16 29 | #define DTLB_WAY 4 30 | #define DTLB_RQ_SIZE 16 31 | #define DTLB_WQ_SIZE 16 32 | #define DTLB_PQ_SIZE 0 33 | #define DTLB_MSHR_SIZE 8 34 | #define DTLB_LATENCY 1 35 | 36 | // SECOND LEVEL TLB 37 | #define STLB_SET 128 38 | #define STLB_WAY 12 39 | #define STLB_RQ_SIZE 32 40 | #define STLB_WQ_SIZE 32 41 | #define STLB_PQ_SIZE 0 42 | #define STLB_MSHR_SIZE 16 43 | #define STLB_LATENCY 8 44 | 45 | // L1 INSTRUCTION CACHE 46 | #define L1I_SET 64 47 | #define L1I_WAY 8 48 | #define L1I_RQ_SIZE 64 49 | #define L1I_WQ_SIZE 64 50 | 
#define L1I_PQ_SIZE 32 51 | #define L1I_MSHR_SIZE 8 52 | #define L1I_LATENCY 4 53 | 54 | // L1 DATA CACHE 55 | #define L1D_SET 64 56 | #define L1D_WAY 12 57 | #define L1D_RQ_SIZE 64 58 | #define L1D_WQ_SIZE 64 59 | #define L1D_PQ_SIZE 8 60 | #define L1D_MSHR_SIZE 16 61 | #define L1D_LATENCY 5 62 | 63 | // L2 CACHE 64 | #define L2C_SET 1024 65 | #define L2C_WAY 8 66 | #define L2C_RQ_SIZE 32 67 | #define L2C_WQ_SIZE 32 68 | #define L2C_PQ_SIZE 16 69 | #define L2C_MSHR_SIZE 32 70 | #define L2C_LATENCY 10 // 4/5 (L1I or L1D) + 10 = 14/15 cycles 71 | 72 | // LAST LEVEL CACHE 73 | #define LLC_SET NUM_CPUS*2048 74 | #define LLC_WAY 16 75 | #define LLC_RQ_SIZE NUM_CPUS*L2C_MSHR_SIZE //48 76 | #define LLC_WQ_SIZE NUM_CPUS*L2C_MSHR_SIZE //48 77 | #define LLC_PQ_SIZE NUM_CPUS*32 78 | #define LLC_MSHR_SIZE NUM_CPUS*64 79 | #define LLC_LATENCY 20 // 4/5 (L1I or L1D) + 10 + 20 = 34/35 cycles 80 | 81 | class CACHE : public MEMORY { 82 | public: 83 | uint32_t cpu; 84 | const string NAME; 85 | const uint32_t NUM_SET, NUM_WAY, NUM_LINE, WQ_SIZE, RQ_SIZE, PQ_SIZE, MSHR_SIZE; 86 | uint32_t LATENCY; 87 | BLOCK **block; 88 | int fill_level; 89 | uint32_t MAX_READ, MAX_FILL; 90 | uint32_t reads_available_this_cycle; 91 | uint8_t cache_type; 92 | 93 | // prefetch stats 94 | uint64_t pf_requested, 95 | pf_issued, 96 | pf_useful, 97 | pf_useless, 98 | pf_fill; 99 | 100 | // queues 101 | PACKET_QUEUE WQ{NAME + "_WQ", WQ_SIZE}, // write queue 102 | RQ{NAME + "_RQ", RQ_SIZE}, // read queue 103 | PQ{NAME + "_PQ", PQ_SIZE}, // prefetch queue 104 | MSHR{NAME + "_MSHR", MSHR_SIZE}, // MSHR 105 | PROCESSED{NAME + "_PROCESSED", ROB_SIZE}; // processed queue 106 | 107 | uint64_t sim_access[NUM_CPUS][NUM_TYPES], 108 | sim_hit[NUM_CPUS][NUM_TYPES], 109 | sim_miss[NUM_CPUS][NUM_TYPES], 110 | roi_access[NUM_CPUS][NUM_TYPES], 111 | roi_hit[NUM_CPUS][NUM_TYPES], 112 | roi_miss[NUM_CPUS][NUM_TYPES]; 113 | 114 | uint64_t total_miss_latency; 115 | 116 | // constructor 117 | CACHE(string v1, uint32_t v2, int 
v3, uint32_t v4, uint32_t v5, uint32_t v6, uint32_t v7, uint32_t v8) 118 | : NAME(v1), NUM_SET(v2), NUM_WAY(v3), NUM_LINE(v4), WQ_SIZE(v5), RQ_SIZE(v6), PQ_SIZE(v7), MSHR_SIZE(v8) { 119 | 120 | LATENCY = 0; 121 | 122 | // cache block 123 | block = new BLOCK* [NUM_SET]; 124 | for (uint32_t i=0; i 3 | #include 4 | 5 | #define maxRRPV 3 6 | #define SHCT_SIZE 16384 7 | #define SHCT_PRIME 16381 8 | #define SAMPLER_SET (256*NUM_CPUS) 9 | #define SAMPLER_WAY LLC_WAY 10 | #define SHCT_MAX 7 11 | 12 | uint32_t rrpv[LLC_SET][LLC_WAY]; 13 | 14 | // sampler structure 15 | class SAMPLER_class 16 | { 17 | public: 18 | uint8_t valid, 19 | type, 20 | used; 21 | 22 | uint64_t tag, cl_addr, ip; 23 | 24 | uint32_t lru; 25 | 26 | SAMPLER_class() { 27 | valid = 0; 28 | type = 0; 29 | used = 0; 30 | 31 | tag = 0; 32 | cl_addr = 0; 33 | ip = 0; 34 | 35 | lru = 0; 36 | }; 37 | }; 38 | 39 | // sampler 40 | uint32_t rand_sets[SAMPLER_SET]; 41 | SAMPLER_class sampler[SAMPLER_SET][SAMPLER_WAY]; 42 | 43 | // prediction table structure 44 | class SHCT_class { 45 | public: 46 | uint32_t counter; 47 | 48 | SHCT_class() { 49 | counter = 0; 50 | }; 51 | }; 52 | SHCT_class SHCT[NUM_CPUS][SHCT_SIZE]; 53 | 54 | // initialize replacement state 55 | void CACHE::llc_initialize_replacement() 56 | { 57 | cout << "Initialize SHIP state" << endl; 58 | 59 | for (int i=0; i 0) 123 | SHCT[cpu][SHCT_idx].counter--; 124 | 125 | /* 126 | if (draw_transition) 127 | printf("cycle: %lu SHCT: %d ip: 0x%llX SAMPLER_HIT cl_addr: 0x%llX page: 0x%llX block: %ld set: %d\n", 128 | ooo_cpu[cpu].current_cycle, SHCT[cpu][SHCT_idx].dead, s_set[match].ip, address>>6, address>>12, (address>>6) & 0x3F, s_idx); 129 | */ 130 | 131 | //s_set[match].ip = ip; // SHIP does not update ip on sampler hit 132 | s_set[match].type = type; 133 | s_set[match].used = 1; 134 | //D(printf("sampler hit cpu: %d set: %d way: %d tag: %x ip: %lx type: %d lru: %d\n", 135 | // cpu, rand_sets[s_idx], match, tag, ip, type, s_set[match].lru)); 136 | 137 | 
break; 138 | } 139 | } 140 | 141 | // check invalid 142 | if (match == SAMPLER_WAY) 143 | { 144 | for (match=0; match>6, address>>12, (address>>6) & 0x3F, s_idx); 178 | */ 179 | } 180 | 181 | s_set[match].tag = tag; 182 | s_set[match].ip = ip; 183 | s_set[match].type = type; 184 | s_set[match].used = 0; 185 | 186 | //D(printf("sampler miss cpu: %d set: %d way: %d tag: %x ip: %lx type: %d lru: %d\n", 187 | // cpu, rand_sets[s_idx], match, tag, ip, type, s_set[match].lru)); 188 | break; 189 | } 190 | } 191 | } 192 | 193 | // update LRU state 194 | uint32_t curr_position = s_set[match].lru; 195 | for (int i=0; i= SHCT_PRIME) 271 | assert(0); 272 | 273 | rrpv[set][way] = maxRRPV-1; 274 | if (SHCT[cpu][SHCT_idx].counter == SHCT_MAX) 275 | rrpv[set][way] = maxRRPV; 276 | } 277 | } 278 | 279 | // use this function to print out your own stats at the end of simulation 280 | void CACHE::llc_replacement_final_stats() 281 | { 282 | 283 | } 284 | -------------------------------------------------------------------------------- /inc/instruction.h: -------------------------------------------------------------------------------- 1 | #ifndef INSTRUCTION_H 2 | #define INSTRUCTION_H 3 | 4 | 5 | // instruction format 6 | #define ROB_SIZE 352 7 | #define LQ_SIZE 128 8 | #define SQ_SIZE 72 9 | #define NUM_INSTR_DESTINATIONS_SPARC 4 10 | #define NUM_INSTR_DESTINATIONS 2 11 | #define NUM_INSTR_SOURCES 4 12 | 13 | // special registers that help us identify branches 14 | #define REG_STACK_POINTER 6 15 | #define REG_FLAGS 25 16 | #define REG_INSTRUCTION_POINTER 26 17 | 18 | // branch types 19 | #define NOT_BRANCH 0 20 | #define BRANCH_DIRECT_JUMP 1 21 | #define BRANCH_INDIRECT 2 22 | #define BRANCH_CONDITIONAL 3 23 | #define BRANCH_DIRECT_CALL 4 24 | #define BRANCH_INDIRECT_CALL 5 25 | #define BRANCH_RETURN 6 26 | #define BRANCH_OTHER 7 27 | 28 | #include "set.h" 29 | 30 | class input_instr { 31 | public: 32 | 33 | // instruction pointer or PC (Program Counter) 34 | uint64_t ip; 35 | 36 | // 
branch info 37 | uint8_t is_branch; 38 | uint8_t branch_taken; 39 | 40 | uint8_t destination_registers[NUM_INSTR_DESTINATIONS]; // output registers 41 | uint8_t source_registers[NUM_INSTR_SOURCES]; // input registers 42 | 43 | uint64_t destination_memory[NUM_INSTR_DESTINATIONS]; // output memory 44 | uint64_t source_memory[NUM_INSTR_SOURCES]; // input memory 45 | 46 | input_instr() { 47 | ip = 0; 48 | is_branch = 0; 49 | branch_taken = 0; 50 | 51 | for (uint32_t i=0; iweights[i] = 0; 159 | } 160 | 161 | void O3_CPU::initialize_branch_predictor() 162 | { 163 | spec_global_history[cpu] = 0; 164 | global_history[cpu] = 0; 165 | perceptron_state_buf_ctr[cpu] = 0; 166 | for (int i=0; i= NUM_UPDATE_ENTRIES) 190 | perceptron_state_buf_ctr[cpu] = 0; 191 | 192 | /* hash the address to get an index into the table of perceptrons */ 193 | 194 | index = address % NUM_PERCEPTRONS; 195 | 196 | /* get pointers to that perceptron and its weights */ 197 | 198 | p = &perceptrons[cpu][index]; 199 | w = &p->weights[0]; 200 | 201 | /* initialize the output to the bias weight, and bump the pointer 202 | * to the weights 203 | */ 204 | 205 | output = *w++; 206 | 207 | /* find the (rest of the) dot product of the history register 208 | * and the perceptron weights. note that, instead of actually 209 | * doing the expensive multiplies, we simply add a weight when the 210 | * corresponding branch in the history register is taken, or 211 | * subtract a weight when the branch is not taken. this also lets 212 | * us use binary instead of bipolar logic to represent the history 213 | * register 214 | */ 215 | for (mask=1,i=0; ioutput = output; 225 | u[cpu]->perc = p; 226 | u[cpu]->history = spec_global_history[cpu]; 227 | u[cpu]->prediction = output >= 0; 228 | u[cpu]->dummy_counter = u[cpu]->prediction ? 
3 : 0; 229 | 230 | /* update the speculative global history register */ 231 | 232 | spec_global_history[cpu] <<= 1; 233 | spec_global_history[cpu] |= u[cpu]->prediction; 234 | return u[cpu]->prediction; 235 | } 236 | 237 | void O3_CPU::last_branch_result(uint64_t ip, uint8_t taken) 238 | { 239 | int 240 | i, 241 | y, 242 | *w; 243 | 244 | unsigned long long int 245 | mask, 246 | history; 247 | 248 | /* update the real global history shift register */ 249 | 250 | global_history[cpu] <<= 1; 251 | global_history[cpu] |= taken; 252 | 253 | /* if this branch was mispredicted, restore the speculative 254 | * history to the last known real history 255 | */ 256 | 257 | if (u[cpu]->prediction != taken) spec_global_history[cpu] = global_history[cpu]; 258 | 259 | /* if the output of the perceptron predictor is outside of 260 | * the range [-THETA,THETA] *and* the prediction was correct, 261 | * then we don't need to adjust the weights 262 | */ 263 | 264 | if (u[cpu]->output > THETA) 265 | y = 1; 266 | else if (u[cpu]->output < -THETA) 267 | y = 0; 268 | else 269 | y = 2; 270 | if (y == 1 && taken) return; 271 | if (y == 0 && !taken) return; 272 | 273 | /* w is a pointer to the first weight (the bias weight) */ 274 | 275 | w = &u[cpu]->perc->weights[0]; 276 | 277 | /* if the branch was taken, increment the bias weight, 278 | * else decrement it, with saturating arithmetic 279 | */ 280 | 281 | if (taken) 282 | (*w)++; 283 | else 284 | (*w)--; 285 | if (*w > MAX_WEIGHT) *w = MAX_WEIGHT; 286 | if (*w < MIN_WEIGHT) *w = MIN_WEIGHT; 287 | 288 | /* now w points to the next weight */ 289 | 290 | w++; 291 | 292 | /* get the history that led to this prediction */ 293 | 294 | history = u[cpu]->history; 295 | 296 | /* for each weight and corresponding bit in the history register... 
*/ 297 | 298 | for (mask=1,i=0; i !!x is 1 iff x is not zero, in this case history is positively correlated with branch outcome 306 | (*w)++; 307 | if (*w > MAX_WEIGHT) *w = MAX_WEIGHT; 308 | } else { 309 | (*w)--; 310 | if (*w < MIN_WEIGHT) *w = MIN_WEIGHT; 311 | } 312 | } 313 | } 314 | -------------------------------------------------------------------------------- /inc/ooo_cpu.h: -------------------------------------------------------------------------------- 1 | #ifndef OOO_CPU_H 2 | #define OOO_CPU_H 3 | 4 | #include "cache.h" 5 | 6 | #ifdef CRC2_COMPILE 7 | #define STAT_PRINTING_PERIOD 1000000 8 | #else 9 | #define STAT_PRINTING_PERIOD 10000000 10 | #endif 11 | #define DEADLOCK_CYCLE 1000000 12 | 13 | using namespace std; 14 | 15 | // CORE PROCESSOR 16 | #define FETCH_WIDTH 6 17 | #define DECODE_WIDTH 6 18 | #define EXEC_WIDTH 6 19 | #define LQ_WIDTH 2 20 | #define SQ_WIDTH 2 21 | #define RETIRE_WIDTH 4 22 | #define SCHEDULER_SIZE 128 23 | #define BRANCH_MISPREDICT_PENALTY 1 24 | //#define SCHEDULING_LATENCY 0 25 | //#define EXEC_LATENCY 0 26 | //#define DECODE_LATENCY 2 27 | 28 | #define STA_SIZE (ROB_SIZE*NUM_INSTR_DESTINATIONS_SPARC) 29 | 30 | extern uint32_t SCHEDULING_LATENCY, EXEC_LATENCY, DECODE_LATENCY; 31 | 32 | struct BTB_outcome { 33 | uint64_t target; 34 | uint8_t branch_type; 35 | uint8_t sequential_BTB_access; 36 | }; 37 | 38 | // cpu 39 | class O3_CPU { 40 | public: 41 | bool skip_next_cycle; 42 | uint32_t cpu; 43 | 44 | // trace 45 | FILE *trace_file; 46 | char trace_string[1024]; 47 | char gunzip_command[1024]; 48 | 49 | // instruction 50 | input_instr next_instr; 51 | input_instr current_instr; 52 | cloudsuite_instr current_cloudsuite_instr; 53 | uint64_t instr_unique_id, completed_executions, 54 | begin_sim_cycle, begin_sim_instr, 55 | last_sim_cycle, last_sim_instr, 56 | finish_sim_cycle, finish_sim_instr, 57 | warmup_instructions, simulation_instructions, instrs_to_read_this_cycle, instrs_to_fetch_this_cycle, 58 | next_print_instruction, 
num_retired; 59 | uint32_t inflight_reg_executions, inflight_mem_executions, num_searched; 60 | uint32_t next_ITLB_fetch; 61 | 62 | // reorder buffer, load/store queue, register file 63 | CORE_BUFFER IFETCH_BUFFER{"IFETCH_BUFFER", /*128*/FETCH_WIDTH*2}; 64 | CORE_BUFFER DECODE_BUFFER{"DECODE_BUFFER", DECODE_WIDTH*3}; 65 | CORE_BUFFER ROB{"ROB", ROB_SIZE}; 66 | LOAD_STORE_QUEUE LQ{"LQ", LQ_SIZE}, SQ{"SQ", SQ_SIZE}; 67 | 68 | // store array, this structure is required to properly handle store instructions 69 | uint64_t STA[STA_SIZE], STA_head, STA_tail; 70 | 71 | // Ready-To-Execute 72 | uint32_t RTE0[ROB_SIZE], RTE0_head, RTE0_tail, 73 | RTE1[ROB_SIZE], RTE1_head, RTE1_tail; 74 | 75 | // Ready-To-Load 76 | uint32_t RTL0[LQ_SIZE], RTL0_head, RTL0_tail, 77 | RTL1[LQ_SIZE], RTL1_head, RTL1_tail; 78 | 79 | // Ready-To-Store 80 | uint32_t RTS0[SQ_SIZE], RTS0_head, RTS0_tail, 81 | RTS1[SQ_SIZE], RTS1_head, RTS1_tail; 82 | 83 | // branch 84 | int branch_mispredict_stall_fetch; // flag that says that we should stall because a branch prediction was wrong 85 | int mispredicted_branch_iw_index; // index in the instruction window of the mispredicted branch. 
fetch resumes after the instruction at this index executes 86 | uint8_t fetch_stall; 87 | uint64_t fetch_resume_cycle; 88 | uint64_t nextInstPC; 89 | uint32_t IBUF_occupancy; 90 | uint64_t num_branch, branch_mispredictions, branch_direction_mispredictions, branch_target_mispredictions1, branch_target_mispredictions2; 91 | uint64_t mispredict_on_btb_hit; 92 | uint64_t total_rob_occupancy_at_branch_mispredict; 93 | uint64_t total_branch_types[8]; 94 | uint64_t mispredicted_branch_types[8]; 95 | uint64_t BTB_reads, BTB_writes, PageBTB_reads, PageBTB_writes, PageBTB_readsBeforeWrite, RegionBTB_reads, RegionBTB_writes, RegionBTB_readsBeforeWrite; 96 | std::map> branchesInCB; 97 | std::set unique_branchPC; 98 | uint64_t offset_size[132]; 99 | uint64_t BTB_4D_Hits, BTB_6D_Hits, BTB_8D_Hits, BTB_12D_Hits, BTB_18D_Hits, BTB_25D_Hits, BTB_46D_Hits, BTB_Ret_Hits; 100 | uint64_t BTB_4D_Misses, BTB_6D_Misses, BTB_8D_Misses, BTB_12D_Misses, BTB_18D_Misses, BTB_25D_Misses, BTB_46D_Misses, BTB_Ret_Misses; 101 | 102 | 103 | // TLBs and caches 104 | CACHE ITLB{"ITLB", ITLB_SET, ITLB_WAY, ITLB_SET*ITLB_WAY, ITLB_WQ_SIZE, ITLB_RQ_SIZE, ITLB_PQ_SIZE, ITLB_MSHR_SIZE}, 105 | DTLB{"DTLB", DTLB_SET, DTLB_WAY, DTLB_SET*DTLB_WAY, DTLB_WQ_SIZE, DTLB_RQ_SIZE, DTLB_PQ_SIZE, DTLB_MSHR_SIZE}, 106 | STLB{"STLB", STLB_SET, STLB_WAY, STLB_SET*STLB_WAY, STLB_WQ_SIZE, STLB_RQ_SIZE, STLB_PQ_SIZE, STLB_MSHR_SIZE}, 107 | L1I{"L1I", L1I_SET, L1I_WAY, L1I_SET*L1I_WAY, L1I_WQ_SIZE, L1I_RQ_SIZE, L1I_PQ_SIZE, L1I_MSHR_SIZE}, 108 | L1D{"L1D", L1D_SET, L1D_WAY, L1D_SET*L1D_WAY, L1D_WQ_SIZE, L1D_RQ_SIZE, L1D_PQ_SIZE, L1D_MSHR_SIZE}, 109 | L2C{"L2C", L2C_SET, L2C_WAY, L2C_SET*L2C_WAY, L2C_WQ_SIZE, L2C_RQ_SIZE, L2C_PQ_SIZE, L2C_MSHR_SIZE}; 110 | 111 | // trace cache for previously decoded instructions 112 | 113 | // constructor 114 | O3_CPU() { 115 | cpu = 0; 116 | 117 | // trace 118 | trace_file = NULL; 119 | skip_next_cycle = false; 120 | 121 | // instruction 122 | instr_unique_id = 0; 123 | completed_executions 
= 0; 124 | begin_sim_cycle = 0; 125 | begin_sim_instr = 0; 126 | last_sim_cycle = 0; 127 | last_sim_instr = 0; 128 | finish_sim_cycle = 0; 129 | finish_sim_instr = 0; 130 | warmup_instructions = 0; 131 | simulation_instructions = 0; 132 | instrs_to_read_this_cycle = 0; 133 | instrs_to_fetch_this_cycle = 0; 134 | 135 | next_print_instruction = STAT_PRINTING_PERIOD; 136 | num_retired = 0; 137 | 138 | inflight_reg_executions = 0; 139 | inflight_mem_executions = 0; 140 | num_searched = 0; 141 | 142 | next_ITLB_fetch = 0; 143 | 144 | // branch 145 | branch_mispredict_stall_fetch = 0; 146 | mispredicted_branch_iw_index = 0; 147 | fetch_stall = 0; 148 | fetch_resume_cycle = 0; 149 | nextInstPC = 0; 150 | IBUF_occupancy = 0; 151 | num_branch = 0; 152 | branch_mispredictions = 0; 153 | branch_direction_mispredictions = 0; 154 | branch_target_mispredictions1 = 0; 155 | branch_target_mispredictions2 = 0; 156 | mispredict_on_btb_hit = 0; 157 | for(uint32_t i=0; i<8; i++) 158 | { 159 | total_branch_types[i] = 0; 160 | mispredicted_branch_types[i] = 0; 161 | } 162 | BTB_reads = BTB_writes = PageBTB_reads = PageBTB_writes = PageBTB_readsBeforeWrite = RegionBTB_reads = RegionBTB_writes = RegionBTB_readsBeforeWrite = 0; 163 | branchesInCB.clear(); 164 | unique_branchPC.clear(); 165 | for(uint32_t i=0; i<132; i++) 166 | { 167 | offset_size[i] = 0; 168 | } 169 | 170 | BTB_4D_Hits = BTB_6D_Hits = BTB_8D_Hits = BTB_12D_Hits = BTB_18D_Hits = BTB_25D_Hits = BTB_46D_Hits = BTB_Ret_Hits = 0; 171 | BTB_4D_Misses = BTB_6D_Misses = BTB_8D_Misses = BTB_12D_Misses = BTB_18D_Misses = BTB_25D_Misses = BTB_46D_Misses = BTB_Ret_Misses = 0; 172 | 173 | for (uint32_t i=0; i 3 | #include 4 | #include 5 | 6 | 7 | /**************************************** Basic Structures **************************************/ 8 | 9 | // FTQ entry struct 10 | struct FTQEntry { 11 | bool end_found; 12 | uint64_t first_ip; 13 | uint64_t last_ip; 14 | FTQEntry (bool endFound, uint64_t firstIP, uint64_t lastIP) { 15 | 
end_found = endFound; 16 | first_ip = firstIP; 17 | last_ip = lastIP; 18 | } 19 | }; 20 | 21 | // BTB entry struct 22 | struct BTBEntry { 23 | uint8_t branch_type; //2-bits (We only use four categories: conditional, call, return, and other.) 24 | uint64_t tag; //16-bits 25 | uint64_t target_ip; //We use four different BTBs based of the number of bits required to encode branch target offset. This field can be 10-bits, 15-bits, 25-bits, or 64-bits. 26 | BTBEntry( ) {} 27 | 28 | }; 29 | 30 | struct BTB { 31 | std::vector > theBTB; 32 | uint32_t numSets; 33 | uint32_t assoc; 34 | uint64_t indexMask; 35 | uint32_t numIndexBits; 36 | 37 | BTB( int32_t Sets, int32_t Assoc ) 38 | : numSets(Sets) 39 | , assoc(Assoc) { 40 | //aBTBSize must be a power of 2 41 | assert( ((Sets - 1) & (Sets)) == 0); 42 | theBTB.resize(Sets); 43 | indexMask = Sets - 1; 44 | numIndexBits = (uint32_t) log2((double)Sets); 45 | } 46 | 47 | int32_t index(uint64_t ip) { 48 | if (ip & 0x3) { 49 | return ((ip) & indexMask); 50 | } else { 51 | return ((ip >> 2) & indexMask); 52 | } 53 | } 54 | 55 | uint64_t get_tag(uint64_t ip) { 56 | uint64_t addr = ip; 57 | if (!(addr & 0x3)) { 58 | addr = addr >> 2; 59 | } 60 | addr = addr >> numIndexBits; 61 | /* We use a 16-bit tag. 62 | * The lower 8-bits stay the same as in the full tag. 63 | * The upper 8-bits are the folded X-OR of the remaining bits of the full tag. 
64 | */ 65 | uint64_t tag = addr & 0xFF; //Set the lower 8-bits of the tag 66 | addr = addr >> 8; 67 | int tagMSBs = 0; 68 | /*Get the upper 8-bits (folded X-OR)*/ 69 | for (int i = 0; i < 8; i++) { 70 | tagMSBs = tagMSBs ^ (addr & 0xFF); 71 | addr = addr >> 8; 72 | } 73 | /*Concatenate the lower and upper 8-bits of tag*/ 74 | tag = tag | (tagMSBs << 8); 75 | return tag; 76 | } 77 | 78 | BTBEntry *get_BTBentry(uint64_t ip){ 79 | BTBEntry *entry = NULL; 80 | 81 | int idx = index(ip); 82 | uint64_t tag = get_tag(ip); 83 | for (uint32_t i = 0; i < theBTB[idx].size(); i++) { 84 | if (theBTB[idx][i].tag == tag) { 85 | return &(theBTB[idx][i]); 86 | } 87 | } 88 | 89 | return entry; 90 | } 91 | 92 | void update_BTB(uint64_t ip, uint8_t b_type, uint64_t target){ 93 | int idx = index(ip); 94 | uint64_t tag = get_tag(ip); 95 | int way = -1; 96 | for (uint32_t i = 0; i < theBTB[idx].size(); i++) { 97 | if (theBTB[idx][i].tag == tag) { 98 | way = i; 99 | break; 100 | } 101 | } 102 | 103 | if (way == -1) { 104 | BTBEntry entry; 105 | entry.tag = tag; 106 | entry.branch_type = b_type; 107 | entry.target_ip = target; 108 | 109 | if (theBTB[idx].size() >= assoc) { 110 | theBTB[idx].erase(theBTB[idx].begin()); 111 | } 112 | theBTB[idx].push_back(entry); 113 | } else { 114 | BTBEntry entry = theBTB[idx][way]; 115 | entry.branch_type = b_type; 116 | if (target != 0) { 117 | entry.target_ip = target; 118 | } 119 | 120 | //Update LRU 121 | theBTB[idx].erase(theBTB[idx].begin() + way); 122 | theBTB[idx].push_back(entry); 123 | } 124 | } 125 | 126 | }; 127 | 128 | 129 | #define MAX_BTB_LOOKUPs 19 130 | #define MAX_FTQ_ENTRIES 48 131 | #define MAX_PFETCHQ_ENTRIES 48 132 | #define MAX_RECENT_PFETCH 10 133 | #define MAX_RAS_ENTRIES 128 134 | 135 | 136 | uint64_t pfetch_ip = 0x0; 137 | uint64_t disp[66] = {0}; 138 | 139 | /**************************************** Compoenets for prefetching **************************************/ 140 | 141 | std::queue FTQ; //Storage: 129-bits (FTQ-entry 
size) * 48 (number of FTQ entries) = 774 bytes 142 | std::deque prefetch_queue; //Storage: 64-bits * 48 (queue size) = 384 bytes 143 | std::deque recent_prefetches; //Storage: 64-bits * 10 (queue size) = 80 bytes 144 | 145 | std::stack RAS; //Storage: 64-bits * 128 (size) = 1KB 146 | std::stack RAS_Pfetch; //Storage: 64-bits * 128 (size) = 1KB 147 | 148 | 149 | BTB BTB_10D(1024, 8); //Storage: (tag:16-bit, branch-type: 2-bit, target-offset: 10-bit) 28*1024*8 = 28KB 150 | BTB BTB_15D(1024, 7); //Storage: (tag:16-bit, branch-type: 2-bit, target-offset: 15-bit) 33*1024*7 = 28.875KB 151 | BTB BTB_25D(1024, 8); //Storage: (tag:16-bit, branch-type: 2-bit, target-offset: 25-bit) 43*1024*8 = 43KB 152 | BTB BTB_64D(256, 4); //Storage: (tag:16-bit, branch-type: 2-bit, full-target: 64-bit) 82*256*4 = 10.25KB 153 | 154 | 155 | 156 | /**************************************** Prefetcher Operation **************************************/ 157 | 158 | 159 | 160 | void O3_CPU::l1i_prefetcher_initialize() 161 | { 162 | 163 | } 164 | 165 | void O3_CPU::l1i_prefetcher_branch_operate(uint64_t ip, uint8_t branch_type, uint64_t branch_target) 166 | { 167 | if (branch_target) { 168 | /*Find the number of bits needed to encode the target offset*/ 169 | uint64_t target_offset; 170 | if (branch_target > ip) { 171 | target_offset = branch_target - ip; 172 | } else { 173 | target_offset = ip - branch_target; 174 | } 175 | int num_bits = (int)(log2((double)target_offset)); 176 | /* The cast "(int)log2" rounds down to lower integer, however we want to round it to upper integer, so add 1 to "num_bits" 177 | * As an offset can be both positive and negative, we need to add 1 sign bit to "num_bits". 
178 | * */ 179 | num_bits += 2; 180 | disp[num_bits]++; 181 | assert(num_bits >= 0 && num_bits < 66); 182 | 183 | /*Store (or update) the branch in one of the BTBs based on the number of bits required to encode the target offset*/ 184 | if (num_bits <= 10) { 185 | BTB_10D.update_BTB(ip, branch_type, branch_target); 186 | } else if (num_bits <= 15) { 187 | BTB_15D.update_BTB(ip, branch_type, branch_target); 188 | } else if (num_bits <= 25) { 189 | BTB_25D.update_BTB(ip, branch_type, branch_target); 190 | } else { 191 | BTB_64D.update_BTB(ip, branch_type, branch_target); 192 | } 193 | } 194 | 195 | 196 | 197 | /*Update the return address stack*/ 198 | if (branch_type == BRANCH_DIRECT_CALL || branch_type == BRANCH_INDIRECT_CALL) { 199 | if (RAS.size() < MAX_RAS_ENTRIES) { 200 | RAS.push(ip + 4); 201 | } 202 | } else if (branch_type == BRANCH_RETURN) { 203 | if (RAS.size()) { 204 | RAS.pop(); 205 | } 206 | } 207 | 208 | /*Check if the prefetch is on the correct execution path. If not, flush the FTQ.*/ 209 | if (FTQ.size()) { 210 | if (!((ip) >= (FTQ.front().first_ip) && (ip) <= (FTQ.front().last_ip))) { 211 | FTQ.pop(); 212 | if (!((ip) >= (FTQ.front().first_ip) && (ip) <= (FTQ.front().last_ip))) { 213 | while (!FTQ.empty()) { 214 | FTQ.pop(); 215 | } 216 | prefetch_queue.clear(); 217 | } 218 | } else if (FTQ.size() > 1) { 219 | std::queue tempQ = FTQ; 220 | tempQ.pop(); 221 | uint64_t next_pc = branch_target; 222 | if (next_pc == 0) { 223 | next_pc = ip + 4; 224 | } 225 | if (!((next_pc) >= (tempQ.front().first_ip) && (next_pc) <= (tempQ.front().last_ip))) { 226 | while (!FTQ.empty()) { 227 | FTQ.pop(); 228 | } 229 | prefetch_queue.clear(); 230 | } 231 | } 232 | } 233 | 234 | /*Reset the prefethcer if needed*/ 235 | if (pfetch_ip == 0 || FTQ.size() == 0) { 236 | pfetch_ip = branch_target; 237 | if (branch_target == 0) 238 | pfetch_ip = ip + 4; 239 | 240 | RAS_Pfetch = RAS; 241 | } 242 | 243 | } 244 | 245 | void O3_CPU::l1i_prefetcher_cache_operate(uint64_t v_addr, 
uint8_t cache_hit, uint8_t prefetch_hit) 246 | { 247 | if((cache_hit == 0) && (L1I.MSHR.occupancy < (L1I.MSHR.SIZE>>1))) 248 | { 249 | uint64_t pf_addr = v_addr + (1<target_ip; 291 | 292 | if (FTQ.size() == 0 || FTQ.back().end_found) { 293 | FTQ.push(FTQEntry(true, pfetch_ip, branch_ip)); 294 | } else { 295 | FTQ.back().end_found = true; 296 | FTQ.back().last_ip = branch_ip; 297 | } 298 | 299 | /*Update "pfetch_ip" based on branch type and branch prediction*/ 300 | uint8_t branch_taken = true; 301 | if (entry->branch_type == BRANCH_CONDITIONAL) { 302 | branch_taken = predict_branch(branch_ip); 303 | if (branch_taken) { 304 | pfetch_ip = target_ip; 305 | } else { 306 | pfetch_ip = branch_ip + 1; 307 | } 308 | } else if (entry->branch_type == BRANCH_RETURN) { 309 | if (RAS_Pfetch.size()) { 310 | pfetch_ip = RAS_Pfetch.top(); 311 | RAS_Pfetch.pop(); 312 | } else { 313 | pfetch_ip = target_ip; 314 | } 315 | } else if (entry->branch_type == BRANCH_DIRECT_CALL || entry->branch_type == BRANCH_INDIRECT_CALL) { 316 | pfetch_ip = target_ip; 317 | if (RAS_Pfetch.size() < MAX_RAS_ENTRIES) { 318 | RAS_Pfetch.push(branch_ip + 1); 319 | } 320 | } else { 321 | pfetch_ip = target_ip; 322 | } 323 | 324 | if (pfetch_ip == 0) { 325 | pfetch_ip = branch_ip + 1; 326 | } 327 | 328 | last_address = branch_ip; 329 | 330 | break; 331 | } 332 | } 333 | 334 | /*Find prefetch candidates*/ 335 | uint64_t firstBlock = first_address >> LOG2_BLOCK_SIZE; 336 | uint64_t lastBlock = last_address >> LOG2_BLOCK_SIZE; 337 | int numCacheBlocks = 1 + (lastBlock - firstBlock); 338 | if (numCacheBlocks > 5) //Prefetch throttle. 
339 | numCacheBlocks = 5; 340 | 341 | for (int i = 0; i < numCacheBlocks; i++) { 342 | uint64_t pfetch_addr = (firstBlock + i) << LOG2_BLOCK_SIZE; 343 | 344 | bool is_recently_prefetched = false; 345 | std::deque::iterator it = std::find(prefetch_queue.begin(), prefetch_queue.end(), pfetch_addr); 346 | if (it == prefetch_queue.end()) { 347 | it = std::find(recent_prefetches.begin(), recent_prefetches.end(), pfetch_addr); 348 | if (it != recent_prefetches.end()) { 349 | is_recently_prefetched = true; 350 | } 351 | } else { 352 | is_recently_prefetched = true; 353 | } 354 | 355 | if (is_recently_prefetched == false && prefetch_queue.size() < MAX_PFETCHQ_ENTRIES) { 356 | prefetch_queue.push_back(pfetch_addr); 357 | } 358 | } 359 | 360 | /*Issue prefetches*/ 361 | 362 | if (prefetch_queue.size() && L1I.MSHR.occupancy < (L1I.MSHR.SIZE>>1) && L1I.PQ.occupancy < L1I.PQ.SIZE) { 363 | prefetch_code_line(prefetch_queue.front()); 364 | recent_prefetches.push_back(prefetch_queue.front()); 365 | if (recent_prefetches.size() > MAX_RECENT_PFETCH) { 366 | recent_prefetches.pop_front(); 367 | } 368 | 369 | prefetch_queue.pop_front(); 370 | } 371 | } 372 | 373 | } 374 | 375 | void O3_CPU::l1i_prefetcher_cache_fill(uint64_t v_addr, uint32_t set, uint32_t way, uint8_t prefetch, uint64_t evicted_v_addr) 376 | { 377 | 378 | } 379 | 380 | void O3_CPU::l1i_prefetcher_final_stats() 381 | { 382 | for(int i = 0; i < 66; i++) { 383 | cout << "XXX disp-" << i << " " << disp[i] << endl; 384 | } 385 | 386 | } 387 | -------------------------------------------------------------------------------- /tracer/champsim_tracer.cpp: -------------------------------------------------------------------------------- 1 | 2 | /*! 
@file 3 | * This is an example of the PIN tool that demonstrates some basic PIN APIs 4 | * and could serve as the starting point for developing your first PIN tool 5 | */ 6 | 7 | #include "pin.H" 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #define NUM_INSTR_DESTINATIONS 2 15 | #define NUM_INSTR_SOURCES 4 16 | 17 | using namespace std; 18 | 19 | typedef struct trace_instr_format { 20 | unsigned long long int ip; // instruction pointer (program counter) value 21 | 22 | unsigned char is_branch; // is this branch 23 | unsigned char branch_taken; // if so, is this taken 24 | 25 | unsigned char destination_registers[NUM_INSTR_DESTINATIONS]; // output registers 26 | unsigned char source_registers[NUM_INSTR_SOURCES]; // input registers 27 | 28 | unsigned long long int destination_memory[NUM_INSTR_DESTINATIONS]; // output memory 29 | unsigned long long int source_memory[NUM_INSTR_SOURCES]; // input memory 30 | } trace_instr_format_t; 31 | 32 | /* ================================================================== */ 33 | // Global variables 34 | /* ================================================================== */ 35 | 36 | UINT64 instrCount = 0; 37 | 38 | FILE* out; 39 | 40 | bool output_file_closed = false; 41 | bool tracing_on = false; 42 | 43 | trace_instr_format_t curr_instr; 44 | 45 | /* ===================================================================== */ 46 | // Command line switches 47 | /* ===================================================================== */ 48 | KNOB KnobOutputFile(KNOB_MODE_WRITEONCE, "pintool", "o", "champsim.trace", 49 | "specify file name for Champsim tracer output"); 50 | 51 | KNOB KnobSkipInstructions(KNOB_MODE_WRITEONCE, "pintool", "s", "0", 52 | "How many instructions to skip before tracing begins"); 53 | 54 | KNOB KnobTraceInstructions(KNOB_MODE_WRITEONCE, "pintool", "t", "1000000", 55 | "How many instructions to trace"); 56 | 57 | /* 
===================================================================== */ 58 | // Utilities 59 | /* ===================================================================== */ 60 | 61 | /*! 62 | * Print out help message. 63 | */ 64 | INT32 Usage() 65 | { 66 | cerr << "This tool creates a register and memory access trace" << endl 67 | << "Specify the output trace file with -o" << endl 68 | << "Specify the number of instructions to skip before tracing with -s" << endl 69 | << "Specify the number of instructions to trace with -t" << endl << endl; 70 | 71 | cerr << KNOB_BASE::StringKnobSummary() << endl; 72 | 73 | return -1; 74 | } 75 | 76 | /* ===================================================================== */ 77 | // Analysis routines 78 | /* ===================================================================== */ 79 | 80 | void BeginInstruction(VOID *ip, UINT32 op_code, VOID *opstring) 81 | { 82 | instrCount++; 83 | //printf("[%p %u %s ", ip, opcode, (char*)opstring); 84 | 85 | if(instrCount > KnobSkipInstructions.Value()) 86 | { 87 | tracing_on = true; 88 | 89 | if(instrCount > (KnobTraceInstructions.Value()+KnobSkipInstructions.Value())) 90 | tracing_on = false; 91 | } 92 | 93 | if(!tracing_on) 94 | return; 95 | 96 | // reset the current instruction 97 | curr_instr.ip = (unsigned long long int)ip; 98 | 99 | curr_instr.is_branch = 0; 100 | curr_instr.branch_taken = 0; 101 | 102 | for(int i=0; i KnobSkipInstructions.Value()) 122 | { 123 | tracing_on = true; 124 | 125 | if(instrCount <= (KnobTraceInstructions.Value()+KnobSkipInstructions.Value())) 126 | { 127 | // keep tracing 128 | fwrite(&curr_instr, sizeof(trace_instr_format_t), 1, out); 129 | } 130 | else 131 | { 132 | tracing_on = false; 133 | // close down the file, we're done tracing 134 | if(!output_file_closed) 135 | { 136 | fclose(out); 137 | output_file_closed = true; 138 | } 139 | 140 | exit(0); 141 | } 142 | } 143 | } 144 | 145 | void BranchOrNot(UINT32 taken) 146 | { 147 | //printf("[%d] ", taken); 148 
| 149 | curr_instr.is_branch = 1; 150 | if(taken != 0) 151 | { 152 | curr_instr.branch_taken = 1; 153 | } 154 | } 155 | 156 | void RegRead(UINT32 i, UINT32 index) 157 | { 158 | if(!tracing_on) return; 159 | 160 | REG r = (REG)i; 161 | 162 | /* 163 | if(r == 26) 164 | { 165 | // 26 is the IP, which is read and written by branches 166 | return; 167 | } 168 | */ 169 | 170 | //cout << r << " " << REG_StringShort((REG)r) << " " ; 171 | //cout << REG_StringShort((REG)r) << " " ; 172 | 173 | //printf("%d ", (int)r); 174 | 175 | // check to see if this register is already in the list 176 | int already_found = 0; 177 | for(int i=0; i "; 213 | //cout << "<" << REG_StringShort((REG)r) << "> "; 214 | 215 | //printf("<%d> ", (int)r); 216 | 217 | int already_found = 0; 218 | for(int i=0; i -- ... 395 | */ 396 | int main(int argc, char *argv[]) 397 | { 398 | // Initialize PIN library. Print help message if -h(elp) is specified 399 | // in the command line or the command line is invalid 400 | if( PIN_Init(argc,argv) ) 401 | return Usage(); 402 | 403 | const char* fileName = KnobOutputFile.Value().c_str(); 404 | 405 | out = fopen(fileName, "ab"); 406 | if (!out) 407 | { 408 | cout << "Couldn't open output trace file. Exiting." 
<< endl; 409 | exit(1); 410 | } 411 | 412 | // Register function to be called to instrument instructions 413 | INS_AddInstrumentFunction(Instruction, 0); 414 | 415 | // Register function to be called when the application exits 416 | PIN_AddFiniFunction(Fini, 0); 417 | 418 | //cerr << "===============================================" << endl; 419 | //cerr << "This application is instrumented by the Champsim Trace Generator" << endl; 420 | //cerr << "Trace saved in " << KnobOutputFile.Value() << endl; 421 | //cerr << "===============================================" << endl; 422 | 423 | // Start the program, never returns 424 | PIN_StartProgram(); 425 | 426 | return 0; 427 | } 428 | 429 | /* ===================================================================== */ 430 | /* eof */ 431 | /* ===================================================================== */ 432 | -------------------------------------------------------------------------------- /btb/btb.cc: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * This file implements a basic Branch Target Buffer (BTB) structure. 4 | * It uses a set-associative BTB to predict the targets of non-return branches, 5 | * and it uses a small Return Address Stack (RAS) to predict the target of 6 | * returns. 
7 | */ 8 | 9 | #include "ooo_cpu.h" 10 | 11 | #define BASIC_BTB_SETS 4096 12 | #define BASIC_BTB_WAYS 4 13 | #define BASIC_BTB_INDIRECT_SIZE 4096 14 | #define BASIC_BTB_RAS_SIZE 64 15 | #define BASIC_BTB_CALL_INSTR_SIZE_TRACKERS 1024 16 | 17 | struct BTBEntry { 18 | uint64_t tag; 19 | uint64_t target_ip; 20 | uint8_t branch_type; 21 | uint64_t lru; 22 | }; 23 | 24 | struct BTB { 25 | std::vector > theBTB; 26 | uint32_t numSets; 27 | uint32_t assoc; 28 | uint64_t indexMask; 29 | uint32_t numIndexBits; 30 | 31 | BTB () {} 32 | 33 | BTB( int32_t Sets, int32_t Assoc ) 34 | : numSets(Sets) 35 | , assoc(Assoc) { 36 | //aBTBSize must be a power of 2 37 | assert( ((Sets - 1) & (Sets)) == 0); 38 | theBTB.resize(Sets); 39 | indexMask = Sets - 1; 40 | numIndexBits = (uint32_t) log2((double)Sets); 41 | } 42 | 43 | void init_btb (int32_t Sets, int32_t Assoc) { 44 | numSets = Sets; 45 | assoc = Assoc; 46 | //aBTBSize must be a power of 2 47 | assert( ((Sets - 1) & (Sets)) == 0); 48 | theBTB.resize(Sets); 49 | indexMask = Sets - 1; 50 | numIndexBits = (uint32_t) log2((double)Sets); 51 | } 52 | 53 | int32_t index(uint64_t ip) { 54 | return ((ip >> 2) & indexMask); 55 | } 56 | 57 | uint64_t get_tag(uint64_t ip) { 58 | //return ip; 59 | uint64_t addr = ip; 60 | addr = addr >> 2; 61 | addr = addr >> numIndexBits; 62 | /* We use a 16-bit tag. 63 | * The lower 8-bits stay the same as in the full tag. 64 | * The upper 8-bits are the folded X-OR of the remaining bits of the full tag. 
65 | */ 66 | uint64_t tag = addr & 0xFF; //Set the lower 8-bits of the tag 67 | addr = addr >> 8; 68 | int tagMSBs = 0; 69 | /*Get the upper 8-bits (folded X-OR)*/ 70 | for (int i = 0; i < 8; i++) { 71 | tagMSBs = tagMSBs ^ (addr & 0xFF); 72 | addr = addr >> 8; 73 | } 74 | /*Concatenate the lower and upper 8-bits of tag*/ 75 | tag = tag | (tagMSBs << 8); 76 | return tag; 77 | } 78 | 79 | BTBEntry *get_BTBentry(uint64_t ip){ 80 | BTBEntry *entry = NULL; 81 | 82 | int idx = index(ip); 83 | uint64_t tag = get_tag(ip); 84 | for (uint32_t i = 0; i < theBTB[idx].size(); i++) { 85 | if (theBTB[idx][i].tag == tag) { 86 | return &(theBTB[idx][i]); 87 | } 88 | } 89 | 90 | return entry; 91 | } 92 | 93 | void update_BTB(uint64_t ip, uint8_t b_type, uint64_t target, uint8_t taken, uint64_t lru_counter){ 94 | int idx = index(ip); 95 | uint64_t tag = get_tag(ip); 96 | int way = -1; 97 | for (uint32_t i = 0; i < theBTB[idx].size(); i++) { 98 | if (theBTB[idx][i].tag == tag) { 99 | way = i; 100 | break; 101 | } 102 | } 103 | 104 | if (way == -1) { 105 | if ((target != 0) && taken) { 106 | BTBEntry entry; 107 | entry.tag = tag; 108 | entry.branch_type = b_type; 109 | entry.target_ip = target; 110 | entry.lru = lru_counter; 111 | 112 | if (theBTB[idx].size() >= assoc) { 113 | theBTB[idx].erase(theBTB[idx].begin()); 114 | } 115 | theBTB[idx].push_back(entry); 116 | } 117 | } else { 118 | BTBEntry entry = theBTB[idx][way]; 119 | entry.branch_type = b_type; 120 | if (target != 0) { 121 | entry.target_ip = target; 122 | } 123 | entry.lru = lru_counter; 124 | 125 | //Update LRU 126 | theBTB[idx].erase(theBTB[idx].begin() + way); 127 | theBTB[idx].push_back(entry); 128 | } 129 | } 130 | 131 | uint64_t get_lru_value(uint64_t ip) { 132 | int idx = index(ip); 133 | uint64_t lru_value; 134 | if (theBTB[idx].size() < assoc) { //All ways are not yet allocated 135 | lru_value = 0; 136 | } else { 137 | lru_value = theBTB[idx][0].lru; 138 | for (uint32_t i = 1; i < theBTB[idx].size(); i++) { //We 
should never enter here because head should be LRU 139 | if (theBTB[idx][i].lru < lru_value) { 140 | assert(0); 141 | } 142 | } 143 | } 144 | 145 | return lru_value; 146 | } 147 | 148 | }; 149 | 150 | /*BTB BTB_4D(1024, 8); //Storage: (tag:16-bit, branch-type: 2-bit, target-offset: 10-bit) 28*1024*8 = 28KB 151 | BTB BTB_6D(1024, 8); //Storage: (tag:16-bit, branch-type: 2-bit, target-offset: 15-bit) 33*1024*7 = 28.875KB 152 | BTB BTB_8D(1024, 8); //Storage: (tag:16-bit, branch-type: 2-bit, target-offset: 25-bit) 43*1024*8 = 43KB 153 | BTB BTB_12D(512, 8); //Storage: (tag:16-bit, branch-type: 2-bit, full-target: 64-bit) 82*256*4 = 10.25KB 154 | BTB BTB_18D(512, 8); 155 | BTB BTB_25D(256, 8); 156 | BTB BTB_46D(128, 8); 157 | BTB BTB_Ret(1024, 8);*/ 158 | 159 | #define NUM_BTB_PARTITIONS 2 160 | BTB btb_partition[NUM_BTB_PARTITIONS]; 161 | 162 | uint64_t basic_btb_lru_counter[NUM_CPUS]; 163 | 164 | uint64_t basic_btb_indirect[NUM_CPUS][BASIC_BTB_INDIRECT_SIZE]; 165 | uint64_t basic_btb_conditional_history[NUM_CPUS]; 166 | 167 | uint64_t basic_btb_ras[NUM_CPUS][BASIC_BTB_RAS_SIZE]; 168 | int basic_btb_ras_index[NUM_CPUS]; 169 | /* 170 | * The following two variables are used to automatically identify the 171 | * size of call instructions, in bytes, which tells us the appropriate 172 | * target for a call's corresponding return. 173 | * They exist because ChampSim does not model a specific ISA, and 174 | * different ISAs could use different sizes for call instructions, 175 | * and even within the same ISA, calls can have different sizes. 
176 | */ 177 | uint64_t basic_btb_call_instr_sizes[NUM_CPUS][BASIC_BTB_CALL_INSTR_SIZE_TRACKERS]; 178 | 179 | uint64_t basic_btb_abs_addr_dist(uint64_t addr1, uint64_t addr2) { 180 | if(addr1 > addr2) { 181 | return addr1 - addr2; 182 | } 183 | 184 | return addr2 - addr1; 185 | } 186 | 187 | void push_basic_btb_ras(uint8_t cpu, uint64_t ip) { 188 | basic_btb_ras_index[cpu]++; 189 | if (basic_btb_ras_index[cpu] == BASIC_BTB_RAS_SIZE) { 190 | basic_btb_ras_index[cpu] = 0; 191 | } 192 | 193 | basic_btb_ras[cpu][basic_btb_ras_index[cpu]] = ip; 194 | } 195 | 196 | uint64_t peek_basic_btb_ras(uint8_t cpu) { 197 | return basic_btb_ras[cpu][basic_btb_ras_index[cpu]]; 198 | } 199 | 200 | uint64_t pop_basic_btb_ras(uint8_t cpu) { 201 | uint64_t target = basic_btb_ras[cpu][basic_btb_ras_index[cpu]]; 202 | basic_btb_ras[cpu][basic_btb_ras_index[cpu]] = 0; 203 | 204 | basic_btb_ras_index[cpu]--; 205 | if (basic_btb_ras_index[cpu] == -1) { 206 | basic_btb_ras_index[cpu] += BASIC_BTB_RAS_SIZE; 207 | } 208 | 209 | return target; 210 | } 211 | 212 | uint64_t basic_btb_call_size_tracker_hash(uint64_t ip) { 213 | return (ip & (BASIC_BTB_CALL_INSTR_SIZE_TRACKERS-1)); 214 | } 215 | 216 | uint64_t basic_btb_get_call_size(uint8_t cpu, uint64_t ip) { 217 | uint64_t size = basic_btb_call_instr_sizes[cpu][basic_btb_call_size_tracker_hash(ip)]; 218 | 219 | return size; 220 | } 221 | 222 | int convert_offsetBits_to_partitionID(int num_bits) { 223 | return 0; 224 | if (num_bits == 0) { 225 | return 0; 226 | } else if (num_bits <= 4) { 227 | return 1; 228 | } else if (num_bits <= 5) { 229 | return 2; 230 | } else if (num_bits <= 7) { 231 | return 3; 232 | } else if (num_bits <= 9) { 233 | return 4; 234 | } else if (num_bits <= 11) { 235 | return 5; 236 | } else if (num_bits <= 19) { 237 | return 6; 238 | } else if (num_bits <= 25) { 239 | return 7; 240 | } else { 241 | return 8; 242 | } 243 | assert(0); 244 | } 245 | 246 | int get_lru_partition(int start_partitionID, uint64_t ip) { 247 | int 
lru_partition = start_partitionID; 248 | uint64_t lru_value = btb_partition[start_partitionID].get_lru_value(ip); 249 | for (int i = start_partitionID + 1; i < NUM_BTB_PARTITIONS; i++) { 250 | uint64_t partition_lru_value = btb_partition[i].get_lru_value(ip); 251 | if (partition_lru_value < lru_value) { 252 | lru_partition = i; 253 | lru_value = partition_lru_value; 254 | } 255 | } 256 | return lru_partition; 257 | } 258 | 259 | void O3_CPU::initialize_btb() { 260 | std::cout << "Basic BTB sets: " << BASIC_BTB_SETS 261 | << " ways: " << BASIC_BTB_WAYS 262 | << " indirect buffer size: " << BASIC_BTB_INDIRECT_SIZE 263 | << " RAS size: " << BASIC_BTB_RAS_SIZE << std::endl; 264 | 265 | for (uint32_t i = 0; i < BASIC_BTB_RAS_SIZE; i++) { 266 | basic_btb_ras[cpu][i] = 0; 267 | } 268 | basic_btb_ras_index[cpu] = 0; 269 | for (uint32_t i=0; ibranch_type; 308 | 309 | //uint8_t always_taken = false; 310 | //if (branch_type != BRANCH_CONDITIONAL) { 311 | //always_taken = true; 312 | //} 313 | 314 | if ((branch_type == BRANCH_DIRECT_CALL) || 315 | (branch_type == BRANCH_INDIRECT_CALL)) { 316 | // add something to the RAS 317 | push_basic_btb_ras(cpu, ip); 318 | } 319 | 320 | if (branch_type == BRANCH_RETURN) { 321 | // peek at the top of the RAS 322 | uint64_t target = peek_basic_btb_ras(cpu); 323 | // and adjust for the size of the call instr 324 | target += basic_btb_get_call_size(cpu, target); 325 | 326 | BTB_outcome outcome = {target, BRANCH_RETURN, 0}; 327 | return outcome; 328 | //return std::make_pair(target, always_taken); 329 | } /*else if ((branch_type == BRANCH_INDIRECT) || 330 | (branch_type == BRANCH_INDIRECT_CALL)) { 331 | return std::make_pair(basic_btb_indirect[cpu][basic_btb_indirect_hash(cpu, ip)], always_taken); 332 | } */else { 333 | // use BTB for all other branches + direct calls 334 | 335 | BTB_outcome outcome = {btb_entry->target_ip, branch_type, 0}; 336 | return outcome; 337 | //return std::make_pair(btb_entry->target_ip, always_taken); 338 | } 339 | 
340 | assert(0); 341 | //return std::make_pair(0, always_taken); 342 | } 343 | 344 | void O3_CPU::update_btb(uint64_t ip, uint64_t branch_target, uint8_t taken, 345 | uint8_t branch_type) { 346 | // updates for indirect branches 347 | /*if ((branch_type == BRANCH_INDIRECT) || 348 | (branch_type == BRANCH_INDIRECT_CALL)) { 349 | basic_btb_indirect[cpu][basic_btb_indirect_hash(cpu, ip)] = branch_target; 350 | } 351 | if (branch_type == BRANCH_CONDITIONAL) { 352 | basic_btb_conditional_history[cpu] <<= 1; 353 | if (taken) { 354 | basic_btb_conditional_history[cpu] |= 1; 355 | } 356 | }*/ 357 | 358 | if (branch_type == BRANCH_RETURN) { 359 | // recalibrate call-return offset 360 | // if our return prediction got us into the right ball park, but not the 361 | // exactly correct byte target, then adjust our call instr size tracker 362 | uint64_t call_ip = pop_basic_btb_ras(cpu); 363 | uint64_t estimated_call_instr_size = basic_btb_abs_addr_dist(call_ip, branch_target); 364 | if (estimated_call_instr_size <= 10) { 365 | basic_btb_call_instr_sizes[cpu][basic_btb_call_size_tracker_hash(call_ip)] = estimated_call_instr_size; 366 | } 367 | } 368 | 369 | BTBEntry *btb_entry; 370 | int partitionID = -1; 371 | for (int i = 0; i < NUM_BTB_PARTITIONS; i++) { 372 | btb_entry = btb_partition[i].get_BTBentry(ip); 373 | if (btb_entry) { 374 | partitionID = i; 375 | break; 376 | } 377 | } 378 | 379 | if (btb_entry == NULL) { 380 | 381 | BTB_writes++; 382 | 383 | int num_bits; 384 | if (branch_type == BRANCH_RETURN) { 385 | num_bits = 0; 386 | } else { 387 | uint64_t diff_bits = (branch_target >> 2) ^ (ip >> 2); 388 | num_bits = 0; 389 | while (diff_bits != 0) { 390 | diff_bits = diff_bits >> 1; 391 | num_bits++; 392 | } 393 | } 394 | assert(num_bits >= 0 && num_bits < 66); 395 | 396 | int smallest_offset_partition_id = convert_offsetBits_to_partitionID(num_bits); 397 | 398 | int partition = get_lru_partition(smallest_offset_partition_id, ip); 399 | assert(partition < 
NUM_BTB_PARTITIONS); 400 | 401 | btb_partition[partition].update_BTB(ip, branch_type, branch_target, taken, basic_btb_lru_counter[cpu]); 402 | basic_btb_lru_counter[cpu]++; 403 | 404 | 405 | } else { 406 | // update an existing entry 407 | assert(partitionID != -1); 408 | btb_partition[partitionID].update_BTB(ip, branch_type, branch_target, taken, basic_btb_lru_counter[cpu]); 409 | basic_btb_lru_counter[cpu]++; 410 | } 411 | 412 | } 413 | -------------------------------------------------------------------------------- /btb/convBTB.btb: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * This file implements a basic Branch Target Buffer (BTB) structure. 4 | * It uses a set-associative BTB to predict the targets of non-return branches, 5 | * and it uses a small Return Address Stack (RAS) to predict the target of 6 | * returns. 7 | */ 8 | 9 | #include "ooo_cpu.h" 10 | 11 | #define BASIC_BTB_SETS 4096 12 | #define BASIC_BTB_WAYS 4 13 | #define BASIC_BTB_INDIRECT_SIZE 4096 14 | #define BASIC_BTB_RAS_SIZE 64 15 | #define BASIC_BTB_CALL_INSTR_SIZE_TRACKERS 1024 16 | 17 | struct BTBEntry { 18 | uint64_t tag; 19 | uint64_t target_ip; 20 | uint8_t branch_type; 21 | uint64_t lru; 22 | }; 23 | 24 | struct BTB { 25 | std::vector > theBTB; 26 | uint32_t numSets; 27 | uint32_t assoc; 28 | uint64_t indexMask; 29 | uint32_t numIndexBits; 30 | 31 | BTB () {} 32 | 33 | BTB( int32_t Sets, int32_t Assoc ) 34 | : numSets(Sets) 35 | , assoc(Assoc) { 36 | //aBTBSize must be a power of 2 37 | assert( ((Sets - 1) & (Sets)) == 0); 38 | theBTB.resize(Sets); 39 | indexMask = Sets - 1; 40 | numIndexBits = (uint32_t) log2((double)Sets); 41 | } 42 | 43 | void init_btb (int32_t Sets, int32_t Assoc) { 44 | numSets = Sets; 45 | assoc = Assoc; 46 | //aBTBSize must be a power of 2 47 | assert( ((Sets - 1) & (Sets)) == 0); 48 | theBTB.resize(Sets); 49 | indexMask = Sets - 1; 50 | numIndexBits = (uint32_t) log2((double)Sets); 51 | } 52 | 53 | int32_t 
index(uint64_t ip) { 54 | return ((ip >> 2) & indexMask); 55 | } 56 | 57 | uint64_t get_tag(uint64_t ip) { 58 | //return ip; 59 | uint64_t addr = ip; 60 | addr = addr >> 2; 61 | addr = addr >> numIndexBits; 62 | /* We use a 16-bit tag. 63 | * The lower 8-bits stay the same as in the full tag. 64 | * The upper 8-bits are the folded X-OR of the remaining bits of the full tag. 65 | */ 66 | uint64_t tag = addr & 0xFF; //Set the lower 8-bits of the tag 67 | addr = addr >> 8; 68 | int tagMSBs = 0; 69 | /*Get the upper 8-bits (folded X-OR)*/ 70 | for (int i = 0; i < 8; i++) { 71 | tagMSBs = tagMSBs ^ (addr & 0xFF); 72 | addr = addr >> 8; 73 | } 74 | /*Concatenate the lower and upper 8-bits of tag*/ 75 | tag = tag | (tagMSBs << 8); 76 | return tag; 77 | } 78 | 79 | BTBEntry *get_BTBentry(uint64_t ip){ 80 | BTBEntry *entry = NULL; 81 | 82 | int idx = index(ip); 83 | uint64_t tag = get_tag(ip); 84 | for (uint32_t i = 0; i < theBTB[idx].size(); i++) { 85 | if (theBTB[idx][i].tag == tag) { 86 | return &(theBTB[idx][i]); 87 | } 88 | } 89 | 90 | return entry; 91 | } 92 | 93 | void update_BTB(uint64_t ip, uint8_t b_type, uint64_t target, uint8_t taken, uint64_t lru_counter){ 94 | int idx = index(ip); 95 | uint64_t tag = get_tag(ip); 96 | int way = -1; 97 | for (uint32_t i = 0; i < theBTB[idx].size(); i++) { 98 | if (theBTB[idx][i].tag == tag) { 99 | way = i; 100 | break; 101 | } 102 | } 103 | 104 | if (way == -1) { 105 | if ((target != 0) && taken) { 106 | BTBEntry entry; 107 | entry.tag = tag; 108 | entry.branch_type = b_type; 109 | entry.target_ip = target; 110 | entry.lru = lru_counter; 111 | 112 | if (theBTB[idx].size() >= assoc) { 113 | theBTB[idx].erase(theBTB[idx].begin()); 114 | } 115 | theBTB[idx].push_back(entry); 116 | } 117 | } else { 118 | BTBEntry entry = theBTB[idx][way]; 119 | entry.branch_type = b_type; 120 | if (target != 0) { 121 | entry.target_ip = target; 122 | } 123 | entry.lru = lru_counter; 124 | 125 | //Update LRU 126 | 
theBTB[idx].erase(theBTB[idx].begin() + way); 127 | theBTB[idx].push_back(entry); 128 | } 129 | } 130 | 131 | uint64_t get_lru_value(uint64_t ip) { 132 | int idx = index(ip); 133 | uint64_t lru_value; 134 | if (theBTB[idx].size() < assoc) { //All ways are not yet allocated 135 | lru_value = 0; 136 | } else { 137 | lru_value = theBTB[idx][0].lru; 138 | for (uint32_t i = 1; i < theBTB[idx].size(); i++) { //We should never enter here because head should be LRU 139 | if (theBTB[idx][i].lru < lru_value) { 140 | assert(0); 141 | } 142 | } 143 | } 144 | 145 | return lru_value; 146 | } 147 | 148 | }; 149 | 150 | /*BTB BTB_4D(1024, 8); //Storage: (tag:16-bit, branch-type: 2-bit, target-offset: 10-bit) 28*1024*8 = 28KB 151 | BTB BTB_6D(1024, 8); //Storage: (tag:16-bit, branch-type: 2-bit, target-offset: 15-bit) 33*1024*7 = 28.875KB 152 | BTB BTB_8D(1024, 8); //Storage: (tag:16-bit, branch-type: 2-bit, target-offset: 25-bit) 43*1024*8 = 43KB 153 | BTB BTB_12D(512, 8); //Storage: (tag:16-bit, branch-type: 2-bit, full-target: 64-bit) 82*256*4 = 10.25KB 154 | BTB BTB_18D(512, 8); 155 | BTB BTB_25D(256, 8); 156 | BTB BTB_46D(128, 8); 157 | BTB BTB_Ret(1024, 8);*/ 158 | 159 | #define NUM_BTB_PARTITIONS 2 160 | BTB btb_partition[NUM_BTB_PARTITIONS]; 161 | 162 | uint64_t basic_btb_lru_counter[NUM_CPUS]; 163 | 164 | uint64_t basic_btb_indirect[NUM_CPUS][BASIC_BTB_INDIRECT_SIZE]; 165 | uint64_t basic_btb_conditional_history[NUM_CPUS]; 166 | 167 | uint64_t basic_btb_ras[NUM_CPUS][BASIC_BTB_RAS_SIZE]; 168 | int basic_btb_ras_index[NUM_CPUS]; 169 | /* 170 | * The following two variables are used to automatically identify the 171 | * size of call instructions, in bytes, which tells us the appropriate 172 | * target for a call's corresponding return. 173 | * They exist because ChampSim does not model a specific ISA, and 174 | * different ISAs could use different sizes for call instructions, 175 | * and even within the same ISA, calls can have different sizes. 
176 | */ 177 | uint64_t basic_btb_call_instr_sizes[NUM_CPUS][BASIC_BTB_CALL_INSTR_SIZE_TRACKERS]; 178 | 179 | uint64_t basic_btb_abs_addr_dist(uint64_t addr1, uint64_t addr2) { 180 | if(addr1 > addr2) { 181 | return addr1 - addr2; 182 | } 183 | 184 | return addr2 - addr1; 185 | } 186 | 187 | void push_basic_btb_ras(uint8_t cpu, uint64_t ip) { 188 | basic_btb_ras_index[cpu]++; 189 | if (basic_btb_ras_index[cpu] == BASIC_BTB_RAS_SIZE) { 190 | basic_btb_ras_index[cpu] = 0; 191 | } 192 | 193 | basic_btb_ras[cpu][basic_btb_ras_index[cpu]] = ip; 194 | } 195 | 196 | uint64_t peek_basic_btb_ras(uint8_t cpu) { 197 | return basic_btb_ras[cpu][basic_btb_ras_index[cpu]]; 198 | } 199 | 200 | uint64_t pop_basic_btb_ras(uint8_t cpu) { 201 | uint64_t target = basic_btb_ras[cpu][basic_btb_ras_index[cpu]]; 202 | basic_btb_ras[cpu][basic_btb_ras_index[cpu]] = 0; 203 | 204 | basic_btb_ras_index[cpu]--; 205 | if (basic_btb_ras_index[cpu] == -1) { 206 | basic_btb_ras_index[cpu] += BASIC_BTB_RAS_SIZE; 207 | } 208 | 209 | return target; 210 | } 211 | 212 | uint64_t basic_btb_call_size_tracker_hash(uint64_t ip) { 213 | return (ip & (BASIC_BTB_CALL_INSTR_SIZE_TRACKERS-1)); 214 | } 215 | 216 | uint64_t basic_btb_get_call_size(uint8_t cpu, uint64_t ip) { 217 | uint64_t size = basic_btb_call_instr_sizes[cpu][basic_btb_call_size_tracker_hash(ip)]; 218 | 219 | return size; 220 | } 221 | 222 | int convert_offsetBits_to_partitionID(int num_bits) { 223 | return 0; 224 | if (num_bits == 0) { 225 | return 0; 226 | } else if (num_bits <= 4) { 227 | return 1; 228 | } else if (num_bits <= 5) { 229 | return 2; 230 | } else if (num_bits <= 7) { 231 | return 3; 232 | } else if (num_bits <= 9) { 233 | return 4; 234 | } else if (num_bits <= 11) { 235 | return 5; 236 | } else if (num_bits <= 19) { 237 | return 6; 238 | } else if (num_bits <= 25) { 239 | return 7; 240 | } else { 241 | return 8; 242 | } 243 | assert(0); 244 | } 245 | 246 | int get_lru_partition(int start_partitionID, uint64_t ip) { 247 | int 
lru_partition = start_partitionID; 248 | uint64_t lru_value = btb_partition[start_partitionID].get_lru_value(ip); 249 | for (int i = start_partitionID + 1; i < NUM_BTB_PARTITIONS; i++) { 250 | uint64_t partition_lru_value = btb_partition[i].get_lru_value(ip); 251 | if (partition_lru_value < lru_value) { 252 | lru_partition = i; 253 | lru_value = partition_lru_value; 254 | } 255 | } 256 | return lru_partition; 257 | } 258 | 259 | void O3_CPU::initialize_btb() { 260 | std::cout << "Basic BTB sets: " << BASIC_BTB_SETS 261 | << " ways: " << BASIC_BTB_WAYS 262 | << " indirect buffer size: " << BASIC_BTB_INDIRECT_SIZE 263 | << " RAS size: " << BASIC_BTB_RAS_SIZE << std::endl; 264 | 265 | for (uint32_t i = 0; i < BASIC_BTB_RAS_SIZE; i++) { 266 | basic_btb_ras[cpu][i] = 0; 267 | } 268 | basic_btb_ras_index[cpu] = 0; 269 | for (uint32_t i=0; ibranch_type; 308 | 309 | //uint8_t always_taken = false; 310 | //if (branch_type != BRANCH_CONDITIONAL) { 311 | //always_taken = true; 312 | //} 313 | 314 | if ((branch_type == BRANCH_DIRECT_CALL) || 315 | (branch_type == BRANCH_INDIRECT_CALL)) { 316 | // add something to the RAS 317 | push_basic_btb_ras(cpu, ip); 318 | } 319 | 320 | if (branch_type == BRANCH_RETURN) { 321 | // peek at the top of the RAS 322 | uint64_t target = peek_basic_btb_ras(cpu); 323 | // and adjust for the size of the call instr 324 | target += basic_btb_get_call_size(cpu, target); 325 | 326 | BTB_outcome outcome = {target, BRANCH_RETURN, 0}; 327 | return outcome; 328 | //return std::make_pair(target, always_taken); 329 | } /*else if ((branch_type == BRANCH_INDIRECT) || 330 | (branch_type == BRANCH_INDIRECT_CALL)) { 331 | return std::make_pair(basic_btb_indirect[cpu][basic_btb_indirect_hash(cpu, ip)], always_taken); 332 | } */else { 333 | // use BTB for all other branches + direct calls 334 | 335 | BTB_outcome outcome = {btb_entry->target_ip, branch_type, 0}; 336 | return outcome; 337 | //return std::make_pair(btb_entry->target_ip, always_taken); 338 | } 339 | 
340 | assert(0); 341 | //return std::make_pair(0, always_taken); 342 | } 343 | 344 | void O3_CPU::update_btb(uint64_t ip, uint64_t branch_target, uint8_t taken, 345 | uint8_t branch_type) { 346 | // updates for indirect branches 347 | /*if ((branch_type == BRANCH_INDIRECT) || 348 | (branch_type == BRANCH_INDIRECT_CALL)) { 349 | basic_btb_indirect[cpu][basic_btb_indirect_hash(cpu, ip)] = branch_target; 350 | } 351 | if (branch_type == BRANCH_CONDITIONAL) { 352 | basic_btb_conditional_history[cpu] <<= 1; 353 | if (taken) { 354 | basic_btb_conditional_history[cpu] |= 1; 355 | } 356 | }*/ 357 | 358 | if (branch_type == BRANCH_RETURN) { 359 | // recalibrate call-return offset 360 | // if our return prediction got us into the right ball park, but not the 361 | // exactly correct byte target, then adjust our call instr size tracker 362 | uint64_t call_ip = pop_basic_btb_ras(cpu); 363 | uint64_t estimated_call_instr_size = basic_btb_abs_addr_dist(call_ip, branch_target); 364 | if (estimated_call_instr_size <= 10) { 365 | basic_btb_call_instr_sizes[cpu][basic_btb_call_size_tracker_hash(call_ip)] = estimated_call_instr_size; 366 | } 367 | } 368 | 369 | BTBEntry *btb_entry; 370 | int partitionID = -1; 371 | for (int i = 0; i < NUM_BTB_PARTITIONS; i++) { 372 | btb_entry = btb_partition[i].get_BTBentry(ip); 373 | if (btb_entry) { 374 | partitionID = i; 375 | break; 376 | } 377 | } 378 | 379 | if (btb_entry == NULL) { 380 | 381 | BTB_writes++; 382 | 383 | int num_bits; 384 | if (branch_type == BRANCH_RETURN) { 385 | num_bits = 0; 386 | } else { 387 | uint64_t diff_bits = (branch_target >> 2) ^ (ip >> 2); 388 | num_bits = 0; 389 | while (diff_bits != 0) { 390 | diff_bits = diff_bits >> 1; 391 | num_bits++; 392 | } 393 | } 394 | assert(num_bits >= 0 && num_bits < 66); 395 | 396 | int smallest_offset_partition_id = convert_offsetBits_to_partitionID(num_bits); 397 | 398 | int partition = get_lru_partition(smallest_offset_partition_id, ip); 399 | assert(partition < 
NUM_BTB_PARTITIONS); 400 | 401 | btb_partition[partition].update_BTB(ip, branch_type, branch_target, taken, basic_btb_lru_counter[cpu]); 402 | basic_btb_lru_counter[cpu]++; 403 | 404 | 405 | } else { 406 | // update an existing entry 407 | assert(partitionID != -1); 408 | btb_partition[partitionID].update_BTB(ip, branch_type, branch_target, taken, basic_btb_lru_counter[cpu]); 409 | basic_btb_lru_counter[cpu]++; 410 | } 411 | 412 | } 413 | -------------------------------------------------------------------------------- /btb/BTBX.btb: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * This file implements a basic Branch Target Buffer (BTB) structure. 4 | * It uses a set-associative BTB to predict the targets of non-return branches, 5 | * and it uses a small Return Address Stack (RAS) to predict the target of 6 | * returns. 7 | */ 8 | 9 | #include "ooo_cpu.h" 10 | 11 | #define BASIC_BTB_SETS 4096 12 | #define BASIC_BTB_WAYS 4 13 | #define BASIC_BTB_INDIRECT_SIZE 4096 14 | #define BASIC_BTB_RAS_SIZE 64 15 | #define BASIC_BTB_CALL_INSTR_SIZE_TRACKERS 1024 16 | 17 | struct BTBEntry { 18 | uint64_t tag; 19 | uint64_t target_ip; 20 | uint8_t branch_type; 21 | uint64_t lru; 22 | }; 23 | 24 | struct BTB { 25 | std::vector > theBTB; 26 | uint32_t numSets; 27 | uint32_t assoc; 28 | uint64_t indexMask; 29 | uint32_t numIndexBits; 30 | 31 | BTB () {} 32 | 33 | BTB( int32_t Sets, int32_t Assoc ) 34 | : numSets(Sets) 35 | , assoc(Assoc) { 36 | //aBTBSize must be a power of 2 37 | assert( ((Sets - 1) & (Sets)) == 0); 38 | theBTB.resize(Sets); 39 | indexMask = Sets - 1; 40 | numIndexBits = (uint32_t) log2((double)Sets); 41 | } 42 | 43 | void init_btb (int32_t Sets, int32_t Assoc) { 44 | numSets = Sets; 45 | assoc = Assoc; 46 | //aBTBSize must be a power of 2 47 | assert( ((Sets - 1) & (Sets)) == 0); 48 | theBTB.resize(Sets); 49 | indexMask = Sets - 1; 50 | numIndexBits = (uint32_t) log2((double)Sets); 51 | } 52 | 53 | int32_t 
index(uint64_t ip) { 54 | return ((ip >> 2) & indexMask); 55 | } 56 | 57 | uint64_t get_tag(uint64_t ip) { 58 | //return ip; 59 | uint64_t addr = ip; 60 | addr = addr >> 2; 61 | addr = addr >> numIndexBits; 62 | /* We use a 16-bit tag. 63 | * The lower 8-bits stay the same as in the full tag. 64 | * The upper 8-bits are the folded X-OR of the remaining bits of the full tag. 65 | */ 66 | uint64_t tag = addr & 0xFF; //Set the lower 8-bits of the tag 67 | addr = addr >> 8; 68 | int tagMSBs = 0; 69 | /*Get the upper 8-bits (folded X-OR)*/ 70 | for (int i = 0; i < 8; i++) { 71 | tagMSBs = tagMSBs ^ (addr & 0xFF); 72 | addr = addr >> 8; 73 | } 74 | /*Concatenate the lower and upper 8-bits of tag*/ 75 | tag = tag | (tagMSBs << 8); 76 | return tag; 77 | } 78 | 79 | BTBEntry *get_BTBentry(uint64_t ip){ 80 | BTBEntry *entry = NULL; 81 | 82 | int idx = index(ip); 83 | uint64_t tag = get_tag(ip); 84 | for (uint32_t i = 0; i < theBTB[idx].size(); i++) { 85 | if (theBTB[idx][i].tag == tag) { 86 | return &(theBTB[idx][i]); 87 | } 88 | } 89 | 90 | return entry; 91 | } 92 | 93 | void update_BTB(uint64_t ip, uint8_t b_type, uint64_t target, uint8_t taken, uint64_t lru_counter){ 94 | int idx = index(ip); 95 | uint64_t tag = get_tag(ip); 96 | int way = -1; 97 | for (uint32_t i = 0; i < theBTB[idx].size(); i++) { 98 | if (theBTB[idx][i].tag == tag) { 99 | way = i; 100 | break; 101 | } 102 | } 103 | 104 | if (way == -1) { 105 | if ((target != 0) && taken) { 106 | BTBEntry entry; 107 | entry.tag = tag; 108 | entry.branch_type = b_type; 109 | entry.target_ip = target; 110 | entry.lru = lru_counter; 111 | 112 | if (theBTB[idx].size() >= assoc) { 113 | theBTB[idx].erase(theBTB[idx].begin()); 114 | } 115 | theBTB[idx].push_back(entry); 116 | } 117 | } else { 118 | BTBEntry entry = theBTB[idx][way]; 119 | entry.branch_type = b_type; 120 | if (target != 0) { 121 | entry.target_ip = target; 122 | } 123 | entry.lru = lru_counter; 124 | 125 | //Update LRU 126 | 
theBTB[idx].erase(theBTB[idx].begin() + way); 
127 | theBTB[idx].push_back(entry); 
128 | } 
129 | } 
130 | 
// LRU timestamp of ip's set in this partition: 0 if the set still has free
// ways (so an un-filled partition always wins the cross-partition LRU race
// in get_lru_partition), otherwise the head entry's timestamp. The loop is
// a pure sanity check that the head really is oldest; it never updates
// lru_value, it only asserts.
131 | uint64_t get_lru_value(uint64_t ip) { 
132 | int idx = index(ip); 
133 | uint64_t lru_value; 
134 | if (theBTB[idx].size() < assoc) { //All ways are not yet allocated 
135 | lru_value = 0; 
136 | } else { 
137 | lru_value = theBTB[idx][0].lru; 
138 | for (uint32_t i = 1; i < theBTB[idx].size(); i++) { //We should never enter here because head should be LRU 
139 | if (theBTB[idx][i].lru < lru_value) { 
140 | assert(0); 
141 | } 
142 | } 
143 | } 
144 | 
145 | return lru_value; 
146 | } 
147 | 
148 | }; 
149 | 
// Historical per-partition sizing experiments, kept for reference.
150 | /*BTB BTB_4D(1024, 8); //Storage: (tag:16-bit, branch-type: 2-bit, target-offset: 10-bit) 28*1024*8 = 28KB 
151 | BTB BTB_6D(1024, 8); //Storage: (tag:16-bit, branch-type: 2-bit, target-offset: 15-bit) 33*1024*7 = 28.875KB 
152 | BTB BTB_8D(1024, 8); //Storage: (tag:16-bit, branch-type: 2-bit, target-offset: 25-bit) 43*1024*8 = 43KB 
153 | BTB BTB_12D(512, 8); //Storage: (tag:16-bit, branch-type: 2-bit, full-target: 64-bit) 82*256*4 = 10.25KB 
154 | BTB BTB_18D(512, 8); 
155 | BTB BTB_25D(256, 8); 
156 | BTB BTB_46D(128, 8); 
157 | BTB BTB_Ret(1024, 8);*/ 
158 | 
// BTB-X: 9 partitions, each holding entries whose target offset fits a given
// bit width (see convert_offsetBits_to_partitionID). Partitions are sized in
// initialize_btb(). NOTE(review): these globals are shared across CPUs while
// the LRU counter below is per-CPU — presumably single-core use; confirm.
159 | #define NUM_BTB_PARTITIONS 9 
160 | BTB btb_partition[NUM_BTB_PARTITIONS]; 
161 | 
// Monotonic counter providing LRU timestamps, one per CPU.
162 | uint64_t basic_btb_lru_counter[NUM_CPUS]; 
163 | 
// Indirect-target buffer and conditional history (currently unused — the
// code paths referencing them are commented out in btb_prediction/update_btb).
164 | uint64_t basic_btb_indirect[NUM_CPUS][BASIC_BTB_INDIRECT_SIZE]; 
165 | uint64_t basic_btb_conditional_history[NUM_CPUS]; 
166 | 
// Circular return-address stack and its top-of-stack index, per CPU.
167 | uint64_t basic_btb_ras[NUM_CPUS][BASIC_BTB_RAS_SIZE]; 
168 | int basic_btb_ras_index[NUM_CPUS]; 
169 | /* 
170 | * The following two variables are used to automatically identify the 
171 | * size of call instructions, in bytes, which tells us the appropriate 
172 | * target for a call's corresponding return. 
173 | * They exist because ChampSim does not model a specific ISA, and 
174 | * different ISAs could use different sizes for call instructions, 
175 | * and even within the same ISA, calls can have different sizes.
176 | */ 
177 | uint64_t basic_btb_call_instr_sizes[NUM_CPUS][BASIC_BTB_CALL_INSTR_SIZE_TRACKERS]; 
178 | 
// Absolute distance between two addresses (used to estimate call size).
179 | uint64_t basic_btb_abs_addr_dist(uint64_t addr1, uint64_t addr2) { 
180 | if(addr1 > addr2) { 
181 | return addr1 - addr2; 
182 | } 
183 | 
184 | return addr2 - addr1; 
185 | } 
186 | 
// Push the call's ip onto the circular RAS (advance index, then store;
// silently overwrites the oldest entry on overflow).
187 | void push_basic_btb_ras(uint8_t cpu, uint64_t ip) { 
188 | basic_btb_ras_index[cpu]++; 
189 | if (basic_btb_ras_index[cpu] == BASIC_BTB_RAS_SIZE) { 
190 | basic_btb_ras_index[cpu] = 0; 
191 | } 
192 | 
193 | basic_btb_ras[cpu][basic_btb_ras_index[cpu]] = ip; 
194 | } 
195 | 
// Read the top-of-stack call ip without popping.
196 | uint64_t peek_basic_btb_ras(uint8_t cpu) { 
197 | return basic_btb_ras[cpu][basic_btb_ras_index[cpu]]; 
198 | } 
199 | 
// Pop and return the top-of-stack call ip; the vacated slot is zeroed and
// the index wraps backwards.
200 | uint64_t pop_basic_btb_ras(uint8_t cpu) { 
201 | uint64_t target = basic_btb_ras[cpu][basic_btb_ras_index[cpu]]; 
202 | basic_btb_ras[cpu][basic_btb_ras_index[cpu]] = 0; 
203 | 
204 | basic_btb_ras_index[cpu]--; 
205 | if (basic_btb_ras_index[cpu] == -1) { 
206 | basic_btb_ras_index[cpu] += BASIC_BTB_RAS_SIZE; 
207 | } 
208 | 
209 | return target; 
210 | } 
211 | 
// Direct-mapped hash into the call-size tracker table (power-of-two mask).
212 | uint64_t basic_btb_call_size_tracker_hash(uint64_t ip) { 
213 | return (ip & (BASIC_BTB_CALL_INSTR_SIZE_TRACKERS-1)); 
214 | } 
215 | 
// Last learned call-instruction size for this call site (0 if never learned).
216 | uint64_t basic_btb_get_call_size(uint8_t cpu, uint64_t ip) { 
217 | uint64_t size = basic_btb_call_instr_sizes[cpu][basic_btb_call_size_tracker_hash(ip)]; 
218 | 
219 | return size; 
220 | } 
221 | 
// Map the number of bits needed for a branch's target offset to the
// smallest partition able to hold it (0 = returns/zero offset, 8 = widest).
// The if/else ladder is exhaustive, so the trailing assert(0) on line 242 is
// unreachable dead code.
222 | int convert_offsetBits_to_partitionID(int num_bits) { 
223 | if (num_bits == 0) { 
224 | return 0; 
225 | } else if (num_bits <= 4) { 
226 | return 1; 
227 | } else if (num_bits <= 5) { 
228 | return 2; 
229 | } else if (num_bits <= 7) { 
230 | return 3; 
231 | } else if (num_bits <= 9) { 
232 | return 4; 
233 | } else if (num_bits <= 11) { 
234 | return 5; 
235 | } else if (num_bits <= 19) { 
236 | return 6; 
237 | } else if (num_bits <= 25) { 
238 | return 7; 
239 | } else { 
240 | return 8; 
241 | } 
242 | assert(0); 
243 | } 
244 | 
// Among partitions [start_partitionID, NUM_BTB_PARTITIONS), pick the one
// whose set for `ip` has the oldest (smallest) LRU value; ties keep the
// lowest-numbered (smallest-offset) partition.
245 | int get_lru_partition(int start_partitionID, uint64_t ip) { 
246 | int lru_partition =
start_partitionID; 
247 | uint64_t lru_value = btb_partition[start_partitionID].get_lru_value(ip); 
248 | for (int i = start_partitionID + 1; i < NUM_BTB_PARTITIONS; i++) { 
249 | uint64_t partition_lru_value = btb_partition[i].get_lru_value(ip); 
250 | if (partition_lru_value < lru_value) { 
251 | lru_partition = i; 
252 | lru_value = partition_lru_value; 
253 | } 
254 | } 
255 | return lru_partition; 
256 | } 
257 | 
// Per-core BTB state initialization: banner print, RAS reset, then
// (in the damaged region below) partition sizing / counter resets.
258 | void O3_CPU::initialize_btb() { 
259 | std::cout << "Basic BTB sets: " << BASIC_BTB_SETS 
260 | << " ways: " << BASIC_BTB_WAYS 
261 | << " indirect buffer size: " << BASIC_BTB_INDIRECT_SIZE 
262 | << " RAS size: " << BASIC_BTB_RAS_SIZE << std::endl; 
263 | 
264 | for (uint32_t i = 0; i < BASIC_BTB_RAS_SIZE; i++) { 
265 | basic_btb_ras[cpu][i] = 0; 
266 | } 
267 | basic_btb_ras_index[cpu] = 0; 
// NOTE(review): EXTRACTION DAMAGE. Original source lines 268-306 were eaten
// by an HTML-tag-stripping filter: the text from the '<' in what was
// presumably "for (uint32_t i=0; i<NUM_BTB_PARTITIONS; i++) { ... }" up to
// the '>' of a later "btb_entry->..." is missing. The lost span most likely
// contained the per-partition init_btb() calls, counter resets, the end of
// initialize_btb(), and the beginning of O3_CPU::btb_prediction() including
// the BTB lookup that defines `btb_entry` and `branch_type` used below.
// Do NOT reconstruct from guesswork — recover from the upstream repository.
268 | for (uint32_t i=0; ibranch_type; 
307 | 
308 | //uint8_t always_taken = false; 
309 | //if (branch_type != BRANCH_CONDITIONAL) { 
310 | //always_taken = true; 
311 | //} 
312 | 
// Calls push the return site onto the RAS so the matching return can be
// predicted later.
313 | if ((branch_type == BRANCH_DIRECT_CALL) || 
314 | (branch_type == BRANCH_INDIRECT_CALL)) { 
315 | // add something to the RAS 
316 | push_basic_btb_ras(cpu, ip); 
317 | } 
318 | 
// Returns are predicted from the RAS top plus the learned call size.
319 | if (branch_type == BRANCH_RETURN) { 
320 | // peek at the top of the RAS 
321 | uint64_t target = peek_basic_btb_ras(cpu); 
322 | // and adjust for the size of the call instr 
323 | target += basic_btb_get_call_size(cpu, target); 
324 | 
325 | BTB_outcome outcome = {target, BRANCH_RETURN, 0}; 
326 | return outcome; 
327 | //return std::make_pair(target, always_taken); 
328 | } /*else if ((branch_type == BRANCH_INDIRECT) || 
329 | (branch_type == BRANCH_INDIRECT_CALL)) { 
330 | return std::make_pair(basic_btb_indirect[cpu][basic_btb_indirect_hash(cpu, ip)], always_taken); 
331 | } */else { 
332 | // use BTB for all other branches + direct calls 
333 | 
334 | BTB_outcome outcome = {btb_entry->target_ip, branch_type, 0}; 
335 | return outcome; 
336 | //return std::make_pair(btb_entry->target_ip, always_taken); 
337 | } 
338 | 
// Unreachable: both arms above return.
339 | assert(0)
340 | //return std::make_pair(0, always_taken); 
341 | } 
342 | 
// Train the BTB after a branch resolves: recalibrate call sizes for returns,
// then (for taken branches only) insert into, or refresh, the partitioned BTB.
343 | void O3_CPU::update_btb(uint64_t ip, uint64_t branch_target, uint8_t taken, 
344 | uint8_t branch_type) { 
345 | // updates for indirect branches 
346 | /*if ((branch_type == BRANCH_INDIRECT) || 
347 | (branch_type == BRANCH_INDIRECT_CALL)) { 
348 | basic_btb_indirect[cpu][basic_btb_indirect_hash(cpu, ip)] = branch_target; 
349 | } 
350 | if (branch_type == BRANCH_CONDITIONAL) { 
351 | basic_btb_conditional_history[cpu] <<= 1; 
352 | if (taken) { 
353 | basic_btb_conditional_history[cpu] |= 1; 
354 | } 
355 | }*/ 
356 | 
357 | if (branch_type == BRANCH_RETURN) { 
358 | // recalibrate call-return offset 
359 | // if our return prediction got us into the right ball park, but not the 
360 | // exactly correct byte target, then adjust our call instr size tracker 
361 | uint64_t call_ip = pop_basic_btb_ras(cpu); 
362 | uint64_t estimated_call_instr_size = basic_btb_abs_addr_dist(call_ip, branch_target); 
// Only accept plausible call-instruction sizes (<= 10 bytes).
363 | if (estimated_call_instr_size <= 10) { 
364 | basic_btb_call_instr_sizes[cpu][basic_btb_call_size_tracker_hash(call_ip)] = estimated_call_instr_size; 
365 | } 
366 | } 
367 | 
// Not-taken branches never allocate or refresh BTB entries.
368 | if (taken == false) 
369 | return; 
370 | 
// Search every partition for an existing entry for this ip.
// NOTE(review): btb_entry is uninitialized before the loop; it is only safe
// because NUM_BTB_PARTITIONS >= 1 guarantees at least one assignment.
371 | BTBEntry *btb_entry; 
372 | int partitionID = -1; 
373 | for (int i = 0; i < NUM_BTB_PARTITIONS; i++) { 
374 | btb_entry = btb_partition[i].get_BTBentry(ip); 
375 | if (btb_entry) { 
376 | partitionID = i; 
377 | break; 
378 | } 
379 | } 
380 | 
381 | if (btb_entry == NULL) { 
382 | 
383 | BTB_writes++; 
384 | 
// Count the significant bits of the ip-relative target offset (in units of
// 4 bytes, since both sides drop the 2 low bits); returns need no offset.
385 | int num_bits; 
386 | if (branch_type == BRANCH_RETURN) { 
387 | num_bits = 0; 
388 | } else { 
389 | uint64_t diff_bits = (branch_target >> 2) ^ (ip >> 2); 
390 | num_bits = 0; 
391 | while (diff_bits != 0) { 
392 | diff_bits = diff_bits >> 1; 
393 | num_bits++; 
394 | } 
395 | } 
// num_bits can be at most 62 after the >>2; the bound 66 is loose.
396 | assert(num_bits >= 0 && num_bits < 66); 
397 | 
// Allocate in the least-recently-used partition among those whose target
// field is wide enough for this offset.
398 | int smallest_offset_partition_id = convert_offsetBits_to_partitionID(num_bits); 
399 | 
400 | int partition = get_lru_partition(smallest_offset_partition_id, ip); 
401 |
assert(partition < NUM_BTB_PARTITIONS); 
402 | 
// Miss path: insert the new entry into the chosen partition and advance the
// per-CPU LRU timestamp counter.
403 | btb_partition[partition].update_BTB(ip, branch_type, branch_target, taken, basic_btb_lru_counter[cpu]); 
404 | basic_btb_lru_counter[cpu]++; 
405 | 
406 | 
407 | } else { 
408 | // update an existing entry 
// Hit path: refresh target/type/LRU in the partition that already holds it.
// NOTE(review): an entry stays in its original partition even if the new
// target offset no longer fits that partition's width class — presumably
// acceptable because update_BTB stores the full target here; confirm against
// the storage model in the BTB-X paper.
409 | assert(partitionID != -1); 
410 | btb_partition[partitionID].update_BTB(ip, branch_type, branch_target, taken, basic_btb_lru_counter[cpu]); 
411 | basic_btb_lru_counter[cpu]++; 
412 | } 
413 | 
414 | } 
415 | --------------------------------------------------------------------------------