"""
Flattened single-file snapshot of the PIM-SW/PIM-Simulator "inst-sim" sources,
reformatted from a repository dump.  Original repository layout:

    ├── .gitignore
    ├── README.md
    └── inst-sim
        ├── Core.py
        ├── GlobalVars.py
        ├── Main.py
        ├── Memory.py
        ├── NoC.py
        ├── Operations.py
        ├── Profile.py
        ├── Tile.py
        ├── config.py
        ├── data_convert.py
        ├── instrn_proto.py
        ├── result/mlp/result.txt
        ├── tile_instrn_proto.py
        └── workload/{LSTM2048,bigLSTM,mlp,nmt,vgg16_small,vgg19_small}.tar.gz

.gitignore contents:
    *.cfg  *.swp  *.dot  *.pdf  *.png  *.pyc
    !*.py  !*.pyx  !*/*.py  !*/*.pyx

README.md: fetched from
https://raw.githubusercontent.com/PIM-SW/PIM-Simulator/40699dc06e6154e7c1a794631eb66b6e24737e08/README.md

NOTE(review): the sections below are distinct modules concatenated into one
file; each section keeps its own imports exactly as the original module had
them.  The last section (instrn_proto.py) is truncated in the dump and is
reproduced only up to the truncation point.
"""

# ===========================================================================
# inst-sim/Core.py
# ===========================================================================
import GlobalVars as gv
import numpy as np
import Operations
from data_convert import *
import Profile as pf
import sys


class Core:
    """One compute core inside a tile; executes its private instruction list
    one instruction at a time, modelling multi-cycle instruction latency."""

    def __init__(self, num, tile):
        self.num = num            # core index within the tile
        self.tile = tile          # back-reference to the owning Tile
        self.inst_list = self.load_inst()
        self.cyc = 0              # local cycle counter
        self.pc = 0               # program counter into inst_list
        self.work_cyc = 0         # core is busy for the next work_cyc cycles
        self.is_halted = False
        self.reg = {}             # register file: reg-addr -> value

    def load_inst(self):
        """Load this core's instruction memory from the workload folder."""
        filename = "%s/tile%d/core_imem%d.npy" \
            % (gv.params["foldername"], self.tile.num, self.num)
        return np.load(filename, allow_pickle=True)

    def debug(self, inst):
        """Debug hook; the actual print is disabled, only flushes stdout."""
        if not gv.debug_enabled:
            return
        gv.last_debug_cyc = self.cyc
        sys.stdout.flush()

    def advance(self):
        """Advance the core by one cycle.

        If a multi-cycle instruction is in flight, burn one cycle; the next
        instruction is fetched only on the cycle the current one completes.
        Memory instructions (ld/st) block while the tile's shared-memory port
        is busy (tile.mem_wait > 0) and retry on later cycles.
        """
        self.cyc += 1

        if self.is_halted:
            return

        if self.work_cyc > 0:
            self.work_cyc -= 1
            if self.work_cyc == 0:
                self.pc += 1          # current instruction retires; fall
            else:                     # through to fetch the next one
                return

        inst = self.inst_list[self.pc]

        if inst['opcode'] == 'st':
            if self.tile.mem_wait == 0:
                mem_addr = self.reg[inst['d1']]
                counter = inst['r2']      # consumer count for the allocation
                width = inst['imm']
                vec = inst['vec']
                dat = inst['r1']

                # Source register may be uninitialized; store 0 in that case.
                data = self.reg[dat] if dat in self.reg else 0
                self.tile.memory.allocate(mem_addr, counter, data)

                self.tile.mem_wait = 1    # assuming 1 cyc / vec-write
                self.work_cyc = 1

                self.debug(inst)
                pf.call_stack["Store"] += 1
            # else: store blocked on shared-memory port contention

        elif inst['opcode'] == 'ld':
            if self.tile.mem_wait == 0:
                mem_addr = self.reg[inst['r1']]
                vec = inst['vec']

                accessed, data = self.tile.memory.access(mem_addr)
                if accessed:
                    self.reg[inst['d1']] = data

                    self.tile.mem_wait = 1    # assuming 1 cyc / vec-read
                    self.work_cyc = 1

                    self.debug(inst)
                    pf.call_stack["Load"] += 1
            # else: load blocked

        elif inst['opcode'] == 'hlt':
            self.is_halted = True
            self.tile.halted_core_num += 1
            self.debug(inst)

        elif inst['opcode'] == 'alu' or inst['opcode'] == 'alui':
            vec = inst['vec']
            # Possible pipelining of multiple vectors / parallelization with
            # multiple ALU units, hence latency * 1 rather than * vec.
            self.work_cyc = Operations.latency[inst['opcode']] * 1

            self.debug(inst)
            if inst['opcode'] == 'alu':
                pf.call_stack["ALU"] += 1
            else:
                pf.call_stack["ALUI"] += 1

        elif inst['opcode'] == 'mvm':
            self.work_cyc += Operations.latency[inst['opcode']]
            self.debug(inst)
            pf.call_stack["MVM"] += 1

        elif inst['opcode'] == 'cp':
            self.work_cyc += Operations.latency[inst['opcode']]
            self.debug(inst)
            pf.call_stack["Copy"] += 1

        elif inst['opcode'] == 'set':
            self.work_cyc += Operations.latency[inst['opcode']]

            reg_addr = inst['d1']
            imm = inst['imm']
            vec = inst['vec']

            # 22: default compiler config - address bit width.
            # NOTE(review): i_set() encodes with cfg.addr_width (32) bits —
            # confirm the 22-bit decode here matches the generator.
            self.reg[reg_addr] = bin2int(imm, 22)
            self.debug(inst)
            pf.call_stack["Set"] += 1

        else:
            self.debug(inst)
            raise NotImplementedError


# ===========================================================================
# inst-sim/GlobalVars.py
# ===========================================================================
debug_enabled = True
last_debug_cyc = 0

params = {}           # simulation parameters filled in by Main.init()
NoC = None            # the singleton NoC instance
halted_tile_num = 0   # tiles that have executed their 'halt'
fifo_num = 0          # number of receive FIFOs per tile (== tile count)
total_inst = 0


def ind_to_coord(ind):
    """Map a linear tile index to (x, y) mesh coordinates."""
    return ind % params["tile_x"], ind // params["tile_x"]


# ===========================================================================
# inst-sim/Main.py
# ===========================================================================
import sys
import os
import math
import config as cfg
import numpy as np
import GlobalVars as gv
import Profile as pf
import NoC
import Tile
from collections import deque
from collections import defaultdict
from queue import PriorityQueue
from cProfile import Profile
from pstats import Stats


def init():
    """Parse the workload name from argv, size the mesh, and build the NoC."""
    gv.params["workload"] = sys.argv[1]
    gv.params["foldername"] = "workload/%s" % (sys.argv[1])
    gv.params["result_foldername"] = "result/%s" % (sys.argv[1])
    if not os.path.isdir(gv.params["foldername"]):
        # Fix: original `raise("...")` raised a str, which is a TypeError
        # in Python 3 — raise a real exception instead.
        raise FileNotFoundError("No workload folder exists")
    if not os.path.isdir(gv.params["result_foldername"]):
        os.mkdir(gv.params["result_foldername"])

    total_cores = 0
    total_tiles = 0

    # Count cores ("-core" files) and tiles (subdirectories) in the workload.
    for _, dirnames, filenames in os.walk(gv.params["foldername"]):
        for file in filenames:
            if "-core" in file and not ".swp" in file:
                total_cores += 1
        total_tiles += len(dirnames)

    gv.params["tile_x"] = int(math.sqrt(total_tiles))
    gv.fifo_num = total_tiles

    assert (total_cores % total_tiles == 0)

    gv.params["tile_num"] = int(total_tiles)
    gv.params["core_num"] = int(total_cores / total_tiles)

    gv.NoC = NoC.NoC()


def simulate():
    """Run the NoC cycle loop until every tile has halted."""
    pf.cyc = 0
    while gv.halted_tile_num < gv.params["tile_num"]:
        pf.cyc += 1
        gv.NoC.advance()


def stat():
    """Dump cycle count, call-stack histogram, link and memory stats."""
    result_path = os.path.join(gv.params["result_foldername"], "result.txt")
    # Fix: the original leaked the file handle; close it deterministically.
    with open(result_path, "w") as result_file:
        result_file.write("\n====CPI STACK====\n")
        result_file.write("total cyc: {}\n".format(pf.cyc))
        sum_v = 0
        for _, v in pf.call_stack.items():
            sum_v += v
        for k, v in pf.call_stack.items():
            result_file.write("{} : {} / {}\n".format(k, v, sum_v))

        result_file.write("\n====LINK====\n")
        result_file.write("busiest link: {}KB\n".format(
            float(pf.busiest_link_data) / (8 * 1024)))

        result_file.write("\n====MAX MEMORY SIZE====\n")
        for tile in gv.NoC.tiles:
            result_file.write(
                "tile id: {}\t\t\tphysical size: {}\t\t\tvirtual size: {}\n".format(
                    tile.num, tile.memory.max_physical_size,
                    len(tile.memory.virtual_mem) * cfg.xbar_size))


def testrun():
    init()
    simulate()
    stat()


# Fix: guard the profiled run so importing this file has no side effects;
# behavior is unchanged when executed as a script.
if __name__ == "__main__":
    profiler = Profile()
    profiler.runcall(testrun)

    stats = Stats(profiler)
    stats.strip_dirs()
    stats.sort_stats('tottime')
    stats.print_stats()


# ===========================================================================
# inst-sim/Memory.py
# ===========================================================================
import numpy as np
import GlobalVars as gv
import Profile as pf
import config as cfg
import Tile
import sys


class Memory:
    """Per-tile shared memory with a virtual->physical mapping.

    Each allocation carries a consumer counter; the physical slot is freed
    automatically once it has been read `counter` times.
    """

    def __init__(self, tile):
        self.tile = tile

        # Maps a virtual address to its current physical address.
        self.virtual_to_physical = {}

        # Physical memory: addr -> [counter, virtual addr, data].
        self.physical_mem = {}
        self.max_physical_size = 0   # high-water mark of physical usage

        # Set of virtual addresses ever allocated (value unused).
        self.virtual_mem = {}

    def allocate(self, virtual_addr, counter, data=0):
        """Bind virtual_addr to the first free xbar-aligned physical slot."""
        # Check xbar alignment.
        assert (virtual_addr % cfg.xbar_size == 0)

        self.virtual_mem[virtual_addr] = 0

        # The virtual address must not be live (counter already drained).
        assert ((not virtual_addr in self.virtual_to_physical)
                or (self.physical_mem[self.virtual_to_physical[virtual_addr]][0] == 0))

        # Linear scan for the first unallocated physical slot.
        target_addr = 0
        while True:
            if not target_addr in self.physical_mem:
                self.virtual_to_physical[virtual_addr] = target_addr
                self.physical_mem[target_addr] = [counter, virtual_addr, data]
                break
            target_addr += cfg.xbar_size

        # Track the maximum physical footprint.
        if target_addr > self.max_physical_size:
            self.max_physical_size = target_addr

    def access(self, virtual_addr):
        """Read virtual_addr once; returns (True, data) on success.

        Decrements the consumer counter and frees the slot when it reaches
        zero.  Returns (False, -1) when the address is not live.
        """
        if ((virtual_addr in self.virtual_to_physical)
                and (self.physical_mem[self.virtual_to_physical[virtual_addr]][0] != 0)):

            physical_addr = self.virtual_to_physical[virtual_addr]
            self.physical_mem[physical_addr][0] -= 1
            data = self.physical_mem[physical_addr][2]

            if self.physical_mem[physical_addr][0] == 0:
                # Last consumer: unmap and free the physical slot.
                self.virtual_to_physical.pop(self.physical_mem[physical_addr][1])
                assert (not self.physical_mem[physical_addr][1] in self.virtual_to_physical)
                self.physical_mem.pop(physical_addr)
                assert (not physical_addr in self.physical_mem)
            return True, data

        return False, -1


# ===========================================================================
# inst-sim/NoC.py
# ===========================================================================
import GlobalVars as gv
from queue import PriorityQueue
import Tile
import Profile as pf


class NoC:
    """Mesh network-on-chip: owns all tiles and delivers in-flight packets."""

    def __init__(self):
        self.tiles = [Tile.Tile(i, self) for i in range(gv.params["tile_num"])]
        self.cyc = 0
        # In-flight packets ordered by arrival cycle.
        self.packet_queue = PriorityQueue()

        self.total_inst = 0
        for tile in self.tiles:
            self.total_inst += tile.total_inst

        self.pc = 0
        self.pc_prev = 0

    def send_packets(self, src_tile_num, dst_tile_num, vtile_id, packet_num, wait_cyc):
        """Enqueue packet_num packets arriving after routing + wait latency."""
        packets = (self.cyc + wait_cyc + self.getRoutingLatency(src_tile_num, dst_tile_num),
                   dst_tile_num, vtile_id, packet_num)
        self.packet_queue.put(packets)

    def advance(self):
        """One NoC cycle: deliver due packets, then advance every tile."""
        self.cyc += 1
        while True:
            if self.packet_queue.empty():
                break
            # Peek the earliest arrival without removing it.
            (arrival_cyc, dst_tile_num, vtile_id, packet_num) = self.packet_queue.queue[0]
            if arrival_cyc > self.cyc:
                break
            self.packet_queue.get()
            self.tiles[dst_tile_num].fifo[vtile_id] += packet_num

        self.pc = 0
        for tile in self.tiles:
            self.pc += tile.pc
            tile.advance()
            for core in tile.cores:
                self.pc += core.pc

        self.pc_prev = self.pc

        pf.progress(self.pc, self.total_inst)

    # FIXME: add BW support
    def getRoutingLatency(self, src_tile_num, dst_tile_num):
        """Manhattan-distance hop latency between two tiles on the mesh."""
        src_x, src_y = gv.ind_to_coord(src_tile_num)
        dst_x, dst_y = gv.ind_to_coord(dst_tile_num)

        return abs(src_x - dst_x) + abs(src_y - dst_y)


# ===========================================================================
# inst-sim/Operations.py
# ===========================================================================
op_list = ['ld', 'cp', 'st', 'set', 'nop', 'alu', 'alui', 'mvm', 'vvo', 'hlt', 'jmp', 'beq', 'alu_int', 'crs']
aluop_list = ['add', 'sub', 'sna', 'mul', 'sigmoid']  # sna is also used by the mvm instruction
op_list_tile = ['send', 'receive', 'halt']

# Per-opcode latency in cycles; None = variable (determined by the simulator).
latency = {"ld": None,
           "cp": 1,
           "st": None,
           "set": 1,
           "nop": 1,
           "alu": 1,
           "alui": 1,
           "mvm": 1000,
           "vvo": 1,
           "hlt": 1,
           "jmp": 1,
           "beq": 1,
           "alu_int": 1,
           "crs": 1,
           "send": None,
           "receive": None,
           "halt": 1
           }


# ===========================================================================
# inst-sim/Profile.py
# ===========================================================================
import sys


def progress(count, total):
    """Render a 60-column text progress bar on stdout (carriage-return style)."""
    bar_len = 60
    filled_len = int(round(bar_len * count / float(total)))

    percents = round(100.0 * count / float(total), 1)
    bar = '=' * filled_len + '-' * (bar_len - filled_len)

    sys.stdout.write('progress: [%s] %s%s\r' % (bar, percents, '%'))
    sys.stdout.flush()


passed_data = None      # key: ((src_x, src_y), (dst_x, dst_y)), value: data_bits
busiest_link = None
busiest_link_data = -1
cyc = None              # total simulated cycles, set by Main.simulate()
cpi_stack = {"Copy": 0,
             "Load": 0,
             "Store": 0,
             "Send": 0,
             "Receive": 0,
             "MVM": 0,
             "ALU": 0,
             "ALUI": 0,
             "Set": 0,
             "WriteInput": 0,
             "ReadOutput": 0}

call_stack = {"Copy": 0,
              "Load": 0,
              "Store": 0,
              "Send": 0,
              "Receive": 0,
              "MVM": 0,
              "ALU": 0,
              "ALUI": 0,
              "Set": 0,
              "WriteInput": 0,
              "ReadOutput": 0}


# ===========================================================================
# inst-sim/Tile.py
# ===========================================================================
import numpy as np
import GlobalVars as gv
import Profile as pf
import NoC
import Core
import Memory
import sys


class Tile:
    """One tile: shared memory, receive FIFOs, its cores, and a tile-level
    instruction stream (send/receive/halt)."""

    def __init__(self, num, noc):
        self.num = num
        self.noc = noc
        self.inst_list = self.load_inst()
        self.cyc = 0
        self.pc = 0
        self.send_wait_cyc = 0     # cycles the packet generator is still busy
        self.is_halted = False
        self.halted_core_num = 0   # cores of this tile that executed 'hlt'

        # Number of packets waiting in each virtual-tile receive FIFO.
        self.fifo = [0 for _ in range(gv.fifo_num)]

        self.mem_wait = 0  # cycles left to finish serving the current request

        self.memory = Memory.Memory(self)
        self.cores = [Core.Core(i, self) for i in range(gv.params["core_num"])]

        self.total_inst = len(self.inst_list)
        for core in self.cores:
            self.total_inst += len(core.inst_list)

    def load_inst(self):
        """Load this tile's instruction memory from the workload folder."""
        filename = "%s/tile%d/tile_imem.npy" % (gv.params["foldername"], self.num)
        return np.load(filename, allow_pickle=True)

    def debug(self, inst):
        """Debug hook; the actual print is disabled, only flushes stdout."""
        if not gv.debug_enabled:
            return
        gv.last_debug_cyc = self.cyc
        sys.stdout.flush()

    def advance(self):
        """Advance the tile (and then its cores) by one cycle."""
        self.cyc += 1
        if self.is_halted:
            return

        if self.mem_wait > 0:
            self.mem_wait -= 1

        if self.send_wait_cyc > 0:
            self.send_wait_cyc -= 1

        inst = self.inst_list[self.pc]

        if inst['opcode'] == 'send':
            if self.mem_wait == 0:
                mem_addr = inst['mem_addr']
                # Tile 0 sources external input, so it always "has" the data.
                accessed = True
                if self.num != 0:
                    accessed, data = self.memory.access(mem_addr)
                if accessed:
                    vtile_id = inst['vtile_id']
                    send_width = inst['r1']
                    target_tile_num = inst['r2']
                    vec = inst['vec']

                    # send_width: scalar #, 16: bit-precision, 32: packet size.
                    # Fix: use integer (ceiling) division — plain `/` is float
                    # division in Python 3 and corrupted FIFO packet counts.
                    packet_num = ((send_width * 16 + 31) // 32) * vec
                    self.noc.send_packets(self.num, target_tile_num, vtile_id,
                                          packet_num, self.send_wait_cyc)

                    # shared mem ==> packet generation
                    self.mem_wait = 1  # assuming 1 cyc/vec-read, decoupled packet gen

                    # Decoupled packet gen, assuming 1 cyc/packet.
                    self.send_wait_cyc += 1

                    self.debug(inst)
                    self.pc += 1
                    pf.call_stack["Send"] += 1
            # else: send blocked - shared memory contention

        elif inst['opcode'] == 'receive':
            if self.mem_wait == 0:
                mem_addr = inst['mem_addr']
                vtile_id = inst['vtile_id']
                receive_width = inst['r1']
                counter = inst['r2']
                vec = inst['vec']

                if self.fifo[vtile_id] > 0:
                    # Fix: integer division, as in 'send' above.
                    packet_num = ((receive_width * 16 + 31) // 32) * vec
                    assert packet_num <= self.fifo[vtile_id]

                    self.fifo[vtile_id] -= packet_num
                    self.memory.allocate(mem_addr, counter)

                    # fifo ==> shared mem
                    self.mem_wait = 1  # assuming 1 cyc/vec-write

                    self.debug(inst)
                    self.pc += 1
                    pf.call_stack["Receive"] += 1
                # else: receive blocked (FIFO empty)

        elif inst['opcode'] == 'halt':
            # Halt only once every core of this tile has halted.
            if self.halted_core_num == gv.params["core_num"]:
                self.is_halted = True
                gv.halted_tile_num += 1

                self.debug(inst)

        else:
            raise NotImplementedError

        for core in self.cores:
            core.advance()


# ===========================================================================
# inst-sim/config.py
# ===========================================================================
## Variable to define the type of MVMU
# One of "Analog", "Digital" (To be added), "PNM" (To be added)
MVMU_ver = "Analog"

## Operand precision (fixed point allowed only): num_bits = int_bits + frac_bits
num_bits = 16

## IMA configurable parameters (permissible values for each parameter provided here)
## Instruction generation - affected by xbar_bits, num_xbar, xbar_size.
# xbar_bits: 2, 4, 6
# num_xbar: positive integer
# xbar_size: 32, 64, 128, 256
# dac_res: positive integer <= num_bits
# adc_res: positive integer <= num_bits
# num_adc: positive integer <= num_xbar (doesn't allow more than one ADC per xbar)
# num_ALU: positive integer
# dataMem_size: (in Bytes) - 256, 512, 1024, 2048 (affects instrn width, hence capped)
# instrnMem_size: (in Bytes) - 512, 1024, 2048

# Fixed parameters
addr_width = 32  # Added to address larger address space for conv layers (#TODO: Compiler needs to fix shared memory reuse)
data_width = num_bits    # (in bits)
xbdata_width = data_width  # (in bits)
instrn_width = 48        # (in bits)

# Input and Weight parameters
input_prec = 16
weight_width = 16
# Change here - Specify the IMA parameters here
xbar_bits = 2

# FIXME make num_matrix equal to N_CONSTANT_MVMUS_PER_CORE
# Each matrix is 1 fw logical xbar for inference and 1 fw, 1 bw, and 1 delta
# logical xbar for training.  Each logical xbar for inference is 8 fw physical
# xbars and for training 8 fw, 8 bw and 16 delta physical xbars.
num_matrix = 4
xbar_size = 128
dac_res = 1
# ADC configuration
adc_res = 8  # around 4 to 8
num_adc_per_matrix = 2
num_adc = num_adc_per_matrix * num_matrix

# The idea is to have different ADC resolution value for each ADC.
# The number of ADCs is defined by the num_adc property (2 * num_matrix).
# NOTE: Only indexes 0 and 2 are taken into account; ADCs 1 and 3 are assumed
# to be equal to 0 and 2.

num_ALU = num_matrix * 2
#dataMem_size = num_matrix*(6*xbar_size)  # 4 for 4 input spaces within matrix (1 for f/b each, 2 for d)
dataMem_size = 2048   # 2048 is larger than num_matrix*(6*xbar_size)
instrnMem_size = 512  # in entries

## Tile configurable parameters (permissible values for each parameter provided here)
## Instruction generation - affected by num_ima
# num_ima: positive integer
# edram buswidth: positive integer <= 16 (actual buswidth - this integer*data_width)
# edram_size: (in KiloBytes) - 64, 128, 256, 512
# receive_buffer_depth: 4, 8, 12, 16, 32 (number of edram buffer entries (each entry maps to a virtual tile)) \
#   puts a cap on the maximum number of tiles that can send data to a tile in next layer
# receive_buffer_width: edram_buswidth/data_width (Fixed - in terms of number of neurons)
# tile_instrnMem_size: 256, 512, 1024 (in Bytes)

# Fixed parameters
instrn_width = 48  # bits (op-2, vtile_id-6, send/receive_width-8, target_addr/counter-16, vw-8, mem_addr-16)
edram_buswidth = 256  # in bits
#receive_buffer_depth = 16
receive_buffer_depth = 150  # set equal to num_tile_max
# Fix: integer division (Python-2 remnant) — this is a count of neurons.
receive_buffer_width = edram_buswidth // num_bits  # size of a receive buffer entry

# Change here - Specify the Tile parameters here
num_ima = 8
edram_size = 64  # in Kilobytes (64 KB - same as ISAAC)
tile_instrnMem_size = 2048  # in entries

## Node configurable parameters (permissible values for each parameter provided here)
## Instruction generation - affected by num_tile
# num_tile_compute = positive integer
# inj_rate < 0.2 (depends on the mapping)
# num_port: 4, 8

# Fixed parameters
# NOC topology: cmesh (n=2, k=4, c=4) - can fit k*n*c tiles
cmesh_c = 4
num_bits_tileId = 32
flit_width = 32
# Fix: integer division — packet width is in multiples of flits (data
# considered only - booksim considers the address itself).
packet_width = edram_buswidth // data_width
# (b bit of address = logN, N is the number of nodes)

## Node parameters - Our way of simulation just assumes all tiles in one actual node
# Change here - Specify the Node parameters here (FIXME to be supported)
num_node = 1
num_tile_max = 138.0  # maximum number of tiles per node


# ===========================================================================
# inst-sim/data_convert.py
# ===========================================================================
# APIs to convert data from:
# 1. float to fixed point binary (2s complement)   [float to bit-string]
# 2. fixed point binary (2s complement) to float   [bit-string to float]
# 3. integer to binary (2s complement)             [int to bit-string]
# 4. binary (2s complement) to integer             [bit-string to int]
import numpy as np


def bin2int(binary_string, bits):
    """Decode a 2s-complement bit-string of width `bits` to a signed int."""
    val = int(binary_string, 2)
    if (val & (1 << (bits - 1))) != 0:  # if sign bit is set e.g., 8bit: 128-255
        val = val - (1 << bits)         # compute negative value
    return val


def int2bin(int_data, bits):
    """Encode a signed int as a 2s-complement bit-string of width `bits`."""
    return bin(int_data & (2 ** bits - 1))[2:].zfill(bits)


def float2fixed(float_data, int_bits, frac_bits):
    """Quantize a float to fixed point and return its 2s-complement string."""
    temp = float_data * (2 ** frac_bits)
    temp = int(round(temp))
    return int2bin(temp, (int_bits + frac_bits))


def fixed2float(binary_string, int_bits, frac_bits):
    """Decode a fixed-point 2s-complement string back to a float."""
    temp = bin2int(binary_string, (int_bits + frac_bits))
    return float(temp) / (2 ** frac_bits)


def float2fixed_2d(float_data_arr, int_bits, frac_bits):
    """Convert a 2d numpy float array to a 2d list of fixed-point strings."""
    (num_row, num_col) = np.shape(float_data_arr)
    out_list = [['' for i in range(num_col)] for j in range(num_row)]
    for i in range(num_row):
        for j in range(num_col):
            out_list[i][j] = float2fixed(float_data_arr[i, j], int_bits, frac_bits)
    return out_list


def fixed2float_2d(binary_string_list, int_bits, frac_bits):
    """Convert a 2d list of fixed-point strings to a 2d numpy float array."""
    (num_row, num_col) = np.shape(binary_string_list)
    out_arr = np.zeros((num_row, num_col), dtype=float)
    for i in range(num_row):
        for j in range(num_col):
            out_arr[i, j] = fixed2float(binary_string_list[i][j], int_bits, frac_bits)
    return out_arr


def getBitsFromList(binary_string_list, start_bit, num_bit):
    """Extract num_bit bits starting at start_bit from every element of a
    2d bit-string list.

    Fix: the original built the output as ``[['']*num_col] * num_row`` which
    aliases every row to the same list, so each write clobbered all rows
    (float2fixed_2d had already been fixed the same way).
    """
    (num_row, num_col) = np.shape(binary_string_list)
    out_list = [['' for _ in range(num_col)] for _ in range(num_row)]
    for i in range(num_row):
        for j in range(num_col):
            out_list[i][j] = binary_string_list[i][j][start_bit:start_bit + num_bit]
    return out_list

# (Obsolete commented-out implementations of bin2frac/frac2bin/float2fixed2/
# fixed2float2 and their ad-hoc test script were removed — see repository
# history at the commit referenced in the module docstring.)


# ===========================================================================
# inst-sim/instrn_proto.py  (truncated in the source dump)
# ===========================================================================
# Define the instruction prototypes which will be used by generate_instrn.py
import sys

import numpy as np
from data_convert import *
import config as cfg

# List of supported opcodes/aluops for IMA - cp will copy data (from data
# memory of IMA to xbarInmem).
op_list = ['ld', 'cp', 'st', 'set', 'nop', 'alu', 'alui', 'mvm', 'vvo', 'hlt', 'jmp', 'beq', 'alu_int', 'crs']
aluop_list = ['add', 'sub', 'sna', 'mul', 'sigmoid']  # sna is also used by the mvm instruction

# Instruction format for IMA
dummy_instrn = {'opcode': op_list[0],    # instrn op
                'aluop': aluop_list[0],  # alu function
                'd1': 0,                 # destination
                'r1': 0,                 # operand1 (stride for mvm)
                'r2': 0,                 # operand2
                'r3': 0,                 # operand3 (shift)
                'vec': 0,                # vector width
                'imm': 0,                # immediate (scalar) data
                'xb_nma': 0}             # xbar negative-mask; a xbar evaluates if neg-mask = 1


def i_load(d1, r1, load_width=1, vec=1):
    """Build a 'ld' instruction (edram -> register file)."""
    assert (load_width <= (cfg.edram_buswidth / cfg.data_width)), 'Load width must be smaller than \
        edram_buswidth/data_width'
    i_temp = dummy_instrn.copy()
    i_temp['opcode'] = 'ld'
    i_temp['d1'] = d1    # rf addr
    i_temp['r1'] = r1    # mem addr
    i_temp['imm'] = load_width
    i_temp['vec'] = vec
    return i_temp


def i_store(d1, r1, counter=1, store_width=1, vec=1):
    """Build a 'st' instruction (datamem/sboutmem -> edram)."""
    assert (store_width <= (cfg.edram_buswidth / cfg.data_width)), 'Load width must be smaller than \
        edram_buswidth/data_width'
    i_temp = dummy_instrn.copy()
    i_temp['opcode'] = 'st'
    i_temp['d1'] = d1    # mem addr
    i_temp['r1'] = r1    # rf addr
    i_temp['r2'] = counter
    i_temp['imm'] = store_width
    i_temp['vec'] = vec
    return i_temp


def i_copy(d1, r1, vec=1, src_type=0):
    """Build a 'cp' instruction.

    src_type = 0: copy data from (datamem/xbInmem) to (datamem/xbInmem)
    src_type = 1: copy data from (datamem/xbOutmem) to (datamem/xbInmem)
    """
    i_temp = dummy_instrn.copy()
    i_temp['opcode'] = 'cp'
    i_temp['d1'] = d1
    i_temp['r1'] = r1
    i_temp['vec'] = vec
    return i_temp


def i_set(d1, imm, vec=1):
    """Build a 'set' instruction - set a reg (datamem/xbInmem) to a scalar."""
    i_temp = dummy_instrn.copy()
    i_temp['opcode'] = 'set'
    i_temp['d1'] = d1
    i_temp['imm'] = imm if (type(imm) == str) else int2bin(imm, cfg.addr_width)
    i_temp['vec'] = vec
    return i_temp


def i_alu(aluop, d1, r1, r2=0, imm=0, vec=1):
    """Build an 'alu' instruction - arithmetic/logical/non-linear operations."""
    i_temp = dummy_instrn.copy()
    i_temp['opcode'] = 'alu'
    i_temp['aluop'] = aluop
    i_temp['d1'] = d1
    i_temp['r1'] = r1
    i_temp['r2'] = r2
    i_temp['imm'] = imm  # will be used in lsh
    i_temp['vec'] = vec
    return i_temp


def i_alui(aluop, d1, r1, imm, vec=1):
    """Build an 'alui' instruction - ALU op with a scalar immediate.

    NOTE(review): cfg.int_bits / cfg.frac_bits are not defined in config.py
    as dumped — confirm against the full repository.
    """
    i_temp = dummy_instrn.copy()
    i_temp['opcode'] = 'alui'
    i_temp['aluop'] = aluop
    i_temp['d1'] = d1
    i_temp['r1'] = r1
    i_temp['imm'] = float2fixed(imm, cfg.int_bits, cfg.frac_bits)
    i_temp['vec'] = vec
    return i_temp


def i_mvm(xb_nma=cfg.num_matrix * '0', r1=0, r2=0):
    """Build an 'mvm' (xbar) instruction.

    r1 is the displacement, r2 the length of a continuum of data.
    """
    xb_nma_str = xb_nma[0]
    # Split into a list of 3-bit masks, one per matrix.
    xb_nma_list = [xb_nma_str[i] + '00' for i in range(len(xb_nma_str))]
    assert (len(xb_nma_list) == cfg.num_matrix)  # each matrix in a core has a 3-bit mask
    i_temp = dummy_instrn.copy()
    i_temp['opcode'] = 'mvm'
    i_temp['r1'] = r1
    i_temp['r2'] = r2
    i_temp['xb_nma'] = xb_nma_list
    return i_temp


## Added for COMPILER - i_train, mask as integer
def i_train(xb_nma=cfg.num_matrix * ['000'], r1=0, r2=0):
    """Build a training 'mvm' instruction (3-bit mask per matrix).

    r1 is the displacement, r2 the length of a continuum of data.
    """
    xb_nma_str = xb_nma[0]
    # Fix: integer division — `len(...)/3` is a float in Python 3 and
    # range() would raise TypeError.
    xb_nma_list = [xb_nma_str[i * 3:(i + 1) * 3] for i in range(len(xb_nma_str) // 3)]
    assert (len(xb_nma_list) == cfg.num_matrix)  # each matrix in a core has a 3-bit mask
    i_temp = dummy_instrn.copy()
    i_temp['opcode'] = 'mvm'
    i_temp['r1'] = r1
    i_temp['r2'] = r2
    i_temp['xb_nma'] = xb_nma_list
    return i_temp


# generate crs instruction
# for each matrix, one bit to specify whether to do crs or
# [SOURCE TRUNCATED HERE IN THE ORIGINAL DUMP]
not 120 | def i_crs (xb_nma = cfg.num_matrix*['0']): 121 | assert (len(xb_nma) == cfg.num_matrix) # each matrix in a core has a 1-bit mask 122 | i_temp = dummy_instrn.copy() 123 | i_temp['opcode'] = 'crs' 124 | i_temp['xb_nma'] = xb_nma 125 | return i_temp 126 | 127 | # generate halt prototype 128 | def i_hlt (): 129 | i_temp = dummy_instrn.copy() 130 | i_temp['opcode'] = 'hlt' 131 | return i_temp 132 | 133 | # generate jmp prototype 134 | def i_jmp (imm): # imm is the jump target 135 | i_temp = dummy_instrn.copy() 136 | i_temp['opcode'] = 'jmp' 137 | i_temp['imm'] = imm 138 | return i_temp 139 | 140 | # generate beq prototype 141 | def i_beq (r1, r2, imm): # imm is the jump target 142 | i_temp = dummy_instrn.copy() 143 | i_temp['opcode'] = 'beq' 144 | i_temp['r1'] = r1 145 | i_temp['r2'] = r2 146 | i_temp['imm'] = imm 147 | return i_temp 148 | 149 | # generate alu_int prototype 150 | def i_alu_int (aluop, d1, r1, r2): 151 | i_temp = dummy_instrn.copy() 152 | i_temp['opcode'] = 'alu_int' 153 | i_temp['aluop'] = aluop 154 | i_temp['d1'] = d1 155 | i_temp['r1'] = r1 156 | i_temp['r2'] = r2 157 | return i_temp 158 | 159 | -------------------------------------------------------------------------------- /inst-sim/result/mlp/result.txt: -------------------------------------------------------------------------------- 1 | 2 | ====CPI STACK==== 3 | total cyc: 10718 4 | Copy : 2284 / 8818 5 | Load : 1537 / 8818 6 | Store : 322 / 8818 7 | Send : 603 / 8818 8 | Receive : 603 / 8818 9 | MVM : 322 / 8818 10 | ALU : 1288 / 8818 11 | ALUI : 0 / 8818 12 | Set : 1859 / 8818 13 | WriteInput : 0 / 8818 14 | ReadOutput : 0 / 8818 15 | 16 | ====LINK==== 17 | busiest link: -0.0001220703125KB 18 | 19 | ====MAX MEMORY SIZE==== 20 | tile id: 0 physical size: 0 virtual size: 0 21 | tile id: 1 physical size: 0 virtual size: 128 22 | tile id: 2 physical size: 896 virtual size: 2560 23 | tile id: 3 physical size: 896 virtual size: 2560 24 | tile id: 4 physical size: 1408 virtual size: 4096 25 | 
tile id: 5 physical size: 1920 virtual size: 3584 26 | tile id: 6 physical size: 1408 virtual size: 3584 27 | tile id: 7 physical size: 1408 virtual size: 3584 28 | tile id: 8 physical size: 1408 virtual size: 3584 29 | tile id: 9 physical size: 1408 virtual size: 3584 30 | tile id: 10 physical size: 1408 virtual size: 3584 31 | tile id: 11 physical size: 896 virtual size: 3584 32 | tile id: 12 physical size: 1280 virtual size: 5632 33 | tile id: 13 physical size: 2048 virtual size: 4736 34 | tile id: 14 physical size: 1152 virtual size: 4736 35 | tile id: 15 physical size: 2048 virtual size: 4736 36 | tile id: 16 physical size: 1408 virtual size: 4736 37 | tile id: 17 physical size: 2048 virtual size: 4736 38 | tile id: 18 physical size: 1280 virtual size: 4736 39 | tile id: 19 physical size: 2048 virtual size: 4736 40 | tile id: 20 physical size: 1408 virtual size: 4736 41 | tile id: 21 physical size: 2048 virtual size: 4736 42 | tile id: 22 physical size: 1152 virtual size: 4736 43 | tile id: 23 physical size: 2048 virtual size: 4736 44 | tile id: 24 physical size: 2176 virtual size: 6784 45 | tile id: 25 physical size: 1920 virtual size: 4736 46 | tile id: 26 physical size: 1664 virtual size: 4736 47 | tile id: 27 physical size: 1920 virtual size: 4736 48 | tile id: 28 physical size: 3328 virtual size: 5248 49 | -------------------------------------------------------------------------------- /inst-sim/tile_instrn_proto.py: -------------------------------------------------------------------------------- 1 | # Define the instruction prototypes which will be used by the generate_instrn.py file 2 | import sys 3 | 4 | import numpy as np 5 | import config as cfg 6 | 7 | # List of supported opcodes for tile 8 | op_list_tile = ['send', 'receive', 'compute', 'halt'] 9 | 10 | # Instruction format for Tile 11 | dummy_instrn_tile = {'opcode' : op_list_tile[0], 12 | 'mem_addr': 0, # send/receive - edram_addr 13 | 'r1': 0, # send-send_width, receive-receive_width 14 | 'r2': 
0, # send-target_addr, receive-counter 15 | 'vtile_id': 0, # send/receive-neuron_id 16 | 'ima_nma': '', # compute - a bit for each ima 17 | 'vec': 0} # vector width 18 | 19 | # Define instruction prototypes 20 | # generate receive prototype 21 | def i_receive (mem_addr, vtile_id, receive_width, counter, vec = 1): 22 | i_temp = dummy_instrn_tile.copy() 23 | i_temp['opcode'] = 'receive' 24 | i_temp['mem_addr'] = mem_addr 25 | i_temp['vtile_id'] = vtile_id 26 | i_temp['r1'] = receive_width 27 | i_temp['r2'] = counter 28 | i_temp['vec'] = vec 29 | return i_temp 30 | 31 | # generate send prototype 32 | def i_send (mem_addr, vtile_id, send_width, target_addr, vec = 1): 33 | i_temp = dummy_instrn_tile.copy() 34 | i_temp['opcode'] = 'send' 35 | i_temp['mem_addr'] = mem_addr 36 | i_temp['vtile_id'] = vtile_id 37 | i_temp['r1'] = send_width 38 | i_temp['r2'] = target_addr 39 | i_temp['vec'] = vec 40 | return i_temp 41 | 42 | # generate halt prototype 43 | def i_halt (): 44 | i_temp = dummy_instrn_tile.copy() 45 | i_temp['opcode'] = 'halt' 46 | return i_temp 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /inst-sim/workload/LSTM2048.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PIM-SW/PIM-Simulator/40699dc06e6154e7c1a794631eb66b6e24737e08/inst-sim/workload/LSTM2048.tar.gz -------------------------------------------------------------------------------- /inst-sim/workload/bigLSTM.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PIM-SW/PIM-Simulator/40699dc06e6154e7c1a794631eb66b6e24737e08/inst-sim/workload/bigLSTM.tar.gz -------------------------------------------------------------------------------- /inst-sim/workload/mlp.tar.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PIM-SW/PIM-Simulator/40699dc06e6154e7c1a794631eb66b6e24737e08/inst-sim/workload/mlp.tar.gz -------------------------------------------------------------------------------- /inst-sim/workload/nmt.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PIM-SW/PIM-Simulator/40699dc06e6154e7c1a794631eb66b6e24737e08/inst-sim/workload/nmt.tar.gz -------------------------------------------------------------------------------- /inst-sim/workload/vgg16_small.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PIM-SW/PIM-Simulator/40699dc06e6154e7c1a794631eb66b6e24737e08/inst-sim/workload/vgg16_small.tar.gz -------------------------------------------------------------------------------- /inst-sim/workload/vgg19_small.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PIM-SW/PIM-Simulator/40699dc06e6154e7c1a794631eb66b6e24737e08/inst-sim/workload/vgg19_small.tar.gz --------------------------------------------------------------------------------