├── Makefile
├── helpers.asm
├── qpu-dis.cpp
└── qpu-asm.cpp


/Makefile:
--------------------------------------------------------------------------------
# Builds the QPU assembler (qpu-asm) and disassembler (qpu-dis).
# CXX and CXXFLAGS can be overridden on the command line, e.g.
#   make CXX=clang++ CXXFLAGS="-O2 -Wall"
CXX = g++
CXXFLAGS = -g

# all and clean do not produce files of those names, so mark them phony;
# otherwise a stray file called "all" or "clean" would suppress the rule.
.PHONY: all clean

all: qpu-asm qpu-dis

qpu-asm: qpu-asm.cpp
	$(CXX) $(CXXFLAGS) -o $@ $<

qpu-dis: qpu-dis.cpp
	$(CXX) $(CXXFLAGS) -o $@ $<

clean:
	rm -f qpu-asm qpu-dis
8 | 


--------------------------------------------------------------------------------
/helpers.asm:
--------------------------------------------------------------------------------
# MUTEX_ACQUIRE / MUTEX_RELEASE
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# MUTEX_ACQUIRE reads ra51 and discards the result into ra39.
# MUTEX_RELEASE writes ra51.
# qpu-dis.cpp names address 51 "mutex" in both its read and write tables.
# In both macros the second pipe issues a nop.
define(`MUTEX_ACQUIRE',     `or ra39, ra51, rb39;           nop')
define(`MUTEX_RELEASE',     `or ra51, ra39, ra39;           nop')
  3 | 
# Hardwired IO registers
# Symbolic aliases for fixed register-file addresses. The disassembler tables
# in qpu-dis.cpp print these addresses as:
#   rb48 (written) = vpm, ra48 (read) = vpm,
#   ra32 / rb32 (read) = unif, ra39 / rb39 (read) = nop.
# NOTE(review): the Zero aliases point at the address the disassembler calls
# nop; presumably a read from it yields zero - confirm against the hardware guide.
define(`rVpmWriteFifo', `rb48')
define(`rVpmReadFifo', `ra48')
define(`raReadUniform', `ra32')
define(`rbReadUniform', `rb32')
define(`raZero', `ra39')
define(`rbZero', `rb39')
 11 | 
# Macro argument constants
# Readable names for the numeric arguments of the setup macros in this file.

# MODEW argument of the DMA setup macros:
# 0 is 32-bit, 2-3 is 16-bit with offset, 4-7 is 8-bit with offset.
define(`MODEW_32_BIT', 0)
define(`MODEW_16_BIT_OFFSET_0', 2)
define(`MODEW_16_BIT_OFFSET_1', 3)
define(`MODEW_8_BIT_OFFSET_0', 4)
define(`MODEW_8_BIT_OFFSET_1', 5)
define(`MODEW_8_BIT_OFFSET_2', 6)
define(`MODEW_8_BIT_OFFSET_3', 7)
# SIZE argument of the block read/write setup macros.
define(`SIZE_8_BIT', 0)
define(`SIZE_16_BIT', 1)
define(`SIZE_32_BIT', 2)
# HORIZ argument (block and DMA store setup).
define(`IS_HORIZ', 1)
define(`NOT_HORIZ', 0)
# VERT argument (DMA load setup only; inverted sense compared to HORIZ).
define(`IS_VERT', 1)
define(`NOT_VERT', 0)
# LANED argument (block read/write setup).
define(`IS_LANED', 1)
define(`NOT_LANED', 0)
 29 | 
 30 | # VPM_BLOCK_WRITE_SETUP
 31 | # ~~~~~~~~~~~~~~~~~~~~~
 32 | # Sets up things so writes go into the small VPM data cache.
 33 | # Once the data's been written (by outputting repeatedly to the VPM_WRITE_FIFO
 34 | # register rb48), you then call VPM_DMA_WRITE_SETUP to configure the main
 35 | # memory destination and writing pattern.
 36 | # Arguments:
 37 | #  STRIDE: 0-64 - How much to increment the ADDR after each write.
 38 | #  HORIZ: 0 or 1 - Whether the layout is horizontal (1) or vertical (0).
 39 | #  LANED: 0 or 1 - Whether the layout is laned (1) or packed (0).
 40 | #  SIZE: 0, 1, 2 - The data unit size, 8-bit (0), 16-bit(1), or 32-bit (2).
 41 | #  ADDR: 0-255 - Packed address, meaning depends on exact unit size and mode.
 42 | # See http://www.broadcom.com/docs/support/videocore/VideoCoreIV-AG100-R.pdf page 57
 43 | define(`VPM_BLOCK_WRITE_SETUP_ID_SHIFT', 30)
 44 | define(`VPM_BLOCK_WRITE_SETUP_STRIDE_SHIFT', 12)
 45 | define(`VPM_BLOCK_WRITE_SETUP_HORIZ_SHIFT', 11)
 46 | define(`VPM_BLOCK_WRITE_SETUP_LANED_SHIFT', 10)
 47 | define(`VPM_BLOCK_WRITE_SETUP_SIZE_SHIFT', 8)
 48 | define(`VPM_BLOCK_WRITE_SETUP_ADDR_SHIFT', 0)
 49 | define(`VPM_BLOCK_WRITE_SETUP_VALUE', `eval(
 50 | (0<<VPM_BLOCK_WRITE_SETUP_ID_SHIFT)|
 51 | ($1<<VPM_BLOCK_WRITE_SETUP_STRIDE_SHIFT)|
 52 | ($2<<VPM_BLOCK_WRITE_SETUP_HORIZ_SHIFT)|
 53 | ($3<<VPM_BLOCK_WRITE_SETUP_LANED_SHIFT)|
 54 | ($4<<VPM_BLOCK_WRITE_SETUP_SIZE_SHIFT)|
 55 | ($5<<VPM_BLOCK_WRITE_SETUP_ADDR_SHIFT))')
 56 | define(`VPM_BLOCK_WRITE_SETUP', `ldi rb49, VPM_BLOCK_WRITE_SETUP_VALUE($1, $2, $3, $4, $5)')
 57 | 
 58 | # VPM_BLOCK_READ_SETUP
 59 | # ~~~~~~~~~~~~~~~~~~~~
 60 | # Controls how values are read from the VPM data cache into the QPU.
 61 | # Arguments:
 62 | #  NUM: 0-16 - How many elements to read at a time.
 63 | #  STRIDE: 0-64 - The amount to increment the address by after each read.
 64 | #  HORIZ: 0 or 1 - Whether the layour is horizontal (1) or vertical (0).
 65 | #  LANED: 0 or 1 - Whether the layout is laned (1) or packed (0).
 66 | #  SIZE: 0, 1, 2 - The data unit size, 8-bit (0), 16-bit(1), or 32-bit (2).
 67 | #  ADDR: 0-255 - Packed address, meaning depends on exact unit size and mode.
 68 | # See http://www.broadcom.com/docs/support/videocore/VideoCoreIV-AG100-R.pdf page 58
 69 | define(`VPM_BLOCK_READ_SETUP_ID_SHIFT', 30)
 70 | define(`VPM_BLOCK_READ_SETUP_NUM_SHIFT', 20)
 71 | define(`VPM_BLOCK_READ_SETUP_STRIDE_SHIFT', 12)
 72 | define(`VPM_BLOCK_READ_SETUP_HORIZ_SHIFT', 11)
 73 | define(`VPM_BLOCK_READ_SETUP_LANED_SHIFT', 10)
 74 | define(`VPM_BLOCK_READ_SETUP_SIZE_SHIFT', 8)
 75 | define(`VPM_BLOCK_READ_SETUP_ADDR_SHIFT', 0)
 76 | define(`VPM_BLOCK_READ_SETUP_VALUE', `eval(
 77 | (0<<VPM_BLOCK_READ_SETUP_ID_SHIFT)|
 78 | ($1<<VPM_BLOCK_READ_SETUP_NUM_SHIFT)|
 79 | ($2<<VPM_BLOCK_READ_SETUP_STRIDE_SHIFT)|
 80 | ($3<<VPM_BLOCK_READ_SETUP_HORIZ_SHIFT)|
 81 | ($4<<VPM_BLOCK_READ_SETUP_LANED_SHIFT)|
 82 | ($5<<VPM_BLOCK_READ_SETUP_SIZE_SHIFT)|
 83 | ($6<<VPM_BLOCK_READ_SETUP_ADDR_SHIFT))')
 84 | define(`VPM_BLOCK_READ_SETUP', `ldi ra49, VPM_BLOCK_READ_SETUP_VALUE($1, $2, $3, $4, $5, $6)')
 85 | 
 86 | # VPM_DMA_STORE_SETUP
 87 | # ~~~~~~~~~~~~~~~~~~~
 88 | # Configures the DMA controller to transfer data from the VPM cache to main memory.
 89 | # Once the setup's been done, you then need to call VPM_DMA_STORE_START to kick
 90 | # off the transfer.
 91 | # Arguments:
 92 | #  UNITS: 0-128 - Number of rows of 2D block in memory.
 93 | #  DEPTH: 0-128 - How long each row is (in bytes?).
 94 | #  HORIZ: 0 or 1 - Whether the layout is horizontal (1) or vertical (0).
 95 | #  ADDRY: The Y coordinate of the address in the VPM space to start from.
 96 | #  ADDRX: The X coordinate of the address in the VPM space to start from.
 97 | #  MODEW: 0-7 : 0 is 32-bit, 2-3 is 16-bit with offset, 4-7 is 8-bit with offset.
 98 | # See http://www.broadcom.com/docs/support/videocore/VideoCoreIV-AG100-R.pdf page 58
 99 | define(`VPM_DMA_STORE_SETUP_ID_SHIFT', 30)
100 | define(`VPM_DMA_STORE_SETUP_UNITS_SHIFT', 23)
101 | define(`VPM_DMA_STORE_SETUP_DEPTH_SHIFT', 16)
102 | define(`VPM_DMA_STORE_SETUP_HORIZ_SHIFT', 14)
103 | define(`VPM_DMA_STORE_SETUP_ADDRY_SHIFT', 7)
104 | define(`VPM_DMA_STORE_SETUP_ADDRX_SHIFT', 3)
105 | define(`VPM_DMA_STORE_SETUP_MODEW_SHIFT', 0)
106 | define(`VPM_DMA_STORE_SETUP_VALUE', `eval(
107 | (2<<VPM_DMA_STORE_SETUP_ID_SHIFT)|
108 | ($1<<VPM_DMA_STORE_SETUP_UNITS_SHIFT)|
109 | ($2<<VPM_DMA_STORE_SETUP_DEPTH_SHIFT)|
110 | ($3<<VPM_DMA_STORE_SETUP_HORIZ_SHIFT)|
111 | ($4<<VPM_DMA_STORE_SETUP_ADDRY_SHIFT)|
112 | ($5<<VPM_DMA_STORE_SETUP_ADDRX_SHIFT)|
113 | ($6<<VPM_DMA_STORE_SETUP_MODEW_SHIFT))')
114 | define(`VPM_DMA_STORE_SETUP', `ldi rb49, VPM_DMA_STORE_SETUP_VALUE($1, $2, $3, $4, $5, $6)')
115 | 
# VPM_DMA_STORE_START
# ~~~~~~~~~~~~~~~~~~~
# Kicks off the transfer of data from the local VPM data cache to main memory.
# It will use the settings from VPM_DMA_STORE_SETUP to control the copy process.
# The address is OR-ed with 0 and written to rb50, which qpu-dis.cpp names
# vw_addr in its write table; the second pipe issues a nop.
# Arguments:
#  address: A register name that holds the address in main memory to write to.
define(`VPM_DMA_STORE_START', `or rb50, $1, 0;          nop')
123 | 
# VPM_DMA_STORE_WAIT_FOR_COMPLETION
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Pause until the previous DMA store operation has finished.
# Implemented as a read of rb50 (named vw_wait in the qpu-dis.cpp read table)
# whose result is discarded into rb39.
define(`VPM_DMA_STORE_WAIT_FOR_COMPLETION', `or rb39, rb50, rb50;       nop')
128 | 
# VPM_DMA_LOAD_SETUP
# ~~~~~~~~~~~~~~~~~~
# Initializes the settings for transferring data from main memory into the VPM cache.
# Arguments:
#  MODEW: 0-7 : 0 is 32-bit, 2-3 is 16-bit with offset, 4-7 is 8-bit with offset.
#  MPITCH: 0-15: The amount to increment the memory pointer between rows, calculated as 8*2^MPITCH bytes.
#  ROWLEN: 0-15: The number of elements in each row in main memory.
#  NROWS: 0-15: How many rows to read from memory.
#  VPITCH: 0-15: How much to increment the VPM address by after each row is loaded.
#  VERT: 0 or 1 - Whether the layout is vertical (1) or horizontal (0). Be careful, this is inverted compared to normal.
#  ADDRY: 0-64 - The Y coordinate of the address in the VPM space to start loading into.
#  ADDRX: 0-16 - The X coordinate of the address in the VPM space to start loading into.
# Every argument is parenthesised and masked to the width of its field, so a
# value of 16 for ROWLEN, NROWS or VPITCH becomes the 0 encoding instead of
# setting the lowest bit of the field above it (0 is understood to mean 16 in
# the hardware guide - confirm).
define(`VPM_DMA_LOAD_SETUP_ID_SHIFT', 31)
define(`VPM_DMA_LOAD_SETUP_MODEW_SHIFT', 28)
define(`VPM_DMA_LOAD_SETUP_MPITCH_SHIFT', 24)
define(`VPM_DMA_LOAD_SETUP_ROWLEN_SHIFT', 20)
define(`VPM_DMA_LOAD_SETUP_NROWS_SHIFT', 16)
define(`VPM_DMA_LOAD_SETUP_VPITCH_SHIFT', 12)
define(`VPM_DMA_LOAD_SETUP_VERT_SHIFT', 11)
define(`VPM_DMA_LOAD_SETUP_ADDRY_SHIFT', 4)
define(`VPM_DMA_LOAD_SETUP_ADDRX_SHIFT', 0)
define(`VPM_DMA_LOAD_SETUP_VALUE', `eval(
(1<<VPM_DMA_LOAD_SETUP_ID_SHIFT)|
((($1)&7)<<VPM_DMA_LOAD_SETUP_MODEW_SHIFT)|
((($2)&15)<<VPM_DMA_LOAD_SETUP_MPITCH_SHIFT)|
((($3)&15)<<VPM_DMA_LOAD_SETUP_ROWLEN_SHIFT)|
((($4)&15)<<VPM_DMA_LOAD_SETUP_NROWS_SHIFT)|
((($5)&15)<<VPM_DMA_LOAD_SETUP_VPITCH_SHIFT)|
((($6)&1)<<VPM_DMA_LOAD_SETUP_VERT_SHIFT)|
((($7)&127)<<VPM_DMA_LOAD_SETUP_ADDRY_SHIFT)|
((($8)&15)<<VPM_DMA_LOAD_SETUP_ADDRX_SHIFT))')
define(`VPM_DMA_LOAD_SETUP', `ldi ra49, VPM_DMA_LOAD_SETUP_VALUE($1, $2, $3, $4, $5, $6, $7, $8)')
161 | 
# VPM_DMA_LOAD_START
# ~~~~~~~~~~~~~~~~~~
# Kicks off the transfer of data from main memory to the local VPM data cache.
# It will use the settings from VPM_DMA_LOAD_SETUP to control the copy process.
# The address is OR-ed with 0 and written to ra50, which qpu-dis.cpp names
# vr_addr in its write table; the second pipe issues a nop.
# Arguments:
#  address: A register name that holds the address in main memory to read from.
define(`VPM_DMA_LOAD_START', `or ra50, $1, 0;          nop')
169 | 
# VPM_DMA_LOAD_WAIT_FOR_COMPLETION
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Pause until the previous DMA load operation has finished.
# Implemented as a read of ra50 (named vr_wait in the qpu-dis.cpp read table)
# whose result is discarded into rb39.
define(`VPM_DMA_LOAD_WAIT_FOR_COMPLETION', `or rb39, ra50, ra50;       nop')
174 | 
# END_PROGRAM
# ~~~~~~~~~~~
# Triggers a host interrupt to transfer control back to the main CPU.
# END_PROGRAM_HARD first writes (r0 OR 1) to rb38, which qpu-dis.cpp names irq
# in its write table, then issues an instruction carrying the .tend suffix
# followed by two more do-nothing instructions.
define(`END_PROGRAM_HARD', `
or rb38, r0, 1;       nop
nop.tend ra39, ra39, ra39;       nop rb39, rb39, rb39
nop ra39, ra39, ra39;       nop rb39, rb39, rb39
nop ra39, ra39, ra39;       nop rb39, rb39, rb39')

# END_PROGRAM_SOFT emits the same .tend sequence without the rb38 write.
# It expands the NOP macro, which is defined further down in this file; that
# works because the body is only expanded when END_PROGRAM_SOFT is used.
define(`END_PROGRAM_SOFT', `
nop.tend ra39, ra39, ra39;      nop rb39, rb39, rb39
NOP
NOP
')
189 | 
# NOP
# ~~~
# Do nothing on both pipes for a cycle.
# Both halves read and write address 39, which qpu-dis.cpp prints as "nop"
# when read and "-" when written.
define(`NOP', `nop ra39, ra39, ra39;       nop rb39, rb39, rb39')
194 | 


--------------------------------------------------------------------------------
/qpu-dis.cpp:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <errno.h>
  4 | #include <string.h>
  5 | #include <stdint.h>
  6 | 
  7 | void show_qpu_inst(uint32_t *inst);
  8 | void show_qpu_fragment(uint32_t *inst, int length);
  9 | 
 10 | 
 11 | int base;
 12 | int showfields = 1;
 13 | 
 14 | const char *acc_names[] = {
 15 | 	"r0", "r1", "r2", "r3", "r4", "r5"
 16 | };
 17 | 
 18 | const char *banka_r[64] = {
 19 | 	"ra0", "ra1", "ra2", "ra3", "ra4", "ra5", "ra6", "ra7",
 20 | 	"ra8", "ra9", "ra10", "ra11", "ra12", "ra13", "ra14", "ra15", //ra15 is w in shaders
 21 | 	"ra16", "ra17", "ra18", "ra19", "ra20", "ra21", "ra22", "ra23",
 22 | 	"ra24", "ra25", "ra26", "ra27", "ra28", "ra29", "ra30", "ra31",
 23 | 	"unif", "ra33?", "ra34?", "vary", "ra36?", "ra37?", "elem_num", "nop",
 24 | 	"ra40", "x_coord", "ms_mask", "ra43?", "ra44?", "ra45?", "ra46?", "ra47?",
 25 | 	"vpm", "vr_busy", "vr_wait", "mutex", "ra52?", "ra53?", "ra54?", "ra55?",
 26 | 	"ra56?", "ra57?", "ra58?", "ra59?", "ra60?", "ra61?", "ra62?", "ra63?",
 27 | };
 28 | 
 29 | const char *bankb_r[64] = {
 30 | 	"rb0", "rb1", "rb2", "rb3", "rb4", "rb5", "rb6", "rb7",
 31 | 	"rb8", "rb9", "rb10", "rb11", "rb12", "rb13", "rb14", "rb15", //rb15 is z in shaders
 32 | 	"rb16", "rb17", "rb18", "rb19", "rb20", "rb21", "rb22", "rb23",
 33 | 	"rb24", "rb25", "rb26", "rb27", "rb28", "rb29", "rb30", "rb31",
 34 | 	"unif", "rb33?", "rb34?", "vary", "rb36?", "rb37?", "qpu_num", "nop",
 35 | 	"rb40?", "y_coord", "rev_flag", "rb43?", "rb44?", "rb45?", "rb46?", "rb47?",
 36 | 	"vpm", "vw_busy", "vw_wait", "mutex", "rb52?", "rb53?", "rb54?", "rb55?",
 37 | 	"rb56?", "rb57?", "rb58?", "rb59?", "rb60?", "rb61?", "rb62?", "rb63?",
 38 | };
 39 | 
 40 | const char *banka_w[64] = {
 41 | 	"ra0", "ra1", "ra2", "ra3", "ra4", "ra5", "ra6", "ra7",
 42 | 	"ra8", "ra9", "ra10", "ra11", "ra12", "ra13", "ra14", "ra15", //ra15 is w in shaders
 43 | 	"ra16", "ra17", "ra18", "ra19", "ra20", "ra21", "ra22", "ra23",
 44 | 	"ra24", "ra25", "ra26", "ra27", "ra28", "ra29", "ra30", "ra31",
 45 | 	"r0", "r1", "r2", "r3", "tmurs", "r5quad", "irq", "-",
 46 | 	"unif_addr", "x_coord", "ms_mask", "stencil", "tlbz", "tlbm", "tlbc", "tlbam",
 47 | 	"vpm", "vr_setup", "vr_addr", "mutex", "recip", "recipsqrt", "exp", "log",
 48 | 	"t0s", "t0t", "t0r", "t0b", "t1s", "t1t", "t1r", "t1b",
 49 | };
 50 | 
 51 | const char *bankb_w[64] = {
 52 | 	"rb0", "rb1", "rb2", "rb3", "rb4", "rb5", "rb6", "rb7",
 53 | 	"rb8", "rb9", "rb10", "rb11", "rb12", "rb13", "rb14", "rb15", //rb15 is z in shaders
 54 | 	"rb16", "rb17", "rb18", "rb19", "rb20", "rb21", "rb22", "rb23",
 55 | 	"rb24", "rb25", "rb26", "rb27", "rb28", "rb29", "rb30", "rb31",
 56 | 	"r0", "r1", "r2", "r3", "tmurs", "r5rep", "irq", "-",
 57 | 	"unif_addr_rel", "y_coord", "rev_flag", "stencil", "tlbz", "tlbm", "tlbc", "tlbam",
 58 | 	"vpm", "vw_setup", "vw_addr", "mutex", "recip", "recipsqrt", "exp", "log",
 59 | 	"t0s", "t0t", "t0r", "t0b", "t1s", "t1t", "t1r", "t1b",
 60 | };
 61 | 
 62 | const char *ops[] = {
 63 | 	"bkpt", "nop", "thrsw", "thrend", "sbwait", "sbdone", "lthrsw", "loadcv",
 64 | 	"loadc", "ldcend", "ldtmu0", "ldtmu1", "loadam", "nop", "ldi", "bra",
 65 | };
 66 | 
 67 | const char *addops[] = {
 68 | 	"nop", "fadd", "fsub", "fmin", "fmax", "fminabs", "fmaxabs", "ftoi",
 69 | 	"itof", "addop9", "addop10", "addop11", "add", "sub", "shr", "asr",
 70 | 	"ror", "shl", "min", "max", "and", "or", "xor", "not",
 71 | 	"clz", "addop25", "addop26", "addop27", "addop28", "addop29", "v8adds", "v8subs",
 72 | 
 73 | 	"mov"
 74 | };
 75 | 
 76 | const char *mulops[] = {
 77 | 	"nop", "fmul", "mul24", "v8muld", "v8min", "v8max", "v8adds", "v8subs",
 78 | 
 79 | 	"mov"
 80 | };
 81 | 
 82 | const char *cc[] = {
 83 | 	".never", "", ".zs", ".zc", ".ns", ".nc", ".cs", ".cc"
 84 | };
 85 | 
 86 | const char *dstpackadd[] = {
 87 | 	"", ".16a", ".16b", ".8abcd", ".8a", ".8b", ".8c", ".8d", ".s", ".16as", ".16bs", ".8abcds", ".8as", ".8bs", ".8cs", ".8ds"
 88 | };
 89 | 
 90 | const char *dstpackmul[] = {
 91 | 	"", ".packm01", ".packm02", ".8abcd", ".8a", ".8b", ".8c", ".8d", ".packm08", ".packm09", ".packm10", ".packm11", ".packm12", ".packm13", ".packm14", ".packm15"
 92 | };
 93 | 
 94 | const char *srcunpackadd[] = {
 95 | 	"", ".16a", ".16b", ".8dr", ".8a", ".8b", ".8c", ".8d"
 96 | };
 97 | 
 98 | const char *srcunpackmul[] = {
 99 | 	"", ".16a", ".16b", ".8dr", ".8a", ".8b", ".8c", ".8d"
100 | };
101 | 
102 | const char *bcc[] = {
103 | 	".allz", ".allnz", ".anyz", ".anynz", ".alln", ".allnn", ".anyn", ".anynn",
104 | 	".allc", ".allnc", ".anyc", ".anync", ".cc12", ".cc13", ".cc14", ""
105 | };
106 | 
107 | const char *imm[] = {
108 | 	"0", "1", "2", "3", "4", "5", "6", "7",
109 | 	"8", "9", "10", "11", "12", "13", "14", "15",
110 | 	"-16", "-15", "-14", "-13", "-12", "-11", "-10", "-9",
111 | 	"-8", "-7", "-6", "-5", "-4", "-3", "-2", "-1",
112 | 	"1.0", "2.0", "4.0", "8.0", "16.0", "32.0", "64.0", "128.0",
113 | 	"1/256", "1/128", "1/64", "1/32", "1/16", "1/8", "1/4", "1/2", 
114 | 	" >> r5", " >> 1", " >> 2", " >> 3", " >> 4", " >> 5", " >> 6", " >> 7",
115 | 	" >> 8", " >> 9", " >> 10", " >> 11", " >> 12", " >> 13", " >> 14", " >> 15"
116 | };
117 | 
118 | const char *setf[] = {
119 | 	"", ".setf"
120 | };
121 | 
122 | // QPU Instruction unpacking
123 | //
124 | // Add/Mul Operations:
125 | //   mulop:3 addop:5 ra:6 rb:6 adda:3 addb:3 mula:3 mulb:3, op:4 packbits:8 addcc:3 mulcc:3 F:1 X:1 wa:6 wb:6
126 | //
127 | // Branches:
128 | //   addr:32, 1111 0000 cond:4 relative:1 register:1 ra:5 X:1 wa:6 wb:6
129 | //
130 | // 32 Bit Immediates:
131 | //   data:32, 1110 unknown:8 addcc:3 mulcc:3 F:1 X:1 wa:6 wb:6
132 | 
/* Tiny ring allocator for short-lived strings that are handed to printf.
 *
 * tmpalloc(n) returns a pointer to n bytes inside tmpbuff. Allocations are
 * handed out back to back; when the next one would run past the end of the
 * buffer the allocator restarts at offset 0, overwriting the oldest strings.
 * Nothing is ever freed, so a returned pointer is only valid until enough
 * later allocations have wrapped around onto it.
 *
 * The size argument is parenthesised so that an expression such as
 * tmpalloc(wide ? 128 : 32) is sized correctly. It is still evaluated twice,
 * so it must not have side effects. */
unsigned tmpthis=0;
unsigned tmpnext=0;
char tmpbuff[256];
#define tmpalloc(sizebytes) ( \
	tmpthis = (tmpnext + (sizebytes) > sizeof(tmpbuff)) ? 0 : tmpnext, \
	tmpnext = tmpthis + (sizebytes), \
	&tmpbuff[tmpthis])
137 | 
138 | const char *qpu_r(uint32_t ra, uint32_t rb, uint32_t adda, uint32_t op, int rotator) {
139 | 
140 | 	if (op == 13) {
141 | 		if (rb<48) {
142 | 			if (adda==6) return banka_r[ra];
143 | 			if (adda==7) return imm[rb];
144 | 		}
145 | 		else {
146 | 			if ((adda<6) && rotator) {
147 | 				char *tmp = tmpalloc(32);
148 | 				sprintf(tmp, "%s%s", acc_names[adda], imm[rb]);
149 | 				return tmp;
150 | 			}
151 | 			if ((adda==6) && rotator) {
152 | 				char *tmp = tmpalloc(32);
153 | 				sprintf(tmp, "%s%s", banka_r[ra], imm[rb]);
154 | 				return tmp;
155 | 			}
156 | 			if ((adda==7) && rotator) {
157 | 				return "err?";
158 | 			}
159 | 		}
160 | 	}
161 | 
162 | 	if (adda==6) return banka_r[ra];
163 | 	if (adda==7) return bankb_r[rb];
164 | 	return acc_names[adda];
165 | }
166 | 
167 | const char *qpu_w_add(uint32_t wa, uint32_t X) {
168 | 	return X ? bankb_w[wa] : banka_w[wa];
169 | }
170 | 
171 | const char *qpu_w_mul(uint32_t wb, uint32_t X) {
172 | 	return X ? banka_w[wb] : bankb_w[wb];
173 | }
174 | 
175 | const char *qpu_unpack_add(uint32_t packmul, uint32_t unpack, uint32_t adda) {
176 | 	if ((packmul == 0) && (adda == 6))
177 | 		return srcunpackadd[unpack];
178 | 	if ((packmul == 1) && (adda == 4))
179 | 		return srcunpackmul[unpack];
180 | 	return "";
181 | }
182 | 
183 | const char *qpu_unpack_mul(uint32_t packmul, uint32_t unpack, uint32_t adda) {
184 | 	if ((packmul == 0) && (adda == 6))
185 | 		return srcunpackmul[unpack];
186 | 	if ((packmul == 1) && (adda == 4))
187 | 		return srcunpackmul[unpack];
188 | 	return "";
189 | }
190 | 
191 | const char *qpu_pack_add(uint32_t packmul, uint32_t pack, uint32_t wa, uint32_t X) {
192 | 	if ((packmul == 0) && (X==0) && (wa<=32)) //todo: what is the real limit on ra range?
193 | 		return dstpackadd[pack];
194 | 	return "";
195 | }
196 | 
197 | const char *qpu_pack_mul(uint32_t packmul, uint32_t pack, uint32_t wa, uint32_t X) {
198 | 	if ((packmul == 0) && (X==1) && (wa<=32)) //todo: what is the real limit on ra range?
199 | 		return dstpackmul[pack];
200 | 	if (packmul == 1)
201 | 		return dstpackmul[pack];
202 | 	return "";
203 | }
204 | 
205 | void show_qpu_add_mul(uint32_t i0, uint32_t i1)
206 | {
207 | 	uint32_t mulop = (i0 >> 29) & 0x7;
208 | 	uint32_t addop = (i0 >> 24) & 0x1f;
209 | 	uint32_t ra    = (i0 >> 18) & 0x3f;
210 | 	uint32_t rb    = (i0 >> 12) & 0x3f;
211 | 	uint32_t adda  = (i0 >>  9) & 0x07;
212 | 	uint32_t addb  = (i0 >>  6) & 0x07;
213 | 	uint32_t mula  = (i0 >>  3) & 0x07;
214 | 	uint32_t mulb  = (i0 >>  0) & 0x07;
215 | 	uint32_t op    = (i1 >> 28) & 0x0f;
216 | 	uint32_t packbits  = (i1 >> 20) & 0xff;
217 | 	uint32_t unpacking = (packbits >> 5) & 0x7;
218 | 	uint32_t packmul   = (packbits >> 4) & 0x1;
219 | 	uint32_t packing   = (packbits >> 0) & 0xf;
220 | 	uint32_t addcc = (i1 >> 17) & 0x07;
221 | 	uint32_t mulcc = (i1 >> 14) & 0x07;
222 | 	uint32_t F     = (i1 >> 13) & 0x01;
223 | 	uint32_t X     = (i1 >> 12) & 0x01;
224 | 	uint32_t wa    = (i1 >> 6) & 0x3f;
225 | 	uint32_t wb    = (i1 >> 0) & 0x3f;
226 | 
227 | 	if (showfields) {
228 | 		printf("mulop=%d, addop=%d, ra=%d, rb=%d, adda=%d, addb=%d, mula=%d, mulb=%d, op=%d, unpacking=%d, packmul=%d, packing=%d, addcc=%d, mulcc=%d, F=%d, X=%d, wa=%d, wb=%d  \n",
229 | 			mulop, addop, ra, rb, adda, addb, mula, mulb, op, unpacking, packmul, packing, addcc, mulcc, F, X, wa, wb);
230 | 	}
231 | 
232 | 	uint32_t addF  = (F==1) && (addop != 0) && (addcc != 0);
233 | 	uint32_t mulF  = (F==1) && !addF;
234 | 
235 | 	// Instruction formats:
236 | 	// op[cc][setf]
237 | 	// op[cc][setf] rd[.pack]
238 | 	// op[cc][setf] rd[.pack], ra[.unpack]
239 | 	// op[cc][setf] rd[.pack], ra[.unpack], rb[.unpack]
240 | 	const char *args[] = {
241 | 		"", " %s%s", " %s%s, %s%s", " %s%s, %s%s, %s%s"
242 | 	};
243 | 
244 | 	uint32_t arity = 3;
245 | 	if (addop == 0) {
246 | 		arity = 0;
247 | 		addcc = 1;
248 | 	}
249 | 	else if ((adda == addb) && ((addop == 7) || (addop == 8) || (addop == 21) || (addop == 23) || (addop == 24))) {
250 | 		arity = 2;
251 | 		if (addop == 21) addop = 32;
252 | 	}
253 | 
254 | 	// add op always
255 | 	printf("%s%s%s", addops[addop], cc[addcc], setf[addF]);
256 | 	printf(args[arity], qpu_w_add(wa, X), qpu_pack_add(packmul, packing, wa, X), qpu_r(ra, rb, adda, op, 0), qpu_unpack_add(packmul, unpacking, adda), qpu_r(ra, rb, addb, op, 0), qpu_unpack_add(packmul, unpacking, addb));
257 | 
258 | 	// show mul op if non nop or control op is non nop
259 |         if (mulop || (op != 1)) {
260 | 
261 | 		uint32_t arity = 3;
262 | 		if (mulop == 0) {
263 | 			arity = 0;
264 | 			mulcc = 1;
265 | 		}
266 | 		else if ((mula == mulb) && (mulop == 4)) {
267 | 			arity = 2;
268 | 			if (mulop == 4) mulop = 8;
269 | 		}
270 | 
271 | 		printf("; %s%s%s", mulops[mulop], cc[mulcc], setf[mulF]);
272 | 		///* 000003a0: 36020037 18025841 */  xor r1, r0, r0; fmul ra1, ra0, unif
273 | 		printf(args[arity], qpu_w_mul(wb, X), qpu_pack_mul(packmul, packing, wb, X), qpu_r(ra, rb, mula, op, 1), qpu_unpack_mul(packmul, unpacking, mula), qpu_r(ra, rb, mulb, op, 1), qpu_unpack_mul(packmul, unpacking, mulb));
274 | 	}
275 | 
276 | 	// show control op if non nop
277 | 	if ((op != 1) && (op != 13)) {
278 | 		printf("; %s", ops[op]);
279 | 	}
280 | 	printf("\n");
281 | 
282 | }
283 | 
284 | void show_qpu_branch(uint32_t i0, uint32_t i1)
285 | {
286 | 	uint32_t addr     = i0;
287 | 	uint32_t unknown  = (i1 >> 24) & 0x0f;
288 | 	uint32_t cond     = (i1 >> 20) & 0x0f;
289 | 	uint32_t pcrel    = (i1 >> 19) & 0x01;
290 | 	uint32_t addreg   = (i1 >> 18) & 0x01;
291 | 	uint32_t ra       = (i1 >> 13) & 0x1f;
292 | 	uint32_t X        = (i1 >> 12) & 0x01;
293 | 	uint32_t wa       = (i1 >>  6) & 0x3f;
294 | 	uint32_t wb       = (i1 >>  0) & 0x3f;
295 | 
296 | 	if (showfields) {
297 | 		printf("branch addr=0x%08x, unknown=%x, cond=%02d, pcrel=%x, addreg=%x, ra=%02d, X=%x, wa=%02d, wb=%02x\n",
298 | 			addr, unknown, cond, pcrel, addreg, ra, X, wa, wb);
299 | 	}
300 | 	// branch: b[link][cc] [linkreg,] [basedreg,]
301 | 	if (wa==39) 
302 | 		printf("%s%s %s, %s%+d", pcrel ? "brr" : "bra", bcc[cond], qpu_w_mul(wb, X), addreg ? qpu_r(ra, ra, 6, (i1 >> 28)&0xf, 0) : "", addr);
303 | 	else if (wb==39)
304 | 		printf("%s%s %s, %s%+d", pcrel ? "brr" : "bra", bcc[cond], qpu_w_add(wa, X), addreg ? qpu_r(ra, ra, 6, (i1 >> 28)&0xf, 0) : "", addr);
305 | 	else 
306 | 		printf("%s%s %s, %s, %s%+d", pcrel ? "brr" : "bra", bcc[cond], qpu_w_add(wa, X), qpu_w_mul(wb, X), addreg ? qpu_r(ra, ra, 6, (i1 >> 28)&0xf, 0) : "", addr);
307 | 	
308 | 	if (!addreg) printf(" // 0x%08x", base+addr+8*4);
309 | 	printf("\n");
310 | 
311 | }
312 | 
313 | const char *qpu_ldi_unpack(uint32_t unpack, uint32_t data)
314 | {
315 | 	char *tmp = tmpalloc(128);
316 | 	// unpack = 1 (2 bit signed vectors), 3 = (2 bit unsigned vectors);
317 | 	if ((unpack==1) || (unpack==3)) {
318 | 		int d[16];
319 | 		for (int i=0; i<16; i++) {
320 | 			d[i] = ((data >> (16+i-1))&0x2) | ((data >> i) & 0x1);
321 | 			if ((unpack == 1) && d[i] &0x2)
322 | 				d[i] |= 0xfffffffc;
323 | 		}
324 | 		sprintf(tmp, "[%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d]",
325 | 			d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7],
326 | 			d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15]);
327 | 	}
328 | 	else {
329 | 		sprintf(tmp, "0x%08x", data);
330 | 	}
331 | 	return tmp;
332 | }
333 | 
334 | void show_qpu_imm32(uint32_t i0, uint32_t i1)
335 | {
336 | 	uint32_t data = i0;
337 | 	uint32_t packbits  = (i1 >> 20) & 0xff;
338 | 	uint32_t unpacking = (packbits >> 5) & 0x7;
339 | 	uint32_t packmul   = (packbits >> 4) & 0x1;
340 | 	uint32_t packing   = (packbits >> 0) & 0xf;
341 | 	uint32_t addcc   = (i1 >> 17) & 0x07;
342 | 	uint32_t mulcc   = (i1 >> 14) & 0x07;
343 | 	uint32_t F       = (i1 >> 13) & 0x01;
344 | 	uint32_t X       = (i1 >> 12) & 0x01;
345 | 	uint32_t wa      = (i1 >>  6) & 0x3f;
346 | 	uint32_t wb      = (i1 >>  0) & 0x3f;
347 | 
348 | 	if (showfields) {
349 | 		printf("imm32 data=0x%08x, unpacking=0x%d, packmul=%d, packing=%d, addcc=%x, mulcc=%x, F=%x, X=%x, wa=%02d, wb=%02d\n",
350 | 			data, unpacking, packmul, packing, addcc, mulcc, F, X, wa, wb);
351 | 	}
352 | 
353 | 	const char *inst = ops[(i1 >> 28) & 0xf];
354 | 
355 | 	if (unpacking & 0x4) {
356 | 		inst = (data & 0x10) ? "sacq" : "srel";
357 | 		if (data <= 0x1f)
358 | 			data = data & 0xffffffef;
359 | 	}
360 | 
361 | 	// addop: op[cc][setf] rd[.pack?], immediate
362 | 	if (packbits==0 && addcc==0 && wa==39)
363 | 		printf("nop");
364 | 	else
365 | 		printf("%s%s%s %s%s, %s", inst, cc[addcc], setf[F], qpu_w_add(wa, X), qpu_pack_add(packmul, packing, wa, X), qpu_ldi_unpack(unpacking, data));
366 | 
367 | 	// mulop: [op[cc][setf] rd[.pack?], immediate
368 | 	if (mulcc) {
369 | 		printf("; %s%s%s %s%s, %s", inst, cc[mulcc], setf[F], qpu_w_mul(wb, X), qpu_pack_mul(packmul, packing, wa, X), qpu_ldi_unpack(unpacking, data));
370 | 	}
371 | 
372 | 	printf("\n");
373 | }
374 | 
375 | void show_qpu_inst(uint32_t *inst) {
376 | 	uint32_t i0 = inst[0];
377 | 	uint32_t i1 = inst[1];
378 | 
379 | 	int op = (i1 >> 28) & 0xf;
380 | 	if (op<14) show_qpu_add_mul(i0, i1);
381 | 	if (op==14) show_qpu_imm32(i0, i1);
382 | 	if (op==15) show_qpu_branch(i0, i1);
383 | }
384 | 
385 | void show_qpu_fragment(uint32_t *inst, int length) {
386 | 	uint32_t i = 0;
387 | 	for(;i<length; i+=2) {
388 | 		base = i*4;
389 | 		printf("/* %08x: %08x %08x */  ", i*4, inst[i], inst[i+1]); show_qpu_inst(&inst[i]);
390 | 	}
391 | 	printf("\n");
392 | }
393 | 
/* Reads a whole file into a freshly allocated buffer.
 *
 *   filename - path of the file to read
 *   filesize - optional; when the file could be opened it receives the file
 *              size in bytes (0 if the size could not be determined)
 *
 * Returns the buffer, which holds the file contents followed by one zero
 * byte, or 0 if the file cannot be opened, is empty, or cannot be read.
 * Release the buffer with file_unload. */
uint32_t *file_load(const char *filename, uint32_t *filesize) {
	FILE *f = fopen(filename, "rb");
	if (!f)
		return 0;

	uint32_t *memory = 0;
	long size = -1;

	if (fseek(f, 0, SEEK_END) == 0)
		size = ftell(f);

	if (size > 0 && fseek(f, 0, SEEK_SET) == 0) {
		memory = (uint32_t*)malloc((size_t)size + 1);
		if (memory) {
			if (fread(memory, (size_t)size, 1, f) == 1) {
				/* Terminate at BYTE offset size. Indexing the uint32_t
				 * pointer by size would write far past the allocation. */
				((char*)memory)[size] = 0;
			}
			else {
				free(memory);
				memory = 0;
			}
		}
	}

	fclose(f);
	if (filesize)
		*filesize = (size < 0) ? 0 : (uint32_t)size;
	return memory;
}

/* Releases a buffer returned by file_load; passing 0 is harmless. */
void file_unload(uint32_t *data) {
	free(data);
}
417 | 
418 | void qpu_dis_file(const char *filename) {
419 | 	printf("Disassembling %s\n", filename);
420 | 	uint32_t size;
421 | 	uint32_t *fragment = file_load(filename, &size);
422 | 	if (fragment==0) {
423 | 		printf("Couldn't read fragment %s\n", filename);
424 | 		return;
425 | 	}
426 | 	printf("Fragment %s, size %d\n", filename, size/4);
427 | 	show_qpu_fragment(fragment, (size/4));
428 | 	file_unload(fragment);
429 | }
430 | 
431 | int main(int argc, char * argv[]) {
432 | 	if (argc < 2) {
433 |     fprintf(stderr, "qpu-disassemble: Pass in a file name to disassemble as the first argument\n");
434 |     exit(1);
435 |   }
436 |   qpu_dis_file(argv[1]);
437 | }
438 | 


--------------------------------------------------------------------------------
/qpu-asm.cpp:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <stdlib.h>
  3 | #include <stdio.h>
  4 | #include <inttypes.h>
  5 | #include <map>
  6 | #include <vector>
  7 | #include <assert.h>
  8 | #include <errno.h>
  9 | #include <sstream>
 10 | #include <algorithm>
 11 | #include <unistd.h> // for getopt()
 12 | 
 13 | using namespace std;
 14 | 
 15 | enum token_t {
 16 |     END=-1,
 17 |     WORD,
 18 |     DOT,
 19 |     COMMA,
 20 |     SEMI,
 21 |     COLON,
 22 | };
 23 | 
 24 | struct QPUreg {
 25 |     enum { A, B, ACCUM, SMALL } file;
 26 |     int num;
 27 | };
 28 | 
 29 | struct relocation {
 30 |     string label;
 31 |     int pc;
 32 | };
 33 | 
 34 | struct context {
 35 |     const char *stream;
 36 |     map<string, int> labels;
 37 |     int pc;
 38 |     vector<relocation> relocations;
 39 | };
 40 | 
 41 | 
 42 | static string addOps[] = {
 43 |     "nop", "fadd", "fsub", "fmin", "fmax", "fminabs", "fmaxabs",
 44 |     "ftoi", "itof", "XXX", "XXX", "XXX", "add", "sub", "shr",
 45 |     "asr", "ror", "shl", "min", "max", "and", "or", "xor", "not",
 46 |     "clz", "XXX", "XXX", "XXX", "XXX", "XXX", "v8adds", "v8subs" };
 47 | 
 48 | static string mulOps[] = {
 49 |     "nop", "fmul", "mul24", "v8muld", "v8min", "v8max", "v8adds",
 50 |     "v8subs" };
 51 | 
 52 | static uint8_t addOpCode(const string& word)
 53 | {
 54 |     for (int i=0; i < 32; i++) {
 55 |         if (word == addOps[i])
 56 |             return i;
 57 |     }
 58 | 
 59 |     return 0xFF;
 60 | }
 61 | 
 62 | string printAddOpCode(uint8_t opcode) {
 63 |   assert((opcode >= 0) && (opcode < 32));
 64 |   return addOps[opcode];
 65 | }
 66 | 
 67 | static uint8_t mulOpCode(const string& word)
 68 | {
 69 |     for (int i=0; i < 8; i++) {
 70 |         if (word == mulOps[i])
 71 |             return i;
 72 |     }
 73 | 
 74 |     return 0xFF;
 75 | }
 76 | 
 77 | string printMulOpCode(uint8_t opcode) {
 78 |   assert((opcode >= 0) && (opcode < 8));
 79 |   return mulOps[opcode];
 80 | }
 81 | 
 82 | bool isRegisterWord(const string& word) { return word[0] == 'r'; }
 83 | 
 84 | string printRegister(const QPUreg& reg)
 85 | {
 86 |     char buffer[32];
 87 |     if (reg.file == QPUreg::A || reg.file == QPUreg::B) {
 88 |         snprintf(buffer, 32, "r%c%d", (reg.file == QPUreg::A) ? 'a' : 'b',
 89 |                                       reg.num);
 90 |     }
 91 |     else if (reg.file == QPUreg::ACCUM) {
 92 |         snprintf(buffer, 32, "r%d", reg.num);
 93 |     }
 94 |     else {
 95 |         snprintf(buffer, 32, ".0x%x.", reg.num);
 96 |     }
 97 | 
 98 |     return buffer;
 99 | }
100 | 
// Tries to read the whole of possibleNumber as an integer in the given base.
//
//   possibleNumber - NUL-terminated text to convert
//   base           - numeric base handed to strtol
//   outNumber      - receives the converted value (meaningful only on success)
//   outIsNumber    - receives true when the text is non-empty, is consumed
//                    completely and does not overflow strtol
//
// errno is cleared before the conversion. strtol only ever sets it, so without
// the reset an ERANGE left behind by any earlier call would make every later
// number look invalid.
void parsePossibleNumber(const char* possibleNumber, int base, int* outNumber, bool* outIsNumber) {
    char *endOfNumber;
    errno = 0;
    *outNumber = strtol(possibleNumber, &endOfNumber, base);
    *outIsNumber = (!(endOfNumber == possibleNumber || *endOfNumber != '\0' || errno == ERANGE));
}
106 | 
107 | bool parseRegister(const string& word, QPUreg& reg)
108 | {
109 |     if (word[0] != 'r')
110 |         return false;
111 | 
112 |     int offset = 0;
113 |     switch (word[1]) {
114 |         case 'a': reg.file = QPUreg::A; offset = 2; break;
115 |         case 'b': reg.file = QPUreg::B; offset = 2; break;
116 |         default:
117 |             reg.file = QPUreg::ACCUM;
118 |             offset = 1;
119 |     }
120 | 
121 |     const char* possibleNumber = (word.c_str() + offset);
122 |     bool isNumber;
123 |     int number;
124 |     parsePossibleNumber(possibleNumber, 10, &number, &isNumber);
125 |     if (!isNumber) {
126 |       cerr << "Warning - couldn't interpret '" << word << "' as a register" << endl;
127 |       return false;
128 |     }
129 |     reg.num = number;
130 | 
131 |     if ((reg.file == QPUreg::ACCUM) && (reg.num >= 6)) {
132 |       fprintf(stderr, "Warning - accumulator out of range\n");
133 |       return false;
134 |     }
135 | 
136 |     return true;
137 | }
138 | 
139 | bool parseFullImmediate(const string& str, uint32_t* outResult, uint32_t* outType)
140 | {
141 |     bool isNumber;
142 |     if (str[0] == '[') {
143 |       bool areAnyNegative = false;
144 |       std:string cleanedString(str);
145 |       cleanedString.erase(std::remove(cleanedString.begin(), cleanedString.end(), '['), cleanedString.end());
146 |       cleanedString.erase(std::remove(cleanedString.begin(), cleanedString.end(), ']'), cleanedString.end());
147 |       std::stringstream ss(cleanedString);
148 |       std::string item;
149 |       int itemCount = 0;
150 |       int itemValues[16];
151 |       while (std::getline(ss, item, ',')) {
152 |         if (itemCount >= 16) {
153 |           break;
154 |         }
155 |         bool isItemNumber;
156 |         int itemValue;
157 |         parsePossibleNumber(item.c_str(), 10, &itemValues[itemCount], &isItemNumber);
158 |         if (!isItemNumber) {
159 |           cerr << "Couldn't understand '" << item << "' as an entry in an immediate list" << endl;
160 |           return false;
161 |         }
162 |         if (itemValues[itemCount] < 0) {
163 |           areAnyNegative = true;
164 |         }
165 |         itemCount += 1;
166 |       }
167 | 
168 |       if (itemCount < 16) {
169 |           cerr << "Found too few items in the immediate array - expected 16 but had " << itemCount << endl;
170 |           return false;
171 |       }
172 | 
173 |       if (areAnyNegative) {
174 |         *outType = 0x02;
175 |       } else {
176 |         *outType = 0x06;
177 |       }
178 | 
179 |       uint32_t result = 0;
180 |       for (int index = 0; index < 16; index += 1) {
181 |         int value = itemValues[index];
182 |         if (areAnyNegative) {
183 |           if ((value < -1) || (value > 1)) {
184 |             cerr << "Found an out-of-range signed value in the immediate array - expected -1, 0, or 1 but found " << value << endl;
185 |             return false;
186 |           }
187 |         } else {
188 |           if (value > 3) {
189 |             cerr << "Found an out-of-range unsigned value in the immediate array - expected 0, 1, 2, or 3 but found " << value << endl;
190 |             return false;
191 |           }
192 |         }
193 |         uint32_t msb;
194 |         uint32_t lsb;
195 |         if (areAnyNegative) {
196 |           msb = ((value & 0x80000000) >> 31);
197 |           lsb = (value & 0x1);
198 |         } else {
199 |           msb = ((value & 0x2) >> 1);
200 |           lsb = (value & 0x1);
201 |         }
202 |         result = (result | (lsb << (index + 0)));
203 |         result = (result | (msb << (index + 16)));
204 |       }
205 | 
206 |       *outResult = result;
207 |       isNumber = true;
208 |     } else {
209 |       *outType = 0x00; // A full 32-bit immediate
210 |       // if there is an 'x' we assume it's hex.
211 |       if (str.find_first_of("x") != string::npos) {
212 |           int signedResult;
213 |           parsePossibleNumber(str.c_str(), 16, &signedResult, &isNumber);
214 |           *outResult = signedResult;
215 |       } else if (str.find_first_of(".f") != string::npos) {
216 |           float f = strtof(str.c_str(), NULL);
217 |           *outResult = *(uint32_t*)&f;
218 |           isNumber = true;
219 |       } else {
220 |           int signedResult;
221 |           parsePossibleNumber(str.c_str(), 10, &signedResult, &isNumber);
222 |           *outResult = signedResult;
223 |       }
224 |     }
225 |     return isNumber;
226 | }
227 | 
// Converts the textual form of a small immediate into its 6-bit encoding.
//
// Supported spellings:
//   * "N" / "0xN"  - unsigned values 0..15, encoded as 0..15
//   * "-N"         - negative values -1..-16, encoded as 31..16 (the 5-bit
//                    two's-complement pattern of the value)
//   * "<<N"        - vector rotate by N (0..15), encoded as 48 + N; text that
//                    does not start with digits yields 48
//
// Returns the encoding, or -1 after printing a diagnostic when the text is
// not a number or lies outside the supported range.
int32_t parseSmallImmediate(const string& str)
{
    const char* text = str.c_str();
    char* end = NULL;

    if (str.find_first_of("x") != string::npos) {
        const unsigned long value = strtoul(text, &end, 16);
        if (end == text || value >= 16) {
            cerr << "Immediate out of range: " << str << endl;
            return -1;
        }
        return (int32_t)value;
    }

    if (str.find_first_of("<<") != string::npos) {
        // The amount follows the two-character "<<" prefix; insist on the
        // prefix so we never read past the end of a short string.
        if (str.compare(0, 2, "<<") != 0) {
            cerr << "Rotate immediate out of range: " << str << endl;
            return -1;
        }
        const unsigned long shift = strtoul(text + 2, NULL, 10);
        if (shift > 15) {
            cerr << "Rotate immediate out of range: " << str << endl;
            return -1;
        }
        return (int32_t)(48 + shift);
    }

    if (str.find_first_of("-") != string::npos) {
        const unsigned long magnitude = strtoul(text + 1, NULL, 10);
        if ((magnitude < 1) || (magnitude > 16)) {
            cerr << "Negative immediate out of range: " << str << endl;
            return -1;
        }
        // -16..-1 occupy encodings 16..31.
        return (int32_t)(32 - magnitude);
    }

    const unsigned long value = strtoul(text, &end, 10);
    if (end == text || value >= 16) {
        cerr << "Immediate out of range: " << str << endl;
        return -1;
    }
    return (int32_t)value;
}
257 | 
// Maps a branch-condition suffix onto its 4-bit condition code.
//
// The first letter selects the flag (z, n or c); the second letter selects
// the test: 'f' = all set ("full"), 'e' = all clear ("empty"), 's' = any set,
// 'c' = any clear.  "*" means "always".
//
// An unknown suffix is a fatal error: a diagnostic is printed and the process
// exits with a non-zero status.
uint8_t parseBranchCond(const string& str)
{
    static const struct {
        const char* name;
        uint8_t code;
    } kConditions[] = {
        { "zf", 0x0 }, { "ze", 0x1 }, { "zs", 0x2 }, { "zc", 0x3 },
        { "nf", 0x4 }, { "ne", 0x5 }, { "ns", 0x6 }, { "nc", 0x7 },
        { "cf", 0x8 }, { "ce", 0x9 }, { "cs", 0xa }, { "cc", 0xb },
        { "*",  0xf },
    };

    for (size_t i = 0; i < sizeof(kConditions) / sizeof(kConditions[0]); i++) {
        if (str == kConditions[i].name)
            return kConditions[i].code;
    }

    cerr << "Invalid branch condition: " << str << endl;
    // Exit with a failure status so build scripts notice the error.
    exit(1);
}
291 | 
// Translates a pack/unpack modifier name into its numeric mode.
//
// All three outputs are first reset to zero.  An "unpack*" name stores its
// mode (0-7) in *outUnpack; a "pack*" name stores its mode (0-15) in *outPack.
// *outPM is always left at zero.
//
// Returns true when the name is recognised; otherwise prints a diagnostic and
// returns false.
bool parsePacking(const string& str, uint32_t* outUnpack, uint32_t* outPM, uint32_t* outPack)
{
    // The position of a name within its table is the mode value.
    static const char* const kUnpackNames[] = {
        "unpack32", "unpack16a", "unpack16b", "unpack8ddupe",
        "unpack8a", "unpack8b", "unpack8c", "unpack8d",
    };
    static const char* const kPackNames[] = {
        "pack32", "pack16a", "pack16b", "pack8ddupe",
        "pack8a", "pack8b", "pack8c", "pack8d",
        "pack32clamp", "pack16aclamp", "pack16bclamp", "pack8ddupeclamp",
        "pack8aclamp", "pack8bclamp", "pack8cclamp", "pack8dclamp",
    };
    const uint32_t unpackCount = sizeof(kUnpackNames) / sizeof(kUnpackNames[0]);
    const uint32_t packCount = sizeof(kPackNames) / sizeof(kPackNames[0]);

    *outUnpack = 0;
    *outPM = 0;
    *outPack = 0;

    for (uint32_t mode = 0; mode < unpackCount; ++mode) {
        if (str == kUnpackNames[mode]) {
            *outUnpack = mode;
            return true;
        }
    }

    for (uint32_t mode = 0; mode < packCount; ++mode) {
        if (str == kPackNames[mode]) {
            *outPack = mode;
            return true;
        }
    }

    cerr << "Unknown pack condition: " << str << endl;
    return false;
}
352 | 
353 | uint8_t setALUMux(const QPUreg& reg)
354 | {
355 |     switch (reg.file) {
356 |         case QPUreg::A: return 0x6;
357 |         case QPUreg::B: return 0x7;
358 |         case QPUreg::ACCUM:
359 |             if (reg.num > 6 || reg.num < 0) {
360 |                 cerr << "Invalid accumulator register; out of range" << endl;
361 |                 exit(0);
362 |             }
363 |             return reg.num;
364 |         case QPUreg::SMALL: return 0x7;
365 |     }
366 | }
367 | 
368 | 
369 | token_t nextToken(const char *stream, string& out, const char **ptr)
370 | {
371 |     char buffer[128];
372 |     int i = 0;
373 | 
374 |     *ptr = stream;
375 |     if (!stream || !*stream)
376 |         return END;
377 | 
378 |     while (*stream == ' ' || *stream == '\t')
379 |         stream++;
380 | 
381 |     if (isdigit(*stream))
382 |     {
383 |         // read until we don't find a hex digit, x (for hex) or .
384 |         while (isxdigit(*stream) || isdigit(*stream) || *stream == '.' || *stream == 'x') {
385 |             buffer[i++] = *stream++;
386 |             if (*stream == 0 || i > sizeof(buffer) - 1)
387 |                 break;
388 |         }
389 |         buffer[i++] = '\0';
390 |         out = buffer;
391 |         *ptr = stream;
392 | 
393 |         return WORD;
394 |     }
395 | 
396 | 
397 |     if (*stream == '.') { *ptr = stream+1; return DOT; }
398 |     if (*stream == ',') { *ptr = stream+1; return COMMA; }
399 |     if (*stream == ';') { *ptr = stream+1; return SEMI; }
400 |     if (*stream == '#') { *ptr = stream+1; return END; }
401 |     if (*stream == ':') { *ptr = stream+1; return COLON; }
402 | 
403 |     while (*stream != '.' && *stream != ',' && *stream != ';'
404 |                           && *stream != ' ' && *stream != '\t'
405 |                           && *stream != ':')
406 |     {
407 |         buffer[i++] = *stream++;
408 |         if (*stream == 0 || i > sizeof(buffer)-1)
409 |             break;
410 |     }
411 | 
412 |     buffer[i++] = '\0';
413 |     out = buffer;
414 |     *ptr = stream;
415 |     return WORD;
416 | }
417 | 
418 | 
419 | bool aluHelper(const char *stream, QPUreg& dest, QPUreg& r1, QPUreg& r2, uint8_t& sig, uint32_t& unpack, uint32_t& pm, uint32_t& pack, const char **ptr)
420 | {
421 |     string token_str;
422 |     token_t tok = nextToken(stream, token_str, &stream);
423 | 
424 |     if (tok == DOT) {
425 |         // conditional
426 |         nextToken(stream, token_str, &stream);
427 |         cout << "flag/conditional = " << token_str << endl;
428 |         if (token_str == "ldtmu0") {
429 |             sig = 10;
430 |         } else if (token_str == "ldtmu1") {
431 |             sig = 11;
432 |         } else if (token_str == "tend") {
433 |             sig = 3;
434 |         } else if (parsePacking(token_str, &unpack, &pm, &pack)) {
435 |           // Do nothing, the parse function has filled in the values
436 |         } else {
437 |           cout << "Conditional couldn't be understood: " << token_str << endl;
438 |           return false;
439 |         }
440 |         tok = nextToken(stream, token_str, &stream);
441 |     }
442 | 
443 |     // this is supposed to be the destination register
444 |     if (tok != WORD) {
445 |         cout << "Expecting word.  Got: " << token_str << endl;
446 |         return false;
447 |     }
448 | 
449 |     if (!parseRegister(token_str, dest)) {
450 |       return false;
451 |     }
452 |     tok = nextToken(stream, token_str, &stream);
453 |     if (tok != COMMA) return false;
454 |     tok = nextToken(stream, token_str, &stream);
455 |     if (!parseRegister(token_str, r1)) {
456 |       return false;
457 |     }
458 | 
459 |     tok = nextToken(stream, token_str, &stream);
460 |     if (tok != COMMA) return false;
461 |     tok = nextToken(stream, token_str, &stream);
462 |     if (!parseRegister(token_str, r2)) {
463 |         r2.file = QPUreg::SMALL;
464 |         int32_t imm = parseSmallImmediate(token_str);
465 |         if (imm < 0) {
466 |           return false;
467 |         }
468 |         r2.num = imm;
469 |     }
470 | 
471 |     /*
472 |     cout << "dest: " << printRegister(dest) << ", r1: "
473 |                      << printRegister(r1) << ", r2: "
474 |                      << printRegister(r2) << endl;
475 |                      */
476 | 
477 |     *ptr = stream;
478 |     return true;
479 | }
480 | 
481 | 
482 | uint64_t assembleALU(context& ctx, string word)
483 | {
484 |     string token_str;
485 |     uint8_t add_op = addOpCode(word);
486 |     if (add_op == 0xFF) {
487 |         cout << "FATAL (assert).  Bad ADD opcode: " << word << endl;
488 |         return -1;
489 |     }
490 | 
491 |     uint32_t unpack = 0;
492 |     uint32_t pm = 0;
493 |     uint32_t pack = 0;
494 | 
495 |     QPUreg addDest, addR1, addR2;
496 |     QPUreg mulDest, mulR1, mulR2;
497 | 
498 |     uint8_t sig = 0x1;          // no-signal (TODO: plumb signals through)
499 |     if (!aluHelper(ctx.stream, addDest, addR1, addR2, sig, unpack, pm, pack, &ctx.stream))
500 |         return -1;
501 | 
502 |     token_t tok = nextToken(ctx.stream, token_str, &ctx.stream);
503 |     // this should be a semi-colon
504 |     tok = nextToken(ctx.stream, token_str, &ctx.stream);
505 |     uint8_t mul_op = mulOpCode(token_str);
506 |     if (mul_op == 0xFF) {
507 |         cout << "FATAL (assert).  Bad MUL opcode: " << token_str << endl;
508 |         return -1;
509 |     }
510 | 
511 |     bool skipParseMul(false);
512 |     if (mul_op == 0) {
513 |         // nop.  If the next token is a semi or END, we'll generate
514 |         // the registers for them
515 |         const char *discard;
516 |         tok = nextToken(ctx.stream, token_str, &discard);
517 |         if (tok == END || tok == SEMI) {
518 |             mulDest.num = 39;
519 |             mulDest.file = (addDest.file == QPUreg::A) ? QPUreg::B : QPUreg::A;
520 |             mulR1 = addR1;
521 |             mulR2 = addR2;
522 |             skipParseMul = true;
523 |         }
524 |     }
525 | 
526 |     if (!skipParseMul) {
527 |         uint8_t junk;
528 |         uint32_t junk32;
529 |         if (!aluHelper(ctx.stream, mulDest, mulR1, mulR2, junk, junk32, junk32, junk32, &ctx.stream))
530 |             return -1;
531 |     }
532 | 
533 |     uint64_t ins = 0x0;
534 |     uint8_t cond_add = 0x1;
535 |     uint8_t cond_mul = 0x1;
536 |     uint8_t sf = 0x1;
537 |     if (add_op == 0)
538 |         sf = 0x0;           // no set flags on nop
539 | 
540 |     // TODO: constraints.  We can only read from file A and file B once (dual-port)
541 | 
542 |     uint8_t ws = 0x0;
543 |     // If the add pipe specifies file b for output, ws = 1
544 |     if ((addDest.file == QPUreg::B) ||
545 |         ((addDest.file == QPUreg::ACCUM) && (mulDest.file == QPUreg::A))) {
546 |         ws = 0x1;
547 |     }
548 |     // if ws == 1, mul pipe must specify file a or accumulator for output
549 |     if (ws == 0x1 && (mulDest.file != QPUreg::A) && (mulDest.file != QPUreg::ACCUM)) {
550 |         cout << "constraint check failed.  mul pipe must specify register file A when write-swap set, but found " << printRegister(mulDest) << endl;
551 |         return -1;
552 |     }
553 |     // if ws == 0, mul pipe must specify file b or accumulator for output
554 |     if (ws == 0x0 && (mulDest.file != QPUreg::B) && (mulDest.file != QPUreg::ACCUM)) {
555 |         cout << "constraint check failed.  mul pipe must specify register file B when write-swap clear, but found " << printRegister(mulDest) << endl;
556 |         return -1;
557 |     }
558 | 
559 |     // TODO: handle the accumulators and the small immediate
560 |     uint8_t read_a = 0x0;
561 |     uint8_t read_b = 0x0;
562 |     bool isReadASet = false;
563 |     bool isReadBSet = false;
564 |     QPUreg candidates[] = {addR1, addR2, mulR1, mulR2};
565 |     for (int index = 0; index < (sizeof(candidates)/sizeof(candidates[0])); index += 1) {
566 |       QPUreg reg = candidates[index];
567 |       if (reg.file == QPUreg::A) {
568 |         if (isReadASet && (read_a != reg.num)) {
569 |           fprintf(stderr, "Error: Can't set multiple different general registers as sources in a single ALU instruction\n");
570 |           return -1;
571 |         }
572 |         isReadASet = true;
573 |         read_a = reg.num;
574 |       }
575 |       if (reg.file == QPUreg::B) {
576 |         if (isReadBSet && (read_b != reg.num)) {
577 |           fprintf(stderr, "Error: Can't set multiple different general registers as sources in a single ALU instruction\n");
578 |           return -1;
579 |         }
580 |         isReadBSet = true;
581 |         read_b = reg.num;
582 |       }
583 |     }
584 | 
585 |     // checks:
586 |     //   read_a not set and one of the muxes specifies file A ...
587 |     //   same for read_b
588 |     //   read_b set and there is a small immediate value
589 | 
590 |     // we could have immediates in the first register slot but not sure it makes sense
591 |     // As above, we should check that read_b is not already set
592 |     if (addR2.file == QPUreg::SMALL)    {
593 |       if (isReadBSet && (read_b != addR2.num)) {
594 |         fprintf(stderr, "Error: Can't set an immediate and general registers as sources in a single ALU instruction\n");
595 |         return -1;
596 |       }
597 |       isReadBSet = true;
598 |       read_b = addR2.num;
599 |       sig = 13;
600 |     }
601 |     if (mulR2.file == QPUreg::SMALL)    {
602 |       if (isReadBSet && (read_b != mulR2.num)) {
603 |         fprintf(stderr, "Error: Can't set an immediate and general registers as sources in a single ALU instruction\n");
604 |         return -1;
605 |       }
606 |       isReadBSet = true;
607 |       read_b = mulR2.num;
608 |       sig = 13;
609 |     }
610 | 
611 |     // The accumulators are mapped to r32-35 when writing to them as destinations
612 |     if (addDest.file == QPUreg::ACCUM) {
613 |       addDest.num += 32;
614 |     }
615 |     if (mulDest.file == QPUreg::ACCUM) {
616 |       mulDest.num += 32;
617 |     }
618 | 
619 |     uint8_t add_a = setALUMux(addR1) & 0x7;
620 |     uint8_t add_b = setALUMux(addR2) & 0x7;
621 |     uint8_t mul_a = setALUMux(mulR1) & 0x7;
622 |     uint8_t mul_b = setALUMux(mulR2) & 0x7;
623 |     read_a &= 0x3f;
624 |     read_b &= 0x3f;
625 |     mul_op &= 0x7;
626 |     add_op &= 0x1f;
627 |     addDest.num &= 0x3f;
628 |     mulDest.num &= 0x3f;
629 |     cond_add &= 0x7;
630 |     cond_mul &= 0x7;
631 |     sf &= 0x1;
632 |     ws &= 0x1;
633 | 
634 | //    printf("Assembling ALU instruction: %s, %d, %d\n", printRegister(addDest).c_str(), ws, sig);
635 | 
636 |     printf("ALU: %s %s, %s, %s; %s %s, %s, %s\n",
637 |       printAddOpCode(add_op).c_str(),
638 |       printRegister(addDest).c_str(),
639 |       printRegister(addR1).c_str(),
640 |       printRegister(addR2).c_str(),
641 |       printMulOpCode(mul_op).c_str(),
642 |       printRegister(mulDest).c_str(),
643 |       printRegister(mulR1).c_str(),
644 |       printRegister(mulR2).c_str()
645 |     );
646 | 
647 |     ins = ((uint64_t)sig << 60) |
648 |       ((uint64_t)unpack << 57) |
649 |       ((uint64_t)pm << 56) |
650 |       ((uint64_t)pack << 52) |
651 |       ((uint64_t)cond_add << 49) |
652 |       ((uint64_t)cond_mul << 46) |
653 |       ((uint64_t)sf << 45) |
654 |       ((uint64_t)ws << 44);
655 |     ins |= ((uint64_t)addDest.num << 38) | ((uint64_t)mulDest.num << 32) | ((uint64_t)mul_op << 29) | ((uint64_t)add_op << 24);
656 |     ins |= ((uint64_t)read_a << 18) | ((uint64_t)read_b << 12) | ((uint64_t)add_a << 9) | ((uint64_t)add_b << 6) | ((uint64_t)mul_a << 3) | mul_b;
657 | 
658 |     return ins;
659 | }
660 | 
661 | uint64_t assembleLDI(context& ctx, string word)
662 | {
663 |     cout << "Assembling LDI instruction ... " << endl;
664 | 
665 |     string token_str;
666 |     token_t tok = nextToken(ctx.stream, token_str, &ctx.stream);
667 | 
668 |     if (tok == DOT) {
669 |         // conditional ... conditionals should be on each register ?
670 |         cout << "conditional ... ";
671 |         // chew the conditional
672 |         nextToken(ctx.stream, token_str, &ctx.stream);
673 | 
674 |         tok = nextToken(ctx.stream, token_str, &ctx.stream);
675 |     }
676 | 
677 |     // this is supposed to be the register
678 |     if (tok != WORD) return -1;
679 | 
680 |     QPUreg register1, register2;
681 |     // check errors here
682 |     if (!parseRegister(token_str, register1)) {
683 |       return false;
684 |     }
685 |     tok = nextToken(ctx.stream, token_str, &ctx.stream);
686 |     if (tok != COMMA) return -1;
687 |     tok = nextToken(ctx.stream, token_str, &ctx.stream);
688 | 
689 |     // this can either be another register
690 |     // (in which case we'll use both ALUs to set)
691 |     // or an immediate value (in which case we'll use rX39)
692 |     register2.num = 39;
693 |     register2.file = (register1.file == QPUreg::A) ? QPUreg::B : QPUreg::A;
694 |     if (isRegisterWord(token_str)) {
695 |         if (!parseRegister(token_str, register2)) {
696 |           return -1;
697 |         }
698 |         tok = nextToken(ctx.stream, token_str, &ctx.stream);
699 |         // check that this is a comma ...
700 |     }
701 | 
702 |     uint32_t immediateType = 0x00; // A full 32-bit immediate
703 |     unsigned int immediate;
704 |     string restOfLine(ctx.stream);
705 |     restOfLine = (token_str + restOfLine);
706 |     if (!parseFullImmediate(restOfLine, &immediate, &immediateType)) {
707 |       cerr << "Immediate couldn't be parsed: " << restOfLine << endl;
708 |       return -1;
709 |     }
710 | 
711 |     cout << "r1: " << printRegister(register1) << ", r2: "
712 |                    << printRegister(register2) << ", immed: 0x"
713 |                    << hex << immediate << dec << endl;
714 | 
715 |     // The accumulators are mapped to r32-35 in this context
716 |     if (register1.file == QPUreg::ACCUM) {
717 |       register1.num += 32;
718 |     }
719 |     if (register2.file == QPUreg::ACCUM) {
720 |       register2.num += 32;
721 |     }
722 | 
723 |     uint32_t high = (uint32_t)0xE << 28;
724 |     high |= immediateType << 24;
725 |     high |= (uint32_t)0x1 << 17;      // cond_add
726 |     high |= (uint32_t)0x1 << 14;      // cond_mul
727 |     high |= (uint32_t)0x0 << 13;      // sf
728 |     high |= (uint32_t)0x0 << 12;      // ws
729 |     uint8_t addreg = (register1.file != QPUreg::B) ? register1.num : register2.num;
730 |     uint8_t mulreg = (register1.file == QPUreg::B) ? register1.num : register2.num;
731 |     high |= (uint32_t)addreg << 6;
732 |     high |= mulreg;
733 |     uint64_t ins = ((uint64_t)high << 32) | immediate;
734 | 
735 |     return ins;
736 | }
737 | 
738 | uint64_t assembleBRANCH(context& ctx, string word)
739 | {
740 |     cout << "Assembing BRANCH instruction" << endl;
741 | 
742 |     QPUreg dest;
743 |     string token_str;
744 |     token_t tok = nextToken(ctx.stream, token_str, &ctx.stream);
745 | 
746 |     // relative or absolute branch?
747 |     uint8_t relative = 1;
748 |     if (word == "bra")
749 |         relative = 0;
750 | 
751 |     uint8_t branchCondition = 0xf;          // by default: always (unconditional branch)
752 |     if (tok == DOT) {
753 |         // conditional
754 |         nextToken(ctx.stream, token_str, &ctx.stream);
755 |         branchCondition = parseBranchCond(token_str);
756 |         tok = nextToken(ctx.stream, token_str, &ctx.stream);
757 |     }
758 | 
759 |     // this is the destination register
760 |     if (tok != WORD) {
761 |         cerr << "branch expecting destination register." << endl;
762 |         return -1;
763 |     }
764 |     if (!parseRegister(token_str, dest)) {
765 |       return false;
766 |     }
767 |     tok = nextToken(ctx.stream, token_str, &ctx.stream);
768 |     if (tok != COMMA) return false;
769 |     tok = nextToken(ctx.stream, token_str, &ctx.stream);
770 |     if (tok != WORD) {
771 |         cerr << "branch expecting label/target" << endl;
772 |         return -1;
773 |     }
774 | 
775 |     // look it up in the labels map
776 |     int target = 0xFFFFFFFF;
777 |     if (ctx.labels.count(token_str) < 1) {
778 |         relocation r;
779 |         r.label = token_str;
780 |         r.pc = ctx.pc;
781 |         ctx.relocations.push_back(r);
782 |     } else
783 |         target = ctx.labels[token_str];
784 |     int offset = target - (ctx.pc+4*8);
785 | 
786 |     uint8_t raddr_a = 0;           // raddr_a is only 5-bits?
787 |     uint8_t use_reg = 0;
788 |     // if there's a third argument, it is a register offset
789 |     const char *discard;
790 |     tok = nextToken(ctx.stream, token_str, &discard);
791 |     if (tok == COMMA) {
792 |         QPUreg offsetReg;
793 |         // chew the comma we just read
794 |         ctx.stream = discard;
795 |         tok = nextToken(ctx.stream, token_str, &ctx.stream);
796 |         if (!parseRegister(token_str, offsetReg)) {
797 |           return -1;
798 |         }
799 |         if (offsetReg.file != QPUreg::A) {
800 |             cerr << "branch target offset register must be file A" << endl;
801 |             return -1;
802 |         }
803 |         if (offsetReg.num > 31) {
804 |             cerr << "branch target offset register must be < 32" << endl;
805 |             return -1;
806 |         }
807 |         raddr_a = offsetReg.num;
808 |         use_reg = 1;
809 |     }
810 | 
811 |     uint8_t waddr_add = 39;         // link address appears at ALU outputs
812 |     uint8_t waddr_mul = 39;
813 |     if (dest.file == QPUreg::A) waddr_add = dest.num;
814 |     if (dest.file == QPUreg::B) waddr_mul = dest.num;
815 | 
816 |     // TODO: generate absolute branches too
817 | 
818 |     uint64_t ins = (uint64_t)0xF << 60;
819 |     ins |= (uint64_t)branchCondition << 52;
820 |     ins |= (uint64_t)relative << 51;
821 |     ins |= (uint64_t)use_reg << 50;
822 |     ins |= (uint64_t)raddr_a << 45;
823 |     ins |= (uint64_t)0x0 << 44;                       // write-swap
824 |     ins |= (uint64_t)waddr_add << 38;
825 |     ins |= (uint64_t)waddr_mul << 32;
826 |     ins |= (uint32_t)offset;
827 | 
828 |     return ins;
829 | }
830 | 
831 | uint64_t assembleSEMA(context& ctx, string word)
832 | {
833 | 
834 |     uint64_t ins = (uint64_t)0x74 << 57;
835 | 
836 |     string token_str;
837 |     token_t tok = nextToken(ctx.stream, token_str, &ctx.stream);
838 |     if (tok != WORD) {
839 |         cerr << "semaphore instruction expecting down/up or acquire/release" << endl;
840 |         return -1;
841 |     }
842 | 
843 |     uint8_t sa = 0;             // up
844 |     if (token_str == "down" || token_str == "acquire")
845 |         sa = 1;
846 | 
847 |     tok = nextToken(ctx.stream, token_str, &ctx.stream);
848 |     if (tok != COMMA)   return -1;
849 |     tok = nextToken(ctx.stream, token_str, &ctx.stream);
850 |     uint32_t imm = parseSmallImmediate(token_str);
851 |     if (imm < 0) {
852 |         cerr << "semaphore out of range" << endl;
853 |         return -1;
854 |     }
855 |     // cond_add, cond_mul = NEVER, ws, sf = false
856 |     ins |= (uint64_t)39 << 38;          // waddr_add
857 |     ins |= (uint64_t)39 << 32;          // waddr_mul
858 |     ins |= sa << 4;
859 |     ins |= (uint8_t)imm;
860 | 
861 |     cout << "Assembling SEMAPHORE instruction (" << imm << "), " << (int)sa << endl;
862 | 
863 |     return ins;
864 | }
865 | 
866 | 
867 | int main(int argc, char **argv)
868 | {
869 |     char *outfname = 0;
870 |     int c;
871 | 
872 |     char* writeCPP = NULL;
873 |     while ((c = getopt(argc, argv, "o:c:")) != -1) {
874 |         switch (c) {
875 |             case 'o':
876 |                 outfname = optarg;
877 |                 break;
878 |             case 'c':
879 |                 writeCPP = optarg;
880 |                 break;
881 |         }
882 |     }
883 | 
884 |     if (!outfname) {
885 |         cerr << "Usage: " << argv[0] << " -o <output>" << endl;
886 |         return -1;
887 |     }
888 | 
889 |     char line[128];
890 |     string token_string;
891 | 
892 |     struct context ctx;
893 |     ctx.pc = 0;
894 | 
895 |     vector<uint64_t> instructions;
896 | 
897 |     while (cin.getline(line, 128))
898 |     {
899 |         const char *p = line;
900 |         ctx.stream = p;
901 |         token_t tok = nextToken(ctx.stream, token_string, &ctx.stream);
902 | 
903 |         if (tok == END)
904 |             continue;
905 | 
906 |         if (tok == WORD)
907 |         {
908 |             // read-ahead to see if the next token is a colon in which case
909 |             // this is a label.
910 |             const char *discard = NULL;
911 |             string nextTokenStr;
912 |             if (nextToken(ctx.stream, nextTokenStr, &discard) == COLON) {
913 |                 ctx.labels[token_string] = ctx.pc;
914 |                 continue;
915 |             }
916 | 
917 |             enum { INVALID, ALU, BRANCH, LDI, SEMA } opType = INVALID;
918 |             if (addOpCode(token_string) != 0xFF || mulOpCode(token_string) != 0xFF)
919 |                 opType = ALU;
920 |             if (token_string == "ldi") opType = LDI;
921 |             if (token_string == "bra" || token_string == "brr") opType = BRANCH;
922 |             if (token_string == "sema") opType = SEMA;
923 | 
924 |             if (opType == INVALID) {
925 |                 cout << "Unable to assemble line; invalid opcode: " << line << endl;
926 |                 return -1;
927 |             }
928 | 
929 |             uint64_t ins = 0;
930 |             switch (opType) {
931 |                 case ALU: ins = assembleALU(ctx, token_string); break;
932 |                 case BRANCH: ins = assembleBRANCH(ctx, token_string); break;
933 |                 case LDI: ins = assembleLDI(ctx, token_string); break;
934 |                 case SEMA: ins = assembleSEMA(ctx, token_string); break;
935 |             }
936 | 
937 |             if (ins == (uint64_t)-1) {
938 |                 cerr << "Error on line: " << line << endl;
939 |                 return -1;
940 |             }
941 | 
942 |             instructions.push_back(ins);
943 |             ctx.pc += 8;            // bytes;
944 |         }
945 |     }
946 | 
947 |     // Process relocations
948 |     ctx.labels["ZERO"] = 0x0;
949 |     for (int i=0; i < ctx.relocations.size(); i++)
950 |     {
951 |         relocation& r = ctx.relocations[i];
952 |         if (ctx.labels.count(r.label) < 1)
953 |         {
954 |             cerr << "undefined label: " << r.label << endl;
955 |             return -1;
956 |         }
957 |         int offset = ctx.labels[r.label] - (r.pc + 4*8);
958 |         if (r.label == "ZERO")
959 |             offset = 0x0;
960 |         cout << "Processing relocation at " << r.pc << " : " << r.label
961 |                                             << " : " << offset << endl;
962 |         uint64_t ins = instructions[r.pc / 8];
963 |         ins &= (uint64_t)0xFFFFFFFF << 32;   // zero bottom 32-bits for new value
964 |         ins |= (uint32_t)offset;
965 |         instructions[r.pc / 8] = ins;
966 |     }
967 | 
968 |     FILE *outfile = fopen(outfname, "w");
969 |     if (!outfile)
970 |     {
971 |         cerr << "Unable to open output file " << string(outfname) << endl;
972 |         return -1;
973 |     }
974 | 
975 |     if (writeCPP) {
976 |       fprintf(outfile, "#include <stdint.h>\n");
977 |       fprintf(outfile, "#include <stddef.h>\n\n");
978 |       fprintf(outfile, "uint32_t %s[%d] = {\n", writeCPP, (instructions.size() * 2));
979 |       uint32_t* instructionsData = (uint32_t*)(&instructions[0]);
980 |       for (int i=0; i < instructions.size(); i++) {
981 |         fprintf(outfile, "  0x%08x, 0x%08x,\n", instructionsData[(i * 2) + 0], instructionsData[(i * 2) + 1]);
982 |       }
983 |       fprintf(outfile, "};\n\n");
984 |       fprintf(outfile, "size_t %sByteCount = %d;\n", writeCPP, (instructions.size() * 8));
985 |     } else {
986 |       for (int i=0; i < instructions.size(); i++)
987 |           fwrite(&instructions[i], sizeof(uint64_t), 1, outfile);
988 |     }
989 | 
990 |     fclose(outfile);
991 |     cout << "Done.  Num instructions: " << instructions.size() << ", "
992 |          << instructions.size() * 8 << " bytes." << endl;
993 | }
994 | 


--------------------------------------------------------------------------------