├── QPU
    ├── SHA-256
    │ ├── 1QPU
    │ │ ├── 1qpu.asm
    │ │ ├── Makefile
    │ │ ├── qpufuncs.cpp
    │ │ ├── qpufuncs.h
    │ │ ├── sha256.cpp
    │ │ └── test-data.bin
    │ ├── final
    │ │ ├── Makefile
    │ │ ├── final.asm
    │ │ ├── qpufuncs.cpp
    │ │ ├── qpufuncs.h
    │ │ ├── sha256.cpp
    │ │ └── test-data.bin
    │ ├── partial
    │ │ ├── Makefile
    │ │ ├── partial.asm
    │ │ ├── qpufuncs.cpp
    │ │ ├── qpufuncs.h
    │ │ ├── sha256.cpp
    │ │ └── test-data.bin
    │ └── reference
    │ │ ├── Makefile
    │ │ ├── sha256.cpp
    │ │ └── test-data.bin
    ├── assembler
    │ ├── Makefile
    │ └── assemble.cpp
    └── helloworld
    │ ├── Makefile
    │ ├── driver.c
    │ └── helloworld.asm
└── README.md

/QPU/SHA-256/1QPU/1qpu.asm:
--------------------------------------------------------------------------------
## SHA-256 compression of one 512-bit block per SIMD lane on a single QPU.
## The host passes three uniforms: the addresses of K, of the H vectors and
## of the message data (see the moves from ra32 below).

## One full instruction that does nothing on either ALU (used for delay slots)
define(`NOP', `nop ra39, ra39, ra39; nop rb39, rb39, rb39')

## GENSCHEDULE(W_i-16, W_i-15, W_i-7, W_i-2)
##   Computes the next message schedule word
##     W_i = smsigma1(W_i-2) + W_i-7 + smsigma0(W_i-15) + W_i-16
##   stores it over the register that held W_i-16 and also writes it to rb48.
##   Uses r0..r3 as temporaries; rb6, rb7 and rb8 hold the rotation counts
##   18, 19 and 17 (loaded further down).
define(`GENSCHEDULE',
`
    add rb32, $1, 0; nop # r0 = W_i-16
    ror rb33, $2, 7; nop # r1 = RotR(x, 7)
    ror rb34, $2, rb6; nop # r2 = RotR(x, 18)
    shr rb35, $2, 3; nop # r3 = x >> 3;
    xor rb33, r1, r2; nop # r1 = r1 ^ r2
    xor rb35, r1, r3; nop # r3 = r1 ^ r3
    add rb32, r0, r3; nop # r0 += r3 (W_i-16 + smsigma0(W_i-15))
    add rb32, r0, $3; nop # r0 += W_i-7
    ror rb33, $4, rb8; nop # r1 = RotR(x, 17)
    ror rb34, $4, rb7; nop # r2 = RotR(x, 19)
    xor rb33, r1, r2; nop # r1 = r1 ^ r2
    shr rb34, $4, 10; nop # r2 = x >> 10
    xor rb33, r1, r2; nop # r1 = r1 ^ r2
    add $1, r0, r1; nop # W_i = r0 + smsigma1(W_i-2), stored in $1 (replaces W_i-16)
    add rb48, r0, r1; nop # same sum written to rb48 (the compress loop reads W[i] from rb48)
    ## $2 ignored, $3 ignored, $4 ignored, $1 ignored (suppress warnings)')

## The 16 schedule steps of one pass; ra4..ra19 act as a rotating window of
## the last 16 schedule words.
define(`GENSCHEDULE_ALL',
`
    GENSCHEDULE(`ra4', `ra5', `ra13', `ra18')
    GENSCHEDULE(`ra5', `ra6', `ra14', `ra19')
    GENSCHEDULE(`ra6', `ra7', `ra15', `ra4')
    GENSCHEDULE(`ra7', `ra8', `ra16', `ra5')
    GENSCHEDULE(`ra8', `ra9', `ra17', `ra6')
    GENSCHEDULE(`ra9', `ra10', `ra18', `ra7')
    GENSCHEDULE(`ra10', `ra11', `ra19', `ra8')
    GENSCHEDULE(`ra11', `ra12', `ra4', `ra9')
    GENSCHEDULE(`ra12', `ra13', `ra5', `ra10')
    GENSCHEDULE(`ra13', `ra14', `ra6', `ra11')
    GENSCHEDULE(`ra14', `ra15', `ra7', `ra12')
    GENSCHEDULE(`ra15', `ra16', `ra8', `ra13')
    GENSCHEDULE(`ra16', `ra17', `ra9', `ra14')
    GENSCHEDULE(`ra17', `ra18', `ra10', `ra15')
    GENSCHEDULE(`ra18', `ra19', `ra11', `ra16')
    GENSCHEDULE(`ra19', `ra4', `ra12', `ra17')')

## Move the uniforms (arguments) into registers
or ra31, ra32, 0; nop # address of K in ra31
or ra30, ra32, 0; nop # address of H in ra30
or ra29, ra32, 0; nop # address of data in ra29

## Load some rotation constants that don't fit in small immediates
## (0x16 = 22, 0x19 = 25, 0x12 = 18, 0x13 = 19, 0x11 = 17)
ldi rb2, 0x16;
ldi rb5, 0x19;
ldi rb6, 0x12;
ldi rb7, 0x13;
ldi rb8, 0x11;

## VCD DMA setup for the H vectors (16x8)
ldi ra49, 0x82801000

## Move the H vectors into the VPM (0,0 in VPM)
or ra50, ra30, 0; nop

## Wait for the DMA to complete
and rb39, ra50, ra50; nop

## Configure the VPM for reading the H vectors
ldi ra49, 0x801200

## Read the H vectors into registers ra20..ra27 (these are the a..h)
## Also copy them into rb20..rb27 (we need the original values to write back)
or ra20, ra48, 0; v8max rb20, ra48, ra48;
or ra21, ra48, 0; v8max rb21, ra48, ra48;
or ra22, ra48, 0; v8max rb22, ra48, ra48;
or ra23, ra48, 0; v8max rb23, ra48, ra48;
or ra24, ra48, 0; v8max rb24, ra48, ra48;
or ra25, ra48, 0; v8max rb25, ra48, ra48;
or ra26, ra48, 0; v8max rb26, ra48, ra48;
or ra27, ra48, 0; v8max rb27, ra48, ra48;

## Configure the VPM/VCD to read the data vectors
ldi ra49, 0x83001000
or ra50, ra29, 0; nop ## Load address to DMA
or rb39, ra50, 0; nop ## Wait for it

## Read the data vectors into ra4..ra19 since we use the registers in
## GENSCHEDULE
ldi ra49, 0x1200
or ra4, ra48, 0; nop
or ra5, ra48, 0; nop
or ra6, ra48, 0; nop
or ra7, ra48, 0; nop
or ra8, ra48, 0; nop
or ra9, ra48, 0; nop
or ra10, ra48, 0; nop
or ra11, ra48, 0; nop
or ra12, ra48, 0; nop
or ra13, ra48, 0; nop
or ra14, ra48, 0; nop
or ra15, ra48, 0; nop
or ra16, ra48, 0; nop
or ra17, ra48, 0; nop
or ra18, ra48, 0; nop
or ra19, ra48, 0; nop


## 4 loops of 16 = 64 iterations
ldi ra1, 4
mainloop:

## Set up a VPM read so the compress loop can fetch W[i] from rb48
ldi ra49, 0x1200
## 16 loops of compression
ldi ra2, 0x10;
compress:
## r0 = K[i] + h
## (K[i] is fetched through the TMU: address written to rb56, result in r4;
##  ra31 is then advanced by 4 bytes to the next K word)
or rb56, ra31, 0; nop
nop.tmu ra39, ra39, ra39; nop
add ra31, ra31, 4; nop
add rb32, r4, ra27; nop

## T1 = h + K[i] + W[i]
add rb18, r0, rb48; nop

## T1 += CH(e,f,g) => (e & f) ^ (~e & g) (e: ra24, f: ra25, g: ra26)
or ra32, ra24, 0; nop # load e into r0
and ra33, ra25, r0; nop # r1 = r0 & f (e & f)
not ra32, r0, 0; nop # r0 = ~r0 (~e)
and ra32, r0, ra26; nop # r0 = r0 & g (~e & g)
xor ra32, r0, r1; nop # r0 = r0 ^ r1 (e & f) ^ (~e & g)
add rb18, rb18, r0; nop # accumulate into T1

## T1 += sigma1(e) => RotR(e, 6) ^ RotR(e, 11) ^ RotR(e, 25)
ror rb32, ra24, 6; nop
ror rb33, ra24, 11; nop
ror rb34, ra24, rb5; nop
xor rb32, r0, r1; nop
xor rb32, r0, r2; nop
add rb18, r0, rb18; nop

## T2 (ra3) = sigma0(a) (a: ra20)
ror ra32, ra20, 2; nop # r0 = RotR(a, 2)
ror ra33, ra20, 13; nop # r1 = RotR(a, 13)
xor ra32, r0, r1; nop # r0 = RotR(a, 2) ^ RotR(a, 13)
ror ra33, ra20, rb2; nop # r1 = RotR(a, 22)
xor ra3, r0, r1; nop # T2 = sigma0(a)

## T2 += Maj(a,b,c)
or ra32, ra20, 0; nop # load a into r0
and ra33, r0, ra21; nop # r1 = a & b
and ra34, r0, ra22; nop # r2 = a & c
xor ra32, r1, r2; nop # r0 = (a & b) ^ (a & c)
or ra33, ra21, 0; nop # load b into r1
and ra33, r1, ra22; nop # r1 = b & c
xor ra32, r0, r1; nop # r0 = r0 ^ r1
add ra3, ra3, r0; nop # T2 += Maj(a,b,c)

## swizzle: h=g, g=f, f=e, e=d+T1, d=c, c=b, b=a, a=T1+T2
or ra27, ra26, 0; nop
or ra26, ra25, 0; nop
or ra25, ra24, 0; nop
add ra24, ra23, rb18; nop
or ra23, ra22, 0; nop
or ra22, ra21, 0; nop
or ra21, ra20, 0; nop
add ra20, rb18, ra3; nop

## Loop
sub ra2, ra2, 1; nop
brr.ze ra39, compress
NOP
NOP
NOP
sub ra1, ra1, 1; nop
brr.zf ra39, done
NOP
NOP
NOP
## Generate the next 16 schedule words (each one is also written to rb48)
ldi rb49, 0x1200
GENSCHEDULE_ALL
brr ra39, mainloop
done:
NOP
NOP
NOP

## Configure the VPM to write the H vectors back into place
ldi rb49, 0x1200

## Write H vectors back (+=)
add rb48, ra20, rb20; nop
add rb48, ra21, rb21; nop
add rb48, ra22, rb22; nop
add rb48, ra23, rb23; nop
add rb48, ra24, rb24; nop
add rb48, ra25, rb25; nop
add rb48, ra26, rb26; nop
add rb48, ra27, rb27; nop

## Configure the VCD for DMA back to the host
ldi rb49, 0x88084000

## Write the H address to store
or rb50, ra30, 0; nop

## Wait for the DMA to complete
or rb39, rb50, ra39; nop

## Trigger a host interrupt to finish the program
or rb38, ra39, rb39; nop

nop.tend ra39, ra39, ra39; nop rb39, rb39, rb39
NOP
NOP

--------------------------------------------------------------------------------
/QPU/SHA-256/1QPU/Makefile:
--------------------------------------------------------------------------------
# The mailbox helper is taken from the hello_fft sample under /opt/vc
MBOX_C = /opt/vc/src/hello_pi/hello_fft/mailbox.c
MBOX_INC = -I/opt/vc/src/hello_pi/hello_fft

sha256: sha256.cpp qpufuncs.cpp
	g++ -O3 -o sha256 sha256.cpp qpufuncs.cpp $(MBOX_C) $(MBOX_INC)

--------------------------------------------------------------------------------
/QPU/SHA-256/1QPU/qpufuncs.cpp:
--------------------------------------------------------------------------------
#include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include // memset 8 | #include 9 | #include 10 | #include "mailbox.h" 11 | #include "qpufuncs.h" 12 | 13 | #define GPU_MEM_FLG 0xC // cached 14 | #define GPU_MEM_MAP 0x0 // cached 15 | #define NUNIFORMS 3 16 | 17 | 18 | struct sha256_memory_map 19 | { 20 | /* 21 | data layout is: 22 | 64 words for K constants (accessed as a texture lookup) 23 | 16x8 (128) words for the 8 H vectors (VPM) 24 | 16x16 (256) words for the input data (VPM) 25 | Total: 448 words 26 | */ 27 | uint32_t data[64 + (128 + 256) * NUM_QPUS]; 28 | uint32_t code[MAX_CODE_SIZE]; 29 | /* 30 | uniforms are: 31 | u1: address of K texture 32 | u2: address of H vectors (also output location) 33 | u3: address of data buffer 34 | u4: stride 35 | */ 36 | uint32_t uniforms[NUNIFORMS * NUM_QPUS]; 37 | uint32_t msg[NUM_QPUS*2]; // msg is a (uniform, code) tuple to execute_qpu 38 | 39 | /* results are placed back into data where the H vectors were read from */ 40 | }; 41 | 42 | 43 | static struct 44 | { 45 | int mb; 46 | unsigned handle; 47 | unsigned size; 48 | unsigned vc_msg; 49 | unsigned ptr; 50 | void* arm_ptr; 51 | } sha256_qpu_context; 52 | 53 | 54 | int SHA256SetupQPU(uint32_t* K, uint32_t *data, uint32_t *H, int stride, 55 | unsigned *shader_code, unsigned code_len) 56 | { 57 | sha256_qpu_context.mb = mbox_open(); 58 | if (qpu_enable(sha256_qpu_context.mb, 1)) { 59 | fprintf(stderr, "Unable to enable QPU\n"); 60 | return -1; 61 | } 62 | 63 | // 1 MB should be plenty 64 | sha256_qpu_context.size = 1024 * 1024; 65 | sha256_qpu_context.handle = mem_alloc(sha256_qpu_context.mb, 66 | sha256_qpu_context.size, 4096, 67 | GPU_MEM_FLG); 68 | if (!sha256_qpu_context.handle) { 69 | fprintf(stderr, "Unable to allocate %d bytes of GPU memory", 70 | sha256_qpu_context.size); 71 | return -2; 72 | } 73 | unsigned ptr = mem_lock(sha256_qpu_context.mb, sha256_qpu_context.handle); 74 | sha256_qpu_context.arm_ptr = mapmem(ptr + GPU_MEM_MAP, 
sha256_qpu_context.size); 75 | sha256_qpu_context.ptr = ptr; 76 | 77 | struct sha256_memory_map *arm_map = (struct sha256_memory_map *) 78 | sha256_qpu_context.arm_ptr; 79 | memset(arm_map, 0x0, sizeof(struct sha256_memory_map)); 80 | unsigned vc_data = ptr + offsetof(struct sha256_memory_map, data); 81 | unsigned vc_uniforms = ptr + offsetof(struct sha256_memory_map, uniforms); 82 | unsigned vc_code = ptr + offsetof(struct sha256_memory_map, code); 83 | sha256_qpu_context.vc_msg = ptr + offsetof(struct sha256_memory_map, msg); 84 | 85 | memcpy(arm_map->code, shader_code, code_len); 86 | memcpy(arm_map->data, K, 64*sizeof(uint32_t)); 87 | memcpy(arm_map->data+64, H, 128*sizeof(uint32_t)*NUM_QPUS); 88 | memcpy(arm_map->data+64 + 128*NUM_QPUS, data, 256*NUM_QPUS*sizeof(uint32_t)); 89 | for (int i=0; i < NUM_QPUS; i++) { 90 | arm_map->uniforms[i*NUNIFORMS+0] = vc_data; // data (address of K texture) 91 | arm_map->uniforms[i*NUNIFORMS+1] = vc_data + 64*sizeof(uint32_t) + 128 * i * sizeof(uint32_t); // address of H vectors 92 | arm_map->uniforms[i*NUNIFORMS+2] = vc_data + 64*sizeof(uint32_t) + 128*NUM_QPUS*sizeof(uint32_t) + 256 * i * sizeof(uint32_t); 93 | 94 | arm_map->msg[i*2+0] = vc_uniforms + i * NUNIFORMS * sizeof(uint32_t); 95 | arm_map->msg[i*2+1] = vc_code; 96 | } 97 | 98 | return sha256_qpu_context.mb; 99 | } 100 | 101 | 102 | void SHA256ExecuteQPU(uint32_t* H) 103 | { 104 | unsigned ret = execute_qpu(sha256_qpu_context.mb, NUM_QPUS, 105 | sha256_qpu_context.vc_msg, 1, 10000); 106 | if (ret != 0) 107 | fprintf(stderr, "Failed execute_qpu!\n"); 108 | } 109 | 110 | 111 | void SHA256CleanupQPU(int handle) 112 | { 113 | unmapmem(sha256_qpu_context.arm_ptr, sha256_qpu_context.size); 114 | mem_unlock(sha256_qpu_context.mb, sha256_qpu_context.handle); 115 | mem_free(sha256_qpu_context.mb, sha256_qpu_context.handle); 116 | qpu_enable(sha256_qpu_context.mb, 0); 117 | mbox_close(sha256_qpu_context.mb); 118 | } 119 | 120 | 121 | void SHA256FetchResult(uint32_t *H) 122 | 
{ 123 | struct sha256_memory_map *arm_map = (struct sha256_memory_map *) 124 | sha256_qpu_context.arm_ptr; 125 | memcpy(H, arm_map->data+64, NUM_QPUS*128*sizeof(uint32_t)); 126 | } 127 | 128 | 129 | int loadQPUCode(const char *fname, unsigned int* buffer, int len) 130 | { 131 | FILE *in = fopen(fname, "r"); 132 | if (!in) { 133 | fprintf(stderr, "Failed to open %s.\n", fname); 134 | return -1; 135 | } 136 | 137 | size_t items = fread(buffer, sizeof(unsigned int), len, in); 138 | fclose(in); 139 | 140 | return items * sizeof(unsigned int); 141 | } 142 | -------------------------------------------------------------------------------- /QPU/SHA-256/1QPU/qpufuncs.h: -------------------------------------------------------------------------------- 1 | #ifndef _QPUFUNCS_ 2 | #define _QPUFUNCS_ 3 | 4 | #define NUM_QPUS 1 5 | #define MAX_CODE_SIZE 24000 /* in words */ 6 | 7 | int SHA256SetupQPU(uint32_t* K, uint32_t *data, uint32_t *H, int stride, 8 | unsigned *shader_code, unsigned code_len); 9 | int loadQPUCode(const char *fname, unsigned int* buffer, int len); 10 | void SHA256CleanupQPU(int handle); 11 | void SHA256ExecuteQPU(uint32_t* H); 12 | void SHA256FetchResult(uint32_t* H); 13 | 14 | #endif // _QPUFUNCS_ 15 | -------------------------------------------------------------------------------- /QPU/SHA-256/1QPU/sha256.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "qpufuncs.h" 7 | 8 | #define QPU_CODE_FILE "sha256.bin" 9 | #define NUM_QPUS 1 10 | #define BUFFER_SIZE NUM_QPUS * 16 11 | 12 | static uint32_t K[] = { 13 | 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 14 | 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 15 | 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786, 16 | 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 17 | 0x983e5152, 0xa831c66d, 0xb00327c8, 
0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 18 | 0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 19 | 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b, 20 | 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 21 | 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 22 | 0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 23 | 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 24 | }; 25 | 26 | static inline uint32_t CH(uint32_t x, uint32_t y, uint32_t z) { 27 | return (x & y) ^ (~x & z); 28 | } 29 | 30 | static inline uint32_t Maj(uint32_t x, uint32_t y, uint32_t z) { 31 | return (x & y) ^ (x & z) ^ (y & z); 32 | } 33 | 34 | static inline uint32_t RotR(uint32_t x, uint8_t shift) { 35 | return (x >> shift) | (x << (32-shift)); 36 | } 37 | 38 | static inline uint32_t sigma0(uint32_t x) { 39 | return RotR(x, 2) ^ RotR(x, 13) ^ RotR(x, 22); 40 | } 41 | 42 | static inline uint32_t sigma1(uint32_t x) { 43 | return RotR(x, 6) ^ RotR(x, 11) ^ RotR(x, 25); 44 | } 45 | 46 | static inline uint32_t smsigma0(uint32_t x) { 47 | return RotR(x, 7) ^ RotR(x, 18) ^ (x >> 3); 48 | } 49 | 50 | static inline uint32_t smsigma1(uint32_t x) { 51 | return RotR(x, 17) ^ RotR(x, 19) ^ (x >> 10); 52 | } 53 | 54 | #define ENDIAN(x, i) ((x[i*4] << 24) | (x[i*4+1] << 16) | (x[i*4+2] << 8) | (x[i*4+3])) 55 | 56 | /* 57 | * data is an array of BUFFER_SIZE buffers to hash 58 | * H is an input/output parameter 59 | * stride is the stride for data (TODO: handle hashes of more than one block) 60 | */ 61 | void execute_sha256_cpu(uint32_t *data, uint32_t *H, int stride) 62 | { 63 | uint32_t W[64]; 64 | uint32_t a, b, c, d, e, f, g, h; 65 | 66 | for (int k=0; k < BUFFER_SIZE; k++) 67 | { 68 | for (int i=0; i < 16; i++) 69 | W[i] = data[k*stride+i]; 70 | for (int i=16; i < 64; i++) 71 | W[i] = smsigma1(W[i-2]) + W[i-7] + smsigma0(W[i-15]) + W[i-16]; 72 | 73 | a = H[k*8+0]; 74 | b = H[k*8+1]; 75 | c = H[k*8+2]; 
76 | d = H[k*8+3]; 77 | e = H[k*8+4]; 78 | f = H[k*8+5]; 79 | g = H[k*8+6]; 80 | h = H[k*8+7]; 81 | 82 | for (int i=0; i < 64; i++) 83 | { 84 | uint32_t T1 = h + sigma1(e) + CH(e,f,g) + K[i] + W[i]; 85 | uint32_t T2 = sigma0(a) + Maj(a,b,c); 86 | h = g; 87 | g = f; 88 | f = e; 89 | e = d + T1; 90 | d = c; 91 | c = b; 92 | b = a; 93 | a = T1 + T2; 94 | } 95 | 96 | H[k*8+0] += a; 97 | H[k*8+1] += b; 98 | H[k*8+2] += c; 99 | H[k*8+3] += d; 100 | H[k*8+4] += e; 101 | H[k*8+5] += f; 102 | H[k*8+6] += g; 103 | H[k*8+7] += h; 104 | } 105 | } 106 | 107 | 108 | void execute_sha256_qpu(uint32_t *data, uint32_t *H, int stride) 109 | { 110 | SHA256ExecuteQPU(H); 111 | } 112 | 113 | 114 | int main(int argc, char **argv) 115 | { 116 | unsigned int shader_code[MAX_CODE_SIZE]; 117 | bool run_cpu(true); 118 | 119 | if (argc < 2) { 120 | fprintf(stderr, "Usage: %s [-qpu]\n", argv[0]); 121 | return 1; 122 | } 123 | if (argc > 2 && (strcmp(argv[2], "-qpu") == 0)) 124 | run_cpu = false; 125 | 126 | /* Load the data to hash */ 127 | FILE *f_data = fopen(argv[1], "r"); 128 | if (!f_data) { 129 | fprintf(stderr, "Unable to open file %s\n", argv[1]); 130 | return 3; 131 | } 132 | 133 | /* Load the QPU code */ 134 | int code_len = loadQPUCode(QPU_CODE_FILE, shader_code, MAX_CODE_SIZE); 135 | if (code_len < 1) { 136 | fprintf(stderr, "Unable to load QPU code from %s\n", QPU_CODE_FILE); 137 | return 2; 138 | } 139 | printf("Loaded %d bytes of QPU code.\n", code_len); 140 | 141 | int nblocks = 1; // 1 512-bit block for now 142 | int stride = nblocks * 16; 143 | uint32_t *buffer = new uint32_t[BUFFER_SIZE*stride]; 144 | uint32_t *H = new uint32_t[BUFFER_SIZE*8]; 145 | 146 | uint32_t H0[] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 147 | 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; 148 | char filebuffer[64]; // 1 512-bit blocks 149 | for (int i=0; i < BUFFER_SIZE; i++) 150 | { 151 | memcpy(H+i*8, H0, sizeof(H0)); 152 | memset(filebuffer, 0x0, sizeof(filebuffer)); 153 | // read a 
line up to 64-bytes long 154 | char *p = fgets(filebuffer, sizeof(filebuffer)-1, f_data); 155 | if (!p) { 156 | fprintf(stderr, "Failed to read enough lines from data file.\n"); 157 | delete[] H; 158 | delete[] buffer; 159 | return 4; 160 | } 161 | 162 | int bytes = strlen(filebuffer); 163 | filebuffer[bytes] = 0x80; // SHA-256 padding 164 | 165 | // last 8 bytes are the length of the initial message in bits 166 | uint8_t len_buffer[8]; 167 | *(uint64_t *)len_buffer = bytes * 8; 168 | for (int j=0; j < 8; j++) 169 | filebuffer[56+j] = len_buffer[7-j]; 170 | 171 | for (int j=0; j < 16; j++) 172 | buffer[i*16+j] = ENDIAN(filebuffer, j); 173 | } 174 | 175 | int handle = SHA256SetupQPU(K, buffer, H, stride, shader_code, code_len); 176 | if (handle < 0) { 177 | fprintf(stderr, "Unable to setup QPU. Check permissions\n"); 178 | delete[] buffer; 179 | delete[] H; 180 | return 4; 181 | } 182 | 183 | /* 184 | * SHA-256 calculation here 185 | */ 186 | for (int i=0; i < nblocks; i++) 187 | { 188 | printf("Running %s version ...\n", (run_cpu) ? 
"CPU" : "QPU"); 189 | if (run_cpu) 190 | execute_sha256_cpu(buffer+i*16, H, stride); 191 | else 192 | execute_sha256_qpu(buffer+i*16, H, stride); 193 | } 194 | 195 | if (!run_cpu) 196 | SHA256FetchResult(H); 197 | 198 | // print out the H 199 | for (int i=0; i < BUFFER_SIZE; i++) { 200 | printf("%02d / SHA-256: ", i); 201 | for (int j=0; j < 8; j++) 202 | printf("%08x ", H[i*8+j]); 203 | printf("\n"); 204 | } 205 | 206 | SHA256CleanupQPU(handle); 207 | 208 | delete[] buffer; 209 | delete[] H; 210 | } 211 | -------------------------------------------------------------------------------- /QPU/SHA-256/1QPU/test-data.bin: -------------------------------------------------------------------------------- 1 | this is line number one 2 | this is line number two 3 | this is line number three 4 | this is line number four 5 | this is line number five 6 | this is line number six 7 | this is line number seven 8 | this is line number eight 9 | this is line number nine 10 | this is line number ten 11 | this is line number eleven 12 | this is line number twelve 13 | this is line number thirteen 14 | this is line number fourteen 15 | this is line number fifteen 16 | this is line number sixteen 17 | -------------------------------------------------------------------------------- /QPU/SHA-256/final/Makefile: -------------------------------------------------------------------------------- 1 | MBOX_C = /opt/vc/src/hello_pi/hello_fft/mailbox.c 2 | MBOX_INC = -I/opt/vc/src/hello_pi/hello_fft 3 | 4 | sha256: sha256.cpp qpufuncs.cpp 5 | g++ -O3 -o sha256 sha256.cpp qpufuncs.cpp $(MBOX_C) $(MBOX_INC) 6 | -------------------------------------------------------------------------------- /QPU/SHA-256/final/final.asm: -------------------------------------------------------------------------------- 1 | define(`MUTEX_ACQUIRE', `or ra39, ra51, rb39; nop') 2 | define(`MUTEX_RELEASE', `or ra51, ra39, ra39; nop') 3 | define(`NOP', `nop ra39, ra39, ra39; nop rb39, rb39, rb39') 4 | ## 5 | # generate a 
# schedule vector. Call as
# GENSCHEDULE(register W_i-16, W_i-15, W_i-7, W_i-2)
# (the result is stored over the W_i-16 register and copied to ra1)
# these need to be ra (A-file) registers because we use small immediates
# uses temp registers r0 - r3
# returns by branching to the address held in ra0
#

# FAKESCHEDULE(reg): table entry used for the first 16 rounds, where W[i] is
# simply the loaded message word. Copies reg to ra1 and branches back via ra0.
define(`FAKESCHEDULE',
`   or ra1, $1, $1; nop
    bra ra39, ZERO, ra0;
    NOP
    NOP
    NOP')

define(`GENSCHEDULE',
`
    ror rb33, $2, 7; nop # r1 = RotR(x, 7)
    ror rb34, $2, rb6; nop # r2 = RotR(x, 18)
    shr rb35, $2, 3; nop # r3 = x >> 3;
    xor rb33, r1, r2; v8max ra32, $1, $1 # r1 = r1 ^ r2, r0 = W_i-16
    xor rb35, r1, r3; nop # r3 = r1 ^ r3
    add rb32, r0, r3; nop # r0 += r3 (W_i-16 + smsigma0(W_i-15))
    add rb32, r0, $3; nop # r0 += W_i-7
    ror rb33, $4, rb8; nop # r1 = RotR(x, 17)
    ror rb34, $4, rb7; nop # r2 = RotR(x, 19)
    xor rb33, r1, r2; nop # r1 = r1 ^ r2
    shr rb34, $4, 10; nop # r2 = x >> 10
    xor rb33, r1, r2; nop # r1 = r1 ^ r2
    add $1, r0, r1; nop # W_i = r0 + smsigma1(W_i-2), stored in $1 (replaces W_i-16)
    ## move it into another register for reading
    add ra1, r0, r1; nop
    ## branch back (ra0)
    bra ra39, ZERO, ra0
    NOP
    NOP
    NOP
    ## $2 ignored, $3 ignored, $4 ignored, $1 ignored (suppress warnings)')
define(`GENSCHEDULE_ALL',
`
    GENSCHEDULE(`ra4', `ra5', `ra13', `ra18')
    GENSCHEDULE(`ra5', `ra6', `ra14', `ra19')
    GENSCHEDULE(`ra6', `ra7', `ra15', `ra4')
    GENSCHEDULE(`ra7', `ra8', `ra16', `ra5')
    GENSCHEDULE(`ra8', `ra9', `ra17', `ra6')
    GENSCHEDULE(`ra9', `ra10', `ra18', `ra7')
    GENSCHEDULE(`ra10', `ra11', `ra19', `ra8')
    GENSCHEDULE(`ra11', `ra12', `ra4', `ra9')
    GENSCHEDULE(`ra12', `ra13', `ra5', `ra10')
    GENSCHEDULE(`ra13', `ra14', `ra6', `ra11')
    GENSCHEDULE(`ra14', `ra15', `ra7', `ra12')
    GENSCHEDULE(`ra15', `ra16', `ra8', `ra13')
    GENSCHEDULE(`ra16', `ra17', `ra9', `ra14')
    GENSCHEDULE(`ra17', `ra18', `ra10', `ra15')
    GENSCHEDULE(`ra18', `ra19', `ra11', `ra16')
    GENSCHEDULE(`ra19', `ra4', `ra12', `ra17')')
define(`FAKESCHEDULE_ALL',
`
    FAKESCHEDULE(`ra4')
    FAKESCHEDULE(`ra5')
    FAKESCHEDULE(`ra6')
    FAKESCHEDULE(`ra7')
    FAKESCHEDULE(`ra8')
    FAKESCHEDULE(`ra9')
    FAKESCHEDULE(`ra10')
    FAKESCHEDULE(`ra11')
    FAKESCHEDULE(`ra12')
    FAKESCHEDULE(`ra13')
    FAKESCHEDULE(`ra14')
    FAKESCHEDULE(`ra15')
    FAKESCHEDULE(`ra16')
    FAKESCHEDULE(`ra17')
    FAKESCHEDULE(`ra18')
    FAKESCHEDULE(`ra19')')

# move the uniforms into registers
or ra31, ra32, 0; nop # address of K in ra31
or ra30, ra32, 0; nop # address of H in ra30
or rb29, ra32, 0; nop # address of data in rb29
or ra2, ra32, 0; nop # number of laps in ra2

or rb31, ra31, 0; nop ## save ra31 (K address) since we overwrite
## this in the main loop

# some rotation constants that don't fit in small immediates
# (0x16 = 22, 0x19 = 25, 0x12 = 18, 0x13 = 19, 0x11 = 17)
ldi rb2, 0x16;
ldi rb5, 0x19;
ldi rb6, 0x12;
ldi rb7, 0x13;
ldi rb8, 0x11;

mainloop:
## Restore the K texture base address
and ra31, rb31, rb31; nop

## Lock the VPM mutex
MUTEX_ACQUIRE()

# VDR DMA read setup for data vectors (16x16)
# MODEW = 0, MPITCH = 3, ROWLEN = 16, NROWS = 16, VPITCH=1, VERT = 0, ADDRXY = 0
ldi ra49, 0x83001000

# Move the data vectors into place (0,0 in VPM)
or ra50, rb29, rb29; nop

# wait for the DMA to complete
or rb39, ra50, 0; nop

# read the data vectors into ra4 .. ra19
ldi ra49, 0x1200
or ra4, ra48, 0; nop
or ra5, ra48, 0; nop
or ra6, ra48, 0; nop
or ra7, ra48, 0; nop
or ra8, ra48, 0; nop
or ra9, ra48, 0; nop
or ra10, ra48, 0; nop
or ra11, ra48, 0; nop
or ra12, ra48, 0; nop
or ra13, ra48, 0; nop
or ra14, ra48, 0; nop
or ra15, ra48, 0; nop
or ra16, ra48, 0; nop
or ra17, ra48, 0; nop
or ra18, ra48, 0; nop
or ra19, ra48, 0; nop

# VDR DMA read setup for H vectors (16x8)
# MODEW = 0, MPITCH = 2, ROWLEN = 8, NROWS = 16, VPITCH=1, VERT = 0, ADDRXY = (16, 0)
# NOTE(review): the low address bits of 0x82801000 are zero, which looks like
# ADDRXY = (0,0) rather than (16,0) -- confirm against the VDR setup layout
ldi ra49, 0x82801000

# Move the H vectors into place in the VPM
# NOTE(review): an earlier comment here said (16,0); see the note above
or ra50, ra30, 0; nop

# wait for the DMA to complete
or rb39, ra50, 0; nop

# configure the VPM for reading the H vectors
ldi ra49, 0x801200

# read the H vectors into registers ra20..ra27 (this is a .. h) and rb20..rb27
# (We read them into rb registers so that we can write them back)
or ra20, ra48, 0; v8max rb20, ra48, ra48
or ra21, ra48, 0; v8max rb21, ra48, ra48
or ra22, ra48, 0; v8max rb22, ra48, ra48
or ra23, ra48, 0; v8max rb23, ra48, ra48
or ra24, ra48, 0; v8max rb24, ra48, ra48
or ra25, ra48, 0; v8max rb25, ra48, ra48
or ra26, ra48, 0; v8max rb26, ra48, ra48
or ra27, ra48, 0; v8max rb27, ra48, ra48

## Unlock the VPM mutex
MUTEX_RELEASE()


# COMPRESS_ITER: one compression round for rounds 0..15. W[i] comes from the
# fakeschedule table; rb0 is the running byte offset into that table and ra29
# the size of one table entry.
define(`COMPRESS_ITER',
`
    ## Compute T1, and T2

    # T1 += K[i]
    # move the data address in ra31 (K vector) increment the K[i] and do the
    # texture lookup
    # cannot put the .tmu on add because it is using a small immediate which
    # is a sig as well
    # (prefetching these, see below rb56 and .tmu))
    add ra31, ra31, 4; nop
    add rb32, r4, ra27; v8max ra1, rb0, rb0;

    # T1 += W[i]
    ## need another instruction here to avoid the RAW hazard
    add rb0, rb0, ra29; nop
    ## this is a confusing overload of ra1 but we are running out of registers
    brr ra0, fakeschedule, ra1
    NOP
    NOP
    NOP
    add rb18, r0, ra1; nop

    # T1 = CH(e,f,g) = (e & f) ^ (~e & g) (e = ra24, f = ra25, g = ra26
    or ra32, ra24, 0; nop # load e into r0
    and ra33, ra25, r0; nop # r1 = r0 & f (e & f)
    not ra32, r0, 0; v8max rb56, ra31, ra31 # r0 = ~r0 (~e)
    and ra32, r0, ra26; nop # r0 = r0 & g (~e & g)
    xor ra32, r0, r1; nop # r0 = r0 ^ r1 (e & f) ^ (~e & g)
    add rb18, rb18, r0; v8max ra27, ra26, ra26 # T1 +=

    # T1 = sigma1(e) = RotR(e, 6) ^ RotR(e, 11) ^ RotR(e, 25)
    ror rb32, ra24, 6; nop
    ror rb33, ra24, 11; nop
    ror rb34, ra24, rb5; nop
    xor rb32, r0, r1; v8max ra26, ra25, ra25
    xor rb32, r0, r2; v8max ra25, ra24, ra24

    # T1 = sigma1(e) + CH(e,f,g)
    add.tmu rb18, r0, rb18; nop

    # T2 = sigma0(a) = ra20
    ror ra32, ra20, 2; nop # r0 = RotR(a, 2)
    ror ra33, ra20, 13; nop # r1 = RotR(a, 13)
    xor ra32, r0, r1; nop # r0 = RotR(a, 2) ^ RotR(a, 13)
    ror ra33, ra20, rb2; nop # r1 = RotR(a, 22)
    xor ra3, r0, r1; v8max rb32, ra20, ra20 # T2 = r0 ^ r1, load a into r0

    add ra24, ra23, rb18; nop

    # T2 += Maj(a,b,c) =
    and ra33, r0, ra21; nop # r1 = a & b
    and ra34, r0, ra22; nop # r2 = a & c
    xor ra32, r1, r2; v8max rb33, ra21, ra21 # r0 = r1 ^ r2, load b into r1

    and rb33, r1, ra22; v8max ra23, ra22, ra22 # r1 = b & c

    xor rb32, r0, r1; v8max ra22, ra21, ra21 # r0 = r0 ^ r1
    add ra3, ra3, r0; nop # T2 +=

    or ra21, ra20, 0; nop
    add ra20, rb18, ra3; nop
')
# COMPRESS_SCHED_ITER: one compression round for rounds 16..63. Same as
# COMPRESS_ITER except that W[i] is computed by the genschedule table.
define(`COMPRESS_SCHED_ITER',
`
    ## Compute T1, and T2

    # T1 += K[i]
    # move the data address in ra31 (K vector) increment the K[i] and do the
    # texture lookup
    # cannot put the .tmu on add because it is using a small immediate which
    # is a sig as well
    # (prefetching these, see below rb56 and .tmu))
    add ra31, ra31, 4; nop
    add rb32, r4, ra27; v8max ra1, rb0, rb0;
    add rb18, r0, 0; nop

    # T1 += W[i]
    ## this is a confusing overload of ra1 but we are running out of registers
    brr ra0, genschedule, ra1
    NOP
    NOP
    NOP
    add rb18, rb18, ra1; nop
    add rb0, rb0, ra29; nop

    # T1 = CH(e,f,g) = (e & f) ^ (~e & g) (e = ra24, f = ra25, g = ra26
    or ra32, ra24, 0; nop # load e into r0
    and ra33, ra25, r0; nop # r1 = r0 & f (e & f)
    not ra32, r0, 0; v8max rb56, ra31, ra31 # r0 = ~r0 (~e)
    and ra32, r0, ra26; nop # r0 = r0 & g (~e & g)
    xor ra32, r0, r1; nop # r0 = r0 ^ r1 (e & f) ^ (~e & g)
    add rb18, rb18, r0; v8max ra27, ra26, ra26 # T1 +=

    # T1 = sigma1(e) = RotR(e, 6) ^ RotR(e, 11) ^ RotR(e, 25)
    ror rb32, ra24, 6; nop
    ror rb33, ra24, 11; nop
    ror rb34, ra24, rb5; nop
    xor rb32, r0, r1; v8max ra26, ra25, ra25
    xor rb32, r0, r2; v8max ra25, ra24, ra24

    # T1 = sigma1(e) + CH(e,f,g)
    add.tmu rb18, r0, rb18; nop

    # T2 = sigma0(a) = ra20
    ror ra32, ra20, 2; nop # r0 = RotR(a, 2)
    ror ra33, ra20, 13; nop # r1 = RotR(a, 13)
    xor ra32, r0, r1; nop # r0 = RotR(a, 2) ^ RotR(a, 13)
    ror ra33, ra20, rb2; nop # r1 = RotR(a, 22)
    xor ra3, r0, r1; v8max rb32, ra20, ra20 # T2 = r0 ^ r1, load a into r0

    add ra24, ra23, rb18; nop

    # T2 += Maj(a,b,c) =
    and ra33, r0, ra21; nop # r1 = a & b
    and ra34, r0, ra22; nop # r2 = a & c
    xor ra32, r1, r2; v8max rb33, ra21, ra21 # r0 = r1 ^ r2, load b into r1

    and rb33, r1, ra22; v8max ra23, ra22, ra22 # r1 = b & c

    xor rb32, r0, r1; v8max ra22, ra21, ra21 # r0 = r0 ^ r1
    add ra3, ra3, r0; nop # T2 +=

    or ra21, ra20, 0; nop
    add ra20, rb18, ra3; nop
')


ldi ra29, 40 ## fakeschedule table index increment
## (each FAKESCHEDULE entry is 5 instructions; 40 bytes assuming 8-byte instructions)
## First 16 times use fakeschedule lookups
ldi ra28, 0x10
## prefetch K[i]: request it now, the round adds it from r4
or rb56, ra31, 0; nop
xor.tmu rb0, rb0, rb0; nop
firstloop:
COMPRESS_ITER()
sub ra28, ra28, 1; nop
brr.ze ra39, firstloop
NOP
NOP
NOP

## genschedule table index increment
## (each GENSCHEDULE entry is 18 instructions; 144 bytes assuming 8-byte instructions)
ldi ra29, 144
## Next 48 times (16*3) use genschedule lookups
ldi rb19, 3
outerloop:
ldi ra28, 0x10
or rb56, ra31, 0; nop
xor.tmu rb0, rb0, rb0; nop
innerloop:
COMPRESS_SCHED_ITER()
sub ra28, ra28, 1; nop
brr.ze ra39, innerloop
NOP
NOP
NOP

## rb19 -= 1, with the constant 1 loaded into r0 first
ldi ra32, 1;
sub rb19, rb19, r0; nop
brr.ze ra39, outerloop
NOP
NOP
NOP


## Lock the VPM mutex
MUTEX_ACQUIRE()

# configure the VPM to write the H vectors back into place
# (stride=1, vert, Y=16, X=0)
# NOTE(review): the low address bits of 0x1200 are zero, which looks like
# Y=0 rather than Y=16 -- confirm against the VPM write setup layout
ldi rb49, 0x1200

# write the vectors back (+=)
add rb48, ra20, rb20; nop
add rb48, ra21, rb21; nop
add rb48, ra22, rb22; nop
add rb48, ra23, rb23; nop
add rb48, ra24, rb24; nop
add rb48, ra25, rb25; nop
add rb48, ra26, rb26; nop
add rb48, ra27, rb27; nop

# configure the VPM for DMA back to the host
# nrows=16, rowlen=8, 16, 0, horiz=1
ldi rb49, 0x88084000

# write the H address again to store
or rb50, ra30, 0; nop

# Wait for the DMA to complete
or rb39, rb50, ra39; nop ra39, ra39, ra39

## Unlock the VPM mutex
MUTEX_RELEASE()

## next lap: ra2 holds the number of laps left (fourth uniform)
sub ra2, ra2, 1; nop
brr.ze ra39, mainloop
NOP
NOP
NOP

# trigger a host interrupt to stop the program (not necessary with direct-exec)
or rb38, ra39, rb39; nop ra39, ra39, ra39

finished:
nop.tend ra39, ra39, ra39; nop rb39, rb39, rb39
NOP
NOP


## schedule code table
genschedule:
GENSCHEDULE_ALL() 370 | 371 | fakeschedule: 372 | FAKESCHEDULE_ALL() 373 | -------------------------------------------------------------------------------- /QPU/SHA-256/final/qpufuncs.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include // memset 8 | #include 9 | #include 10 | #include "mailbox.h" 11 | #include "qpufuncs.h" 12 | 13 | //#define GPU_MEM_FLG 0x4 // cached=0xC; direct=0x4 14 | //#define GPU_MEM_MAP 0x20000000 // cached=0x0; direct=0x20000000 15 | 16 | #define GPU_MEM_FLG 0xC 17 | #define GPU_MEM_MAP 0x0 18 | #define REGISTER_BASE 0x20C00000 19 | 20 | #define V3D_SRQPC 0x10c 21 | #define V3D_SRQUA 0x10d 22 | #define V3D_SRQUL 0x10e 23 | #define V3D_SRQCS 0x10f 24 | 25 | #define V3D_VPMBASE 0x7e 26 | 27 | #define V3D_L2CACTL 0x8 28 | #define V3D_SLCACTL 0x9 29 | 30 | //#define DIRECT_EXEC 31 | #define NUNIFORMS 4 32 | 33 | 34 | /* 35 | * TODO: expand this for multiple QPUs 36 | */ 37 | struct sha256_memory_map 38 | { 39 | /* 40 | data layout is: 41 | 64 words for K constants (accessed as a texture lookup) 42 | 16x8 (128) words for the 8 H vectors (VPM) 43 | 16x16 (256) words for the input data (VPM) 44 | Total: 448 words 45 | */ 46 | uint32_t data[64 + (128 + 256) * NUM_QPUS]; 47 | uint32_t code[MAX_CODE_SIZE]; 48 | /* 49 | uniforms are: 50 | u1: address of K texture 51 | u2: address of H vectors (also output location) 52 | u3: address of data buffer 53 | u4: number of laps to execute 54 | */ 55 | uint32_t uniforms[NUNIFORMS*NUM_QPUS]; 56 | uint32_t msg[NUM_QPUS*2]; // msg is a (uniform, code) tuple to execute_qpu 57 | 58 | /* Results are placed back into the H vector */ 59 | }; 60 | 61 | 62 | static struct 63 | { 64 | int mb; 65 | unsigned handle; 66 | unsigned size; 67 | unsigned vc_msg; 68 | unsigned ptr; 69 | void* arm_ptr; 70 | volatile uint32_t *registers; 71 | } sha256_qpu_context; 72 | 73 | 74 | int SHA256SetupQPU(uint32_t* 
K, uint32_t *data, uint32_t *H, int stride, 75 | unsigned *shader_code, unsigned code_len) 76 | { 77 | sha256_qpu_context.mb = mbox_open(); 78 | if (qpu_enable(sha256_qpu_context.mb, 1)) { 79 | fprintf(stderr, "Unable to enable QPU\n"); 80 | return -1; 81 | } 82 | 83 | #ifdef DIRECT_EXEC 84 | int mem_dev = open("/dev/mem", O_RDWR|O_SYNC); 85 | if (mem_dev == -1) { 86 | fprintf(stderr, "Error opening /dev/mem. Check permissions\n"); 87 | mbox_close(sha256_qpu_context.mb); 88 | return -1; 89 | } 90 | // close mem_dev 91 | // munmap cleanup 92 | 93 | sha256_qpu_context.registers = (volatile uint32_t*)mmap(NULL, 4096, PROT_READ|PROT_WRITE, 94 | MAP_SHARED, mem_dev, REGISTER_BASE); 95 | if (sha256_qpu_context.registers == MAP_FAILED) { 96 | fprintf(stderr, "mmap failed.\n"); 97 | close(mem_dev); 98 | mbox_close(sha256_qpu_context.mb); 99 | return -1; 100 | } 101 | #endif 102 | 103 | // 1 MB should be plenty 104 | sha256_qpu_context.size = 1024 * 1024; 105 | sha256_qpu_context.handle = mem_alloc(sha256_qpu_context.mb, 106 | sha256_qpu_context.size, 4096, 107 | GPU_MEM_FLG); 108 | if (!sha256_qpu_context.handle) { 109 | fprintf(stderr, "Unable to allocate %d bytes of GPU memory", 110 | sha256_qpu_context.size); 111 | return -2; 112 | } 113 | unsigned ptr = mem_lock(sha256_qpu_context.mb, sha256_qpu_context.handle); 114 | sha256_qpu_context.arm_ptr = mapmem(ptr + GPU_MEM_MAP, sha256_qpu_context.size); 115 | sha256_qpu_context.ptr = ptr; 116 | printf("Locked memory at 0x%x = 0x%x\n", ptr, sha256_qpu_context.arm_ptr); 117 | 118 | struct sha256_memory_map *arm_map = (struct sha256_memory_map *) 119 | sha256_qpu_context.arm_ptr; 120 | memset(arm_map, 0x0, sizeof(struct sha256_memory_map)); 121 | unsigned vc_data = ptr + offsetof(struct sha256_memory_map, data); 122 | unsigned vc_uniforms = ptr + offsetof(struct sha256_memory_map, uniforms); 123 | unsigned vc_code = ptr + offsetof(struct sha256_memory_map, code); 124 | sha256_qpu_context.vc_msg = ptr + offsetof(struct 
sha256_memory_map, msg); 125 | 126 | memcpy(arm_map->code, shader_code, code_len); 127 | memcpy(arm_map->data, K, 64*sizeof(uint32_t)); 128 | memcpy(arm_map->data+64, H, 128*sizeof(uint32_t)*NUM_QPUS); 129 | memcpy(arm_map->data+64 + 128*NUM_QPUS, data, 256*NUM_QPUS*sizeof(uint32_t)); 130 | for (int i=0; i < NUM_QPUS; i++) { 131 | arm_map->uniforms[i*NUNIFORMS+0] = vc_data; // data (address of K texture) 132 | arm_map->uniforms[i*NUNIFORMS+1] = vc_data + 64*sizeof(uint32_t) + 128 * i * sizeof(uint32_t); // address of H vectors 133 | arm_map->uniforms[i*NUNIFORMS+2] = vc_data + 64*sizeof(uint32_t) + 128*NUM_QPUS*sizeof(uint32_t) + 256 * i * sizeof(uint32_t); 134 | arm_map->uniforms[i*NUNIFORMS+3] = 20000; // fill this in in ExecuteQPU 135 | arm_map->msg[i*2+0] = vc_uniforms + i * NUNIFORMS * sizeof(uint32_t); 136 | arm_map->msg[i*2+1] = vc_code; 137 | } 138 | 139 | return sha256_qpu_context.mb; 140 | } 141 | 142 | 143 | void SHA256ExecuteQPU(uint32_t* H, int nlaps) 144 | { 145 | struct sha256_memory_map *arm_map = (struct sha256_memory_map *) 146 | sha256_qpu_context.arm_ptr; 147 | for (int i=0; i < NUM_QPUS; i++) 148 | arm_map->uniforms[i*NUNIFORMS+3] = 20000; 149 | 150 | #ifndef DIRECT_EXEC 151 | unsigned ret = execute_qpu(sha256_qpu_context.mb, NUM_QPUS, 152 | sha256_qpu_context.vc_msg, 1, 10000); 153 | if (ret != 0) 154 | fprintf(stderr, "Failed execute_qpu!\n"); 155 | #else 156 | uint32_t qst = sha256_qpu_context.registers[V3D_SRQCS]; 157 | int qlength = qst & 0x3f; 158 | int qreqs = (qst >> 8) & 0xFF; 159 | int qcomp = (qst >> 16) & 0xFF; 160 | int qerr = (qst >> 7) & 0x1; 161 | // printf("Queue length: %d, completed: %d, requests: %d, err: %d\n", qlength, qcomp, qreqs, qerr); 162 | int target = (qcomp + NUM_QPUS) % 256; 163 | 164 | for (int i=0; i < NUM_QPUS; i++) 165 | { 166 | sha256_qpu_context.registers[V3D_SRQUL] = NUNIFORMS; 167 | sha256_qpu_context.registers[V3D_SRQUA] = arm_map->msg[i*2+0]; 168 | sha256_qpu_context.registers[V3D_SRQPC] = 
arm_map->msg[i*2+1]; 169 | } 170 | 171 | do { 172 | qst = sha256_qpu_context.registers[V3D_SRQCS]; 173 | qcomp = (qst >> 16) & 0xFF; 174 | } while (qcomp != target); 175 | // printf("Queue length: %d, completed: %d, requests: %d, err: %d\n", qlength, qcomp, qreqs, qerr); 176 | #endif 177 | } 178 | 179 | 180 | void SHA256CleanupQPU(int handle) 181 | { 182 | unmapmem(sha256_qpu_context.arm_ptr, sha256_qpu_context.size); 183 | mem_unlock(sha256_qpu_context.mb, sha256_qpu_context.handle); 184 | mem_free(sha256_qpu_context.mb, sha256_qpu_context.handle); 185 | qpu_enable(sha256_qpu_context.mb, 0); 186 | mbox_close(sha256_qpu_context.mb); 187 | } 188 | 189 | 190 | void SHA256FetchResult(uint32_t *H) 191 | { 192 | struct sha256_memory_map *arm_map = (struct sha256_memory_map *) 193 | sha256_qpu_context.arm_ptr; 194 | memcpy(H, arm_map->data+64, NUM_QPUS*128*sizeof(uint32_t)); 195 | } 196 | 197 | 198 | int loadQPUCode(const char *fname, unsigned int* buffer, int len) 199 | { 200 | FILE *in = fopen(fname, "r"); 201 | if (!in) { 202 | fprintf(stderr, "Failed to open %s.\n", fname); 203 | return -1; 204 | } 205 | 206 | size_t items = fread(buffer, sizeof(unsigned int), len, in); 207 | fclose(in); 208 | 209 | return items * sizeof(unsigned int); 210 | } 211 | -------------------------------------------------------------------------------- /QPU/SHA-256/final/qpufuncs.h: -------------------------------------------------------------------------------- 1 | #ifndef _QPUFUNCS_ 2 | #define _QPUFUNCS_ 3 | 4 | #define NUM_QPUS 12 5 | #define MAX_CODE_SIZE 24000 6 | 7 | int SHA256SetupQPU(uint32_t* K, uint32_t *data, uint32_t *H, int stride, 8 | unsigned *shader_code, unsigned code_len); 9 | int loadQPUCode(const char *fname, unsigned int* buffer, int len); 10 | void SHA256CleanupQPU(int handle); 11 | void SHA256ExecuteQPU(uint32_t* H, int nlaps); 12 | void SHA256FetchResult(uint32_t* H); 13 | volatile uint32_t* getRegisterMap(); 14 | 15 | #endif // _QPUFUNCS_ 16 | 
-------------------------------------------------------------------------------- /QPU/SHA-256/final/sha256.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "qpufuncs.h" 7 | 8 | #define QPU_CODE_FILE "sha256.bin" 9 | #define BUFFER_SIZE NUM_QPUS * 16 10 | 11 | 12 | static uint32_t K[] = { 13 | 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 14 | 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 15 | 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786, 16 | 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 17 | 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 18 | 0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 19 | 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b, 20 | 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 21 | 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 22 | 0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 23 | 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 24 | }; 25 | 26 | static inline uint32_t CH(uint32_t x, uint32_t y, uint32_t z) { 27 | return (x & y) ^ (~x & z); 28 | } 29 | 30 | static inline uint32_t Maj(uint32_t x, uint32_t y, uint32_t z) { 31 | return (x & y) ^ (x & z) ^ (y & z); 32 | } 33 | 34 | static inline uint32_t RotR(uint32_t x, uint8_t shift) { 35 | return (x >> shift) | (x << (32-shift)); 36 | } 37 | 38 | static inline uint32_t sigma0(uint32_t x) { 39 | return RotR(x, 2) ^ RotR(x, 13) ^ RotR(x, 22); 40 | } 41 | 42 | static inline uint32_t sigma1(uint32_t x) { 43 | return RotR(x, 6) ^ RotR(x, 11) ^ RotR(x, 25); 44 | } 45 | 46 | static inline uint32_t smsigma0(uint32_t x) { 47 | return RotR(x, 7) ^ RotR(x, 18) ^ (x >> 3); 48 | } 49 | 50 | static inline uint32_t smsigma1(uint32_t x) { 51 | return 
RotR(x, 17) ^ RotR(x, 19) ^ (x >> 10); 52 | } 53 | 54 | #define ENDIAN(x, i) ((x[i*4] << 24) | (x[i*4+1] << 16) | (x[i*4+2] << 8) | (x[i*4+3])) 55 | 56 | /* 57 | * data is an array of BUFFER_SIZE buffers to hash 58 | * H is an input/output parameter 59 | * stride is the stride for data (TODO: handle hashes of more than one block) 60 | */ 61 | void execute_sha256_cpu(uint32_t * data, uint32_t * H, int stride, int nlaps) 62 | { 63 | uint32_t W[64]; 64 | uint32_t a, b, c, d, e, f, g, h; 65 | 66 | for (int lap = 0; lap < nlaps; lap++) 67 | { 68 | for (int k=0; k < BUFFER_SIZE; k++) 69 | { 70 | for (int i=0; i < 16; i++) 71 | W[i] = data[k*stride+i]; 72 | for (int i=16; i < 64; i++) 73 | W[i] = smsigma1(W[i-2]) + W[i-7] + smsigma0(W[i-15]) + W[i-16]; 74 | 75 | a = H[k*8+0]; 76 | b = H[k*8+1]; 77 | c = H[k*8+2]; 78 | d = H[k*8+3]; 79 | e = H[k*8+4]; 80 | f = H[k*8+5]; 81 | g = H[k*8+6]; 82 | h = H[k*8+7]; 83 | 84 | for (int i=0; i < 64; i++) 85 | { 86 | uint32_t T1 = h + sigma1(e) + CH(e,f,g) + K[i] + W[i]; 87 | uint32_t T2 = sigma0(a) + Maj(a,b,c); 88 | h = g; 89 | g = f; 90 | f = e; 91 | e = d + T1; 92 | d = c; 93 | c = b; 94 | b = a; 95 | a = T1 + T2; 96 | } 97 | 98 | H[k*8+0] += a; 99 | H[k*8+1] += b; 100 | H[k*8+2] += c; 101 | H[k*8+3] += d; 102 | H[k*8+4] += e; 103 | H[k*8+5] += f; 104 | H[k*8+6] += g; 105 | H[k*8+7] += h; 106 | } 107 | } 108 | } 109 | 110 | 111 | /* 112 | * data is an array of BUFFER_SIZE buffers to hash 113 | * H is an input/output parameter 114 | * stride is the stride for data 115 | */ 116 | void execute_sha256_qpu(uint32_t *data, uint32_t *H, int stride, int nlaps) 117 | { 118 | SHA256ExecuteQPU(H, nlaps); 119 | } 120 | 121 | 122 | int main(int argc, char **argv) 123 | { 124 | bool run_cpu(true); 125 | if (argc < 3) { 126 | fprintf(stderr, "Usage: %s [-qpu]\n", argv[0]); 127 | return 1; 128 | } 129 | 130 | unsigned int shader_code[MAX_CODE_SIZE]; 131 | 132 | /* Load the QPU code */ 133 | int code_len = loadQPUCode(QPU_CODE_FILE, shader_code, 
MAX_CODE_SIZE); 134 | if (code_len < 1) { 135 | fprintf(stderr, "Unable to load QPU code from %s\n", QPU_CODE_FILE); 136 | return 2; 137 | } 138 | printf("Loaded %d bytes of QPU code.\n", code_len); 139 | 140 | /* Load the data to hash */ 141 | FILE *f_data = fopen(argv[1], "r"); 142 | if (!f_data) { 143 | fprintf(stderr, "Unable to open file %s\n", argv[1]); 144 | return 3; 145 | } 146 | 147 | int nlaps = atoi(argv[2]); 148 | printf("Running %d laps ...\n", nlaps); 149 | 150 | if (argc > 3 && (strcmp(argv[3], "-qpu") == 0)) 151 | run_cpu = false; 152 | 153 | int nblocks = 1; // 1 512-bit block for now 154 | int stride = nblocks * 16; 155 | uint32_t *buffer = new uint32_t[BUFFER_SIZE*stride]; 156 | uint32_t *H = new uint32_t[BUFFER_SIZE*8]; 157 | 158 | uint32_t H0[] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 159 | 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; 160 | char filebuffer[64]; // 1 512-bit blocks 161 | for (int i=0; i < BUFFER_SIZE; i++) 162 | { 163 | memcpy(H+i*8, H0, sizeof(H0)); 164 | memset(filebuffer, 0x0, sizeof(filebuffer)); 165 | // read a line up to 64-bytes long 166 | char *p = fgets(filebuffer, sizeof(filebuffer)-1, f_data); 167 | if (!p) { 168 | fprintf(stderr, "Failed to read enough lines from data file.\n"); 169 | delete[] H; 170 | delete[] buffer; 171 | return 4; 172 | } 173 | 174 | int bytes = strlen(filebuffer); 175 | filebuffer[bytes] = 0x80; // SHA-256 padding 176 | 177 | // last 8 bytes are the length of the initial message in bits 178 | uint8_t len_buffer[8]; 179 | *(uint64_t *)len_buffer = bytes * 8; 180 | for (int j=0; j < 8; j++) 181 | filebuffer[56+j] = len_buffer[7-j]; 182 | 183 | for (int j=0; j < 16; j++) 184 | buffer[i*16+j] = ENDIAN(filebuffer, j); 185 | } 186 | 187 | int handle = SHA256SetupQPU(K, buffer, H, stride, shader_code, code_len); 188 | if (handle < 0) { 189 | fprintf(stderr, "Unable to setup QPU. 
Check permissions\n"); 190 | delete[] buffer; 191 | delete[] H; 192 | return 4; 193 | } 194 | 195 | struct timeval start, end; 196 | gettimeofday(&start, NULL); 197 | 198 | /* 199 | * SHA-256 calculation here 200 | */ 201 | printf("Executing %s version ...\n", run_cpu ? "CPU" : "QPU"); 202 | for (int i=0; i < nblocks; i++) 203 | { 204 | if (run_cpu) 205 | execute_sha256_cpu(buffer+i*16, H, stride, nlaps); 206 | else 207 | execute_sha256_qpu(buffer+i*16, H, stride, nlaps); 208 | } 209 | 210 | gettimeofday(&end, NULL); 211 | 212 | float elapsed = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_usec - start.tv_usec); 213 | elapsed /= 1000.0; 214 | 215 | // print out the H 216 | if (!run_cpu) SHA256FetchResult(H); 217 | 218 | for (int i=0; i < BUFFER_SIZE; i++) { 219 | printf("%02d / SHA-256: ", i); 220 | for (int j=0; j < 8; j++) 221 | printf("%08x ", H[i*8+j]); 222 | printf("\n"); 223 | } 224 | 225 | printf("Time in ms: %f = %f hash/s\n", elapsed, BUFFER_SIZE * nlaps * 1000.0/elapsed); 226 | 227 | SHA256CleanupQPU(handle); 228 | 229 | delete[] buffer; 230 | delete[] H; 231 | } 232 | -------------------------------------------------------------------------------- /QPU/SHA-256/final/test-data.bin: -------------------------------------------------------------------------------- 1 | Line #1 2 | Line #2 3 | Line #3 4 | Line #4 5 | Line #5 6 | Line #6 7 | Line #7 8 | Line #8 9 | Line #9 10 | Line #10 11 | Line #11 12 | Line #12 13 | Line #13 14 | Line #14 15 | Line #15 16 | Line #16 17 | Line #17 18 | Line #18 19 | Line #19 20 | Line #20 21 | Line #21 22 | Line #22 23 | Line #23 24 | Line #24 25 | Line #25 26 | Line #26 27 | Line #27 28 | Line #28 29 | Line #29 30 | Line #30 31 | Line #31 32 | Line #32 33 | Line #33 34 | Line #34 35 | Line #35 36 | Line #36 37 | Line #37 38 | Line #38 39 | Line #39 40 | Line #40 41 | Line #41 42 | Line #42 43 | Line #43 44 | Line #44 45 | Line #45 46 | Line #46 47 | Line #47 48 | Line #48 49 | Line #49 50 | Line #50 51 | Line #51 52 | Line 
#52 53 | Line #53 54 | Line #54 55 | Line #55 56 | Line #56 57 | Line #57 58 | Line #58 59 | Line #59 60 | Line #60 61 | Line #61 62 | Line #62 63 | Line #63 64 | Line #64 65 | Line #65 66 | Line #66 67 | Line #67 68 | Line #68 69 | Line #69 70 | Line #70 71 | Line #71 72 | Line #72 73 | Line #73 74 | Line #74 75 | Line #75 76 | Line #76 77 | Line #77 78 | Line #78 79 | Line #79 80 | Line #80 81 | Line #81 82 | Line #82 83 | Line #83 84 | Line #84 85 | Line #85 86 | Line #86 87 | Line #87 88 | Line #88 89 | Line #89 90 | Line #90 91 | Line #91 92 | Line #92 93 | Line #93 94 | Line #94 95 | Line #95 96 | Line #96 97 | Line #97 98 | Line #98 99 | Line #99 100 | Line #100 101 | Line #101 102 | Line #102 103 | Line #103 104 | Line #104 105 | Line #105 106 | Line #106 107 | Line #107 108 | Line #108 109 | Line #109 110 | Line #110 111 | Line #111 112 | Line #112 113 | Line #113 114 | Line #114 115 | Line #115 116 | Line #116 117 | Line #117 118 | Line #118 119 | Line #119 120 | Line #120 121 | Line #121 122 | Line #122 123 | Line #123 124 | Line #124 125 | Line #125 126 | Line #126 127 | Line #127 128 | Line #128 129 | Line #129 130 | Line #130 131 | Line #131 132 | Line #132 133 | Line #133 134 | Line #134 135 | Line #135 136 | Line #136 137 | Line #137 138 | Line #138 139 | Line #139 140 | Line #140 141 | Line #141 142 | Line #142 143 | Line #143 144 | Line #144 145 | Line #145 146 | Line #146 147 | Line #147 148 | Line #148 149 | Line #149 150 | Line #150 151 | Line #151 152 | Line #152 153 | Line #153 154 | Line #154 155 | Line #155 156 | Line #156 157 | Line #157 158 | Line #158 159 | Line #159 160 | Line #160 161 | Line #161 162 | Line #162 163 | Line #163 164 | Line #164 165 | Line #165 166 | Line #166 167 | Line #167 168 | Line #168 169 | Line #169 170 | Line #170 171 | Line #171 172 | Line #172 173 | Line #173 174 | Line #174 175 | Line #175 176 | Line #176 177 | Line #177 178 | Line #178 179 | Line #179 180 | Line #180 181 | Line #181 182 | Line #182 183 | 
Line #183
184 | Line #184
185 | Line #185
186 | Line #186
187 | Line #187
188 | Line #188
189 | Line #189
190 | Line #190
191 | Line #191
192 | Line #192
193 |
--------------------------------------------------------------------------------
/QPU/SHA-256/partial/Makefile:
--------------------------------------------------------------------------------
1 | MBOX_C = /opt/vc/src/hello_pi/hello_fft/mailbox.c
2 | MBOX_INC = -I/opt/vc/src/hello_pi/hello_fft
3 | # The mailbox helper is compiled straight from the hello_fft sample directory named above
4 | sha256: sha256.cpp qpufuncs.cpp
5 | g++ -O3 -o sha256 sha256.cpp qpufuncs.cpp $(MBOX_C) $(MBOX_INC)
6 |
--------------------------------------------------------------------------------
/QPU/SHA-256/partial/partial.asm:
--------------------------------------------------------------------------------
1 | define(`NOP', `nop ra39, ra39, ra39; nop rb39, rb39, rb39')
2 |
3 | ## Move the uniforms (arguments) into registers
4 | or ra31, ra32, 0; nop # address of K in ra31
5 | or ra30, ra32, 0; nop # address of H in ra30
6 | or ra29, ra32, 0; nop # address of data in ra29
7 |
8 | ## Load some rotation constants that don't fit in small immediates
9 | ldi rb2, 0x16; # 22, the rotate amount used for RotR(a, 22) in sigma0 below
10 | ldi rb5, 0x19; # 25, the rotate amount used for RotR(e, 25) in sigma1 below
11 |
12 | ## VCD DMA setup for the H vectors (16x8)
13 | ldi ra49, 0x82801000
14 |
15 | ## Move the H vectors into the VPM (0,0 in VPM)
16 | or ra50, ra30, 0; nop
17 |
18 | ## Wait for the DMA to complete
19 | and rb39, ra50, ra50; nop
20 |
21 | ## Configure the VPM for reading the H vectors
22 | ldi ra49, 0x801200
23 |
24 | ## Read the H vectors into registers ra20..ra27 (these are the a..h)
25 | ## Also copy them into rb20..rb27 (we need the original values to write back)
26 | or ra20, ra48, 0; v8max rb20, ra48, ra48;
27 | or ra21, ra48, 0; v8max rb21, ra48, ra48;
28 | or ra22, ra48, 0; v8max rb22, ra48, ra48;
29 | or ra23, ra48, 0; v8max rb23, ra48, ra48;
30 | or ra24, ra48, 0; v8max rb24, ra48, ra48;
31 | or ra25, ra48, 0; v8max rb25, ra48, ra48;
32 | or ra26, ra48, 0; v8max rb26, ra48, ra48;
33 | or ra27, ra48, 0; v8max rb27, ra48, ra48;
34 |
35 | ## Configure the VPM/VCD to read the data vectors
36 | ldi ra49, 0x83001000
37 | or ra50, ra29, 0; nop ## Load address to DMA
38 | or rb39, ra50, 0; nop ## Wait for it
39 |
40 | ldi ra49, 0x1200 # presumably sets up the VPM reads that deliver W[i] through rb48 inside the loop - confirm
41 |
42 | ## First 16 loops of compression
43 | ldi ra2, 0x10; # ra2 = 16: loop counter, decremented at the bottom of the loop
44 | compress:
45 | ## r0 = K[i] + h
46 | or rb56, ra31, 0; nop # hand the current K address (ra31) to the lookup unit
47 | nop.tmu ra39, ra39, ra39; nop # presumably the signal that makes the fetched word available in r4 - confirm
48 | add ra31, ra31, 4; nop # advance the K pointer by one 32-bit word
49 | add rb32, r4, ra27; nop # r0 = r4 + h (h lives in ra27)
50 |
51 | ## T1 = h + K[i] + W[i]
52 | add rb18, r0, rb48; nop # T1 is kept in rb18 for the rest of the iteration
53 |
54 | ## T1 += CH(e,f,g) => (e & f) ^ (~e & g) (e: ra24, f: ra25, g: ra26)
55 | or ra32, ra24, 0; nop # load e into r0
56 | and ra33, ra25, r0; nop # r1 = r0 & f (e & f)
57 | not ra32, r0, 0; nop # r0 = ~r0 (~e)
58 | and ra32, r0, ra26; nop # r0 = r0 & g (~e & g)
59 | xor ra32, r0, r1; nop # r0 = r0 ^ r1 (e & f) ^ (~e & g)
60 | add rb18, rb18, r0; nop # accumulate into T1
61 |
62 | ## T1 += sigma1(e) => RotR(e, 6) ^ RotR(e, 11) ^ RotR(e, 25)
63 | ror rb32, ra24, 6; nop # r0 = RotR(e, 6)
64 | ror rb33, ra24, 11; nop # r1 = RotR(e, 11)
65 | ror rb34, ra24, rb5; nop # r2 = RotR(e, 25), rb5 holds 25
66 | xor rb32, r0, r1; nop # r0 = r0 ^ r1
67 | xor rb32, r0, r2; nop # r0 = sigma1(e)
68 | add rb18, r0, rb18; nop # T1 += sigma1(e)
69 |
70 | ## T2 (ra3) = sigma0(a) (a: ra20)
71 | ror ra32, ra20, 2; nop # r0 = RotR(a, 2)
72 | ror ra33, ra20, 13; nop # r1 = RotR(a, 13)
73 | xor ra32, r0, r1; nop # r0 = RotR(a, 2) ^ RotR(a, 13)
74 | ror ra33, ra20, rb2; nop # r1 = RotR(a, 22)
75 | xor ra3, r0, r1; nop # T2 = sigma0(a)
76 |
77 | ## T2 += Maj(a,b,c)
78 | or ra32, ra20, 0; nop # load a into r0
79 | and ra33, r0, ra21; nop # r1 = a & b
80 | and ra34, r0, ra22; nop # r2 = a & c
81 | xor ra32, r1, r2; nop # r0 = (a & b) ^ (a & c)
82 | or ra33, ra21, 0; nop # load b into r1
83 | and ra33, r1, ra22; nop # r1 = b & c
84 | xor ra32, r0, r1; nop # r0 = r0 ^ r1
85 | add ra3, ra3, r0; nop # T2 += Maj(a,b,c)
86 |
87 | ## swizzle
88 | or ra27, ra26, 0; nop # h = g
89 | or ra26, ra25, 0; nop # g = f
90 | or ra25, ra24, 0; nop # f = e
91 | add ra24, ra23, rb18; nop # e = d + T1
92 | or ra23, ra22, 0; nop # d = c
93 | or ra22, ra21, 0; nop # c = b
94 | or ra21, ra20, 0; nop # b = a
95 | add ra20, rb18, ra3; nop 96 | 97 | ## Loop 98 | sub ra2, ra2, 1; nop 99 | brr.ze ra39, compress 100 | NOP 101 | NOP 102 | NOP 103 | 104 | ## Configure the VPM to write the H vectors back into place 105 | ldi rb49, 0x1200 106 | 107 | ## Write H vectors back (+=) 108 | add rb48, ra20, rb20; nop 109 | add rb48, ra21, rb21; nop 110 | add rb48, ra22, rb22; nop 111 | add rb48, ra23, rb23; nop 112 | add rb48, ra24, rb24; nop 113 | add rb48, ra25, rb25; nop 114 | add rb48, ra26, rb26; nop 115 | add rb48, ra27, rb27; nop 116 | 117 | ## Configure the VCD for DMA back to the host 118 | ldi rb49, 0x88084000 119 | 120 | ## Write the H address to store 121 | or rb50, ra30, 0; nop 122 | 123 | ## Wait for the DMA to complete 124 | or rb39, rb50, ra39; nop 125 | 126 | ## Trigger a host interrupt to finish the program 127 | or rb38, ra39, rb39; nop 128 | 129 | nop.tend ra39, ra39, ra39; nop rb39, rb39, rb39 130 | NOP 131 | NOP 132 | -------------------------------------------------------------------------------- /QPU/SHA-256/partial/qpufuncs.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include // memset 8 | #include 9 | #include 10 | #include "mailbox.h" 11 | #include "qpufuncs.h" 12 | 13 | #define GPU_MEM_FLG 0xC // cached 14 | #define GPU_MEM_MAP 0x0 // cached 15 | #define NUNIFORMS 3 16 | 17 | 18 | struct sha256_memory_map 19 | { 20 | /* 21 | data layout is: 22 | 64 words for K constants (accessed as a texture lookup) 23 | 16x8 (128) words for the 8 H vectors (VPM) 24 | 16x16 (256) words for the input data (VPM) 25 | Total: 448 words 26 | */ 27 | uint32_t data[64 + (128 + 256) * NUM_QPUS]; 28 | uint32_t code[MAX_CODE_SIZE]; 29 | /* 30 | uniforms are: 31 | u1: address of K texture 32 | u2: address of H vectors (also output location) 33 | u3: address of data buffer 34 | u4: stride 35 | */ 36 | uint32_t uniforms[NUNIFORMS * NUM_QPUS]; 37 | uint32_t 
msg[NUM_QPUS*2]; // msg is a (uniform, code) tuple to execute_qpu 38 | 39 | /* results are placed back into data where the H vectors were read from */ 40 | }; 41 | 42 | 43 | static struct 44 | { 45 | int mb; 46 | unsigned handle; 47 | unsigned size; 48 | unsigned vc_msg; 49 | unsigned ptr; 50 | void* arm_ptr; 51 | } sha256_qpu_context; 52 | 53 | 54 | int SHA256SetupQPU(uint32_t* K, uint32_t *data, uint32_t *H, int stride, 55 | unsigned *shader_code, unsigned code_len) 56 | { 57 | sha256_qpu_context.mb = mbox_open(); 58 | if (qpu_enable(sha256_qpu_context.mb, 1)) { 59 | fprintf(stderr, "Unable to enable QPU\n"); 60 | return -1; 61 | } 62 | 63 | // 1 MB should be plenty 64 | sha256_qpu_context.size = 1024 * 1024; 65 | sha256_qpu_context.handle = mem_alloc(sha256_qpu_context.mb, 66 | sha256_qpu_context.size, 4096, 67 | GPU_MEM_FLG); 68 | if (!sha256_qpu_context.handle) { 69 | fprintf(stderr, "Unable to allocate %d bytes of GPU memory", 70 | sha256_qpu_context.size); 71 | return -2; 72 | } 73 | unsigned ptr = mem_lock(sha256_qpu_context.mb, sha256_qpu_context.handle); 74 | sha256_qpu_context.arm_ptr = mapmem(ptr + GPU_MEM_MAP, sha256_qpu_context.size); 75 | sha256_qpu_context.ptr = ptr; 76 | 77 | struct sha256_memory_map *arm_map = (struct sha256_memory_map *) 78 | sha256_qpu_context.arm_ptr; 79 | memset(arm_map, 0x0, sizeof(struct sha256_memory_map)); 80 | unsigned vc_data = ptr + offsetof(struct sha256_memory_map, data); 81 | unsigned vc_uniforms = ptr + offsetof(struct sha256_memory_map, uniforms); 82 | unsigned vc_code = ptr + offsetof(struct sha256_memory_map, code); 83 | sha256_qpu_context.vc_msg = ptr + offsetof(struct sha256_memory_map, msg); 84 | 85 | memcpy(arm_map->code, shader_code, code_len); 86 | memcpy(arm_map->data, K, 64*sizeof(uint32_t)); 87 | memcpy(arm_map->data+64, H, 128*sizeof(uint32_t)*NUM_QPUS); 88 | memcpy(arm_map->data+64 + 128*NUM_QPUS, data, 256*NUM_QPUS*sizeof(uint32_t)); 89 | for (int i=0; i < NUM_QPUS; i++) { 90 | 
arm_map->uniforms[i*NUNIFORMS+0] = vc_data; // data (address of K texture) 91 | arm_map->uniforms[i*NUNIFORMS+1] = vc_data + 64*sizeof(uint32_t) + 128 * i * sizeof(uint32_t); // address of H vectors 92 | arm_map->uniforms[i*NUNIFORMS+2] = vc_data + 64*sizeof(uint32_t) + 128*NUM_QPUS*sizeof(uint32_t) + 256 * i * sizeof(uint32_t); 93 | 94 | arm_map->msg[i*2+0] = vc_uniforms + i * NUNIFORMS * sizeof(uint32_t); 95 | arm_map->msg[i*2+1] = vc_code; 96 | } 97 | 98 | return sha256_qpu_context.mb; 99 | } 100 | 101 | 102 | void SHA256ExecuteQPU(uint32_t* H) 103 | { 104 | unsigned ret = execute_qpu(sha256_qpu_context.mb, NUM_QPUS, 105 | sha256_qpu_context.vc_msg, 1, 10000); 106 | if (ret != 0) 107 | fprintf(stderr, "Failed execute_qpu!\n"); 108 | } 109 | 110 | 111 | void SHA256CleanupQPU(int handle) 112 | { 113 | unmapmem(sha256_qpu_context.arm_ptr, sha256_qpu_context.size); 114 | mem_unlock(sha256_qpu_context.mb, sha256_qpu_context.handle); 115 | mem_free(sha256_qpu_context.mb, sha256_qpu_context.handle); 116 | qpu_enable(sha256_qpu_context.mb, 0); 117 | mbox_close(sha256_qpu_context.mb); 118 | } 119 | 120 | 121 | void SHA256FetchResult(uint32_t *H) 122 | { 123 | struct sha256_memory_map *arm_map = (struct sha256_memory_map *) 124 | sha256_qpu_context.arm_ptr; 125 | memcpy(H, arm_map->data+64, NUM_QPUS*128*sizeof(uint32_t)); 126 | } 127 | 128 | 129 | int loadQPUCode(const char *fname, unsigned int* buffer, int len) 130 | { 131 | FILE *in = fopen(fname, "r"); 132 | if (!in) { 133 | fprintf(stderr, "Failed to open %s.\n", fname); 134 | return -1; 135 | } 136 | 137 | size_t items = fread(buffer, sizeof(unsigned int), len, in); 138 | fclose(in); 139 | 140 | return items * sizeof(unsigned int); 141 | } 142 | -------------------------------------------------------------------------------- /QPU/SHA-256/partial/qpufuncs.h: -------------------------------------------------------------------------------- 1 | #ifndef _QPUFUNCS_ 2 | #define _QPUFUNCS_ 3 | 4 | #define NUM_QPUS 1 5 | 
#define MAX_CODE_SIZE 24000 /* in words */ 6 | 7 | int SHA256SetupQPU(uint32_t* K, uint32_t *data, uint32_t *H, int stride, 8 | unsigned *shader_code, unsigned code_len); 9 | int loadQPUCode(const char *fname, unsigned int* buffer, int len); 10 | void SHA256CleanupQPU(int handle); 11 | void SHA256ExecuteQPU(uint32_t* H); 12 | void SHA256FetchResult(uint32_t* H); 13 | 14 | #endif // _QPUFUNCS_ 15 | -------------------------------------------------------------------------------- /QPU/SHA-256/partial/sha256.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "qpufuncs.h" 7 | 8 | #define QPU_CODE_FILE "sha256.bin" 9 | #define NUM_QPUS 1 10 | #define BUFFER_SIZE NUM_QPUS * 16 11 | 12 | static uint32_t K[] = { 13 | 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 14 | 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 15 | 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786, 16 | 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 17 | 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 18 | 0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 19 | 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b, 20 | 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 21 | 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 22 | 0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 23 | 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 24 | }; 25 | 26 | static inline uint32_t CH(uint32_t x, uint32_t y, uint32_t z) { 27 | return (x & y) ^ (~x & z); 28 | } 29 | 30 | static inline uint32_t Maj(uint32_t x, uint32_t y, uint32_t z) { 31 | return (x & y) ^ (x & z) ^ (y & z); 32 | } 33 | 34 | static inline uint32_t RotR(uint32_t x, uint8_t shift) { 35 | return (x >> shift) | (x << 
(32-shift)); 36 | } 37 | 38 | static inline uint32_t sigma0(uint32_t x) { 39 | return RotR(x, 2) ^ RotR(x, 13) ^ RotR(x, 22); 40 | } 41 | 42 | static inline uint32_t sigma1(uint32_t x) { 43 | return RotR(x, 6) ^ RotR(x, 11) ^ RotR(x, 25); 44 | } 45 | 46 | static inline uint32_t smsigma0(uint32_t x) { 47 | return RotR(x, 7) ^ RotR(x, 18) ^ (x >> 3); 48 | } 49 | 50 | static inline uint32_t smsigma1(uint32_t x) { 51 | return RotR(x, 17) ^ RotR(x, 19) ^ (x >> 10); 52 | } 53 | 54 | #define ENDIAN(x, i) ((x[i*4] << 24) | (x[i*4+1] << 16) | (x[i*4+2] << 8) | (x[i*4+3])) 55 | 56 | /* 57 | * data is an array of BUFFER_SIZE buffers to hash 58 | * H is an input/output parameter 59 | * stride is the stride for data (TODO: handle hashes of more than one block) 60 | */ 61 | void execute_sha256_cpu(uint32_t *data, uint32_t *H, int stride) 62 | { 63 | uint32_t W[64]; 64 | uint32_t a, b, c, d, e, f, g, h; 65 | 66 | for (int k=0; k < BUFFER_SIZE; k++) 67 | { 68 | for (int i=0; i < 16; i++) 69 | W[i] = data[k*stride+i]; 70 | for (int i=16; i < 64; i++) 71 | W[i] = smsigma1(W[i-2]) + W[i-7] + smsigma0(W[i-15]) + W[i-16]; 72 | 73 | a = H[k*8+0]; 74 | b = H[k*8+1]; 75 | c = H[k*8+2]; 76 | d = H[k*8+3]; 77 | e = H[k*8+4]; 78 | f = H[k*8+5]; 79 | g = H[k*8+6]; 80 | h = H[k*8+7]; 81 | 82 | /* 83 | * NOTE: We are only computing the first 16 iterations so that we can 84 | * ignore schedule generation and test correctness of this compression 85 | * loop. 
86 | */ 87 | for (int i=0; i < 16; i++) 88 | { 89 | uint32_t T1 = h + sigma1(e) + CH(e,f,g) + K[i] + W[i]; 90 | uint32_t T2 = sigma0(a) + Maj(a,b,c); 91 | h = g; 92 | g = f; 93 | f = e; 94 | e = d + T1; 95 | d = c; 96 | c = b; 97 | b = a; 98 | a = T1 + T2; 99 | } 100 | 101 | H[k*8+0] += a; 102 | H[k*8+1] += b; 103 | H[k*8+2] += c; 104 | H[k*8+3] += d; 105 | H[k*8+4] += e; 106 | H[k*8+5] += f; 107 | H[k*8+6] += g; 108 | H[k*8+7] += h; 109 | } 110 | } 111 | 112 | 113 | void execute_sha256_qpu(uint32_t *data, uint32_t *H, int stride) 114 | { 115 | SHA256ExecuteQPU(H); 116 | } 117 | 118 | 119 | int main(int argc, char **argv) 120 | { 121 | unsigned int shader_code[MAX_CODE_SIZE]; 122 | bool run_cpu(true); 123 | 124 | if (argc < 2) { 125 | fprintf(stderr, "Usage: %s [-qpu]\n", argv[0]); 126 | return 1; 127 | } 128 | if (argc > 2 && (strcmp(argv[2], "-qpu") == 0)) 129 | run_cpu = false; 130 | 131 | /* Load the data to hash */ 132 | FILE *f_data = fopen(argv[1], "r"); 133 | if (!f_data) { 134 | fprintf(stderr, "Unable to open file %s\n", argv[1]); 135 | return 3; 136 | } 137 | 138 | /* Load the QPU code */ 139 | int code_len = loadQPUCode(QPU_CODE_FILE, shader_code, MAX_CODE_SIZE); 140 | if (code_len < 1) { 141 | fprintf(stderr, "Unable to load QPU code from %s\n", QPU_CODE_FILE); 142 | return 2; 143 | } 144 | printf("Loaded %d bytes of QPU code.\n", code_len); 145 | 146 | int nblocks = 1; // 1 512-bit block for now 147 | int stride = nblocks * 16; 148 | uint32_t *buffer = new uint32_t[BUFFER_SIZE*stride]; 149 | uint32_t *H = new uint32_t[BUFFER_SIZE*8]; 150 | 151 | uint32_t H0[] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 152 | 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; 153 | char filebuffer[64]; // 1 512-bit blocks 154 | for (int i=0; i < BUFFER_SIZE; i++) 155 | { 156 | memcpy(H+i*8, H0, sizeof(H0)); 157 | memset(filebuffer, 0x0, sizeof(filebuffer)); 158 | // read a line up to 64-bytes long 159 | char *p = fgets(filebuffer, sizeof(filebuffer)-1, f_data); 
160 | if (!p) { 161 | fprintf(stderr, "Failed to read enough lines from data file.\n"); 162 | delete[] H; 163 | delete[] buffer; 164 | return 4; 165 | } 166 | 167 | int bytes = strlen(filebuffer); 168 | filebuffer[bytes] = 0x80; // SHA-256 padding 169 | 170 | // last 8 bytes are the length of the initial message in bits 171 | uint8_t len_buffer[8]; 172 | *(uint64_t *)len_buffer = bytes * 8; 173 | for (int j=0; j < 8; j++) 174 | filebuffer[56+j] = len_buffer[7-j]; 175 | 176 | for (int j=0; j < 16; j++) 177 | buffer[i*16+j] = ENDIAN(filebuffer, j); 178 | } 179 | 180 | int handle = SHA256SetupQPU(K, buffer, H, stride, shader_code, code_len); 181 | if (handle < 0) { 182 | fprintf(stderr, "Unable to setup QPU. Check permissions\n"); 183 | delete[] buffer; 184 | delete[] H; 185 | return 4; 186 | } 187 | 188 | /* 189 | * SHA-256 calculation here 190 | */ 191 | for (int i=0; i < nblocks; i++) 192 | { 193 | printf("Running %s version ...\n", (run_cpu) ? "CPU" : "QPU"); 194 | if (run_cpu) 195 | execute_sha256_cpu(buffer+i*16, H, stride); 196 | else 197 | execute_sha256_qpu(buffer+i*16, H, stride); 198 | } 199 | 200 | if (!run_cpu) 201 | SHA256FetchResult(H); 202 | 203 | // print out the H 204 | for (int i=0; i < BUFFER_SIZE; i++) { 205 | printf("%02d / SHA-256: ", i); 206 | for (int j=0; j < 8; j++) 207 | printf("%08x ", H[i*8+j]); 208 | printf("\n"); 209 | } 210 | 211 | SHA256CleanupQPU(handle); 212 | 213 | delete[] buffer; 214 | delete[] H; 215 | } 216 | -------------------------------------------------------------------------------- /QPU/SHA-256/partial/test-data.bin: -------------------------------------------------------------------------------- 1 | this is line number one 2 | this is line number two 3 | this is line number three 4 | this is line number four 5 | this is line number five 6 | this is line number six 7 | this is line number seven 8 | this is line number eight 9 | this is line number nine 10 | this is line number ten 11 | this is line number eleven 12 | 
this is line number twelve 13 | this is line number thirteen 14 | this is line number fourteen 15 | this is line number fifteen 16 | this is line number sixteen 17 | -------------------------------------------------------------------------------- /QPU/SHA-256/reference/Makefile: -------------------------------------------------------------------------------- 1 | sha256: sha256.cpp 2 | g++ -O3 -o sha256 sha256.cpp 3 | -------------------------------------------------------------------------------- /QPU/SHA-256/reference/sha256.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #define NUM_QPUS 1 8 | #define BUFFER_SIZE NUM_QPUS * 16 9 | 10 | static uint32_t K[] = { 11 | 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 12 | 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 13 | 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786, 14 | 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 15 | 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 16 | 0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 17 | 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b, 18 | 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 19 | 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 20 | 0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 21 | 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 22 | }; 23 | 24 | static inline uint32_t CH(uint32_t x, uint32_t y, uint32_t z) { 25 | return (x & y) ^ (~x & z); 26 | } 27 | 28 | static inline uint32_t Maj(uint32_t x, uint32_t y, uint32_t z) { 29 | return (x & y) ^ (x & z) ^ (y & z); 30 | } 31 | 32 | static inline uint32_t RotR(uint32_t x, uint8_t shift) { 33 | return (x >> shift) | (x << (32-shift)); 34 | } 35 | 36 | static inline 
uint32_t sigma0(uint32_t x) { 37 | return RotR(x, 2) ^ RotR(x, 13) ^ RotR(x, 22); 38 | } 39 | 40 | static inline uint32_t sigma1(uint32_t x) { 41 | return RotR(x, 6) ^ RotR(x, 11) ^ RotR(x, 25); 42 | } 43 | 44 | static inline uint32_t smsigma0(uint32_t x) { 45 | return RotR(x, 7) ^ RotR(x, 18) ^ (x >> 3); 46 | } 47 | 48 | static inline uint32_t smsigma1(uint32_t x) { 49 | return RotR(x, 17) ^ RotR(x, 19) ^ (x >> 10); 50 | } 51 | 52 | #define ENDIAN(x, i) ((x[i*4] << 24) | (x[i*4+1] << 16) | (x[i*4+2] << 8) | (x[i*4+3])) 53 | 54 | /* 55 | * data is an array of BUFFER_SIZE buffers to hash 56 | * H is an input/output parameter 57 | * stride is the stride for data (TODO: handle hashes of more than one block) 58 | */ 59 | void execute_sha256_cpu(uint32_t *data, uint32_t *H, int stride) 60 | { 61 | uint32_t W[64]; 62 | uint32_t a, b, c, d, e, f, g, h; 63 | 64 | for (int k=0; k < BUFFER_SIZE; k++) 65 | { 66 | for (int i=0; i < 16; i++) 67 | W[i] = data[k*stride+i]; 68 | for (int i=16; i < 64; i++) 69 | W[i] = smsigma1(W[i-2]) + W[i-7] + smsigma0(W[i-15]) + W[i-16]; 70 | 71 | a = H[k*8+0]; 72 | b = H[k*8+1]; 73 | c = H[k*8+2]; 74 | d = H[k*8+3]; 75 | e = H[k*8+4]; 76 | f = H[k*8+5]; 77 | g = H[k*8+6]; 78 | h = H[k*8+7]; 79 | 80 | for (int i=0; i < 64; i++) 81 | { 82 | uint32_t T1 = h + sigma1(e) + CH(e,f,g) + K[i] + W[i]; 83 | uint32_t T2 = sigma0(a) + Maj(a,b,c); 84 | h = g; 85 | g = f; 86 | f = e; 87 | e = d + T1; 88 | d = c; 89 | c = b; 90 | b = a; 91 | a = T1 + T2; 92 | } 93 | 94 | H[k*8+0] += a; 95 | H[k*8+1] += b; 96 | H[k*8+2] += c; 97 | H[k*8+3] += d; 98 | H[k*8+4] += e; 99 | H[k*8+5] += f; 100 | H[k*8+6] += g; 101 | H[k*8+7] += h; 102 | } 103 | } 104 | 105 | 106 | int main(int argc, char **argv) 107 | { 108 | bool run_cpu(true); 109 | if (argc < 2) { 110 | fprintf(stderr, "Usage: %s \n", argv[0]); 111 | return 1; 112 | } 113 | 114 | /* Load the data to hash */ 115 | FILE *f_data = fopen(argv[1], "r"); 116 | if (!f_data) { 117 | fprintf(stderr, "Unable to open file 
%s\n", argv[1]); 118 | return 3; 119 | } 120 | 121 | int nblocks = 1; // 1 512-bit block for now 122 | int stride = nblocks * 16; 123 | uint32_t *buffer = new uint32_t[BUFFER_SIZE*stride]; 124 | uint32_t *H = new uint32_t[BUFFER_SIZE*8]; 125 | 126 | uint32_t H0[] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 127 | 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; 128 | char filebuffer[64]; // 1 512-bit blocks 129 | for (int i=0; i < BUFFER_SIZE; i++) 130 | { 131 | memcpy(H+i*8, H0, sizeof(H0)); 132 | memset(filebuffer, 0x0, sizeof(filebuffer)); 133 | // read a line up to 64-bytes long 134 | char *p = fgets(filebuffer, sizeof(filebuffer)-1, f_data); 135 | if (!p) { 136 | fprintf(stderr, "Failed to read enough lines from data file.\n"); 137 | delete[] H; 138 | delete[] buffer; 139 | return 4; 140 | } 141 | 142 | int bytes = strlen(filebuffer); 143 | filebuffer[bytes] = 0x80; // SHA-256 padding 144 | 145 | // last 8 bytes are the length of the initial message in bits 146 | uint8_t len_buffer[8]; 147 | *(uint64_t *)len_buffer = bytes * 8; 148 | for (int j=0; j < 8; j++) 149 | filebuffer[56+j] = len_buffer[7-j]; 150 | 151 | for (int j=0; j < 16; j++) 152 | buffer[i*16+j] = ENDIAN(filebuffer, j); 153 | } 154 | 155 | /* 156 | * SHA-256 calculation here 157 | */ 158 | for (int i=0; i < nblocks; i++) 159 | { 160 | execute_sha256_cpu(buffer+i*16, H, stride); 161 | } 162 | 163 | for (int i=0; i < BUFFER_SIZE; i++) { 164 | printf("%02d / SHA-256: ", i); 165 | for (int j=0; j < 8; j++) 166 | printf("%08x ", H[i*8+j]); 167 | printf("\n"); 168 | } 169 | 170 | delete[] buffer; 171 | delete[] H; 172 | } 173 | -------------------------------------------------------------------------------- /QPU/SHA-256/reference/test-data.bin: -------------------------------------------------------------------------------- 1 | this is line number one 2 | this is line number two 3 | this is line number three 4 | this is line number four 5 | this is line number five 6 | this is line number six 7 
| this is line number seven 8 | this is line number eight 9 | this is line number nine 10 | this is line number ten 11 | this is line number eleven 12 | this is line number twelve 13 | this is line number thirteen 14 | this is line number fourteen 15 | this is line number fifteen 16 | this is line number sixteen 17 | -------------------------------------------------------------------------------- /QPU/assembler/Makefile: -------------------------------------------------------------------------------- 1 | qpu-assembler: assemble.cpp 2 | g++ -g -o qpu-assembler assemble.cpp 3 | -------------------------------------------------------------------------------- /QPU/assembler/assemble.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include // for getopt() 8 | 9 | using namespace std; 10 | 11 | enum token_t { 12 | END=-1, 13 | WORD, 14 | DOT, 15 | COMMA, 16 | SEMI, 17 | COLON, 18 | }; 19 | 20 | struct QPUreg { 21 | enum { A, B, ACCUM, SMALL } file; 22 | int num; 23 | }; 24 | 25 | struct relocation { 26 | string label; 27 | int pc; 28 | }; 29 | 30 | struct context { 31 | const char *stream; 32 | map labels; 33 | int pc; 34 | vector relocations; 35 | }; 36 | 37 | 38 | static string addOps[] = { 39 | "nop", "fadd", "fsub", "fmin", "fmax", "fminabs", "fmaxabs", 40 | "ftoi", "itof", "XXX", "XXX", "XXX", "add", "sub", "shr", 41 | "asr", "ror", "shl", "min", "max", "and", "or", "xor", "not", 42 | "clz", "XXX", "XXX", "XXX", "XXX", "XXX", "v8adds", "v8subs" }; 43 | 44 | static string mulOps[] = { 45 | "nop", "fmul", "mul24", "v8muld", "v8min", "v8max", "v8adds", 46 | "v8subs" }; 47 | 48 | static uint8_t addOpCode(const string& word) 49 | { 50 | for (int i=0; i < 32; i++) { 51 | if (word == addOps[i]) 52 | return i; 53 | } 54 | 55 | return 0xFF; 56 | } 57 | 58 | static uint8_t mulOpCode(const string& word) 59 | { 60 | for (int i=0; i < 8; i++) { 61 | if (word == 
mulOps[i]) 62 | return i; 63 | } 64 | 65 | return 0xFF; 66 | } 67 | 68 | 69 | bool isRegisterWord(const string& word) { return word[0] == 'r'; } 70 | 71 | string printRegister(const QPUreg& reg) 72 | { 73 | char buffer[32]; 74 | if (reg.file == QPUreg::A || reg.file == QPUreg::B) { 75 | snprintf(buffer, 32, "r%c%d", (reg.file == QPUreg::A) ? 'a' : 'b', 76 | reg.num); 77 | } 78 | else if (reg.file == QPUreg::ACCUM) { 79 | snprintf(buffer, 32, "r%d", reg.num); 80 | } 81 | else { 82 | snprintf(buffer, 32, ".0x%x.", reg.num); 83 | } 84 | 85 | return buffer; 86 | } 87 | 88 | bool parseRegister(const string& word, QPUreg& reg) 89 | { 90 | if (word[0] != 'r') 91 | return false; 92 | 93 | int offset = 0; 94 | switch (word[1]) { 95 | case 'a': reg.file = QPUreg::A; offset = 2; break; 96 | case 'b': reg.file = QPUreg::B; offset = 2; break; 97 | default: 98 | reg.file = QPUreg::ACCUM; 99 | offset = 1; 100 | } 101 | // TODO: check that this is in range. (ACCUM < 6, e.g.) 102 | reg.num = atoi(word.c_str() + offset); 103 | 104 | return true; 105 | } 106 | 107 | uint32_t parseImmediate(const string& str) 108 | { 109 | // if there is an 'x' we assume it's hex. 
110 | if (str.find_first_of("x") != string::npos) 111 | return strtoul(str.c_str(), NULL, 16); 112 | 113 | if (str.find_first_of(".f") != string::npos) { 114 | float f = strtof(str.c_str(), NULL); 115 | return *(uint32_t*)&f; 116 | } 117 | 118 | // otherwise decimal 119 | return strtoul(str.c_str(), NULL, 10); 120 | } 121 | 122 | uint8_t parseBranchCond(const string& str) 123 | { 124 | if (str == "zf") // all z flags set ("z full") 125 | return 0x0; 126 | if (str == "ze") // all z flags clear ("z empty") 127 | return 0x1; 128 | if (str == "zs") // any z flags set ("z set") 129 | return 0x2; 130 | if (str == "zc") // any z flags clear ("z clear") 131 | return 0x3; 132 | if (str == "nf") // all N flags set ("N full") 133 | return 0x4; 134 | if (str == "ne") // all N flags clear ("N empty") 135 | return 0x5; 136 | if (str == "ns") // any N flags set ("N set") 137 | return 0x6; 138 | if (str == "nc") // any N flags clear ("N clear") 139 | return 0x7; 140 | if (str == "cf") // all C flags set ("C full") 141 | return 0x8; 142 | if (str == "ce") // all C flags clear ("C empty") 143 | return 0x9; 144 | if (str == "cs") // any C flags set ("C set") 145 | return 0xa; 146 | if (str == "cc") // any C flags clear ("C clear") 147 | return 0xb; 148 | if (str == "*") // always 149 | return 0xf; 150 | 151 | // throw some exceptions 152 | cerr << "Invalid branch condition: " << str << endl; 153 | exit(0); 154 | } 155 | 156 | uint8_t setALUMux(const QPUreg& reg) 157 | { 158 | switch (reg.file) { 159 | case QPUreg::A: return 0x6; 160 | case QPUreg::B: return 0x7; 161 | case QPUreg::ACCUM: 162 | if (reg.num > 6 || reg.num < 0) { 163 | cerr << "Invalid accumulator register; out of range" << endl; 164 | exit(0); 165 | } 166 | return reg.num; 167 | case QPUreg::SMALL: return 0x7; 168 | } 169 | } 170 | 171 | 172 | token_t nextToken(const char *stream, string& out, const char **ptr) 173 | { 174 | char buffer[128]; 175 | int i = 0; 176 | 177 | *ptr = stream; 178 | if (!stream || !*stream) 
179 | return END; 180 | 181 | while (*stream == ' ' || *stream == '\t') 182 | stream++; 183 | 184 | if (*stream == '\0') 185 | return END; 186 | 187 | if (isdigit(*stream)) 188 | { 189 | // read until we don't find a hex digit, x (for hex) or . 190 | while (isxdigit(*stream) || isdigit(*stream) || *stream == '.' || *stream == 'x') { 191 | buffer[i++] = *stream++; 192 | if (*stream == 0 || i > sizeof(buffer) - 1) 193 | break; 194 | } 195 | buffer[i++] = '\0'; 196 | out = buffer; 197 | *ptr = stream; 198 | 199 | return WORD; 200 | } 201 | 202 | if (*stream == '.') { *ptr = stream+1; return DOT; } 203 | if (*stream == ',') { *ptr = stream+1; return COMMA; } 204 | if (*stream == ';') { *ptr = stream+1; return SEMI; } 205 | if (*stream == '#') { *ptr = stream+1; return END; } 206 | if (*stream == ':') { *ptr = stream+1; return COLON; } 207 | 208 | while (*stream != '.' && *stream != ',' && *stream != ';' 209 | && *stream != ' ' && *stream != '\t' 210 | && *stream != ':') 211 | { 212 | buffer[i++] = *stream++; 213 | if (*stream == 0 || i > sizeof(buffer)-1) 214 | break; 215 | } 216 | 217 | buffer[i++] = '\0'; 218 | out = buffer; 219 | *ptr = stream; 220 | 221 | return WORD; 222 | } 223 | 224 | 225 | bool aluHelper(const char *stream, QPUreg& dest, QPUreg& r1, QPUreg& r2, uint8_t& sig, const char **ptr) 226 | { 227 | string token_str; 228 | token_t tok = nextToken(stream, token_str, &stream); 229 | 230 | if (tok == DOT) { 231 | // conditional 232 | nextToken(stream, token_str, &stream); 233 | cout << "flag/conditional = " << token_str << endl; 234 | if (token_str == "tmu") 235 | sig = 10; 236 | else if (token_str == "tend") 237 | sig = 3; 238 | tok = nextToken(stream, token_str, &stream); 239 | } 240 | 241 | // this is supposed to be the destination register 242 | if (tok != WORD) { 243 | cout << "Expecting word. 
Got: " << token_str << endl; 244 | return false; 245 | } 246 | 247 | parseRegister(token_str, dest); 248 | tok = nextToken(stream, token_str, &stream); 249 | if (tok != COMMA) return false; 250 | tok = nextToken(stream, token_str, &stream); 251 | parseRegister(token_str, r1); 252 | 253 | tok = nextToken(stream, token_str, &stream); 254 | if (tok != COMMA) return false; 255 | tok = nextToken(stream, token_str, &stream); 256 | if (!parseRegister(token_str, r2)) { 257 | r2.file = QPUreg::SMALL; 258 | uint32_t imm = parseImmediate(token_str); 259 | // double check handle negative values 260 | if (imm < 16) 261 | r2.num = imm; 262 | else { 263 | cerr << "TODO: Unhandled small immediate" << endl; 264 | return false; 265 | } 266 | } 267 | 268 | /* 269 | cout << "dest: " << printRegister(dest) << ", r1: " 270 | << printRegister(r1) << ", r2: " 271 | << printRegister(r2) << endl; 272 | */ 273 | 274 | *ptr = stream; 275 | return true; 276 | } 277 | 278 | 279 | uint64_t assembleALU(context& ctx, string word) 280 | { 281 | string token_str; 282 | uint8_t add_op = addOpCode(word); 283 | if (add_op == 0xFF) { 284 | cout << "FATAL (assert). Bad opcode" << endl; 285 | return -1; 286 | } 287 | 288 | QPUreg addDest, addR1, addR2; 289 | QPUreg mulDest, mulR1, mulR2; 290 | 291 | uint8_t sig = 0x1; // no-signal (TODO: plumb signals through) 292 | if (!aluHelper(ctx.stream, addDest, addR1, addR2, sig, &ctx.stream)) 293 | return -1; 294 | 295 | token_t tok = nextToken(ctx.stream, token_str, &ctx.stream); 296 | // this should be a semi-colon 297 | tok = nextToken(ctx.stream, token_str, &ctx.stream); 298 | uint8_t mul_op = mulOpCode(token_str); 299 | if (mul_op == 0xFF) { 300 | cout << "FATAL (assert). Bad opcode" << endl; 301 | return -1; 302 | } 303 | 304 | bool skipParseMul(false); 305 | if (mul_op == 0) { 306 | // nop. 
If the next token is a semi or END, we'll generate 307 | // the registers for them 308 | const char *discard; 309 | tok = nextToken(ctx.stream, token_str, &discard); 310 | if (tok == END || tok == SEMI) { 311 | mulDest.num = 39; 312 | mulDest.file = (addDest.file == QPUreg::A) ? QPUreg::B : QPUreg::A; 313 | mulR1 = addR1; 314 | mulR2 = addR2; 315 | skipParseMul = true; 316 | } 317 | } 318 | 319 | if (!skipParseMul) { 320 | uint8_t junk; 321 | if (!aluHelper(ctx.stream, mulDest, mulR1, mulR2, junk, &ctx.stream)) 322 | return -1; 323 | } 324 | 325 | uint64_t ins = 0x0; 326 | uint8_t cond_add = 0x1; 327 | uint8_t cond_mul = 0x1; 328 | uint8_t sf = 0x1; 329 | if (add_op == 0) 330 | sf = 0x0; // no set flags on nop 331 | 332 | // TODO: constraints. We can only read from file A and file B once (dual-port) 333 | 334 | uint8_t ws = 0x0; 335 | // If the add pipe specifies file b for output, ws = 1 336 | if (addDest.file == QPUreg::B) 337 | ws = 0x1; 338 | // if ws == 1, mul pipe must specify file a for output 339 | if (ws == 0x1 && mulDest.file != QPUreg::A) { 340 | cout << "constraint check failed. mul pipe must specify register file A when write-swap set" << endl; 341 | return -1; 342 | } 343 | // if ws == 0, mul pipe must specify file b for output 344 | if (ws == 0x0 && mulDest.file != QPUreg::B) { 345 | cout << "constraint check failed. 
mul pipe must specify register file B when write-swap clear" << endl; 346 | return -1; 347 | } 348 | 349 | // TODO: handle the accumulators and the small immediate 350 | uint8_t read_a = 0x0; 351 | if (addR1.file == QPUreg::A) read_a = addR1.num; 352 | else if (addR2.file == QPUreg::A) read_a = addR2.num; 353 | else if (mulR1.file == QPUreg::A) read_a = mulR1.num; 354 | else if (mulR2.file == QPUreg::A) read_a = mulR2.num; 355 | 356 | uint8_t read_b = 0x0; 357 | if (addR1.file == QPUreg::B) read_b = addR1.num; 358 | else if (addR2.file == QPUreg::B) read_b = addR2.num; 359 | else if (mulR1.file == QPUreg::B) read_b = mulR1.num; 360 | else if (mulR2.file == QPUreg::B) read_b = mulR2.num; 361 | 362 | // checks: 363 | // read_a not set and one of the muxes specifies file A ... 364 | // same for read_b 365 | // read_b set and there is a small immediate value 366 | 367 | // we could have immediates in the first register slot but not sure it makes sense 368 | // As above, we should check that read_b is not already set 369 | if (addR2.file == QPUreg::SMALL) { read_b = addR2.num; sig = 13; } 370 | if (mulR2.file == QPUreg::SMALL) { read_b = mulR2.num; sig = 13; } 371 | 372 | uint8_t add_a = setALUMux(addR1) & 0x7; 373 | uint8_t add_b = setALUMux(addR2) & 0x7; 374 | uint8_t mul_a = setALUMux(mulR1) & 0x7; 375 | uint8_t mul_b = setALUMux(mulR2) & 0x7; 376 | read_a &= 0x3f; 377 | read_b &= 0x3f; 378 | mul_op &= 0x7; 379 | add_op &= 0x1f; 380 | addDest.num &= 0x3f; 381 | mulDest.num &= 0x3f; 382 | cond_add &= 0x7; 383 | cond_mul &= 0x7; 384 | sf &= 0x1; 385 | ws &= 0x1; 386 | 387 | printf("Assembling ALU instruction: %s, %d, %d\n", printRegister(addDest).c_str(), ws, sig); 388 | 389 | ins = ((uint64_t)sig << 60) | ((uint64_t)cond_add << 49) | ((uint64_t)cond_mul << 46) | ((uint64_t)sf << 45) | ((uint64_t)ws << 44); 390 | ins |= ((uint64_t)addDest.num << 38) | ((uint64_t)mulDest.num << 32) | ((uint64_t)mul_op << 29) | ((uint64_t)add_op << 24); 391 | ins |= ((uint64_t)read_a << 
18) | ((uint64_t)read_b << 12) | ((uint64_t)add_a << 9) | ((uint64_t)add_b << 6) | ((uint64_t)mul_a << 3) | mul_b; 392 | 393 | return ins; 394 | } 395 | 396 | uint64_t assembleLDI(context& ctx, string word) 397 | { 398 | cout << "Assembling LDI instruction ... " << endl; 399 | 400 | string token_str; 401 | token_t tok = nextToken(ctx.stream, token_str, &ctx.stream); 402 | 403 | if (tok == DOT) { 404 | // conditional ... conditionals should be on each register ? 405 | cout << "conditional ... "; 406 | // chew the conditional 407 | nextToken(ctx.stream, token_str, &ctx.stream); 408 | 409 | tok = nextToken(ctx.stream, token_str, &ctx.stream); 410 | } 411 | 412 | // this is supposed to be the register 413 | if (tok != WORD) return -1; 414 | 415 | QPUreg register1, register2; 416 | // check errors here 417 | parseRegister(token_str, register1); 418 | tok = nextToken(ctx.stream, token_str, &ctx.stream); 419 | if (tok != COMMA) return -1; 420 | tok = nextToken(ctx.stream, token_str, &ctx.stream); 421 | 422 | // this can either be another register 423 | // (in which case we'll use both ALUs to set) 424 | // or an immediate value (in which case we'll use rX39) 425 | register2.num = 39; 426 | register2.file = (register1.file == QPUreg::A) ? QPUreg::B : QPUreg::A; 427 | if (isRegisterWord(token_str)) { 428 | parseRegister(token_str, register2); 429 | tok = nextToken(ctx.stream, token_str, &ctx.stream); 430 | // check that this is a comma ... 
431 | } 432 | 433 | tok = nextToken(ctx.stream, token_str, &ctx.stream); 434 | unsigned int immediate = parseImmediate(token_str); 435 | 436 | cout << "r1: " << printRegister(register1) << ", r2: " 437 | << printRegister(register2) << ", immed: 0x" 438 | << hex << immediate << dec << endl; 439 | 440 | while (nextToken(ctx.stream, token_str, &ctx.stream) != END) 441 | ; 442 | 443 | uint32_t high = (uint32_t)0xE00 << 20; 444 | high |= (uint32_t)0x1 << 17; // cond_add 445 | high |= (uint32_t)0x1 << 14; // cond_mul 446 | high |= (uint32_t)0x0 << 13; // sf 447 | high |= (uint32_t)0x0 << 12; // ws 448 | uint8_t addreg = (register1.file == QPUreg::A) ? register1.num : register2.num; 449 | uint8_t mulreg = (register1.file == QPUreg::B) ? register1.num : register2.num; 450 | high |= (uint32_t)addreg << 6; 451 | high |= mulreg; 452 | uint64_t ins = ((uint64_t)high << 32) | immediate; 453 | 454 | return ins; 455 | } 456 | 457 | uint64_t assembleBRANCH(context& ctx, string word) 458 | { 459 | cout << "Assembing BRANCH instruction" << endl; 460 | 461 | QPUreg dest; 462 | string token_str; 463 | token_t tok = nextToken(ctx.stream, token_str, &ctx.stream); 464 | 465 | // relative or absolute branch? 466 | uint8_t relative = 1; 467 | if (word == "bra") 468 | relative = 0; 469 | 470 | uint8_t branchCondition = 0xf; // by default: always (unconditional branch) 471 | if (tok == DOT) { 472 | // conditional 473 | nextToken(ctx.stream, token_str, &ctx.stream); 474 | branchCondition = parseBranchCond(token_str); 475 | tok = nextToken(ctx.stream, token_str, &ctx.stream); 476 | } 477 | 478 | // this is the destination register 479 | if (tok != WORD) { 480 | cerr << "branch expecting destination register." 
<< endl; 481 | return -1; 482 | } 483 | parseRegister(token_str, dest); 484 | tok = nextToken(ctx.stream, token_str, &ctx.stream); 485 | if (tok != COMMA) return false; 486 | tok = nextToken(ctx.stream, token_str, &ctx.stream); 487 | if (tok != WORD) { 488 | cerr << "branch expecting label/target" << endl; 489 | return -1; 490 | } 491 | 492 | // look it up in the labels map 493 | int target = 0xFFFFFFFF; 494 | if (ctx.labels.count(token_str) < 1) { 495 | relocation r; 496 | r.label = token_str; 497 | r.pc = ctx.pc; 498 | ctx.relocations.push_back(r); 499 | } else 500 | target = ctx.labels[token_str]; 501 | int offset = target - (ctx.pc+4*8); 502 | 503 | uint8_t raddr_a = 0; // raddr_a is only 5-bits? 504 | uint8_t use_reg = 0; 505 | // if there's a third argument, it is a register offset 506 | const char *discard; 507 | tok = nextToken(ctx.stream, token_str, &discard); 508 | if (tok == COMMA) { 509 | QPUreg offsetReg; 510 | // chew the comma we just read 511 | ctx.stream = discard; 512 | tok = nextToken(ctx.stream, token_str, &ctx.stream); 513 | parseRegister(token_str, offsetReg); 514 | if (offsetReg.file != QPUreg::A) { 515 | cerr << "branch target offset register must be file A" << endl; 516 | return -1; 517 | } 518 | if (offsetReg.num > 31) { 519 | cerr << "branch target offset register must be < 32" << endl; 520 | return -1; 521 | } 522 | raddr_a = offsetReg.num; 523 | use_reg = 1; 524 | } 525 | 526 | uint8_t waddr_add = 39; // link address appears at ALU outputs 527 | uint8_t waddr_mul = 39; 528 | if (dest.file == QPUreg::A) waddr_add = dest.num; 529 | if (dest.file == QPUreg::B) waddr_mul = dest.num; 530 | 531 | uint64_t ins = (uint64_t)0xF << 60; 532 | ins |= (uint64_t)branchCondition << 52; 533 | ins |= (uint64_t)relative << 51; 534 | ins |= (uint64_t)use_reg << 50; 535 | ins |= (uint64_t)raddr_a << 45; 536 | ins |= (uint64_t)0x0 << 44; // write-swap 537 | ins |= (uint64_t)waddr_add << 38; 538 | ins |= (uint64_t)waddr_mul << 32; 539 | ins |= 
(uint32_t)offset; 540 | 541 | return ins; 542 | } 543 | 544 | uint64_t assembleSEMA(context& ctx, string word) 545 | { 546 | 547 | uint64_t ins = (uint64_t)0x74 << 57; 548 | 549 | string token_str; 550 | token_t tok = nextToken(ctx.stream, token_str, &ctx.stream); 551 | if (tok != WORD) { 552 | cerr << "semaphore instruction expecting down/up or acquire/release" << endl; 553 | return -1; 554 | } 555 | 556 | uint8_t sa = 0; // up 557 | if (token_str == "down" || token_str == "acquire") 558 | sa = 1; 559 | 560 | tok = nextToken(ctx.stream, token_str, &ctx.stream); 561 | if (tok != COMMA) return -1; 562 | tok = nextToken(ctx.stream, token_str, &ctx.stream); 563 | uint32_t imm = parseImmediate(token_str); 564 | if (imm > 15) { 565 | cerr << "semaphore out of range" << endl; 566 | return -1; 567 | } 568 | // cond_add, cond_mul = NEVER, ws, sf = false 569 | ins |= (uint64_t)39 << 38; // waddr_add 570 | ins |= (uint64_t)39 << 32; // waddr_mul 571 | ins |= sa << 4; 572 | ins |= (uint8_t)imm; 573 | 574 | cout << "Assembling SEMAPHORE instruction (" << imm << "), " << (int)sa << endl; 575 | 576 | return ins; 577 | } 578 | 579 | 580 | int main(int argc, char **argv) 581 | { 582 | char *outfname = 0; 583 | int c; 584 | 585 | while ((c = getopt(argc, argv, "o:")) != -1) { 586 | switch (c) { 587 | case 'o': 588 | outfname = optarg; 589 | break; 590 | } 591 | } 592 | 593 | if (!outfname) { 594 | cerr << "Usage: " << argv[0] << " -o " << endl; 595 | return -1; 596 | } 597 | 598 | FILE *outfile = fopen(outfname, "w"); 599 | if (!outfile) 600 | { 601 | cerr << "Unable to open output file output.bin" << endl; 602 | return -1; 603 | } 604 | 605 | char line[128]; 606 | string token_string; 607 | 608 | struct context ctx; 609 | ctx.pc = 0; 610 | 611 | vector instructions; 612 | 613 | int lineNo = 0; 614 | while (cin.getline(line, 128)) 615 | { 616 | lineNo++; 617 | const char *p = line; 618 | ctx.stream = p; 619 | token_t tok = nextToken(ctx.stream, token_string, &ctx.stream); 620 | 621 
| if (tok == END) 622 | continue; 623 | 624 | if (tok == WORD) 625 | { 626 | // read-ahead to see if the next token is a colon in which case 627 | // this is a label. 628 | const char *discard = NULL; 629 | string nextTokenStr; 630 | if (nextToken(ctx.stream, nextTokenStr, &discard) == COLON) { 631 | ctx.labels[token_string] = ctx.pc; 632 | continue; 633 | } 634 | 635 | enum { INVALID, ALU, BRANCH, LDI, SEMA } opType = INVALID; 636 | if (addOpCode(token_string) != 0xFF || mulOpCode(token_string) != 0xFF) 637 | opType = ALU; 638 | if (token_string == "ldi") opType = LDI; 639 | if (token_string == "bra" || token_string == "brr") opType = BRANCH; 640 | if (token_string == "sema") opType = SEMA; 641 | 642 | if (opType == INVALID) { 643 | cerr << "Unable to assemble line " << lineNo << " : " << line << endl; 644 | cerr << " ... invalid opcode" << endl; 645 | return -1; 646 | } 647 | 648 | uint64_t ins = 0; 649 | switch (opType) { 650 | case ALU: ins = assembleALU(ctx, token_string); break; 651 | case BRANCH: ins = assembleBRANCH(ctx, token_string); break; 652 | case LDI: ins = assembleLDI(ctx, token_string); break; 653 | case SEMA: ins = assembleSEMA(ctx, token_string); break; 654 | } 655 | 656 | if (ins == (uint64_t)-1) { 657 | cerr << "Error on line " << lineNo << " : " << line << endl; 658 | return -1; 659 | } 660 | 661 | instructions.push_back(ins); 662 | ctx.pc += 8; // bytes; 663 | } 664 | } 665 | 666 | // Process relocations 667 | ctx.labels["ZERO"] = 0x0; 668 | for (int i=0; i < ctx.relocations.size(); i++) 669 | { 670 | relocation& r = ctx.relocations[i]; 671 | if (ctx.labels.count(r.label) < 1) 672 | { 673 | cerr << "undefined label: " << r.label << endl; 674 | return -1; 675 | } 676 | int offset = ctx.labels[r.label] - (r.pc + 4*8); 677 | if (r.label == "ZERO") 678 | offset = 0x0; 679 | cout << "Processing relocation at " << r.pc << " : " << r.label 680 | << " : " << offset << endl; 681 | uint64_t ins = instructions[r.pc / 8]; 682 | ins &= 
/**
 * Load a QPU program from a binary file into a word buffer.
 *
 * @param fname   path of the assembled shader binary.
 * @param buffer  destination for the code words.
 * @param len     capacity of `buffer` in unsigned ints; values <= 0 read
 *                nothing.
 * @return the number of whole words read (at most `len`).
 *
 * If the file cannot be opened a message is printed to stderr and the
 * process exits with a failure status.
 */
int loadShaderCode(const char *fname, unsigned int* buffer, int len)
{
    /* The file is raw instruction words, so open it in binary mode. */
    FILE *in = fopen(fname, "rb");
    if (!in) {
        fprintf(stderr, "Failed to open %s.\n", fname);
        exit(EXIT_FAILURE);
    }

    size_t items = 0;
    /* A negative len would turn into an enormous size_t count and let
     * fread() overrun the buffer, so only read for a positive capacity. */
    if (buffer && len > 0) {
        items = fread(buffer, sizeof(unsigned int), (size_t)len, in);
        if (ferror(in)) {
            fprintf(stderr, "Read error on %s.\n", fname);
        }
    }
    fclose(in);

    /* items <= len, so the conversion back to int cannot overflow. */
    return (int)items;
}
| } 42 | 43 | 44 | int main(int argc, char **argv) 45 | { 46 | if (argc < 3) { 47 | fprintf(stderr, "Usage: %s \n", argv[0]); 48 | return 0; 49 | } 50 | int code_words = loadShaderCode(argv[1], qpu_code, MAX_CODE_SIZE); 51 | 52 | printf("Loaded %d bytes of code from %s ...\n", code_words * sizeof(unsigned), argv[1]); 53 | 54 | int mb = mbox_open(); 55 | if (qpu_enable(mb, 1)) { 56 | fprintf(stderr, "QPU enable failed.\n"); 57 | return -1; 58 | } 59 | printf("QPU enabled.\n"); 60 | 61 | unsigned uniform_val = atoi(argv[2]); 62 | printf("Uniform value = %d\n", uniform_val); 63 | 64 | unsigned size = 1024 * 1024; 65 | unsigned handle = mem_alloc(mb, size, 4096, GPU_MEM_FLG); 66 | if (!handle) { 67 | fprintf(stderr, "Unable to allocate %d bytes of GPU memory", size); 68 | return -2; 69 | } 70 | unsigned ptr = mem_lock(mb, handle); 71 | void *arm_ptr = mapmem(ptr + GPU_MEM_MAP, size); 72 | // assert arm_ptr ... 73 | 74 | struct memory_map *arm_map = (struct memory_map *)arm_ptr; 75 | memset(arm_map, 0x0, sizeof(struct memory_map)); 76 | unsigned vc_uniforms = ptr + offsetof(struct memory_map, uniforms); 77 | unsigned vc_code = ptr + offsetof(struct memory_map, code); 78 | unsigned vc_msg = ptr + offsetof(struct memory_map, msg); 79 | unsigned vc_results = ptr + offsetof(struct memory_map, results); 80 | memcpy(arm_map->code, qpu_code, code_words * sizeof(unsigned int)); 81 | for (int i=0; i < NUM_QPUS; i++) { 82 | arm_map->uniforms[i][0] = uniform_val; 83 | arm_map->uniforms[i][1] = vc_results + i * sizeof(unsigned) * 16; 84 | arm_map->msg[i][0] = vc_uniforms + i * sizeof(unsigned) * 2; 85 | arm_map->msg[i][1] = vc_code; 86 | } 87 | 88 | unsigned ret = execute_qpu(mb, NUM_QPUS, vc_msg, 1, 10000); 89 | 90 | // check the results! 
91 | for (int i=0; i < NUM_QPUS; i++) { 92 | for (int j=0; j < 16; j++) { 93 | printf("QPU %d, word %d: 0x%08x\n", i, j, arm_map->results[i][j]); 94 | } 95 | } 96 | 97 | printf("Cleaning up.\n"); 98 | unmapmem(arm_ptr, size); 99 | mem_unlock(mb, handle); 100 | mem_free(mb, handle); 101 | qpu_enable(mb, 0); 102 | printf("Done.\n"); 103 | } 104 | -------------------------------------------------------------------------------- /QPU/helloworld/helloworld.asm: -------------------------------------------------------------------------------- 1 | # Load the value we want to add to the input into a register 2 | ldi ra1, 0x1234 3 | 4 | # Configure the VPM for writing 5 | ldi rb49, 0xa00 6 | 7 | # Add the input value (first uniform - rb32) and the register with the hard-coded 8 | # constant into the VPM. 9 | add rb48, ra1, rb32; nop 10 | 11 | ## move 16 words (1 vector) back to the host (DMA) 12 | ldi rb49, 0x88010000 13 | 14 | ## initiate the DMA (the next uniform - ra32 - is the host address to write to)) 15 | or rb50, ra32, 0; nop 16 | 17 | # Wait for the DMA to complete 18 | or rb39, rb50, ra39; nop 19 | 20 | # trigger a host interrupt (writing rb38) to stop the program 21 | or rb38, ra39, ra39; nop 22 | 23 | nop.tend ra39, ra39, ra39; nop rb39, rb39, rb39 24 | nop ra39, ra39, ra39; nop rb39, rb39, rb39 25 | nop ra39, ra39, ra39; nop rb39, rb39, rb39 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | rpi-playground 2 | ============== 3 | 4 | Raspberry Pi Projects 5 | --------------------------------------------------------------------------------