├── QPU
    ├── SHA-256
    │   ├── 1QPU
    │   │   ├── 1qpu.asm
    │   │   ├── Makefile
    │   │   ├── qpufuncs.cpp
    │   │   ├── qpufuncs.h
    │   │   ├── sha256.cpp
    │   │   └── test-data.bin
    │   ├── final
    │   │   ├── Makefile
    │   │   ├── final.asm
    │   │   ├── qpufuncs.cpp
    │   │   ├── qpufuncs.h
    │   │   ├── sha256.cpp
    │   │   └── test-data.bin
    │   ├── partial
    │   │   ├── Makefile
    │   │   ├── partial.asm
    │   │   ├── qpufuncs.cpp
    │   │   ├── qpufuncs.h
    │   │   ├── sha256.cpp
    │   │   └── test-data.bin
    │   └── reference
    │   │   ├── Makefile
    │   │   ├── sha256.cpp
    │   │   └── test-data.bin
    ├── assembler
    │   ├── Makefile
    │   └── assemble.cpp
    └── helloworld
    │   ├── Makefile
    │   ├── driver.c
    │   └── helloworld.asm
└── README.md


/QPU/SHA-256/1QPU/1qpu.asm:
--------------------------------------------------------------------------------
  1 | define(`NOP', `nop ra39, ra39, ra39;  nop rb39, rb39, rb39')    # NOP: a nop in both instruction halves, targeting ra39/rb39
  2 | define(`GENSCHEDULE',
  3 | `   ## One message-schedule step: W_new = W_i-16 + smsigma0(W_i-15) + W_i-7 + smsigma1(W_i-2)
  4 |     add rb32, $1, 0;                nop         # r0 = W_i-16
  5 |     ror rb33, $2, 7;                nop         # r1 = RotR(x, 7), x = W_i-15
  6 |     ror rb34, $2, rb6;              nop         # r2 = RotR(x, 18) (rb6 is loaded with 0x12)
  7 |     shr rb35, $2, 3;                nop         # r3 = x >> 3;
  8 |     xor rb33, r1, r2;               nop         # r1 = r1 ^ r2
  9 |     xor rb35, r1, r3;               nop         # r3 = r1 ^ r3      (smsigma0(W_i-15))
 10 |     add rb32, r0, r3;               nop         # r0 += r3          (W_i-16 + smsigma0(W_i-15))
 11 |     add rb32, r0, $3;               nop         # r0 += W_i-7
 12 |     ror rb33, $4, rb8;              nop         # r1 = RotR(x, 17), x = W_i-2 (rb8 is loaded with 0x11)
 13 |     ror rb34, $4, rb7;              nop         # r2 = RotR(x, 19) (rb7 is loaded with 0x13)
 14 |     xor rb33, r1, r2;               nop         # r1 = r1 ^ r2
 15 |     shr rb34, $4, 10;               nop         # r2 = x >> 10
 16 |     xor rb33, r1, r2;               nop         # r1 = r1 ^ r2      (smsigma1(W_i-2))
 17 |     add $1, r0, r1;                 nop         # new W replaces W_i-16: r0 + smsigma1(W_i-2)
 18 |     add rb48, r0, r1;               nop         # same sum also written to rb48; the compress loop reads W[i] from rb48
 19 |     ## $2 ignored, $3 ignored, $4 ignored, $1 ignored (suppress warnings)')
 20 | define(`GENSCHEDULE_ALL',
 21 | `   ## Advance the 16-word schedule window held in ra4..ra19 by 16 steps; each call overwrites its first argument
 22 | GENSCHEDULE(`ra4', `ra5', `ra13', `ra18')
 23 | GENSCHEDULE(`ra5', `ra6', `ra14', `ra19')
 24 | GENSCHEDULE(`ra6', `ra7', `ra15', `ra4')
 25 | GENSCHEDULE(`ra7', `ra8', `ra16', `ra5')
 26 | GENSCHEDULE(`ra8', `ra9', `ra17', `ra6')
 27 | GENSCHEDULE(`ra9', `ra10', `ra18', `ra7')
 28 | GENSCHEDULE(`ra10', `ra11', `ra19', `ra8')
 29 | GENSCHEDULE(`ra11', `ra12', `ra4', `ra9')
 30 | GENSCHEDULE(`ra12', `ra13', `ra5', `ra10')
 31 | GENSCHEDULE(`ra13', `ra14', `ra6', `ra11')
 32 | GENSCHEDULE(`ra14', `ra15', `ra7', `ra12')
 33 | GENSCHEDULE(`ra15', `ra16', `ra8', `ra13')
 34 | GENSCHEDULE(`ra16', `ra17', `ra9', `ra14')
 35 | GENSCHEDULE(`ra17', `ra18', `ra10', `ra15')
 36 | GENSCHEDULE(`ra18', `ra19', `ra11', `ra16')
 37 | GENSCHEDULE(`ra19', `ra4', `ra12', `ra17')')
 38 | 
 39 | ## Move the uniforms (arguments) into registers; the host fills them in the order K, H, data
 40 | or ra31, ra32, 0;           nop         # address of K in ra31
 41 | or ra30, ra32, 0;           nop         # address of H in ra30
 42 | or ra29, ra32, 0;           nop         # address of data in ra29
 43 | 
 44 | ## Load some rotation constants that don't fit in small immediates (22, 25, 18, 19, 17)
 45 | ldi rb2, 0x16;
 46 | ldi rb5, 0x19;
 47 | ldi rb6, 0x12;
 48 | ldi rb7, 0x13;
 49 | ldi rb8, 0x11;
 50 | 
 51 | ## VCD DMA setup for the H vectors (16x8)
 52 | ldi ra49, 0x82801000
 53 | 
 54 | ## Move the H vectors into the VPM (0,0 in VPM); writing the H address to ra50 presumably starts the DMA load
 55 | or ra50, ra30, 0;           nop
 56 | 
 57 | ## Wait for the DMA to complete (the read of ra50 presumably stalls until it is done)
 58 | and rb39, ra50, ra50;           nop
 59 | 
 60 | ## Configure the VPM for reading the H vectors
 61 | ldi ra49, 0x801200
 62 | 
 63 | ## Read the H vectors into registers ra20..ra27 (these are the a..h)
 64 | ## Also copy them into rb20..rb27 (we need the original values to write back)
 65 | or ra20, ra48, 0;           v8max rb20, ra48, ra48;
 66 | or ra21, ra48, 0;           v8max rb21, ra48, ra48;
 67 | or ra22, ra48, 0;           v8max rb22, ra48, ra48;
 68 | or ra23, ra48, 0;           v8max rb23, ra48, ra48;
 69 | or ra24, ra48, 0;           v8max rb24, ra48, ra48;
 70 | or ra25, ra48, 0;           v8max rb25, ra48, ra48;
 71 | or ra26, ra48, 0;           v8max rb26, ra48, ra48;
 72 | or ra27, ra48, 0;           v8max rb27, ra48, ra48;
 73 | 
 74 | ## Configure the VPM/VCD to read the data vectors (16x16)
 75 | ldi ra49, 0x83001000
 76 | or ra50, ra29, 0;           nop         ## Load address to DMA
 77 | or rb39, ra50, 0;           nop         ## Wait for it
 78 | 
 79 | ## Read the data vectors into ra4..ra19 since we use the registers in
 80 | ## GENSCHEDULE (ra4..ra19 form the rolling 16-word schedule window)
 81 | ldi ra49, 0x1200
 82 | or ra4, ra48, 0;            nop
 83 | or ra5, ra48, 0;            nop
 84 | or ra6, ra48, 0;            nop
 85 | or ra7, ra48, 0;            nop
 86 | or ra8, ra48, 0;            nop
 87 | or ra9, ra48, 0;            nop
 88 | or ra10, ra48, 0;           nop
 89 | or ra11, ra48, 0;           nop
 90 | or ra12, ra48, 0;           nop
 91 | or ra13, ra48, 0;           nop
 92 | or ra14, ra48, 0;           nop
 93 | or ra15, ra48, 0;           nop
 94 | or ra16, ra48, 0;           nop
 95 | or ra17, ra48, 0;           nop
 96 | or ra18, ra48, 0;           nop
 97 | or ra19, ra48, 0;           nop
 98 | 
 99 | 
100 | ## 4 loops of 16 = 64 iterations (ra1 counts the outer passes, ra2 the 16 rounds inside each pass)
101 | ldi ra1, 4
102 | mainloop:
103 | 
104 | ldi ra49, 0x1200
105 | ## 16 loops of compression
106 | ldi ra2, 0x10;
107 | compress:
108 |     ## r0 = K[i] + h  (K address goes to rb56, nop.tmu follows, the value is then taken from r4; ra31 steps by 4)
109 |     or rb56, ra31, 0;       nop
110 |     nop.tmu ra39, ra39, ra39;   nop
111 |     add ra31, ra31, 4;      nop
112 |     add rb32, r4, ra27;     nop
113 | 
114 |     ## T1 = h + K[i] + W[i]  (T1 lives in rb18, W[i] is read from rb48)
115 |     add rb18, r0, rb48;     nop
116 | 
117 |     ## T1 += CH(e,f,g) => (e & f) ^ (~e & g) (e: ra24, f: ra25, g: ra26)
118 |     or ra32, ra24, 0;       nop         # load e into r0
119 |     and ra33, ra25, r0;     nop         # r1 = r0 & f   (e & f)
120 |     not ra32, r0, 0;        nop         # r0 = ~r0      (~e)
121 |     and ra32, r0, ra26;     nop         # r0 = r0 & g   (~e & g)
122 |     xor ra32, r0, r1;       nop         # r0 = r0 ^ r1  (e & f) ^ (~e & g)
123 |     add rb18, rb18, r0;     nop         # accumulate into T1
124 | 
125 |     ## T1 += sigma1(e) => RotR(e, 6) ^ RotR(e, 11) ^ RotR(e, 25)
126 |     ror rb32, ra24, 6;      nop
127 |     ror rb33, ra24, 11;     nop
128 |     ror rb34, ra24, rb5;    nop
129 |     xor rb32, r0, r1;       nop
130 |     xor rb32, r0, r2;       nop
131 |     add rb18, r0, rb18;     nop
132 | 
133 |     ## T2 (ra3) = sigma0(a) => RotR(a, 2) ^ RotR(a, 13) ^ RotR(a, 22)  (a: ra20)
134 |     ror ra32, ra20, 2;      nop         # r0 = RotR(a, 2)
135 |     ror ra33, ra20, 13;     nop         # r1 = RotR(a, 13)
136 |     xor ra32, r0, r1;       nop         # r0 = RotR(a, 2) ^ RotR(a, 13)
137 |     ror ra33, ra20, rb2;    nop         # r1 = RotR(a, 22)
138 |     xor ra3, r0, r1;        nop         # T2 = sigma0(a)
139 | 
140 |     ## T2 += Maj(a,b,c) => (a & b) ^ (a & c) ^ (b & c)  (b: ra21, c: ra22)
141 |     or ra32, ra20, 0;       nop         # load a into r0
142 |     and ra33, r0, ra21;     nop         # r1 = a & b
143 |     and ra34, r0, ra22;     nop         # r2 = a & c
144 |     xor ra32, r1, r2;       nop         # r0 = (a & b) ^ (a & c)
145 |     or ra33, ra21, 0;       nop         # load b into r1
146 |     and ra33, r1, ra22;     nop         # r1 = b & c
147 |     xor ra32, r0, r1;       nop         # r0 = r0 ^ r1
148 |     add ra3, ra3, r0;       nop         # T2 += Maj(a,b,c)
149 | 
150 |     ## swizzle: h = g, g = f, f = e, e = d + T1, d = c, c = b, b = a, a = T1 + T2
151 |     or ra27, ra26, 0;       nop
152 |     or ra26, ra25, 0;       nop
153 |     or ra25, ra24, 0;       nop
154 |     add ra24, ra23, rb18;   nop
155 |     or ra23, ra22, 0;       nop
156 |     or ra22, ra21, 0;       nop
157 |     or ra21, ra20, 0;       nop
158 |     add ra20, rb18, ra3;    nop
159 | 
160 |     ## Loop: count ra2 down and branch back to compress; each branch is followed by three NOPs
161 |     sub ra2, ra2, 1;        nop
162 |     brr.ze ra39, compress
163 | NOP
164 | NOP
165 | NOP
166 |     sub ra1, ra1, 1;        nop
167 |     brr.zf ra39, done
168 |     NOP
169 |     NOP
170 |     NOP
171 |     ldi rb49, 0x1200
172 |     GENSCHEDULE_ALL
173 |     brr ra39, mainloop      # NOTE(review): the three NOPs at done: appear to double as the slots after this branch -- confirm
174 | done:
175 | NOP
176 | NOP
177 | NOP
178 | 
179 | ## Configure the VPM to write the H vectors back into place
180 | ldi rb49, 0x1200
181 | 
182 | ## Write H vectors back (+=): working variable in ra20..ra27 plus the saved original in rb20..rb27
183 | add rb48, ra20, rb20;       nop
184 | add rb48, ra21, rb21;       nop
185 | add rb48, ra22, rb22;       nop
186 | add rb48, ra23, rb23;       nop
187 | add rb48, ra24, rb24;       nop
188 | add rb48, ra25, rb25;       nop
189 | add rb48, ra26, rb26;       nop
190 | add rb48, ra27, rb27;       nop
191 | 
192 | ## Configure the VCD for DMA back to the host
193 | ldi rb49, 0x88084000
194 | 
195 | ## Write the H address to store (ra30 holds the H address from the uniforms)
196 | or rb50, ra30, 0;           nop
197 | 
198 | ## Wait for the DMA to complete
199 | or rb39, rb50, ra39;        nop
200 | 
201 | ## Trigger a host interrupt to finish the program
202 | or rb38, ra39, rb39;        nop
203 | 
204 | nop.tend ra39, ra39, ra39;  nop rb39, rb39, rb39
205 | NOP
206 | NOP
207 | 


--------------------------------------------------------------------------------
/QPU/SHA-256/1QPU/Makefile:
--------------------------------------------------------------------------------
1 | MBOX_C = /opt/vc/src/hello_pi/hello_fft/mailbox.c
2 | MBOX_INC = -I/opt/vc/src/hello_pi/hello_fft
3 | 
4 | # qpufuncs.h is a prerequisite so that changing NUM_QPUS / MAX_CODE_SIZE triggers a rebuild.
5 | sha256: sha256.cpp qpufuncs.cpp qpufuncs.h
6 | 	g++ -O3 -o sha256 sha256.cpp qpufuncs.cpp $(MBOX_C) $(MBOX_INC)
6 | 


--------------------------------------------------------------------------------
/QPU/SHA-256/1QPU/qpufuncs.cpp:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <sys/mman.h>
  3 | #include <sys/types.h>
  4 | #include <sys/stat.h>
  5 | #include <fcntl.h>
  6 | #include <inttypes.h>
  7 | #include <string.h>             // memset
  8 | #include <stddef.h>
  9 | #include <unistd.h>
 10 | #include "mailbox.h"
 11 | #include "qpufuncs.h"
 12 | 
 13 | #define GPU_MEM_FLG     0xC         // cached
 14 | #define GPU_MEM_MAP     0x0         // cached
 15 | #define NUNIFORMS       3
 16 | 
 17 | 
 18 | struct sha256_memory_map
 19 | {
 20 |     /*
 21 |        data layout is:
 22 |          64 words for K constants (accessed as a texture lookup)
 23 |          16x8 (128) words for the 8 H vectors (VPM), per QPU
 24 |          16x16 (256) words for the input data (VPM), per QPU
 25 |        Total: 64 + 384 * NUM_QPUS words (448 when NUM_QPUS is 1)
 26 |      */
 27 |     uint32_t data[64 + (128 + 256) * NUM_QPUS];
 28 |     uint32_t code[MAX_CODE_SIZE];
 29 |     /*
 30 |       uniforms are (NUNIFORMS = 3 per QPU):
 31 |         u1: address of K texture
 32 |         u2: address of H vectors (also output location)
 33 |         u3: address of data buffer
 34 |         (stride is not passed: only these three uniforms are filled in)
 35 |      */
 36 |     uint32_t uniforms[NUNIFORMS * NUM_QPUS];
 37 |     uint32_t msg[NUM_QPUS*2];            // msg is a (uniform, code) tuple to execute_qpu
 38 | 
 39 |     /* results are placed back into data where the H vectors were read from */
 40 | };
 41 | 
 42 | 
 43 | static struct                   // state shared by the SHA256*QPU functions in this file
 44 | {
 45 |     int mb;                     // mailbox handle returned by mbox_open()
 46 |     unsigned handle;            // GPU allocation handle returned by mem_alloc()
 47 |     unsigned size;              // size in bytes of the GPU allocation
 48 |     unsigned vc_msg;            // address of the msg array (mem_lock() base + offset), passed to execute_qpu()
 49 |     unsigned ptr;               // base address returned by mem_lock()
 50 |     void* arm_ptr;              // pointer returned by mapmem() for the same allocation
 51 | } sha256_qpu_context;
 52 | 
 53 | 
 54 | /*
 55 |  * Prepare the QPU for a SHA-256 run: open the mailbox, enable the QPU,
 56 |  * allocate/lock/map GPU memory, copy in the shader code, the K constants,
 57 |  * the H vectors and the input data, then fill in the per-QPU uniforms and
 58 |  * launch messages.
 59 |  *
 60 |  *   K            64 round constants
 61 |  *   data         256 words of input per QPU
 62 |  *   H            128 words of H vectors per QPU
 63 |  *   stride       unused
 64 |  *   shader_code  QPU program
 65 |  *   code_len     length of shader_code in bytes
 66 |  *
 67 |  * Returns the mailbox handle on success and a negative value on failure
 68 |  * (-1 mailbox / QPU enable, -2 allocation, -3 lock or map, -4 code too
 69 |  * large).  Everything acquired so far is released before a failure return.
 70 |  */
 71 | int SHA256SetupQPU(uint32_t* K, uint32_t *data, uint32_t *H, int stride,
 72 |                    unsigned *shader_code, unsigned code_len)
 73 | {
 74 |     (void)stride;
 75 | 
 76 |     // Refuse code that would overflow the code[] area of the memory map.
 77 |     if (code_len > MAX_CODE_SIZE * sizeof(uint32_t)) {
 78 |         fprintf(stderr, "QPU code is too large (%u bytes)\n", code_len);
 79 |         return -4;
 80 |     }
 81 | 
 82 |     sha256_qpu_context.mb = mbox_open();
 83 |     if (sha256_qpu_context.mb < 0) {
 84 |         fprintf(stderr, "Unable to open the mailbox\n");
 85 |         return -1;
 86 |     }
 87 |     if (qpu_enable(sha256_qpu_context.mb, 1)) {
 88 |         fprintf(stderr, "Unable to enable QPU\n");
 89 |         mbox_close(sha256_qpu_context.mb);
 90 |         return -1;
 91 |     }
 92 | 
 93 |     // 1 MB should be plenty; grow (page-rounded) if the memory map ever outgrows it
 94 |     sha256_qpu_context.size = 1024 * 1024;
 95 |     if (sizeof(struct sha256_memory_map) > sha256_qpu_context.size)
 96 |         sha256_qpu_context.size =
 97 |             (unsigned)((sizeof(struct sha256_memory_map) + 4095) & ~(size_t)4095);
 98 | 
 99 |     sha256_qpu_context.handle = mem_alloc(sha256_qpu_context.mb,
100 |                                           sha256_qpu_context.size, 4096,
101 |                                           GPU_MEM_FLG);
102 |     if (!sha256_qpu_context.handle) {
103 |         fprintf(stderr, "Unable to allocate %u bytes of GPU memory\n",
104 |                         sha256_qpu_context.size);
105 |         qpu_enable(sha256_qpu_context.mb, 0);
106 |         mbox_close(sha256_qpu_context.mb);
107 |         return -2;
108 |     }
109 | 
110 |     unsigned ptr = mem_lock(sha256_qpu_context.mb, sha256_qpu_context.handle);
111 |     if (!ptr) {
112 |         fprintf(stderr, "Unable to lock GPU memory\n");
113 |         mem_free(sha256_qpu_context.mb, sha256_qpu_context.handle);
114 |         qpu_enable(sha256_qpu_context.mb, 0);
115 |         mbox_close(sha256_qpu_context.mb);
116 |         return -3;
117 |     }
118 |     sha256_qpu_context.arm_ptr = mapmem(ptr + GPU_MEM_MAP, sha256_qpu_context.size);
119 |     if (!sha256_qpu_context.arm_ptr) {
120 |         fprintf(stderr, "Unable to map GPU memory\n");
121 |         mem_unlock(sha256_qpu_context.mb, sha256_qpu_context.handle);
122 |         mem_free(sha256_qpu_context.mb, sha256_qpu_context.handle);
123 |         qpu_enable(sha256_qpu_context.mb, 0);
124 |         mbox_close(sha256_qpu_context.mb);
125 |         return -3;
126 |     }
127 |     sha256_qpu_context.ptr = ptr;
128 | 
129 |     struct sha256_memory_map *arm_map = (struct sha256_memory_map *)
130 |                                                 sha256_qpu_context.arm_ptr;
131 |     memset(arm_map, 0x0, sizeof(struct sha256_memory_map));
132 |     unsigned vc_data = ptr + offsetof(struct sha256_memory_map, data);
133 |     unsigned vc_uniforms = ptr + offsetof(struct sha256_memory_map, uniforms);
134 |     unsigned vc_code = ptr + offsetof(struct sha256_memory_map, code);
135 |     sha256_qpu_context.vc_msg = ptr + offsetof(struct sha256_memory_map, msg);
136 | 
137 |     // data[] layout: 64 K words, then 128 H words per QPU, then 256 input words per QPU
138 |     memcpy(arm_map->code, shader_code, code_len);
139 |     memcpy(arm_map->data, K, 64*sizeof(uint32_t));
140 |     memcpy(arm_map->data+64, H, 128*sizeof(uint32_t)*NUM_QPUS);
141 |     memcpy(arm_map->data+64 + 128*NUM_QPUS, data, 256*NUM_QPUS*sizeof(uint32_t));
142 |     for (int i=0; i < NUM_QPUS; i++) {
143 |         arm_map->uniforms[i*NUNIFORMS+0] = vc_data;         // data (address of K texture)
144 |         arm_map->uniforms[i*NUNIFORMS+1] = vc_data + 64*sizeof(uint32_t) + 128 * i * sizeof(uint32_t);         // address of H vectors
145 |         arm_map->uniforms[i*NUNIFORMS+2] = vc_data + 64*sizeof(uint32_t) + 128*NUM_QPUS*sizeof(uint32_t) + 256 * i * sizeof(uint32_t);     // address of input data
146 | 
147 |         // one (uniforms, code) pair per QPU; every QPU runs the same code
148 |         arm_map->msg[i*2+0] = vc_uniforms + i * NUNIFORMS * sizeof(uint32_t);
149 |         arm_map->msg[i*2+1] = vc_code;
150 |     }
151 | 
152 |     return sha256_qpu_context.mb;
153 | }
100 | 
101 | 
102 | /*
103 |  * Launch the staged program on NUM_QPUS QPUs via execute_qpu() and report a
104 |  * failure on stderr.  H is not used here; results are read back separately.
105 |  */
106 | void SHA256ExecuteQPU(uint32_t* H)
107 | {
108 |     (void)H;
109 | 
110 |     const unsigned status = execute_qpu(sha256_qpu_context.mb, NUM_QPUS,
111 |                                         sha256_qpu_context.vc_msg, 1, 10000);
112 |     if (status == 0)
113 |         return;
114 | 
115 |     fprintf(stderr, "Failed execute_qpu!\n");
116 | }
109 | 
110 | 
111 | void SHA256CleanupQPU(int handle)       // handle is unused; all state comes from sha256_qpu_context
112 | {
113 |     unmapmem(sha256_qpu_context.arm_ptr, sha256_qpu_context.size);  // undo mapmem()
114 |     mem_unlock(sha256_qpu_context.mb, sha256_qpu_context.handle);   // undo mem_lock()
115 |     mem_free(sha256_qpu_context.mb, sha256_qpu_context.handle);     // undo mem_alloc()
116 |     qpu_enable(sha256_qpu_context.mb, 0);                           // undo qpu_enable(mb, 1)
117 |     mbox_close(sha256_qpu_context.mb);                              // undo mbox_open()
118 | }
119 | 
120 | 
121 | /*
122 |  * Copy the H vectors for every QPU (128 words each) out of the mapped GPU
123 |  * memory into H.  They sit right after the 64 K constants in data[].
124 |  */
125 | void SHA256FetchResult(uint32_t *H)
126 | {
127 |     const struct sha256_memory_map *gpu_view =
128 |         static_cast<const struct sha256_memory_map *>(sha256_qpu_context.arm_ptr);
129 |     const size_t h_bytes = NUM_QPUS*128*sizeof(uint32_t);
130 | 
131 |     memcpy(H, &gpu_view->data[64], h_bytes);
132 | }
127 | 
128 | 
129 | /*
130 |  * Read up to len words of assembled QPU code from fname into buffer.
131 |  *
132 |  * Returns the number of bytes read, or -1 when the arguments are invalid,
133 |  * the file cannot be opened, or a read error occurs.
134 |  */
135 | int loadQPUCode(const char *fname, unsigned int* buffer, int len)
136 | {
137 |     // A negative len would turn into a huge size_t in fread() and overrun buffer.
138 |     if (!fname || !buffer || len <= 0) {
139 |         fprintf(stderr, "Invalid arguments to loadQPUCode.\n");
140 |         return -1;
141 |     }
142 | 
143 |     // The code file is binary, so open it in binary mode.
144 |     FILE *in = fopen(fname, "rb");
145 |     if (!in) {
146 |         fprintf(stderr, "Failed to open %s.\n", fname);
147 |         return -1;
148 |     }
149 | 
150 |     size_t items = fread(buffer, sizeof(unsigned int), (size_t)len, in);
151 |     int read_failed = ferror(in);
152 |     fclose(in);
153 | 
154 |     if (read_failed) {
155 |         fprintf(stderr, "Failed to read %s.\n", fname);
156 |         return -1;
157 |     }
158 | 
159 |     return (int)(items * sizeof(unsigned int));
160 | }
142 | 


--------------------------------------------------------------------------------
/QPU/SHA-256/1QPU/qpufuncs.h:
--------------------------------------------------------------------------------
 1 | #ifndef _QPUFUNCS_
 2 | #define _QPUFUNCS_
 3 | 
 4 | #include <stdint.h>             /* uint32_t, so this header is self-contained */
 5 | 
 6 | #define NUM_QPUS        1
 7 | #define MAX_CODE_SIZE   24000           /* in words */
 8 | 
 9 | /* Set up the QPU and stage code and data; returns a negative value on failure. */
10 | int SHA256SetupQPU(uint32_t* K, uint32_t *data, uint32_t *H, int stride,
11 |                    unsigned *shader_code, unsigned code_len);
12 | /* Read up to len words of QPU code from fname; returns bytes read or -1. */
13 | int loadQPUCode(const char *fname, unsigned int* buffer, int len);
14 | /* Release everything SHA256SetupQPU acquired. */
15 | void SHA256CleanupQPU(int handle);
16 | /* Run the staged program on the QPU(s). */
17 | void SHA256ExecuteQPU(uint32_t* H);
18 | /* Copy the resulting H vectors out of GPU memory into H. */
19 | void SHA256FetchResult(uint32_t* H);
20 | 
21 | #endif      // _QPUFUNCS_
15 | 


--------------------------------------------------------------------------------
/QPU/SHA-256/1QPU/sha256.cpp:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <inttypes.h>
  3 | #include <sys/time.h>
  4 | #include <string.h>
  5 | #include <stdlib.h>
  6 | #include "qpufuncs.h"
  7 | 
  8 | #define QPU_CODE_FILE   "sha256.bin"
  9 | #ifndef NUM_QPUS                        // normally provided by qpufuncs.h
 10 | #define NUM_QPUS        1
 11 | #endif
 12 | #define BUFFER_SIZE     (NUM_QPUS * 16) // parenthesized so it is safe inside larger expressions
 11 | 
 12 | static uint32_t K[] = {    // the 64 SHA-256 round constants; K[i] is added in round i of the compression loop
 13 |     0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1,
 14 |     0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
 15 |     0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786,
 16 |     0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
 17 |     0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147,
 18 |     0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
 19 |     0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b,
 20 |     0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
 21 |     0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a,
 22 |     0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
 23 |     0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
 24 | };
 25 | 
 26 | // Choose: each result bit comes from y where x has a 1 and from z where x has a 0.
 27 | static inline uint32_t CH(uint32_t x, uint32_t y, uint32_t z) {
 28 |     return z ^ (x & (y ^ z));
 29 | }
 29 | 
 30 | // Majority: each result bit is 1 when at least two of the three input bits are 1.
 31 | static inline uint32_t Maj(uint32_t x, uint32_t y, uint32_t z) {
 32 |     return (x & y) | (z & (x | y));
 33 | }
 33 | 
 34 | // Rotate x right by shift bits.  The count is reduced modulo 32 so that a
 35 | // shift of 0 (or 32) never produces an undefined 32-bit shift by 32.
 36 | static inline uint32_t RotR(uint32_t x, uint8_t shift) {
 37 |     const unsigned s = shift & 31u;
 38 |     return (x >> s) | (x << ((32u - s) & 31u));
 39 | }
 37 | 
 38 | // sigma0(x) = RotR(x, 2) ^ RotR(x, 13) ^ RotR(x, 22); applied to a in T2
 39 | static inline uint32_t sigma0(uint32_t x) {
 40 |     uint32_t acc = RotR(x, 2);
 41 |     acc ^= RotR(x, 13);
 42 |     acc ^= RotR(x, 22);
 43 |     return acc;
 44 | }
 41 | 
 42 | // sigma1(x) = RotR(x, 6) ^ RotR(x, 11) ^ RotR(x, 25); applied to e in T1
 43 | static inline uint32_t sigma1(uint32_t x) {
 44 |     uint32_t acc = RotR(x, 6);
 45 |     acc ^= RotR(x, 11);
 46 |     acc ^= RotR(x, 25);
 47 |     return acc;
 48 | }
 45 | 
 46 | // smsigma0(x) = RotR(x, 7) ^ RotR(x, 18) ^ (x >> 3); applied to W[i-15] in the schedule
 47 | static inline uint32_t smsigma0(uint32_t x) {
 48 |     uint32_t acc = x >> 3;
 49 |     acc ^= RotR(x, 7);
 50 |     acc ^= RotR(x, 18);
 51 |     return acc;
 52 | }
 49 | 
 50 | // smsigma1(x) = RotR(x, 17) ^ RotR(x, 19) ^ (x >> 10); applied to W[i-2] in the schedule
 51 | static inline uint32_t smsigma1(uint32_t x) {
 52 |     uint32_t acc = x >> 10;
 53 |     acc ^= RotR(x, 17);
 54 |     acc ^= RotR(x, 19);
 55 |     return acc;
 56 | }
 53 | 
 54 | // Big-endian 32-bit word i of byte array x.  Each byte is cast to uint8_t first so a signed char is not sign-extended.
 55 | #define ENDIAN(x, i)        (((uint32_t)(uint8_t)(x)[(i)*4] << 24) | ((uint32_t)(uint8_t)(x)[(i)*4+1] << 16) | ((uint32_t)(uint8_t)(x)[(i)*4+2] << 8) | ((uint32_t)(uint8_t)(x)[(i)*4+3]))
 56 | /*
 57 |  * CPU reference: run one SHA-256 compression over each of BUFFER_SIZE messages.
 58 |  *
 59 |  * data   message k starts at data[k*stride]; its first 16 words are hashed
 60 |  * H      in/out; message k uses H[k*8 .. k*8+7] as initial state and result
 61 |  * stride distance in words between messages (TODO: handle hashes of more
 62 |  *        than one block)
 63 |  */
 64 | void execute_sha256_cpu(uint32_t *data, uint32_t *H, int stride)
 65 | {
 66 |     for (int msg = 0; msg < BUFFER_SIZE; msg++)
 67 |     {
 68 |         const uint32_t *block = data + msg*stride;
 69 |         uint32_t *digest = H + msg*8;
 70 |         uint32_t W[64];
 71 |         uint32_t v[8];              // working variables a..h in v[0]..v[7]
 72 | 
 73 |         // message schedule: 16 input words, then 48 derived words
 74 |         for (int t = 0; t < 64; t++) {
 75 |             if (t < 16)
 76 |                 W[t] = block[t];
 77 |             else
 78 |                 W[t] = smsigma1(W[t-2]) + W[t-7] + smsigma0(W[t-15]) + W[t-16];
 79 |         }
 80 | 
 81 |         for (int j = 0; j < 8; j++)
 82 |             v[j] = digest[j];
 83 | 
 84 |         for (int round = 0; round < 64; round++)
 85 |         {
 86 |             const uint32_t T1 = v[7] + sigma1(v[4]) + CH(v[4], v[5], v[6])
 87 |                               + K[round] + W[round];
 88 |             const uint32_t T2 = sigma0(v[0]) + Maj(v[0], v[1], v[2]);
 89 | 
 90 |             // shift every variable down one slot (h = g, ..., b = a)
 91 |             for (int j = 7; j > 0; j--)
 92 |                 v[j] = v[j-1];
 93 |             v[4] += T1;             // e = d + T1 (v[4] holds the old d after the shift)
 94 |             v[0] = T1 + T2;         // a = T1 + T2
 95 |         }
 96 | 
 97 |         for (int j = 0; j < 8; j++)
 98 |             digest[j] += v[j];
 99 |     }
100 | }
106 | 
107 | 
108 | /*
109 |  * QPU counterpart of execute_sha256_cpu: only forwards to SHA256ExecuteQPU.
110 |  * data and stride are not used here.
111 |  */
112 | void execute_sha256_qpu(uint32_t *data, uint32_t *H, int stride)
113 | {
114 |     (void)data;
115 |     (void)stride;
116 |     SHA256ExecuteQPU(H);
117 | }
112 | 
113 | 
114 | /*
115 |  * Usage: sha256 <input file> [-qpu]
116 |  *
117 |  * Reads BUFFER_SIZE lines from the input file, pads each one into a single
118 |  * 512-bit SHA-256 block, hashes them on the CPU (default) or the QPU (-qpu)
119 |  * and prints the resulting H values.
120 |  *
121 |  * Exit codes: 1 usage, 2 QPU code missing, 3 input file missing,
122 |  * 4 bad input data or QPU setup failure.
123 |  */
124 | int main(int argc, char **argv)
125 | {
126 |     unsigned int shader_code[MAX_CODE_SIZE];
127 |     bool run_cpu(true);
128 | 
129 |     if (argc < 2) {
130 |         fprintf(stderr, "Usage: %s <input file> [-qpu]\n", argv[0]);
131 |         return 1;
132 |     }
133 |     if (argc > 2 && (strcmp(argv[2], "-qpu") == 0))
134 |         run_cpu = false;
135 | 
136 |     /* Load the data to hash */
137 |     FILE *f_data = fopen(argv[1], "r");
138 |     if (!f_data) {
139 |         fprintf(stderr, "Unable to open file %s\n", argv[1]);
140 |         return 3;
141 |     }
142 | 
143 |     /* Load the QPU code */
144 |     int code_len = loadQPUCode(QPU_CODE_FILE, shader_code, MAX_CODE_SIZE);
145 |     if (code_len < 1) {
146 |         fprintf(stderr, "Unable to load QPU code from %s\n", QPU_CODE_FILE);
147 |         fclose(f_data);
148 |         return 2;
149 |     }
150 |     printf("Loaded %d bytes of QPU code.\n", code_len);
151 | 
152 |     int nblocks = 1;                    // 1 512-bit block for now
153 |     int stride = nblocks * 16;
154 |     uint32_t *buffer = new uint32_t[BUFFER_SIZE*stride];
155 |     uint32_t *H = new uint32_t[BUFFER_SIZE*8];
156 | 
157 |     uint32_t H0[] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f,
158 |                       0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
159 |     // unsigned so that bytes >= 0x80 are never sign-extended by ENDIAN
160 |     unsigned char filebuffer[64];       // 1 512-bit block
161 |     for (int i=0; i < BUFFER_SIZE; i++)
162 |     {
163 |         memcpy(H+i*8, H0, sizeof(H0));
164 |         memset(filebuffer, 0x0, sizeof(filebuffer));
165 |         // read a line up to 64-bytes long (the newline, if any, is part of the message)
166 |         char *p = fgets((char *)filebuffer, sizeof(filebuffer)-1, f_data);
167 |         if (!p) {
168 |             fprintf(stderr, "Failed to read enough lines from data file.\n");
169 |             fclose(f_data);
170 |             delete[] H;
171 |             delete[] buffer;
172 |             return 4;
173 |         }
174 | 
175 |         size_t bytes = strlen((const char *)filebuffer);
176 |         // message + 0x80 marker + 8 length bytes must fit in 64 bytes; a
177 |         // longer line would be silently overwritten by the length field
178 |         if (bytes > 55) {
179 |             fprintf(stderr, "Line %d is too long for a single 512-bit block "
180 |                             "(max 55 bytes).\n", i + 1);
181 |             fclose(f_data);
182 |             delete[] H;
183 |             delete[] buffer;
184 |             return 4;
185 |         }
186 |         filebuffer[bytes] = 0x80;           // SHA-256 padding
187 | 
188 |         // last 8 bytes are the length of the initial message in bits,
189 |         // most significant byte first (independent of host byte order)
190 |         uint64_t bits = (uint64_t)bytes * 8;
191 |         for (int j=0; j < 8; j++)
192 |             filebuffer[56+j] = (unsigned char)(bits >> (8 * (7-j)));
193 | 
194 |         for (int j=0; j < 16; j++)
195 |             buffer[i*16+j] = ENDIAN(filebuffer, j);
196 |     }
197 |     fclose(f_data);
198 | 
199 |     int handle = SHA256SetupQPU(K, buffer, H, stride, shader_code, code_len);
200 |     if (handle < 0) {
201 |         fprintf(stderr, "Unable to setup QPU.  Check permissions\n");
202 |         delete[] buffer;
203 |         delete[] H;
204 |         return 4;
205 |     }
206 | 
207 |     /*
208 |      * SHA-256 calculation here
209 |      */
210 |     for (int i=0; i < nblocks; i++)
211 |     {
212 |         printf("Running %s version ...\n", (run_cpu) ? "CPU" : "QPU");
213 |         if (run_cpu)
214 |             execute_sha256_cpu(buffer+i*16, H, stride);
215 |         else
216 |             execute_sha256_qpu(buffer+i*16, H, stride);
217 |     }
218 | 
219 |     // the QPU leaves its results in GPU memory; copy them into H
220 |     if (!run_cpu)
221 |         SHA256FetchResult(H);
222 | 
223 |     // print out the H
224 |     for (int i=0; i < BUFFER_SIZE; i++) {
225 |         printf("%02d / SHA-256: ", i);
226 |         for (int j=0; j < 8; j++)
227 |             printf("%08x ", H[i*8+j]);
228 |         printf("\n");
229 |     }
230 | 
231 |     SHA256CleanupQPU(handle);
232 | 
233 |     delete[] buffer;
234 |     delete[] H;
235 |     return 0;
236 | }
211 | 


--------------------------------------------------------------------------------
/QPU/SHA-256/1QPU/test-data.bin:
--------------------------------------------------------------------------------
 1 | this is line number one
 2 | this is line number two
 3 | this is line number three
 4 | this is line number four
 5 | this is line number five
 6 | this is line number six
 7 | this is line number seven
 8 | this is line number eight
 9 | this is line number nine
10 | this is line number ten
11 | this is line number eleven
12 | this is line number twelve
13 | this is line number thirteen
14 | this is line number fourteen
15 | this is line number fifteen
16 | this is line number sixteen
17 | 


--------------------------------------------------------------------------------
/QPU/SHA-256/final/Makefile:
--------------------------------------------------------------------------------
1 | MBOX_C = /opt/vc/src/hello_pi/hello_fft/mailbox.c
2 | MBOX_INC = -I/opt/vc/src/hello_pi/hello_fft
3 | 
4 | # qpufuncs.h is a prerequisite so that header changes trigger a rebuild.
5 | sha256: sha256.cpp qpufuncs.cpp qpufuncs.h
6 | 	g++ -O3 -o sha256 sha256.cpp qpufuncs.cpp $(MBOX_C) $(MBOX_INC)
6 | 


--------------------------------------------------------------------------------
/QPU/SHA-256/final/final.asm:
--------------------------------------------------------------------------------
  1 | define(`MUTEX_ACQUIRE',     `or ra39, ra51, rb39;           nop')    # reads ra51; presumably stalls until the VPM mutex is held
  2 | define(`MUTEX_RELEASE',     `or ra51, ra39, ra39;           nop')    # writes ra51; presumably releases the VPM mutex
  3 | define(`NOP',   `nop ra39, ra39, ra39;      nop rb39, rb39, rb39')    # a nop in both instruction halves, targeting ra39/rb39
  4 | ##
  5 | # Generate a schedule vector.  Call as
  6 | # GENSCHEDULE(W_i-16, W_i-15, W_i-7, W_i-2); the result overwrites the first argument and is also copied to ra1.
  7 | # The arguments need to be ra registers because we use small immediates.
  8 | # Uses temp registers r0 - r3.
  9 | #
 10 | 
 11 | define(`FAKESCHEDULE',
 12 | `   or ra1, $1, $1;                 nop         # ra1 = the W word already held in this register (no new word is computed)
 13 |     bra ra39, ZERO, ra0;                        # branch back through ra0, presumably the return address saved by the brr in the compress iteration
 14 |     NOP
 15 |     NOP
 16 |     NOP')
 17 | 
 18 | define(`GENSCHEDULE',
 19 | `   ## One message-schedule step: W_new = W_i-16 + smsigma0(W_i-15) + W_i-7 + smsigma1(W_i-2), then branch back via ra0
 20 |     ror rb33, $2, 7;                nop         # r1 = RotR(x, 7), x = W_i-15
 21 |     ror rb34, $2, rb6;              nop         # r2 = RotR(x, 18) (rb6 is loaded with 0x12)
 22 |     shr rb35, $2, 3;                nop         # r3 = x >> 3;
 23 |     xor rb33, r1, r2;               v8max ra32, $1, $1         # r1 = r1 ^ r2, r0 = W_i-16
 24 |     xor rb35, r1, r3;               nop         # r3 = r1 ^ r3      (smsigma0(W_i-15))
 25 |     add rb32, r0, r3;               nop         # r0 += r3          (W_i-16 + smsigma0(W_i-15))
 26 |     add rb32, r0, $3;               nop         # r0 += W_i-7
 27 |     ror rb33, $4, rb8;              nop         # r1 = RotR(x, 17), x = W_i-2 (rb8 is loaded with 0x11)
 28 |     ror rb34, $4, rb7;              nop         # r2 = RotR(x, 19) (rb7 is loaded with 0x13)
 29 |     xor rb33, r1, r2;               nop         # r1 = r1 ^ r2
 30 |     shr rb34, $4, 10;               nop         # r2 = x >> 10
 31 |     xor rb33, r1, r2;               nop         # r1 = r1 ^ r2      (smsigma1(W_i-2))
 32 |     add $1, r0, r1;                 nop         # new W replaces W_i-16: r0 + smsigma1(W_i-2)
 33 |     ## move it into another register for reading (the compress iteration adds ra1 into T1)
 34 |     add ra1, r0, r1;                nop
 35 |     ## branch back (ra0)
 36 |     bra ra39, ZERO, ra0
 37 |     NOP
 38 |     NOP
 39 |     NOP
 40 |     ## $2 ignored, $3 ignored, $4 ignored, $1 ignored (suppress warnings)')
 41 | define(`GENSCHEDULE_ALL',
 42 | `   ## 16 schedule steps over the window ra4..ra19; every entry ends with a branch back via ra0, so this presumably serves as a jump table
 43 | GENSCHEDULE(`ra4', `ra5', `ra13', `ra18')
 44 | GENSCHEDULE(`ra5', `ra6', `ra14', `ra19')
 45 | GENSCHEDULE(`ra6', `ra7', `ra15', `ra4')
 46 | GENSCHEDULE(`ra7', `ra8', `ra16', `ra5')
 47 | GENSCHEDULE(`ra8', `ra9', `ra17', `ra6')
 48 | GENSCHEDULE(`ra9', `ra10', `ra18', `ra7')
 49 | GENSCHEDULE(`ra10', `ra11', `ra19', `ra8')
 50 | GENSCHEDULE(`ra11', `ra12', `ra4', `ra9')
 51 | GENSCHEDULE(`ra12', `ra13', `ra5', `ra10')
 52 | GENSCHEDULE(`ra13', `ra14', `ra6', `ra11')
 53 | GENSCHEDULE(`ra14', `ra15', `ra7', `ra12')
 54 | GENSCHEDULE(`ra15', `ra16', `ra8', `ra13')
 55 | GENSCHEDULE(`ra16', `ra17', `ra9', `ra14')
 56 | GENSCHEDULE(`ra17', `ra18', `ra10', `ra15')
 57 | GENSCHEDULE(`ra18', `ra19', `ra11', `ra16')
 58 | GENSCHEDULE(`ra19', `ra4', `ra12', `ra17')')
 59 | define(`FAKESCHEDULE_ALL',
 60 | `   ## One pass-through entry per schedule register ra4..ra19: each copies its register to ra1 and branches back via ra0
 61 | FAKESCHEDULE(`ra4')
 62 | FAKESCHEDULE(`ra5')
 63 | FAKESCHEDULE(`ra6')
 64 | FAKESCHEDULE(`ra7')
 65 | FAKESCHEDULE(`ra8')
 66 | FAKESCHEDULE(`ra9')
 67 | FAKESCHEDULE(`ra10')
 68 | FAKESCHEDULE(`ra11')
 69 | FAKESCHEDULE(`ra12')
 70 | FAKESCHEDULE(`ra13')
 71 | FAKESCHEDULE(`ra14')
 72 | FAKESCHEDULE(`ra15')
 73 | FAKESCHEDULE(`ra16')
 74 | FAKESCHEDULE(`ra17')
 75 | FAKESCHEDULE(`ra18')
 76 | FAKESCHEDULE(`ra19')')
 77 | 
 78 | # move the uniforms into registers (four successive reads of ra32: K, H, data, laps)
 79 | or ra31, ra32, 0;           nop        # address of K in ra31
 80 | or ra30, ra32, 0;           nop        # address of H in ra30
 81 | or rb29, ra32, 0;           nop        # address of data in rb29
 82 | or ra2, ra32, 0;            nop        # number of laps in ra2
 83 | 
 84 | or rb31, ra31, 0;           nop        ## save ra31 (K address) since we overwrite
 85 |                                        ## this in the main loop
 86 | 
 87 | # some rotation constants that don't fit in small immediates (22, 25, 18, 19, 17)
 88 | ldi rb2, 0x16;
 89 | ldi rb5, 0x19;
 90 | ldi rb6, 0x12;
 91 | ldi rb7, 0x13;
 92 | ldi rb8, 0x11;
 93 | 
 94 | mainloop:
 95 |     ## Restore the K texture base address
 96 |     and ra31, rb31, rb31;       nop
 97 | 
 98 | ## Lock the VPM mutex
 99 | MUTEX_ACQUIRE()
100 | 
101 | # VDR DMA read setup for data vectors (16x16)
102 | # MODEW = 0, MPITCH = 3, ROWLEN = 16, NROWS = 16, VPITCH=1, VERT = 0, ADDRXY = 0
103 | ldi ra49, 0x83001000
104 | 
105 | # Move the data vectors into place (0,0 in VPM)
106 | or ra50, rb29, rb29;       nop
107 | 
108 | # wait for the DMA to complete
109 | or rb39, ra50, 0;    nop
110 | 
111 | # read the data vectors into ra4 .. ra19
112 | ldi ra49, 0x1200
113 | or ra4, ra48, 0;        nop
114 | or ra5, ra48, 0;        nop
115 | or ra6, ra48, 0;        nop
116 | or ra7, ra48, 0;        nop
117 | or ra8, ra48, 0;        nop
118 | or ra9, ra48, 0;        nop
119 | or ra10, ra48, 0;        nop
120 | or ra11, ra48, 0;        nop
121 | or ra12, ra48, 0;        nop
122 | or ra13, ra48, 0;        nop
123 | or ra14, ra48, 0;        nop
124 | or ra15, ra48, 0;        nop
125 | or ra16, ra48, 0;        nop
126 | or ra17, ra48, 0;        nop
127 | or ra18, ra48, 0;        nop
128 | or ra19, ra48, 0;        nop
129 | 
130 | # VDR DMA read setup for H vectors (16x8)
131 | # MODEW = 0, MPITCH = 2, ROWLEN = 8, NROWS = 16, VPITCH=1, VERT = 0; NOTE(review): the low bits of the word below are 0, so ADDRXY looks like (0, 0), not (16, 0) -- confirm
132 | ldi ra49, 0x82801000
133 | 
134 | # Move the H vectors into place (the address written is ra30 = H; they appear to land at (0,0) in the VPM, over the data already copied to ra4..ra19 -- confirm)
135 | or ra50, ra30, 0;       nop
136 | 
137 | # wait for the DMA to complete
138 | or rb39, ra50, 0;       nop
139 | 
140 | # configure the VPM for reading the H vectors
141 | ldi ra49, 0x801200
142 | 
143 | # read the H vectors into registers ra20..ra27 (this is a .. h) and rb20..rb27
144 | # (We read them into rb registers so that we can write them back; v8max of a value with itself acts as a copy)
145 | or ra20, ra48, 0;            v8max rb20, ra48, ra48
146 | or ra21, ra48, 0;            v8max rb21, ra48, ra48
147 | or ra22, ra48, 0;            v8max rb22, ra48, ra48
148 | or ra23, ra48, 0;            v8max rb23, ra48, ra48
149 | or ra24, ra48, 0;            v8max rb24, ra48, ra48
150 | or ra25, ra48, 0;            v8max rb25, ra48, ra48
151 | or ra26, ra48, 0;            v8max rb26, ra48, ra48
152 | or ra27, ra48, 0;            v8max rb27, ra48, ra48
153 | 
154 | ## Unlock the VPM mutex
155 | MUTEX_RELEASE()
156 | 
156 | 
157 | 
158 | define(`COMPRESS_ITER',
159 | `   ## One SHA-256 compression round; the rotation of the working variables is folded into the second instruction halves
160 |     ## Compute T1, and T2
161 | 
162 |     # T1 += K[i]
163 |     # move the data address in ra31 (K vector) increment the K[i] and do the
164 |     # texture lookup
165 |     # cannot put the .tmu on add because it is using a small immediate which
166 |     # is a sig as well
167 |     # (prefetching these, see below rb56 and .tmu))
168 |     add ra31, ra31, 4;              nop
169 |     add rb32, r4, ra27;             v8max ra1, rb0, rb0;         # r0 = r4 + h (r4 presumably holds the fetched K[i]); ra1 = rb0
170 | 
171 |     # T1 += W[i]
172 |     ## need another instruction here to avoid the RAW hazard
173 |     add rb0, rb0, ra29;               nop
174 |     ## this is a confusing overload of ra1 but we are running out of registers
175 |     brr ra0, fakeschedule, ra1
176 |     NOP
177 |     NOP
178 |     NOP
179 |     add rb18, r0, ra1;              nop         # T1 (rb18) = r0 + ra1, where ra1 now holds the W word set by the schedule entry
180 | 
181 |     # T1 += CH(e,f,g) = (e & f) ^ (~e & g) (e = ra24, f = ra25, g = ra26)
182 |     or ra32, ra24, 0;           nop         # load e into r0
183 |     and ra33, ra25, r0;         nop         # r1 = r0 & f  (e & f)
184 |     not ra32, r0, 0;            v8max rb56, ra31, ra31         # r0 = ~r0 (~e); rb56 = ra31 (K address, the prefetch noted above)
185 |     and ra32, r0, ra26;         nop         # r0 = r0 & g (~e & g)
186 |     xor ra32, r0, r1;           nop         # r0 = r0 ^ r1 (e & f) ^ (~e & g)
187 |     add rb18, rb18, r0;         v8max ra27, ra26, ra26         # T1 += CH(e,f,g); h = g
188 | 
189 |     # T1 += sigma1(e) = RotR(e, 6) ^ RotR(e, 11) ^ RotR(e, 25)
190 |     ror rb32, ra24, 6;          nop
191 |     ror rb33, ra24, 11;         nop
192 |     ror rb34, ra24, rb5;       nop
193 |     xor rb32, r0, r1;           v8max ra26, ra25, ra25         # g = f
194 |     xor rb32, r0, r2;           v8max ra25, ra24, ra24         # f = e
195 | 
196 |     # T1 += sigma1(e) (rb18 already holds h + K[i] + W[i] + CH(e,f,g))
197 |     add.tmu rb18, r0, rb18;         nop
198 | 
199 |     # T2 = sigma0(a) = RotR(a, 2) ^ RotR(a, 13) ^ RotR(a, 22), a = ra20
200 |     ror ra32, ra20, 2;          nop         # r0 = RotR(a, 2)
201 |     ror ra33, ra20, 13;         nop         # r1 = RotR(a, 13)
202 |     xor ra32, r0, r1;           nop         # r0 = RotR(a, 2) ^ RotR(a, 13)
203 |     ror ra33, ra20, rb2;        nop         # r1 = RotR(a, 22)
204 |     xor ra3, r0, r1;            v8max rb32, ra20, ra20      # T2 = r0 ^ r1, load a into r0
205 | 
206 |     add ra24, ra23, rb18;       nop         # e = d + T1
207 | 
208 |     # T2 += Maj(a,b,c) = (a & b) ^ (a & c) ^ (b & c)
209 |     and ra33, r0, ra21;         nop         # r1 = a & b
210 |     and ra34, r0, ra22;         nop         # r2 = a & c
211 |     xor ra32, r1, r2;           v8max rb33, ra21, ra21      # r0 = r1 ^ r2, load b into r1
212 | 
213 |     and rb33, r1, ra22;         v8max ra23, ra22, ra22         # r1 = b & c; d = c
214 | 
215 |     xor rb32, r0, r1;           v8max ra22, ra21, ra21         # r0 = r0 ^ r1; c = b
216 |     add ra3, ra3, r0;         nop         # T2 += Maj(a,b,c)
217 | 
218 |     or ra21, ra20, 0;          nop         # b = a
219 |     add ra20, rb18, ra3;       nop         # a = T1 + T2
220 | ')
# ---------------------------------------------------------------------------
# COMPRESS_SCHED_ITER: one SHA-256 compression round, with W[i] obtained by
# branching into the genschedule code table.
#
# Same register roles as COMPRESS_ITER (a..h in ra20..ra27, T1 in rb18,
# T2 in ra3, K address in ra31, table offset in rb0, link in ra0, W[i]
# handed back in ra1).  The difference: h + K[i] is parked in rb18 BEFORE
# the branch instead of being left in r0 (presumably because the
# genschedule entries use the accumulators - confirm), and the table offset
# is advanced after the branch returns.
#
# NOTE: the body is an m4 quoted string, so comments inside it must not
# contain quote characters or dollar signs.
# ---------------------------------------------------------------------------
define(`COMPRESS_SCHED_ITER',
`
    ## Compute T1, and T2

    # T1 = h + K[i]
    # Advance the K address in ra31 to the next constant; the lookup for the
    # current round was already issued (see the rb56 write and .tmu below)
    # cannot put the .tmu on add because it is using a small immediate which
    # is a sig as well
    add ra31, ra31, 4;              nop
    # r0 = K[i] (TMU result in r4) + h ; ra1 = current table offset from rb0
    add rb32, r4, ra27;             v8max ra1, rb0, rb0;
    # park h + K[i] in rb18 across the branch
    add rb18, r0, 0;                nop

    # T1 += W[i]
    ## this is a confusing overload of ra1 but we are running out of registers
    ## branch to genschedule + ra1, return address saved in ra0
    brr ra0, genschedule, ra1
    NOP
    NOP
    NOP
    # execution resumes here with W[i] in ra1
    add rb18, rb18, ra1;              nop
    # step the table offset for the next round
    add rb0, rb0, ra29;               nop

    # T1 += CH(e,f,g) = (e & f) ^ (~e & g) (e = ra24, f = ra25, g = ra26
    or ra32, ra24, 0;           nop         # load e into r0
    and ra33, ra25, r0;         nop         # r1 = r0 & f  (e & f)
    not ra32, r0, 0;            v8max rb56, ra31, ra31         # r0 = ~r0 (~e); write next K address to rb56
    and ra32, r0, ra26;         nop         # r0 = r0 & g (~e & g)
    xor ra32, r0, r1;           nop         # r0 = r0 ^ r1 (e & f) ^ (~e & g)
    add rb18, rb18, r0;         v8max ra27, ra26, ra26         # T1 += CH ; h = g

    # T1 += sigma1(e) = RotR(e, 6) ^ RotR(e, 11) ^ RotR(e, 25)
    ror rb32, ra24, 6;          nop
    ror rb33, ra24, 11;         nop
    ror rb34, ra24, rb5;        nop
    xor rb32, r0, r1;           v8max ra26, ra25, ra25         # g = f
    xor rb32, r0, r2;           v8max ra25, ra24, ra24         # f = e

    # T1 = sigma1(e) + (h + K[i] + W[i] + CH(e,f,g)); the .tmu signal issues
    # the K lookup for the address written to rb56 above
    add.tmu rb18, r0, rb18;         nop

    # T2 = sigma0(a), a = ra20
    ror ra32, ra20, 2;          nop         # r0 = RotR(a, 2)
    ror ra33, ra20, 13;         nop         # r1 = RotR(a, 13)
    xor ra32, r0, r1;           nop         # r0 = RotR(a, 2) ^ RotR(a, 13)
    ror ra33, ra20, rb2;        nop         # r1 = RotR(a, 22)
    xor ra3, r0, r1;            v8max rb32, ra20, ra20      # T2 = r0 ^ r1, load a into r0

    # e = d + T1 (ra23 still holds the old d here)
    add ra24, ra23, rb18;       nop

    # T2 += Maj(a,b,c) = (a & b) ^ (a & c) ^ (b & c)
    and ra33, r0, ra21;         nop         # r1 = a & b
    and ra34, r0, ra22;         nop         # r2 = a & c
    xor ra32, r1, r2;           v8max rb33, ra21, ra21      # r0 = r1 ^ r2, load b into r1

    and rb33, r1, ra22;         v8max ra23, ra22, ra22         # r1 = b & c ; d = c

    xor rb32, r0, r1;           v8max ra22, ra21, ra21         # r0 = r0 ^ r1 ; c = b
    add ra3, ra3, r0;         nop         # T2 += Maj(a,b,c)

    or ra21, ra20, 0;          nop         # b = a
    add ra20, rb18, ra3;       nop         # a = T1 + T2
')
284 | 
285 | 
# ---------------------------------------------------------------------------
# The 64 compression rounds: 16 rounds through the fakeschedule table, then
# 3 x 16 rounds through the genschedule table.
#   ra29  amount added to the table offset (rb0) after every round
#   ra28  round counter of the current 16-round pass
#   rb19  number of genschedule passes still to run
# Every brr is followed by three delay-slot instructions (the NOP macro).
# ---------------------------------------------------------------------------
ldi ra29, 40        ## fakeschedule table index increment
## First 16 times use fakeschedule lookups
ldi ra28, 0x10
# prime the K prefetch: write the K address to rb56, then raise the .tmu
# signal on the instruction that also clears the table offset rb0
or rb56, ra31, 0;           nop
xor.tmu rb0, rb0, rb0;          nop
firstloop:
    COMPRESS_ITER()
    sub ra28, ra28, 1;      nop
    # loop back until the 16 rounds are done
    # NOTE(review): the exact condition encoded by .ze is defined by the
    # project assembler, which is not part of this file - confirm
    brr.ze ra39, firstloop
NOP
NOP
NOP

# genschedule entries are larger, so the per-round table step changes
ldi ra29, 144
## Next 48 times (16*3) use genschedule lookups
ldi rb19, 3
outerloop:
    ldi ra28, 0x10
    # re-prime the K prefetch and restart at the first table entry
    or rb56, ra31, 0;           nop
    xor.tmu rb0, rb0, rb0;          nop
    innerloop:
        COMPRESS_SCHED_ITER()
        sub ra28, ra28, 1;      nop
        brr.ze ra39, innerloop
    NOP
    NOP
    NOP

    # rb19 -= 1, with the constant 1 routed through r0
    ldi ra32, 1;
    sub rb19, rb19, r0;          nop
    brr.ze ra39, outerloop
NOP
NOP
NOP
320 | 
321 | 
# ---------------------------------------------------------------------------
# End of a lap: add the working variables a..h to the saved H values, write
# the sums to the VPM and DMA them back to memory at the H address, then
# either start the next lap or terminate the program.
# ---------------------------------------------------------------------------

## Lock the VPM mutex
MUTEX_ACQUIRE()

# configure the VPM to write the H vectors back into place
# (stride=1, vert, Y=16, X=0)
# NOTE(review): the description above is the original author's; it has not
# been checked against the value 0x1200 - confirm
ldi rb49, 0x1200

# write the vectors back (+=): H[j] = working variable (ra2x) + original H (rb2x),
# each write to rb48 stores one vector in the VPM
add rb48, ra20, rb20;           nop
add rb48, ra21, rb21;           nop
add rb48, ra22, rb22;           nop
add rb48, ra23, rb23;           nop
add rb48, ra24, rb24;           nop
add rb48, ra25, rb25;           nop
add rb48, ra26, rb26;           nop
add rb48, ra27, rb27;           nop

# configure the VPM for DMA back to the host
# nrows=16, rowlen=8, 16, 0, horiz=1
ldi rb49, 0x88084000

# write the H address again to store (writing rb50 starts the DMA store)
or rb50, ra30, 0;           nop

# Wait for the DMA to complete (reading rb50 stalls until the store is done)
or rb39, rb50, ra39;       nop ra39, ra39, ra39

## Unlock the VPM mutex
MUTEX_RELEASE()

    # ra2 counts the remaining laps; branch back to mainloop (label defined
    # earlier in the file) until it runs out
    sub ra2, ra2, 1;        nop
    brr.ze ra39, mainloop
NOP
NOP
NOP

# trigger a host interrupt to stop the program (not necessary with direct-exec)
or rb38, ra39, rb39;         nop ra39, ra39, ra39

finished:
# thread-end signal, followed by the two instructions that still execute after it
nop.tend ra39, ra39, ra39;      nop rb39, rb39, rb39
NOP
NOP
366 | 
## schedule code table
## These two labels are the targets of the brr instructions inside
## COMPRESS_SCHED_ITER and COMPRESS_ITER; the round macros add a per-round
## offset (rb0) to them.  The entries themselves come from the
## GENSCHEDULE_ALL / FAKESCHEDULE_ALL macros defined earlier in the file.
## They sit after the thread-end sequence, so they are only ever reached
## through those branches.
genschedule:
GENSCHEDULE_ALL()

fakeschedule:
FAKESCHEDULE_ALL()
373 | 


--------------------------------------------------------------------------------
/QPU/SHA-256/final/qpufuncs.cpp:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <sys/mman.h>
  3 | #include <sys/types.h>
  4 | #include <sys/stat.h>
  5 | #include <fcntl.h>
  6 | #include <inttypes.h>
  7 | #include <string.h>             // memset
  8 | #include <stddef.h>
  9 | #include <unistd.h>
 10 | #include "mailbox.h"
 11 | #include "qpufuncs.h"
 12 | 
// Alternative (direct / uncached) memory settings, kept for reference:
//#define GPU_MEM_FLG 0x4 // cached=0xC; direct=0x4
//#define GPU_MEM_MAP 0x20000000 // cached=0x0; direct=0x20000000

// Flags passed to mem_alloc() and the offset added to the bus address before
// it is handed to mapmem(); the active pair is the "cached" variant.
#define GPU_MEM_FLG     0xC
#define GPU_MEM_MAP     0x0
// Physical address mmap'ed from /dev/mem for the register block
// (only used when DIRECT_EXEC is defined).
#define REGISTER_BASE   0x20C00000

// Indices into the uint32_t register map (i.e. word offsets, not byte
// offsets) used by the DIRECT_EXEC path to queue a QPU program.
#define V3D_SRQPC       0x10c
#define V3D_SRQUA       0x10d
#define V3D_SRQUL       0x10e
#define V3D_SRQCS       0x10f

// Not referenced in this file.
#define V3D_VPMBASE     0x7e

// Not referenced in this file.
#define V3D_L2CACTL     0x8
#define V3D_SLCACTL     0x9

// Define to start the QPU programs by writing the registers directly instead
// of going through the mailbox execute_qpu() call.
//#define DIRECT_EXEC
// Number of uniforms handed to each QPU program instance.
#define NUNIFORMS       4
 32 | 
 33 | 
/*
 * Layout of the GPU memory block shared between the ARM side and the QPUs.
 * SHA256SetupQPU() derives every address handed to the QPU program from the
 * offsets of these members, so the member order and sizes must not change
 * without updating that function.
 */
struct sha256_memory_map
{
    /*
       data layout is:
         64 words for K constants (accessed as a texture lookup), shared
         then, for all QPUs back to back:
           16x8 (128) words per QPU for the 8 H vectors (VPM)
         then, for all QPUs back to back:
           16x16 (256) words per QPU for the input data (VPM)
       Total: 64 + 384 * NUM_QPUS words
     */
    uint32_t data[64 + (128 + 256) * NUM_QPUS];
    uint32_t code[MAX_CODE_SIZE];       // QPU program, shared by all QPUs
    /*
      uniforms are (one group of NUNIFORMS per QPU):
        u1: address of K texture
        u2: address of H vectors (also output location)
        u3: address of data buffer
        u4: number of laps to execute
     */
    uint32_t uniforms[NUNIFORMS*NUM_QPUS];
    uint32_t msg[NUM_QPUS*2];            // msg is a (uniform, code) tuple to execute_qpu

    /* Results are placed back into the H vector */
};
 60 | 
 61 | 
/* Process-wide state created by SHA256SetupQPU and released by SHA256CleanupQPU. */
static struct
{
    int mb;                         // mailbox handle returned by mbox_open()
    unsigned handle;                // GPU memory handle returned by mem_alloc()
    unsigned size;                  // size in bytes of the GPU allocation
    unsigned vc_msg;                // GPU-side address of sha256_memory_map::msg
    unsigned ptr;                   // GPU-side address returned by mem_lock()
    void* arm_ptr;                  // ARM-side mapping of the same memory (mapmem)
    volatile uint32_t *registers;   // register map, only set when DIRECT_EXEC is defined
} sha256_qpu_context;
 72 | 
 73 | 
 74 | int SHA256SetupQPU(uint32_t* K, uint32_t *data, uint32_t *H, int stride,
 75 |                    unsigned *shader_code, unsigned code_len)
 76 | {
 77 |     sha256_qpu_context.mb = mbox_open();
 78 |     if (qpu_enable(sha256_qpu_context.mb, 1)) {
 79 |         fprintf(stderr, "Unable to enable QPU\n");
 80 |         return -1;
 81 |     }
 82 | 
 83 | #ifdef DIRECT_EXEC
 84 |     int mem_dev = open("/dev/mem", O_RDWR|O_SYNC);
 85 |     if (mem_dev == -1) {
 86 |         fprintf(stderr, "Error opening /dev/mem.  Check permissions\n");
 87 |         mbox_close(sha256_qpu_context.mb);
 88 |         return -1;
 89 |     }
 90 |     // close mem_dev
 91 |     // munmap cleanup
 92 | 
 93 |     sha256_qpu_context.registers = (volatile uint32_t*)mmap(NULL, 4096, PROT_READ|PROT_WRITE,
 94 |                                                             MAP_SHARED, mem_dev, REGISTER_BASE);
 95 |     if (sha256_qpu_context.registers == MAP_FAILED) {
 96 |         fprintf(stderr, "mmap failed.\n");
 97 |         close(mem_dev);
 98 |         mbox_close(sha256_qpu_context.mb);
 99 |         return -1;
100 |     }
101 | #endif
102 | 
103 |     // 1 MB should be plenty
104 |     sha256_qpu_context.size = 1024 * 1024;
105 |     sha256_qpu_context.handle = mem_alloc(sha256_qpu_context.mb,
106 |                                           sha256_qpu_context.size, 4096,
107 |                                           GPU_MEM_FLG);
108 |     if (!sha256_qpu_context.handle) {
109 |         fprintf(stderr, "Unable to allocate %d bytes of GPU memory",
110 |                         sha256_qpu_context.size);
111 |         return -2;
112 |     }
113 |     unsigned ptr = mem_lock(sha256_qpu_context.mb, sha256_qpu_context.handle);
114 |     sha256_qpu_context.arm_ptr = mapmem(ptr + GPU_MEM_MAP, sha256_qpu_context.size);
115 |     sha256_qpu_context.ptr = ptr;
116 |     printf("Locked memory at 0x%x = 0x%x\n", ptr, sha256_qpu_context.arm_ptr);
117 | 
118 |     struct sha256_memory_map *arm_map = (struct sha256_memory_map *)
119 |                                                 sha256_qpu_context.arm_ptr;
120 |     memset(arm_map, 0x0, sizeof(struct sha256_memory_map));
121 |     unsigned vc_data = ptr + offsetof(struct sha256_memory_map, data);
122 |     unsigned vc_uniforms = ptr + offsetof(struct sha256_memory_map, uniforms);
123 |     unsigned vc_code = ptr + offsetof(struct sha256_memory_map, code);
124 |     sha256_qpu_context.vc_msg = ptr + offsetof(struct sha256_memory_map, msg);
125 | 
126 |     memcpy(arm_map->code, shader_code, code_len);
127 |     memcpy(arm_map->data, K, 64*sizeof(uint32_t));
128 |     memcpy(arm_map->data+64, H, 128*sizeof(uint32_t)*NUM_QPUS);
129 |     memcpy(arm_map->data+64 + 128*NUM_QPUS, data, 256*NUM_QPUS*sizeof(uint32_t));
130 |     for (int i=0; i < NUM_QPUS; i++) {
131 |         arm_map->uniforms[i*NUNIFORMS+0] = vc_data;         // data (address of K texture)
132 |         arm_map->uniforms[i*NUNIFORMS+1] = vc_data + 64*sizeof(uint32_t) + 128 * i * sizeof(uint32_t);         // address of H vectors
133 |         arm_map->uniforms[i*NUNIFORMS+2] = vc_data + 64*sizeof(uint32_t) + 128*NUM_QPUS*sizeof(uint32_t) + 256 * i * sizeof(uint32_t);
134 |         arm_map->uniforms[i*NUNIFORMS+3] = 20000;           // fill this in in ExecuteQPU
135 |         arm_map->msg[i*2+0] = vc_uniforms + i * NUNIFORMS * sizeof(uint32_t);
136 |         arm_map->msg[i*2+1] = vc_code;
137 |     }
138 |     
139 |     return sha256_qpu_context.mb;
140 | }
141 | 
142 | 
143 | void SHA256ExecuteQPU(uint32_t* H, int nlaps)
144 | {
145 |     struct sha256_memory_map *arm_map = (struct sha256_memory_map *)
146 |                                                 sha256_qpu_context.arm_ptr;
147 |     for (int i=0; i < NUM_QPUS; i++)
148 |         arm_map->uniforms[i*NUNIFORMS+3] = 20000;
149 | 
150 | #ifndef DIRECT_EXEC
151 |     unsigned ret = execute_qpu(sha256_qpu_context.mb, NUM_QPUS,
152 |                                sha256_qpu_context.vc_msg, 1, 10000);
153 |     if (ret != 0)
154 |         fprintf(stderr, "Failed execute_qpu!\n");
155 | #else
156 |     uint32_t qst = sha256_qpu_context.registers[V3D_SRQCS];
157 |     int qlength = qst & 0x3f;
158 |     int qreqs = (qst >> 8) & 0xFF;
159 |     int qcomp = (qst >> 16) & 0xFF;
160 |     int qerr = (qst >> 7) & 0x1;
161 | //    printf("Queue length: %d, completed: %d, requests: %d, err: %d\n", qlength, qcomp, qreqs, qerr);
162 |     int target = (qcomp + NUM_QPUS) % 256;
163 | 
164 |     for (int i=0; i < NUM_QPUS; i++)
165 |     {
166 |         sha256_qpu_context.registers[V3D_SRQUL] = NUNIFORMS;
167 |         sha256_qpu_context.registers[V3D_SRQUA] = arm_map->msg[i*2+0];
168 |         sha256_qpu_context.registers[V3D_SRQPC] = arm_map->msg[i*2+1];
169 |     }
170 | 
171 |     do {
172 |         qst = sha256_qpu_context.registers[V3D_SRQCS];
173 |         qcomp = (qst >> 16) & 0xFF;
174 |     } while (qcomp != target);
175 | //    printf("Queue length: %d, completed: %d, requests: %d, err: %d\n", qlength, qcomp, qreqs, qerr);
176 | #endif
177 | }
178 | 
179 | 
180 | void SHA256CleanupQPU(int handle)
181 | {
182 |     unmapmem(sha256_qpu_context.arm_ptr, sha256_qpu_context.size);
183 |     mem_unlock(sha256_qpu_context.mb, sha256_qpu_context.handle);
184 |     mem_free(sha256_qpu_context.mb, sha256_qpu_context.handle);
185 |     qpu_enable(sha256_qpu_context.mb, 0);
186 |     mbox_close(sha256_qpu_context.mb);
187 | }
188 | 
189 | 
190 | void SHA256FetchResult(uint32_t *H)
191 | {
192 |     struct sha256_memory_map *arm_map = (struct sha256_memory_map *)
193 |                                                 sha256_qpu_context.arm_ptr;
194 |     memcpy(H, arm_map->data+64, NUM_QPUS*128*sizeof(uint32_t));
195 | }
196 | 
197 | 
/*
 * Read an assembled QPU program from a file.
 *
 *   fname   path of the binary file
 *   buffer  destination, room for at least len words
 *   len     capacity of buffer in WORDS (unsigned int), must be > 0
 *
 * Returns the number of BYTES read (whole words only; 0 if the file holds
 * less than one word), or -1 if the arguments are invalid, the file cannot
 * be opened or a read error occurs.
 */
int loadQPUCode(const char *fname, unsigned int* buffer, int len)
{
    if (!fname || !buffer || len <= 0) {
        fprintf(stderr, "Invalid arguments to loadQPUCode.\n");
        return -1;
    }

    // The program is raw binary: open it in binary mode.
    FILE *in = fopen(fname, "rb");
    if (!in) {
        fprintf(stderr, "Failed to open %s.\n", fname);
        return -1;
    }

    size_t items = fread(buffer, sizeof(unsigned int), (size_t)len, in);
    const bool read_failed = ferror(in) != 0;
    fclose(in);

    if (read_failed) {
        fprintf(stderr, "Failed to read %s.\n", fname);
        return -1;
    }

    return (int)(items * sizeof(unsigned int));
}
211 | 


--------------------------------------------------------------------------------
/QPU/SHA-256/final/qpufuncs.h:
--------------------------------------------------------------------------------
 1 | #ifndef _QPUFUNCS_
 2 | #define _QPUFUNCS_
 3 | 
 4 | #define NUM_QPUS        12
 5 | #define MAX_CODE_SIZE   24000
 6 | 
 7 | int SHA256SetupQPU(uint32_t* K, uint32_t *data, uint32_t *H, int stride,
 8 |                    unsigned *shader_code, unsigned code_len);
 9 | int loadQPUCode(const char *fname, unsigned int* buffer, int len);
10 | void SHA256CleanupQPU(int handle);
11 | void SHA256ExecuteQPU(uint32_t* H, int nlaps);
12 | void SHA256FetchResult(uint32_t* H);
13 | volatile uint32_t* getRegisterMap();
14 | 
15 | #endif      // _QPUFUNCS_
16 | 


--------------------------------------------------------------------------------
/QPU/SHA-256/final/sha256.cpp:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <inttypes.h>
  3 | #include <sys/time.h>
  4 | #include <string.h>
  5 | #include <stdlib.h>
  6 | #include "qpufuncs.h"
  7 | 
  8 | #define     QPU_CODE_FILE       "sha256.bin"
  9 | #define     BUFFER_SIZE         NUM_QPUS * 16
 10 | 
 11 | 
// The 64 SHA-256 round constants K[0..63], one per compression round.
// Used directly by the CPU implementation and copied to GPU memory for the
// QPU implementation.
static uint32_t K[] = {
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1,
    0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786,
    0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147,
    0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b,
    0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a,
    0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
};
 25 | 
 26 | static inline uint32_t CH(uint32_t x, uint32_t y, uint32_t z) {
 27 |     return (x & y) ^ (~x & z);
 28 | }
 29 | 
 30 | static inline uint32_t Maj(uint32_t x, uint32_t y, uint32_t z) {
 31 |     return (x & y) ^ (x & z) ^ (y & z);
 32 | }
 33 | 
 34 | static inline uint32_t RotR(uint32_t x, uint8_t shift) {
 35 |     return (x >> shift) | (x << (32-shift));
 36 | }
 37 | 
 38 | static inline uint32_t sigma0(uint32_t x) {
 39 |     return RotR(x, 2) ^ RotR(x, 13) ^ RotR(x, 22);
 40 | }
 41 | 
 42 | static inline uint32_t sigma1(uint32_t x) {
 43 |     return RotR(x, 6) ^ RotR(x, 11) ^ RotR(x, 25);
 44 | }
 45 | 
 46 | static inline uint32_t smsigma0(uint32_t x) {
 47 |     return RotR(x, 7) ^ RotR(x, 18) ^ (x >> 3);
 48 | }
 49 | 
 50 | static inline uint32_t smsigma1(uint32_t x) {
 51 |     return RotR(x, 17) ^ RotR(x, 19) ^ (x >> 10);
 52 | }
 53 | 
 54 | #define ENDIAN(x, i)        ((x[i*4] << 24) | (x[i*4+1] << 16) | (x[i*4+2] << 8) | (x[i*4+3]))
 55 | 
 56 | /*
 57 |  * data is an array of BUFFER_SIZE buffers to hash
 58 |  * H is an input/output parameter
 59 |  * stride is the stride for data (TODO: handle hashes of more than one block)
 60 |  */
 61 | void execute_sha256_cpu(uint32_t * data, uint32_t * H, int stride, int nlaps)
 62 | {
 63 |     uint32_t W[64];
 64 |     uint32_t a, b, c, d, e, f, g, h;
 65 | 
 66 |     for (int lap = 0; lap < nlaps; lap++)
 67 |     {
 68 |         for (int k=0; k < BUFFER_SIZE; k++)
 69 |         {
 70 |             for (int i=0; i < 16; i++)
 71 |                 W[i] = data[k*stride+i];
 72 |             for (int i=16; i < 64; i++)
 73 |                 W[i] = smsigma1(W[i-2]) + W[i-7] + smsigma0(W[i-15]) + W[i-16];
 74 |     
 75 |             a = H[k*8+0];
 76 |             b = H[k*8+1];
 77 |             c = H[k*8+2];
 78 |             d = H[k*8+3];
 79 |             e = H[k*8+4];
 80 |             f = H[k*8+5];
 81 |             g = H[k*8+6];
 82 |             h = H[k*8+7];
 83 |     
 84 |             for (int i=0; i < 64; i++)
 85 |             {
 86 |                 uint32_t T1 = h + sigma1(e) + CH(e,f,g) + K[i] + W[i];
 87 |                 uint32_t T2 = sigma0(a) + Maj(a,b,c);
 88 |                 h = g;
 89 |                 g = f;
 90 |                 f = e;
 91 |                 e = d + T1;
 92 |                 d = c;
 93 |                 c = b;
 94 |                 b = a;
 95 |                 a = T1 + T2;
 96 |             }
 97 |     
 98 |             H[k*8+0] += a;
 99 |             H[k*8+1] += b;
100 |             H[k*8+2] += c;
101 |             H[k*8+3] += d;
102 |             H[k*8+4] += e;
103 |             H[k*8+5] += f;
104 |             H[k*8+6] += g;
105 |             H[k*8+7] += h;
106 |         }
107 |     }
108 | }
109 | 
110 | 
111 | /*
112 |  * data is an array of BUFFER_SIZE buffers to hash
113 |  * H is an input/output parameter
114 |  * stride is the stride for data
115 |  */
116 | void execute_sha256_qpu(uint32_t *data, uint32_t *H, int stride, int nlaps)
117 | {
118 |     SHA256ExecuteQPU(H, nlaps);
119 | }
120 | 
121 | 
122 | int main(int argc, char **argv)
123 | {
124 |     bool run_cpu(true);
125 |     if (argc < 3) {
126 |         fprintf(stderr, "Usage: %s <input file> <nlaps> [-qpu]\n", argv[0]);
127 |         return 1;
128 |     }
129 | 
130 |     unsigned int shader_code[MAX_CODE_SIZE];
131 | 
132 |     /* Load the QPU code */
133 |     int code_len = loadQPUCode(QPU_CODE_FILE, shader_code, MAX_CODE_SIZE);
134 |     if (code_len < 1) {
135 |         fprintf(stderr, "Unable to load QPU code from %s\n", QPU_CODE_FILE);
136 |         return 2;
137 |     }
138 |     printf("Loaded %d bytes of QPU code.\n", code_len);
139 | 
140 |     /* Load the data to hash */
141 |     FILE *f_data = fopen(argv[1], "r");
142 |     if (!f_data) {
143 |         fprintf(stderr, "Unable to open file %s\n", argv[1]);
144 |         return 3;
145 |     }
146 | 
147 |     int nlaps = atoi(argv[2]);
148 |     printf("Running %d laps ...\n", nlaps);
149 | 
150 |     if (argc > 3 && (strcmp(argv[3], "-qpu") == 0))
151 |         run_cpu = false;
152 | 
153 |     int nblocks = 1;                    // 1 512-bit block for now
154 |     int stride = nblocks * 16;
155 |     uint32_t *buffer = new uint32_t[BUFFER_SIZE*stride];
156 |     uint32_t *H = new uint32_t[BUFFER_SIZE*8];
157 | 
158 |     uint32_t H0[] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f,
159 |                       0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
160 |     char filebuffer[64];            // 1 512-bit blocks
161 |     for (int i=0; i < BUFFER_SIZE; i++)
162 |     {
163 |         memcpy(H+i*8, H0, sizeof(H0));
164 |         memset(filebuffer, 0x0, sizeof(filebuffer));
165 |         // read a line up to 64-bytes long
166 |         char *p = fgets(filebuffer, sizeof(filebuffer)-1, f_data);
167 |         if (!p) {
168 |             fprintf(stderr, "Failed to read enough lines from data file.\n");
169 |             delete[] H;
170 |             delete[] buffer;
171 |             return 4;
172 |         }
173 | 
174 |         int bytes = strlen(filebuffer);
175 |         filebuffer[bytes] = 0x80;           // SHA-256 padding
176 | 
177 |         // last 8 bytes are the length of the initial message in bits
178 |         uint8_t len_buffer[8];
179 |         *(uint64_t *)len_buffer = bytes * 8;
180 |         for (int j=0; j < 8; j++)
181 |             filebuffer[56+j] = len_buffer[7-j];
182 | 
183 |         for (int j=0; j < 16; j++)
184 |             buffer[i*16+j] = ENDIAN(filebuffer, j);
185 |     }
186 | 
187 |     int handle = SHA256SetupQPU(K, buffer, H, stride, shader_code, code_len);
188 |     if (handle < 0) {
189 |         fprintf(stderr, "Unable to setup QPU.  Check permissions\n");
190 |         delete[] buffer;
191 |         delete[] H;
192 |         return 4;
193 |     }
194 | 
195 |     struct timeval start, end;
196 |     gettimeofday(&start, NULL);
197 | 
198 |     /*
199 |      * SHA-256 calculation here
200 |      */
201 |     printf("Executing %s version ...\n", run_cpu ? "CPU" : "QPU");
202 |     for (int i=0; i < nblocks; i++)
203 |     {
204 |         if (run_cpu)
205 |             execute_sha256_cpu(buffer+i*16, H, stride, nlaps);
206 |         else
207 |             execute_sha256_qpu(buffer+i*16, H, stride, nlaps);
208 |     }
209 | 
210 |     gettimeofday(&end, NULL);
211 | 
212 |     float elapsed = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_usec - start.tv_usec);
213 |     elapsed /= 1000.0;
214 | 
215 |     // print out the H
216 |     if (!run_cpu) SHA256FetchResult(H);
217 | 
218 |     for (int i=0; i < BUFFER_SIZE; i++) {
219 |         printf("%02d / SHA-256: ", i);
220 |         for (int j=0; j < 8; j++)
221 |             printf("%08x ", H[i*8+j]);
222 |         printf("\n");
223 |     }
224 | 
225 |     printf("Time in ms: %f = %f hash/s\n", elapsed, BUFFER_SIZE * nlaps * 1000.0/elapsed);
226 | 
227 |     SHA256CleanupQPU(handle);
228 | 
229 |     delete[] buffer;
230 |     delete[] H;
231 | }
232 | 


--------------------------------------------------------------------------------
/QPU/SHA-256/final/test-data.bin:
--------------------------------------------------------------------------------
  1 | Line #1
  2 | Line #2
  3 | Line #3
  4 | Line #4
  5 | Line #5
  6 | Line #6
  7 | Line #7
  8 | Line #8
  9 | Line #9
 10 | Line #10
 11 | Line #11
 12 | Line #12
 13 | Line #13
 14 | Line #14
 15 | Line #15
 16 | Line #16
 17 | Line #17
 18 | Line #18
 19 | Line #19
 20 | Line #20
 21 | Line #21
 22 | Line #22
 23 | Line #23
 24 | Line #24
 25 | Line #25
 26 | Line #26
 27 | Line #27
 28 | Line #28
 29 | Line #29
 30 | Line #30
 31 | Line #31
 32 | Line #32
 33 | Line #33
 34 | Line #34
 35 | Line #35
 36 | Line #36
 37 | Line #37
 38 | Line #38
 39 | Line #39
 40 | Line #40
 41 | Line #41
 42 | Line #42
 43 | Line #43
 44 | Line #44
 45 | Line #45
 46 | Line #46
 47 | Line #47
 48 | Line #48
 49 | Line #49
 50 | Line #50
 51 | Line #51
 52 | Line #52
 53 | Line #53
 54 | Line #54
 55 | Line #55
 56 | Line #56
 57 | Line #57
 58 | Line #58
 59 | Line #59
 60 | Line #60
 61 | Line #61
 62 | Line #62
 63 | Line #63
 64 | Line #64
 65 | Line #65
 66 | Line #66
 67 | Line #67
 68 | Line #68
 69 | Line #69
 70 | Line #70
 71 | Line #71
 72 | Line #72
 73 | Line #73
 74 | Line #74
 75 | Line #75
 76 | Line #76
 77 | Line #77
 78 | Line #78
 79 | Line #79
 80 | Line #80
 81 | Line #81
 82 | Line #82
 83 | Line #83
 84 | Line #84
 85 | Line #85
 86 | Line #86
 87 | Line #87
 88 | Line #88
 89 | Line #89
 90 | Line #90
 91 | Line #91
 92 | Line #92
 93 | Line #93
 94 | Line #94
 95 | Line #95
 96 | Line #96
 97 | Line #97
 98 | Line #98
 99 | Line #99
100 | Line #100
101 | Line #101
102 | Line #102
103 | Line #103
104 | Line #104
105 | Line #105
106 | Line #106
107 | Line #107
108 | Line #108
109 | Line #109
110 | Line #110
111 | Line #111
112 | Line #112
113 | Line #113
114 | Line #114
115 | Line #115
116 | Line #116
117 | Line #117
118 | Line #118
119 | Line #119
120 | Line #120
121 | Line #121
122 | Line #122
123 | Line #123
124 | Line #124
125 | Line #125
126 | Line #126
127 | Line #127
128 | Line #128
129 | Line #129
130 | Line #130
131 | Line #131
132 | Line #132
133 | Line #133
134 | Line #134
135 | Line #135
136 | Line #136
137 | Line #137
138 | Line #138
139 | Line #139
140 | Line #140
141 | Line #141
142 | Line #142
143 | Line #143
144 | Line #144
145 | Line #145
146 | Line #146
147 | Line #147
148 | Line #148
149 | Line #149
150 | Line #150
151 | Line #151
152 | Line #152
153 | Line #153
154 | Line #154
155 | Line #155
156 | Line #156
157 | Line #157
158 | Line #158
159 | Line #159
160 | Line #160
161 | Line #161
162 | Line #162
163 | Line #163
164 | Line #164
165 | Line #165
166 | Line #166
167 | Line #167
168 | Line #168
169 | Line #169
170 | Line #170
171 | Line #171
172 | Line #172
173 | Line #173
174 | Line #174
175 | Line #175
176 | Line #176
177 | Line #177
178 | Line #178
179 | Line #179
180 | Line #180
181 | Line #181
182 | Line #182
183 | Line #183
184 | Line #184
185 | Line #185
186 | Line #186
187 | Line #187
188 | Line #188
189 | Line #189
190 | Line #190
191 | Line #191
192 | Line #192
193 | 


--------------------------------------------------------------------------------
/QPU/SHA-256/partial/Makefile:
--------------------------------------------------------------------------------
# Mailbox helper sources shipped with the Raspberry Pi hello_fft example.
MBOX_C = /opt/vc/src/hello_pi/hello_fft/mailbox.c
MBOX_INC = -I/opt/vc/src/hello_pi/hello_fft

# qpufuncs.h is listed as a prerequisite so that editing the header
# (e.g. NUM_QPUS) triggers a rebuild.
sha256: sha256.cpp qpufuncs.cpp qpufuncs.h
	g++ -O3 -o sha256 sha256.cpp qpufuncs.cpp $(MBOX_C) $(MBOX_INC)
6 | 


--------------------------------------------------------------------------------
/QPU/SHA-256/partial/partial.asm:
--------------------------------------------------------------------------------
# NOP: one full instruction in which neither ALU does any work.  Both halves
# read and write register 39, which the rest of this file uses as a
# discard target.  Used to fill the slots after branches and after the
# thread-end signal.
define(`NOP', `nop ra39, ra39, ra39;  nop rb39, rb39, rb39')
  2 | 
## Move the uniforms (arguments) into registers
## (each read of ra32 delivers the next uniform)
or ra31, ra32, 0;           nop         # address of K in ra31
or ra30, ra32, 0;           nop         # address of H in ra30
or ra29, ra32, 0;           nop         # address of data in ra29

## Load some rotation constants that don't fit in small immediates
ldi rb2, 0x16;                          # 22, used by sigma0(a)
ldi rb5, 0x19;                          # 25, used by sigma1(e)

## VCD DMA setup for the H vectors (16x8)
ldi ra49, 0x82801000

## Move the H vectors into the VPM (0,0 in VPM)
## (writing the source address to ra50 starts the DMA load)
or ra50, ra30, 0;           nop

## Wait for the DMA to complete (the value read from ra50 is discarded)
and rb39, ra50, ra50;           nop

## Configure the VPM for reading the H vectors
ldi ra49, 0x801200

## Read the H vectors into registers ra20..ra27 (these are the a..h)
## Also copy them into rb20..rb27 (we need the original values to write back)
## Each instruction reads ra48 (VPM read) once and feeds both ALUs.
or ra20, ra48, 0;           v8max rb20, ra48, ra48;
or ra21, ra48, 0;           v8max rb21, ra48, ra48;
or ra22, ra48, 0;           v8max rb22, ra48, ra48;
or ra23, ra48, 0;           v8max rb23, ra48, ra48;
or ra24, ra48, 0;           v8max rb24, ra48, ra48;
or ra25, ra48, 0;           v8max rb25, ra48, ra48;
or ra26, ra48, 0;           v8max rb26, ra48, ra48;
or ra27, ra48, 0;           v8max rb27, ra48, ra48;

## Configure the VPM/VCD to read the data vectors
ldi ra49, 0x83001000
or ra50, ra29, 0;           nop         ## Load address to DMA
or rb39, ra50, 0;           nop         ## Wait for it

## Configure VPM reads for the loop below, which takes one W[i] vector per
## round from rb48
ldi ra49, 0x1200
 41 | 
## First 16 loops of compression
## (this "partial" version stops after these 16 rounds; no message schedule
## is generated)
## Register roles: ra20..ra27 = a..h, rb18 = T1, ra3 = T2, ra31 = address of
## K[i], ra2 = loop counter; ra32/rb32, ra33/rb33, ra34/rb34 write the
## accumulators r0, r1, r2.
ldi ra2, 0x10;
compress:
    ## r0 = K[i] + h
    or rb56, ra31, 0;       nop         # write the K[i] address to rb56 for the lookup
    nop.tmu ra39, ra39, ra39;   nop     # .tmu signal: fetch the result into r4
    add ra31, ra31, 4;      nop         # advance to the next K word
    add rb32, r4, ra27;     nop         # r0 = K[i] + h

    ## T1 = h + K[i] + W[i]   (W[i] is read from the VPM through rb48)
    add rb18, r0, rb48;     nop

    ## T1 += CH(e,f,g) => (e & f) ^ (~e & g) (e: ra24, f: ra25, g: ra26)
    or ra32, ra24, 0;       nop         # load e into r0
    and ra33, ra25, r0;     nop         # r1 = r0 & f   (e & f)
    not ra32, r0, 0;        nop         # r0 = ~r0      (~e)
    and ra32, r0, ra26;     nop         # r0 = r0 & g   (~e & g)
    xor ra32, r0, r1;       nop         # r0 = r0 ^ r1  (e & f) ^ (~e & g)
    add rb18, rb18, r0;     nop         # accumulate into T1

    ## T1 += sigma1(e) => RotR(e, 6) ^ RotR(e, 11) ^ RotR(e, 25)
    ror rb32, ra24, 6;      nop         # r0 = RotR(e, 6)
    ror rb33, ra24, 11;     nop         # r1 = RotR(e, 11)
    ror rb34, ra24, rb5;    nop         # r2 = RotR(e, 25)
    xor rb32, r0, r1;       nop
    xor rb32, r0, r2;       nop
    add rb18, r0, rb18;     nop         # accumulate into T1

    ## T2 (ra3) = sigma0(a)  (a: ra20)
    ror ra32, ra20, 2;      nop         # r0 = RotR(a, 2)
    ror ra33, ra20, 13;     nop         # r1 = RotR(a, 13)
    xor ra32, r0, r1;       nop         # r0 = RotR(a, 2) ^ RotR(a, 13)
    ror ra33, ra20, rb2;    nop         # r1 = RotR(a, 22)
    xor ra3, r0, r1;        nop         # T2 = sigma0(a)

    ## T2 += Maj(a,b,c)
    or ra32, ra20, 0;       nop         # load a into r0
    and ra33, r0, ra21;     nop         # r1 = a & b
    and ra34, r0, ra22;     nop         # r2 = a & c
    xor ra32, r1, r2;       nop         # r0 = (a & b) ^ (a & c)
    or ra33, ra21, 0;       nop         # load b into r1
    and ra33, r1, ra22;     nop         # r1 = b & c
    xor ra32, r0, r1;       nop         # r0 = r0 ^ r1
    add ra3, ra3, r0;       nop         # T2 += Maj(a,b,c)

    ## swizzle: rotate the working variables for the next round
    or ra27, ra26, 0;       nop         # h = g
    or ra26, ra25, 0;       nop         # g = f
    or ra25, ra24, 0;       nop         # f = e
    add ra24, ra23, rb18;   nop         # e = d + T1
    or ra23, ra22, 0;       nop         # d = c
    or ra22, ra21, 0;       nop         # c = b
    or ra21, ra20, 0;       nop         # b = a
    add ra20, rb18, ra3;    nop         # a = T1 + T2

    ## Loop: count ra2 down and branch back to compress; the three NOPs
    ## fill the slots that still execute after the branch
    sub ra2, ra2, 1;        nop
    brr.ze ra39, compress
NOP
NOP
NOP
103 | 
104 | ## Configure the VPM to write the H vectors back into place (rb49 takes the write-setup word)
105 | ldi rb49, 0x1200
106 | 
107 | ## Write H vectors back (+=): each rb48 write stores one updated vector
108 | add rb48, ra20, rb20;       nop         # a + rb20 (rb20-rb27 presumably hold the H values loaded before the loop)
109 | add rb48, ra21, rb21;       nop         # b + rb21
110 | add rb48, ra22, rb22;       nop         # c + rb22
111 | add rb48, ra23, rb23;       nop         # d + rb23
112 | add rb48, ra24, rb24;       nop         # e + rb24
113 | add rb48, ra25, rb25;       nop         # f + rb25
114 | add rb48, ra26, rb26;       nop         # g + rb26
115 | add rb48, ra27, rb27;       nop         # h + rb27
116 | 
117 | ## Configure the VCD for DMA back to the host
118 | ldi rb49, 0x88084000
119 | 
120 | ## Write the H address to store (taken from ra30); this presumably starts the DMA store
121 | or rb50, ra30, 0;           nop
122 | 
123 | ## Wait for the DMA to complete (rb50 is read back; the result is dropped into rb39)
124 | or rb39, rb50, ra39;        nop
125 | 
126 | ## Trigger a host interrupt to finish the program
127 | or rb38, ra39, rb39;        nop
128 | ## End of program: an instruction carrying the tend signal, followed by two NOPs
129 | nop.tend ra39, ra39, ra39;  nop rb39, rb39, rb39
130 | NOP
131 | NOP
132 | 


--------------------------------------------------------------------------------
/QPU/SHA-256/partial/qpufuncs.cpp:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <sys/mman.h>
  3 | #include <sys/types.h>
  4 | #include <sys/stat.h>
  5 | #include <fcntl.h>
  6 | #include <inttypes.h>
  7 | #include <string.h>             // memset
  8 | #include <stddef.h>
  9 | #include <unistd.h>
 10 | #include "mailbox.h"
 11 | #include "qpufuncs.h"
 12 | 
 13 | #define GPU_MEM_FLG     0xC         // cached
 14 | #define GPU_MEM_MAP     0x0         // cached
 15 | #define NUNIFORMS       3
 16 | 
 17 | 
 18 | struct sha256_memory_map                // layout of the GPU buffer that SHA256SetupQPU maps and fills
 19 | {
 20 |     /*
 21 |        data layout is:
 22 |          64 words for K constants (accessed as a texture lookup)
 23 |          16x8 (128) words per QPU for the 8 H vectors (VPM)
 24 |          16x16 (256) words per QPU for the input data (VPM)
 25 |        Total: 64 + 384 * NUM_QPUS words (448 for one QPU); all H blocks come before all input blocks
 26 |      */
 27 |     uint32_t data[64 + (128 + 256) * NUM_QPUS];
 28 |     uint32_t code[MAX_CODE_SIZE];        // assembled QPU program; every msg entry points at it
 29 |     /*
 30 |       uniforms (NUNIFORMS per QPU) are:
 31 |         u1: address of K texture
 32 |         u2: address of H vectors (also output location)
 33 |         u3: address of data buffer
 34 |         (the stride is not passed as a uniform)
 35 |      */
 36 |     uint32_t uniforms[NUNIFORMS * NUM_QPUS];
 37 |     uint32_t msg[NUM_QPUS*2];            // msg is a (uniform, code) tuple to execute_qpu
 38 | 
 39 |     /* results are placed back into data where the H vectors were read from */
 40 | };
 41 | 
 42 | 
 43 | static struct                       // state shared by the SHA256*QPU functions in this file
 44 | {
 45 |     int mb;                         // mailbox handle returned by mbox_open
 46 |     unsigned handle;                // GPU memory handle returned by mem_alloc
 47 |     unsigned size;                  // size of that allocation in bytes
 48 |     unsigned vc_msg;                // GPU-side address of the msg array, passed to execute_qpu
 49 |     unsigned ptr;                   // GPU-side base address returned by mem_lock
 50 |     void* arm_ptr;                  // host-side mapping of the allocation (from mapmem)
 51 | } sha256_qpu_context;
 52 | 
 53 | 
 54 | int SHA256SetupQPU(uint32_t* K, uint32_t *data, uint32_t *H, int stride,
 55 |                    unsigned *shader_code, unsigned code_len)
 56 | {
 57 |     // Enable the QPU, allocate and map one GPU buffer laid out as sha256_memory_map, and stage K, H, the input
 58 |     // blocks, the shader (code_len bytes), the per-QPU uniforms and the execute_qpu messages; stride is unused.
 59 |     // Returns the mailbox handle, or -1 (QPU not enabled), -2 (no GPU memory), -3 (shader too large), after undoing earlier steps.
 60 |     if (code_len > MAX_CODE_SIZE * sizeof(uint32_t)) {      // would overrun sha256_memory_map::code
 61 |         fprintf(stderr, "QPU code is too large (%u bytes)\n", code_len);
 62 |         return -3;
 63 |     }
 64 |     sha256_qpu_context.mb = mbox_open();
 65 |     if (qpu_enable(sha256_qpu_context.mb, 1)) {
 66 |         fprintf(stderr, "Unable to enable QPU\n");
 67 |         mbox_close(sha256_qpu_context.mb);
 68 |         return -1;
 69 |     }
 70 |     sha256_qpu_context.size = 1024 * 1024;                  // 1 MB should be plenty
 71 |     sha256_qpu_context.handle = mem_alloc(sha256_qpu_context.mb, sha256_qpu_context.size, 4096, GPU_MEM_FLG);
 72 |     if (!sha256_qpu_context.handle) {
 73 |         fprintf(stderr, "Unable to allocate %u bytes of GPU memory\n", sha256_qpu_context.size);
 74 |         qpu_enable(sha256_qpu_context.mb, 0);
 75 |         mbox_close(sha256_qpu_context.mb);
 76 |         return -2;
 77 |     }
 78 |     unsigned ptr = mem_lock(sha256_qpu_context.mb, sha256_qpu_context.handle);
 79 |     sha256_qpu_context.arm_ptr = mapmem(ptr + GPU_MEM_MAP, sha256_qpu_context.size);
 80 |     sha256_qpu_context.ptr = ptr;
 81 |     struct sha256_memory_map *arm_map = (struct sha256_memory_map *)sha256_qpu_context.arm_ptr;
 82 |     memset(arm_map, 0x0, sizeof(struct sha256_memory_map));
 83 |     unsigned vc_data = ptr + offsetof(struct sha256_memory_map, data);             // GPU-side addresses of the members
 84 |     unsigned vc_uniforms = ptr + offsetof(struct sha256_memory_map, uniforms);
 85 |     unsigned vc_code = ptr + offsetof(struct sha256_memory_map, code);
 86 |     sha256_qpu_context.vc_msg = ptr + offsetof(struct sha256_memory_map, msg);
 87 |     memcpy(arm_map->code, shader_code, code_len);
 88 |     memcpy(arm_map->data, K, 64*sizeof(uint32_t));
 89 |     memcpy(arm_map->data+64, H, 128*sizeof(uint32_t)*NUM_QPUS);
 90 |     memcpy(arm_map->data+64 + 128*NUM_QPUS, data, 256*NUM_QPUS*sizeof(uint32_t));
 91 |     for (int i=0; i < NUM_QPUS; i++) {
 92 |         arm_map->uniforms[i*NUNIFORMS+0] = vc_data;         // data (address of K texture)
 93 |         arm_map->uniforms[i*NUNIFORMS+1] = vc_data + 64*sizeof(uint32_t) + 128 * i * sizeof(uint32_t);         // address of H vectors
 94 |         arm_map->uniforms[i*NUNIFORMS+2] = vc_data + 64*sizeof(uint32_t) + 128*NUM_QPUS*sizeof(uint32_t) + 256 * i * sizeof(uint32_t);   // address of this QPU's input block
 95 |         arm_map->msg[i*2+0] = vc_uniforms + i * NUNIFORMS * sizeof(uint32_t);
 96 |         arm_map->msg[i*2+1] = vc_code;
 97 |     }
 98 |     return sha256_qpu_context.mb;
 99 | }
100 | 
101 | 
102 | void SHA256ExecuteQPU(uint32_t* H)
103 | {
104 |     // Run NUM_QPUS program instances from the message list staged by SHA256SetupQPU; H is not used here.
105 |     const unsigned status = execute_qpu(sha256_qpu_context.mb, NUM_QPUS,
106 |                                         sha256_qpu_context.vc_msg, 1, 10000);
107 |     if (status) fprintf(stderr, "Failed execute_qpu!\n");
108 | }
109 | 
110 | 
111 | void SHA256CleanupQPU(int handle)                                   // handle is unused; the saved context is torn down instead
112 | {
113 |     unmapmem(sha256_qpu_context.arm_ptr, sha256_qpu_context.size);  // drop the host mapping first
114 |     mem_unlock(sha256_qpu_context.mb, sha256_qpu_context.handle);   // then undo mem_lock
115 |     mem_free(sha256_qpu_context.mb, sha256_qpu_context.handle);     // and mem_alloc
116 |     qpu_enable(sha256_qpu_context.mb, 0);                           // switch the QPU back off
117 |     mbox_close(sha256_qpu_context.mb);                              // the mailbox goes last: every call above needs it
118 | }
119 | 
120 | 
121 | void SHA256FetchResult(uint32_t *H)
122 | {
123 |     // The H vectors sit right after the 64 K words in the mapped buffer; copy all NUM_QPUS*128 words out to H.
124 |     const uint32_t *h_vectors = static_cast<struct sha256_memory_map *>(sha256_qpu_context.arm_ptr)->data + 64;
125 |     memcpy(H, h_vectors, sizeof(uint32_t) * 128 * NUM_QPUS);
126 | }
127 | 
128 | 
129 | int loadQPUCode(const char *fname, unsigned int* buffer, int len)
130 | {
131 |     // Read up to len 32-bit words of assembled QPU code from fname (opened in binary mode) into buffer.
132 |     // Returns the number of bytes stored (whole words only), or -1 if an argument is unusable or the file cannot be opened.
133 |     FILE *in = (fname && buffer && len > 0) ? fopen(fname, "rb") : NULL;
134 |     if (!in) {
135 |         fprintf(stderr, "Failed to open %s.\n", fname ? fname : "(null)");
136 |         return -1;
137 |     }
138 |     size_t items = fread(buffer, sizeof(unsigned int), (size_t)len, in);
139 |     fclose(in);
140 |     return (int)(items * sizeof(unsigned int));
141 | }
142 | 


--------------------------------------------------------------------------------
/QPU/SHA-256/partial/qpufuncs.h:
--------------------------------------------------------------------------------
 1 | #ifndef _QPUFUNCS_
 2 | #define _QPUFUNCS_
 3 | #include <stdint.h>                     /* uint32_t, so this header compiles on its own */
 4 | #define NUM_QPUS        1               /* QPU program instances launched per run */
 5 | #define MAX_CODE_SIZE   24000           /* in words */
 6 | /* Stage K, the H vectors, the input blocks and the shader in GPU memory; negative on failure. */
 7 | int SHA256SetupQPU(uint32_t* K, uint32_t *data, uint32_t *H, int stride,
 8 |                    unsigned *shader_code, unsigned code_len);
 9 | int loadQPUCode(const char *fname, unsigned int* buffer, int len);     /* bytes read, or -1 */
10 | void SHA256CleanupQPU(int handle);      /* release what SHA256SetupQPU acquired */
11 | void SHA256ExecuteQPU(uint32_t* H);     /* run the staged program */
12 | void SHA256FetchResult(uint32_t* H);    /* copy the H vectors back out of GPU memory */
13 | 
14 | #endif      // _QPUFUNCS_
15 | 


--------------------------------------------------------------------------------
/QPU/SHA-256/partial/sha256.cpp:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <inttypes.h>
  3 | #include <sys/time.h>
  4 | #include <string.h>
  5 | #include <stdlib.h>
  6 | #include "qpufuncs.h"
  7 | 
  8 | #define QPU_CODE_FILE   "sha256.bin"
  9 | #define NUM_QPUS        1
 10 | #define BUFFER_SIZE     NUM_QPUS * 16
 11 | 
 12 | static uint32_t K[] = {         // the 64 SHA-256 round constants; K[i] is added into T1 in round i
 13 |     0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1,
 14 |     0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
 15 |     0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786,
 16 |     0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
 17 |     0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147,
 18 |     0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
 19 |     0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b,
 20 |     0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
 21 |     0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a,
 22 |     0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
 23 |     0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
 24 | };
 25 | 
 26 | static inline uint32_t CH(uint32_t x, uint32_t y, uint32_t z) {
 27 |     return z ^ (x & (y ^ z));       // "choose": each bit of x selects y (1) or z (0); equals (x & y) ^ (~x & z)
 28 | }
 29 | 
 30 | static inline uint32_t Maj(uint32_t x, uint32_t y, uint32_t z) {
 31 |     return (x & y) | (z & (x | y)); // bitwise majority vote; equals (x & y) ^ (x & z) ^ (y & z)
 32 | }
 33 | 
 34 | static inline uint32_t RotR(uint32_t x, uint8_t shift) {
 35 |     return (x >> (shift & 31)) | (x << ((32 - shift) & 31));    // rotate right; counts are masked so a shift of 0 or 32 never shifts by 32 (undefined)
 36 | }
 37 | 
 38 | static inline uint32_t sigma0(uint32_t x) {
 39 |     return RotR(x ^ RotR(x, 11) ^ RotR(x, 20), 2);      // = RotR(x,2) ^ RotR(x,13) ^ RotR(x,22): rotation distributes over xor
 40 | }
 41 | 
 42 | static inline uint32_t sigma1(uint32_t x) {
 43 |     return RotR(x ^ RotR(x, 5) ^ RotR(x, 19), 6);       // = RotR(x,6) ^ RotR(x,11) ^ RotR(x,25): rotation distributes over xor
 44 | }
 45 | 
 46 | static inline uint32_t smsigma0(uint32_t x) {
 47 |     return (x >> 3) ^ RotR(x ^ RotR(x, 11), 7);         // = RotR(x,7) ^ RotR(x,18) ^ (x >> 3)
 48 | }
 49 | 
 50 | static inline uint32_t smsigma1(uint32_t x) {
 51 |     return (x >> 10) ^ RotR(x ^ RotR(x, 2), 17);        // = RotR(x,17) ^ RotR(x,19) ^ (x >> 10)
 52 | }
 53 | // Big-endian 32-bit word i of byte buffer x; each byte goes through uint8_t so that chars >= 0x80 do not sign-extend.
 54 | #define ENDIAN(x, i)        (((uint32_t)(uint8_t)(x)[(i)*4] << 24) | ((uint32_t)(uint8_t)(x)[(i)*4+1] << 16) | ((uint32_t)(uint8_t)(x)[(i)*4+2] << 8) | ((uint32_t)(uint8_t)(x)[(i)*4+3]))
 55 | 
 56 | /*
 57 |  * data is an array of BUFFER_SIZE buffers to hash
 58 |  * H is an input/output parameter
 59 |  * stride is the stride for data (TODO: handle hashes of more than one block)
 60 |  */
 61 | void execute_sha256_cpu(uint32_t *data, uint32_t *H, int stride)
 62 | {
 63 |     // CPU counterpart of the QPU program.  For each of the BUFFER_SIZE
 64 |     // messages: expand the 16-word block at data[msg*stride] into the
 65 |     // schedule W, run the first 16 compression rounds over the eight working
 66 |     // variables, and add the result into that message's state H[msg*8..].
 67 |     uint32_t W[64];
 68 |     uint32_t v[8];                      // working variables a..h in v[0]..v[7]
 69 | 
 70 |     for (int msg = 0; msg < BUFFER_SIZE; ++msg)
 71 |     {
 72 |         const uint32_t *block = data + msg*stride;
 73 |         uint32_t *digest = H + msg*8;
 74 | 
 75 |         // Message schedule: the block itself, then the SHA-256 recurrence.
 76 |         for (int t = 0; t < 64; ++t)
 77 |         {
 78 |             if (t < 16)
 79 |                 W[t] = block[t];
 80 |             else
 81 |                 W[t] = smsigma1(W[t-2]) + W[t-7] + smsigma0(W[t-15]) + W[t-16];
 82 |         }
 83 | 
 84 |         // Start from the current hash state.
 85 |         for (int j = 0; j < 8; ++j)
 86 |             v[j] = digest[j];
 87 | 
 88 |         /*
 89 |          * NOTE: Only the first 16 rounds are run, so that the compression
 90 |          * loop can be checked for correctness without depending on schedule
 91 |          * generation (W[16..63] is computed above but never consumed).
 92 |          */
 93 |         for (int t = 0; t < 16; ++t)
 94 |         {
 95 |             const uint32_t T1 = v[7] + sigma1(v[4]) + CH(v[4], v[5], v[6]) + K[t] + W[t];
 96 |             const uint32_t T2 = sigma0(v[0]) + Maj(v[0], v[1], v[2]);
 97 | 
 98 |             // Move every variable one slot along (h=g, g=f, ..., b=a), then
 99 |             // patch in the two computed values.
100 |             for (int j = 7; j > 0; --j)
101 |                 v[j] = v[j-1];
102 |             v[4] += T1;                 // e = d + T1
103 |             v[0] = T1 + T2;             // a = T1 + T2
104 |         }
105 | 
106 |         // Fold the working variables back into the hash state.
107 |         for (int j = 0; j < 8; ++j)
108 |             digest[j] += v[j];
109 |     }
110 | }
111 | 
112 | 
113 | void execute_sha256_qpu(uint32_t *data, uint32_t *H, int stride) {
114 |     (void)data; (void)stride;       // not needed here: SHA256SetupQPU already staged the input
115 |     SHA256ExecuteQPU(H);
116 | }
117 | 
118 | 
119 | int main(int argc, char **argv)
120 | {
121 |     // Hash the first BUFFER_SIZE lines of <input file> (one 512-bit block per line) on the CPU,
122 |     // or on the QPU when -qpu is given, and print each resulting H.
123 |     // Exit status: 1 usage, 2 QPU code not loadable, 3 input not openable, 4 bad input or QPU setup failure.
124 |     unsigned int shader_code[MAX_CODE_SIZE];
125 |     bool run_cpu(true);
126 | 
127 |     if (argc < 2) {
128 |         fprintf(stderr, "Usage: %s <input file> [-qpu]\n", argv[0]);
129 |         return 1;
130 |     }
131 |     if (argc > 2 && (strcmp(argv[2], "-qpu") == 0))
132 |         run_cpu = false;
133 | 
134 |     /* Load the data to hash */
135 |     FILE *f_data = fopen(argv[1], "r");
136 |     if (!f_data) {
137 |         fprintf(stderr, "Unable to open file %s\n", argv[1]);
138 |         return 3;
139 |     }
140 | 
141 |     /* Load the QPU code */
142 |     int code_len = loadQPUCode(QPU_CODE_FILE, shader_code, MAX_CODE_SIZE);
143 |     if (code_len < 1) {
144 |         fprintf(stderr, "Unable to load QPU code from %s\n", QPU_CODE_FILE);
145 |         fclose(f_data);
146 |         return 2;
147 |     }
148 |     printf("Loaded %d bytes of QPU code.\n", code_len);
149 | 
150 |     int nblocks = 1;                    // 1 512-bit block for now
151 |     int stride = nblocks * 16;
152 |     uint32_t *buffer = new uint32_t[BUFFER_SIZE*stride];
153 |     uint32_t *H = new uint32_t[BUFFER_SIZE*8];
154 | 
155 |     uint32_t H0[] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f,
156 |                       0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
157 |     char filebuffer[64];            // 1 512-bit blocks
158 |     for (int i=0; i < BUFFER_SIZE; i++)
159 |     {
160 |         memcpy(H+i*8, H0, sizeof(H0));
161 |         memset(filebuffer, 0x0, sizeof(filebuffer));
162 |         // read a line (newline included); the 0x80 pad byte and the 8-byte length must still fit, so 55 bytes at most
163 |         char *p = fgets(filebuffer, sizeof(filebuffer)-1, f_data);
164 |         if (!p || strlen(filebuffer) > 55) {
165 |             fprintf(stderr, "%s\n", p ? "Input line does not fit in one 512-bit block." : "Failed to read enough lines from data file.");
166 |             fclose(f_data);
167 |             delete[] H;
168 |             delete[] buffer;
169 |             return 4;
170 |         }
171 |         int bytes = strlen(filebuffer);
172 |         filebuffer[bytes] = (char)0x80;     // SHA-256 padding
173 |         // last 8 bytes are the message length in bits, most significant byte first (shifts, so host byte order is irrelevant)
174 |         uint64_t bit_len = (uint64_t)bytes * 8;
175 |         for (int j=0; j < 8; j++)
176 |             filebuffer[56+j] = (char)(bit_len >> (8 * (7-j)));
177 | 
178 |         for (int j=0; j < 16; j++)
179 |             buffer[i*16+j] = ENDIAN(filebuffer, j);
180 |     }
181 |     fclose(f_data);                     // all input has been read
182 | 
183 |     int handle = SHA256SetupQPU(K, buffer, H, stride, shader_code, code_len);
184 |     if (handle < 0) {
185 |         fprintf(stderr, "Unable to setup QPU.  Check permissions\n");
186 |         delete[] buffer;
187 |         delete[] H;
188 |         return 4;
189 |     }
190 | 
191 |     /* SHA-256 calculation here */
192 |     for (int i=0; i < nblocks; i++)
193 |     {
194 |         printf("Running %s version ...\n", (run_cpu) ? "CPU" : "QPU");
195 |         if (run_cpu)
196 |             execute_sha256_cpu(buffer+i*16, H, stride);
197 |         else
198 |             execute_sha256_qpu(buffer+i*16, H, stride);
199 |     }
200 | 
201 |     if (!run_cpu)
202 |         SHA256FetchResult(H);
203 | 
204 |     // print out the H
205 |     for (int i=0; i < BUFFER_SIZE; i++) {
206 |         printf("%02d / SHA-256: ", i);
207 |         for (int j=0; j < 8; j++)
208 |             printf("%08x ", H[i*8+j]);
209 |         printf("\n");
210 |     }
211 | 
212 |     SHA256CleanupQPU(handle);
213 |     delete[] buffer;
214 |     delete[] H;
215 | }
216 | 


--------------------------------------------------------------------------------
/QPU/SHA-256/partial/test-data.bin:
--------------------------------------------------------------------------------
 1 | this is line number one
 2 | this is line number two
 3 | this is line number three
 4 | this is line number four
 5 | this is line number five
 6 | this is line number six
 7 | this is line number seven
 8 | this is line number eight
 9 | this is line number nine
10 | this is line number ten
11 | this is line number eleven
12 | this is line number twelve
13 | this is line number thirteen
14 | this is line number fourteen
15 | this is line number fifteen
16 | this is line number sixteen
17 | 


--------------------------------------------------------------------------------
/QPU/SHA-256/reference/Makefile:
--------------------------------------------------------------------------------
1 | sha256: sha256.cpp    # optimised build of the CPU-only reference
2 | 	g++ -O3 -o $@ $<
3 | 


--------------------------------------------------------------------------------
/QPU/SHA-256/reference/sha256.cpp:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <inttypes.h>
  3 | #include <sys/time.h>
  4 | #include <string.h>
  5 | #include <stdlib.h>
  6 | 
  7 | #define NUM_QPUS        1
  8 | #define BUFFER_SIZE     NUM_QPUS * 16
  9 | 
 10 | static uint32_t K[] = {         // the 64 SHA-256 round constants; K[i] is added into T1 in round i
 11 |     0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1,
 12 |     0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
 13 |     0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786,
 14 |     0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
 15 |     0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147,
 16 |     0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
 17 |     0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b,
 18 |     0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
 19 |     0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a,
 20 |     0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
 21 |     0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
 22 | };
 23 | 
 24 | static inline uint32_t CH(uint32_t x, uint32_t y, uint32_t z) {
 25 |     return z ^ (x & (y ^ z));       // "choose": each bit of x selects y (1) or z (0); equals (x & y) ^ (~x & z)
 26 | }
 27 | 
 28 | static inline uint32_t Maj(uint32_t x, uint32_t y, uint32_t z) {
 29 |     return (x & y) | (z & (x | y)); // bitwise majority vote; equals (x & y) ^ (x & z) ^ (y & z)
 30 | }
 31 | 
 32 | static inline uint32_t RotR(uint32_t x, uint8_t shift) {
 33 |     return (x >> (shift & 31)) | (x << ((32 - shift) & 31));    // rotate right; counts are masked so a shift of 0 or 32 never shifts by 32 (undefined)
 34 | }
 35 | 
 36 | static inline uint32_t sigma0(uint32_t x) {
 37 |     return RotR(x ^ RotR(x, 11) ^ RotR(x, 20), 2);      // = RotR(x,2) ^ RotR(x,13) ^ RotR(x,22): rotation distributes over xor
 38 | }
 39 | 
 40 | static inline uint32_t sigma1(uint32_t x) {
 41 |     return RotR(x ^ RotR(x, 5) ^ RotR(x, 19), 6);       // = RotR(x,6) ^ RotR(x,11) ^ RotR(x,25): rotation distributes over xor
 42 | }
 43 | 
 44 | static inline uint32_t smsigma0(uint32_t x) {
 45 |     return (x >> 3) ^ RotR(x ^ RotR(x, 11), 7);         // = RotR(x,7) ^ RotR(x,18) ^ (x >> 3)
 46 | }
 47 | 
 48 | static inline uint32_t smsigma1(uint32_t x) {
 49 |     return (x >> 10) ^ RotR(x ^ RotR(x, 2), 17);        // = RotR(x,17) ^ RotR(x,19) ^ (x >> 10)
 50 | }
 51 | // Big-endian 32-bit word i of byte buffer x; each byte goes through uint8_t so that chars >= 0x80 do not sign-extend.
 52 | #define ENDIAN(x, i)        (((uint32_t)(uint8_t)(x)[(i)*4] << 24) | ((uint32_t)(uint8_t)(x)[(i)*4+1] << 16) | ((uint32_t)(uint8_t)(x)[(i)*4+2] << 8) | ((uint32_t)(uint8_t)(x)[(i)*4+3]))
 53 | 
 54 | /*
 55 |  * data is an array of BUFFER_SIZE buffers to hash
 56 |  * H is an input/output parameter
 57 |  * stride is the stride for data (TODO: handle hashes of more than one block)
 58 |  */
 59 | void execute_sha256_cpu(uint32_t *data, uint32_t *H, int stride)
 60 | {
 61 |     // Reference SHA-256 compression.  For each of the BUFFER_SIZE messages:
 62 |     // expand the 16-word block at data[msg*stride] into the 64-word schedule
 63 |     // W, run all 64 rounds over the eight working variables, and add the
 64 |     // result into that message's state H[msg*8..].
 65 |     uint32_t W[64];
 66 |     uint32_t v[8];                      // working variables a..h in v[0]..v[7]
 67 | 
 68 |     for (int msg = 0; msg < BUFFER_SIZE; ++msg)
 69 |     {
 70 |         const uint32_t *block = data + msg*stride;
 71 |         uint32_t *digest = H + msg*8;
 72 | 
 73 |         // Message schedule: the block itself, then the SHA-256 recurrence.
 74 |         for (int t = 0; t < 64; ++t)
 75 |         {
 76 |             if (t < 16)
 77 |                 W[t] = block[t];
 78 |             else
 79 |                 W[t] = smsigma1(W[t-2]) + W[t-7] + smsigma0(W[t-15]) + W[t-16];
 80 |         }
 81 | 
 82 |         // Start from the current hash state.
 83 |         for (int j = 0; j < 8; ++j)
 84 |             v[j] = digest[j];
 85 | 
 86 |         // The 64 compression rounds.
 87 |         for (int t = 0; t < 64; ++t)
 88 |         {
 89 |             const uint32_t T1 = v[7] + sigma1(v[4]) + CH(v[4], v[5], v[6]) + K[t] + W[t];
 90 |             const uint32_t T2 = sigma0(v[0]) + Maj(v[0], v[1], v[2]);
 91 | 
 92 |             // Move every variable one slot along (h=g, g=f, ..., b=a).
 93 |             for (int j = 7; j > 0; --j)
 94 |                 v[j] = v[j-1];
 95 |             v[4] += T1;                 // e = d + T1
 96 |             v[0] = T1 + T2;             // a = T1 + T2
 97 |         }
 98 | 
 99 |         // Fold the working variables back into the hash state.
100 |         for (int j = 0; j < 8; ++j)
101 |             digest[j] += v[j];
102 |     }
103 | }
104 | 
105 | 
106 | int main(int argc, char **argv)
107 | {
108 |     // Hash the first BUFFER_SIZE lines of <input file> (one 512-bit block per line) on the CPU
109 |     // and print each resulting H.
110 |     // Exit status: 1 usage, 3 input not openable, 4 a line is missing or too long for one block.
111 |     if (argc < 2) {
112 |         fprintf(stderr, "Usage: %s <input file>\n", argv[0]);
113 |         return 1;
114 |     }
115 | 
116 |     /* Load the data to hash */
117 |     FILE *f_data = fopen(argv[1], "r");
118 |     if (!f_data) {
119 |         fprintf(stderr, "Unable to open file %s\n", argv[1]);
120 |         return 3;
121 |     }
122 | 
123 |     int nblocks = 1;                    // 1 512-bit block for now
124 |     int stride = nblocks * 16;
125 |     uint32_t *buffer = new uint32_t[BUFFER_SIZE*stride];
126 |     uint32_t *H = new uint32_t[BUFFER_SIZE*8];
127 | 
128 |     uint32_t H0[] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f,
129 |                       0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
130 |     char filebuffer[64];            // 1 512-bit blocks
131 |     for (int i=0; i < BUFFER_SIZE; i++)
132 |     {
133 |         memcpy(H+i*8, H0, sizeof(H0));
134 |         memset(filebuffer, 0x0, sizeof(filebuffer));
135 |         // read a line (newline included); the 0x80 pad byte and the 8-byte length must still fit, so 55 bytes at most
136 |         char *p = fgets(filebuffer, sizeof(filebuffer)-1, f_data);
137 |         if (!p || strlen(filebuffer) > 55) {
138 |             fprintf(stderr, "%s\n", p ? "Input line does not fit in one 512-bit block." : "Failed to read enough lines from data file.");
139 |             fclose(f_data);
140 |             delete[] H;
141 |             delete[] buffer;
142 |             return 4;
143 |         }
144 |         int bytes = strlen(filebuffer);
145 |         filebuffer[bytes] = (char)0x80;     // SHA-256 padding
146 | 
147 |         // last 8 bytes are the message length in bits, most significant byte first (shifts, so host byte order is irrelevant)
148 |         uint64_t bit_len = (uint64_t)bytes * 8;
149 |         for (int j=0; j < 8; j++)
150 |             filebuffer[56+j] = (char)(bit_len >> (8 * (7-j)));
151 | 
152 |         for (int j=0; j < 16; j++)
153 |             buffer[i*16+j] = ENDIAN(filebuffer, j);
154 |     }
155 |     fclose(f_data);                     // all input has been read
156 | 
157 |     /* SHA-256 calculation here */
158 |     for (int i=0; i < nblocks; i++)
159 |     {
160 |         execute_sha256_cpu(buffer+i*16, H, stride);
161 |     }
162 | 
163 |     for (int i=0; i < BUFFER_SIZE; i++) {
164 |         printf("%02d / SHA-256: ", i);
165 |         for (int j=0; j < 8; j++)
166 |             printf("%08x ", H[i*8+j]);
167 |         printf("\n");
168 |     }
169 | 
170 |     delete[] buffer;
171 |     delete[] H;
172 | }
173 | 


--------------------------------------------------------------------------------
/QPU/SHA-256/reference/test-data.bin:
--------------------------------------------------------------------------------
 1 | this is line number one
 2 | this is line number two
 3 | this is line number three
 4 | this is line number four
 5 | this is line number five
 6 | this is line number six
 7 | this is line number seven
 8 | this is line number eight
 9 | this is line number nine
10 | this is line number ten
11 | this is line number eleven
12 | this is line number twelve
13 | this is line number thirteen
14 | this is line number fourteen
15 | this is line number fifteen
16 | this is line number sixteen
17 | 


--------------------------------------------------------------------------------
/QPU/assembler/Makefile:
--------------------------------------------------------------------------------
1 | qpu-assembler: assemble.cpp    # debug (-g) build of the QPU assembler
2 | 	g++ -g -o $@ $<
3 | 


--------------------------------------------------------------------------------
/QPU/assembler/assemble.cpp:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <stdlib.h>
  3 | #include <stdio.h>
  4 | #include <inttypes.h>
  5 | #include <map>
  6 | #include <vector>
  7 | #include <unistd.h> // for getopt()
  8 | 
  9 | using namespace std;
 10 | 
 11 | enum token_t {      // token kinds returned by nextToken
 12 |     END=-1,         // nothing left on the line, or a '#' comment starts here
 13 |     WORD,           // a name, register or number; its text is returned alongside
 14 |     DOT,            // '.'
 15 |     COMMA,          // ','
 16 |     SEMI,           // ';'
 17 |     COLON,          // ':'
 18 | };
 19 | 
 20 | struct QPUreg {                         // one parsed instruction operand
 21 |     enum { A, B, ACCUM, SMALL } file;   // "raN", "rbN", accumulator "rN", or a small immediate
 22 |     int num;                            // register number, or the immediate's value when file is SMALL
 23 | };
 24 | 
 25 | struct relocation {     // presumably a label reference to patch once all labels are known - its use is outside this view
 26 |     string label;       // name of the referenced label
 27 |     int pc;             // presumably the position of the instruction that refers to it - confirm
 28 | };
 29 | 
 30 | struct context {                        // assembler state handed to the assemble* functions
 31 |     const char *stream;                 // unread remainder of the text being parsed; advanced as tokens are consumed
 32 |     map<string, int> labels;            // label name -> int (presumably the pc where it was defined - confirm)
 33 |     int pc;                             // presumably the current instruction position - confirm
 34 |     vector<relocation> relocations;     // label references collected so far
 35 | };
 36 | 
 37 | 
 38 | static string addOps[] = {      // add-pipe mnemonics; the array index is the opcode (see addOpCode); "XXX" fills unnamed slots
 39 |     "nop", "fadd", "fsub", "fmin", "fmax", "fminabs", "fmaxabs",
 40 |     "ftoi", "itof", "XXX", "XXX", "XXX", "add", "sub", "shr",
 41 |     "asr", "ror", "shl", "min", "max", "and", "or", "xor", "not",
 42 |     "clz", "XXX", "XXX", "XXX", "XXX", "XXX", "v8adds", "v8subs" };
 43 | 
 44 | static string mulOps[] = {      // mul-pipe mnemonics; the array index is the opcode (see mulOpCode)
 45 |     "nop", "fmul", "mul24", "v8muld", "v8min", "v8max", "v8adds",
 46 |     "v8subs" };
 47 | 
 48 | static uint8_t addOpCode(const string& word)
 49 | {
 50 |     // A mnemonic's position in addOps is its opcode; 0xFF means word is not an add-pipe mnemonic.
 51 |     uint8_t code = 0xFF;
 52 |     for (int idx = 31; idx >= 0; idx--)     // scanning downwards leaves the lowest matching index in code
 53 |         if (addOps[idx] == word)
 54 |             code = idx;
 55 |     return code;
 56 | }
 57 | 
 58 | static uint8_t mulOpCode(const string& word)
 59 | {
 60 |     // A mnemonic's position in mulOps (eight entries) is its opcode; 0xFF means word is not a mul-pipe mnemonic.
 61 |     int idx = 0;
 62 |     while (idx < 8 && mulOps[idx] != word)
 63 |         idx++;
 64 | 
 65 |     return (idx < 8) ? idx : 0xFF;
 66 | }
 67 | 
 68 | 
 69 | bool isRegisterWord(const string& word) { return word.compare(0, 1, "r") == 0; }   // true when word starts with 'r'
 70 | 
 71 | string printRegister(const QPUreg& reg)
 72 | {
 73 |     // Text form of an operand, for diagnostics: "raN" or "rbN" for the
 74 |     // register files, "rN" for an accumulator, and ".0xV." (hex) for
 75 |     // anything else, i.e. a small immediate.
 76 |     char text[32];
 77 | 
 78 |     switch (reg.file) {
 79 |         case QPUreg::A:     snprintf(text, sizeof(text), "r%c%d", 'a', reg.num);  break;
 80 |         case QPUreg::B:     snprintf(text, sizeof(text), "r%c%d", 'b', reg.num);  break;
 81 |         case QPUreg::ACCUM: snprintf(text, sizeof(text), "r%d", reg.num);         break;
 82 |         default:            snprintf(text, sizeof(text), ".0x%x.", reg.num);      break;
 83 |     }
 84 | 
 85 |     return text;
 86 | }
 87 | 
 88 | bool parseRegister(const string& word, QPUreg& reg)
 89 | {
 90 |     // "raN" selects regfile A, "rbN" regfile B, and any other word starting
 91 |     // with 'r' an accumulator; N is read with atoi from the rest of the word.
 92 |     // Returns false, leaving reg untouched, when word does not start with 'r'.
 93 |     if (word[0] != 'r')
 94 |         return false;
 95 | 
 96 |     const char bank = word[1];
 97 |     const bool in_regfile = (bank == 'a' || bank == 'b');
 98 |     reg.file = (bank == 'a') ? QPUreg::A : (bank == 'b') ? QPUreg::B : QPUreg::ACCUM;
 99 | 
100 |     // TODO: check that this is in range.  (ACCUM < 6, e.g.)
101 |     // The digits start after "ra"/"rb" (two characters) or after "r" (one).
102 |     reg.num = atoi(word.c_str() + (in_regfile ? 2 : 1));
103 | 
104 |     return true;
105 | }
106 | 
107 | uint32_t parseImmediate(const string& str)
108 | {
109 |     // An 'x' anywhere means hexadecimal.  Otherwise a '.' or an 'f' means a
110 |     // float, returned as its raw 32-bit pattern.  Anything else is read as
111 |     // decimal.
112 |     const char *text = str.c_str();
113 |     const bool is_hex = str.find('x') != string::npos;
114 | 
115 |     if (!is_hex && str.find_first_of(".f") != string::npos) {
116 |         float value = strtof(text, NULL);
117 |         return *(uint32_t*)&value;
118 |     }
119 |     return strtoul(text, NULL, is_hex ? 16 : 10);
120 | }
121 | 
122 | uint8_t parseBranchCond(const string& str)
123 | {
124 |     // Map a branch-condition name (for example the "ze" in "brr.ze") to its
125 |     // condition code.  The first letter names the flag (z, n or c); the
126 |     // second says how the flags are combined; "*" branches always.
127 |     // An unknown name is fatal: the message goes to stderr and the process
128 |     // exits with a failure status, so that make and scripts notice that the
129 |     // assembly did not succeed.
130 |     static const struct { const char *name; uint8_t code; } conds[] = {
131 |         { "zf", 0x0 },      // all z flags set ("z full")
132 |         { "ze", 0x1 },      // all z flags clear ("z empty")
133 |         { "zs", 0x2 },      // any z flags set ("z set")
134 |         { "zc", 0x3 },      // any z flags clear ("z clear")
135 |         { "nf", 0x4 },      // all N flags set ("N full")
136 |         { "ne", 0x5 },      // all N flags clear ("N empty")
137 |         { "ns", 0x6 },      // any N flags set ("N set")
138 |         { "nc", 0x7 },      // any N flags clear ("N clear")
139 |         { "cf", 0x8 },      // all C flags set ("C full")
140 |         { "ce", 0x9 },      // all C flags clear ("C empty")
141 |         { "cs", 0xa },      // any C flags set ("C set")
142 |         { "cc", 0xb },      // any C flags clear ("C clear")
143 |         { "*",  0xf },      // always
144 |     };
145 | 
146 |     for (size_t i = 0; i < sizeof(conds) / sizeof(conds[0]); i++) {
147 |         if (str == conds[i].name)
148 |             return conds[i].code;
149 |     }
150 | 
151 |     // no match: report and stop
152 |     cerr << "Invalid branch condition: " << str << endl;
153 |     exit(EXIT_FAILURE);
154 | }
155 | 
156 | uint8_t setALUMux(const QPUreg& reg)
157 | {
158 |     // Input-mux code for an operand: 6 = regfile A, 7 = regfile B (small immediates share 7), otherwise the accumulator's
159 |     // own number.  Only r0-r5 are accepted, since 6 and 7 already mean the register files; a bad number is fatal (failure exit).
160 |     if (reg.file == QPUreg::A)
161 |         return 0x6;
162 |     if (reg.file != QPUreg::ACCUM)          // B and SMALL
163 |         return 0x7;
164 |     if (reg.num > 5 || reg.num < 0) {
165 |         cerr << "Invalid accumulator register; out of range" << endl;
166 |         exit(EXIT_FAILURE);
167 |     }
168 |     return reg.num;
169 | }
170 | 
171 | 
172 | token_t nextToken(const char *stream, string& out, const char **ptr)
173 | {
174 |     // Read one token from stream.  For a WORD the text is stored in out; *ptr receives the first unread
175 |     // character (it stays at stream when END is returned because nothing is left).  '#' starts a comment and
176 |     // also yields END.  A word longer than the internal buffer is cut at 127 characters instead of overrunning it.
177 |     char buffer[128];
178 |     size_t i = 0;
179 | 
180 |     *ptr = stream;
181 |     if (!stream || !*stream)
182 |         return END;
183 | 
184 |     while (*stream == ' ' || *stream == '\t')
185 |         stream++;
186 | 
187 |     if (*stream == '\0')
188 |         return END;
189 | 
190 |     if (isdigit((unsigned char)*stream))     // ctype functions need an unsigned char value
191 |     {
192 |         // read until we don't find a hex digit, x (for hex) or .
193 |         while (isxdigit((unsigned char)*stream) || *stream == '.' || *stream == 'x') {
194 |             buffer[i++] = *stream++;
195 |             if (*stream == 0 || i >= sizeof(buffer) - 1)
196 |                 break;
197 |         }
198 |         buffer[i] = '\0';
199 |         out = buffer;
200 |         *ptr = stream;
201 |         return WORD;
202 |     }
203 |     if (*stream == '.') { *ptr = stream+1; return DOT; }
204 |     if (*stream == ',') { *ptr = stream+1; return COMMA; }
205 |     if (*stream == ';') { *ptr = stream+1; return SEMI; }
206 |     if (*stream == '#') { *ptr = stream+1; return END; }
207 |     if (*stream == ':') { *ptr = stream+1; return COLON; }
208 | 
209 |     // anything else is a word that runs up to the next punctuation or blank
210 |     while (*stream != '.' && *stream != ',' && *stream != ';'
211 |                           && *stream != ' ' && *stream != '\t'
212 |                           && *stream != ':')
213 |     {
214 |         buffer[i++] = *stream++;
215 |         if (*stream == 0 || i >= sizeof(buffer) - 1)     // always keep room for the terminator
216 |             break;
217 |     }
218 |     buffer[i] = '\0';
219 |     out = buffer;
220 |     *ptr = stream;
221 |     return WORD;
222 | }
223 | 
224 | 
225 | bool aluHelper(const char *stream, QPUreg& dest, QPUreg& r1, QPUreg& r2, uint8_t& sig, const char **ptr)
226 | {
227 |     // Parse "[.flag] dest, r1, r2" from stream.  dest and r1 must be registers; r2 may be a register or an immediate
228 |     // below 16.  A ".tmu" / ".tend" flag sets sig to 10 / 3; other flags leave sig alone.  On success *ptr is set to
229 |     // the first unread character and true is returned; on any malformed operand false is returned and *ptr is untouched.
230 |     string token_str;
231 |     token_t tok = nextToken(stream, token_str, &stream);
232 | 
233 |     if (tok == DOT) {
234 |         // conditional
235 |         nextToken(stream, token_str, &stream);
236 |         cout << "flag/conditional = " << token_str << endl;
237 |         if (token_str == "tmu") sig = 10;
238 |         else if (token_str == "tend") sig = 3;
239 |         tok = nextToken(stream, token_str, &stream);
240 |     }
241 | 
242 |     // this is supposed to be the destination register
243 |     if (tok != WORD) {
244 |         cout << "Expecting word.  Got: " << token_str << endl;
245 |         return false;
246 |     }
247 |     if (!parseRegister(token_str, dest)) {      // otherwise dest would be used without ever being set
248 |         cerr << "Expecting destination register.  Got: " << token_str << endl;
249 |         return false;
250 |     }
251 | 
252 |     // first source operand: a register
253 |     tok = nextToken(stream, token_str, &stream);
254 |     if (tok != COMMA) return false;
255 |     tok = nextToken(stream, token_str, &stream);
256 |     if (tok != WORD || !parseRegister(token_str, r1)) {
257 |         cerr << "Expecting source register.  Got: " << token_str << endl;
258 |         return false;
259 |     }
260 |     // second source operand: a register or a small immediate
261 |     tok = nextToken(stream, token_str, &stream);
262 |     if (tok != COMMA) return false;
263 |     tok = nextToken(stream, token_str, &stream);
264 |     if (tok != WORD) return false;              // operand missing
265 |     if (!parseRegister(token_str, r2)) {
266 |         r2.file = QPUreg::SMALL;
267 |         uint32_t imm = parseImmediate(token_str);
268 |         if (imm >= 16) {                        // imm is unsigned, so negative text lands here as well
269 |             cerr << "TODO: Unhandled small immediate" << endl;
270 |             return false;
271 |         }
272 |         r2.num = imm;
273 |     }
274 |     *ptr = stream;
275 |     return true;
276 | }
277 | 
278 | 
279 | uint64_t assembleALU(context& ctx, string word)
280 | {
281 |     string token_str;
282 |     uint8_t add_op = addOpCode(word);
283 |     if (add_op == 0xFF) {
284 |         cout << "FATAL (assert).  Bad opcode" << endl;
285 |         return -1;
286 |     }
287 | 
288 |     QPUreg addDest, addR1, addR2;
289 |     QPUreg mulDest, mulR1, mulR2;
290 | 
291 |     uint8_t sig = 0x1;          // no-signal (TODO: plumb signals through)
292 |     if (!aluHelper(ctx.stream, addDest, addR1, addR2, sig, &ctx.stream))
293 |         return -1;
294 | 
295 |     token_t tok = nextToken(ctx.stream, token_str, &ctx.stream);
296 |     // this should be a semi-colon
297 |     tok = nextToken(ctx.stream, token_str, &ctx.stream);
298 |     uint8_t mul_op = mulOpCode(token_str);
299 |     if (mul_op == 0xFF) {
300 |         cout << "FATAL (assert).  Bad opcode" << endl;
301 |         return -1;
302 |     }
303 | 
304 |     bool skipParseMul(false);
305 |     if (mul_op == 0) {
306 |         // nop.  If the next token is a semi or END, we'll generate
307 |         // the registers for them
308 |         const char *discard;
309 |         tok = nextToken(ctx.stream, token_str, &discard);
310 |         if (tok == END || tok == SEMI) {
311 |             mulDest.num = 39;
312 |             mulDest.file = (addDest.file == QPUreg::A) ? QPUreg::B : QPUreg::A;
313 |             mulR1 = addR1;
314 |             mulR2 = addR2;
315 |             skipParseMul = true;
316 |         }
317 |     }
318 | 
319 |     if (!skipParseMul) {
320 |         uint8_t junk;
321 |         if (!aluHelper(ctx.stream, mulDest, mulR1, mulR2, junk, &ctx.stream))
322 |             return -1;
323 |     }
324 | 
325 |     uint64_t ins = 0x0;
326 |     uint8_t cond_add = 0x1;
327 |     uint8_t cond_mul = 0x1;
328 |     uint8_t sf = 0x1;
329 |     if (add_op == 0)
330 |         sf = 0x0;           // no set flags on nop
331 | 
332 |     // TODO: constraints.  We can only read from file A and file B once (dual-port)
333 | 
334 |     uint8_t ws = 0x0;
335 |     // If the add pipe specifies file b for output, ws = 1
336 |     if (addDest.file == QPUreg::B)
337 |         ws = 0x1;
338 |     // if ws == 1, mul pipe must specify file a for output
339 |     if (ws == 0x1 && mulDest.file != QPUreg::A) {
340 |         cout << "constraint check failed.  mul pipe must specify register file A when write-swap set" << endl;
341 |         return -1;
342 |     }
343 |     // if ws == 0, mul pipe must specify file b for output
344 |     if (ws == 0x0 && mulDest.file != QPUreg::B) {
345 |         cout << "constraint check failed.  mul pipe must specify register file B when write-swap clear" << endl;
346 |         return -1;
347 |     }
348 | 
349 |     // TODO: handle the accumulators and the small immediate
350 |     uint8_t read_a = 0x0;
351 |     if (addR1.file == QPUreg::A) read_a = addR1.num;
352 |     else if (addR2.file == QPUreg::A) read_a = addR2.num;
353 |     else if (mulR1.file == QPUreg::A) read_a = mulR1.num;
354 |     else if (mulR2.file == QPUreg::A) read_a = mulR2.num;
355 | 
356 |     uint8_t read_b = 0x0;
357 |     if (addR1.file == QPUreg::B) read_b = addR1.num;
358 |     else if (addR2.file == QPUreg::B) read_b = addR2.num;
359 |     else if (mulR1.file == QPUreg::B) read_b = mulR1.num;
360 |     else if (mulR2.file == QPUreg::B) read_b = mulR2.num;
361 | 
362 |     // checks:
363 |     //   read_a not set and one of the muxes specifies file A ...
364 |     //   same for read_b
365 |     //   read_b set and there is a small immediate value
366 | 
367 |     // we could have immediates in the first register slot but not sure it makes sense
368 |     // As above, we should check that read_b is not already set
369 |     if (addR2.file == QPUreg::SMALL)    { read_b = addR2.num; sig = 13; }
370 |     if (mulR2.file == QPUreg::SMALL)    { read_b = mulR2.num; sig = 13; }
371 | 
372 |     uint8_t add_a = setALUMux(addR1) & 0x7;
373 |     uint8_t add_b = setALUMux(addR2) & 0x7;
374 |     uint8_t mul_a = setALUMux(mulR1) & 0x7;
375 |     uint8_t mul_b = setALUMux(mulR2) & 0x7;
376 |     read_a &= 0x3f;
377 |     read_b &= 0x3f;
378 |     mul_op &= 0x7;
379 |     add_op &= 0x1f;
380 |     addDest.num &= 0x3f;
381 |     mulDest.num &= 0x3f;
382 |     cond_add &= 0x7;
383 |     cond_mul &= 0x7;
384 |     sf &= 0x1;
385 |     ws &= 0x1;
386 | 
387 |     printf("Assembling ALU instruction: %s, %d, %d\n", printRegister(addDest).c_str(), ws, sig);
388 | 
389 |     ins = ((uint64_t)sig << 60) | ((uint64_t)cond_add << 49) | ((uint64_t)cond_mul << 46) | ((uint64_t)sf << 45) | ((uint64_t)ws << 44);
390 |     ins |= ((uint64_t)addDest.num << 38) | ((uint64_t)mulDest.num << 32) | ((uint64_t)mul_op << 29) | ((uint64_t)add_op << 24);
391 |     ins |= ((uint64_t)read_a << 18) | ((uint64_t)read_b << 12) | ((uint64_t)add_a << 9) | ((uint64_t)add_b << 6) | ((uint64_t)mul_a << 3) | mul_b;
392 | 
393 |     return ins;
394 | }
395 | 
396 | uint64_t assembleLDI(context& ctx, string word)
397 | {
398 |     cout << "Assembling LDI instruction ... " << endl;
399 | 
400 |     string token_str;
401 |     token_t tok = nextToken(ctx.stream, token_str, &ctx.stream);
402 | 
403 |     if (tok == DOT) {
404 |         // conditional ... conditionals should be on each register ?
405 |         cout << "conditional ... ";
406 |         // chew the conditional
407 |         nextToken(ctx.stream, token_str, &ctx.stream);
408 | 
409 |         tok = nextToken(ctx.stream, token_str, &ctx.stream);
410 |     }
411 | 
412 |     // this is supposed to be the register
413 |     if (tok != WORD) return -1;
414 | 
415 |     QPUreg register1, register2;
416 |     // check errors here
417 |     parseRegister(token_str, register1);
418 |     tok = nextToken(ctx.stream, token_str, &ctx.stream);
419 |     if (tok != COMMA) return -1;
420 |     tok = nextToken(ctx.stream, token_str, &ctx.stream);
421 | 
422 |     // this can either be another register
423 |     // (in which case we'll use both ALUs to set)
424 |     // or an immediate value (in which case we'll use rX39)
425 |     register2.num = 39;
426 |     register2.file = (register1.file == QPUreg::A) ? QPUreg::B : QPUreg::A;
427 |     if (isRegisterWord(token_str)) {
428 |         parseRegister(token_str, register2);
429 |         tok = nextToken(ctx.stream, token_str, &ctx.stream);
430 |         // check that this is a comma ...
431 |     }
432 | 
433 |     tok = nextToken(ctx.stream, token_str, &ctx.stream);
434 |     unsigned int immediate = parseImmediate(token_str);
435 | 
436 |     cout << "r1: " << printRegister(register1) << ", r2: "
437 |                    << printRegister(register2) << ", immed: 0x"
438 |                    << hex << immediate << dec << endl;
439 | 
440 |     while (nextToken(ctx.stream, token_str, &ctx.stream) != END)
441 |         ;
442 | 
443 |     uint32_t high = (uint32_t)0xE00 << 20;
444 |     high |= (uint32_t)0x1 << 17;      // cond_add
445 |     high |= (uint32_t)0x1 << 14;      // cond_mul
446 |     high |= (uint32_t)0x0 << 13;      // sf
447 |     high |= (uint32_t)0x0 << 12;      // ws
448 |     uint8_t addreg = (register1.file == QPUreg::A) ? register1.num : register2.num;
449 |     uint8_t mulreg = (register1.file == QPUreg::B) ? register1.num : register2.num;
450 |     high |= (uint32_t)addreg << 6;
451 |     high |= mulreg;
452 |     uint64_t ins = ((uint64_t)high << 32) | immediate;
453 | 
454 |     return ins;
455 | }
456 | 
457 | uint64_t assembleBRANCH(context& ctx, string word)
458 | {
459 |     cout << "Assembing BRANCH instruction" << endl;
460 | 
461 |     QPUreg dest;
462 |     string token_str;
463 |     token_t tok = nextToken(ctx.stream, token_str, &ctx.stream);
464 | 
465 |     // relative or absolute branch?
466 |     uint8_t relative = 1;
467 |     if (word == "bra")
468 |         relative = 0;
469 | 
470 |     uint8_t branchCondition = 0xf;          // by default: always (unconditional branch)
471 |     if (tok == DOT) {
472 |         // conditional
473 |         nextToken(ctx.stream, token_str, &ctx.stream);
474 |         branchCondition = parseBranchCond(token_str);
475 |         tok = nextToken(ctx.stream, token_str, &ctx.stream);
476 |     }
477 | 
478 |     // this is the destination register
479 |     if (tok != WORD) {
480 |         cerr << "branch expecting destination register." << endl;
481 |         return -1;
482 |     }
483 |     parseRegister(token_str, dest);
484 |     tok = nextToken(ctx.stream, token_str, &ctx.stream);
485 |     if (tok != COMMA) return false;
486 |     tok = nextToken(ctx.stream, token_str, &ctx.stream);
487 |     if (tok != WORD) {
488 |         cerr << "branch expecting label/target" << endl;
489 |         return -1;
490 |     }
491 | 
492 |     // look it up in the labels map
493 |     int target = 0xFFFFFFFF;
494 |     if (ctx.labels.count(token_str) < 1) {
495 |         relocation r;
496 |         r.label = token_str;
497 |         r.pc = ctx.pc;
498 |         ctx.relocations.push_back(r);
499 |     } else
500 |         target = ctx.labels[token_str];
501 |     int offset = target - (ctx.pc+4*8);
502 | 
503 |     uint8_t raddr_a = 0;           // raddr_a is only 5-bits?
504 |     uint8_t use_reg = 0;
505 |     // if there's a third argument, it is a register offset
506 |     const char *discard;
507 |     tok = nextToken(ctx.stream, token_str, &discard);
508 |     if (tok == COMMA) {
509 |         QPUreg offsetReg;
510 |         // chew the comma we just read
511 |         ctx.stream = discard;
512 |         tok = nextToken(ctx.stream, token_str, &ctx.stream);
513 |         parseRegister(token_str, offsetReg);
514 |         if (offsetReg.file != QPUreg::A) {
515 |             cerr << "branch target offset register must be file A" << endl;
516 |             return -1;
517 |         }
518 |         if (offsetReg.num > 31) {
519 |             cerr << "branch target offset register must be < 32" << endl;
520 |             return -1;
521 |         }
522 |         raddr_a = offsetReg.num;
523 |         use_reg = 1;
524 |     }
525 | 
526 |     uint8_t waddr_add = 39;         // link address appears at ALU outputs
527 |     uint8_t waddr_mul = 39;
528 |     if (dest.file == QPUreg::A) waddr_add = dest.num;
529 |     if (dest.file == QPUreg::B) waddr_mul = dest.num;
530 | 
531 |     uint64_t ins = (uint64_t)0xF << 60;
532 |     ins |= (uint64_t)branchCondition << 52;
533 |     ins |= (uint64_t)relative << 51;
534 |     ins |= (uint64_t)use_reg << 50;
535 |     ins |= (uint64_t)raddr_a << 45;
536 |     ins |= (uint64_t)0x0 << 44;                       // write-swap
537 |     ins |= (uint64_t)waddr_add << 38;
538 |     ins |= (uint64_t)waddr_mul << 32;
539 |     ins |= (uint32_t)offset;
540 | 
541 |     return ins;
542 | }
543 | 
544 | uint64_t assembleSEMA(context& ctx, string word)
545 | {
546 | 
547 |     uint64_t ins = (uint64_t)0x74 << 57;
548 | 
549 |     string token_str;
550 |     token_t tok = nextToken(ctx.stream, token_str, &ctx.stream);
551 |     if (tok != WORD) {
552 |         cerr << "semaphore instruction expecting down/up or acquire/release" << endl;
553 |         return -1;
554 |     }
555 | 
556 |     uint8_t sa = 0;             // up
557 |     if (token_str == "down" || token_str == "acquire")
558 |         sa = 1;
559 | 
560 |     tok = nextToken(ctx.stream, token_str, &ctx.stream);
561 |     if (tok != COMMA)   return -1;
562 |     tok = nextToken(ctx.stream, token_str, &ctx.stream);
563 |     uint32_t imm = parseImmediate(token_str);
564 |     if (imm > 15) {
565 |         cerr << "semaphore out of range" << endl;
566 |         return -1;
567 |     }
568 |     // cond_add, cond_mul = NEVER, ws, sf = false
569 |     ins |= (uint64_t)39 << 38;          // waddr_add
570 |     ins |= (uint64_t)39 << 32;          // waddr_mul
571 |     ins |= sa << 4;
572 |     ins |= (uint8_t)imm;
573 | 
574 |     cout << "Assembling SEMAPHORE instruction (" << imm << "), " << (int)sa << endl;
575 | 
576 |     return ins;
577 | }
578 | 
579 | 
580 | int main(int argc, char **argv)
581 | {
582 |     char *outfname = 0;
583 |     int c;
584 | 
585 |     while ((c = getopt(argc, argv, "o:")) != -1) {
586 |         switch (c) {
587 |             case 'o':
588 |                 outfname = optarg;
589 |                 break;
590 |         }
591 |     }
592 | 
593 |     if (!outfname) {
594 |         cerr << "Usage: " << argv[0] << " -o <output>" << endl;
595 |         return -1;
596 |     }
597 | 
598 |     FILE *outfile = fopen(outfname, "w");
599 |     if (!outfile)
600 |     {
601 |         cerr << "Unable to open output file output.bin" << endl;
602 |         return -1;
603 |     }
604 | 
605 |     char line[128];
606 |     string token_string;
607 | 
608 |     struct context ctx;
609 |     ctx.pc = 0;
610 | 
611 |     vector<uint64_t> instructions;
612 | 
613 |     int lineNo = 0;
614 |     while (cin.getline(line, 128))
615 |     {
616 |         lineNo++;
617 |         const char *p = line;
618 |         ctx.stream = p;
619 |         token_t tok = nextToken(ctx.stream, token_string, &ctx.stream);
620 | 
621 |         if (tok == END)
622 |             continue;
623 | 
624 |         if (tok == WORD)
625 |         {
626 |             // read-ahead to see if the next token is a colon in which case
627 |             // this is a label.
628 |             const char *discard = NULL;
629 |             string nextTokenStr;
630 |             if (nextToken(ctx.stream, nextTokenStr, &discard) == COLON) {
631 |                 ctx.labels[token_string] = ctx.pc;
632 |                 continue;
633 |             }
634 | 
635 |             enum { INVALID, ALU, BRANCH, LDI, SEMA } opType = INVALID;
636 |             if (addOpCode(token_string) != 0xFF || mulOpCode(token_string) != 0xFF)
637 |                 opType = ALU;
638 |             if (token_string == "ldi") opType = LDI;
639 |             if (token_string == "bra" || token_string == "brr") opType = BRANCH;
640 |             if (token_string == "sema") opType = SEMA;
641 | 
642 |             if (opType == INVALID) {
643 |                 cerr << "Unable to assemble line " << lineNo << " : " << line << endl;
644 |                 cerr << " ... invalid opcode" << endl;
645 |                 return -1;
646 |             }
647 | 
648 |             uint64_t ins = 0;
649 |             switch (opType) {
650 |                 case ALU: ins = assembleALU(ctx, token_string); break;
651 |                 case BRANCH: ins = assembleBRANCH(ctx, token_string); break;
652 |                 case LDI: ins = assembleLDI(ctx, token_string); break;
653 |                 case SEMA: ins = assembleSEMA(ctx, token_string); break;
654 |             }
655 | 
656 |             if (ins == (uint64_t)-1) {
657 |                 cerr << "Error on line " << lineNo << " : " << line << endl;
658 |                 return -1;
659 |             }
660 | 
661 |             instructions.push_back(ins);
662 |             ctx.pc += 8;            // bytes;
663 |         }
664 |     }
665 | 
666 |     // Process relocations
667 |     ctx.labels["ZERO"] = 0x0;
668 |     for (int i=0; i < ctx.relocations.size(); i++)
669 |     {
670 |         relocation& r = ctx.relocations[i];
671 |         if (ctx.labels.count(r.label) < 1)
672 |         {
673 |             cerr << "undefined label: " << r.label << endl;
674 |             return -1;
675 |         }
676 |         int offset = ctx.labels[r.label] - (r.pc + 4*8);
677 |         if (r.label == "ZERO")
678 |             offset = 0x0;
679 |         cout << "Processing relocation at " << r.pc << " : " << r.label
680 |                                             << " : " << offset << endl;
681 |         uint64_t ins = instructions[r.pc / 8];
682 |         ins &= (uint64_t)0xFFFFFFFF << 32;   // zero bottom 32-bits for new value
683 |         ins |= (uint32_t)offset;
684 |         instructions[r.pc / 8] = ins;
685 |     }
686 | 
687 |     for (int i=0; i < instructions.size(); i++)
688 |         fwrite(&instructions[i], sizeof(uint64_t), 1, outfile);
689 | 
690 |     fclose(outfile);
691 |     cout << "Done.  Num instructions: " << instructions.size() << ", "
692 |          << instructions.size() * 8 << " bytes." << endl;
693 | }
694 | 


--------------------------------------------------------------------------------
/QPU/helloworld/Makefile:
--------------------------------------------------------------------------------
# Mailbox helper shipped with the Raspberry Pi firmware examples; it is
# compiled straight into the driver rather than linked as a library.
MBOX_INC = -I/opt/vc/src/hello_pi/hello_fft
MBOX_C = /opt/vc/src/hello_pi/hello_fft/mailbox.c

# $@ is the target (helloworld), $< the first prerequisite (driver.c).
helloworld: driver.c
	g++ -g -O3 -o $@ $< $(MBOX_C) $(MBOX_INC)
6 | 


--------------------------------------------------------------------------------
/QPU/helloworld/driver.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <string.h>
  4 | #include <stddef.h>
  5 | #include <sys/time.h>
  6 | 
  7 | #include "mailbox.h"
  8 | 
  9 | #define GPU_MEM_FLG     0xC
 10 | #define GPU_MEM_MAP     0x0
 11 | #define NUM_QPUS        1
 12 | #define MAX_CODE_SIZE   8192
 13 | 
 14 | static unsigned int qpu_code[MAX_CODE_SIZE];
 15 | 
/*
 * Layout of the GPU memory block shared between the host and the QPUs.
 * main() derives the GPU-side address of each member with offsetof().
 */
struct memory_map {
    unsigned int code[MAX_CODE_SIZE];       // QPU program, copied in from qpu_code
    unsigned int uniforms[NUM_QPUS][2];     // 2 parameters per QPU
                                            // first address is the input value
                                            // for the program to add to
                                            // second is the address of the
                                            // result buffer
    unsigned int msg[NUM_QPUS][2];          // per-QPU pair filled in by main():
                                            // [0] address of that QPU's uniforms
                                            // [1] address of the code
    unsigned int results[NUM_QPUS][16];     // result buffer for the QPU to
                                            // write into
};
 27 | 
 28 | 
 29 | int loadShaderCode(const char *fname, unsigned int* buffer, int len)
 30 | {
 31 |     FILE *in = fopen(fname, "r");
 32 |     if (!in) {
 33 |         fprintf(stderr, "Failed to open %s.\n", fname);
 34 |         exit(0);
 35 |     }
 36 | 
 37 |     size_t items = fread(buffer, sizeof(unsigned int), len, in);
 38 |     fclose(in);
 39 | 
 40 |     return items;
 41 | }
 42 | 
 43 | 
 44 | int main(int argc, char **argv)
 45 | {
 46 |     if (argc < 3) {
 47 |         fprintf(stderr, "Usage: %s <code .bin> <val>\n", argv[0]);
 48 |         return 0;
 49 |     }
 50 |     int code_words = loadShaderCode(argv[1], qpu_code, MAX_CODE_SIZE);
 51 | 
 52 |     printf("Loaded %d bytes of code from %s ...\n", code_words * sizeof(unsigned), argv[1]);
 53 | 
 54 |     int mb = mbox_open();
 55 |     if (qpu_enable(mb, 1)) {
 56 |         fprintf(stderr, "QPU enable failed.\n");
 57 |         return -1;
 58 |     }
 59 |     printf("QPU enabled.\n");
 60 | 
 61 |     unsigned uniform_val = atoi(argv[2]);
 62 |     printf("Uniform value = %d\n", uniform_val);
 63 | 
 64 |     unsigned size = 1024 * 1024;
 65 |     unsigned handle = mem_alloc(mb, size, 4096, GPU_MEM_FLG);
 66 |     if (!handle) {
 67 |         fprintf(stderr, "Unable to allocate %d bytes of GPU memory", size);
 68 |         return -2;
 69 |     }
 70 |     unsigned ptr = mem_lock(mb, handle);
 71 |     void *arm_ptr = mapmem(ptr + GPU_MEM_MAP, size);
 72 |     // assert arm_ptr ...
 73 | 
 74 |     struct memory_map *arm_map = (struct memory_map *)arm_ptr;
 75 |     memset(arm_map, 0x0, sizeof(struct memory_map));
 76 |     unsigned vc_uniforms = ptr + offsetof(struct memory_map, uniforms);
 77 |     unsigned vc_code = ptr + offsetof(struct memory_map, code);
 78 |     unsigned vc_msg = ptr + offsetof(struct memory_map, msg);
 79 |     unsigned vc_results = ptr + offsetof(struct memory_map, results);
 80 |     memcpy(arm_map->code, qpu_code, code_words * sizeof(unsigned int));
 81 |     for (int i=0; i < NUM_QPUS; i++) {
 82 |         arm_map->uniforms[i][0] = uniform_val;
 83 |         arm_map->uniforms[i][1] = vc_results + i * sizeof(unsigned) * 16;
 84 |         arm_map->msg[i][0] = vc_uniforms + i * sizeof(unsigned) * 2;
 85 |         arm_map->msg[i][1] = vc_code;
 86 |     }
 87 | 
 88 |     unsigned ret = execute_qpu(mb, NUM_QPUS, vc_msg, 1, 10000);
 89 | 
 90 |     // check the results!
 91 |     for (int i=0; i < NUM_QPUS; i++) {
 92 |         for (int j=0; j < 16; j++) {
 93 |             printf("QPU %d, word %d: 0x%08x\n", i, j, arm_map->results[i][j]);
 94 |         }
 95 |     }
 96 | 
 97 |     printf("Cleaning up.\n");
 98 |     unmapmem(arm_ptr, size);
 99 |     mem_unlock(mb, handle);
100 |     mem_free(mb, handle);
101 |     qpu_enable(mb, 0);
102 |     printf("Done.\n");
103 | }
104 | 


--------------------------------------------------------------------------------
/QPU/helloworld/helloworld.asm:
--------------------------------------------------------------------------------
# helloworld: add a hard-coded constant to the first uniform and DMA the
# resulting vector back to the host.
#
# Each ALU line has the form  <add op> ; <mul op>.  A bare nop in the mul
# slot has its operands filled in by the assembler.

# Load the value we want to add to the input into a register
ldi ra1, 0x1234

# Configure the VPM for writing
ldi rb49, 0xa00

# Add the input value (first uniform - rb32) and the register with the hard-coded
# constant into the VPM.
add rb48, ra1, rb32;       nop

# Move 16 words (1 vector) back to the host (DMA)
ldi rb49, 0x88010000

# Initiate the DMA (the next uniform - ra32 - is the host address to write to)
or rb50, ra32, 0;          nop

# Wait for the DMA to complete
or rb39, rb50, ra39;       nop

# Trigger a host interrupt (writing rb38) to stop the program
or rb38, ra39, ra39;       nop

# End of program: the .tend flag on the first nop, then two more nops.
# NOTE(review): the trailing nops presumably fill the slots that still execute
# after the end signal - confirm against the QPU documentation.
nop.tend ra39, ra39, ra39;       nop rb39, rb39, rb39
nop ra39, ra39, ra39;       nop rb39, rb39, rb39
nop ra39, ra39, ra39;       nop rb39, rb39, rb39
25 | nop ra39, ra39, ra39;       nop rb39, rb39, rb39
26 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | rpi-playground
2 | ==============
3 | 
4 | Raspberry Pi Projects
5 | 


--------------------------------------------------------------------------------