├── LICENSE
├── README.md
├── emscripten_example
    ├── Makefile
    ├── README.md
    ├── add.cc
    └── benchmark.js
├── flops_example
    ├── Makefile
    ├── README.md
    ├── index.html
    ├── jit.cc
    └── main.js
├── matmul_example
    ├── Makefile
    ├── README.md
    ├── index.html
    ├── main.js
    ├── mm.cc
    └── mm.js
├── test.cc
├── thread_example
    ├── README.md
    ├── index.html
    ├── main.mjs
    ├── server.py
    ├── thread.cc
    └── worker.js
└── wasmblr.h


/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 Bram Wasti
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # wasmblr
  2 | A single header file WebAssembly assembler.
  3 | 
  4 | This library makes it easier to generate WebAssembly binaries directly from C++.
  5 | Useful for JIT compilation from within projects compiled with Emscripten.
  6 | For examples see below, or read the `test.cc` file.
  7 | 
  8 | Some benchmarks:
  9 | - Measure Bandwidth (cache not flushed): https://bwasti.github.io/wasmblr
 10 | - Measure Peak Theoretical FLOPs: https://bwasti.github.io/wasmblr/flops
 11 | - Measure Matrix Multiplication Performance (GFLOPs): https://bwasti.github.io/wasmblr/matmul/
 12 | 
 13 | Contributions welcome!
 14 | 
 15 | # Usage
 16 | 
 17 | `#include "wasmblr.h"` and compile with `-std=c++11` or higher.
 18 | 
 19 | In C++:
 20 | 
 21 | ```cpp
 22 | 
 23 | struct Code : wasmblr::CodeGenerator {
 24 |   Code() : wasmblr::CodeGenerator() {
 25 |     auto add_func = function({f32, f32}, {f32}, [&]() {
 26 |       local.get(0);
 27 |       local.get(1);
 28 |       f32.add();
 29 |     });
 30 |     export_(add_func, "add");
 31 |   }
 32 | };
 33 | 
 34 | 
 35 | Code c;
 36 | auto bytes = c.emit();
 37 | std::ofstream wasm("add.wasm", std::ios::binary);
 38 | wasm.write((char*)bytes.data(), bytes.size());
 39 | ```
 40 | 
 41 | If you'd prefer to avoid inheritance, you can use the code generator directly:
 42 | 
 43 | ```cpp
 44 | wasmblr::CodeGenerator cg;
 45 | auto add_func = cg.function({cg.f32, cg.f32}, {cg.f32}, [&]() {
 46 |   cg.local.get(0);
 47 |   cg.local.get(1);
 48 |   cg.f32.add();
 49 | });
 50 | cg.export_(add_func, "add");
 51 | 
 52 | auto bytes = cg.emit();
 53 | std::ofstream wasm("add.wasm", std::ios::binary);
 54 | wasm.write((char*)bytes.data(), bytes.size());
 55 | ```
 56 | 
 57 | And then, in JavaScript:
 58 | 
 59 | ```javascript
 60 | const wasm = fs.readFileSync('add.wasm'); // or however you'd like to load it
 61 | const m = new WebAssembly.Module(wasm);
 62 | const instance = new WebAssembly.Instance(m, {});
 63 | // use the function
 64 | console.log(instance.exports.add(8, 9));
 65 | ```
 66 | 
 67 | # Test
 68 | 
 69 | With `node.js` installed,
 70 | 
 71 | ```
 72 | g++ test.cc -std=c++11 -o test
 73 | ./test
 74 | ```
 75 | 
 76 | # Supported Features
 77 | 
 78 | The semantics of the assembler attempt to mimic the [WebAssembly standard](https://webassembly.github.io/spec/core/) closely.
 79 | In the case of reserved keywords in C++ (such as export, xor, etc.), the mnemonic has an underscore appended (e.g. `export_`, `i32.xor_`).
 80 | 
 81 | A couple of example uses follow:
 82 | 
 83 | ### Recursion
 84 | 
 85 | ```cpp
 86 | struct Code : wasmblr::CodeGenerator {
 87 |   // NB: Needs to be a class variable; the function body is evaluated later
 88 |   uint32_t factorial;
 89 |   Code() : wasmblr::CodeGenerator() {
 90 |     factorial = function({f32}, {f32}, [&]() {
 91 |       local.get(0);
 92 |       f32.const_(1.0f);
 93 |       f32.lt();
 94 |       // base case
 95 |       if_(f32);
 96 |       {
 97 |         f32.const_(1.0f);
 98 |       }
 99 |       else_();
100 |       {
101 |         local.get(0);
102 |         local.get(0);
103 |         f32.const_(1.0f);
104 |         f32.sub();
105 |         call(factorial);
106 |         f32.mul();
107 |       }
108 |       end();
109 |     });
110 |     export_(factorial, "factorial");
111 |   }
112 | };
113 | ```
114 | 
115 | ### Blocks
116 | 
117 | If-statements
118 | 
119 | ```cpp
120 | struct Code : wasmblr::CodeGenerator {
121 |   Code() : wasmblr::CodeGenerator() {
122 |     auto if_func = function({f32}, {f32}, [&]() {
123 |       f32.const_(0.0f);
124 |       local.get(0);
125 |       f32.gt();
126 |       if_(f32);
127 |       f32.const_(0.0f);
128 |       else_();
129 |       local.get(0);
130 |       end();
131 |     });
132 |     export_(if_func, "relu");
133 |   }
134 | };
135 | ```
136 | 
137 | Loops
138 | 
139 | ```cpp
140 | struct Code : wasmblr::CodeGenerator {
141 |   Code() : wasmblr::CodeGenerator() {
142 |     auto loop_fn = function({}, {i32}, [&]() {
143 |       auto i = local(i32);
144 | 
145 |       loop(void_);
146 |       {
147 |         local.get(i);
148 |         i32.const_(1);
149 |         i32.add();
150 |         local.set(i);
151 | 
152 |         local.get(i);
153 |         i32.const_(10);
154 |         i32.lt_s();
155 |         br_if(0);
156 |       }
157 |       end();
158 |       local.get(i);
159 |     });
160 |     export_(loop_fn, "loop");
161 |   }
162 | };
163 | ```
164 | 
165 | ### Memory
166 | 
167 | ```cpp
168 | struct Code : wasmblr::CodeGenerator {
169 |   Code() : wasmblr::CodeGenerator() {
170 |     memory(1, 10).export_("mem");
171 |     auto store = function({}, {}, [&]() {
172 |       i32.const_(0);     // index 0
173 |       i32.const_(1337);  // value 1337
174 |       i32.store(0, 0);   // align 0, offset 0
175 |     });
176 |     export_(store, "store");
177 |   }
178 | };
179 | ```
180 | 
181 | ### SIMD (32-bit lanes for now)
182 | 
183 | ```cpp
184 | struct Code : wasmblr::CodeGenerator {
185 |   Code() : wasmblr::CodeGenerator() {
186 |     memory(1, 10).export_("mem");
187 |     auto square = function({}, {}, [&]() {
188 |       auto vec = local(v128);
189 |       i32.const_(0);
190 |       v128.load();
191 |       local.set(vec);
192 | 
193 |       local.get(vec);
194 |       local.get(vec);
195 |       v128.f32x4_mul();
196 |       local.set(vec);
197 | 
198 |       i32.const_(0);
199 |       local.get(vec);
200 |       v128.store();
201 |     });
202 |     export_(square, "simd_square");
203 |   }
204 | };
205 | ```
206 | 
207 | 
208 | # TODO
209 | 
210 | Many things. I would appreciate any help filing issues for missing things!
211 | 


--------------------------------------------------------------------------------
/emscripten_example/Makefile:
--------------------------------------------------------------------------------
 1 | all:
 2 | 	emcc add.cc -I../ -s EXPORTED_FUNCTIONS="['_add', '_jit_add', '_jit_add_len', '_free']" -s EXTRA_EXPORTED_RUNTIME_METHODS="['cwrap', 'ccall']" -O3 -DSIMD=1 -msimd128 -s ASSERTIONS=1 -s SINGLE_FILE=1 -s MODULARIZE=1 -s 'EXPORT_NAME="createMyModule"' -o add.js
 3 | 
 4 | no_simd:
 5 | 	emcc add.cc -I../ -s EXPORTED_FUNCTIONS="['_add', '_jit_add', '_jit_add_len', '_free']" -s EXTRA_EXPORTED_RUNTIME_METHODS="['cwrap', 'ccall']" -O3 -s ASSERTIONS=1 -s SINGLE_FILE=1 -s MODULARIZE=1 -s 'EXPORT_NAME="createMyModule"' -o add.js
 6 | 
 7 | wasmblr_only:
 8 | 	emcc add.cc -I../ -s EXPORTED_FUNCTIONS="['_jit_add', '_jit_add_len']" -Os -s SINGLE_FILE=1 -s MODULARIZE=1 -s ENVIRONMENT='web' -s 'EXPORT_NAME="createMyModule"' -fno-rtti -fno-exceptions -o add.js
 9 | 
10 | benchmark:
11 | 	node benchmark.js
12 | 


--------------------------------------------------------------------------------
/emscripten_example/README.md:
--------------------------------------------------------------------------------
 1 | # Emscripten Integration Demo
 2 | 
 3 | A detailed writeup of the contents of this folder can be found here: https://jott.live/markdown/wasm_vector_addition
 4 | 
 5 | See `add.cc` for various implementations of vector addition and `Makefile` for the build command (I added `-O3 -msimd128` to make the benchmark more competitive).
 6 | To try this example, ensure that `emcc` is in your path.
 7 | 
 8 | ```
 9 | cd emscripten_example
10 | make
11 | node benchmark.js
12 | ```
13 | 
14 | If you change the value of `wasmblr_unroll` at the top of `benchmark.js`, different code will be generated.
15 | Amping it all the way up to 1024 shows some benefit over the default 16.
16 | 
17 | On my MacBook M1, these are the results I get in node.js (`wasmblr_unroll = 16`):
18 | 
19 | ![](https://i.imgur.com/SuInbUY.png)
20 | 


--------------------------------------------------------------------------------
/emscripten_example/add.cc:
--------------------------------------------------------------------------------
  1 | #include "wasmblr.h"
  2 | 
  3 | std::vector<uint8_t> gen_add_hardcode(int len, int unroll) {
  4 |   assert(len % (unroll * 4) == 0);
  5 |   wasmblr::CodeGenerator cg;
  6 |   // we hardcode the inputs to be
  7 |   // 0 * N * sizeof(float), 1 * N * sizeof(float)
  8 |   // and the output to be
  9 |   // 2 * N * sizeof(float)
 10 |   auto pages = (len * 3 * 4) / (1 << 16) + 1;
 11 |   cg.memory(pages).export_("mem");
 12 |   auto add_func = cg.function({cg.i32, cg.i32, cg.i32}, {}, [&]() {
 13 |     auto iter = cg.local(cg.i32);
 14 |     cg.i32.const_(0);
 15 |     cg.local.set(iter);
 16 | 
 17 |     cg.loop(cg.void_);
 18 | 
 19 |     for (auto i = 0; i < unroll; ++i) {
 20 |       cg.local.get(iter);
 21 | 
 22 |       cg.local.get(iter);
 23 |       cg.v128.load(0, i * 16);
 24 | 
 25 |       cg.local.get(iter);
 26 |       cg.v128.load(0, (len * 4) + i * 16);
 27 | 
 28 |       cg.v128.f32x4_add();
 29 | 
 30 |       cg.v128.store(0, (len * 8) + i * 16);
 31 |     }
 32 | 
 33 |     cg.local.get(iter);
 34 |     cg.i32.const_(unroll * 16);
 35 |     cg.i32.add();
 36 |     cg.local.set(iter);
 37 | 
 38 |     cg.i32.const_(len * 4);  // bytes
 39 |     cg.local.get(iter);
 40 |     cg.i32.ge_u();
 41 |     cg.br_if(0);
 42 | 
 43 |     cg.end();
 44 |   });
 45 |   cg.export_(add_func, "add");
 46 |   return cg.emit();
 47 | }
 48 | 
 49 | std::vector<uint8_t> gen_add_loop(int len) {
 50 |   assert(len % 4 == 0);
 51 |   wasmblr::CodeGenerator cg;
 52 |   auto pages = (len * 3 * 4) / (1 << 16) + 1;
 53 |   cg.memory(pages).export_("mem");
 54 |   auto add_func = cg.function({cg.i32, cg.i32, cg.i32}, {}, [&]() {
 55 |     auto iter = cg.local(cg.i32);
 56 |     cg.i32.const_(0);
 57 |     cg.local.set(iter);
 58 | 
 59 |     cg.loop(cg.void_);
 60 |     {
 61 |       cg.local.get(2);
 62 |       cg.local.get(iter);
 63 |       cg.i32.add();
 64 | 
 65 |       cg.local.get(0);
 66 |       cg.local.get(iter);
 67 |       cg.i32.add();
 68 |       cg.v128.load();
 69 | 
 70 |       cg.local.get(1);
 71 |       cg.local.get(iter);
 72 |       cg.i32.add();
 73 |       cg.v128.load();
 74 | 
 75 |       cg.v128.f32x4_add();
 76 | 
 77 |       cg.v128.store();
 78 | 
 79 |       cg.i32.const_(4 * 4);  // vec of 4 floats
 80 |       cg.local.get(iter);
 81 |       cg.i32.add();
 82 |       cg.local.set(iter);
 83 | 
 84 |       cg.i32.const_(len * 4);  // bytes
 85 |       cg.local.get(iter);
 86 |       cg.i32.ge_u();
 87 |       cg.br_if(0);
 88 |     }
 89 |     cg.end();
 90 |   });
 91 |   cg.export_(add_func, "add");
 92 |   return cg.emit();
 93 | }
 94 | 
 95 | std::vector<uint8_t> gen_add_unroll(int len) {
 96 |   assert(len % 4 == 0);
 97 |   wasmblr::CodeGenerator cg;
 98 |   auto pages = (len * 3 * 4) / (1 << 16) + 1;
 99 |   cg.memory(pages).export_("mem");
100 |   auto add_func = cg.function({cg.i32, cg.i32, cg.i32}, {}, [&]() {
101 |     // no loop at all
102 |     for (auto i = 0; i < len / 4; ++i) {
103 |       cg.local.get(2);
104 | 
105 |       cg.local.get(0);
106 |       cg.v128.load(0, i * 16);
107 | 
108 |       cg.local.get(1);
109 |       cg.v128.load(0, i * 16);
110 | 
111 |       cg.v128.f32x4_add();
112 | 
113 |       cg.v128.store(0, i * 16);
114 |     }
115 |   });
116 |   cg.export_(add_func, "add");
117 |   return cg.emit();
118 | }
119 | 
120 | std::vector<uint8_t> gen_add_mix_no_simd(int len, int unroll) {
121 |   if (len < unroll) {
122 |     unroll = len;
123 |   }
124 |   assert(len % (unroll) == 0);
125 |   wasmblr::CodeGenerator cg;
126 |   auto pages = (len * 3 * 4) / (1 << 16) + 1;
127 |   cg.memory(pages).export_("mem");
128 |   auto add_func = cg.function({cg.i32, cg.i32, cg.i32}, {}, [&]() {
129 |     auto iter = cg.local(cg.i32);
130 |     cg.i32.const_(0);
131 |     cg.local.set(iter);
132 | 
133 |     cg.loop(cg.void_);
134 | 
135 |     for (auto i = 0; i < unroll; ++i) {
136 |       cg.local.get(2);
137 | 
138 |       cg.local.get(0);
139 |       cg.f32.load(0, i * 4);
140 | 
141 |       cg.local.get(1);
142 |       cg.f32.load(0, i * 4);
143 | 
144 |       cg.f32.add();
145 | 
146 |       cg.f32.store(0, i * 4);
147 |     }
148 | 
149 |     cg.local.get(0);
150 |     cg.i32.const_(unroll * 4);
151 |     cg.i32.add();
152 |     cg.local.set(0);
153 | 
154 |     cg.local.get(1);
155 |     cg.i32.const_(unroll * 4);
156 |     cg.i32.add();
157 |     cg.local.set(1);
158 | 
159 |     cg.local.get(2);
160 |     cg.i32.const_(unroll * 4);
161 |     cg.i32.add();
162 |     cg.local.set(2);
163 | 
164 |     cg.local.get(iter);
165 |     cg.i32.const_(unroll * 4);
166 |     cg.i32.add();
167 |     cg.local.set(iter);
168 | 
169 |     cg.i32.const_(len * 4);  // bytes
170 |     cg.local.get(iter);
171 |     cg.i32.ge_s();
172 |     cg.br_if(0);
173 | 
174 |     cg.end();
175 |   });
176 |   cg.export_(add_func, "add");
177 |   return cg.emit();
178 | }
179 | 
180 | std::vector<uint8_t> gen_add_mix(int len, int unroll) {
181 |   assert(len % (unroll * 4) == 0);
182 |   wasmblr::CodeGenerator cg;
183 |   auto pages = (len * 3 * 4) / (1 << 16) + 1;
184 |   cg.memory(pages).export_("mem");
185 |   auto add_func = cg.function({cg.i32, cg.i32, cg.i32}, {}, [&]() {
186 |     auto iter = cg.local(cg.i32);
187 |     cg.i32.const_(0);
188 |     cg.local.set(iter);
189 | 
190 |     cg.loop(cg.void_);
191 | 
192 |     for (auto i = 0; i < unroll; ++i) {
193 |       cg.local.get(2);
194 | 
195 |       cg.local.get(0);
196 |       cg.v128.load(0, i * 16);
197 | 
198 |       cg.local.get(1);
199 |       cg.v128.load(0, i * 16);
200 | 
201 |       cg.v128.f32x4_add();
202 | 
203 |       cg.v128.store(0, i * 16);
204 |     }
205 | 
206 |     cg.local.get(0);
207 |     cg.i32.const_(unroll * 16);
208 |     cg.i32.add();
209 |     cg.local.set(0);
210 | 
211 |     cg.local.get(1);
212 |     cg.i32.const_(unroll * 16);
213 |     cg.i32.add();
214 |     cg.local.set(1);
215 | 
216 |     cg.local.get(2);
217 |     cg.i32.const_(unroll * 16);
218 |     cg.i32.add();
219 |     cg.local.set(2);
220 | 
221 |     cg.local.get(iter);
222 |     cg.i32.const_(unroll * 16);
223 |     cg.i32.add();
224 |     cg.local.set(iter);
225 | 
226 |     cg.i32.const_(len * 4);  // bytes
227 |     cg.local.get(iter);
228 |     cg.i32.ge_s();
229 |     cg.br_if(0);
230 | 
231 |     cg.end();
232 |   });
233 |   cg.export_(add_func, "add");
234 |   return cg.emit();
235 | }
236 | 
237 | std::vector<uint8_t> gen_add(int len, int unroll, bool simd) {
238 |   if (!simd) {
239 |     return gen_add_mix_no_simd(len, unroll);
240 |   }
241 |   if (unroll * 4 >= len) {
242 |     return gen_add_unroll(len);
243 |   } else if (unroll <= 1) {
244 |     return gen_add_loop(len);
245 |   }
246 |   return gen_add_mix(len, unroll);
247 | }
248 | 
249 | extern "C" {
250 | 
// Reference implementation compiled directly by the C++ toolchain:
// writes a[i] + b[i] into c[i] for the first `len` elements.
void add(const float* a, const float* b, float* c, int len) {
  int idx = 0;
  while (idx < len) {
    c[idx] = a[idx] + b[idx];
    ++idx;
  }
}
256 | 
// Build-time switch: true when the SIMD macro is defined for this translation
// unit, false otherwise. Passed to gen_add by the exported entry points.
#if defined(SIMD)
static bool simd = true;
#else
static bool simd = false;
#endif
262 | 
263 | uint8_t* jit_add(int len, int unroll) {
264 |   auto bytes = gen_add(len, unroll, simd);
265 |   uint8_t* out = (uint8_t*)malloc(bytes.size());
266 |   memcpy(out, bytes.data(), bytes.size());
267 |   return out;
268 | }
269 | 
270 | int jit_add_len(int len, int unroll) {
271 |   auto bytes = gen_add(len, unroll, simd);
272 |   return bytes.size();
273 | }
274 | }
275 | 


--------------------------------------------------------------------------------
/emscripten_example/benchmark.js:
--------------------------------------------------------------------------------
  1 | const em = require('./add.js');
  2 | var Module;
  3 | const wasmblr_unroll = 16;
  4 | const warmup = 100;
  5 | const target_ms = 1000;
  6 | 
  7 | async function gen_pure(N) {
  8 |   let a = new Array(N).fill(0);
  9 |   let b = new Array(N).fill(0);
 10 |   let c = new Array(N).fill(0);
 11 | 
 12 |   function add() {
 13 |     for (let i = 0; i < N; ++i) {
 14 |       c[i] = a[i] + b[i];
 15 |     }
 16 |   }
 17 | 
 18 |   return [add, a, b, c];
 19 | }
 20 | 
 21 | async function gen_typed(N) {
 22 |   let a = new Float32Array(N);
 23 |   let b = new Float32Array(N);
 24 |   let c = new Float32Array(N);
 25 | 
 26 |   function add() {
 27 |     for (let i = 0; i < N; ++i) {
 28 |       c[i] = a[i] + b[i];
 29 |     }
 30 |   }
 31 | 
 32 |   return [add, a, b, c];
 33 | }
 34 | 
 35 | async function gen_emscripten(N) {
 36 |   function emscripten_array(len) {
 37 |     var ptr = Module._malloc(len * 4);
 38 |     return [new Float32Array(Module.HEAPF32.buffer, ptr, len), ptr];
 39 |   }
 40 | 
 41 |   let [a, a_] = emscripten_array(N);
 42 |   let [b, b_] = emscripten_array(N);
 43 |   let [c, c_] = emscripten_array(N);
 44 |   const add = Module._add;
 45 | 
 46 |   return [() => add(a_, b_, c_, N), a, b, c, () => {
 47 |     Module._free(a_);
 48 |     Module._free(b_);
 49 |     Module._free(c_);
 50 |   }];
 51 | }
 52 | 
 53 | async function gen_wasmblr(N, unroll) {
 54 |   const wasm = Module._jit_add(N, unroll);
 55 |   const wasm_len = Module._jit_add_len(N, unroll);
 56 |   const wasm_data = new Uint8Array(Module.HEAP8.buffer, wasm, wasm_len);
 57 |   const m = await WebAssembly.compile(wasm_data);
 58 |   const instance = await WebAssembly.instantiate(m, {});
 59 | 
 60 |   let wasmblr_malloc_height = 0;
 61 |   let mem = instance.exports.mem;
 62 | 
 63 |   function wasmblr_array(len) {
 64 |     console.assert((mem.buffer.byteLength - wasmblr_malloc_height) > len * 4);
 65 |     let ptr = wasmblr_malloc_height;
 66 |     console.assert(([0, N * 4, N * 8]).indexOf(ptr) > -1, "allocated invalid ptr")
 67 |     let array = new Float32Array(mem.buffer, ptr, len);
 68 |     wasmblr_malloc_height += len * 4;
 69 |     return [array, ptr];
 70 |   }
 71 |   let [a, a_] = wasmblr_array(N);
 72 |   let [b, b_] = wasmblr_array(N);
 73 |   let [c, c_] = wasmblr_array(N);
 74 | 
 75 |   const add = instance.exports.add;
 76 | 
 77 |   return [() => add(a_, b_, c_), a, b, c];
 78 | }
 79 | 
 80 | async function gen_wasmblr_tuned(N) {
 81 |   let best = 0;
 82 |   let best_time = 1e9;
 83 |   for (let i = 0; Math.pow(2, i) < Math.min(1024, N / 4 + 2); ++i) {
 84 |     let [fn, w_a, w_b, w_c] = await gen_wasmblr(N, Math.pow(2, i));
 85 |     for (let _ = 0; _ < 100; ++_) {
 86 |       fn();
 87 |     }
 88 |     const t = performance.now();
 89 |     for (let _ = 0; _ < 1000; ++_) {
 90 |       fn();
 91 |     }
 92 |     const diff = performance.now() - t;
 93 |     if (diff < best_time) {
 94 |       best = i;
 95 |       best_time = diff;
 96 |     }
 97 |   }
 98 |   return [...await gen_wasmblr(N, Math.pow(2, best)), Math.pow(2, best)];
 99 | }
100 | 
// Benchmarks fn and prints "<name> <iters/sec> iters/sec (<GB/s> GB/s)".
// A warm-up of `warmup` calls is timed first; its duration is used to pick an
// iteration count aimed at `target_ms` of work, clamped to [1, 1e6].
// Bandwidth assumes N 4-byte elements with two reads and one write each.
async function perf(N, name, fn) {
  const round = (num) => Math.round(num * 100) / 100;

  const warmup_start = performance.now();
  for (let call = 0; call < warmup; ++call) {
    fn();
  }
  const warmup_ms = performance.now() - warmup_start;

  const wanted = warmup * target_ms / warmup_ms;
  const iters = Math.min(Math.max(wanted, 1), 1e6);

  const run_start = performance.now();
  for (let call = 0; call < iters; ++call) {
    fn();
  }
  const run_ms = performance.now() - run_start;

  const iters_sec = 1e3 * iters / run_ms;
  const elem_sec = N * iters_sec;
  const gb_sec = elem_sec * 4 * 3 /* 2 read 1 write */ / 1e9;
  console.log(name, round(iters_sec), "iters/sec", `(${round(gb_sec)} GB/s)`);
}
119 | 
// Builds every implementation of vector addition for size N, fills all inputs
// with the same random data, checks each output against the typed-array
// result (tolerance 0.01), and then benchmarks each implementation with
// perf(). On the first mismatch it logs the difference and stops without
// benchmarking. The Emscripten heap allocations are always released.
async function benchmark(N) {
  const [pure_fn, p_a, p_b, p_c] = await gen_pure(N);
  const [typed_fn, t_a, t_b, t_c] = await gen_typed(N);
  const [emscripten_fn, e_a, e_b, e_c, emscripten_cleanup] = await gen_emscripten(N);

  // try/finally: previously the early return on a mismatch (and any thrown
  // error) skipped emscripten_cleanup() and leaked the three heap arrays.
  try {
    const [wasmblr_fn, w_a, w_b, w_c] = await gen_wasmblr(N, wasmblr_unroll);
    const [wasmblr_tuned_fn, wt_a, wt_b, wt_c, unroll] = await gen_wasmblr_tuned(N);

    for (let i = 0; i < N; ++i) {
      const a = Math.random();
      const b = Math.random();
      for (const arr of [p_a, t_a, e_a, w_a, wt_a]) {
        arr[i] = a;
      }
      for (const arr of [p_b, t_b, e_b, w_b, wt_b]) {
        arr[i] = b;
      }
    }

    pure_fn();
    typed_fn();
    emscripten_fn();
    wasmblr_fn();
    wasmblr_tuned_fn();

    // The typed-array output t_c is the reference for all other outputs.
    const outputs = [
      [p_c, "pure"],
      [e_c, "emscripten"],
      [w_c, "wasmblr"],
      [wt_c, "wasmblr (tuned)"],
    ];
    for (let i = 0; i < N; ++i) {
      for (const [arr, name] of outputs) {
        if (Math.abs(t_c[i] - arr[i]) > 0.01) {
          console.log("difference found at index", i, t_c[i], "vs", name, arr[i]);
          return;
        }
      }
    }

    console.log("benchmarking vec add of size", N);
    await perf(N, "  pure javascript:        ", pure_fn);
    await perf(N, "  typed arrays:           ", typed_fn);
    await perf(N, "  emscripten:             ", emscripten_fn);
    await perf(N, "  wasmblr:                ", wasmblr_fn);
    await perf(N, `  wasmblr (tuned ${unroll}):`.padEnd(26), wasmblr_tuned_fn);
  } finally {
    emscripten_cleanup();
  }
}
180 | 
// Entry point: wait for the Emscripten module factory, publish the module in
// the file-level `Module` variable, then benchmark each size in sequence.
em().then(async (m) => {
  Module = m;
  // any larger and you'll need to recompile to give emscripten more memory
  const sizes = [4, 64, 1024, 16 * 1024, 256 * 1024];
  for (const size of sizes) {
    await benchmark(size);
  }
});
190 | 


--------------------------------------------------------------------------------
/flops_example/Makefile:
--------------------------------------------------------------------------------
1 | all:
2 | 	emcc jit.cc -I../ -s EXPORTED_FUNCTIONS="['_jit_mac', '_jit_mac_len', '_free']" -O3 -s SINGLE_FILE=1 -s MODULARIZE=1 -s 'EXPORT_NAME="createMyModule"' -o jit.js
3 | 
4 | 


--------------------------------------------------------------------------------
/flops_example/README.md:
--------------------------------------------------------------------------------
 1 | # Demo of dynamically determining peak FLOPs
 2 | 
 3 | This file sweeps through a range of arithmetic intensities
 4 | to help determine the best configurations for running MAC-based
 5 | operations such as matrix multiplication or convolution.
 6 | 
 7 | To run it in the browser right now: https://bwasti.github.io/wasmblr/flops/
 8 | 
 9 | ## Build the jit.js file
10 | 
11 | ```
12 | make
13 | ```
14 | 
15 | ## Use the jit.js file
16 | 
17 | The benchmark code uses the generated `jit.js` file.
18 | An `index.html` file is provided to run the benchmark 
19 | in the browser.
20 | 
21 | ```
22 | python3 -m http.server
23 | ```
24 | 


--------------------------------------------------------------------------------
/flops_example/index.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 |   <head>
 4 |     <meta name="viewport" content="width=device-width, user-scalable=no">
 5 |   </head>
 6 |   <body>
 7 |     <script src="jit.js"></script>
 8 |     <script src="main.js"></script>
 9 |     <script>
10 |     function run() {
11 |       launch_mac_benchmark();
12 |     }
13 |     </script>
14 |     <code>
15 |     This page helps determine peak floating point operations per second
16 |     via chained multiply-adds.
17 |     <a href="https://github.com/bwasti/wasmblr/tree/main/flops_example">[source code]</a>
18 |     </code>
19 |     <button onclick="run();">run benchmark</button>
20 |     <br>
21 |     <b><code id="best"></code></b>
22 |     <code id="output"></code>
23 |   </body>
24 | </html>
25 | 


--------------------------------------------------------------------------------
/flops_example/jit.cc:
--------------------------------------------------------------------------------
  1 | #include "wasmblr.h"
  2 | 
  3 | std::vector<uint8_t> gen(int32_t mac_per_load, int32_t load_per_loop, int32_t loops, bool simd) {
  4 |   wasmblr::CodeGenerator cg;
  5 |   // load per loop * 4 * (simd ? 4 : 1)
  6 |   int32_t mem_per_elem = load_per_loop * 4 * (simd ? 4 : 1);
  7 |   int32_t bytes = 3 * mem_per_elem;
  8 |   int32_t pages = bytes / (1 << 16) + 1;
  9 |   cg.memory(pages).export_("mem");
 10 | 
 11 |   int32_t a_offset = 0 * mem_per_elem;
 12 |   int32_t b_offset = 1 * mem_per_elem;
 13 |   int32_t c_offset = 2 * mem_per_elem;
 14 | 
 15 |   auto func = cg.function({}, {}, [&]() {
 16 |     auto gen_local = [&]() {
 17 |       if (simd) {
 18 |         return cg.local(cg.v128);
 19 |       }
 20 |       return cg.local(cg.f32);
 21 |     };
 22 | 
 23 |     std::vector<int32_t> a_locals;
 24 |     std::vector<int32_t> b_locals;
 25 |     std::vector<int32_t> c_locals;
 26 |     for (auto i = 0; i < load_per_loop; ++i) {
 27 |       a_locals.emplace_back(gen_local());
 28 |       b_locals.emplace_back(gen_local());
 29 |       c_locals.emplace_back(gen_local());
 30 |     }
 31 | 
 32 |     auto iter = -1;
 33 |     if (loops > 1) {
 34 |       iter = cg.local(cg.i32);
 35 |       cg.i32.const_(0);
 36 |       cg.local.set(iter);
 37 | 
 38 |       cg.loop(cg.void_);
 39 |     }
 40 | 
 41 |     auto load_local = [&](int local, int32_t off, int i) {
 42 |       cg.i32.const_(0);
 43 |       if (simd) {
 44 |         cg.v128.load(1, off + i * 16);
 45 |       } else {
 46 |         cg.f32.load(1, off + i * 4);
 47 |       }
 48 |       cg.local.set(local);
 49 |     };
 50 | 
 51 |     auto store_local = [&](int local, int32_t off, int i) {
 52 |       if (simd) {
 53 |         cg.v128.store(1, off + i * 16);
 54 |       } else {
 55 |         cg.f32.store(1, off + i * 4);
 56 |       }
 57 |     };
 58 | 
 59 |     for (auto i = 0; i < load_per_loop; ++i) {
 60 |       load_local(a_locals.at(i), a_offset, i);
 61 |       load_local(b_locals.at(i), b_offset, i);
 62 |       load_local(c_locals.at(i), c_offset, i);
 63 | 
 64 |       for (auto m = 0; m < mac_per_load; ++m) {
 65 |         cg.local.get(c_locals.at(i));
 66 |         cg.local.get(a_locals.at(i));
 67 |         cg.local.get(b_locals.at(i));
 68 |         if (simd) {
 69 |           cg.v128.f32x4_mul();
 70 |           cg.v128.f32x4_add();
 71 |         } else {
 72 |           cg.f32.mul();
 73 |           cg.f32.add();
 74 |         }
 75 |         cg.local.set(c_locals.at(i));
 76 |       }
 77 | 
 78 |       cg.i32.const_(0);
 79 |       cg.local.get(c_locals.at(i));
 80 |       store_local(c_locals.at(i), c_offset, i);
 81 |     }
 82 | 
 83 |     if (loops > 1) {
 84 |       cg.local.get(iter);
 85 |       cg.i32.const_(1);
 86 |       cg.i32.add();
 87 |       cg.local.set(iter);
 88 | 
 89 |       cg.i32.const_(loops);
 90 |       cg.local.get(iter);
 91 |       cg.i32.ge_s();
 92 |       cg.br_if(0);
 93 | 
 94 |       cg.end();
 95 |     }
 96 | 
 97 |   });
 98 |   cg.export_(func, "mac");
 99 |   return cg.emit();
100 | }
101 | 
102 | extern "C" {
103 | 
104 | uint8_t* jit_mac(int32_t mac_per_load, int32_t load_per_loop, int32_t loops, bool simd) {
105 |   auto bytes = gen(mac_per_load, load_per_loop, loops, simd);
106 |   uint8_t* out = (uint8_t*)malloc(bytes.size());
107 |   memcpy(out, bytes.data(), bytes.size());
108 |   return out;
109 | }
110 | 
111 | size_t jit_mac_len(int32_t mac_per_load, int32_t load_per_loop, int32_t loops, bool simd) {
112 |   auto bytes = gen(mac_per_load, load_per_loop, loops, simd);
113 |   return bytes.size();
114 | }
115 | 
116 | }
117 | 
118 | 


--------------------------------------------------------------------------------
/flops_example/main.js:
--------------------------------------------------------------------------------
  1 | async function jit(Module, mac_per_load, loads_per_loop, loops, simd) {
  2 |   const wasm = Module._jit_mac(mac_per_load, loads_per_loop, loops, simd);
  3 |   const wasm_len = Module._jit_mac_len(mac_per_load, loads_per_loop, loops, simd);
  4 |   const wasm_data = new Uint8Array(Module.HEAP8.buffer, wasm, wasm_len);
  5 |   const m = await WebAssembly.compile(wasm_data);
  6 |   const instance = await WebAssembly.instantiate(m, {});
  7 |   Module._free(wasm);
  8 |   const mem = instance.exports.mem;
  9 |   const elems = loads_per_loop * (simd ? 4 : 1);
 10 |   let a = new Float32Array(mem.buffer, 0, elems);
 11 |   let b = new Float32Array(mem.buffer, elems * 4, elems);
 12 |   let c = new Float32Array(mem.buffer, elems * 8, elems);
 13 |   return [instance.exports.mac, a, b, c];
 14 | }
 15 | 
 16 | function gen_ref(mac_per_load, loads_per_loop, loops, simd) {
 17 |   return function(A, B, C) {
 18 |     const elems = loads_per_loop * (simd ? 4 : 1);
 19 |     for (let l = 0; l < loops; ++l) {
 20 |       for (let ll = 0; ll < elems; ++ll) {
 21 |         let a = A[ll];
 22 |         let b = B[ll];
 23 |         let c = C[ll];
 24 |         for (let m = 0; m < mac_per_load; ++m) {
 25 |           c = a * b + c
 26 |         }
 27 |         C[ll] = c;
 28 |       }
 29 |     }
 30 |   }
 31 | }
 32 | 
 33 | function log(...args) {
 34 |   const str = args.reduce((a, b) => {
 35 |     return a + " " + b;
 36 |   }, "");
 37 |   document.querySelector('#output').appendChild(document.createTextNode(str));
 38 |   document.querySelector('#output').appendChild(document.createElement('br'));
 39 | }
 40 | 
 41 | function log_best(...args) {
 42 |   document.querySelector('#best').innerHTML = '';
 43 |   const str = args.reduce((a, b) => {
 44 |     return a + " " + b;
 45 |   }, "");
 46 |   document.querySelector('#best').appendChild(document.createTextNode(str));
 47 |   document.querySelector('#best').appendChild(document.createElement('br'));
 48 |   document.querySelector('#best').appendChild(document.createElement('br'));
 49 | }
 50 | 
 51 | function rand(a) {
 52 |   for (let i = 0; i < a.length; ++i) {
 53 |     a[i] = Math.random() / 100;
 54 |   }
 55 | }
 56 | 
 57 | function diff(a, b) {
 58 |   let max_diff = 0;
 59 |   for (let i = 0; i < a.length; ++i) {
 60 |     Math.max(Math.abs(a[i] - b[i]), max_diff);
 61 |   }
 62 |   return max_diff;
 63 | }
 64 | 
 65 | async function launch_mac_benchmark() {
 66 |   const Module = await createMyModule();
 67 |   let simd_support = [0];
 68 | 
 69 |   jit(Module, 1, 1, 1, true).then(() => {
 70 |     simd_support.push(1);
 71 |   }).catch(() => {
 72 |     log("no simd support");
 73 |   });
 74 | 
 75 |   let best_gflops = 0;
 76 |   let best_str = '';
 77 |   for (let mac_per_load of [1, 2, 4, 8, 16, 32]) {
 78 |     for (let loads_per_loop of [1, 2, 4, 8, 16, 32]) {
 79 |       for (let loops of [1, 16, 64, 128]) {
 80 |         for (let simd of simd_support) {
 81 |           const [fn, a, b, c] = await jit(Module, mac_per_load, loads_per_loop, loops, simd);
 82 |           const ops = loops * loads_per_loop * mac_per_load * (simd ? 4 : 1);
 83 |           rand(a);
 84 |           rand(b);
 85 |           rand(c);
 86 |           const ref_c = new Float32Array(c.length);
 87 |           ref_c.set(c);
 88 |           fn();
 89 |           const str = `(MACs per load: ${mac_per_load}, Loads per loop ${loads_per_loop}, Loops: ${loops}, SIMD: ${simd})`;
 90 |           const err = diff(c, ref_c);
 91 |           if (err > 0.1) {
 92 |             log("error!", str, 'example elem:', c[0]);
 93 |             continue;
 94 |           }
 95 | 
 96 |           const iters_sec = bench(100, fn);
 97 |           const gflops = ops * 2 * iters_sec / 1e9;
 98 |           log(`${gflops} GFlops`, str);
 99 |           if (gflops > best_gflops) {
100 |             best_gflops = gflops;
101 |             best_str = str;
102 |             log_best(`Best: ${gflops} GFlops`, str);
103 |           }
104 |         }
105 |       }
106 |     }
107 |   }
108 |   log_best(`done. Best: ${best_gflops} GFlops`, best_str);
109 | }
110 | 
/**
 * Times `fn` in batches that double in size until one batch takes at least
 * half of `target` milliseconds, then returns that batch's rate in calls
 * per second.
 */
function run_bench(target, fn) {
  const min_elapsed_ms = target / 2;
  let elapsed_ms = 0;
  let batch = 1;
  for (; elapsed_ms < min_elapsed_ms;) {
    batch += batch;
    const started = performance.now();
    for (let call = 0; call < batch; ++call) {
      fn();
    }
    elapsed_ms = performance.now() - started;
  }
  return 1e3 * batch / elapsed_ms;
}
126 | 
/**
 * Benchmarks `fn`: first a warm-up run with a tenth of the time budget
 * (result discarded), then the measured run with the full `target_ms`.
 * Returns the measured calls per second.
 */
function bench(target_ms, fn) {
  run_bench(target_ms / 10, fn);
  const iters_per_sec = run_bench(target_ms, fn);
  return iters_per_sec;
}


--------------------------------------------------------------------------------
/matmul_example/Makefile:
--------------------------------------------------------------------------------
# Compiles mm.cc with emcc into mm.js, with ../ on the include path.
# The exported C functions are the jit_mm* generators, their *_len
# companions, and free; the module factory is named createMyModule.
all:
	emcc mm.cc -I../ -s EXPORTED_FUNCTIONS="['_jit_mm', '_jit_mm_len', '_jit_mm_naive', '_jit_mm_naive_len', '_jit_mm_nosimd', '_jit_mm_nosimd_len', '_free']" -O3 -s ASSERTIONS=1 -s SINGLE_FILE=1 -s MODULARIZE -s 'EXPORT_NAME="createMyModule"' -s INITIAL_MEMORY=67108864 -o mm.js
3 | 


--------------------------------------------------------------------------------
/matmul_example/README.md:
--------------------------------------------------------------------------------
 1 | ## Matmul Example
 2 | 
 3 | This folder contains an example using emscripten and wasmblr to generate various tuned matrix multiplication implementations
 4 | on the fly in the browser.
 5 | 
 6 | A writeup can be found here: https://jott.live/markdown/mm_wasm
 7 | 
 8 | To use the demo ensure that `emcc` is in your path.
 9 | 
10 | ```
11 | make
12 | ```
13 | 
14 | and then host a server
15 | 
16 | ```
17 | python3 -m http.server
18 | ```
19 | 
20 | http://localhost:8000 (the default port of `http.server`) should have a tuning script up and running.
21 | 


--------------------------------------------------------------------------------
/matmul_example/index.html:
--------------------------------------------------------------------------------
 1 | <script src="mm.js"></script>
 2 | <script src="main.js"></script>
 3 | <pre>
 4 | Select the benchmark to tune for:
 5 | </pre>
 6 | <button id="mm128">N=128</button>
 7 | <button id="mm256">N=256</button>
 8 | <button id="mm512">N=512</button>
 9 | <pre id="highlight"></pre>
10 | <pre id="output"></pre>
11 | 


--------------------------------------------------------------------------------
/matmul_example/main.js:
--------------------------------------------------------------------------------
  1 | function log(...args) {
  2 |   const str = args.reduce((a, b) => {
  3 |     return a + " " + b;
  4 |   }, "");
  5 |   document.querySelector('#output').appendChild(document.createTextNode(str));
  6 |   document.querySelector('#output').appendChild(document.createElement('br'));
  7 | }
  8 | 
  9 | async function has_simd(Module) {
 10 |   const wasm = Module._jit_mm(4,4,4,1,1,1);
 11 |   const wasm_len = Module._jit_mm_len(4,4,4,1,1,1);
 12 |   const wasm_data = new Uint8Array(Module.HEAP8.buffer, wasm, wasm_len);
 13 |   let has = true;
 14 |   const m = await WebAssembly.compile(wasm_data).catch(e => {
 15 |     has = false;
 16 |   });
 17 |   Module._free(wasm);
 18 |   return has;
 19 | }
 20 | 
 21 | async function jit(Module, M, N, K, Mu, Nu, Ku) {
 22 |   let [jit, len] = [Module._jit_mm, Module._jit_mm_len];
 23 |   const simd = await has_simd(Module);
 24 |   if (!simd) {
 25 |     [jit, len] = [Module._jit_mm_nosimd, Module._jit_mm_nosimd_len];
 26 |   }
 27 |   const wasm = jit(M, N, K, Mu, Nu, Ku);
 28 |   const wasm_len = len(M, N, K, Mu, Nu, Ku);
 29 |   const wasm_data = new Uint8Array(Module.HEAP8.buffer, wasm, wasm_len);
 30 |   const m = await WebAssembly.compile(wasm_data).catch(e => log('Error compiling ->', e));
 31 |   const instance = await WebAssembly.instantiate(m, {});
 32 |   Module._free(wasm);
 33 |   const mem = instance.exports.mem;
 34 |   let a = new Float32Array(mem.buffer, 0, M * K);
 35 |   let b = new Float32Array(mem.buffer, M * K * 4, K * N);
 36 |   let c = new Float32Array(mem.buffer, (M * K + K * N) * 4, M * N);
 37 |   return [instance.exports.mm, a, b, c, simd];
 38 | }
 39 | 
 40 | function ref_mm(a, b, M, N, K) {
 41 |   c = new Float32Array(M * N);
 42 |   for (let m = 0; m < M; ++m) {
 43 |     for (let n = 0; n < N; ++n) {
 44 |       for (let k = 0; k < K; ++k) {
 45 |         c[m * N + n] += a[m * K + k] * b[k * N + n];
 46 |       }
 47 |     }
 48 |   }
 49 |   return c;
 50 | }
 51 | 
 52 | async function bench(m, M, N, K, Mu, Nu, Ku) {
 53 |   const [fn, a, b, c, simd] = await jit(m, M, N, K, Mu, Nu, Ku);
 54 |   for (let i = 0; i < N * N; ++i) {
 55 |     a[i] = Math.random();
 56 |     b[i] = Math.random();
 57 |     c[i] = 0;
 58 |   }
 59 |   fn();
 60 |   const ref_c = ref_mm(a, b, M, N, K);
 61 |   let max_diff = 0;
 62 |   for (let i = 0; i < M * N; ++i) {
 63 |     max_diff = Math.max(max_diff, Math.abs(ref_c[i] - c[i]));
 64 |   }
 65 |   console.log("max diff", max_diff);
 66 |   if (max_diff > 0.1) {
 67 |     log("error! max diff", max_diff);
 68 |   }
 69 |   for (let i = 0; i < 10; ++i) {
 70 |     fn();
 71 |   }
 72 |   // ~0.1if we hit 40gflops
 73 |   const iters = 4e9 / (M * N * K * 2) / (simd ? 1 : 4);
 74 |   const t = performance.now();
 75 |   for (let _ = 0; _ < iters; ++_) {
 76 |     fn();
 77 |   }
 78 |   const diff = performance.now() - t;
 79 |   return 1e3 * N * N * N * 2 * iters / diff / 1e9;
 80 | }
 81 | 
 82 | async function init(N) {
 83 |   document.getElementById("output").textContent = '';
 84 |   document.getElementById("highlight").textContent = '';
 85 |   let mod = await createMyModule();
 86 |   const M = N;
 87 |   const K = N;
 88 |   let best_gflops = 0;
 89 |   let best_str = '';
 90 |   const simd = await has_simd(mod);
 91 |   if (!simd) {
 92 |     log('No simd found, falling back to scalar code.');
 93 |   }
 94 |   for (let m of [1, 2, 4, 8, 16, 32]) {
 95 |     for (let n of [1, 2, 4, 8, 16, 32]) {
 96 |       for (let k of [1, 2, 4, 8, 16, 32]) {
 97 |         if (k > K) {
 98 |           continue;
 99 |         }
100 |         if (m > M) {
101 |           continue;
102 |         }
103 |         if (n * 4 > N) {
104 |           continue;
105 |         }
106 |         let gflops = await bench(mod, M, N, K, m, n, k);
107 |         if (gflops > best_gflops) {
108 |           best_gflops = gflops;
109 |           let pre = document.getElementById("highlight");
110 |           best_str = `best gflops: ${best_gflops} (unroll m: ${m}, n: ${n}, k: ${k})`;
111 |           pre.textContent = best_str;
112 |         }
113 |         log(m, n, k, "gflops", gflops);
114 |       }
115 |     }
116 |   }
117 |   let pre = document.getElementById("highlight");
118 |   let str = `(done) ${best_str}`;
119 |   pre.textContent = str;
120 | }
121 | 
// Once the page has loaded, wire each size button (mm128, mm256, mm512) to
// start a tuning run for the matching problem size.
window.addEventListener('load', () => {
  for (const size of [128, 256, 512]) {
    const button = document.getElementById(`mm${size}`);
    button.addEventListener('click', () => init(size));
  }
});
127 | 


--------------------------------------------------------------------------------
/matmul_example/mm.cc:
--------------------------------------------------------------------------------
  1 | #include "wasmblr.h"
  2 | 
// Generates a WebAssembly module that performs C += A * B on f32 matrices
// using scalar instructions only, with configurable loop unrolling.
//
// The module exports its memory as "mem" and the kernel as "mm" (no
// parameters, no results). Matrices are row-major and stored back to back:
// A (M x K) at byte 0, B (K x N) after A, C (M x N) after B.
//
// NOTE(review): the m and k loops advance by their unroll factor and the n
// loop runs N / N_unroll times, so this looks like it expects M, N and K to
// be multiples of the respective unroll factors - confirm with callers.
struct MMGenNoSIMD : public wasmblr::CodeGenerator {
  MMGenNoSIMD(int M, int N, int K, int M_unroll, int N_unroll, int K_unroll) {
    assert(M_unroll <= M && "Invalid M unroll size");
    assert(N_unroll <= N && "Invalid N unroll size");
    assert(K_unroll <= K && "Invalid K unroll size");
    // Number of 64 KiB pages needed to hold all three f32 matrices.
    auto pages = (M * N + K * N + M * K) * 4 / (1 << 16) + 1;
    // Byte offsets of each matrix inside the linear memory.
    auto A_off = 0;
    auto B_off = M * K * 4;
    auto C_off = (M * K + K * N) * 4;
    memory(pages).export_("mem");
    auto fn = function({}, {}, [=]() {
      // Loop counters.
      auto m = local(i32);
      auto n = local(i32);
      auto k = local(i32);
      // One f32 local per unrolled A value and per unrolled B value.
      std::vector<int> load_a;
      std::vector<int> load_b;
      for (auto j = 0; j < K_unroll; ++j) {
        for (auto i = 0; i < M_unroll; ++i) {
          load_a.emplace_back(local(f32));
        }
        for (auto i = 0; i < N_unroll; ++i) {
          load_b.emplace_back(local(f32));
        }
      }
      // Running byte offsets into A, B and C (relative to each matrix base).
      auto a_off = local(i32);
      auto b_off = local(i32);
      auto c_off = local(i32);
      // One accumulator per element of the M_unroll x N_unroll output tile.
      std::vector<int> accs;
      for (auto i = 0; i < M_unroll * N_unroll; ++i) {
        accs.emplace_back(local(f32));
      }

      i32.const_(0);
      local.set(m);
      loop(void_); // M

      // c_off = m * N * 4: start of output row m.
      local.get(m);
      i32.const_(N * 4);
      i32.mul();
      local.set(c_off);

      i32.const_(0);
      local.set(n);
      loop(void_); // N

      // Load the current C tile into the accumulators.
      for (auto m_unroll = 0; m_unroll < M_unroll; ++m_unroll) {
        for (auto n_unroll = 0; n_unroll < N_unroll; ++n_unroll) {
          local.get(c_off);
          f32.load(0, C_off + n_unroll * 4 + m_unroll * N * 4);
          local.set(accs.at(m_unroll * N_unroll + n_unroll));
        }
      }

      // a_off = m * K * 4: start of row m of A.
      local.get(m);
      i32.const_(K * 4);
      i32.mul();
      local.set(a_off);

      // b_off = n * N_unroll * 4: first column of the current tile in B
      // (n counts tiles, not columns).
      local.get(n);
      i32.const_(4 * N_unroll);
      i32.mul();
      local.set(b_off);

      i32.const_(0);
      local.set(k);
      loop(void_); // K

      for (auto k_unroll = 0; k_unroll < K_unroll; ++k_unroll) {
        // Load M_unroll values of A for this k step.
        for (auto m_unroll = 0; m_unroll < M_unroll; ++m_unroll) {
          local.get(a_off);
          f32.load(0, A_off + (m_unroll * K + k_unroll) * 4);
          local.set(load_a.at(m_unroll * K_unroll + k_unroll));
        }

        // Load N_unroll values of B for this k step.
        for (auto n_unroll = 0; n_unroll < N_unroll; ++n_unroll) {
          local.get(b_off);
          f32.load(0, B_off + (k_unroll * N + n_unroll) * 4);
          local.set(load_b.at(n_unroll * K_unroll + k_unroll));
        }

        // acc[m][n] += a[m] * b[n] for every element of the tile.
        for (auto m_unroll = 0; m_unroll < M_unroll; ++m_unroll) {
          for (auto n_unroll = 0; n_unroll < N_unroll; ++n_unroll) {

            local.get(accs.at(m_unroll * N_unroll + n_unroll));
            local.get(load_a.at(m_unroll * K_unroll + k_unroll));
            local.get(load_b.at(n_unroll * K_unroll + k_unroll));
            f32.mul();
            f32.add();
            local.set(accs.at(m_unroll * N_unroll + n_unroll));
          }
        }
      }

      // Advance A by K_unroll columns.
      local.get(a_off);
      i32.const_(4 * K_unroll);
      i32.add();
      local.set(a_off);

      // Advance B by K_unroll rows.
      local.get(b_off);
      i32.const_(N * 4 * K_unroll);
      i32.add();
      local.set(b_off);

      // k += K_unroll; repeat while k < K.
      local.get(k);
      i32.const_(K_unroll);
      i32.add();
      local.tee(k);
      i32.const_(K);
      i32.lt_u();
      br_if(0);

      end(); // K

      // store output of C
      for (auto m_unroll = 0; m_unroll < M_unroll; ++m_unroll) {
        for (auto n_unroll = 0; n_unroll < N_unroll; ++n_unroll) {
          local.get(c_off);
          local.get(accs.at(m_unroll * N_unroll + n_unroll));
          f32.store(0, C_off + n_unroll * 4 + m_unroll * N * 4);
        }
      }

      // Move to the next tile of N_unroll output columns.
      local.get(c_off);
      i32.const_(N_unroll * 4);
      i32.add();
      local.set(c_off);

      // n += 1; repeat while n < N / N_unroll.
      local.get(n);
      i32.const_(1);
      i32.add();
      local.tee(n);
      i32.const_(N / N_unroll);
      i32.lt_u();
      br_if(0);

      end(); // N

      // m += M_unroll; repeat while m < M.
      local.get(m);
      i32.const_(M_unroll);
      i32.add();
      local.tee(m);
      i32.const_(M);
      i32.lt_u();
      br_if(0);

      end(); // M
    });
    export_(fn, "mm");
  }
};
153 | 
// Generates a straightforward (no unrolling, no SIMD) WebAssembly module
// that performs C += A * B on f32 matrices with three nested loops.
//
// The module exports its memory as "mem" and the kernel as "mm" (no
// parameters, no results). Matrices are row-major and stored back to back:
// A (M x K) at byte 0, B (K x N) after A, C (M x N) after B.
struct MMGenSimple : public wasmblr::CodeGenerator {
  MMGenSimple(int M, int N, int K) {
    // Number of 64 KiB pages needed to hold all three f32 matrices.
    auto pages = (M * N + K * N + M * K) * 4 / (1 << 16) + 1;
    // Byte offsets of each matrix inside the linear memory.
    auto A_off = 0;
    auto B_off = M * K * 4;
    auto C_off = (M * K + K * N) * 4;
    memory(pages).export_("mem");
    auto fn = function({}, {}, [=]() {
      // Loop counters.
      auto m = local(i32);
      auto n = local(i32);
      auto k = local(i32);

      // loop over m
      i32.const_(0);
      local.set(m);
      loop(void_);

      // loop over n
      i32.const_(0);
      local.set(n);
      loop(void_);

      // loop over k
      i32.const_(0);
      local.set(k);
      loop(void_);

      // load original value of C
      // address = (m * N + n) * 4
      local.get(m);
      i32.const_(N);
      i32.mul();
      local.get(n);
      i32.add();
      i32.const_(4);
      i32.mul();
      f32.load(0, C_off); // stack: [C]

      // load value of A
      // address = (m * K + k) * 4
      local.get(m);
      i32.const_(K);
      i32.mul();
      local.get(k);
      i32.add();
      i32.const_(4);
      i32.mul();
      f32.load(0, A_off); // stack: [A, C]

      // load value of B
      // address = (k * N + n) * 4
      local.get(k);
      i32.const_(N);
      i32.mul();
      local.get(n);
      i32.add();
      i32.const_(4);
      i32.mul();
      f32.load(0, B_off); // stack: [B, A, C]

      f32.mul(); // stack: [B * A, C]
      f32.add(); // stack: [B * A + C]
      // The store below needs the address pushed before the value, so the
      // result is parked in a local while the address is computed.
      auto c = local(f32);
      local.set(c); // save temporarily

      // store new value to C
      local.get(m);
      i32.const_(N);
      i32.mul();
      local.get(n);
      i32.add();
      i32.const_(4);
      i32.mul();
      local.get(c); // push the saved value back to the stack
      f32.store(0, C_off);

      // loop tail for k
      // k += 1; repeat while k < K.
      local.get(k);
      i32.const_(1);
      i32.add();
      local.tee(k);
      i32.const_(K);
      i32.lt_u();
      br_if(0);
      end();

      // loop tail for n
      // n += 1; repeat while n < N.
      local.get(n);
      i32.const_(1);
      i32.add();
      local.tee(n);
      i32.const_(N);
      i32.lt_u();
      br_if(0);
      end();

      // loop tail for m
      // m += 1; repeat while m < M.
      local.get(m);
      i32.const_(1);
      i32.add();
      local.tee(m);
      i32.const_(M);
      i32.lt_u();
      br_if(0);
      end(); // M
    });
    export_(fn, "mm");
  }
};
260 | 
// Generates a WebAssembly SIMD module that performs C += A * B on f32
// matrices, processing four output columns per v128 register, with
// configurable loop unrolling.
//
// The module exports its memory as "mem" and the kernel as "mm" (no
// parameters, no results). Matrices are row-major and stored back to back:
// A (M x K) at byte 0, B (K x N) after A, C (M x N) after B.
//
// NOTE(review): the m and k loops advance by their unroll factor and the n
// loop runs N / 4 / N_unroll times, so this looks like it expects M and K
// to be multiples of their unroll factors and N to be a multiple of
// 4 * N_unroll - confirm with callers.
struct MMGen : public wasmblr::CodeGenerator {
  MMGen(int M, int N, int K, int M_unroll, int N_unroll, int K_unroll) {
    assert(M_unroll <= M && "Invalid M unroll size");
    // Each unit of N unrolling covers four columns (one v128 of f32).
    assert((N_unroll * 4) <= N && "Invalid N unroll size");
    assert(K_unroll <= K && "Invalid K unroll size");
    // Number of 64 KiB pages needed to hold all three f32 matrices.
    auto pages = (M * N + K * N + M * K) * 4 / (1 << 16) + 1;
    // Byte offsets of each matrix inside the linear memory.
    auto A_off = 0;
    auto B_off = M * K * 4;
    auto C_off = (M * K + K * N) * 4;
    memory(pages).export_("mem");
    auto fn = function({}, {}, [=]() {
      // Loop counters.
      auto m = local(i32);
      auto n = local(i32);
      auto k = local(i32);
      // One v128 local per unrolled A value and per unrolled B vector.
      std::vector<int> load_a;
      std::vector<int> load_b;
      for (auto j = 0; j < K_unroll; ++j) {
        for (auto i = 0; i < M_unroll; ++i) {
          load_a.emplace_back(local(v128));
        }
        for (auto i = 0; i < N_unroll; ++i) {
          load_b.emplace_back(local(v128));
        }
      }
      // Running byte offsets into A, B and C (relative to each matrix base).
      auto a_off = local(i32);
      auto b_off = local(i32);
      auto c_off = local(i32);
      // One v128 accumulator per vector of the output tile.
      std::vector<int> accs;
      for (auto i = 0; i < M_unroll * N_unroll; ++i) {
        accs.emplace_back(local(v128));
      }

      i32.const_(0);
      local.set(m);
      loop(void_); // M

      // c_off = m * N * 4: start of output row m.
      local.get(m);
      i32.const_(N * 4);
      i32.mul();
      local.set(c_off);

      i32.const_(0);
      local.set(n);
      loop(void_); // N

      // Load the current C tile into the accumulators (16 bytes per vector).
      for (auto m_unroll = 0; m_unroll < M_unroll; ++m_unroll) {
        for (auto n_unroll = 0; n_unroll < N_unroll; ++n_unroll) {
          local.get(c_off);
          v128.load(0, C_off + n_unroll * 4 * 4 + m_unroll * N * 4);
          local.set(accs.at(m_unroll * N_unroll + n_unroll));
        }
      }

      // a_off = m * K * 4: start of row m of A.
      local.get(m);
      i32.const_(K * 4);
      i32.mul();
      local.set(a_off);

      // b_off = n * N_unroll * 16: first column of the current tile in B
      // (n counts tiles of 4 * N_unroll columns).
      local.get(n);
      i32.const_(4 * 4 * N_unroll);
      i32.mul();
      local.set(b_off);

      i32.const_(0);
      local.set(k);
      loop(void_); // K

      for (auto k_unroll = 0; k_unroll < K_unroll; ++k_unroll) {
        // Load one A element per unrolled row with the splat variant of the
        // 32-bit load, so it can be multiplied against a B vector.
        for (auto m_unroll = 0; m_unroll < M_unroll; ++m_unroll) {
          local.get(a_off);
          v128.load32_splat(0, A_off + (m_unroll * K + k_unroll) * 4);
          local.set(load_a.at(m_unroll * K_unroll + k_unroll));
        }

        // Load N_unroll vectors of four consecutive B values.
        for (auto n_unroll = 0; n_unroll < N_unroll; ++n_unroll) {
          local.get(b_off);
          v128.load(0, B_off + (k_unroll * N + n_unroll * 4) * 4);
          local.set(load_b.at(n_unroll * K_unroll + k_unroll));
        }

        // acc[m][n] += a[m] * b[n], four lanes at a time.
        for (auto m_unroll = 0; m_unroll < M_unroll; ++m_unroll) {
          for (auto n_unroll = 0; n_unroll < N_unroll; ++n_unroll) {

            local.get(accs.at(m_unroll * N_unroll + n_unroll));
            local.get(load_a.at(m_unroll * K_unroll + k_unroll));
            local.get(load_b.at(n_unroll * K_unroll + k_unroll));
            v128.f32x4_mul();
            v128.f32x4_add();
            local.set(accs.at(m_unroll * N_unroll + n_unroll));
          }
        }
      }

      // Advance A by K_unroll columns.
      local.get(a_off);
      i32.const_(4 * K_unroll);
      i32.add();
      local.set(a_off);

      // Advance B by K_unroll rows.
      local.get(b_off);
      i32.const_(N * 4 * K_unroll);
      i32.add();
      local.set(b_off);

      // k += K_unroll; repeat while k < K.
      local.get(k);
      i32.const_(K_unroll);
      i32.add();
      local.tee(k);
      i32.const_(K);
      i32.lt_u();
      br_if(0);

      end(); // K

      // store output of C
      for (auto m_unroll = 0; m_unroll < M_unroll; ++m_unroll) {
        for (auto n_unroll = 0; n_unroll < N_unroll; ++n_unroll) {
          local.get(c_off);
          local.get(accs.at(m_unroll * N_unroll + n_unroll));
          v128.store(0, C_off + n_unroll * 4 * 4 + m_unroll * N * 4);
        }
      }

      // Move to the next tile of 4 * N_unroll output columns.
      local.get(c_off);
      i32.const_(N_unroll * 4 * 4);
      i32.add();
      local.set(c_off);

      // n += 1; repeat while n < N / 4 / N_unroll.
      local.get(n);
      i32.const_(1);
      i32.add();
      local.tee(n);
      i32.const_(N / 4 / N_unroll);
      i32.lt_u();
      br_if(0);

      end(); // N

      // m += M_unroll; repeat while m < M.
      local.get(m);
      i32.const_(M_unroll);
      i32.add();
      local.tee(m);
      i32.const_(M);
      i32.lt_u();
      br_if(0);

      end(); // M
    });
    export_(fn, "mm");
  }
};
411 | 
412 | extern "C" {
413 | 
414 | uint8_t *jit_mm_naive(int M, int N, int K) {
415 |   MMGenSimple mm(M, N, K);
416 |   auto bytes = mm.emit();
417 |   uint8_t *out = (uint8_t *)malloc(bytes.size());
418 |   memcpy(out, bytes.data(), bytes.size());
419 |   return out;
420 | }
421 | 
422 | int jit_mm_naive_len(int M, int N, int K) {
423 |   MMGenSimple mm(M, N, K);
424 |   auto bytes = mm.emit();
425 |   return bytes.size();
426 | }
427 | 
428 | uint8_t *jit_mm(int M, int N, int K, int Mu, int Nu, int Ku) {
429 |   MMGen mm(M, N, K, Mu, Nu, Ku);
430 |   auto bytes = mm.emit();
431 |   uint8_t *out = (uint8_t *)malloc(bytes.size());
432 |   memcpy(out, bytes.data(), bytes.size());
433 |   return out;
434 | }
435 | 
436 | int jit_mm_len(int M, int N, int K, int Mu, int Nu, int Ku) {
437 |   MMGen mm(M, N, K, Mu, Nu, Ku);
438 |   auto bytes = mm.emit();
439 |   return bytes.size();
440 | }
441 | 
442 | uint8_t *jit_mm_nosimd(int M, int N, int K, int Mu, int Nu, int Ku) {
443 |   MMGenNoSIMD mm(M, N, K, Mu, Nu, Ku);
444 |   auto bytes = mm.emit();
445 |   uint8_t *out = (uint8_t *)malloc(bytes.size());
446 |   memcpy(out, bytes.data(), bytes.size());
447 |   return out;
448 | }
449 | 
450 | int jit_mm_nosimd_len(int M, int N, int K, int Mu, int Nu, int Ku) {
451 |   MMGenNoSIMD mm(M, N, K, Mu, Nu, Ku);
452 |   auto bytes = mm.emit();
453 |   return bytes.size();
454 | }
455 | }
456 | 


--------------------------------------------------------------------------------
/test.cc:
--------------------------------------------------------------------------------
  1 | #include <fstream>
  2 | #include <iostream>
  3 | #include <sstream>
  4 | #include "wasmblr.h"
  5 | 
  6 | void testJS(wasmblr::CodeGenerator& c,
  7 |             std::string invoke,
  8 |             std::string expected) {
  9 |   std::stringstream ss;
 10 |   ss << "const wasm = new Uint8Array([";
 11 |   for (const auto& b : c.emit()) {
 12 |     ss << "0x" << std::hex << static_cast<int>(b) << ", ";
 13 |   }
 14 |   ss << "]);\n";
 15 |   ss << "const m = new WebAssembly.Module(wasm);\n";
 16 |   ss << "const instance = new WebAssembly.Instance(m, {});\n";
 17 |   ss << invoke;
 18 | 
 19 |   std::string node_file = "/tmp/test.js";
 20 |   std::string out_file = "/tmp/test.out";
 21 |   std::ofstream nf(node_file);
 22 |   nf << ss.str();
 23 |   nf << std::flush;
 24 |   std::system(("node " + node_file + " > " + out_file).c_str());
 25 |   std::stringstream ss_out;
 26 |   ss_out << std::ifstream(out_file).rdbuf();
 27 |   if (ss_out.str() != expected) {
 28 |     std::cerr << "got: " << ss_out.str();
 29 |     std::cerr << "expected: " << expected;
 30 |     std::ofstream wasm("error.wasm", std::ios::binary);
 31 |     std::cerr << "generated wasm saved to error.wasm\n";
 32 |     auto bytes = c.emit();
 33 |     wasm.write((char*)bytes.data(), bytes.size());
 34 |   }
 35 |   assert(ss_out.str() == expected && "failed");
 36 | }
 37 | 
 38 | void testBasic() {
 39 |   struct Code : wasmblr::CodeGenerator {
 40 |     Code() : wasmblr::CodeGenerator() {
 41 |       auto add_func = function({f32, f32}, {f32}, [&]() {
 42 |         local.get(0);
 43 |         local.get(1);
 44 |         f32.add();
 45 |       });
 46 |       export_(add_func, "add");
 47 |     }
 48 |   };
 49 |   Code c;
 50 |   testJS(c, "console.log(instance.exports.add(8, 4));", "12\n");
 51 | }
 52 | 
 53 | void testConstant() {
 54 |   struct Code : wasmblr::CodeGenerator {
 55 |     Code() : wasmblr::CodeGenerator() {
 56 |       auto constant_func =
 57 |           function({}, {i32}, [&]() { i32.const_(1024 * 1024 * 4); });
 58 |       export_(constant_func, "constant");
 59 |     }
 60 |   };
 61 |   Code c;
 62 |   testJS(c, "console.log(instance.exports.constant());", "4194304\n");
 63 | }
 64 | 
 65 | void testRecursive() {
 66 |   struct Code : wasmblr::CodeGenerator {
 67 |     // NB: Needs to be a class variable,
 68 |     // the function body is evaluated later
 69 |     uint32_t factorial;
 70 |     Code() : wasmblr::CodeGenerator() {
 71 |       factorial = function({f32}, {f32}, [&]() {
 72 |         local.get(0);
 73 |         f32.const_(1.0f);
 74 |         f32.lt();
 75 |         if_(f32);
 76 |         { f32.const_(1.0f); }
 77 |         else_();
 78 |         {
 79 |           local.get(0);
 80 |           local.get(0);
 81 |           f32.const_(1.0f);
 82 |           f32.sub();
 83 |           call(factorial);
 84 |           f32.mul();
 85 |         }
 86 |         end();
 87 |       });
 88 |       export_(factorial, "factorial");
 89 |     }
 90 |   };
 91 |   Code c;
 92 |   testJS(c, "console.log(instance.exports.factorial(4));", "24\n");
 93 |   testJS(c, "console.log(instance.exports.factorial(7));", "5040\n");
 94 | }
 95 | 
 96 | void testIfStatement() {
 97 |   struct Code : wasmblr::CodeGenerator {
 98 |     Code() : wasmblr::CodeGenerator() {
 99 |       auto if_func = function({f32}, {f32}, [&]() {
100 |         f32.const_(0.0f);
101 |         local.get(0);
102 |         f32.gt();
103 |         if_(f32);
104 |         f32.const_(0.0f);
105 |         else_();
106 |         local.get(0);
107 |         end();
108 |       });
109 |       export_(if_func, "relu");
110 |     }
111 |   };
112 |   Code c;
113 |   testJS(c, "console.log(instance.exports.relu(-2));", "0\n");
114 |   testJS(c, "console.log(instance.exports.relu(2));", "2\n");
115 | }
116 | 
117 | void testLoop() {
118 |   struct Code : wasmblr::CodeGenerator {
119 |     Code() : wasmblr::CodeGenerator() {
120 |       auto loop_fn = function({}, {i32}, [&]() {
121 |         auto i = local(i32);
122 | 
123 |         loop(void_);
124 |         {
125 |           local.get(i);
126 |           i32.const_(1);
127 |           i32.add();
128 |           local.set(i);
129 | 
130 |           local.get(i);
131 |           i32.const_(10);
132 |           i32.lt_s();
133 |           br_if(0);
134 |         }
135 |         end();
136 |         local.get(i);
137 |       });
138 |       export_(loop_fn, "loop");
139 |     }
140 |   };
141 |   Code c;
142 |   testJS(c, "console.log(instance.exports.loop());", "10\n");
143 | }
144 | 
145 | void testMemory() {
146 |   struct Code : wasmblr::CodeGenerator {
147 |     Code() : wasmblr::CodeGenerator() { memory(1, 10).export_("mem"); }
148 |   };
149 |   Code c;
150 |   testJS(c, R"(
151 | console.log(instance.exports.mem.buffer.byteLength);
152 | instance.exports.mem.grow(1);
153 | console.log(instance.exports.mem.buffer.byteLength);
154 |   )",
155 |          "65536\n131072\n");
156 | }
157 | 
158 | void testStore() {
159 |   struct Code : wasmblr::CodeGenerator {
160 |     Code() : wasmblr::CodeGenerator() {
161 |       memory(1, 10).export_("mem");
162 |       auto store = function({}, {}, [&]() {
163 |         i32.const_(0);     // index 0
164 |         i32.const_(1337);  // value 1337
165 |         i32.store(0, 0);   // align 0, offset 0
166 |       });
167 |       export_(store, "store");
168 |     }
169 |   };
170 |   Code c;
171 |   testJS(c, R"(
172 | instance.exports.store();
173 | console.log(new Uint32Array(instance.exports.mem.buffer)[0])
174 |   )",
175 |          "1337\n");
176 | }
177 | 
178 | void testSIMD() {
179 |   struct Code : wasmblr::CodeGenerator {
180 |     Code() : wasmblr::CodeGenerator() {
181 |       memory(1, 10).export_("mem");
182 |       auto square = function({}, {}, [&]() {
183 |         auto vec = local(v128);
184 |         i32.const_(0);
185 |         v128.load();
186 |         local.set(vec);
187 | 
188 |         local.get(vec);
189 |         local.get(vec);
190 |         v128.f32x4_mul();
191 |         local.set(vec);
192 | 
193 |         i32.const_(0);
194 |         local.get(vec);
195 |         v128.store();
196 |       });
197 |       export_(square, "simd_square");
198 |     }
199 |   };
200 |   Code c;
201 |   testJS(c, R"(
202 | let inp = new Float32Array(instance.exports.mem.buffer);
203 | inp[0] = 1;
204 | inp[1] = 2;
205 | inp[2] = 3;
206 | inp[3] = 4;
207 | instance.exports.simd_square();
208 | console.log(inp[0], inp[1], inp[2], inp[3]);
209 |   )",
210 |          "1 4 9 16\n");
211 | }
212 | 
213 | void testSIMDShift() {
214 |   struct Code : wasmblr::CodeGenerator {
215 |     Code() : wasmblr::CodeGenerator() {
216 |       memory(1, 10).export_("mem");
217 |       auto square = function({}, {}, [&]() {
218 |         auto vec = local(v128);
219 |         i32.const_(0);
220 |         v128.load();
221 |         local.set(vec);
222 | 
223 |         local.get(vec);
224 |         i32.const_(2);
225 |         v128.i32x4_shl();
226 |         local.set(vec);
227 | 
228 |         i32.const_(0);
229 |         local.get(vec);
230 |         v128.store();
231 |       });
232 |       export_(square, "simd_square");
233 |     }
234 |   };
235 |   Code c;
236 |   testJS(c, R"(
237 | let inp = new Int32Array(instance.exports.mem.buffer);
238 | inp[0] = 1;
239 | inp[1] = 2;
240 | inp[2] = 3;
241 | inp[3] = 4;
242 | instance.exports.simd_square();
243 | console.log(inp[0], inp[1], inp[2], inp[3]);
244 |   )",
245 |          "4 8 12 16\n");
246 | }
247 | 
248 | int main() {
249 |   testBasic();
250 |   testConstant();
251 |   testRecursive();
252 |   testIfStatement();
253 |   testLoop();
254 |   testMemory();
255 |   testStore();
256 |   testSIMD();
257 |   testSIMDShift();
258 |   std::cout << "pass.\n";
259 | }
260 | 


--------------------------------------------------------------------------------
/thread_example/README.md:
--------------------------------------------------------------------------------
 1 | # Demo of how to do threading in WASM
 2 | 
 3 | The WebAssembly threading story is still a bit messy.
 4 | This folder contains a minimal working example of
 5 | two threads sharing the task of calculating the square of an input.
 6 | It uses modern techniques and should be useful going forward.
 7 | 
 8 | ## Concept
 9 | 
10 | Each thread will be given every other element. The WASM function
11 | `square` is defined to take an offset (into the input and output).
12 | We will call `square` twice with two different offsets and that will
13 | yield the full result.
14 | 
15 | Specifically, `square(4, 8)` means: start at byte offset 4 and square every element at a stride of 8 bytes (i.e. every other 4-byte float).
16 | It should hopefully be clear how this can be run twice in parallel
17 | with different first arguments (0 and 4) to get the full result.
18 | 
19 | ## Generate `square.wasm`
20 | 
21 | ```
22 | g++ thread.cc -I../ -o thread
23 | ./thread
24 | ```
25 | 
26 | ## Then run the server with proper cross origin isolation
27 | 
28 | `server.py` is a thin wrapper around Python's `http.server` that
29 | adds the headers required for cross-origin isolation (COOP + COEP).
30 | Read more here: 
31 | [MDN](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/SharedArrayBuffer#security_requirements)
32 | 
33 | ```
34 | python3 -m server
35 | ```
36 | 
37 | ## Now open the browser and look in the console
38 | 
39 | Navigate to `localhost:8000` and you should see the output of each thread (both around 0.09).
40 | We are looking at the result of `main.mjs` and `worker.js` interacting.
41 | 


--------------------------------------------------------------------------------
/thread_example/index.html:
--------------------------------------------------------------------------------
1 | <script src="main.mjs"></script>
2 | <pre id="output"></pre>
3 | 


--------------------------------------------------------------------------------
/thread_example/main.mjs:
--------------------------------------------------------------------------------
 1 | async function launch_threads() {
 2 |   const response = await fetch('./square.wasm');
 3 |   const wasm = await response.arrayBuffer();
 4 |   const wasm_module = await WebAssembly.compile(wasm);
 5 |   const len = 1024;
 6 |   const pages = (len * 2 * 4) / (1 << 16) + 1;
 7 |   let memory = new WebAssembly.Memory({
 8 |     initial: pages,
 9 |     maximum: pages + 1,
10 |     shared: true
11 |   });
12 |   const wasm_instance = await WebAssembly.instantiate(wasm_module, {
13 |     env: {
14 |       memory: memory
15 |     }
16 |   });
17 | 
18 |   const input = new Float32Array(memory.buffer, 0, 1024);
19 |   const output = new Float32Array(memory.buffer, 1024 * 4, 1024);
20 |   input[0] = 0.3;
21 |   input[1] = 0.3;
22 |   input[2] = 0.3;
23 |   const worker0 = new Worker('./worker.js');
24 |   const worker1 = new Worker('./worker.js');
25 |   let w0_done = false;
26 |   let w1_done = false;
27 |   worker0.addEventListener('message', function(e) {
28 |     w0_done = true;
29 |     if (w1_done) {
30 |       document.getElementById('output').textContent = output[0] + ", " + output[1];
31 |       console.log(output[0], output[1]);
32 |     }
33 |   });
34 |   worker1.addEventListener('message', function(e) {
35 |     w1_done = true;
36 |     if (w0_done) {
37 |       document.getElementById('output').textContent = output[0] + ", " + output[1];
38 |       console.log(output[0], output[1]);
39 |     }
40 |   });
41 |   worker0.postMessage([wasm_module, memory, 0, 8]);
42 |   worker1.postMessage([wasm_module, memory, 4, 8]);
43 | }
44 | 
45 | launch_threads();
46 | 


--------------------------------------------------------------------------------
/thread_example/server.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | from http.server import HTTPServer, SimpleHTTPRequestHandler, test
 3 | import sys
 4 | 
 5 | class CORSRequestHandler (SimpleHTTPRequestHandler):
 6 |     def end_headers (self):
 7 |         self.send_header('Cross-Origin-Opener-Policy', 'same-origin')
 8 |         self.send_header('Cross-Origin-Embedder-Policy', 'require-corp')
 9 |         SimpleHTTPRequestHandler.end_headers(self)
10 | 
if __name__ == '__main__':
    # The optional first command-line argument selects the port (default 8000).
    port = int(sys.argv[1]) if len(sys.argv) > 1 else 8000
    test(CORSRequestHandler, HTTPServer, port=port)
13 | 


--------------------------------------------------------------------------------
/thread_example/thread.cc:
--------------------------------------------------------------------------------
 1 | #include "wasmblr.h"
 2 | #include <fstream>
 3 | 
 4 | int main() {
 5 | 
 6 |   int64_t len = 1024;
 7 | 
 8 |   wasmblr::CodeGenerator cg;
 9 | 
10 |   // shared() makes threading possible
11 |   auto pages = (len * 2 * 4) / (1 << 16) + 1;
12 |   cg.memory(pages, pages + 1).import_("env", "memory").shared();
13 | 
14 |   auto square = cg.function({cg.i32, cg.i32}, {}, [&]() {
15 |     auto iter = cg.local(cg.i32);
16 |     //cg.i32.const_(0);
17 |     cg.local.get(0);
18 |     cg.local.set(iter);
19 | 
20 |     cg.loop(cg.void_);
21 |     {
22 |       cg.local.get(iter);
23 | 
24 |       cg.local.get(iter);
25 |       cg.f32.load(0, 0);
26 | 
27 |       cg.local.get(iter);
28 |       cg.f32.load(0, 0);
29 | 
30 |       cg.f32.mul();
31 |       cg.f32.store(0, len * 4);
32 | 
33 |       //cg.i32.const_(4);
34 |       cg.local.get(1);
35 |       cg.local.get(iter);
36 |       cg.i32.add();
37 |       cg.local.set(iter);
38 | 
39 |       cg.i32.const_(len * 4);
40 |       cg.local.get(iter);
41 |       cg.i32.ge_u();
42 |       cg.br_if(0);
43 |     }
44 |     cg.end();
45 |     
46 |   });
47 | 
48 |   cg.export_(square, "square");
49 | 
50 |   // write to a loadable binary
51 |   std::ofstream wasm("square.wasm", std::ios::binary);
52 |   auto bytes = cg.emit();
53 |   wasm.write((char*)bytes.data(), bytes.size());
54 | 
55 | }
56 | 


--------------------------------------------------------------------------------
/thread_example/worker.js:
--------------------------------------------------------------------------------
 1 | self.addEventListener('message', function(e) {
 2 |   const wasm_module = e.data[0];
 3 |   const memory = e.data[1];
 4 |   WebAssembly.instantiate(wasm_module, {
 5 |     env: {
 6 |       memory: memory
 7 |     }
 8 |   }).then((wasm_instance) => {
 9 |     const square = wasm_instance.exports.square;
10 |     const off = e.data[2];
11 |     const stride = e.data[3];
12 |     square(off, stride);
13 |     self.postMessage("done");
14 |   });
15 | });


--------------------------------------------------------------------------------
/wasmblr.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
#include <array>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <functional>
#include <stack>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
 11 | 
 12 | namespace wasmblr {
 13 | 
 14 | constexpr std::array<uint8_t, 4> magic_module_header = {0x00, 0x61, 0x73, 0x6d};
 15 | constexpr std::array<uint8_t, 4> module_version = {0x01, 0x00, 0x00, 0x00};
 16 | 
 17 | struct CodeGenerator;
 18 | 
 19 | class Local {
 20 |  public:
 21 |   int operator()(uint8_t type);
 22 |   void get(int idx);
 23 |   void set(int idx);
 24 |   void tee(int idx);
 25 | 
 26 |  private:
 27 |   Local(CodeGenerator& cg_) : cg(cg_) {}
 28 |   CodeGenerator& cg;
 29 |   friend CodeGenerator;
 30 | };
 31 | 
 32 | class I32 {
 33 |  public:
 34 |   operator uint8_t();
 35 |   void const_(int32_t i);
 36 |   void clz();
 37 |   void ctz();
 38 |   void popcnt();
 39 |   void lt_s();
 40 |   void lt_u();
 41 |   void gt_s();
 42 |   void gt_u();
 43 |   void le_s();
 44 |   void le_u();
 45 |   void ge_s();
 46 |   void ge_u();
 47 |   void add();
 48 |   void sub();
 49 |   void mul();
 50 |   void div_s();
 51 |   void div_u();
 52 |   void rem_s();
 53 |   void rem_u();
 54 |   void and_();
 55 |   void or_();
 56 |   void xor_();
 57 |   void shl();
 58 |   void shr_s();
 59 |   void shr_u();
 60 |   void rotl();
 61 |   void rotr();
 62 |   void eqz();
 63 |   void eq();
 64 |   void ne();
 65 | 
 66 |   void load(uint32_t alignment = 1, uint32_t offset = 0);
 67 |   void store(uint32_t alignment = 1, uint32_t offset = 0);
 68 | 
 69 |   void load8_s(uint32_t alignment = 1, uint32_t offset = 0);
 70 |   void load8_u(uint32_t alignment = 1, uint32_t offset = 0);
 71 |   void load16_s(uint32_t alignment = 1, uint32_t offset = 0);
 72 |   void load16_u(uint32_t alignment = 1, uint32_t offset = 0);
 73 |   void store8(uint32_t alignment = 1, uint32_t offset = 0);
 74 |   void store16(uint32_t alignment = 1, uint32_t offset = 0);
 75 | 
 76 |  private:
 77 |   I32(CodeGenerator& cg_) : cg(cg_) {}
 78 |   CodeGenerator& cg;
 79 |   friend CodeGenerator;
 80 | };
 81 | 
 82 | class F32 {
 83 |  public:
 84 |   operator uint8_t();
 85 |   void const_(float f);
 86 |   void eq();
 87 |   void ne();
 88 |   void lt();
 89 |   void gt();
 90 |   void le();
 91 |   void ge();
 92 |   void abs();
 93 |   void neg();
 94 |   void ceil();
 95 |   void floor();
 96 |   void trunc();
 97 |   void nearest();
 98 |   void sqrt();
 99 |   void add();
100 |   void sub();
101 |   void mul();
102 |   void div();
103 |   void min();
104 |   void max();
105 |   void copysign();
106 | 
107 |   void load(uint32_t alignment = 1, uint32_t offset = 0);
108 |   void store(uint32_t alignment = 1, uint32_t offset = 0);
109 | 
110 |  private:
111 |   F32(CodeGenerator& cg_) : cg(cg_) {}
112 |   CodeGenerator& cg;
113 |   friend CodeGenerator;
114 | };
115 | 
116 | class V128 {
117 |  public:
118 |   operator uint8_t();
119 | 
120 |   void i32x4_extract_lane(uint8_t lane);
121 |   void i32x4_replace_lane(uint8_t lane);
122 |   void f32x4_extract_lane(uint8_t lane);
123 |   void f32x4_replace_lane(uint8_t lane);
124 |   void i32x4_splat();
125 |   void f32x4_splat();
126 |   void i32x4_eq();
127 |   void i32x4_ne();
128 |   void i32x4_lt_s();
129 |   void i32x4_lt_u();
130 |   void i32x4_gt_s();
131 |   void i32x4_gt_u();
132 |   void i32x4_le_s();
133 |   void i32x4_le_u();
134 |   void i32x4_ge_s();
135 |   void i32x4_ge_u();
136 |   void f32x4_eq();
137 |   void f32x4_ne();
138 |   void f32x4_lt();
139 |   void f32x4_gt();
140 |   void f32x4_le();
141 |   void f32x4_ge();
142 |   void not_();
143 |   void any_true();
144 |   void and_();
145 |   void andnot();
146 |   void or_();
147 |   void xor_();
148 |   void i32x4_abs();
149 |   void i32x4_neg();
150 |   void i32x4_all_true();
151 |   void i32x4_bitmask();
152 |   void i32x4_shl();
153 |   void i32x4_shr_s();
154 |   void i32x4_shr_u();
155 |   void i32x4_add();
156 |   void i32x4_sub();
157 |   void i32x4_mul();
158 |   void i32x4_min_s();
159 |   void i32x4_min_u();
160 |   void i32x4_max_s();
161 |   void i32x4_max_u();
162 |   void f32x4_ceil();
163 |   void f32x4_floor();
164 |   void f32x4_trunc();
165 |   void f32x4_nearest();
166 |   void f32x4_abs();
167 |   void f32x4_neg();
168 |   void f32x4_sqrt();
169 |   void f32x4_add();
170 |   void f32x4_sub();
171 |   void f32x4_mul();
172 |   void f32x4_div();
173 |   void f32x4_min();
174 |   void f32x4_max();
175 |   void f32x4_pmin();
176 |   void f32x4_pmax();
177 | 
178 |   void load(uint32_t alignment = 1, uint32_t offset = 0);
179 |   void load32x2_s(uint32_t alignment = 1, uint32_t offset = 0);
180 |   void load32x2_u(uint32_t alignment = 1, uint32_t offset = 0);
181 |   void load32_splat(uint32_t alignment = 1, uint32_t offset = 0);
182 |   void load32_zero(uint32_t alignment = 1, uint32_t offset = 0);
183 |   void store(uint32_t alignment = 1, uint32_t offset = 0);
184 | 
185 |  private:
186 |   V128(CodeGenerator& cg_) : cg(cg_) {}
187 |   CodeGenerator& cg;
188 |   friend CodeGenerator;
189 | };
190 | 
191 | class Memory {
192 |  public:
193 |   Memory& operator()(uint32_t min);
194 |   Memory& operator()(uint32_t min, uint32_t max);
195 |   Memory& export_(std::string);
196 |   Memory& shared(bool = true);
197 |   Memory& import_(std::string, std::string);
198 |   void size();
199 |   void grow();
200 | 
201 |  private:
202 |   Memory(CodeGenerator& cg_) : cg(cg_) {}
203 |   CodeGenerator& cg;
204 |   uint32_t min = 0;
205 |   uint32_t max = 0;
206 |   bool is_shared = false;
207 |   std::string a_string = "";
208 |   std::string b_string = "";
209 |   bool is_import() const { return a_string.size() && b_string.size(); }
210 |   bool is_export() const { return a_string.size() && !b_string.size(); }
211 |   friend CodeGenerator;
212 | };
213 | 
// One function of the module: its signature, the callback that emits its
// body, and the locals that callback declares while it runs.
struct Function {
  // Signature only; `body` stays empty, so emit() must not be called on an
  // object built this way until a body has been assigned.
  Function(std::vector<uint8_t> input_types_,
           std::vector<uint8_t> output_types_)
      : input_types(std::move(input_types_)),
        output_types(std::move(output_types_)) {}
  // Signature plus body. The by-value arguments are moved into the members
  // rather than copied a second time.
  Function(std::vector<uint8_t> input_types_,
           std::vector<uint8_t> output_types_,
           std::function<void()> body_)
      : input_types(std::move(input_types_)),
        output_types(std::move(output_types_)),
        body(std::move(body_)) {}
  std::vector<uint8_t> input_types;
  std::vector<uint8_t> output_types;
  std::function<void()> body;
  std::vector<uint8_t> locals;  // resolved later
  // Runs the body callback from a clean slate: locals recorded by a previous
  // run are discarded first.
  void emit() {
    locals.clear();
    body();
  }
};
231 | 
232 | struct CodeGenerator {
233 |   // API
234 |   Local local;
235 |   I32 i32;
236 |   F32 f32;
237 |   V128 v128;
238 |   Memory memory;
239 |   uint8_t void_ = 0x40;
240 | 
241 |   void nop();
242 |   void block(uint8_t type);
243 |   void loop(uint8_t type);
244 |   void if_(uint8_t type);
245 |   void else_();
246 |   void br(uint32_t labelidx);
247 |   void br_if(uint32_t labelidx);
248 |   void end();
249 |   void call(uint32_t funcidx);
250 | 
251 |   void export_(uint32_t fn_idx, std::string name);
252 | 
253 |   // returns function index
254 |   uint32_t function(std::vector<uint8_t> input_types,
255 |                     std::vector<uint8_t> output_types,
256 |                     std::function<void()> body);
257 | 
258 |   std::vector<uint8_t> emit();
259 | 
260 |   // Implementation
261 | 
262 |   CodeGenerator()
263 |       : local(*this), i32(*this), f32(*this), v128(*this), memory(*this) {}
264 |   CodeGenerator(const CodeGenerator&) = delete;
265 |   CodeGenerator(CodeGenerator&&) = delete;
266 | 
267 |   std::vector<Function> functions_;
268 |   std::unordered_map<uint32_t, std::string> exported_functions_;
269 |   Function* cur_function_ = nullptr;
270 |   // cur_bytes_ is used as a temporary storage
271 |   std::vector<uint8_t> cur_bytes_;
272 |   // a running type checker, purely for safety
273 |   std::stack<uint8_t> type_stack_;
274 | 
275 |   using memarg = std::pair<uint32_t, uint32_t>;
276 | 
277 |   // From LLVM
278 |   std::vector<uint8_t> encode_signed(int32_t n) {
279 |     std::vector<uint8_t> out;
280 |     auto more = true;
281 |     do {
282 |       uint8_t byte = n & 0x7f;
283 |       n >>= 7;
284 |       more = !((((n == 0) && ((byte & 0x40) == 0)) ||
285 |                 ((n == -1) && ((byte & 0x40) != 0))));
286 |       if (more) {
287 |         byte |= 0x80;
288 |       }
289 |       out.emplace_back(byte);
290 |     } while (more);
291 |     return out;
292 |   }
293 | 
294 |   std::vector<uint8_t> encode_unsigned(uint32_t n) {
295 |     std::vector<uint8_t> out;
296 |     do {
297 |       uint8_t byte = n & 0x7f;
298 |       n >>= 7;
299 |       if (n != 0) {
300 |         byte |= 0x80;
301 |       }
302 |       out.emplace_back(byte);
303 |     } while (n != 0);
304 |     return out;
305 |   }
306 | 
307 |   std::vector<uint8_t> encode_string(std::string s) {
308 |     std::vector<uint8_t> out;
309 |     out.emplace_back(s.size());
310 |     for (const auto& c : s) {
311 |       out.emplace_back(c);
312 |     }
313 |     return out;
314 |   }
315 | 
316 |   template <typename T>
317 |   void concat(std::vector<uint8_t>& out, const T& inp) {
318 |     out.insert(out.end(), inp.begin(), inp.end());
319 |   };
320 | 
321 |   int declare_local(uint8_t type) {
322 |     assert(cur_function_);
323 |     int idx = cur_function_->locals.size() + cur_function_->input_types.size();
324 |     cur_function_->locals.emplace_back(type);
325 |     return idx;
326 |   }
327 | 
328 |   const std::vector<uint8_t>& input_types() {
329 |     assert(cur_function_);
330 |     return cur_function_->input_types;
331 |   }
332 | 
333 |   const std::vector<uint8_t>& locals() {
334 |     assert(cur_function_);
335 |     return cur_function_->locals;
336 |   }
337 | 
338 |   void push(uint8_t type) { type_stack_.push(type); };
339 | 
340 |   uint8_t pop() {
341 |     assert(type_stack_.size() && "popping empty stack");
342 |     auto type = type_stack_.top();
343 |     type_stack_.pop();
344 |     return type;
345 |   };
346 | 
347 |   void emit(uint8_t byte) { cur_bytes_.emplace_back(byte); }
348 |   void emit(std::vector<uint8_t> bytes) { concat(cur_bytes_, bytes); }
349 |   void emit(const memarg& m) {
350 |     emit(encode_unsigned(std::get<0>(m)));
351 |     emit(encode_unsigned(std::get<1>(m)));
352 |   }
353 | };
354 | 
355 | inline int Local::operator()(uint8_t type) {
356 |   return cg.declare_local(type);
357 | };
358 | 
359 | inline void Local::set(int idx) {
360 |   auto t = cg.pop();
361 |   const auto& input_types = cg.input_types();
362 |   auto expected_type = [&]() {
363 |     if (idx < input_types.size()) {
364 |       return input_types.at(idx);
365 |     }
366 |     return cg.locals().at(idx - input_types.size());
367 |   }();
368 |   assert(expected_type == t && "can't set local to this value (wrong type)");
369 | 
370 |   cg.emit(0x21);
371 |   cg.emit(cg.encode_unsigned(idx));
372 | }
373 | 
374 | inline void Local::get(int idx) {
375 |   const auto& input_types = cg.input_types();
376 |   if (idx < input_types.size()) {
377 |     cg.push(input_types.at(idx));
378 |   } else {
379 |     cg.push(cg.locals().at(idx - input_types.size()));
380 |   }
381 | 
382 |   cg.emit(0x20);
383 |   cg.emit(cg.encode_unsigned(idx));
384 | }
385 | 
386 | inline void Local::tee(int idx) {
387 |   auto t = cg.pop();
388 |   const auto& input_types = cg.input_types();
389 |   auto expected_type = [&]() {
390 |     if (idx < input_types.size()) {
391 |       return input_types.at(idx);
392 |     }
393 |     return cg.locals().at(idx - input_types.size());
394 |   }();
395 |   assert(expected_type == t && "can't set local to this value (wrong type)");
396 | 
397 |   cg.emit(0x22);
398 |   cg.emit(cg.encode_unsigned(idx));
399 |   cg.push(expected_type);
400 | }
401 | 
402 | inline I32::operator uint8_t() {
403 |   return 0x7f;
404 | }
405 | 
406 | inline void I32::const_(int32_t i) {
407 |   cg.emit(0x41);
408 |   cg.emit(cg.encode_signed(i));
409 |   cg.push(cg.i32);
410 | }
411 | 
412 | inline F32::operator uint8_t() {
413 |   return 0x7d;
414 | }
415 | 
416 | inline void F32::const_(float f) {
417 |   cg.emit(0x43);
418 |   uint8_t r[4];
419 |   memcpy(&r, &f, sizeof(float));
420 |   for (auto i = 0; i < 4; ++i) {
421 |     cg.emit(r[i]);
422 |   }
423 |   cg.push(cg.f32);
424 | }
425 | 
426 | inline V128::operator uint8_t() {
427 |   return 0x7b;
428 | }
429 | 
430 | #define UNARY_OP(classname, op, opcode, in_type, out_type) \
431 |   inline void classname::op() {                            \
432 |     bool valid = cg.pop() == cg.in_type;                   \
433 |     assert(valid && "invalid type for " #op);              \
434 |     cg.emit(opcode);                                       \
435 |     cg.push(cg.out_type);                                  \
436 |   }
437 | 
// Defines classname::op() for a two-operand instruction: pops two operands,
// checks the top one against type_a and the one beneath it against type_b,
// emits `opcode` and pushes out_type.
// Both operands are popped before the check so the type stack stays in sync
// even when the first comparison fails and assertions are compiled out.
#define BINARY_OP(classname, op, opcode, type_a, type_b, out_type) \
  inline void classname::op() {                                    \
    auto top = cg.pop();                                           \
    auto below = cg.pop();                                         \
    bool valid = top == cg.type_a && below == cg.type_b;           \
    assert(valid && "invalid type for " #op);                      \
    cg.emit(opcode);                                               \
    cg.push(cg.out_type);                                          \
  }
445 | 
446 | #define LOAD_OP(classname, op, opcode, out_type)                   \
447 |   inline void classname::op(uint32_t alignment, uint32_t offset) { \
448 |     auto idx_type = cg.pop();                                      \
449 |     assert(idx_type == cg.i32);                                    \
450 |     cg.emit(opcode);                                               \
451 |     cg.emit(cg.encode_unsigned(alignment));                        \
452 |     cg.emit(cg.encode_unsigned(offset));                           \
453 |     cg.push(cg.out_type);                                          \
454 |   }
455 | 
456 | #define STORE_OP(classname, op, opcode)                            \
457 |   inline void classname::op(uint32_t alignment, uint32_t offset) { \
458 |     auto val_type = cg.pop();                                      \
459 |     auto idx_type = cg.pop();                                      \
460 |     assert(idx_type == cg.i32);                                    \
461 |     cg.emit(opcode);                                               \
462 |     cg.emit(cg.encode_unsigned(alignment));                        \
463 |     cg.emit(cg.encode_unsigned(offset));                           \
464 |   }
465 | 
466 | UNARY_OP(I32, clz, 0x67, i32, i32);
467 | UNARY_OP(I32, ctz, 0x68, i32, i32);
468 | UNARY_OP(I32, popcnt, 0x69, i32, i32);
469 | BINARY_OP(I32, lt_s, 0x48, i32, i32, i32);
470 | BINARY_OP(I32, lt_u, 0x49, i32, i32, i32);
471 | BINARY_OP(I32, gt_s, 0x4a, i32, i32, i32);
472 | BINARY_OP(I32, gt_u, 0x4b, i32, i32, i32);
473 | BINARY_OP(I32, le_s, 0x4c, i32, i32, i32);
474 | BINARY_OP(I32, le_u, 0x4d, i32, i32, i32);
475 | BINARY_OP(I32, ge_s, 0x4e, i32, i32, i32);
476 | BINARY_OP(I32, ge_u, 0x4f, i32, i32, i32);
477 | BINARY_OP(I32, add, 0x6a, i32, i32, i32);
478 | BINARY_OP(I32, sub, 0x6b, i32, i32, i32);
479 | BINARY_OP(I32, mul, 0x6c, i32, i32, i32);
480 | BINARY_OP(I32, div_s, 0x6d, i32, i32, i32);
481 | BINARY_OP(I32, div_u, 0x6e, i32, i32, i32);
482 | BINARY_OP(I32, rem_s, 0x6f, i32, i32, i32);
483 | BINARY_OP(I32, rem_u, 0x70, i32, i32, i32);
484 | BINARY_OP(I32, and_, 0x71, i32, i32, i32);
485 | BINARY_OP(I32, or_, 0x72, i32, i32, i32);
486 | BINARY_OP(I32, xor_, 0x73, i32, i32, i32);
487 | BINARY_OP(I32, shl, 0x74, i32, i32, i32);
488 | BINARY_OP(I32, shr_s, 0x75, i32, i32, i32);
489 | BINARY_OP(I32, shr_u, 0x76, i32, i32, i32);
490 | BINARY_OP(I32, rotl, 0x77, i32, i32, i32);
491 | BINARY_OP(I32, rotr, 0x78, i32, i32, i32);
492 | BINARY_OP(I32, eqz, 0x45, i32, i32, i32);
493 | BINARY_OP(I32, eq, 0x46, i32, i32, i32);
494 | BINARY_OP(I32, ne, 0x47, i32, i32, i32);
495 | LOAD_OP(I32, load, 0x28, i32);
496 | LOAD_OP(I32, load8_s, 0x2c, i32);
497 | LOAD_OP(I32, load8_u, 0x2d, i32);
498 | LOAD_OP(I32, load16_s, 0x2e, i32);
499 | LOAD_OP(I32, load16_u, 0x2f, i32);
500 | STORE_OP(I32, store, 0x36);
501 | STORE_OP(I32, store8, 0x3a);
502 | STORE_OP(I32, store16, 0x3b);
503 | 
504 | BINARY_OP(F32, eq, 0x5b, f32, f32, i32);
505 | BINARY_OP(F32, ne, 0x5c, f32, f32, i32);
506 | BINARY_OP(F32, lt, 0x5d, f32, f32, i32);
507 | BINARY_OP(F32, gt, 0x5e, f32, f32, i32);
508 | BINARY_OP(F32, le, 0x5f, f32, f32, i32);
509 | BINARY_OP(F32, ge, 0x60, f32, f32, i32);
510 | UNARY_OP(F32, abs, 0x8B, f32, f32);
511 | UNARY_OP(F32, neg, 0x8C, f32, f32);
512 | UNARY_OP(F32, ceil, 0x8D, f32, f32);
513 | UNARY_OP(F32, floor, 0x8E, f32, f32);
514 | UNARY_OP(F32, trunc, 0x8F, f32, f32);
515 | UNARY_OP(F32, nearest, 0x90, f32, f32);
516 | UNARY_OP(F32, sqrt, 0x91, f32, f32);
517 | BINARY_OP(F32, add, 0x92, f32, f32, f32);
518 | BINARY_OP(F32, sub, 0x93, f32, f32, f32);
519 | BINARY_OP(F32, mul, 0x94, f32, f32, f32);
520 | BINARY_OP(F32, div, 0x95, f32, f32, f32);
521 | BINARY_OP(F32, min, 0x96, f32, f32, f32);
522 | BINARY_OP(F32, max, 0x97, f32, f32, f32);
523 | BINARY_OP(F32, copysign, 0x98, f32, f32, f32);
524 | LOAD_OP(F32, load, 0x2a, f32);
525 | STORE_OP(F32, store, 0x38);
526 | 
527 | #undef UNARY_OP
528 | #undef BINARY_OP
529 | #undef LOAD_OP
530 | #undef STORE_OP
531 | 
532 | #define VECTOR_LOAD(op, vopcode)                              \
533 |   inline void V128::op(uint32_t alignment, uint32_t offset) { \
534 |     auto idx_type = cg.pop();                                 \
535 |     assert(idx_type == cg.i32);                               \
536 |     cg.emit(0xfd);                                            \
537 |     cg.emit(cg.encode_unsigned(vopcode));                     \
538 |     cg.emit(cg.encode_unsigned(alignment));                   \
539 |     cg.emit(cg.encode_unsigned(offset));                      \
540 |     cg.push(cg.v128);                                         \
541 |   }
542 | 
543 | VECTOR_LOAD(load, 0);
544 | VECTOR_LOAD(load32x2_s, 5);
545 | VECTOR_LOAD(load32x2_u, 6);
546 | VECTOR_LOAD(load32_splat, 9);
547 | VECTOR_LOAD(load32_zero, 92);
548 | 
549 | inline void V128::store(uint32_t alignment, uint32_t offset) {
550 |   auto val_type = cg.pop();
551 |   assert(val_type == cg.v128);
552 |   auto idx_type = cg.pop();
553 |   assert(idx_type == cg.i32);
554 |   cg.emit(0xfd);
555 |   cg.emit(cg.encode_unsigned(11));
556 |   cg.emit(cg.encode_unsigned(alignment));
557 |   cg.emit(cg.encode_unsigned(offset));
558 | }
559 | 
560 | inline void V128::i32x4_extract_lane(uint8_t lane) {
561 |   auto val_type = cg.pop();
562 |   assert(val_type == cg.v128);
563 |   cg.emit(0xfd);
564 |   cg.emit(cg.encode_unsigned(27));
565 |   cg.emit(lane);
566 |   cg.push(cg.i32);
567 | }
568 | 
569 | inline void V128::f32x4_extract_lane(uint8_t lane) {
570 |   auto val_type = cg.pop();
571 |   assert(val_type == cg.v128);
572 |   cg.emit(0xfd);
573 |   cg.emit(cg.encode_unsigned(31));
574 |   cg.emit(lane);
575 |   cg.push(cg.f32);
576 | }
577 | 
578 | inline void V128::i32x4_replace_lane(uint8_t lane) {
579 |   auto val_type = cg.pop();
580 |   assert(val_type == cg.i32);
581 |   auto vec_type = cg.pop();
582 |   assert(vec_type == cg.v128);
583 |   cg.emit(0xfd);
584 |   cg.emit(cg.encode_unsigned(28));
585 |   cg.emit(lane);
586 |   cg.push(cg.v128);
587 | }
588 | 
589 | inline void V128::f32x4_replace_lane(uint8_t lane) {
590 |   auto val_type = cg.pop();
591 |   assert(val_type == cg.f32);
592 |   auto vec_type = cg.pop();
593 |   assert(vec_type == cg.v128);
594 |   cg.emit(0xfd);
595 |   cg.emit(cg.encode_unsigned(32));
596 |   cg.emit(lane);
597 |   cg.push(cg.v128);
598 | }
599 | 
600 | inline void V128::i32x4_splat() {
601 |   auto val_type = cg.pop();
602 |   assert(val_type == cg.i32);
603 |   cg.emit(0xfd);
604 |   cg.emit(cg.encode_unsigned(17));
605 |   cg.push(cg.v128);
606 | }
607 | 
608 | inline void V128::f32x4_splat() {
609 |   auto val_type = cg.pop();
610 |   assert(val_type == cg.f32);
611 |   cg.emit(0xfd);
612 |   cg.emit(cg.encode_unsigned(19));
613 |   cg.push(cg.v128);
614 | }
615 | 
616 | #define VECTOR_BINARY_OP(op, vopcode, a_type, b_type, out_type) \
617 |   inline void V128::op() {                                      \
618 |     auto b = cg.pop();                                          \
619 |     assert(cg.b_type == b);                                     \
620 |     auto a = cg.pop();                                          \
621 |     assert(cg.a_type == a);                                     \
622 |     cg.emit(0xfd);                                              \
623 |     cg.emit(cg.encode_unsigned(vopcode));                       \
624 |     cg.push(cg.out_type);                                       \
625 |   }
626 | 
627 | #define VECTOR_UNARY_OP(op, vopcode, inp_type, out_type) \
628 |   inline void V128::op() {                               \
629 |     auto inp = cg.pop();                                 \
630 |     assert(cg.inp_type == inp);                          \
631 |     cg.emit(0xfd);                                       \
632 |     cg.emit(cg.encode_unsigned(vopcode));                \
633 |     cg.push(cg.out_type);                                \
634 |   }
635 | 
636 | VECTOR_BINARY_OP(i32x4_eq, 55, v128, v128, v128);
637 | VECTOR_BINARY_OP(i32x4_ne, 56, v128, v128, v128);
638 | VECTOR_BINARY_OP(i32x4_lt_s, 57, v128, v128, v128);
639 | VECTOR_BINARY_OP(i32x4_lt_u, 58, v128, v128, v128);
640 | VECTOR_BINARY_OP(i32x4_gt_s, 59, v128, v128, v128);
641 | VECTOR_BINARY_OP(i32x4_gt_u, 60, v128, v128, v128);
642 | VECTOR_BINARY_OP(i32x4_le_s, 61, v128, v128, v128);
643 | VECTOR_BINARY_OP(i32x4_le_u, 62, v128, v128, v128);
644 | VECTOR_BINARY_OP(i32x4_ge_s, 63, v128, v128, v128);
645 | VECTOR_BINARY_OP(i32x4_ge_u, 64, v128, v128, v128);
646 | 
647 | VECTOR_BINARY_OP(f32x4_eq, 65, v128, v128, v128);
648 | VECTOR_BINARY_OP(f32x4_ne, 66, v128, v128, v128);
649 | VECTOR_BINARY_OP(f32x4_lt, 67, v128, v128, v128);
650 | VECTOR_BINARY_OP(f32x4_gt, 68, v128, v128, v128);
651 | VECTOR_BINARY_OP(f32x4_le, 69, v128, v128, v128);
652 | VECTOR_BINARY_OP(f32x4_ge, 70, v128, v128, v128);
653 | 
654 | VECTOR_UNARY_OP(not_, 77, v128, v128);
655 | VECTOR_UNARY_OP(any_true, 83, v128, i32);
656 | 
657 | // TODO
658 | // VECTOR_TERNARY_OP(bitselect, 82, v128);
659 | 
660 | VECTOR_BINARY_OP(and_, 78, v128, v128, v128);
661 | VECTOR_BINARY_OP(andnot, 79, v128, v128, v128);
662 | VECTOR_BINARY_OP(or_, 80, v128, v128, v128);
663 | VECTOR_BINARY_OP(xor_, 81, v128, v128, v128);
664 | 
665 | VECTOR_UNARY_OP(i32x4_abs, 160, v128, v128);
666 | VECTOR_UNARY_OP(i32x4_neg, 161, v128, v128);
667 | VECTOR_UNARY_OP(i32x4_all_true, 163, v128, i32);
668 | VECTOR_UNARY_OP(i32x4_bitmask, 164, v128, i32);
669 | VECTOR_BINARY_OP(i32x4_shl, 171, v128, i32, v128);
670 | VECTOR_BINARY_OP(i32x4_shr_s, 172, v128, i32, v128);
671 | VECTOR_BINARY_OP(i32x4_shr_u, 173, v128, i32, v128);
672 | VECTOR_BINARY_OP(i32x4_add, 174, v128, v128, v128);
673 | VECTOR_BINARY_OP(i32x4_sub, 177, v128, v128, v128);
674 | VECTOR_BINARY_OP(i32x4_mul, 181, v128, v128, v128);
675 | VECTOR_BINARY_OP(i32x4_min_s, 182, v128, v128, v128);
676 | VECTOR_BINARY_OP(i32x4_min_u, 183, v128, v128, v128);
677 | VECTOR_BINARY_OP(i32x4_max_s, 184, v128, v128, v128);
678 | VECTOR_BINARY_OP(i32x4_max_u, 185, v128, v128, v128);
679 | 
680 | VECTOR_UNARY_OP(f32x4_ceil, 103, v128, v128);
681 | VECTOR_UNARY_OP(f32x4_floor, 104, v128, v128);
682 | VECTOR_UNARY_OP(f32x4_trunc, 105, v128, v128);
683 | VECTOR_UNARY_OP(f32x4_nearest, 106, v128, v128);
684 | VECTOR_UNARY_OP(f32x4_abs, 224, v128, v128);
685 | VECTOR_UNARY_OP(f32x4_neg, 225, v128, v128);
686 | VECTOR_UNARY_OP(f32x4_sqrt, 227, v128, v128);
687 | VECTOR_BINARY_OP(f32x4_add, 228, v128, v128, v128);
688 | VECTOR_BINARY_OP(f32x4_sub, 229, v128, v128, v128);
689 | VECTOR_BINARY_OP(f32x4_mul, 230, v128, v128, v128);
690 | VECTOR_BINARY_OP(f32x4_div, 231, v128, v128, v128);
691 | VECTOR_BINARY_OP(f32x4_min, 232, v128, v128, v128);
692 | VECTOR_BINARY_OP(f32x4_max, 233, v128, v128, v128);
693 | VECTOR_BINARY_OP(f32x4_pmin, 234, v128, v128, v128);
694 | VECTOR_BINARY_OP(f32x4_pmax, 235, v128, v128, v128);
695 | 
696 | inline Memory& Memory::operator()(uint32_t min_) {
697 |   assert(min == 0 && max == 0);
698 |   min = min_;
699 |   return *this;
700 | }
701 | 
702 | inline Memory& Memory::operator()(uint32_t min_, uint32_t max_) {
703 |   assert(min == 0 && max == 0);
704 |   min = min_;
705 |   max = max_;
706 |   return *this;
707 | }
708 | 
709 | inline Memory& Memory::export_(std::string a) {
710 |   assert(!(is_import() || is_export()) && "already set");
711 |   a_string = a;
712 |   return *this;
713 | }
714 | 
715 | inline Memory& Memory::shared(bool make_shared) {
716 |   is_shared = make_shared;
717 |   return *this;
718 | }
719 | 
720 | inline Memory& Memory::import_(std::string a, std::string b) {
721 |   assert(!(is_import() || is_export()) && "already set");
722 |   a_string = a;
723 |   b_string = b;
724 |   return *this;
725 | }
726 | 
727 | inline void Memory::size() {
728 |   cg.emit(0x3f);
729 |   cg.emit(0x00);
730 | }
731 | inline void Memory::grow() {
732 |   cg.emit(0x40);
733 |   cg.emit(0x00);
734 | }
735 | 
736 | inline void CodeGenerator::nop() {
737 |   emit(0x01);
738 | }
739 | inline void CodeGenerator::block(uint8_t type) {
740 |   emit(0x02);
741 |   emit(type);
742 | }
743 | inline void CodeGenerator::loop(uint8_t type) {
744 |   emit(0x03);
745 |   emit(type);
746 | }
747 | 
748 | inline void CodeGenerator::if_(uint8_t type) {
749 |   auto t = pop();
750 |   assert(t == i32);
751 |   emit(0x04);
752 |   emit(type);
753 | }
754 | inline void CodeGenerator::else_() {
755 |   emit(0x05);
756 | }
757 | inline void CodeGenerator::br(uint32_t labelidx) {
758 |   emit(0x0c);
759 |   emit(encode_unsigned(labelidx));
760 | }
761 | inline void CodeGenerator::br_if(uint32_t labelidx) {
762 |   auto t = pop();
763 |   assert(t == i32);
764 |   emit(0x0d);
765 |   emit(encode_unsigned(labelidx));
766 | }
767 | inline void CodeGenerator::end() {
768 |   emit(0x0b);
769 | }
770 | inline void CodeGenerator::call(uint32_t fn_idx) {
771 |   assert(fn_idx < functions_.size() && "function index does not exist");
772 |   emit(0x10);
773 |   emit(encode_unsigned(fn_idx));
774 | }
775 | 
776 | inline void CodeGenerator::export_(uint32_t fn, std::string name) {
777 |   exported_functions_[fn] = name;
778 | }
779 | 
780 | inline uint32_t CodeGenerator::function(std::vector<uint8_t> input_types,
781 |                                         std::vector<uint8_t> output_types,
782 |                                         std::function<void()> body) {
783 |   auto idx = functions_.size();
784 |   functions_.emplace_back(input_types, output_types, body);
785 |   return idx;
786 | }
787 | 
788 | inline std::vector<uint8_t> CodeGenerator::emit() {
789 |   cur_bytes_.clear();
790 |   std::vector<uint8_t> emitted_bytes;
791 | 
792 |   concat(emitted_bytes, magic_module_header);
793 |   concat(emitted_bytes, module_version);
794 | 
795 |   std::vector<uint8_t> type_section_bytes;
796 |   concat(type_section_bytes, encode_unsigned(functions_.size()));
797 |   for (const auto& f : functions_) {
798 |     type_section_bytes.emplace_back(0x60);
799 |     concat(type_section_bytes, encode_unsigned(f.input_types.size()));
800 |     for (const auto& t : f.input_types) {
801 |       type_section_bytes.emplace_back(t);
802 |     }
803 |     concat(type_section_bytes, encode_unsigned(f.output_types.size()));
804 |     for (const auto& t : f.output_types) {
805 |       type_section_bytes.emplace_back(t);
806 |     }
807 |   }
808 | 
809 |   emitted_bytes.emplace_back(0x1);
810 |   concat(emitted_bytes, encode_unsigned(type_section_bytes.size()));
811 |   concat(emitted_bytes, type_section_bytes);
812 | 
813 | 	std::vector<uint8_t> import_section_bytes;
814 |   if (memory.is_import()) {
815 | 		concat(import_section_bytes, encode_unsigned(1)); // 1 import
816 |     concat(import_section_bytes, encode_string(memory.a_string));
817 |     concat(import_section_bytes, encode_string(memory.b_string));
818 | 		import_section_bytes.emplace_back(0x2); // memory flag
819 |     if (memory.min && memory.max) {
820 | 			if (memory.is_shared) {
821 | 				import_section_bytes.emplace_back(0x3);
822 | 			} else {
823 |         import_section_bytes.emplace_back(0x01);
824 | 			}
825 |       concat(import_section_bytes, encode_unsigned(memory.min));
826 |       concat(import_section_bytes, encode_unsigned(memory.max));
827 | 		} else {
828 | 			assert(!memory.is_shared && "shared memory must have a max size");
829 |       concat(import_section_bytes, encode_unsigned(memory.min));
830 | 		}
831 |     emitted_bytes.emplace_back(0x2);
832 |     concat(emitted_bytes, encode_unsigned(import_section_bytes.size()));
833 |     concat(emitted_bytes, import_section_bytes);
834 | 	}
835 | 
836 |   std::vector<uint8_t> function_section_bytes;
837 |   concat(function_section_bytes, encode_unsigned(functions_.size()));
838 |   for (auto i = 0; i < functions_.size(); ++i) {
839 |     concat(function_section_bytes, encode_unsigned(i));
840 |   }
841 |   emitted_bytes.emplace_back(0x3);
842 |   concat(emitted_bytes, encode_unsigned(function_section_bytes.size()));
843 |   concat(emitted_bytes, function_section_bytes);
844 | 
845 |   std::vector<uint8_t> memory_section_bytes;
846 |   if (!memory.is_import() && (memory.min || memory.max)) {
847 |     memory_section_bytes.emplace_back(0x01);  // always 1 memory
848 |     if (memory.min && memory.max) {
849 |       if (memory.is_shared) {
850 |         memory_section_bytes.emplace_back(0x03);
851 |       } else {
852 |         memory_section_bytes.emplace_back(0x01);
853 |       }
854 |       concat(memory_section_bytes, encode_unsigned(memory.min));
855 |       concat(memory_section_bytes, encode_unsigned(memory.max));
856 |     } else {
857 | 			assert(!memory.is_shared && "shared memory must have a max size");
858 |       memory_section_bytes.emplace_back(0x00);
859 |       concat(memory_section_bytes, encode_unsigned(memory.min));
860 |     }
861 |     emitted_bytes.emplace_back(0x05);
862 |     concat(emitted_bytes, encode_unsigned(memory_section_bytes.size()));
863 |     concat(emitted_bytes, memory_section_bytes);
864 |   }
865 | 
866 |   std::vector<uint8_t> export_section_bytes;
867 | 
868 |   auto num_exports = exported_functions_.size() + memory.is_export();
869 |   concat(export_section_bytes, encode_unsigned(num_exports));
870 |   if (memory.is_export()) {
871 |     concat(export_section_bytes, encode_string(memory.a_string));
872 |     export_section_bytes.emplace_back(0x02);
873 |     export_section_bytes.emplace_back(0x00);  // always 1 memory at index 0
874 |   }
875 |   for (const auto& p : exported_functions_) {
876 |     concat(export_section_bytes, encode_string(p.second));
877 |     export_section_bytes.emplace_back(0x00);
878 |     concat(export_section_bytes, encode_unsigned(p.first));
879 |   }
880 |   emitted_bytes.emplace_back(0x7);
881 |   concat(emitted_bytes, encode_unsigned(export_section_bytes.size()));
882 |   concat(emitted_bytes, export_section_bytes);
883 | 
884 |   std::vector<uint8_t> code_section_bytes;
885 |   concat(code_section_bytes, encode_unsigned(functions_.size()));
886 |   for (auto& f : functions_) {
887 |     cur_function_ = &f;
888 | 
889 |     cur_bytes_.clear();
890 |     f.emit();
891 |     end();
892 |     std::vector<uint8_t> body_bytes = cur_bytes_;
893 | 
894 |     cur_bytes_.clear();
895 |     concat(cur_bytes_, encode_unsigned(f.locals.size()));
896 |     for (const auto& l : f.locals) {
897 |       emit(0x1);
898 |       emit(l);
899 |     }
900 | 
901 |     std::vector<uint8_t> header_bytes = cur_bytes_;
902 |     auto fn_size = header_bytes.size() + body_bytes.size();
903 | 
904 |     concat(code_section_bytes, encode_unsigned(fn_size));
905 |     concat(code_section_bytes, header_bytes);
906 |     concat(code_section_bytes, body_bytes);
907 |   }
908 |   cur_function_ = nullptr;
909 | 
910 |   emitted_bytes.emplace_back(0xa);
911 |   concat(emitted_bytes, encode_unsigned(code_section_bytes.size()));
912 |   concat(emitted_bytes, code_section_bytes);
913 | 
914 |   return emitted_bytes;
915 | }
916 | 
917 | }  // namespace wasmblr
918 | 


--------------------------------------------------------------------------------