├── LICENSE ├── README.md ├── emscripten_example ├── Makefile ├── README.md ├── add.cc └── benchmark.js ├── flops_example ├── Makefile ├── README.md ├── index.html ├── jit.cc └── main.js ├── matmul_example ├── Makefile ├── README.md ├── index.html ├── main.js ├── mm.cc └── mm.js ├── test.cc ├── thread_example ├── README.md ├── index.html ├── main.mjs ├── server.py ├── thread.cc └── worker.js └── wasmblr.h /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Bram Wasti 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # wasmblr 2 | A single header file WebAssembly assembler. 3 | 4 | This library makes it easier to generate web assembly binaries directly from C++. 
5 | Useful for JIT compilation from within projects compiled with Emscripten. 6 | For examples see below, or read the `test.cc` file. 7 | 8 | Some benchmarks: 9 | - Measure Bandwidth (cache not flushed): https://bwasti.github.io/wasmblr 10 | - Measure Peak Theoretical FLOPs: https://bwasti.github.io/wasmblr/flops 11 | - Measure Matrix Multiplication Performance (GFLOPs): https://bwasti.github.io/wasmblr/matmul/ 12 | 13 | Contributions welcome! 14 | 15 | # Usage 16 | 17 | `#include "wasmblr.h"` and compile with `-std=c++11` or higher. 18 | 19 | In C++: 20 | 21 | ```cpp 22 | 23 | struct Code : wasmblr::CodeGenerator { 24 | Code() : wasmblr::CodeGenerator() { 25 | auto add_func = function({f32, f32}, {f32}, [&]() { 26 | local.get(0); 27 | local.get(1); 28 | f32.add(); 29 | }); 30 | export_(add_func, "add"); 31 | } 32 | }; 33 | 34 | 35 | Code c; 36 | auto bytes = c.emit(); 37 | std::ofstream wasm("add.wasm", std::ios::binary); 38 | wasm.write((char*)bytes.data(), bytes.size()); 39 | ``` 40 | 41 | If you'd prefer to avoid inheritance, you can use the code generator directly: 42 | 43 | ```cpp 44 | wasmblr::CodeGenerator cg; 45 | auto add_func = cg.function({cg.f32, cg.f32}, {cg.f32}, [&]() { 46 | cg.local.get(0); 47 | cg.local.get(1); 48 | cg.f32.add(); 49 | }); 50 | cg.export_(add_func, "add"); 51 | 52 | auto bytes = cg.emit(); 53 | std::ofstream wasm("add.wasm", std::ios::binary); 54 | wasm.write((char*)bytes.data(), bytes.size()); 55 | ``` 56 | 57 | And then, in JavaScript: 58 | 59 | ```javascript 60 | const wasm = fs.readFileSync('add.wasm'); // or however you'd like to load it 61 | const m = new WebAssembly.Module(wasm); 62 | const instance = new WebAssembly.Instance(m, {}); 63 | // use the function 64 | console.log(instance.exports.add(8, 9)); 65 | ``` 66 | 67 | # Test 68 | 69 | With `node.js` installed, 70 | 71 | ``` 72 | g++ test.cc -std=c++11 -o test 73 | ./test 74 | ``` 75 | 76 | # Supported Features 77 | 78 | The semantics of the assembler attempt to mimic the 
[WebAssembly standard](https://webassembly.github.io/spec/core/) closely. 79 | In the case of reserved keywords in C++ (such as export, xor, etc.), the mnemonic has an underscore appended (e.g. `export_`, `i32.xor_`). 80 | 81 | A couple of example uses follow: 82 | 83 | ### Recursion 84 | 85 | ```cpp 86 | struct Code : wasmblr::CodeGenerator { 87 | // NB: Needs to be a class variable; the function body is evaluated later 88 | uint32_t factorial; 89 | Code() : wasmblr::CodeGenerator() { 90 | factorial = function({f32}, {f32}, [&]() { 91 | local.get(0); 92 | f32.const_(1.0f); 93 | f32.lt(); 94 | // base case 95 | if_(f32); 96 | { 97 | f32.const_(1.0f); 98 | } 99 | else_(); 100 | { 101 | local.get(0); 102 | local.get(0); 103 | f32.const_(1.0f); 104 | f32.sub(); 105 | call(factorial); 106 | f32.mul(); 107 | } 108 | end(); 109 | }); 110 | export_(factorial, "factorial"); 111 | } 112 | }; 113 | ``` 114 | 115 | ### Blocks 116 | 117 | If-statements 118 | 119 | ```cpp 120 | struct Code : wasmblr::CodeGenerator { 121 | Code() : wasmblr::CodeGenerator() { 122 | auto if_func = function({f32}, {f32}, [&]() { 123 | f32.const_(0.0f); 124 | local.get(0); 125 | f32.gt(); 126 | if_(f32); 127 | f32.const_(0.0f); 128 | else_(); 129 | local.get(0); 130 | end(); 131 | }); 132 | export_(if_func, "relu"); 133 | } 134 | }; 135 | ``` 136 | 137 | Loops 138 | 139 | ```cpp 140 | struct Code : wasmblr::CodeGenerator { 141 | Code() : wasmblr::CodeGenerator() { 142 | auto loop_fn = function({}, {i32}, [&]() { 143 | auto i = local(i32); 144 | 145 | loop(void_); 146 | { 147 | local.get(i); 148 | i32.const_(1); 149 | i32.add(); 150 | local.set(i); 151 | 152 | local.get(i); 153 | i32.const_(10); 154 | i32.lt_s(); 155 | br_if(0); 156 | } 157 | end(); 158 | local.get(i); 159 | }); 160 | export_(loop_fn, "loop"); 161 | } 162 | }; 163 | ``` 164 | 165 | ### Memory 166 | 167 | ```cpp 168 | struct Code : wasmblr::CodeGenerator { 169 | Code() : wasmblr::CodeGenerator() { 170 | memory(1, 10).export_("mem"); 
171 | auto store = function({}, {}, [&]() { 172 | i32.const_(0); // index 0 173 | i32.const_(1337); // value 1337 174 | i32.store(0, 0); // align 0, offset 0 175 | }); 176 | export_(store, "store"); 177 | } 178 | }; 179 | ``` 180 | 181 | ### SIMD (32-bit lanes for now) 182 | 183 | ```cpp 184 | struct Code : wasmblr::CodeGenerator { 185 | Code() : wasmblr::CodeGenerator() { 186 | memory(1, 10).export_("mem"); 187 | auto square = function({}, {}, [&]() { 188 | auto vec = local(v128); 189 | i32.const_(0); 190 | v128.load(); 191 | local.set(vec); 192 | 193 | local.get(vec); 194 | local.get(vec); 195 | v128.f32x4_mul(); 196 | local.set(vec); 197 | 198 | i32.const_(0); 199 | local.get(vec); 200 | v128.store(); 201 | }); 202 | export_(square, "simd_square"); 203 | } 204 | }; 205 | ``` 206 | 207 | 208 | # TODO 209 | 210 | Many things. I would appreciate any help filing issues for missing things! 211 | -------------------------------------------------------------------------------- /emscripten_example/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | emcc add.cc -I../ -s EXPORTED_FUNCTIONS="['_add', '_jit_add', '_jit_add_len', '_free']" -s EXTRA_EXPORTED_RUNTIME_METHODS="['cwrap', 'ccall']" -O3 -DSIMD=1 -msimd128 -s ASSERTIONS=1 -s SINGLE_FILE=1 -s MODULARIZE=1 -s 'EXPORT_NAME="createMyModule"' -o add.js 3 | 4 | no_simd: 5 | emcc add.cc -I../ -s EXPORTED_FUNCTIONS="['_add', '_jit_add', '_jit_add_len', '_free']" -s EXTRA_EXPORTED_RUNTIME_METHODS="['cwrap', 'ccall']" -O3 -s ASSERTIONS=1 -s SINGLE_FILE=1 -s MODULARIZE=1 -s 'EXPORT_NAME="createMyModule"' -o add.js 6 | 7 | wasmblr_only: 8 | emcc add.cc -I../ -s EXPORTED_FUNCTIONS="['_jit_add', '_jit_add_len']" -Os -s SINGLE_FILE=1 -s MODULARIZE=1 -s ENVIRONMENT='web' -s 'EXPORT_NAME="createMyModule"' -fno-rtti -fno-exceptions -o add.js 9 | 10 | benchmark: 11 | node benchmark.js 12 | -------------------------------------------------------------------------------- 
/emscripten_example/README.md: -------------------------------------------------------------------------------- 1 | # Emscripten Integration Demo 2 | 3 | A detailed writeup of the contents of this folder can be found here: https://jott.live/markdown/wasm_vector_addition 4 | 5 | See `add.cc` for various implementations of vector addition and `Makefile` for the build command (I added `-O3 -msimd128` to make the benchmark more competitive). 6 | To try this example, ensure that `emcc` is in your path. 7 | 8 | ``` 9 | cd emscripten_example 10 | make 11 | node benchmark.js 12 | ``` 13 | 14 | If you change the value of `wasmblr_unroll` at the top of `benchmark.js`, different code will be generated. 15 | Amping it all the way up to 1024 shows some benefit over the default 16. 16 | 17 | On my MacBook M1, these are the results I get in node.js (`wasmblr_unroll = 16`): 18 | 19 | ![](https://i.imgur.com/SuInbUY.png) 20 | -------------------------------------------------------------------------------- /emscripten_example/add.cc: -------------------------------------------------------------------------------- 1 | #include "wasmblr.h" 2 | 3 | std::vector gen_add_hardcode(int len, int unroll) { 4 | assert(len % (unroll * 4) == 0); 5 | wasmblr::CodeGenerator cg; 6 | // we hardcode the inputs to be 7 | // 0 * N * sizeof(float), 1 * N * sizeof(float) 8 | // and the output to be 9 | // 2 * N * sizeof(float) 10 | auto pages = (len * 3 * 4) / (1 << 16) + 1; 11 | cg.memory(pages).export_("mem"); 12 | auto add_func = cg.function({cg.i32, cg.i32, cg.i32}, {}, [&]() { 13 | auto iter = cg.local(cg.i32); 14 | cg.i32.const_(0); 15 | cg.local.set(iter); 16 | 17 | cg.loop(cg.void_); 18 | 19 | for (auto i = 0; i < unroll; ++i) { 20 | cg.local.get(iter); 21 | 22 | cg.local.get(iter); 23 | cg.v128.load(0, i * 16); 24 | 25 | cg.local.get(iter); 26 | cg.v128.load(0, (len * 4) + i * 16); 27 | 28 | cg.v128.f32x4_add(); 29 | 30 | cg.v128.store(0, (len * 8) + i * 16); 31 | } 32 | 33 | cg.local.get(iter); 
34 | cg.i32.const_(unroll * 16); 35 | cg.i32.add(); 36 | cg.local.set(iter); 37 | 38 | cg.i32.const_(len * 4); // bytes 39 | cg.local.get(iter); 40 | cg.i32.ge_u(); 41 | cg.br_if(0); 42 | 43 | cg.end(); 44 | }); 45 | cg.export_(add_func, "add"); 46 | return cg.emit(); 47 | } 48 | 49 | std::vector gen_add_loop(int len) { 50 | assert(len % 4 == 0); 51 | wasmblr::CodeGenerator cg; 52 | auto pages = (len * 3 * 4) / (1 << 16) + 1; 53 | cg.memory(pages).export_("mem"); 54 | auto add_func = cg.function({cg.i32, cg.i32, cg.i32}, {}, [&]() { 55 | auto iter = cg.local(cg.i32); 56 | cg.i32.const_(0); 57 | cg.local.set(iter); 58 | 59 | cg.loop(cg.void_); 60 | { 61 | cg.local.get(2); 62 | cg.local.get(iter); 63 | cg.i32.add(); 64 | 65 | cg.local.get(0); 66 | cg.local.get(iter); 67 | cg.i32.add(); 68 | cg.v128.load(); 69 | 70 | cg.local.get(1); 71 | cg.local.get(iter); 72 | cg.i32.add(); 73 | cg.v128.load(); 74 | 75 | cg.v128.f32x4_add(); 76 | 77 | cg.v128.store(); 78 | 79 | cg.i32.const_(4 * 4); // vec of 4 floats 80 | cg.local.get(iter); 81 | cg.i32.add(); 82 | cg.local.set(iter); 83 | 84 | cg.i32.const_(len * 4); // bytes 85 | cg.local.get(iter); 86 | cg.i32.ge_u(); 87 | cg.br_if(0); 88 | } 89 | cg.end(); 90 | }); 91 | cg.export_(add_func, "add"); 92 | return cg.emit(); 93 | } 94 | 95 | std::vector gen_add_unroll(int len) { 96 | assert(len % 4 == 0); 97 | wasmblr::CodeGenerator cg; 98 | auto pages = (len * 3 * 4) / (1 << 16) + 1; 99 | cg.memory(pages).export_("mem"); 100 | auto add_func = cg.function({cg.i32, cg.i32, cg.i32}, {}, [&]() { 101 | // no loop at all 102 | for (auto i = 0; i < len / 4; ++i) { 103 | cg.local.get(2); 104 | 105 | cg.local.get(0); 106 | cg.v128.load(0, i * 16); 107 | 108 | cg.local.get(1); 109 | cg.v128.load(0, i * 16); 110 | 111 | cg.v128.f32x4_add(); 112 | 113 | cg.v128.store(0, i * 16); 114 | } 115 | }); 116 | cg.export_(add_func, "add"); 117 | return cg.emit(); 118 | } 119 | 120 | std::vector gen_add_mix_no_simd(int len, int unroll) { 121 | if (len 
< unroll) { 122 | unroll = len; 123 | } 124 | assert(len % (unroll) == 0); 125 | wasmblr::CodeGenerator cg; 126 | auto pages = (len * 3 * 4) / (1 << 16) + 1; 127 | cg.memory(pages).export_("mem"); 128 | auto add_func = cg.function({cg.i32, cg.i32, cg.i32}, {}, [&]() { 129 | auto iter = cg.local(cg.i32); 130 | cg.i32.const_(0); 131 | cg.local.set(iter); 132 | 133 | cg.loop(cg.void_); 134 | 135 | for (auto i = 0; i < unroll; ++i) { 136 | cg.local.get(2); 137 | 138 | cg.local.get(0); 139 | cg.f32.load(0, i * 4); 140 | 141 | cg.local.get(1); 142 | cg.f32.load(0, i * 4); 143 | 144 | cg.f32.add(); 145 | 146 | cg.f32.store(0, i * 4); 147 | } 148 | 149 | cg.local.get(0); 150 | cg.i32.const_(unroll * 4); 151 | cg.i32.add(); 152 | cg.local.set(0); 153 | 154 | cg.local.get(1); 155 | cg.i32.const_(unroll * 4); 156 | cg.i32.add(); 157 | cg.local.set(1); 158 | 159 | cg.local.get(2); 160 | cg.i32.const_(unroll * 4); 161 | cg.i32.add(); 162 | cg.local.set(2); 163 | 164 | cg.local.get(iter); 165 | cg.i32.const_(unroll * 4); 166 | cg.i32.add(); 167 | cg.local.set(iter); 168 | 169 | cg.i32.const_(len * 4); // bytes 170 | cg.local.get(iter); 171 | cg.i32.ge_s(); 172 | cg.br_if(0); 173 | 174 | cg.end(); 175 | }); 176 | cg.export_(add_func, "add"); 177 | return cg.emit(); 178 | } 179 | 180 | std::vector gen_add_mix(int len, int unroll) { 181 | assert(len % (unroll * 4) == 0); 182 | wasmblr::CodeGenerator cg; 183 | auto pages = (len * 3 * 4) / (1 << 16) + 1; 184 | cg.memory(pages).export_("mem"); 185 | auto add_func = cg.function({cg.i32, cg.i32, cg.i32}, {}, [&]() { 186 | auto iter = cg.local(cg.i32); 187 | cg.i32.const_(0); 188 | cg.local.set(iter); 189 | 190 | cg.loop(cg.void_); 191 | 192 | for (auto i = 0; i < unroll; ++i) { 193 | cg.local.get(2); 194 | 195 | cg.local.get(0); 196 | cg.v128.load(0, i * 16); 197 | 198 | cg.local.get(1); 199 | cg.v128.load(0, i * 16); 200 | 201 | cg.v128.f32x4_add(); 202 | 203 | cg.v128.store(0, i * 16); 204 | } 205 | 206 | cg.local.get(0); 207 | 
cg.i32.const_(unroll * 16); 208 | cg.i32.add(); 209 | cg.local.set(0); 210 | 211 | cg.local.get(1); 212 | cg.i32.const_(unroll * 16); 213 | cg.i32.add(); 214 | cg.local.set(1); 215 | 216 | cg.local.get(2); 217 | cg.i32.const_(unroll * 16); 218 | cg.i32.add(); 219 | cg.local.set(2); 220 | 221 | cg.local.get(iter); 222 | cg.i32.const_(unroll * 16); 223 | cg.i32.add(); 224 | cg.local.set(iter); 225 | 226 | cg.i32.const_(len * 4); // bytes 227 | cg.local.get(iter); 228 | cg.i32.ge_s(); 229 | cg.br_if(0); 230 | 231 | cg.end(); 232 | }); 233 | cg.export_(add_func, "add"); 234 | return cg.emit(); 235 | } 236 | 237 | std::vector gen_add(int len, int unroll, bool simd) { 238 | if (!simd) { 239 | return gen_add_mix_no_simd(len, unroll); 240 | } 241 | if (unroll * 4 >= len) { 242 | return gen_add_unroll(len); 243 | } else if (unroll <= 1) { 244 | return gen_add_loop(len); 245 | } 246 | return gen_add_mix(len, unroll); 247 | } 248 | 249 | extern "C" { 250 | 251 | void add(const float* a, const float* b, float* c, int len) { 252 | for (auto i = 0; i < len; ++i) { 253 | c[i] = a[i] + b[i]; 254 | } 255 | } 256 | 257 | #ifdef SIMD 258 | static bool simd = true; 259 | #else 260 | static bool simd = false; 261 | #endif 262 | 263 | uint8_t* jit_add(int len, int unroll) { 264 | auto bytes = gen_add(len, unroll, simd); 265 | uint8_t* out = (uint8_t*)malloc(bytes.size()); 266 | memcpy(out, bytes.data(), bytes.size()); 267 | return out; 268 | } 269 | 270 | int jit_add_len(int len, int unroll) { 271 | auto bytes = gen_add(len, unroll, simd); 272 | return bytes.size(); 273 | } 274 | } 275 | -------------------------------------------------------------------------------- /emscripten_example/benchmark.js: -------------------------------------------------------------------------------- 1 | const em = require('./add.js'); 2 | var Module; 3 | const wasmblr_unroll = 16; 4 | const warmup = 100; 5 | const target_ms = 1000; 6 | 7 | async function gen_pure(N) { 8 | let a = new Array(N).fill(0); 9 | 
let b = new Array(N).fill(0); 10 | let c = new Array(N).fill(0); 11 | 12 | function add() { 13 | for (let i = 0; i < N; ++i) { 14 | c[i] = a[i] + b[i]; 15 | } 16 | } 17 | 18 | return [add, a, b, c]; 19 | } 20 | 21 | async function gen_typed(N) { 22 | let a = new Float32Array(N); 23 | let b = new Float32Array(N); 24 | let c = new Float32Array(N); 25 | 26 | function add() { 27 | for (let i = 0; i < N; ++i) { 28 | c[i] = a[i] + b[i]; 29 | } 30 | } 31 | 32 | return [add, a, b, c]; 33 | } 34 | 35 | async function gen_emscripten(N) { 36 | function emscripten_array(len) { 37 | var ptr = Module._malloc(len * 4); 38 | return [new Float32Array(Module.HEAPF32.buffer, ptr, len), ptr]; 39 | } 40 | 41 | let [a, a_] = emscripten_array(N); 42 | let [b, b_] = emscripten_array(N); 43 | let [c, c_] = emscripten_array(N); 44 | const add = Module._add; 45 | 46 | return [() => add(a_, b_, c_, N), a, b, c, () => { 47 | Module._free(a_); 48 | Module._free(b_); 49 | Module._free(c_); 50 | }]; 51 | } 52 | 53 | async function gen_wasmblr(N, unroll) { 54 | const wasm = Module._jit_add(N, unroll); 55 | const wasm_len = Module._jit_add_len(N, unroll); 56 | const wasm_data = new Uint8Array(Module.HEAP8.buffer, wasm, wasm_len); 57 | const m = await WebAssembly.compile(wasm_data); 58 | const instance = await WebAssembly.instantiate(m, {}); 59 | 60 | let wasmblr_malloc_height = 0; 61 | let mem = instance.exports.mem; 62 | 63 | function wasmblr_array(len) { 64 | console.assert((mem.buffer.byteLength - wasmblr_malloc_height) > len * 4); 65 | let ptr = wasmblr_malloc_height; 66 | console.assert(([0, N * 4, N * 8]).indexOf(ptr) > -1, "allocated invalid ptr") 67 | let array = new Float32Array(mem.buffer, ptr, len); 68 | wasmblr_malloc_height += len * 4; 69 | return [array, ptr]; 70 | } 71 | let [a, a_] = wasmblr_array(N); 72 | let [b, b_] = wasmblr_array(N); 73 | let [c, c_] = wasmblr_array(N); 74 | 75 | const add = instance.exports.add; 76 | 77 | return [() => add(a_, b_, c_), a, b, c]; 78 | } 79 | 80 
| async function gen_wasmblr_tuned(N) { 81 | let best = 0; 82 | let best_time = 1e9; 83 | for (let i = 0; Math.pow(2, i) < Math.min(1024, N / 4 + 2); ++i) { 84 | let [fn, w_a, w_b, w_c] = await gen_wasmblr(N, Math.pow(2, i)); 85 | for (let _ = 0; _ < 100; ++_) { 86 | fn(); 87 | } 88 | const t = performance.now(); 89 | for (let _ = 0; _ < 1000; ++_) { 90 | fn(); 91 | } 92 | const diff = performance.now() - t; 93 | if (diff < best_time) { 94 | best = i; 95 | best_time = diff; 96 | } 97 | } 98 | return [...await gen_wasmblr(N, Math.pow(2, best)), Math.pow(2, best)]; 99 | } 100 | 101 | async function perf(N, name, fn) { 102 | const w0 = performance.now(); 103 | for (let i = 0; i < warmup; ++i) { 104 | fn(); 105 | } 106 | const w1 = performance.now(); 107 | let iters = Math.min(Math.max(warmup * target_ms / (w1 - w0), 1), 1e6); 108 | const t0 = performance.now(); 109 | for (let i = 0; i < iters; ++i) { 110 | fn(); 111 | } 112 | const t1 = performance.now(); 113 | const iters_sec = 1e3 * iters / (t1 - t0); 114 | const elem_sec = N * iters_sec; 115 | const gb_sec = elem_sec * 4 * 3 /* 2 read 1 write */ / 1e9; 116 | const round = (num) => Math.round(num * 100) / 100 117 | console.log(name, round(iters_sec), "iters/sec", `(${round(gb_sec)} GB/s)`); 118 | } 119 | 120 | async function benchmark(N) { 121 | let [pure_fn, p_a, p_b, p_c] = await gen_pure(N); 122 | let [typed_fn, t_a, t_b, t_c] = await gen_typed(N); 123 | let [emscripten_fn, e_a, e_b, e_c, emscripten_cleanup] = await gen_emscripten(N); 124 | let [wasmblr_fn, w_a, w_b, w_c] = await gen_wasmblr(N, wasmblr_unroll); 125 | let [wasmblr_tuned_fn, wt_a, wt_b, wt_c, unroll] = await gen_wasmblr_tuned(N); 126 | 127 | for (let i = 0; i < N; ++i) { 128 | let a = Math.random(); 129 | let b = Math.random(); 130 | p_a[i] = a; 131 | t_a[i] = a; 132 | e_a[i] = a; 133 | w_a[i] = a; 134 | wt_a[i] = a; 135 | 136 | p_b[i] = b; 137 | t_b[i] = b; 138 | e_b[i] = b; 139 | w_b[i] = b; 140 | wt_b[i] = b; 141 | } 142 | 143 | pure_fn(); 144 | 
typed_fn(); 145 | emscripten_fn(); 146 | wasmblr_fn(); 147 | wasmblr_tuned_fn(); 148 | 149 | for (let i = 0; i < N; ++i) { 150 | function check(arr, name) { 151 | if (Math.abs(t_c[i] - arr[i]) > 0.01) { 152 | console.log("difference found at index", i, t_c[i], "vs", name, arr[i]); 153 | return false; 154 | } 155 | return true; 156 | } 157 | if (!check(p_c, "pure")) { 158 | return; 159 | } 160 | if (!check(e_c, "emscripten")) { 161 | return; 162 | } 163 | if (!check(w_c, "wasmblr")) { 164 | return; 165 | } 166 | if (!check(wt_c, "wasmblr (tuned)")) { 167 | return; 168 | } 169 | } 170 | 171 | console.log("benchmarking vec add of size", N); 172 | await perf(N, " pure javascript: ", pure_fn); 173 | await perf(N, " typed arrays: ", typed_fn); 174 | await perf(N, " emscripten: ", emscripten_fn); 175 | await perf(N, " wasmblr: ", wasmblr_fn); 176 | await perf(N, ` wasmblr (tuned ${unroll}):`.padEnd(26), wasmblr_tuned_fn); 177 | 178 | emscripten_cleanup() 179 | } 180 | 181 | em().then(function(m) { 182 | Module = m; 183 | // any larger and you'll need to recompile to give emscripten more memory 184 | (async () => { 185 | for (let i of [4, 64, 1024, 16 * 1024, 256 * 1024]) { 186 | await benchmark(i); 187 | } 188 | })(); 189 | }); 190 | -------------------------------------------------------------------------------- /flops_example/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | emcc jit.cc -I../ -s EXPORTED_FUNCTIONS="['_jit_mac', '_jit_mac_len', '_free']" -O3 -s SINGLE_FILE=1 -s MODULARIZE=1 -s 'EXPORT_NAME="createMyModule"' -o jit.js 3 | 4 | -------------------------------------------------------------------------------- /flops_example/README.md: -------------------------------------------------------------------------------- 1 | # Demo of dynamically determining peak FLOPs 2 | 3 | This file sweeps through a range of arithmetic intensities 4 | to help determine the best configurations for running MAC-based 5 | 
operations such as matrix multiplication or convolution. 6 | 7 | To run it in browser right now: https://bwasti.github.io/wasmblr/flops/ 8 | 9 | ## Build the jit.js file 10 | 11 | ``` 12 | make 13 | ``` 14 | 15 | ## Use the jit.js file 16 | 17 | The benchmark code uses the generated `jit.js` file. 18 | An `index.html` file is provided to run the benchmark 19 | in the browser. 20 | 21 | ``` 22 | python3 -m http.server 23 | ``` 24 | -------------------------------------------------------------------------------- /flops_example/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 14 | 15 | This page helps determine peak floating point operations per second 16 | via chained multiply-adds. 17 | [source code] 18 | 19 | 20 |
21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /flops_example/jit.cc: -------------------------------------------------------------------------------- 1 | #include "wasmblr.h" 2 | 3 | std::vector gen(int32_t mac_per_load, int32_t load_per_loop, int32_t loops, bool simd) { 4 | wasmblr::CodeGenerator cg; 5 | // load per loop * 4 * (simd ? 4 : 1) 6 | int32_t mem_per_elem = load_per_loop * 4 * (simd ? 4 : 1); 7 | int32_t bytes = 3 * mem_per_elem; 8 | int32_t pages = bytes / (1 << 16) + 1; 9 | cg.memory(pages).export_("mem"); 10 | 11 | int32_t a_offset = 0 * mem_per_elem; 12 | int32_t b_offset = 1 * mem_per_elem; 13 | int32_t c_offset = 2 * mem_per_elem; 14 | 15 | auto func = cg.function({}, {}, [&]() { 16 | auto gen_local = [&]() { 17 | if (simd) { 18 | return cg.local(cg.v128); 19 | } 20 | return cg.local(cg.f32); 21 | }; 22 | 23 | std::vector a_locals; 24 | std::vector b_locals; 25 | std::vector c_locals; 26 | for (auto i = 0; i < load_per_loop; ++i) { 27 | a_locals.emplace_back(gen_local()); 28 | b_locals.emplace_back(gen_local()); 29 | c_locals.emplace_back(gen_local()); 30 | } 31 | 32 | auto iter = -1; 33 | if (loops > 1) { 34 | iter = cg.local(cg.i32); 35 | cg.i32.const_(0); 36 | cg.local.set(iter); 37 | 38 | cg.loop(cg.void_); 39 | } 40 | 41 | auto load_local = [&](int local, int32_t off, int i) { 42 | cg.i32.const_(0); 43 | if (simd) { 44 | cg.v128.load(1, off + i * 16); 45 | } else { 46 | cg.f32.load(1, off + i * 4); 47 | } 48 | cg.local.set(local); 49 | }; 50 | 51 | auto store_local = [&](int local, int32_t off, int i) { 52 | if (simd) { 53 | cg.v128.store(1, off + i * 16); 54 | } else { 55 | cg.f32.store(1, off + i * 4); 56 | } 57 | }; 58 | 59 | for (auto i = 0; i < load_per_loop; ++i) { 60 | load_local(a_locals.at(i), a_offset, i); 61 | load_local(b_locals.at(i), b_offset, i); 62 | load_local(c_locals.at(i), c_offset, i); 63 | 64 | for (auto m = 0; m < mac_per_load; ++m) { 65 | 
cg.local.get(c_locals.at(i)); 66 | cg.local.get(a_locals.at(i)); 67 | cg.local.get(b_locals.at(i)); 68 | if (simd) { 69 | cg.v128.f32x4_mul(); 70 | cg.v128.f32x4_add(); 71 | } else { 72 | cg.f32.mul(); 73 | cg.f32.add(); 74 | } 75 | cg.local.set(c_locals.at(i)); 76 | } 77 | 78 | cg.i32.const_(0); 79 | cg.local.get(c_locals.at(i)); 80 | store_local(c_locals.at(i), c_offset, i); 81 | } 82 | 83 | if (loops > 1) { 84 | cg.local.get(iter); 85 | cg.i32.const_(1); 86 | cg.i32.add(); 87 | cg.local.set(iter); 88 | 89 | cg.i32.const_(loops); 90 | cg.local.get(iter); 91 | cg.i32.ge_s(); 92 | cg.br_if(0); 93 | 94 | cg.end(); 95 | } 96 | 97 | }); 98 | cg.export_(func, "mac"); 99 | return cg.emit(); 100 | } 101 | 102 | extern "C" { 103 | 104 | uint8_t* jit_mac(int32_t mac_per_load, int32_t load_per_loop, int32_t loops, bool simd) { 105 | auto bytes = gen(mac_per_load, load_per_loop, loops, simd); 106 | uint8_t* out = (uint8_t*)malloc(bytes.size()); 107 | memcpy(out, bytes.data(), bytes.size()); 108 | return out; 109 | } 110 | 111 | size_t jit_mac_len(int32_t mac_per_load, int32_t load_per_loop, int32_t loops, bool simd) { 112 | auto bytes = gen(mac_per_load, load_per_loop, loops, simd); 113 | return bytes.size(); 114 | } 115 | 116 | } 117 | 118 | -------------------------------------------------------------------------------- /flops_example/main.js: -------------------------------------------------------------------------------- 1 | async function jit(Module, mac_per_load, loads_per_loop, loops, simd) { 2 | const wasm = Module._jit_mac(mac_per_load, loads_per_loop, loops, simd); 3 | const wasm_len = Module._jit_mac_len(mac_per_load, loads_per_loop, loops, simd); 4 | const wasm_data = new Uint8Array(Module.HEAP8.buffer, wasm, wasm_len); 5 | const m = await WebAssembly.compile(wasm_data); 6 | const instance = await WebAssembly.instantiate(m, {}); 7 | Module._free(wasm); 8 | const mem = instance.exports.mem; 9 | const elems = loads_per_loop * (simd ? 
4 : 1); 10 | let a = new Float32Array(mem.buffer, 0, elems); 11 | let b = new Float32Array(mem.buffer, elems * 4, elems); 12 | let c = new Float32Array(mem.buffer, elems * 8, elems); 13 | return [instance.exports.mac, a, b, c]; 14 | } 15 | 16 | function gen_ref(mac_per_load, loads_per_loop, loops, simd) { 17 | return function(A, B, C) { 18 | const elems = loads_per_loop * (simd ? 4 : 1); 19 | for (let l = 0; l < loops; ++l) { 20 | for (let ll = 0; ll < elems; ++ll) { 21 | let a = A[ll]; 22 | let b = B[ll]; 23 | let c = C[ll]; 24 | for (let m = 0; m < mac_per_load; ++m) { 25 | c = a * b + c 26 | } 27 | C[ll] = c; 28 | } 29 | } 30 | } 31 | } 32 | 33 | function log(...args) { 34 | const str = args.reduce((a, b) => { 35 | return a + " " + b; 36 | }, ""); 37 | document.querySelector('#output').appendChild(document.createTextNode(str)); 38 | document.querySelector('#output').appendChild(document.createElement('br')); 39 | } 40 | 41 | function log_best(...args) { 42 | document.querySelector('#best').innerHTML = ''; 43 | const str = args.reduce((a, b) => { 44 | return a + " " + b; 45 | }, ""); 46 | document.querySelector('#best').appendChild(document.createTextNode(str)); 47 | document.querySelector('#best').appendChild(document.createElement('br')); 48 | document.querySelector('#best').appendChild(document.createElement('br')); 49 | } 50 | 51 | function rand(a) { 52 | for (let i = 0; i < a.length; ++i) { 53 | a[i] = Math.random() / 100; 54 | } 55 | } 56 | 57 | function diff(a, b) { 58 | let max_diff = 0; 59 | for (let i = 0; i < a.length; ++i) { 60 | Math.max(Math.abs(a[i] - b[i]), max_diff); 61 | } 62 | return max_diff; 63 | } 64 | 65 | async function launch_mac_benchmark() { 66 | const Module = await createMyModule(); 67 | let simd_support = [0]; 68 | 69 | jit(Module, 1, 1, 1, true).then(() => { 70 | simd_support.push(1); 71 | }).catch(() => { 72 | log("no simd support"); 73 | }); 74 | 75 | let best_gflops = 0; 76 | let best_str = ''; 77 | for (let mac_per_load of [1, 
2, 4, 8, 16, 32]) { 78 | for (let loads_per_loop of [1, 2, 4, 8, 16, 32]) { 79 | for (let loops of [1, 16, 64, 128]) { 80 | for (let simd of simd_support) { 81 | const [fn, a, b, c] = await jit(Module, mac_per_load, loads_per_loop, loops, simd); 82 | const ops = loops * loads_per_loop * mac_per_load * (simd ? 4 : 1); 83 | rand(a); 84 | rand(b); 85 | rand(c); 86 | const ref_c = new Float32Array(c.length); 87 | ref_c.set(c); 88 | fn(); 89 | const str = `(MACs per load: ${mac_per_load}, Loads per loop ${loads_per_loop}, Loops: ${loops}, SIMD: ${simd})`; 90 | const err = diff(c, ref_c); 91 | if (err > 0.1) { 92 | log("error!", str, 'example elem:', c[0]); 93 | continue; 94 | } 95 | 96 | const iters_sec = bench(100, fn); 97 | const gflops = ops * 2 * iters_sec / 1e9; 98 | log(`${gflops} GFlops`, str); 99 | if (gflops > best_gflops) { 100 | best_gflops = gflops; 101 | best_str = str; 102 | log_best(`Best: ${gflops} GFlops`, str); 103 | } 104 | } 105 | } 106 | } 107 | } 108 | log_best(`done. Best: ${best_gflops} GFlops`, best_str); 109 | } 110 | 111 | function run_bench(target, fn) { 112 | let diff = 0; 113 | let num_iters = 1; 114 | while (diff < (target / 2)) { 115 | num_iters *= 2; 116 | const t0 = performance.now(); 117 | for (let i = 0; i < num_iters; ++i) { 118 | fn(); 119 | } 120 | const t1 = performance.now(); 121 | diff = t1 - t0; 122 | } 123 | const iters_sec = 1e3 * num_iters / diff; 124 | return iters_sec; 125 | } 126 | 127 | function bench(target_ms, fn) { 128 | const warmup_ms = target_ms / 10; 129 | run_bench(warmup_ms, fn); 130 | return run_bench(target_ms, fn); 131 | } -------------------------------------------------------------------------------- /matmul_example/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | emcc mm.cc -I../ -s EXPORTED_FUNCTIONS="['_jit_mm', '_jit_mm_len', '_jit_mm_naive', '_jit_mm_naive_len', '_jit_mm_nosimd', '_jit_mm_nosimd_len', '_free']" -O3 -s ASSERTIONS=1 -s SINGLE_FILE=1 
-s MODULARIZE -s 'EXPORT_NAME="createMyModule"' -s INITIAL_MEMORY=67108864 -o mm.js 3 | -------------------------------------------------------------------------------- /matmul_example/README.md: -------------------------------------------------------------------------------- 1 | ## Matmul Example 2 | 3 | This folder contains an example using emscripten and wasmblr to generate various tuned matrix multiplication implementations 4 | on the fly in the browser. 5 | 6 | A writeup can be found here: https://jott.live/markdown/mm_wasm 7 | 8 | To use the demo ensure that `emcc` is in your path. 9 | 10 | ``` 11 | make 12 | ``` 13 | 14 | and then host a server 15 | 16 | ``` 17 | python3 -m http.server 18 | ``` 19 | 20 | http://localhost should have a tuning script up and running. 21 | -------------------------------------------------------------------------------- /matmul_example/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |
 4 | Select the benchmark to tune for:
 5 | 
6 | 7 | 8 | 9 |

10 | 

11 | 


--------------------------------------------------------------------------------
/matmul_example/main.js:
--------------------------------------------------------------------------------
  1 | function log(...args) {
  2 |   const str = args.reduce((a, b) => {
  3 |     return a + " " + b;
  4 |   }, "");
  5 |   document.querySelector('#output').appendChild(document.createTextNode(str));
  6 |   document.querySelector('#output').appendChild(document.createElement('br'));
  7 | }
  8 | 
  9 | async function has_simd(Module) {
 10 |   const wasm = Module._jit_mm(4,4,4,1,1,1);
 11 |   const wasm_len = Module._jit_mm_len(4,4,4,1,1,1);
 12 |   const wasm_data = new Uint8Array(Module.HEAP8.buffer, wasm, wasm_len);
 13 |   let has = true;
 14 |   const m = await WebAssembly.compile(wasm_data).catch(e => {
 15 |     has = false;
 16 |   });
 17 |   Module._free(wasm);
 18 |   return has;
 19 | }
 20 | 
 21 | async function jit(Module, M, N, K, Mu, Nu, Ku) {
 22 |   let [jit, len] = [Module._jit_mm, Module._jit_mm_len];
 23 |   const simd = await has_simd(Module);
 24 |   if (!simd) {
 25 |     [jit, len] = [Module._jit_mm_nosimd, Module._jit_mm_nosimd_len];
 26 |   }
 27 |   const wasm = jit(M, N, K, Mu, Nu, Ku);
 28 |   const wasm_len = len(M, N, K, Mu, Nu, Ku);
 29 |   const wasm_data = new Uint8Array(Module.HEAP8.buffer, wasm, wasm_len);
 30 |   const m = await WebAssembly.compile(wasm_data).catch(e => log('Error compiling ->', e));
 31 |   const instance = await WebAssembly.instantiate(m, {});
 32 |   Module._free(wasm);
 33 |   const mem = instance.exports.mem;
 34 |   let a = new Float32Array(mem.buffer, 0, M * K);
 35 |   let b = new Float32Array(mem.buffer, M * K * 4, K * N);
 36 |   let c = new Float32Array(mem.buffer, (M * K + K * N) * 4, M * N);
 37 |   return [instance.exports.mm, a, b, c, simd];
 38 | }
 39 | 
 40 | function ref_mm(a, b, M, N, K) {
 41 |   c = new Float32Array(M * N);
 42 |   for (let m = 0; m < M; ++m) {
 43 |     for (let n = 0; n < N; ++n) {
 44 |       for (let k = 0; k < K; ++k) {
 45 |         c[m * N + n] += a[m * K + k] * b[k * N + n];
 46 |       }
 47 |     }
 48 |   }
 49 |   return c;
 50 | }
 51 | 
 52 | async function bench(m, M, N, K, Mu, Nu, Ku) {
 53 |   const [fn, a, b, c, simd] = await jit(m, M, N, K, Mu, Nu, Ku);
 54 |   for (let i = 0; i < N * N; ++i) {
 55 |     a[i] = Math.random();
 56 |     b[i] = Math.random();
 57 |     c[i] = 0;
 58 |   }
 59 |   fn();
 60 |   const ref_c = ref_mm(a, b, M, N, K);
 61 |   let max_diff = 0;
 62 |   for (let i = 0; i < M * N; ++i) {
 63 |     max_diff = Math.max(max_diff, Math.abs(ref_c[i] - c[i]));
 64 |   }
 65 |   console.log("max diff", max_diff);
 66 |   if (max_diff > 0.1) {
 67 |     log("error! max diff", max_diff);
 68 |   }
 69 |   for (let i = 0; i < 10; ++i) {
 70 |     fn();
 71 |   }
 72 |   // ~0.1if we hit 40gflops
 73 |   const iters = 4e9 / (M * N * K * 2) / (simd ? 1 : 4);
 74 |   const t = performance.now();
 75 |   for (let _ = 0; _ < iters; ++_) {
 76 |     fn();
 77 |   }
 78 |   const diff = performance.now() - t;
 79 |   return 1e3 * N * N * N * 2 * iters / diff / 1e9;
 80 | }
 81 | 
 82 | async function init(N) {
 83 |   document.getElementById("output").textContent = '';
 84 |   document.getElementById("highlight").textContent = '';
 85 |   let mod = await createMyModule();
 86 |   const M = N;
 87 |   const K = N;
 88 |   let best_gflops = 0;
 89 |   let best_str = '';
 90 |   const simd = await has_simd(mod);
 91 |   if (!simd) {
 92 |     log('No simd found, falling back to scalar code.');
 93 |   }
 94 |   for (let m of [1, 2, 4, 8, 16, 32]) {
 95 |     for (let n of [1, 2, 4, 8, 16, 32]) {
 96 |       for (let k of [1, 2, 4, 8, 16, 32]) {
 97 |         if (k > K) {
 98 |           continue;
 99 |         }
100 |         if (m > M) {
101 |           continue;
102 |         }
103 |         if (n * 4 > N) {
104 |           continue;
105 |         }
106 |         let gflops = await bench(mod, M, N, K, m, n, k);
107 |         if (gflops > best_gflops) {
108 |           best_gflops = gflops;
109 |           let pre = document.getElementById("highlight");
110 |           best_str = `best gflops: ${best_gflops} (unroll m: ${m}, n: ${n}, k: ${k})`;
111 |           pre.textContent = best_str;
112 |         }
113 |         log(m, n, k, "gflops", gflops);
114 |       }
115 |     }
116 |   }
117 |   let pre = document.getElementById("highlight");
118 |   let str = `(done) ${best_str}`;
119 |   pre.textContent = str;
120 | }
121 | 
// Once the page has loaded, wire each benchmark button to a tuning run of
// the matching matrix size.
window.addEventListener('load', function() {
  const buttons = [
    ['mm128', 128],
    ['mm256', 256],
    ['mm512', 512],
  ];
  for (const [id, size] of buttons) {
    document.getElementById(id).addEventListener('click', () => init(size));
  }
});
127 | 


--------------------------------------------------------------------------------
/matmul_example/mm.cc:
--------------------------------------------------------------------------------
  1 | #include "wasmblr.h"
  2 | 
  3 | struct MMGenNoSIMD : public wasmblr::CodeGenerator {
  4 |   MMGenNoSIMD(int M, int N, int K, int M_unroll, int N_unroll, int K_unroll) {
  5 |     assert(M_unroll <= M && "Invalid M unroll size");
  6 |     assert(N_unroll <= N && "Invalid N unroll size");
  7 |     assert(K_unroll <= K && "Invalid K unroll size");
  8 |     auto pages = (M * N + K * N + M * K) * 4 / (1 << 16) + 1;
  9 |     auto A_off = 0;
 10 |     auto B_off = M * K * 4;
 11 |     auto C_off = (M * K + K * N) * 4;
 12 |     memory(pages).export_("mem");
 13 |     auto fn = function({}, {}, [=]() {
 14 |       auto m = local(i32);
 15 |       auto n = local(i32);
 16 |       auto k = local(i32);
 17 |       std::vector load_a;
 18 |       std::vector load_b;
 19 |       for (auto j = 0; j < K_unroll; ++j) {
 20 |         for (auto i = 0; i < M_unroll; ++i) {
 21 |           load_a.emplace_back(local(f32));
 22 |         }
 23 |         for (auto i = 0; i < N_unroll; ++i) {
 24 |           load_b.emplace_back(local(f32));
 25 |         }
 26 |       }
 27 |       auto a_off = local(i32);
 28 |       auto b_off = local(i32);
 29 |       auto c_off = local(i32);
 30 |       std::vector accs;
 31 |       for (auto i = 0; i < M_unroll * N_unroll; ++i) {
 32 |         accs.emplace_back(local(f32));
 33 |       }
 34 | 
 35 |       i32.const_(0);
 36 |       local.set(m);
 37 |       loop(void_); // M
 38 | 
 39 |       local.get(m);
 40 |       i32.const_(N * 4);
 41 |       i32.mul();
 42 |       local.set(c_off);
 43 | 
 44 |       i32.const_(0);
 45 |       local.set(n);
 46 |       loop(void_); // N
 47 | 
 48 |       for (auto m_unroll = 0; m_unroll < M_unroll; ++m_unroll) {
 49 |         for (auto n_unroll = 0; n_unroll < N_unroll; ++n_unroll) {
 50 |           local.get(c_off);
 51 |           f32.load(0, C_off + n_unroll * 4 + m_unroll * N * 4);
 52 |           local.set(accs.at(m_unroll * N_unroll + n_unroll));
 53 |         }
 54 |       }
 55 | 
 56 |       local.get(m);
 57 |       i32.const_(K * 4);
 58 |       i32.mul();
 59 |       local.set(a_off);
 60 | 
 61 |       local.get(n);
 62 |       i32.const_(4 * N_unroll);
 63 |       i32.mul();
 64 |       local.set(b_off);
 65 | 
 66 |       i32.const_(0);
 67 |       local.set(k);
 68 |       loop(void_); // K
 69 | 
 70 |       for (auto k_unroll = 0; k_unroll < K_unroll; ++k_unroll) {
 71 |         for (auto m_unroll = 0; m_unroll < M_unroll; ++m_unroll) {
 72 |           local.get(a_off);
 73 |           f32.load(0, A_off + (m_unroll * K + k_unroll) * 4);
 74 |           local.set(load_a.at(m_unroll * K_unroll + k_unroll));
 75 |         }
 76 | 
 77 |         for (auto n_unroll = 0; n_unroll < N_unroll; ++n_unroll) {
 78 |           local.get(b_off);
 79 |           f32.load(0, B_off + (k_unroll * N + n_unroll) * 4);
 80 |           local.set(load_b.at(n_unroll * K_unroll + k_unroll));
 81 |         }
 82 | 
 83 |         for (auto m_unroll = 0; m_unroll < M_unroll; ++m_unroll) {
 84 |           for (auto n_unroll = 0; n_unroll < N_unroll; ++n_unroll) {
 85 | 
 86 |             local.get(accs.at(m_unroll * N_unroll + n_unroll));
 87 |             local.get(load_a.at(m_unroll * K_unroll + k_unroll));
 88 |             local.get(load_b.at(n_unroll * K_unroll + k_unroll));
 89 |             f32.mul();
 90 |             f32.add();
 91 |             local.set(accs.at(m_unroll * N_unroll + n_unroll));
 92 |           }
 93 |         }
 94 |       }
 95 | 
 96 |       local.get(a_off);
 97 |       i32.const_(4 * K_unroll);
 98 |       i32.add();
 99 |       local.set(a_off);
100 | 
101 |       local.get(b_off);
102 |       i32.const_(N * 4 * K_unroll);
103 |       i32.add();
104 |       local.set(b_off);
105 | 
106 |       local.get(k);
107 |       i32.const_(K_unroll);
108 |       i32.add();
109 |       local.tee(k);
110 |       i32.const_(K);
111 |       i32.lt_u();
112 |       br_if(0);
113 | 
114 |       end(); // K
115 | 
116 |       // store output of C
117 |       for (auto m_unroll = 0; m_unroll < M_unroll; ++m_unroll) {
118 |         for (auto n_unroll = 0; n_unroll < N_unroll; ++n_unroll) {
119 |           local.get(c_off);
120 |           local.get(accs.at(m_unroll * N_unroll + n_unroll));
121 |           f32.store(0, C_off + n_unroll * 4 + m_unroll * N * 4);
122 |         }
123 |       }
124 | 
125 |       local.get(c_off);
126 |       i32.const_(N_unroll * 4);
127 |       i32.add();
128 |       local.set(c_off);
129 | 
130 |       local.get(n);
131 |       i32.const_(1);
132 |       i32.add();
133 |       local.tee(n);
134 |       i32.const_(N / N_unroll);
135 |       i32.lt_u();
136 |       br_if(0);
137 | 
138 |       end(); // N
139 | 
140 |       local.get(m);
141 |       i32.const_(M_unroll);
142 |       i32.add();
143 |       local.tee(m);
144 |       i32.const_(M);
145 |       i32.lt_u();
146 |       br_if(0);
147 | 
148 |       end(); // M
149 |     });
150 |     export_(fn, "mm");
151 |   }
152 | };
153 | 
154 | struct MMGenSimple : public wasmblr::CodeGenerator {
155 |   MMGenSimple(int M, int N, int K) {
156 |     auto pages = (M * N + K * N + M * K) * 4 / (1 << 16) + 1;
157 |     auto A_off = 0;
158 |     auto B_off = M * K * 4;
159 |     auto C_off = (M * K + K * N) * 4;
160 |     memory(pages).export_("mem");
161 |     auto fn = function({}, {}, [=]() {
162 |       auto m = local(i32);
163 |       auto n = local(i32);
164 |       auto k = local(i32);
165 | 
166 |       // loop over m
167 |       i32.const_(0);
168 |       local.set(m);
169 |       loop(void_);
170 | 
171 |       // loop over n
172 |       i32.const_(0);
173 |       local.set(n);
174 |       loop(void_);
175 | 
176 |       // loop over k
177 |       i32.const_(0);
178 |       local.set(k);
179 |       loop(void_);
180 | 
181 |       // load original value of C
182 |       local.get(m);
183 |       i32.const_(N);
184 |       i32.mul();
185 |       local.get(n);
186 |       i32.add();
187 |       i32.const_(4);
188 |       i32.mul();
189 |       f32.load(0, C_off); // stack: [C]
190 | 
191 |       // load value of A
192 |       local.get(m);
193 |       i32.const_(K);
194 |       i32.mul();
195 |       local.get(k);
196 |       i32.add();
197 |       i32.const_(4);
198 |       i32.mul();
199 |       f32.load(0, A_off); // stack: [A, C]
200 | 
201 |       // load value of B
202 |       local.get(k);
203 |       i32.const_(N);
204 |       i32.mul();
205 |       local.get(n);
206 |       i32.add();
207 |       i32.const_(4);
208 |       i32.mul();
209 |       f32.load(0, B_off); // stack: [B, A, C]
210 | 
211 |       f32.mul(); // stack: [B * A, C]
212 |       f32.add(); // stack: [B * A + C]
213 |       auto c = local(f32);
214 |       local.set(c); // save temporarily
215 | 
216 |       // store new value to C
217 |       local.get(m);
218 |       i32.const_(N);
219 |       i32.mul();
220 |       local.get(n);
221 |       i32.add();
222 |       i32.const_(4);
223 |       i32.mul();
224 |       local.get(c); // push the saved value back to the stack
225 |       f32.store(0, C_off);
226 | 
227 |       // loop tail for k
228 |       local.get(k);
229 |       i32.const_(1);
230 |       i32.add();
231 |       local.tee(k);
232 |       i32.const_(K);
233 |       i32.lt_u();
234 |       br_if(0);
235 |       end();
236 | 
237 |       // loop tail for n
238 |       local.get(n);
239 |       i32.const_(1);
240 |       i32.add();
241 |       local.tee(n);
242 |       i32.const_(N);
243 |       i32.lt_u();
244 |       br_if(0);
245 |       end();
246 | 
247 |       // loop tail for m
248 |       local.get(m);
249 |       i32.const_(1);
250 |       i32.add();
251 |       local.tee(m);
252 |       i32.const_(M);
253 |       i32.lt_u();
254 |       br_if(0);
255 |       end(); // M
256 |     });
257 |     export_(fn, "mm");
258 |   }
259 | };
260 | 
261 | struct MMGen : public wasmblr::CodeGenerator {
262 |   MMGen(int M, int N, int K, int M_unroll, int N_unroll, int K_unroll) {
263 |     assert(M_unroll <= M && "Invalid M unroll size");
264 |     assert((N_unroll * 4) <= N && "Invalid N unroll size");
265 |     assert(K_unroll <= K && "Invalid K unroll size");
266 |     auto pages = (M * N + K * N + M * K) * 4 / (1 << 16) + 1;
267 |     auto A_off = 0;
268 |     auto B_off = M * K * 4;
269 |     auto C_off = (M * K + K * N) * 4;
270 |     memory(pages).export_("mem");
271 |     auto fn = function({}, {}, [=]() {
272 |       auto m = local(i32);
273 |       auto n = local(i32);
274 |       auto k = local(i32);
275 |       std::vector load_a;
276 |       std::vector load_b;
277 |       for (auto j = 0; j < K_unroll; ++j) {
278 |         for (auto i = 0; i < M_unroll; ++i) {
279 |           load_a.emplace_back(local(v128));
280 |         }
281 |         for (auto i = 0; i < N_unroll; ++i) {
282 |           load_b.emplace_back(local(v128));
283 |         }
284 |       }
285 |       auto a_off = local(i32);
286 |       auto b_off = local(i32);
287 |       auto c_off = local(i32);
288 |       std::vector accs;
289 |       for (auto i = 0; i < M_unroll * N_unroll; ++i) {
290 |         accs.emplace_back(local(v128));
291 |       }
292 | 
293 |       i32.const_(0);
294 |       local.set(m);
295 |       loop(void_); // M
296 | 
297 |       local.get(m);
298 |       i32.const_(N * 4);
299 |       i32.mul();
300 |       local.set(c_off);
301 | 
302 |       i32.const_(0);
303 |       local.set(n);
304 |       loop(void_); // N
305 | 
306 |       for (auto m_unroll = 0; m_unroll < M_unroll; ++m_unroll) {
307 |         for (auto n_unroll = 0; n_unroll < N_unroll; ++n_unroll) {
308 |           local.get(c_off);
309 |           v128.load(0, C_off + n_unroll * 4 * 4 + m_unroll * N * 4);
310 |           local.set(accs.at(m_unroll * N_unroll + n_unroll));
311 |         }
312 |       }
313 | 
314 |       local.get(m);
315 |       i32.const_(K * 4);
316 |       i32.mul();
317 |       local.set(a_off);
318 | 
319 |       local.get(n);
320 |       i32.const_(4 * 4 * N_unroll);
321 |       i32.mul();
322 |       local.set(b_off);
323 | 
324 |       i32.const_(0);
325 |       local.set(k);
326 |       loop(void_); // K
327 | 
328 |       for (auto k_unroll = 0; k_unroll < K_unroll; ++k_unroll) {
329 |         for (auto m_unroll = 0; m_unroll < M_unroll; ++m_unroll) {
330 |           local.get(a_off);
331 |           v128.load32_splat(0, A_off + (m_unroll * K + k_unroll) * 4);
332 |           local.set(load_a.at(m_unroll * K_unroll + k_unroll));
333 |         }
334 | 
335 |         for (auto n_unroll = 0; n_unroll < N_unroll; ++n_unroll) {
336 |           local.get(b_off);
337 |           v128.load(0, B_off + (k_unroll * N + n_unroll * 4) * 4);
338 |           local.set(load_b.at(n_unroll * K_unroll + k_unroll));
339 |         }
340 | 
341 |         for (auto m_unroll = 0; m_unroll < M_unroll; ++m_unroll) {
342 |           for (auto n_unroll = 0; n_unroll < N_unroll; ++n_unroll) {
343 | 
344 |             local.get(accs.at(m_unroll * N_unroll + n_unroll));
345 |             local.get(load_a.at(m_unroll * K_unroll + k_unroll));
346 |             local.get(load_b.at(n_unroll * K_unroll + k_unroll));
347 |             v128.f32x4_mul();
348 |             v128.f32x4_add();
349 |             local.set(accs.at(m_unroll * N_unroll + n_unroll));
350 |           }
351 |         }
352 |       }
353 | 
354 |       local.get(a_off);
355 |       i32.const_(4 * K_unroll);
356 |       i32.add();
357 |       local.set(a_off);
358 | 
359 |       local.get(b_off);
360 |       i32.const_(N * 4 * K_unroll);
361 |       i32.add();
362 |       local.set(b_off);
363 | 
364 |       local.get(k);
365 |       i32.const_(K_unroll);
366 |       i32.add();
367 |       local.tee(k);
368 |       i32.const_(K);
369 |       i32.lt_u();
370 |       br_if(0);
371 | 
372 |       end(); // K
373 | 
374 |       // store output of C
375 |       for (auto m_unroll = 0; m_unroll < M_unroll; ++m_unroll) {
376 |         for (auto n_unroll = 0; n_unroll < N_unroll; ++n_unroll) {
377 |           local.get(c_off);
378 |           local.get(accs.at(m_unroll * N_unroll + n_unroll));
379 |           v128.store(0, C_off + n_unroll * 4 * 4 + m_unroll * N * 4);
380 |         }
381 |       }
382 | 
383 |       local.get(c_off);
384 |       i32.const_(N_unroll * 4 * 4);
385 |       i32.add();
386 |       local.set(c_off);
387 | 
388 |       local.get(n);
389 |       i32.const_(1);
390 |       i32.add();
391 |       local.tee(n);
392 |       i32.const_(N / 4 / N_unroll);
393 |       i32.lt_u();
394 |       br_if(0);
395 | 
396 |       end(); // N
397 | 
398 |       local.get(m);
399 |       i32.const_(M_unroll);
400 |       i32.add();
401 |       local.tee(m);
402 |       i32.const_(M);
403 |       i32.lt_u();
404 |       br_if(0);
405 | 
406 |       end(); // M
407 |     });
408 |     export_(fn, "mm");
409 |   }
410 | };
411 | 
412 | extern "C" {
413 | 
414 | uint8_t *jit_mm_naive(int M, int N, int K) {
      // Constructs an MMGenSimple generator for (M, N, K), emits its bytes and
      // returns them in a freshly malloc'd copy. The byte count is not
      // returned here; jit_mm_naive_len(M, N, K) regenerates the module to
      // report it.
      // NOTE(review): the malloc result is not checked before memcpy, and no
      // matching free is visible in this chunk -- presumably the caller owns
      // and frees the buffer; confirm.
415 |   MMGenSimple mm(M, N, K);
416 |   auto bytes = mm.emit();
417 |   uint8_t *out = (uint8_t *)malloc(bytes.size());
418 |   memcpy(out, bytes.data(), bytes.size());
419 |   return out;
420 | }
421 | 
422 | int jit_mm_naive_len(int M, int N, int K) {
      // Returns the number of bytes MMGenSimple emits for (M, N, K). The whole
      // module is generated again just to measure it; the size() result is
      // implicitly narrowed to int on return.
423 |   MMGenSimple mm(M, N, K);
424 |   auto bytes = mm.emit();
425 |   return bytes.size();
426 | }
427 | 
428 | uint8_t *jit_mm(int M, int N, int K, int Mu, int Nu, int Ku) {
      // Constructs an MMGen generator from the problem sizes (M, N, K) and the
      // three extra arguments (Mu, Nu, Ku -- presumably unroll factors; their
      // meaning is defined by MMGen), emits its bytes and returns a malloc'd
      // copy. Use jit_mm_len with the same arguments to obtain the length.
      // NOTE(review): the malloc result is not checked before memcpy.
429 |   MMGen mm(M, N, K, Mu, Nu, Ku);
430 |   auto bytes = mm.emit();
431 |   uint8_t *out = (uint8_t *)malloc(bytes.size());
432 |   memcpy(out, bytes.data(), bytes.size());
433 |   return out;
434 | }
435 | 
436 | int jit_mm_len(int M, int N, int K, int Mu, int Nu, int Ku) {
      // Returns the number of bytes MMGen emits for these arguments. The
      // module is generated again just to measure it; size() is implicitly
      // narrowed to int on return.
437 |   MMGen mm(M, N, K, Mu, Nu, Ku);
438 |   auto bytes = mm.emit();
439 |   return bytes.size();
440 | }
441 | 
442 | uint8_t *jit_mm_nosimd(int M, int N, int K, int Mu, int Nu, int Ku) {
      // Same shape as jit_mm, but uses the MMGenNoSIMD generator: emits the
      // module bytes and returns them in a malloc'd copy. Use
      // jit_mm_nosimd_len with the same arguments to obtain the length.
      // NOTE(review): the malloc result is not checked before memcpy.
443 |   MMGenNoSIMD mm(M, N, K, Mu, Nu, Ku);
444 |   auto bytes = mm.emit();
445 |   uint8_t *out = (uint8_t *)malloc(bytes.size());
446 |   memcpy(out, bytes.data(), bytes.size());
447 |   return out;
448 | }
449 | 
450 | int jit_mm_nosimd_len(int M, int N, int K, int Mu, int Nu, int Ku) {
      // Returns the number of bytes MMGenNoSIMD emits for these arguments. The
      // module is generated again just to measure it; size() is implicitly
      // narrowed to int on return.
451 |   MMGenNoSIMD mm(M, N, K, Mu, Nu, Ku);
452 |   auto bytes = mm.emit();
453 |   return bytes.size();
454 | }
455 | }
456 | 


--------------------------------------------------------------------------------
/test.cc:
--------------------------------------------------------------------------------
  1 | #include 
  2 | #include 
  3 | #include 
  4 | #include "wasmblr.h"
  5 | 
  6 | void testJS(wasmblr::CodeGenerator& c,
  7 |             std::string invoke,
  8 |             std::string expected) {
        // Test harness: embeds the bytes emitted by `c` in a JavaScript file,
        // appends the `invoke` snippet, runs the file with `node`, and asserts
        // that the captured stdout equals `expected` exactly.
        //   c        - generator whose emit() output is instantiated in node
        //   invoke   - JavaScript appended after `instance` has been created
        //   expected - exact stdout text expected from the script
  9 |   std::stringstream ss;
        // Spell the module out as a Uint8Array literal, one hex byte at a time.
 10 |   ss << "const wasm = new Uint8Array([";
 11 |   for (const auto& b : c.emit()) {
 12 |     ss << "0x" << std::hex << static_cast(b) << ", ";
 13 |   }
 14 |   ss << "]);\n";
 15 |   ss << "const m = new WebAssembly.Module(wasm);\n";
 16 |   ss << "const instance = new WebAssembly.Instance(m, {});\n";
 17 |   ss << invoke;
 18 | 
        // Fixed paths under /tmp are reused by every call.
 19 |   std::string node_file = "/tmp/test.js";
 20 |   std::string out_file = "/tmp/test.out";
 21 |   std::ofstream nf(node_file);
 22 |   nf << ss.str();
        // Flush so node sees the whole script; nf itself stays open.
 23 |   nf << std::flush;
        // The return value of std::system is ignored; a failure to run node
        // only shows up as an output mismatch below.
 24 |   std::system(("node " + node_file + " > " + out_file).c_str());
 25 |   std::stringstream ss_out;
 26 |   ss_out << std::ifstream(out_file).rdbuf();
 27 |   if (ss_out.str() != expected) {
          // On mismatch, report both strings and dump the module to
          // error.wasm (in the current directory) before the assert fires.
 28 |     std::cerr << "got: " << ss_out.str();
 29 |     std::cerr << "expected: " << expected;
 30 |     std::ofstream wasm("error.wasm", std::ios::binary);
 31 |     std::cerr << "generated wasm saved to error.wasm\n";
 32 |     auto bytes = c.emit();
 33 |     wasm.write((char*)bytes.data(), bytes.size());
 34 |   }
 35 |   assert(ss_out.str() == expected && "failed");
 36 | }
 37 | 
 38 | void testBasic() {
        // Exports "add", an (f32, f32) -> f32 function that adds its two
        // parameters, and checks that add(8, 4) prints 12.
 39 |   struct Code : wasmblr::CodeGenerator {
 40 |     Code() : wasmblr::CodeGenerator() {
 41 |       auto add_func = function({f32, f32}, {f32}, [&]() {
 42 |         local.get(0);
 43 |         local.get(1);
 44 |         f32.add();
 45 |       });
 46 |       export_(add_func, "add");
 47 |     }
 48 |   };
 49 |   Code c;
 50 |   testJS(c, "console.log(instance.exports.add(8, 4));", "12\n");
 51 | }
 52 | 
 53 | void testConstant() {
        // Exports "constant", a () -> i32 function returning 1024 * 1024 * 4,
        // and checks that it prints 4194304. The value needs more than one
        // byte in the signed encoding used by i32.const_.
 54 |   struct Code : wasmblr::CodeGenerator {
 55 |     Code() : wasmblr::CodeGenerator() {
 56 |       auto constant_func =
 57 |           function({}, {i32}, [&]() { i32.const_(1024 * 1024 * 4); });
 58 |       export_(constant_func, "constant");
 59 |     }
 60 |   };
 61 |   Code c;
 62 |   testJS(c, "console.log(instance.exports.constant());", "4194304\n");
 63 | }
 64 | 
 65 | void testRecursive() {
        // Exports "factorial", an f32 -> f32 function that calls itself:
        // returns 1 when the argument is < 1, otherwise
        // arg * factorial(arg - 1). Checks factorial(4) and factorial(7).
 66 |   struct Code : wasmblr::CodeGenerator {
 67 |     // NB: Needs to be a class variable,
 68 |     // the function body is evaluated later
 69 |     uint32_t factorial;
 70 |     Code() : wasmblr::CodeGenerator() {
 71 |       factorial = function({f32}, {f32}, [&]() {
              // condition: arg < 1.0
 72 |         local.get(0);
 73 |         f32.const_(1.0f);
 74 |         f32.lt();
 75 |         if_(f32);
 76 |         { f32.const_(1.0f); }
 77 |         else_();
 78 |         {
                // arg * factorial(arg - 1.0)
 79 |           local.get(0);
 80 |           local.get(0);
 81 |           f32.const_(1.0f);
 82 |           f32.sub();
 83 |           call(factorial);
 84 |           f32.mul();
 85 |         }
 86 |         end();
 87 |       });
 88 |       export_(factorial, "factorial");
 89 |     }
 90 |   };
 91 |   Code c;
 92 |   testJS(c, "console.log(instance.exports.factorial(4));", "24\n");
 93 |   testJS(c, "console.log(instance.exports.factorial(7));", "5040\n");
 94 | }
 95 | 
 96 | void testIfStatement() {
        // Exports "relu", an f32 -> f32 function built from if_/else_/end:
        // yields 0 when 0 > arg, otherwise the argument itself.
        // Checks relu(-2) and relu(2).
 97 |   struct Code : wasmblr::CodeGenerator {
 98 |     Code() : wasmblr::CodeGenerator() {
 99 |       auto if_func = function({f32}, {f32}, [&]() {
              // condition: 0.0 > arg
100 |         f32.const_(0.0f);
101 |         local.get(0);
102 |         f32.gt();
103 |         if_(f32);
104 |         f32.const_(0.0f);
105 |         else_();
106 |         local.get(0);
107 |         end();
108 |       });
109 |       export_(if_func, "relu");
110 |     }
111 |   };
112 |   Code c;
113 |   testJS(c, "console.log(instance.exports.relu(-2));", "0\n");
114 |   testJS(c, "console.log(instance.exports.relu(2));", "2\n");
115 | }
116 | 
117 | void testLoop() {
        // Exports "loop", a () -> i32 function that increments a declared i32
        // local inside loop/br_if for as long as it is < 10 (signed), then
        // returns it. The test expects 10.
118 |   struct Code : wasmblr::CodeGenerator {
119 |     Code() : wasmblr::CodeGenerator() {
120 |       auto loop_fn = function({}, {i32}, [&]() {
121 |         auto i = local(i32);
122 | 
123 |         loop(void_);
124 |         {
                // i = i + 1
125 |           local.get(i);
126 |           i32.const_(1);
127 |           i32.add();
128 |           local.set(i);
129 | 
                // branch back to the loop (label 0) while i < 10
130 |           local.get(i);
131 |           i32.const_(10);
132 |           i32.lt_s();
133 |           br_if(0);
134 |         }
135 |         end();
136 |         local.get(i);
137 |       });
138 |       export_(loop_fn, "loop");
139 |     }
140 |   };
141 |   Code c;
142 |   testJS(c, "console.log(instance.exports.loop());", "10\n");
143 | }
144 | 
145 | void testMemory() {
        // Declares a memory with min 1 / max 10 exported as "mem" and checks,
        // from JavaScript, that its buffer is 65536 bytes long before and
        // 131072 bytes long after mem.grow(1).
146 |   struct Code : wasmblr::CodeGenerator {
147 |     Code() : wasmblr::CodeGenerator() { memory(1, 10).export_("mem"); }
148 |   };
149 |   Code c;
150 |   testJS(c, R"(
151 | console.log(instance.exports.mem.buffer.byteLength);
152 | instance.exports.mem.grow(1);
153 | console.log(instance.exports.mem.buffer.byteLength);
154 |   )",
155 |          "65536\n131072\n");
156 | }
157 | 
158 | void testStore() {
        // Exports "store", which writes the i32 value 1337 to address 0 of the
        // exported memory; JavaScript then reads element 0 back through a
        // Uint32Array view and expects 1337.
159 |   struct Code : wasmblr::CodeGenerator {
160 |     Code() : wasmblr::CodeGenerator() {
161 |       memory(1, 10).export_("mem");
162 |       auto store = function({}, {}, [&]() {
163 |         i32.const_(0);     // index 0
164 |         i32.const_(1337);  // value 1337
165 |         i32.store(0, 0);   // align 0, offset 0
166 |       });
167 |       export_(store, "store");
168 |     }
169 |   };
170 |   Code c;
171 |   testJS(c, R"(
172 | instance.exports.store();
173 | console.log(new Uint32Array(instance.exports.mem.buffer)[0])
174 |   )",
175 |          "1337\n");
176 | }
177 | 
178 | void testSIMD() {
        // Exports "simd_square": loads a v128 from address 0, multiplies it by
        // itself with f32x4_mul and stores it back to address 0. JavaScript
        // seeds the first four floats with 1..4 and expects 1 4 9 16.
179 |   struct Code : wasmblr::CodeGenerator {
180 |     Code() : wasmblr::CodeGenerator() {
181 |       memory(1, 10).export_("mem");
182 |       auto square = function({}, {}, [&]() {
183 |         auto vec = local(v128);
              // vec = load(address 0), using the default alignment/offset
184 |         i32.const_(0);
185 |         v128.load();
186 |         local.set(vec);
187 | 
              // vec = vec * vec (four f32 lanes)
188 |         local.get(vec);
189 |         local.get(vec);
190 |         v128.f32x4_mul();
191 |         local.set(vec);
192 | 
              // store(address 0, vec)
193 |         i32.const_(0);
194 |         local.get(vec);
195 |         v128.store();
196 |       });
197 |       export_(square, "simd_square");
198 |     }
199 |   };
200 |   Code c;
201 |   testJS(c, R"(
202 | let inp = new Float32Array(instance.exports.mem.buffer);
203 | inp[0] = 1;
204 | inp[1] = 2;
205 | inp[2] = 3;
206 | inp[3] = 4;
207 | instance.exports.simd_square();
208 | console.log(inp[0], inp[1], inp[2], inp[3]);
209 |   )",
210 |          "1 4 9 16\n");
211 | }
212 | 
213 | void testSIMDShift() {
        // Loads a v128 from address 0, shifts its i32 lanes left by 2 with
        // i32x4_shl and stores it back. JavaScript seeds four ints with 1..4
        // and expects 4 8 12 16.
        // NOTE(review): the local `square` and the export name "simd_square"
        // are carried over from testSIMD even though this function shifts.
214 |   struct Code : wasmblr::CodeGenerator {
215 |     Code() : wasmblr::CodeGenerator() {
216 |       memory(1, 10).export_("mem");
217 |       auto square = function({}, {}, [&]() {
218 |         auto vec = local(v128);
              // vec = load(address 0)
219 |         i32.const_(0);
220 |         v128.load();
221 |         local.set(vec);
222 | 
              // vec = vec << 2 (per i32 lane); the shift count is an i32 operand
223 |         local.get(vec);
224 |         i32.const_(2);
225 |         v128.i32x4_shl();
226 |         local.set(vec);
227 | 
              // store(address 0, vec)
228 |         i32.const_(0);
229 |         local.get(vec);
230 |         v128.store();
231 |       });
232 |       export_(square, "simd_square");
233 |     }
234 |   };
235 |   Code c;
236 |   testJS(c, R"(
237 | let inp = new Int32Array(instance.exports.mem.buffer);
238 | inp[0] = 1;
239 | inp[1] = 2;
240 | inp[2] = 3;
241 | inp[3] = 4;
242 | instance.exports.simd_square();
243 | console.log(inp[0], inp[1], inp[2], inp[3]);
244 |   )",
245 |          "4 8 12 16\n");
246 | }
247 | 
248 | int main() {
        // Runs every test in sequence. Each test asserts internally, so
        // "pass." is printed only when no assertion has aborted the program.
249 |   testBasic();
250 |   testConstant();
251 |   testRecursive();
252 |   testIfStatement();
253 |   testLoop();
254 |   testMemory();
255 |   testStore();
256 |   testSIMD();
257 |   testSIMDShift();
258 |   std::cout << "pass.\n";
259 | }
260 | 


--------------------------------------------------------------------------------
/thread_example/README.md:
--------------------------------------------------------------------------------
 1 | # Demo of how to do threading in WASM
 2 | 
 3 | The WebAssembly threading story is still a bit messy.
 4 | This folder contains a minimal working example of
 5 | two threads sharing the task of calculating the square of an input.
 6 | It uses modern techniques and should be useful going forward.
 7 | 
 8 | ## Concept
 9 | 
10 | Each thread will be given every other element. The WASM function
11 | `square` is defined to take an offset (into the input and output).
12 | We will call `square` twice with two different offsets and that will
13 | yield the full result.
14 | 
15 | Specifically, `square(4, 8)` denotes starting on the 4th byte and squaring all elements 8 bytes apart.
16 | It should hopefully be clear how this can be run twice in parallel
17 | with different first arguments (0 and 4) to get the full result.
18 | 
19 | ## Generate `square.wasm`
20 | 
21 | ```
22 | g++ thread.cc -I../ -o thread
23 | ./thread
24 | ```
25 | 
26 | ## Then run the server with proper cross origin isolation
27 | 
28 | This is a hacked python http.server that will 
29 | serve up the proper isolation (COOP + COEP).
30 | Read more here: 
31 | [MDN](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/SharedArrayBuffer#security_requirements)
32 | 
33 | ```
34 | python3 -m server
35 | ```
36 | 
37 | ## Now open the browser and look in the console
38 | 
39 | Navigate to `localhost:8000` and you should see the output of each thread (both around 0.09).
40 | We are looking at the result of `main.mjs` and `worker.js` interacting.
41 | 


--------------------------------------------------------------------------------
/thread_example/index.html:
--------------------------------------------------------------------------------
1 | 
2 | 

3 | 


--------------------------------------------------------------------------------
/thread_example/main.mjs:
--------------------------------------------------------------------------------
 1 | async function launch_threads() {
       // Fetches and compiles square.wasm, creates one shared memory, and
       // starts two workers that each receive the compiled module, the memory,
       // a byte offset and a byte stride. When both workers have reported
       // back, the first two output values are shown on the page and logged.
 2 |   const response = await fetch('./square.wasm');
 3 |   const wasm = await response.arrayBuffer();
 4 |   const wasm_module = await WebAssembly.compile(wasm);
 5 |   const len = 1024;
       // NOTE(review): `/` is not integer division in JavaScript, so this
       // evaluates to 1.125 -- presumably the engine coerces it to 1 page when
       // the memory is created; confirm, or wrap in Math.floor.
 6 |   const pages = (len * 2 * 4) / (1 << 16) + 1;
 7 |   let memory = new WebAssembly.Memory({
 8 |     initial: pages,
 9 |     maximum: pages + 1,
10 |     shared: true
11 |   });
       // NOTE(review): wasm_instance is created here but never used below;
       // the workers instantiate the module themselves.
12 |   const wasm_instance = await WebAssembly.instantiate(wasm_module, {
13 |     env: {
14 |       memory: memory
15 |     }
16 |   });
17 | 
       // Two float views over the same memory: 1024 inputs at byte 0 and
       // 1024 outputs starting at byte 1024 * 4.
18 |   const input = new Float32Array(memory.buffer, 0, 1024);
19 |   const output = new Float32Array(memory.buffer, 1024 * 4, 1024);
20 |   input[0] = 0.3;
21 |   input[1] = 0.3;
22 |   input[2] = 0.3;
23 |   const worker0 = new Worker('./worker.js');
24 |   const worker1 = new Worker('./worker.js');
       // Each handler marks its own worker as done and reports the result only
       // if the other worker has already finished, so the report happens once,
       // in whichever handler runs last.
25 |   let w0_done = false;
26 |   let w1_done = false;
27 |   worker0.addEventListener('message', function(e) {
28 |     w0_done = true;
29 |     if (w1_done) {
30 |       document.getElementById('output').textContent = output[0] + ", " + output[1];
31 |       console.log(output[0], output[1]);
32 |     }
33 |   });
34 |   worker1.addEventListener('message', function(e) {
35 |     w1_done = true;
36 |     if (w0_done) {
37 |       document.getElementById('output').textContent = output[0] + ", " + output[1];
38 |       console.log(output[0], output[1]);
39 |     }
40 |   });
       // Message layout: [module, memory, byte offset, byte stride]. Offsets 0
       // and 4 with stride 8 give the workers alternating 4-byte elements.
41 |   worker0.postMessage([wasm_module, memory, 0, 8]);
42 |   worker1.postMessage([wasm_module, memory, 4, 8]);
43 | }
44 | 
45 | launch_threads();
46 | 


--------------------------------------------------------------------------------
/thread_example/server.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | from http.server import HTTPServer, SimpleHTTPRequestHandler, test
 3 | import sys
 4 | 
 5 | class CORSRequestHandler (SimpleHTTPRequestHandler):
         # SimpleHTTPRequestHandler that adds two headers to every response
         # before the header block is finished:
         #   Cross-Origin-Opener-Policy: same-origin
         #   Cross-Origin-Embedder-Policy: require-corp
         # Despite the class name, these are the COOP/COEP isolation headers,
         # not CORS headers.
 6 |     def end_headers (self):
 7 |         self.send_header('Cross-Origin-Opener-Policy', 'same-origin')
 8 |         self.send_header('Cross-Origin-Embedder-Policy', 'require-corp')
             # Delegate to the base class to finish the header block.
 9 |         SimpleHTTPRequestHandler.end_headers(self)
10 | 
11 | if __name__ == '__main__':
         # Serve with http.server's `test` helper; the port is the first
         # command-line argument when given, otherwise 8000.
12 |     test(CORSRequestHandler, HTTPServer, port=int(sys.argv[1]) if len(sys.argv) > 1 else 8000)
13 | 


--------------------------------------------------------------------------------
/thread_example/thread.cc:
--------------------------------------------------------------------------------
 1 | #include "wasmblr.h"
 2 | #include 
 3 | 
 4 | int main() {
       // Generates square.wasm. The module imports a shared memory as
       // env.memory and exports square(offset, stride): starting at byte
       // `offset` and advancing by `stride` bytes, it squares the f32 at each
       // visited address and stores the result len * 4 bytes further on (the
       // output array), for every address below len * 4.
 5 | 
 6 |   int64_t len = 1024;
 7 | 
 8 |   wasmblr::CodeGenerator cg;
 9 | 
10 |   // shared() makes threading possible
       // Enough 64 KiB pages for len inputs plus len outputs of 4 bytes each.
11 |   auto pages = (len * 2 * 4) / (1 << 16) + 1;
12 |   cg.memory(pages, pages + 1).import_("env", "memory").shared();
13 | 
14 |   auto square = cg.function({cg.i32, cg.i32}, {}, [&]() {
         // iter = offset (parameter 0)
15 |     auto iter = cg.local(cg.i32);
16 |     //cg.i32.const_(0);
17 |     cg.local.get(0);
18 |     cg.local.set(iter);
19 | 
20 |     cg.loop(cg.void_);
21 |     {
           // store address (the store below adds a static offset of len * 4)
22 |       cg.local.get(iter);
23 | 
           // load the input value twice and multiply the copies
24 |       cg.local.get(iter);
25 |       cg.f32.load(0, 0);
26 | 
27 |       cg.local.get(iter);
28 |       cg.f32.load(0, 0);
29 | 
30 |       cg.f32.mul();
31 |       cg.f32.store(0, len * 4);
32 | 
           // iter += stride (parameter 1)
33 |       //cg.i32.const_(4);
34 |       cg.local.get(1);
35 |       cg.local.get(iter);
36 |       cg.i32.add();
37 |       cg.local.set(iter);
38 | 
           // Continue while len * 4 > iter. The comparison is strict: with
           // ">=" a worker that lands exactly on iter == len * 4 would run one
           // more iteration, reading from the output region and writing past
           // the end of it.
39 |       cg.i32.const_(len * 4);
40 |       cg.local.get(iter);
41 |       cg.i32.gt_u();
42 |       cg.br_if(0);
43 |     }
44 |     cg.end();
45 |     
46 |   });
47 | 
48 |   cg.export_(square, "square");
49 | 
50 |   // write to a loadable binary
51 |   std::ofstream wasm("square.wasm", std::ios::binary);
52 |   auto bytes = cg.emit();
53 |   wasm.write((char*)bytes.data(), bytes.size());
54 | 
55 | }
56 | 


--------------------------------------------------------------------------------
/thread_example/worker.js:
--------------------------------------------------------------------------------
 1 | self.addEventListener('message', function(e) {
       // Expects e.data to be [module, memory, offset, stride]. Instantiates
       // the module against the received memory (imported as env.memory),
       // calls its exported square(offset, stride), then posts "done" back.
 2 |   const wasm_module = e.data[0];
 3 |   const memory = e.data[1];
 4 |   WebAssembly.instantiate(wasm_module, {
 5 |     env: {
 6 |       memory: memory
 7 |     }
 8 |   }).then((wasm_instance) => {
 9 |     const square = wasm_instance.exports.square;
10 |     const off = e.data[2];
11 |     const stride = e.data[3];
12 |     square(off, stride);
13 |     self.postMessage("done");
14 |   });
15 | });


--------------------------------------------------------------------------------
/wasmblr.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include 
  4 | #include 
  5 | #include 
  6 | #include 
  7 | #include 
  8 | #include 
  9 | #include 
 10 | #include 
 11 | 
 12 | namespace wasmblr {
 13 | 
 14 | constexpr std::array magic_module_header = {0x00, 0x61, 0x73, 0x6d};  // the bytes "\0asm"
 15 | constexpr std::array module_version = {0x01, 0x00, 0x00, 0x00};  // bytes 01 00 00 00; presumably version 1, little-endian
 16 | 
 17 | struct CodeGenerator;
 18 | 
 19 | class Local {
       // Handle for the locals of the function currently being generated.
       // Calling the object with a type declares a new local and returns its
       // index; get/set/tee emit the corresponding local instruction for an
       // index. Only CodeGenerator (a friend) can construct one.
 20 |  public:
 21 |   int operator()(uint8_t type);
 22 |   void get(int idx);
 23 |   void set(int idx);
 24 |   void tee(int idx);
 25 | 
 26 |  private:
 27 |   Local(CodeGenerator& cg_) : cg(cg_) {}
 28 |   CodeGenerator& cg;
 29 |   friend CodeGenerator;
 30 | };
 31 | 
 32 | class I32 {
       // The i32 instruction set of the generator. The object also converts
       // to uint8_t (its value-type byte) so it can be used wherever a type
       // is expected. Each method emits one instruction into the owning
       // CodeGenerator. Only CodeGenerator (a friend) can construct one.
 33 |  public:
 34 |   operator uint8_t();
 35 |   void const_(int32_t i);
 36 |   void clz();
 37 |   void ctz();
 38 |   void popcnt();
 39 |   void lt_s();
 40 |   void lt_u();
 41 |   void gt_s();
 42 |   void gt_u();
 43 |   void le_s();
 44 |   void le_u();
 45 |   void ge_s();
 46 |   void ge_u();
 47 |   void add();
 48 |   void sub();
 49 |   void mul();
 50 |   void div_s();
 51 |   void div_u();
 52 |   void rem_s();
 53 |   void rem_u();
 54 |   void and_();
 55 |   void or_();
 56 |   void xor_();
 57 |   void shl();
 58 |   void shr_s();
 59 |   void shr_u();
 60 |   void rotl();
 61 |   void rotr();
 62 |   void eqz();
 63 |   void eq();
 64 |   void ne();
 65 | 
        // Memory access: both arguments are written after the opcode as
        // given; they default to alignment 1 and offset 0.
 66 |   void load(uint32_t alignment = 1, uint32_t offset = 0);
 67 |   void store(uint32_t alignment = 1, uint32_t offset = 0);
 68 | 
 69 |   void load8_s(uint32_t alignment = 1, uint32_t offset = 0);
 70 |   void load8_u(uint32_t alignment = 1, uint32_t offset = 0);
 71 |   void load16_s(uint32_t alignment = 1, uint32_t offset = 0);
 72 |   void load16_u(uint32_t alignment = 1, uint32_t offset = 0);
 73 |   void store8(uint32_t alignment = 1, uint32_t offset = 0);
 74 |   void store16(uint32_t alignment = 1, uint32_t offset = 0);
 75 | 
 76 |  private:
 77 |   I32(CodeGenerator& cg_) : cg(cg_) {}
 78 |   CodeGenerator& cg;
 79 |   friend CodeGenerator;
 80 | };
 81 | 
 82 | class F32 {
       // The f32 instruction set of the generator. The object also converts
       // to uint8_t (its value-type byte) so it can be used wherever a type
       // is expected. Each method emits one instruction into the owning
       // CodeGenerator. Only CodeGenerator (a friend) can construct one.
 83 |  public:
 84 |   operator uint8_t();
 85 |   void const_(float f);
 86 |   void eq();
 87 |   void ne();
 88 |   void lt();
 89 |   void gt();
 90 |   void le();
 91 |   void ge();
 92 |   void abs();
 93 |   void neg();
 94 |   void ceil();
 95 |   void floor();
 96 |   void trunc();
 97 |   void nearest();
 98 |   void sqrt();
 99 |   void add();
100 |   void sub();
101 |   void mul();
102 |   void div();
103 |   void min();
104 |   void max();
105 |   void copysign();
106 | 
        // Memory access; arguments default to alignment 1 and offset 0.
107 |   void load(uint32_t alignment = 1, uint32_t offset = 0);
108 |   void store(uint32_t alignment = 1, uint32_t offset = 0);
109 | 
110 |  private:
111 |   F32(CodeGenerator& cg_) : cg(cg_) {}
112 |   CodeGenerator& cg;
113 |   friend CodeGenerator;
114 | };
115 | 
116 | class V128 {
       // The v128 (SIMD) instruction set of the generator, covering i32x4 and
       // f32x4 lane operations plus whole-vector bitwise operations. The
       // object also converts to uint8_t (its value-type byte). Only
       // CodeGenerator (a friend) can construct one.
117 |  public:
118 |   operator uint8_t();
119 | 
        // Lane access takes the lane index as an argument.
120 |   void i32x4_extract_lane(uint8_t lane);
121 |   void i32x4_replace_lane(uint8_t lane);
122 |   void f32x4_extract_lane(uint8_t lane);
123 |   void f32x4_replace_lane(uint8_t lane);
124 |   void i32x4_splat();
125 |   void f32x4_splat();
126 |   void i32x4_eq();
127 |   void i32x4_ne();
128 |   void i32x4_lt_s();
129 |   void i32x4_lt_u();
130 |   void i32x4_gt_s();
131 |   void i32x4_gt_u();
132 |   void i32x4_le_s();
133 |   void i32x4_le_u();
134 |   void i32x4_ge_s();
135 |   void i32x4_ge_u();
136 |   void f32x4_eq();
137 |   void f32x4_ne();
138 |   void f32x4_lt();
139 |   void f32x4_gt();
140 |   void f32x4_le();
141 |   void f32x4_ge();
142 |   void not_();
143 |   void any_true();
144 |   void and_();
145 |   void andnot();
146 |   void or_();
147 |   void xor_();
148 |   void i32x4_abs();
149 |   void i32x4_neg();
150 |   void i32x4_all_true();
151 |   void i32x4_bitmask();
152 |   void i32x4_shl();
153 |   void i32x4_shr_s();
154 |   void i32x4_shr_u();
155 |   void i32x4_add();
156 |   void i32x4_sub();
157 |   void i32x4_mul();
158 |   void i32x4_min_s();
159 |   void i32x4_min_u();
160 |   void i32x4_max_s();
161 |   void i32x4_max_u();
162 |   void f32x4_ceil();
163 |   void f32x4_floor();
164 |   void f32x4_trunc();
165 |   void f32x4_nearest();
166 |   void f32x4_abs();
167 |   void f32x4_neg();
168 |   void f32x4_sqrt();
169 |   void f32x4_add();
170 |   void f32x4_sub();
171 |   void f32x4_mul();
172 |   void f32x4_div();
173 |   void f32x4_min();
174 |   void f32x4_max();
175 |   void f32x4_pmin();
176 |   void f32x4_pmax();
177 | 
        // Memory access; arguments default to alignment 1 and offset 0.
178 |   void load(uint32_t alignment = 1, uint32_t offset = 0);
179 |   void load32x2_s(uint32_t alignment = 1, uint32_t offset = 0);
180 |   void load32x2_u(uint32_t alignment = 1, uint32_t offset = 0);
181 |   void load32_splat(uint32_t alignment = 1, uint32_t offset = 0);
182 |   void load32_zero(uint32_t alignment = 1, uint32_t offset = 0);
183 |   void store(uint32_t alignment = 1, uint32_t offset = 0);
184 | 
185 |  private:
186 |   V128(CodeGenerator& cg_) : cg(cg_) {}
187 |   CodeGenerator& cg;
188 |   friend CodeGenerator;
189 | };
190 | 
191 | class Memory {
       // Description of the module's memory plus the memory instructions.
       // The call operators and export_/shared/import_ return *this, so they
       // can be chained, e.g. memory(1, 10).export_("mem"). The method
       // definitions are outside this part of the file.
192 |  public:
193 |   Memory& operator()(uint32_t min);
194 |   Memory& operator()(uint32_t min, uint32_t max);
195 |   Memory& export_(std::string);
196 |   Memory& shared(bool = true);
197 |   Memory& import_(std::string, std::string);
198 |   void size();
199 |   void grow();
200 | 
201 |  private:
202 |   Memory(CodeGenerator& cg_) : cg(cg_) {}
203 |   CodeGenerator& cg;
204 |   uint32_t min = 0;
205 |   uint32_t max = 0;
206 |   bool is_shared = false;
        // One pair of strings encodes both roles: both non-empty means the
        // memory is imported, only a_string non-empty means it is exported
        // (presumably module/field names and export name respectively).
207 |   std::string a_string = "";
208 |   std::string b_string = "";
209 |   bool is_import() const { return a_string.size() && b_string.size(); }
210 |   bool is_export() const { return a_string.size() && !b_string.size(); }
211 |   friend CodeGenerator;
212 | };
213 | 
214 | struct Function {
       // Record of one function: its parameter and result types, the callback
       // that generates its body, and the locals declared while that callback
       // runs.
215 |   Function(std::vector input_types_,
216 |            std::vector output_types_)
217 |       : input_types(input_types_), output_types(output_types_) {}
218 |   Function(std::vector input_types_,
219 |            std::vector output_types_,
220 |            std::function body_)
221 |       : input_types(input_types_), output_types(output_types_), body(body_) {}
222 |   std::vector input_types;
223 |   std::vector output_types;
224 |   std::function body;
225 |   std::vector locals;  // resolved later
        // Runs the body callback. Locals are cleared first, so calling emit()
        // again starts from an empty list.
226 |   void emit() {
227 |     locals.clear();
228 |     body();
229 |   };
230 | };
231 | 
232 | struct CodeGenerator {
       // The assembler itself. The "API" part exposes the per-type instruction
       // objects (local, i32, f32, v128, memory), the control-flow
       // instructions, function declaration/export, and emit(), which returns
       // the module bytes. The "Implementation" part holds the function
       // table, the scratch byte buffer, a running type stack used for
       // assert-based checking, and the encoding helpers.
233 |   // API
234 |   Local local;
235 |   I32 i32;
236 |   F32 f32;
237 |   V128 v128;
238 |   Memory memory;
239 |   uint8_t void_ = 0x40;
240 | 
241 |   void nop();
242 |   void block(uint8_t type);
243 |   void loop(uint8_t type);
244 |   void if_(uint8_t type);
245 |   void else_();
246 |   void br(uint32_t labelidx);
247 |   void br_if(uint32_t labelidx);
248 |   void end();
249 |   void call(uint32_t funcidx);
250 | 
251 |   void export_(uint32_t fn_idx, std::string name);
252 | 
253 |   // returns function index
254 |   uint32_t function(std::vector input_types,
255 |                     std::vector output_types,
256 |                     std::function body);
257 | 
258 |   std::vector emit();
259 | 
260 |   // Implementation
261 | 
        // The instruction objects keep a reference to this generator, which is
        // why copying and moving are disabled.
262 |   CodeGenerator()
263 |       : local(*this), i32(*this), f32(*this), v128(*this), memory(*this) {}
264 |   CodeGenerator(const CodeGenerator&) = delete;
265 |   CodeGenerator(CodeGenerator&&) = delete;
266 | 
267 |   std::vector functions_;
268 |   std::unordered_map exported_functions_;
269 |   Function* cur_function_ = nullptr;
270 |   // cur_bytes_ is used as a temporary storage
271 |   std::vector cur_bytes_;
272 |   // a running type checker, purely for safety
273 |   std::stack type_stack_;
274 | 
275 |   using memarg = std::pair;
276 | 
277 |   // From LLVM
        // Signed LEB128: 7 bits per byte, low bits first; 0x80 marks "more
        // bytes follow". Stops once the remaining value is pure sign extension
        // of bit 6 of the last byte written.
278 |   std::vector encode_signed(int32_t n) {
279 |     std::vector out;
280 |     auto more = true;
281 |     do {
282 |       uint8_t byte = n & 0x7f;
283 |       n >>= 7;
284 |       more = !((((n == 0) && ((byte & 0x40) == 0)) ||
285 |                 ((n == -1) && ((byte & 0x40) != 0))));
286 |       if (more) {
287 |         byte |= 0x80;
288 |       }
289 |       out.emplace_back(byte);
290 |     } while (more);
291 |     return out;
292 |   }
293 | 
        // Unsigned LEB128: 7 bits per byte, low bits first; 0x80 marks "more
        // bytes follow". Zero is encoded as the single byte 0x00.
294 |   std::vector encode_unsigned(uint32_t n) {
295 |     std::vector out;
296 |     do {
297 |       uint8_t byte = n & 0x7f;
298 |       n >>= 7;
299 |       if (n != 0) {
300 |         byte |= 0x80;
301 |       }
302 |       out.emplace_back(byte);
303 |     } while (n != 0);
304 |     return out;
305 |   }
306 | 
        // Encodes a name as its byte length followed by its bytes. The length
        // is written with encode_unsigned (LEB128) so that names of 128 bytes
        // or more get a correct multi-byte prefix; for shorter names this is
        // the same single byte as a raw size would be.
307 |   std::vector encode_string(std::string s) {
308 |     auto out = encode_unsigned(static_cast<uint32_t>(s.size()));
310 |     for (const auto& c : s) {
311 |       out.emplace_back(c);
312 |     }
313 |     return out;
314 |   }
315 | 
        // Appends every element of inp to out.
316 |   template 
317 |   void concat(std::vector& out, const T& inp) {
318 |     out.insert(out.end(), inp.begin(), inp.end());
319 |   };
320 | 
        // Adds a local of the given type to the current function and returns
        // its index. Indices continue after the parameters, so the first
        // declared local has index input_types.size().
321 |   int declare_local(uint8_t type) {
322 |     assert(cur_function_);
323 |     int idx = cur_function_->locals.size() + cur_function_->input_types.size();
324 |     cur_function_->locals.emplace_back(type);
325 |     return idx;
326 |   }
327 | 
        // Parameter types of the function currently being generated.
328 |   const std::vector& input_types() {
329 |     assert(cur_function_);
330 |     return cur_function_->input_types;
331 |   }
332 | 
        // Locals declared so far in the function currently being generated.
333 |   const std::vector& locals() {
334 |     assert(cur_function_);
335 |     return cur_function_->locals;
336 |   }
337 | 
        // Type-stack bookkeeping used by the instruction methods.
338 |   void push(uint8_t type) { type_stack_.push(type); };
339 | 
340 |   uint8_t pop() {
341 |     assert(type_stack_.size() && "popping empty stack");
342 |     auto type = type_stack_.top();
343 |     type_stack_.pop();
344 |     return type;
345 |   };
346 | 
        // Byte-level output into cur_bytes_: one byte, a byte sequence, or a
        // memarg written as two unsigned LEB128 values (first, then second).
347 |   void emit(uint8_t byte) { cur_bytes_.emplace_back(byte); }
348 |   void emit(std::vector bytes) { concat(cur_bytes_, bytes); }
349 |   void emit(const memarg& m) {
350 |     emit(encode_unsigned(std::get<0>(m)));
351 |     emit(encode_unsigned(std::get<1>(m)));
352 |   }
353 | };
354 | 
355 | inline int Local::operator()(uint8_t type) {
        // Declares a new local of `type` in the current function and returns
        // its index (forwarded to CodeGenerator::declare_local).
356 |   return cg.declare_local(type);
357 | };
358 | 
359 | inline void Local::set(int idx) {
        // Emits opcode 0x21 followed by the index. Pops the value's type from
        // the type stack and asserts that it matches the type of the
        // addressed parameter or local.
360 |   auto t = cg.pop();
361 |   const auto& input_types = cg.input_types();
        // Indices below the parameter count address parameters; the rest
        // address declared locals.
        // NOTE(review): idx is a signed int compared against size(); a
        // negative idx would not take the parameter branch.
362 |   auto expected_type = [&]() {
363 |     if (idx < input_types.size()) {
364 |       return input_types.at(idx);
365 |     }
366 |     return cg.locals().at(idx - input_types.size());
367 |   }();
368 |   assert(expected_type == t && "can't set local to this value (wrong type)");
369 | 
370 |   cg.emit(0x21);
371 |   cg.emit(cg.encode_unsigned(idx));
372 | }
373 | 
374 | inline void Local::get(int idx) {
        // Emits opcode 0x20 followed by the index, and pushes the type of the
        // addressed parameter or local onto the type stack. Indices below the
        // parameter count address parameters; the rest address declared
        // locals.
375 |   const auto& input_types = cg.input_types();
376 |   if (idx < input_types.size()) {
377 |     cg.push(input_types.at(idx));
378 |   } else {
379 |     cg.push(cg.locals().at(idx - input_types.size()));
380 |   }
381 | 
382 |   cg.emit(0x20);
383 |   cg.emit(cg.encode_unsigned(idx));
384 | }
385 | 
386 | inline void Local::tee(int idx) {
        // Emits opcode 0x22 followed by the index. Type-checks like set(),
        // but pushes the type back afterwards, so the value stays on the
        // type stack.
387 |   auto t = cg.pop();
388 |   const auto& input_types = cg.input_types();
389 |   auto expected_type = [&]() {
390 |     if (idx < input_types.size()) {
391 |       return input_types.at(idx);
392 |     }
393 |     return cg.locals().at(idx - input_types.size());
394 |   }();
395 |   assert(expected_type == t && "can't set local to this value (wrong type)");
396 | 
397 |   cg.emit(0x22);
398 |   cg.emit(cg.encode_unsigned(idx));
399 |   cg.push(expected_type);
400 | }
401 | 
402 | inline I32::operator uint8_t() {
        // Type byte used for i32 in signatures, locals and the type stack.
403 |   return 0x7f;
404 | }
405 | 
406 | inline void I32::const_(int32_t i) {
        // Emits opcode 0x41 followed by the value in the signed variable-
        // length encoding, and pushes i32 onto the type stack.
407 |   cg.emit(0x41);
408 |   cg.emit(cg.encode_signed(i));
409 |   cg.push(cg.i32);
410 | }
411 | 
412 | inline F32::operator uint8_t() {
        // Type byte used for f32 in signatures, locals and the type stack.
413 |   return 0x7d;
414 | }
415 | 
416 | inline void F32::const_(float f) {
        // Emits opcode 0x43 followed by the four bytes of the float, and
        // pushes f32 onto the type stack.
417 |   cg.emit(0x43);
        // The bytes are copied out in the host's memory order.
        // NOTE(review): presumably assumes a little-endian host -- confirm.
418 |   uint8_t r[4];
419 |   memcpy(&r, &f, sizeof(float));
420 |   for (auto i = 0; i < 4; ++i) {
421 |     cg.emit(r[i]);
422 |   }
423 |   cg.push(cg.f32);
424 | }
425 | 
426 | inline V128::operator uint8_t() {
        // Type byte used for v128 in signatures, locals and the type stack.
427 |   return 0x7b;
428 | }
429 | 
430 | #define UNARY_OP(classname, op, opcode, in_type, out_type) \
431 |   inline void classname::op() {                            \
432 |     bool valid = cg.pop() == cg.in_type;                   \
433 |     assert(valid && "invalid type for " #op);              \
434 |     cg.emit(opcode);                                       \
435 |     cg.push(cg.out_type);                                  \
436 |   }
      // UNARY_OP (above) defines classname::op() for a one-operand
      // instruction: pops one entry from the type stack, asserts that it is
      // in_type, emits opcode, and pushes out_type.
437 | 
438 | #define BINARY_OP(classname, op, opcode, type_a, type_b, out_type) \
439 |   inline void classname::op() {                                    \
440 |     bool valid = cg.pop() == cg.type_a && cg.pop() == cg.type_b;   \
441 |     assert(valid && "invalid type for " #op);                      \
442 |     cg.emit(opcode);                                               \
443 |     cg.push(cg.out_type);                                          \
444 |   }
      // BINARY_OP (above) defines classname::op() for a two-operand
      // instruction: type_a is checked against the top of the type stack
      // (the operand pushed last) and type_b against the entry below it;
      // then opcode is emitted and out_type is pushed.
      // Because of the short-circuit &&, the second pop only happens when the
      // first check succeeded.
445 | 
446 | #define LOAD_OP(classname, op, opcode, out_type)                   \
447 |   inline void classname::op(uint32_t alignment, uint32_t offset) { \
448 |     auto idx_type = cg.pop();                                      \
449 |     assert(idx_type == cg.i32);                                    \
450 |     cg.emit(opcode);                                               \
451 |     cg.emit(cg.encode_unsigned(alignment));                        \
452 |     cg.emit(cg.encode_unsigned(offset));                           \
453 |     cg.push(cg.out_type);                                          \
454 |   }
      // LOAD_OP (above) defines classname::op(alignment, offset) for a load:
      // pops the address type (asserted to be i32), emits opcode followed by
      // alignment and offset in the unsigned variable-length encoding, and
      // pushes out_type.
455 | 
456 | #define STORE_OP(classname, op, opcode)                            \
457 |   inline void classname::op(uint32_t alignment, uint32_t offset) { \
458 |     auto val_type = cg.pop();                                      \
459 |     auto idx_type = cg.pop();                                      \
460 |     assert(idx_type == cg.i32);                                    \
461 |     cg.emit(opcode);                                               \
462 |     cg.emit(cg.encode_unsigned(alignment));                        \
463 |     cg.emit(cg.encode_unsigned(offset));                           \
464 |   }
      // STORE_OP (above) defines classname::op(alignment, offset) for a
      // store: pops the value type and then the address type (asserted to be
      // i32), and emits opcode followed by alignment and offset. Nothing is
      // pushed.
      // NOTE(review): val_type is popped but never compared with the type
      // being stored.
465 | 
466 | UNARY_OP(I32, clz, 0x67, i32, i32);  // bit-counting ops: [i32] -> [i32]
467 | UNARY_OP(I32, ctz, 0x68, i32, i32);
468 | UNARY_OP(I32, popcnt, 0x69, i32, i32);
469 | BINARY_OP(I32, lt_s, 0x48, i32, i32, i32);  // comparisons: [i32, i32] -> [i32]
470 | BINARY_OP(I32, lt_u, 0x49, i32, i32, i32);
471 | BINARY_OP(I32, gt_s, 0x4a, i32, i32, i32);
472 | BINARY_OP(I32, gt_u, 0x4b, i32, i32, i32);
473 | BINARY_OP(I32, le_s, 0x4c, i32, i32, i32);
474 | BINARY_OP(I32, le_u, 0x4d, i32, i32, i32);
475 | BINARY_OP(I32, ge_s, 0x4e, i32, i32, i32);
476 | BINARY_OP(I32, ge_u, 0x4f, i32, i32, i32);
477 | BINARY_OP(I32, add, 0x6a, i32, i32, i32);  // arithmetic / bitwise: [i32, i32] -> [i32]
478 | BINARY_OP(I32, sub, 0x6b, i32, i32, i32);
479 | BINARY_OP(I32, mul, 0x6c, i32, i32, i32);
480 | BINARY_OP(I32, div_s, 0x6d, i32, i32, i32);
481 | BINARY_OP(I32, div_u, 0x6e, i32, i32, i32);
482 | BINARY_OP(I32, rem_s, 0x6f, i32, i32, i32);
483 | BINARY_OP(I32, rem_u, 0x70, i32, i32, i32);
484 | BINARY_OP(I32, and_, 0x71, i32, i32, i32);
485 | BINARY_OP(I32, or_, 0x72, i32, i32, i32);
486 | BINARY_OP(I32, xor_, 0x73, i32, i32, i32);
487 | BINARY_OP(I32, shl, 0x74, i32, i32, i32);
488 | BINARY_OP(I32, shr_s, 0x75, i32, i32, i32);
489 | BINARY_OP(I32, shr_u, 0x76, i32, i32, i32);
490 | BINARY_OP(I32, rotl, 0x77, i32, i32, i32);
491 | BINARY_OP(I32, rotr, 0x78, i32, i32, i32);
492 | UNARY_OP(I32, eqz, 0x45, i32, i32);  // eqz tests a single operand: [i32] -> [i32]
493 | BINARY_OP(I32, eq, 0x46, i32, i32, i32);
494 | BINARY_OP(I32, ne, 0x47, i32, i32, i32);
495 | LOAD_OP(I32, load, 0x28, i32);  // loads: [i32 address] -> [i32]
496 | LOAD_OP(I32, load8_s, 0x2c, i32);
497 | LOAD_OP(I32, load8_u, 0x2d, i32);
498 | LOAD_OP(I32, load16_s, 0x2e, i32);
499 | LOAD_OP(I32, load16_u, 0x2f, i32);
500 | STORE_OP(I32, store, 0x36);  // stores: address and value in, nothing out
501 | STORE_OP(I32, store8, 0x3a);
502 | STORE_OP(I32, store16, 0x3b);
503 | 
504 | BINARY_OP(F32, eq, 0x5b, f32, f32, i32);  // comparisons: [f32, f32] -> [i32]
505 | BINARY_OP(F32, ne, 0x5c, f32, f32, i32);
506 | BINARY_OP(F32, lt, 0x5d, f32, f32, i32);
507 | BINARY_OP(F32, gt, 0x5e, f32, f32, i32);
508 | BINARY_OP(F32, le, 0x5f, f32, f32, i32);
509 | BINARY_OP(F32, ge, 0x60, f32, f32, i32);
510 | UNARY_OP(F32, abs, 0x8B, f32, f32);  // unary math: [f32] -> [f32]
511 | UNARY_OP(F32, neg, 0x8C, f32, f32);
512 | UNARY_OP(F32, ceil, 0x8D, f32, f32);
513 | UNARY_OP(F32, floor, 0x8E, f32, f32);
514 | UNARY_OP(F32, trunc, 0x8F, f32, f32);
515 | UNARY_OP(F32, nearest, 0x90, f32, f32);
516 | UNARY_OP(F32, sqrt, 0x91, f32, f32);
517 | BINARY_OP(F32, add, 0x92, f32, f32, f32);  // binary math: [f32, f32] -> [f32]
518 | BINARY_OP(F32, sub, 0x93, f32, f32, f32);
519 | BINARY_OP(F32, mul, 0x94, f32, f32, f32);
520 | BINARY_OP(F32, div, 0x95, f32, f32, f32);
521 | BINARY_OP(F32, min, 0x96, f32, f32, f32);
522 | BINARY_OP(F32, max, 0x97, f32, f32, f32);
523 | BINARY_OP(F32, copysign, 0x98, f32, f32, f32);
524 | LOAD_OP(F32, load, 0x2a, f32);  // [i32 address] -> [f32]
525 | STORE_OP(F32, store, 0x38);  // address and value in, nothing out
526 | 
527 | #undef UNARY_OP  // the scalar helper macros are not used past this point
528 | #undef BINARY_OP
529 | #undef LOAD_OP
530 | #undef STORE_OP
531 | 
532 | #define VECTOR_LOAD(op, vopcode)                                              \
533 |   inline void V128::op(uint32_t alignment, uint32_t offset) {                 \
534 |     const auto addr_type = cg.pop(); /* the address operand */                \
535 |     assert(addr_type == cg.i32);                                              \
536 |     cg.emit(0xfd); /* byte that precedes every V128 opcode in this file */    \
537 |     cg.emit(cg.encode_unsigned(vopcode));                                     \
538 |     cg.emit(cg.encode_unsigned(alignment)); /* immediates: alignment, */      \
539 |     cg.emit(cg.encode_unsigned(offset));    /* then offset */                 \
540 |     cg.push(cg.v128); /* every vector load yields a v128 */                   \
541 |   }
542 | 
543 | VECTOR_LOAD(load, 0);  // each defines V128::<name>(alignment, offset)
544 | VECTOR_LOAD(load32x2_s, 5);
545 | VECTOR_LOAD(load32x2_u, 6);
546 | VECTOR_LOAD(load32_splat, 9);
547 | VECTOR_LOAD(load32_zero, 92);
548 | 
549 | inline void V128::store(uint32_t alignment, uint32_t offset) {  // [i32, v128] -> []
550 |   const auto stored_type = cg.pop();  // the value is on top of the stack
551 |   assert(stored_type == cg.v128);
552 |   const auto addr_type = cg.pop();  // the address sits beneath it
553 |   assert(addr_type == cg.i32);
554 |   cg.emit(0xfd);
555 |   cg.emit(cg.encode_unsigned(11));  // v128.store
556 |   cg.emit(cg.encode_unsigned(alignment));
557 |   cg.emit(cg.encode_unsigned(offset));
558 | }
559 | 
560 | inline void V128::i32x4_extract_lane(uint8_t lane) {  // [v128] -> [i32]
561 |   const auto vector_type = cg.pop();
562 |   assert(vector_type == cg.v128);
563 |   cg.emit(0xfd);
564 |   cg.emit(cg.encode_unsigned(27));  // i32x4.extract_lane
565 |   cg.emit(lane);  // lane immediate, written as given (not range-checked)
566 |   cg.push(cg.i32);
567 | }
568 | 
569 | inline void V128::f32x4_extract_lane(uint8_t lane) {  // [v128] -> [f32]
570 |   const auto vector_type = cg.pop();
571 |   assert(vector_type == cg.v128);
572 |   cg.emit(0xfd);
573 |   cg.emit(cg.encode_unsigned(31));  // f32x4.extract_lane
574 |   cg.emit(lane);  // lane immediate, written as given (not range-checked)
575 |   cg.push(cg.f32);
576 | }
577 | 
578 | inline void V128::i32x4_replace_lane(uint8_t lane) {  // [v128, i32] -> [v128]
579 |   const auto scalar_type = cg.pop();  // the replacement value is on top
580 |   assert(scalar_type == cg.i32);
581 |   const auto vector_type = cg.pop();  // the vector sits beneath it
582 |   assert(vector_type == cg.v128);
583 |   cg.emit(0xfd);
584 |   cg.emit(cg.encode_unsigned(28));  // i32x4.replace_lane
585 |   cg.emit(lane);  // lane immediate, written as given (not range-checked)
586 |   cg.push(cg.v128);
587 | }
588 | 
589 | inline void V128::f32x4_replace_lane(uint8_t lane) {  // [v128, f32] -> [v128]
590 |   const auto scalar_type = cg.pop();  // the replacement value is on top
591 |   assert(scalar_type == cg.f32);
592 |   const auto vector_type = cg.pop();  // the vector sits beneath it
593 |   assert(vector_type == cg.v128);
594 |   cg.emit(0xfd);
595 |   cg.emit(cg.encode_unsigned(32));  // f32x4.replace_lane
596 |   cg.emit(lane);  // lane immediate, written as given (not range-checked)
597 |   cg.push(cg.v128);
598 | }
599 | 
600 | inline void V128::i32x4_splat() {  // [i32] -> [v128]
601 |   const auto scalar_type = cg.pop();
602 |   assert(scalar_type == cg.i32);
603 |   cg.emit(0xfd);
604 |   cg.emit(cg.encode_unsigned(17));  // i32x4.splat
605 |   cg.push(cg.v128);
606 | }
607 | 
608 | inline void V128::f32x4_splat() {  // [f32] -> [v128]
609 |   const auto scalar_type = cg.pop();
610 |   assert(scalar_type == cg.f32);
611 |   cg.emit(0xfd);
612 |   cg.emit(cg.encode_unsigned(19));  // f32x4.splat
613 |   cg.push(cg.v128);
614 | }
615 | 
616 | #define VECTOR_BINARY_OP(op, vopcode, a_type, b_type, out_type)        \
617 |   inline void V128::op() { /* two operands in, one result out */       \
618 |     const auto rhs = cg.pop(); /* the second operand is on top */      \
619 |     assert(cg.b_type == rhs);                                          \
620 |     const auto lhs = cg.pop();                                         \
621 |     assert(cg.a_type == lhs);                                          \
622 |     cg.emit(0xfd);                                                     \
623 |     cg.emit(cg.encode_unsigned(vopcode));                              \
624 |     cg.push(cg.out_type); /* result type goes back on the stack */     \
625 |   }
626 | 
627 | #define VECTOR_UNARY_OP(op, vopcode, inp_type, out_type)               \
628 |   inline void V128::op() { /* one operand in, one result out */        \
629 |     const auto operand = cg.pop();                                     \
630 |     assert(cg.inp_type == operand);                                    \
631 |     cg.emit(0xfd);                                                     \
632 |     cg.emit(cg.encode_unsigned(vopcode));                              \
633 |     cg.push(cg.out_type); /* result type goes back on the stack */     \
634 |   }
635 | 
636 | VECTOR_BINARY_OP(i32x4_eq, 55, v128, v128, v128);  // i32x4 comparisons: [v128, v128] -> [v128]
637 | VECTOR_BINARY_OP(i32x4_ne, 56, v128, v128, v128);
638 | VECTOR_BINARY_OP(i32x4_lt_s, 57, v128, v128, v128);
639 | VECTOR_BINARY_OP(i32x4_lt_u, 58, v128, v128, v128);
640 | VECTOR_BINARY_OP(i32x4_gt_s, 59, v128, v128, v128);
641 | VECTOR_BINARY_OP(i32x4_gt_u, 60, v128, v128, v128);
642 | VECTOR_BINARY_OP(i32x4_le_s, 61, v128, v128, v128);
643 | VECTOR_BINARY_OP(i32x4_le_u, 62, v128, v128, v128);
644 | VECTOR_BINARY_OP(i32x4_ge_s, 63, v128, v128, v128);
645 | VECTOR_BINARY_OP(i32x4_ge_u, 64, v128, v128, v128);
646 | 
647 | VECTOR_BINARY_OP(f32x4_eq, 65, v128, v128, v128);  // f32x4 comparisons: [v128, v128] -> [v128]
648 | VECTOR_BINARY_OP(f32x4_ne, 66, v128, v128, v128);
649 | VECTOR_BINARY_OP(f32x4_lt, 67, v128, v128, v128);
650 | VECTOR_BINARY_OP(f32x4_gt, 68, v128, v128, v128);
651 | VECTOR_BINARY_OP(f32x4_le, 69, v128, v128, v128);
652 | VECTOR_BINARY_OP(f32x4_ge, 70, v128, v128, v128);
653 | 
654 | VECTOR_UNARY_OP(not_, 77, v128, v128);
655 | VECTOR_UNARY_OP(any_true, 83, v128, i32);  // yields a scalar i32
656 | 
657 | // TODO: bitselect (opcode 82) takes three operands and needs a ternary macro.
658 | // VECTOR_TERNARY_OP(bitselect, 82, v128);
659 | 
660 | VECTOR_BINARY_OP(and_, 78, v128, v128, v128);  // bitwise ops on whole vectors
661 | VECTOR_BINARY_OP(andnot, 79, v128, v128, v128);
662 | VECTOR_BINARY_OP(or_, 80, v128, v128, v128);
663 | VECTOR_BINARY_OP(xor_, 81, v128, v128, v128);
664 | 
665 | VECTOR_UNARY_OP(i32x4_abs, 160, v128, v128);  // i32x4 arithmetic
666 | VECTOR_UNARY_OP(i32x4_neg, 161, v128, v128);
667 | VECTOR_UNARY_OP(i32x4_all_true, 163, v128, i32);  // yields a scalar i32
668 | VECTOR_UNARY_OP(i32x4_bitmask, 164, v128, i32);   // yields a scalar i32
669 | VECTOR_BINARY_OP(i32x4_shl, 171, v128, i32, v128);  // shifts take a scalar i32 as second operand
670 | VECTOR_BINARY_OP(i32x4_shr_s, 172, v128, i32, v128);
671 | VECTOR_BINARY_OP(i32x4_shr_u, 173, v128, i32, v128);
672 | VECTOR_BINARY_OP(i32x4_add, 174, v128, v128, v128);
673 | VECTOR_BINARY_OP(i32x4_sub, 177, v128, v128, v128);
674 | VECTOR_BINARY_OP(i32x4_mul, 181, v128, v128, v128);
675 | VECTOR_BINARY_OP(i32x4_min_s, 182, v128, v128, v128);
676 | VECTOR_BINARY_OP(i32x4_min_u, 183, v128, v128, v128);
677 | VECTOR_BINARY_OP(i32x4_max_s, 184, v128, v128, v128);
678 | VECTOR_BINARY_OP(i32x4_max_u, 185, v128, v128, v128);
679 | 
680 | VECTOR_UNARY_OP(f32x4_ceil, 103, v128, v128);  // f32x4 arithmetic
681 | VECTOR_UNARY_OP(f32x4_floor, 104, v128, v128);
682 | VECTOR_UNARY_OP(f32x4_trunc, 105, v128, v128);
683 | VECTOR_UNARY_OP(f32x4_nearest, 106, v128, v128);
684 | VECTOR_UNARY_OP(f32x4_abs, 224, v128, v128);
685 | VECTOR_UNARY_OP(f32x4_neg, 225, v128, v128);
686 | VECTOR_UNARY_OP(f32x4_sqrt, 227, v128, v128);
687 | VECTOR_BINARY_OP(f32x4_add, 228, v128, v128, v128);
688 | VECTOR_BINARY_OP(f32x4_sub, 229, v128, v128, v128);
689 | VECTOR_BINARY_OP(f32x4_mul, 230, v128, v128, v128);
690 | VECTOR_BINARY_OP(f32x4_div, 231, v128, v128, v128);
691 | VECTOR_BINARY_OP(f32x4_min, 232, v128, v128, v128);
692 | VECTOR_BINARY_OP(f32x4_max, 233, v128, v128, v128);
693 | VECTOR_BINARY_OP(f32x4_pmin, 234, v128, v128, v128);
694 | VECTOR_BINARY_OP(f32x4_pmax, 235, v128, v128, v128);
695 | 
696 | inline Memory& Memory::operator()(uint32_t min_) {  // sets only the minimum
697 |   assert(max == 0 && min == 0);  // both limits must still be zero
698 |   this->min = min_;
699 |   return *this;  // returns the memory itself so calls can be chained
700 | }
701 | 
702 | inline Memory& Memory::operator()(uint32_t min_, uint32_t max_) {  // sets both limits
703 |   assert(max == 0 && min == 0);  // both limits must still be zero
704 |   this->min = min_;
705 |   this->max = max_;
706 |   return *this;  // returns the memory itself so calls can be chained
707 | }
708 | 
709 | inline Memory& Memory::export_(std::string a) {  // a: name used for the export entry
710 |   assert(!is_import() && !is_export() && "already set");
711 |   this->a_string = a;
712 |   return *this;  // returns the memory itself so calls can be chained
713 | }
714 | 
715 | inline Memory& Memory::shared(bool make_shared) {  // sets or clears the shared flag
716 |   this->is_shared = make_shared;  // CodeGenerator::emit() picks the limits flag from this
717 |   return *this;  // returns the memory itself so calls can be chained
718 | }
719 | 
720 | inline Memory& Memory::import_(std::string a, std::string b) {  // marks the memory as imported
721 |   assert(!is_import() && !is_export() && "already set");
722 |   this->a_string = a;  // written first in the import entry
723 |   this->b_string = b;  // written second in the import entry
724 |   return *this;  // returns the memory itself so calls can be chained
725 | }
726 | 
727 | inline void Memory::size() {  // memory.size: [] -> [i32]
728 |   cg.emit(std::vector<uint8_t>{0x3f, 0x00});  // opcode, then the zero byte that follows it
729 |   cg.push(cg.i32);  // the instruction leaves an i32 on the stack, so record it
730 | }
731 | inline void Memory::grow() {  // emits memory.grow
732 |   cg.emit(0x40);  // opcode
733 |   cg.emit(0x00);  // zero byte that follows the opcode
734 | }  // NOTE(review): no cg.pop()/cg.push() here, so the operand type is not checked
735 | 
736 | inline void CodeGenerator::nop() {  // emits the nop instruction
737 |   emit(0x01);
738 | }
739 | inline void CodeGenerator::block(uint8_t type) {  // opens a block; close it with end()
740 |   emit(0x02);  // opcode
741 |   emit(type);  // block type byte, written exactly as passed in
742 | }
743 | inline void CodeGenerator::loop(uint8_t type) {  // opens a loop; close it with end()
744 |   emit(0x03);  // opcode
745 |   emit(type);  // block type byte, written exactly as passed in
746 | }
747 | 
748 | inline void CodeGenerator::if_(uint8_t type) {  // opens an if; consumes the condition
749 |   const auto cond_type = pop();  // the condition operand
750 |   assert(cond_type == i32);
751 |   emit(0x04);  // opcode
752 |   emit(type);  // block type byte, written exactly as passed in
753 | }
754 | inline void CodeGenerator::else_() {  // emits the else opcode
755 |   emit(0x05);
756 | }
757 | inline void CodeGenerator::br(uint32_t labelidx) {  // unconditional branch
758 |   emit(0x0c);  // opcode
759 |   emit(encode_unsigned(labelidx));  // label index immediate
760 | }
761 | inline void CodeGenerator::br_if(uint32_t labelidx) {  // conditional branch
762 |   const auto cond_type = pop();  // the condition operand
763 |   assert(cond_type == i32);
764 |   emit(0x0d);  // opcode
765 |   emit(encode_unsigned(labelidx));  // label index immediate
766 | }
767 | inline void CodeGenerator::end() {  // emits the end opcode; emit() also appends one per function body
768 |   emit(0x0b);
769 | }
770 | inline void CodeGenerator::call(uint32_t fn_idx) {  // calls a function registered via function()
771 |   assert(fn_idx < functions_.size() && "function index does not exist");
772 |   emit(0x10);  // opcode
773 |   emit(encode_unsigned(fn_idx));  // function index immediate
774 | }  // NOTE(review): the callee's input/output types are not popped/pushed here
775 | 
776 | inline void CodeGenerator::export_(uint32_t fn, std::string name) {  // fn: index returned by function()
777 |   exported_functions_[fn] = name;  // emit() writes one export entry per stored index
778 | }
779 | 
780 | inline uint32_t CodeGenerator::function(std::vector<uint8_t> input_types,  // parameter types
781 |                                         std::vector<uint8_t> output_types,  // result types
782 |                                         std::function<void()> body) {  // stored for emit()
783 |   const auto idx = functions_.size();  // index the new function will occupy
784 |   functions_.emplace_back(input_types, output_types, body);
785 |   return static_cast<uint32_t>(idx);  // usable with call() and export_()
786 | }
787 | 
788 | inline std::vector<uint8_t> CodeGenerator::emit() {
789 |   // Serializes the module: magic + version, then the type (1), import (2,
790 |   // imported memory only), function (3), memory (5), export (7), code (10).
791 |   cur_bytes_.clear();
792 |   std::vector<uint8_t> emitted_bytes;
793 |   concat(emitted_bytes, magic_module_header);
794 |   concat(emitted_bytes, module_version);
795 | 
796 |   // Type section: one function type (0x60) per registered function.
797 |   std::vector<uint8_t> type_section_bytes;
798 |   concat(type_section_bytes, encode_unsigned(functions_.size()));
799 |   for (const auto& f : functions_) {
800 |     type_section_bytes.emplace_back(0x60);
801 |     concat(type_section_bytes, encode_unsigned(f.input_types.size()));
802 |     for (const auto& t : f.input_types) {
803 |       type_section_bytes.emplace_back(t);
804 |     }
805 |     concat(type_section_bytes, encode_unsigned(f.output_types.size()));
806 |     for (const auto& t : f.output_types) {
807 |       type_section_bytes.emplace_back(t);
808 |     }
809 |   }
810 |   emitted_bytes.emplace_back(0x1);
811 |   concat(emitted_bytes, encode_unsigned(type_section_bytes.size()));
812 |   concat(emitted_bytes, type_section_bytes);
813 | 
814 |   // Import section: written only when the memory is imported.
815 |   std::vector<uint8_t> import_section_bytes;
816 |   if (memory.is_import()) {
817 |     concat(import_section_bytes, encode_unsigned(1));  // 1 import
818 |     concat(import_section_bytes, encode_string(memory.a_string));
819 |     concat(import_section_bytes, encode_string(memory.b_string));
820 |     import_section_bytes.emplace_back(0x2);  // memory flag
821 |     if (memory.min && memory.max) {
822 |       // Limits flag: 0x03 when shared, otherwise 0x01 (min and max follow).
823 |       import_section_bytes.emplace_back(memory.is_shared ? 0x03 : 0x01);
824 |       concat(import_section_bytes, encode_unsigned(memory.min));
825 |       concat(import_section_bytes, encode_unsigned(memory.max));
826 |     } else {
827 |       assert(!memory.is_shared && "shared memory must have a max size");
828 |       // Limits flag 0x00 (only a minimum follows). It has to precede the
829 |       // minimum here just as it does in the memory section below.
830 |       import_section_bytes.emplace_back(0x00);
831 |       concat(import_section_bytes, encode_unsigned(memory.min));
832 |     }
833 |     emitted_bytes.emplace_back(0x2);
834 |     concat(emitted_bytes, encode_unsigned(import_section_bytes.size()));
835 |     concat(emitted_bytes, import_section_bytes);
836 |   }
837 | 
838 |   // Function section: function i refers to type index i.
839 |   std::vector<uint8_t> function_section_bytes;
840 |   concat(function_section_bytes, encode_unsigned(functions_.size()));
841 |   for (size_t i = 0; i < functions_.size(); ++i) {
842 |     concat(function_section_bytes, encode_unsigned(i));
843 |   }
844 |   emitted_bytes.emplace_back(0x3);
845 |   concat(emitted_bytes, encode_unsigned(function_section_bytes.size()));
846 |   concat(emitted_bytes, function_section_bytes);
847 | 
848 |   // Memory section: only for a non-imported memory with a limit set.
849 |   std::vector<uint8_t> memory_section_bytes;
850 |   if (!memory.is_import() && (memory.min || memory.max)) {
851 |     memory_section_bytes.emplace_back(0x01);  // always 1 memory
852 |     if (memory.min && memory.max) {
853 |       memory_section_bytes.emplace_back(memory.is_shared ? 0x03 : 0x01);
854 |       concat(memory_section_bytes, encode_unsigned(memory.min));
855 |       concat(memory_section_bytes, encode_unsigned(memory.max));
856 |     } else {
857 |       assert(!memory.is_shared && "shared memory must have a max size");
858 |       memory_section_bytes.emplace_back(0x00);
859 |       concat(memory_section_bytes, encode_unsigned(memory.min));
860 |     }
861 |     emitted_bytes.emplace_back(0x05);
862 |     concat(emitted_bytes, encode_unsigned(memory_section_bytes.size()));
863 |     concat(emitted_bytes, memory_section_bytes);
864 |   }
865 | 
866 |   // Export section: the memory (if exported), then the exported functions.
867 |   std::vector<uint8_t> export_section_bytes;
868 |   auto num_exports = exported_functions_.size() + memory.is_export();
869 |   concat(export_section_bytes, encode_unsigned(num_exports));
870 |   if (memory.is_export()) {
871 |     concat(export_section_bytes, encode_string(memory.a_string));
872 |     export_section_bytes.emplace_back(0x02);
873 |     export_section_bytes.emplace_back(0x00);  // always 1 memory at index 0
874 |   }
875 |   for (const auto& p : exported_functions_) {
876 |     concat(export_section_bytes, encode_string(p.second));
877 |     export_section_bytes.emplace_back(0x00);
878 |     concat(export_section_bytes, encode_unsigned(p.first));
879 |   }
880 |   emitted_bytes.emplace_back(0x7);
881 |   concat(emitted_bytes, encode_unsigned(export_section_bytes.size()));
882 |   concat(emitted_bytes, export_section_bytes);
883 | 
884 |   // Code section: per function its byte size, then the locals, then the body.
885 |   std::vector<uint8_t> code_section_bytes;
886 |   concat(code_section_bytes, encode_unsigned(functions_.size()));
887 |   for (auto& f : functions_) {
888 |     cur_function_ = &f;
889 |     // The body is generated into cur_bytes_ before f.locals is read.
890 |     cur_bytes_.clear();
891 |     f.emit();
892 |     end();
893 |     std::vector<uint8_t> body_bytes = cur_bytes_;
894 | 
895 |     // Locals are written as one (count = 1, type) pair per local.
896 |     cur_bytes_.clear();
897 |     concat(cur_bytes_, encode_unsigned(f.locals.size()));
898 |     for (const auto& l : f.locals) {
899 |       emit(0x1);
900 |       emit(l);
901 |     }
902 |     std::vector<uint8_t> header_bytes = cur_bytes_;
903 |     auto fn_size = header_bytes.size() + body_bytes.size();
904 |     concat(code_section_bytes, encode_unsigned(fn_size));
905 |     concat(code_section_bytes, header_bytes);
906 |     concat(code_section_bytes, body_bytes);
907 |   }
908 |   cur_function_ = nullptr;
909 | 
910 |   emitted_bytes.emplace_back(0xa);
911 |   concat(emitted_bytes, encode_unsigned(code_section_bytes.size()));
912 |   concat(emitted_bytes, code_section_bytes);
913 | 
914 |   return emitted_bytes;
915 | }
916 | 
917 | }  // namespace wasmblr
918 | 


--------------------------------------------------------------------------------