├── .gitattributes
├── .gitignore
├── LICENSE
├── README.md
├── condensed
│   └── condensed.js
├── globals.js
├── index.html
├── instructions.js
├── model.js
├── other
│   ├── conversion_scripts
│   │   ├── README.md
│   │   ├── ckpt.pt
│   │   ├── convert_checkpoint_pytorch.py
│   │   ├── convert_pretrained_pytorch.py
│   │   └── sample_shakespeare_ckpt.pt
│   ├── int8-gemm.js
│   ├── misc
│   │   ├── files.png
│   │   └── header.png
│   ├── scratchpad.js
│   ├── test.js
│   └── validation
│       ├── README.md
│       ├── test
│       │   ├── gpt2medium_validation.json
│       │   └── shakepeare_validation.json
│       └── validation.js
├── tokenizer.js
├── visuals.js
└── weights
    ├── better_shakespeare
    │   ├── lm_head.weight_gpt.bin
    │   ├── params_gpt.json
    │   ├── transformer.h.0.attn.c_attn.bias_gpt.bin
    │   ├── transformer.h.0.attn.c_attn.weight_gpt.bin
    │   ├── transformer.h.0.attn.c_proj.bias_gpt.bin
    │   ├── transformer.h.0.attn.c_proj.weight_gpt.bin
    │   ├── transformer.h.0.ln_1.bias_gpt.bin
    │   ├── transformer.h.0.ln_1.weight_gpt.bin
    │   ├── transformer.h.0.ln_2.bias_gpt.bin
    │   ├── transformer.h.0.ln_2.weight_gpt.bin
    │   ├── transformer.h.0.mlp.c_fc.bias_gpt.bin
    │   ├── transformer.h.0.mlp.c_fc.weight_gpt.bin
    │   ├── transformer.h.0.mlp.c_proj.bias_gpt.bin
    │   ├── transformer.h.0.mlp.c_proj.weight_gpt.bin
    │   ├── transformer.h.1.attn.c_attn.bias_gpt.bin
    │   ├── transformer.h.1.attn.c_attn.weight_gpt.bin
    │   ├── transformer.h.1.attn.c_proj.bias_gpt.bin
    │   ├── transformer.h.1.attn.c_proj.weight_gpt.bin
    │   ├── transformer.h.1.ln_1.bias_gpt.bin
    │   ├── transformer.h.1.ln_1.weight_gpt.bin
    │   ├── transformer.h.1.ln_2.bias_gpt.bin
    │   ├── transformer.h.1.ln_2.weight_gpt.bin
    │   ├── transformer.h.1.mlp.c_fc.bias_gpt.bin
    │   ├── transformer.h.1.mlp.c_fc.weight_gpt.bin
    │   ├── transformer.h.1.mlp.c_proj.bias_gpt.bin
    │   ├── transformer.h.1.mlp.c_proj.weight_gpt.bin
    │   ├── transformer.h.2.attn.c_attn.bias_gpt.bin
    │   ├── transformer.h.2.attn.c_attn.weight_gpt.bin
    │   ├── transformer.h.2.attn.c_proj.bias_gpt.bin
    │   ├── transformer.h.2.attn.c_proj.weight_gpt.bin
    │   ├── transformer.h.2.ln_1.bias_gpt.bin
    │   ├── transformer.h.2.ln_1.weight_gpt.bin
    │   ├── transformer.h.2.ln_2.bias_gpt.bin
    │   ├── transformer.h.2.ln_2.weight_gpt.bin
    │   ├── transformer.h.2.mlp.c_fc.bias_gpt.bin
    │   ├── transformer.h.2.mlp.c_fc.weight_gpt.bin
    │   ├── transformer.h.2.mlp.c_proj.bias_gpt.bin
    │   ├── transformer.h.2.mlp.c_proj.weight_gpt.bin
    │   ├── transformer.h.3.attn.c_attn.bias_gpt.bin
    │   ├── transformer.h.3.attn.c_attn.weight_gpt.bin
    │   ├── transformer.h.3.attn.c_proj.bias_gpt.bin
    │   ├── transformer.h.3.attn.c_proj.weight_gpt.bin
    │   ├── transformer.h.3.ln_1.bias_gpt.bin
    │   ├── transformer.h.3.ln_1.weight_gpt.bin
    │   ├── transformer.h.3.ln_2.bias_gpt.bin
    │   ├── transformer.h.3.ln_2.weight_gpt.bin
    │   ├── transformer.h.3.mlp.c_fc.bias_gpt.bin
    │   ├── transformer.h.3.mlp.c_fc.weight_gpt.bin
    │   ├── transformer.h.3.mlp.c_proj.bias_gpt.bin
    │   ├── transformer.h.3.mlp.c_proj.weight_gpt.bin
    │   ├── transformer.ln_f.bias_gpt.bin
    │   ├── transformer.ln_f.weight_gpt.bin
    │   ├── transformer.wpe.weight_gpt.bin
    │   └── transformer.wte.weight_gpt.bin
    ├── gpt2
    │   ├── lm_head.weight_gpt.bin
    │   ├── params_gpt.json
    │   ├── transformer.h.0.attn.bias_gpt.bin
    │   ├── transformer.h.0.attn.c_attn.bias_gpt.bin
    │   ├── transformer.h.0.attn.c_attn.weight_gpt.bin
    │   ├── transformer.h.0.attn.c_proj.bias_gpt.bin
    │   ├── transformer.h.0.attn.c_proj.weight_gpt.bin
    │   ├── transformer.h.0.attn.masked_bias_gpt.bin
    │   ├── transformer.h.0.ln_1.bias_gpt.bin
    │   ├── transformer.h.0.ln_1.weight_gpt.bin
    │   ├── transformer.h.0.ln_2.bias_gpt.bin
    │   ├── transformer.h.0.ln_2.weight_gpt.bin
    │   ├── transformer.h.0.mlp.c_fc.bias_gpt.bin
    │   ├── transformer.h.0.mlp.c_fc.weight_gpt.bin
    │   ├── transformer.h.0.mlp.c_proj.bias_gpt.bin
    │   ├── transformer.h.0.mlp.c_proj.weight_gpt.bin
    │   ├── transformer.h.1.attn.bias_gpt.bin
    │   ├── transformer.h.1.attn.c_attn.bias_gpt.bin
    │   ├── transformer.h.1.attn.c_attn.weight_gpt.bin
    │   ├── transformer.h.1.attn.c_proj.bias_gpt.bin
    │   ├── transformer.h.1.attn.c_proj.weight_gpt.bin
    │   ├── transformer.h.1.attn.masked_bias_gpt.bin
    │   ├── transformer.h.1.ln_1.bias_gpt.bin
    │   ├── transformer.h.1.ln_1.weight_gpt.bin
    │   ├── transformer.h.1.ln_2.bias_gpt.bin
    │   ├── transformer.h.1.ln_2.weight_gpt.bin
    │   ├── transformer.h.1.mlp.c_fc.bias_gpt.bin
    │   ├── transformer.h.1.mlp.c_fc.weight_gpt.bin
    │   ├── transformer.h.1.mlp.c_proj.bias_gpt.bin
    │   ├── transformer.h.1.mlp.c_proj.weight_gpt.bin
    │   ├── transformer.h.10.attn.bias_gpt.bin
    │   ├── transformer.h.10.attn.c_attn.bias_gpt.bin
    │   ├── transformer.h.10.attn.c_attn.weight_gpt.bin
    │   ├── transformer.h.10.attn.c_proj.bias_gpt.bin
    │   ├── transformer.h.10.attn.c_proj.weight_gpt.bin
    │   ├── transformer.h.10.attn.masked_bias_gpt.bin
    │   ├── transformer.h.10.ln_1.bias_gpt.bin
    │   ├── transformer.h.10.ln_1.weight_gpt.bin
    │   ├── transformer.h.10.ln_2.bias_gpt.bin
    │   ├── transformer.h.10.ln_2.weight_gpt.bin
    │   ├── transformer.h.10.mlp.c_fc.bias_gpt.bin
    │   ├── transformer.h.10.mlp.c_fc.weight_gpt.bin
    │   ├── transformer.h.10.mlp.c_proj.bias_gpt.bin
    │   ├── transformer.h.10.mlp.c_proj.weight_gpt.bin
    │   ├── transformer.h.11.attn.bias_gpt.bin
    │   ├── transformer.h.11.attn.c_attn.bias_gpt.bin
    │   ├── transformer.h.11.attn.c_attn.weight_gpt.bin
    │   ├── transformer.h.11.attn.c_proj.bias_gpt.bin
    │   ├── transformer.h.11.attn.c_proj.weight_gpt.bin
    │   ├── transformer.h.11.attn.masked_bias_gpt.bin
    │   ├── transformer.h.11.ln_1.bias_gpt.bin
    │   ├── transformer.h.11.ln_1.weight_gpt.bin
    │   ├── transformer.h.11.ln_2.bias_gpt.bin
    │   ├── transformer.h.11.ln_2.weight_gpt.bin
    │   ├── transformer.h.11.mlp.c_fc.bias_gpt.bin
    │   ├── transformer.h.11.mlp.c_fc.weight_gpt.bin
    │   ├── transformer.h.11.mlp.c_proj.bias_gpt.bin
    │   ├── transformer.h.11.mlp.c_proj.weight_gpt.bin
    │   ├── transformer.h.2.attn.bias_gpt.bin
    │   ├── transformer.h.2.attn.c_attn.bias_gpt.bin
    │   ├── transformer.h.2.attn.c_attn.weight_gpt.bin
    │   ├── transformer.h.2.attn.c_proj.bias_gpt.bin
    │   ├── transformer.h.2.attn.c_proj.weight_gpt.bin
    │   ├── transformer.h.2.attn.masked_bias_gpt.bin
    │   ├── transformer.h.2.ln_1.bias_gpt.bin
    │   ├── transformer.h.2.ln_1.weight_gpt.bin
    │   ├── transformer.h.2.ln_2.bias_gpt.bin
    │   ├── transformer.h.2.ln_2.weight_gpt.bin
    │   ├── transformer.h.2.mlp.c_fc.bias_gpt.bin
    │   ├── transformer.h.2.mlp.c_fc.weight_gpt.bin
    │   ├── transformer.h.2.mlp.c_proj.bias_gpt.bin
    │   ├── transformer.h.2.mlp.c_proj.weight_gpt.bin
    │   ├── transformer.h.3.attn.bias_gpt.bin
    │   ├── transformer.h.3.attn.c_attn.bias_gpt.bin
    │   ├── transformer.h.3.attn.c_attn.weight_gpt.bin
    │   ├── transformer.h.3.attn.c_proj.bias_gpt.bin
    │   ├── transformer.h.3.attn.c_proj.weight_gpt.bin
    │   ├── transformer.h.3.attn.masked_bias_gpt.bin
    │   ├── transformer.h.3.ln_1.bias_gpt.bin
    │   ├── transformer.h.3.ln_1.weight_gpt.bin
    │   ├── transformer.h.3.ln_2.bias_gpt.bin
    │   ├── transformer.h.3.ln_2.weight_gpt.bin
    │   ├── transformer.h.3.mlp.c_fc.bias_gpt.bin
    │   ├── transformer.h.3.mlp.c_fc.weight_gpt.bin
    │   ├── transformer.h.3.mlp.c_proj.bias_gpt.bin
    │   ├── transformer.h.3.mlp.c_proj.weight_gpt.bin
    │   ├── transformer.h.4.attn.bias_gpt.bin
    │   ├── transformer.h.4.attn.c_attn.bias_gpt.bin
    │   ├── transformer.h.4.attn.c_attn.weight_gpt.bin
    │   ├── transformer.h.4.attn.c_proj.bias_gpt.bin
    │   ├── transformer.h.4.attn.c_proj.weight_gpt.bin
    │   ├── transformer.h.4.attn.masked_bias_gpt.bin
    │   ├── transformer.h.4.ln_1.bias_gpt.bin
    │   ├── transformer.h.4.ln_1.weight_gpt.bin
    │   ├── transformer.h.4.ln_2.bias_gpt.bin
    │   ├── transformer.h.4.ln_2.weight_gpt.bin
    │   ├── transformer.h.4.mlp.c_fc.bias_gpt.bin
    │   ├── transformer.h.4.mlp.c_fc.weight_gpt.bin
    │   ├── transformer.h.4.mlp.c_proj.bias_gpt.bin
    │   ├── transformer.h.4.mlp.c_proj.weight_gpt.bin
    │   ├── transformer.h.5.attn.bias_gpt.bin
    │   ├── transformer.h.5.attn.c_attn.bias_gpt.bin
    │   ├── transformer.h.5.attn.c_attn.weight_gpt.bin
    │   ├── transformer.h.5.attn.c_proj.bias_gpt.bin
    │   ├── transformer.h.5.attn.c_proj.weight_gpt.bin
    │   ├── transformer.h.5.attn.masked_bias_gpt.bin
    │   ├── transformer.h.5.ln_1.bias_gpt.bin
    │   ├── transformer.h.5.ln_1.weight_gpt.bin
    │   ├── transformer.h.5.ln_2.bias_gpt.bin
    │   ├── transformer.h.5.ln_2.weight_gpt.bin
    │   ├── transformer.h.5.mlp.c_fc.bias_gpt.bin
    │   ├── transformer.h.5.mlp.c_fc.weight_gpt.bin
    │   ├── transformer.h.5.mlp.c_proj.bias_gpt.bin
    │   ├── transformer.h.5.mlp.c_proj.weight_gpt.bin
    │   ├── transformer.h.6.attn.bias_gpt.bin
    │   ├── transformer.h.6.attn.c_attn.bias_gpt.bin
    │   ├── transformer.h.6.attn.c_attn.weight_gpt.bin
    │   ├── transformer.h.6.attn.c_proj.bias_gpt.bin
    │   ├── transformer.h.6.attn.c_proj.weight_gpt.bin
    │   ├── transformer.h.6.attn.masked_bias_gpt.bin
    │   ├── transformer.h.6.ln_1.bias_gpt.bin
    │   ├── transformer.h.6.ln_1.weight_gpt.bin
    │   ├── transformer.h.6.ln_2.bias_gpt.bin
    │   ├── transformer.h.6.ln_2.weight_gpt.bin
    │   ├── transformer.h.6.mlp.c_fc.bias_gpt.bin
    │   ├── transformer.h.6.mlp.c_fc.weight_gpt.bin
    │   ├── transformer.h.6.mlp.c_proj.bias_gpt.bin
    │   ├── transformer.h.6.mlp.c_proj.weight_gpt.bin
    │   ├── transformer.h.7.attn.bias_gpt.bin
    │   ├── transformer.h.7.attn.c_attn.bias_gpt.bin
    │   ├── transformer.h.7.attn.c_attn.weight_gpt.bin
    │   ├── transformer.h.7.attn.c_proj.bias_gpt.bin
    │   ├── transformer.h.7.attn.c_proj.weight_gpt.bin
    │   ├── transformer.h.7.attn.masked_bias_gpt.bin
    │   ├── transformer.h.7.ln_1.bias_gpt.bin
    │   ├── transformer.h.7.ln_1.weight_gpt.bin
    │   ├── transformer.h.7.ln_2.bias_gpt.bin
    │   ├── transformer.h.7.ln_2.weight_gpt.bin
    │   ├── transformer.h.7.mlp.c_fc.bias_gpt.bin
    │   ├── transformer.h.7.mlp.c_fc.weight_gpt.bin
    │   ├── transformer.h.7.mlp.c_proj.bias_gpt.bin
    │   ├── transformer.h.7.mlp.c_proj.weight_gpt.bin
    │   ├── transformer.h.8.attn.bias_gpt.bin
    │   ├── transformer.h.8.attn.c_attn.bias_gpt.bin
    │   ├── transformer.h.8.attn.c_attn.weight_gpt.bin
    │   ├── transformer.h.8.attn.c_proj.bias_gpt.bin
    │   ├── transformer.h.8.attn.c_proj.weight_gpt.bin
    │   ├── transformer.h.8.attn.masked_bias_gpt.bin
    │   ├── transformer.h.8.ln_1.bias_gpt.bin
    │   ├── transformer.h.8.ln_1.weight_gpt.bin
    │   ├── transformer.h.8.ln_2.bias_gpt.bin
    │   ├── transformer.h.8.ln_2.weight_gpt.bin
    │   ├── transformer.h.8.mlp.c_fc.bias_gpt.bin
    │   ├── transformer.h.8.mlp.c_fc.weight_gpt.bin
    │   ├── transformer.h.8.mlp.c_proj.bias_gpt.bin
    │   ├── transformer.h.8.mlp.c_proj.weight_gpt.bin
    │   ├── transformer.h.9.attn.bias_gpt.bin
    │   ├── transformer.h.9.attn.c_attn.bias_gpt.bin
    │   ├── transformer.h.9.attn.c_attn.weight_gpt.bin
    │   ├── transformer.h.9.attn.c_proj.bias_gpt.bin
    │   ├── transformer.h.9.attn.c_proj.weight_gpt.bin
    │   ├── transformer.h.9.attn.masked_bias_gpt.bin
    │   ├── transformer.h.9.ln_1.bias_gpt.bin
    │   ├── transformer.h.9.ln_1.weight_gpt.bin
    │   ├── transformer.h.9.ln_2.bias_gpt.bin
    │   ├── transformer.h.9.ln_2.weight_gpt.bin
    │   ├── transformer.h.9.mlp.c_fc.bias_gpt.bin
    │   ├── transformer.h.9.mlp.c_fc.weight_gpt.bin
    │   ├── transformer.h.9.mlp.c_proj.bias_gpt.bin
    │   ├── transformer.h.9.mlp.c_proj.weight_gpt.bin
    │   ├── transformer.ln_f.bias_gpt.bin
    │   ├── transformer.ln_f.weight_gpt.bin
    │   ├── transformer.wpe.weight_gpt.bin
    │   └── transformer.wte.weight_gpt.bin
    └── tokenization
        ├── gpt_tokens.json
        ├── simple_tokens.json
        └── vocab.bpe
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.bin filter=lfs diff=lfs merge=lfs -text
2 | *.json filter=lfs diff=lfs merge=lfs -text
3 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | weights/large-models
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | IFCOOLTELLME License
2 |
3 | Copyright (c) 2023 Will DePue
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | If this software is used for any purpose that is substantially epic, awesome, or
16 | incredible, notice is required to the Author, reachable at will@depue.net.
17 |
18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 | SOFTWARE.
25 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # WebGPT
2 |
3 | ![WebGPT](other/misc/header.png)
4 |
5 | After six years of development, WebGPU is about to launch across most major web browsers. This is massive: web applications now have near-native access to the GPU, with the added capability of compute shaders.
6 |
7 | WebGPT is a vanilla JS and HTML implementation of a transformer model, intended as a proof of concept as well as an educational resource. WebGPT has been tested to work with models up to 500M parameters, though it could likely support far more with further testing/optimization.
8 |
9 | ### Current Stats
10 | 2020 M1 Mac: 3ms/token at 5M parameters with f32 precision.
11 | 2020 M1 Mac: 30ms/token at 117M parameters with f32 precision.
12 | 2020 M1 Mac: 70ms/token at 377M parameters with f32 precision.
13 | 2020 M1 Mac: 120ms/token at 775M parameters with f32 precision.
14 | 1.5B is working but unstable, sitting around 1000ms/token due to inefficiencies.
15 |
16 | ## Running WebGPT
17 |
18 | Running WebGPT is remarkably simple, as it's just a set of HTML + JS files. Since WebGPU is still in the process of being released, you'll need to open it with a compatible browser. WebGPU is currently available in Chrome v113, but the most straightforward way to ensure proper functionality is to install [Chrome Canary](https://www.google.com/chrome/canary/) or Edge Canary.
19 |
20 | I've included two different models: a toy GPT-Shakespeare model (which is severely undertrained, ha) and GPT-2 117M. See main.js for more information on how to run these models. If you want to import custom models, take a look at other/conversion_scripts.
21 |
22 | If you want to try out WebGPT, visit the demo website at [KMeans.org](https://www.kmeans.org). I'd generally recommend cloning the repo and running locally, as loading the weights remotely is significantly slower.
23 | Note: **You'll need to use Git LFS** to download the model files after cloning the repository.
24 |
25 | ![File structure](other/misc/files.png)
26 |
27 | ## Roadmap / Fixing Stupid Decisions
28 |
29 | - [x] Embeddings / de-embeddings on GPU.
30 | - [x] Initializing pipelines on every step is incredibly inefficient.
31 | - [x] Key-value caching.
32 | - [x] Reuse buffers.
33 | - [x] Kernel shared memory for matmul!
34 | - [x] Destroy buffers after use!
35 | - [x] Create kernel instruction classes + optimize pipeline creation.
36 | - [X] Fuse all kernels.
37 | - [X] Optimize all other kernels.
38 | - [X] Compute pass splitting for larger models _(maxStorageBufferBindingSize)_
39 | - [ ] Run selection ops on GPU (topk, selection softmax)
40 | - [ ] The attention kernel is optimized for small models, not for large models where giving each head its own matmul is more efficient.
41 | - [ ] Investigate why attention cache isn't giving proper speed-ups.
42 | - [ ] Make simple instructional version without special stuff.
43 | - [ ] Optimize workgroup sizes, specifically for single row/col operations.
44 | - [ ] Convert into a package.
45 | - [ ] Write better comments + make Youtube explainer.
46 |
47 | ## Acknowledgements
48 |
49 | When I started this project I had no idea how transformers worked or how to implement them (or GPUs or matmul kernels or WebGPU or tokenization, for that matter), so Andrej Karpathy's series on neural networks and building GPT from scratch was invaluable: [Andrej's Youtube](https://www.youtube.com/@AndrejKarpathy). I've also used some code from the nanoGPT repository: [nanoGPT](https://github.com/karpathy/nanoGPT).
50 |
51 | I copied LatitudeGames' implementation of OpenAI's GPT-3 tokenizer in JavaScript: [GPT-3-Encoder](https://github.com/latitudegames/GPT-3-Encoder).
52 |
--------------------------------------------------------------------------------
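
A minimal sketch of driving the `GPT` class that model.js (below) defines, since main.js itself is not reproduced in this dump. The folder name, tokenizer type, and sampling values mirror the defaults in model.js; the page scaffolding around it is assumed:

```js
// Hypothetical driver for the GPT class from model.js (not the repo's actual main.js).
const model = new GPT("gpt2", "bpe"); // loads weights/gpt2/ with the BPE tokenizer
await model.initialize();             // requests the WebGPU device and loads all weight buffers

const prompt = "What is the answer to life, the universe, and everything?\n";
// generate(prompt, max_new_tokens, top_k, temperature) is an async generator
// that yields one decoded token at a time.
for await (const token of model.generate(prompt, 30, 3, 1)) {
  document.body.append(token);
}
```
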
/globals.js:
--------------------------------------------------------------------------------
1 | const FastMatMulBlock = new FastMatMulBlockClass();
2 | const AttentionBlock = new AttentionBlockClass();
3 | const ResidualBlock = new ResidualBlockClass();
4 | const EmbedBlock = new EmbedBlockClass();
5 | const DeEmbedBlock = new DeEmbedBlockClass();
6 | const GeluBlock = new GeluBlockClass();
7 | const LayerNormBlock = new LayerNormBlockClass();
8 | const SoftmaxBlock = new SoftmaxBlockClass();
9 |
10 | // Needed for deletion.
11 | let operations = [FastMatMulBlock, AttentionBlock, ResidualBlock, EmbedBlock, DeEmbedBlock, GeluBlock, LayerNormBlock, SoftmaxBlock];
12 |
13 | function initializeOperations(device) {
14 | for (const operation of operations) operation.initialize(device);
15 | }
16 |
17 | function destroyOperationBuffers() {
18 | for (const operation of operations) operation.destroyBuffers();
19 | }
20 |
21 | function clearOperationCache() {
22 | for (const operation of operations) operation.clearBufferCache();
23 | }
24 |
25 | function destroyOperations() {
26 | for (const operation of operations) operation.destroy();
27 | }
28 |
29 | const bufferUsageDict = {
30 | copy_from: GPUBufferUsage.COPY_SRC,
31 | copy_to: GPUBufferUsage.COPY_DST,
32 | storage: GPUBufferUsage.STORAGE,
33 | uniform: GPUBufferUsage.UNIFORM,
34 | map_read: GPUBufferUsage.MAP_READ,
35 | };
36 |
37 | // ---------------- Helper Functions ----------------
38 |
39 | async function fetchBin(url) {
40 | const response = await fetch(url);
41 | const buffer = await response.arrayBuffer();
42 | return new Float32Array(buffer);
43 | }
44 |
45 | const wgSize = (dim, size) => Math.min(Math.ceil(dim / size), Infinity);
46 |
47 | function sampleFromDistribution(probs) {
48 | const rand = Math.random();
49 | let cumulativeProb = 0;
50 | for (let i = 0; i < probs.length; i++) {
51 | cumulativeProb += probs[i];
52 | if (rand < cumulativeProb) {
53 | return i;
54 | }
55 | }
56 | return probs.length - 1;
57 | }
58 |
59 | function cpuSoftmax(logits, temperature = 1.0) {
60 | const maxLogit = Math.max(...logits);
61 | const expLogits = logits.map((logit) => Math.exp((logit - maxLogit) / temperature));
62 | const sumExpLogits = expLogits.reduce((a, b) => a + b, 0);
63 | return expLogits.map((expLogit) => expLogit / sumExpLogits);
64 | }
65 |
66 | function selectTopK(probs, top_k) {
67 | const sortedIndices = Array.from(probs)
68 | .map((value, index) => ({ value, index }))
69 | .sort((a, b) => b.value - a.value)
70 | .map(({ index }) => index);
71 | const topKIndices = sortedIndices.slice(0, top_k);
72 | const topKProbs = topKIndices.map((index) => probs[index]);
73 | return { topKIndices, topKProbs };
74 | }
75 |
76 | // ----------------------- Matrix Operations -----------------------
77 |
78 | const zeros = (dim) => new Float32Array(dim).fill(0);
79 |
80 | function transpose(array, input_rows, input_cols) {
81 | if (array.length !== input_rows * input_cols) {
82 | console.log(array.length, input_rows, input_cols);
83 | throw new Error("Transpose dims failed");
84 | }
85 |
86 | const transpose = [];
87 | for (let col = 0; col < input_cols; col++) {
88 | for (let row = 0; row < input_rows; row++) {
89 | transpose.push(array[row * input_cols + col]);
90 | }
91 | }
92 |
93 | return new Float32Array(transpose);
94 | }
95 |
96 | function leastPrimeFactor(n, start = 2) {
97 | for (let i = start; i <= Math.sqrt(n); i++) {
98 | if (n % i === 0) return i;
99 | }
100 | return n;
101 | }
102 |
103 | function formatAsMatrix(floatArray, dimA, dimB) {
104 | const resultMatrix = [];
105 | for (let i = 0; i < dimA; i++) {
106 | resultMatrix.push(floatArray.slice(i * dimB, (i + 1) * dimB));
107 | }
108 | return resultMatrix;
109 | }
110 |
--------------------------------------------------------------------------------
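
model.js picks each next token on the CPU by chaining the helpers above: selectTopK, then cpuSoftmax, then sampleFromDistribution. The same chain in isolation, with made-up logits:

```js
// Toy logits over a 5-token vocabulary (values are illustrative only).
const logits = new Float32Array([2.0, 1.0, 0.5, -1.0, -3.0]);

const { topKIndices, topKProbs } = selectTopK(logits, 3); // keep the 3 largest logits
const probs = cpuSoftmax(topKProbs, 0.8);                 // temperature < 1 sharpens the distribution
const nextToken = topKIndices[sampleFromDistribution(probs)];
console.log(topKIndices, probs, nextToken); // [0, 1, 2], ~[0.69, 0.20, 0.11], usually 0
```
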
/index.html:
--------------------------------------------------------------------------------
[index.html's markup was not preserved in this dump. Recoverable page text: the title and heading "WebGPU GPT Model Demo", a "Checking WebGPU support..." status line, the note "PS: Loading models is 5x slower on the web rather than running locally. Just clone the repo and open!", and a "Special models (download required):" selector. The page's scripts and controls (original lines 55-175) were elided.]
--------------------------------------------------------------------------------
/model.js:
--------------------------------------------------------------------------------
1 | class GPT {
2 | constructor(folder, type) {
3 | this.folder = folder;
4 | this.tokenizerType = type;
5 | this.initialized = false;
6 |
7 | this.device;
8 | this.model;
9 | this.tokenizer;
10 | this.params;
11 | this.minBufferOffset = 1;
12 |
13 | this.defaultPrompt;
14 | this.defaultTopK;
15 | this.defaultTemperature;
16 | this.defaultTokens;
17 |
18 | this.externalBuffer;
19 |
20 | this.unloadDeletionStack = [];
21 | }
22 |
23 | async initialize() {
24 | if (this.initialized) return console.error("Model already initialized");
25 | if (!navigator.gpu) throw new Error("WebGPU is not supported");
26 |
27 | const adapter = await navigator.gpu.requestAdapter();
28 | this.device = await adapter.requestDevice();
29 |
30 | initializeOperations(this.device);
31 |
32 | [this.model, this.params] = await this.loadModel(this.folder);
33 | this.tokenizer = this.tokenizerType == "bpe" ? new GPT2Tokenizer() : new SimpleTokenizer();
34 | await this.tokenizer.load();
35 |
36 | if (this.tokenizerType == "bpe") {
37 | this.defaultPrompt = `What is the answer to life, the universe, and everything?\n`;
38 | this.defaultTopK = 3;
39 | this.defaultTemperature = 1;
40 | this.defaultTokens = 30;
41 | } else {
42 | this.defaultPrompt = `WILL:\nAh, how dare you challenge me?\nHave you forgotten I built WebGPT?\n`;
43 | this.defaultTopK = 2;
44 | this.defaultTemperature = 1;
45 | this.defaultTokens = 80;
46 | }
47 |
48 | this.initialized = true;
49 |
50 | console.log("Model initialized");
51 | }
52 |
53 | async *generate(prompt, max_new_tokens, top_k, temperature) {
54 | if (!this.initialized) {
55 | console.error("Model not loaded yet");
56 | return;
57 | }
58 |
59 | // Buffer size (321644800) exceeds the max buffer size limit (268435456).
60 | // - While calling [Device].CreateBuffer([BufferDescriptor]).
61 |
62 | let history = this.tokenizer.encode(prompt);
63 | console.log(`Prompt (${history.length} tokens):\n${prompt}`);
64 |
65 | const warmupRuns = 3;
66 | let totalTime = 0;
67 |
68 | for (let i = 0; i < max_new_tokens; i++) {
69 | const idx_cond = history.slice(-this.params.n_ctx);
70 | const useAttCache = i !== 0 && history.length <= this.params.n_ctx;
71 |
72 | const startTime = performance.now();
73 | const logits = await this.run(idx_cond, useAttCache);
74 | const endTime = performance.now();
75 |
76 | // console.log(`\nIteration ${i + 1} of ${max_new_tokens}`);
77 | const lapsedTime = endTime - startTime;
78 | console.log(`Kernel execution time: ${lapsedTime} ms`);
79 | i >= warmupRuns && (totalTime += lapsedTime);
80 |
81 | const { topKIndices, topKProbs } = selectTopK(logits, top_k);
82 | const probs = cpuSoftmax(topKProbs, temperature);
83 | const idx_next = topKIndices[sampleFromDistribution(probs)];
84 |
85 | history = history.concat(idx_next);
86 |
87 | // console.log(`Output:\n${this.tokenizer.decode(history)}`);
88 |
89 | // const totalProbs = cpuSoftmax(logits, temperature);
90 | // const tokenProbsString = Array.from(totalProbs)
91 | // .map((value, index) => ({ value, index }))
92 | // .sort((a, b) => b.value - a.value)
93 | // .slice(0, 8)
94 | // .map((prob) => `{ ${this.tokenizer.decode([prob.index]).replace(/(\r\n|\n|\r)/gm, "newline")} } : ${prob.value.toPrecision(3)}`)
95 | // .join(" | ");
96 | // console.log("Top 8 token probs:", tokenProbsString);
97 |
98 | yield this.tokenizer.decode([idx_next]);
99 | }
100 |
101 | console.log(`Average kernel execution time: ${totalTime / (max_new_tokens - warmupRuns)} ms`);
102 | }
103 |
104 | async run(idx) {
105 | const { posEmbdBuffer, layer_buffers, normGammaBuffer, normBetaBuffer, embeddingsBuffers, deEmbeddingsBuffers } = this.model;
106 | const { attention_scale, n_embd, n_head, head_size, n_layer, vocab_size, hidden_size, vocab_chunk_size, vocab_chunk_instances } = this.params;
107 | const seq_length = idx.length;
108 |
109 | // ---------------- Create Passes ---------------- //
110 | // Note: These are re-initialized on every run because buffer sizes change whenever seq_length changes.
111 |
112 | // Pipeline creation is major bottleneck to spin up speed! Also buffer re-use.
113 |
114 | this.computePasses = [];
115 | let intermediateBuffer;
116 | let residualBuffer;
117 | {
118 | const { passes, resultBuffer } = EmbedBlock.newInstance(idx, seq_length, n_embd, vocab_chunk_size, embeddingsBuffers, posEmbdBuffer, ResidualBlock);
119 | intermediateBuffer = resultBuffer;
120 | residualBuffer = resultBuffer;
121 | this.computePasses.push(...passes);
122 | }
123 | for (let i = 0; i < n_layer; i++) {
124 | const buffers = layer_buffers[i];
125 | {
126 | const { passes, resultBuffer } = LayerNormBlock.newInstance(
127 | seq_length,
128 | n_embd,
129 | intermediateBuffer,
130 | buffers.normAttentionGammaBuffer,
131 | buffers.normAttentionBetaBuffer
132 | );
133 | intermediateBuffer = resultBuffer;
134 | this.computePasses.push(...passes);
135 | }
136 | {
137 | const { passes, resultBuffer } = AttentionBlock.newFusedInstance(
138 | seq_length,
139 | n_embd,
140 | attention_scale,
141 | n_head,
142 | head_size,
143 | intermediateBuffer,
144 | buffers.qkvWeightArray[0],
145 | buffers.qkvBiasArray[0],
146 | buffers.qkvWeightArray[1],
147 | buffers.qkvBiasArray[1],
148 | buffers.qkvWeightArray[2],
149 | buffers.qkvBiasArray[2],
150 | buffers.linearWeightsBuffer,
151 | buffers.linearBiasBuffer,
152 | FastMatMulBlock,
153 | SoftmaxBlock
154 | );
155 | intermediateBuffer = resultBuffer;
156 | this.computePasses.push(...passes);
157 | }
158 | {
159 | const { passes, resultBuffer } = ResidualBlock.newInstance(seq_length, n_embd, intermediateBuffer, residualBuffer);
160 | intermediateBuffer = resultBuffer;
161 | residualBuffer = resultBuffer;
162 | this.computePasses.push(...passes);
163 | }
164 | {
165 | const { passes, resultBuffer } = LayerNormBlock.newInstance(
166 | seq_length,
167 | n_embd,
168 | intermediateBuffer,
169 | buffers.normLinearGammaBuffer,
170 | buffers.normLinearBetaBuffer
171 | );
172 | intermediateBuffer = resultBuffer;
173 | this.computePasses.push(...passes);
174 | }
175 | {
176 | const { resultBuffer, passes } = FastMatMulBlock.newInstance(
177 | seq_length,
178 | hidden_size,
179 | n_embd,
180 | intermediateBuffer,
181 | buffers.firstLayerWeightsBuffer,
182 | buffers.firstLayerBiasBuffer
183 | );
184 | intermediateBuffer = resultBuffer;
185 | this.computePasses.push(...passes);
186 | }
187 | {
188 | const { resultBuffer, passes } = GeluBlock.newInstance(seq_length, hidden_size, intermediateBuffer);
189 | intermediateBuffer = resultBuffer;
190 | this.computePasses.push(...passes);
191 | }
192 | {
193 | const { resultBuffer, passes } = FastMatMulBlock.newInstance(
194 | seq_length,
195 | n_embd,
196 | hidden_size,
197 | intermediateBuffer,
198 | buffers.secondLayerWeightsBuffer,
199 | buffers.secondLayerBiasBuffer
200 | );
201 | intermediateBuffer = resultBuffer;
202 | this.computePasses.push(...passes);
203 | }
204 | {
205 | const { passes, resultBuffer } = ResidualBlock.newInstance(seq_length, n_embd, intermediateBuffer, residualBuffer);
206 | intermediateBuffer = resultBuffer;
207 | residualBuffer = resultBuffer;
208 | this.computePasses.push(...passes);
209 | }
210 | }
211 | {
212 | if (this.externalBuffer) {
213 | this.computePasses.push({
214 | flag: "copy",
215 | src: intermediateBuffer,
216 | srcOffset: 0,
217 | dst: this.externalBuffer,
218 | dstOffset: 0,
219 | size: this.bufferSize(seq_length, n_embd),
220 | });
221 | }
222 | }
223 | {
224 | const { passes, resultBuffer } = LayerNormBlock.newInstance(seq_length, n_embd, intermediateBuffer, normGammaBuffer, normBetaBuffer);
225 | intermediateBuffer = resultBuffer;
226 | this.computePasses.push(...passes);
227 | }
228 | {
229 | const { passes, resultBuffer } = DeEmbedBlock.newInstance(
230 | n_embd,
231 | vocab_size,
232 | vocab_chunk_size * vocab_chunk_instances,
233 | seq_length,
234 | vocab_chunk_size,
235 | intermediateBuffer,
236 | deEmbeddingsBuffers
237 | );
238 | intermediateBuffer = resultBuffer;
239 | this.computePasses.push(...passes);
240 | }
241 | const resultBuffer = intermediateBuffer;
242 |
243 | // ---------------- Compute Passes ----------------
244 |
245 | const commandEncoder = this.device.createCommandEncoder();
246 | for (const pass of this.computePasses) {
247 | if (pass.flag === "compute") {
248 | const passEncoder = commandEncoder.beginComputePass();
249 | passEncoder.setPipeline(pass.pipeline);
250 | for (let i = 0; i < pass.groups.length; i++) passEncoder.setBindGroup(i, pass.groups[i]);
251 | passEncoder.dispatchWorkgroups(pass.workgroups.x, pass.workgroups.y);
252 | passEncoder.end();
253 | } else if (pass.flag === "copy") {
254 | commandEncoder.copyBufferToBuffer(pass.src, pass.srcOffset, pass.dst, pass.dstOffset, pass.size);
255 | }
256 | }
257 | this.device.queue.submit([commandEncoder.finish()]);
258 |
259 | // ---------------- Read Results ----------------
260 |
261 | await resultBuffer.mapAsync(GPUMapMode.READ);
262 | const output = resultBuffer.getMappedRange();
263 | const outputArray = new Float32Array(output).slice(0); // Copy the array, otherwise it'll be destroyed.
264 |
265 | clearOperationCache();
266 |
267 | return outputArray;
268 | }
269 |
270 | async loadModel(folder) {
271 | if (this.initialized) return console.error("Model already loaded");
272 |
273 | console.log("Loading model from folder:", folder);
274 | const weightsFolder = `weights/${folder}/`;
275 |
276 | const params = await this.loadParameters(weightsFolder);
277 | const { embeddingsBuffers, deEmbeddingsBuffers } = await this.loadEmbeddings(params, weightsFolder);
278 | const { posEmbdBuffer } = await this.loadPositionalEmbeddings(params, weightsFolder);
279 | const layer_buffers = await this.loadLayers(params, weightsFolder);
280 |
281 | console.log("Loading final layer norm...");
282 | const { normGammaBuffer, normBetaBuffer } = await this.loadFinalLayerNorm(params, weightsFolder);
283 |
284 | const output = { layer_buffers, embeddingsBuffers, deEmbeddingsBuffers, posEmbdBuffer, normGammaBuffer, normBetaBuffer };
285 | console.log("Finished loading model.", output, params);
286 | return [output, params];
287 | }
288 |
289 | async loadParameters(weightsFolder) {
290 | console.log("Loading params...");
291 | const params = await (await fetch(`${weightsFolder}/params_gpt.json`)).json();
292 |
293 | // Did you enable GitHub LFS? Won't work without it.
294 | if (params.n_embd % 4 !== 0) throw new Error("Model load failed: n_embd must be divisible by 4.");
295 | if (params.n_embd % params.n_head !== 0) throw new Error("Model load failed: n_embd must be divisible by n_head.");
296 | // I'm unsure if this is a reasonable requirement here. At worst, I can figure out some padding method.
297 | if ((params.n_embd / params.n_head) % 4 !== 0) throw new Error("Model load failed: n_embd / n_head must be divisible by 4.");
298 | const tokenParam = this.bufferSize(params.vocab_size, params.n_embd);
299 | let minSplits = Math.ceil(tokenParam / this.device.limits.maxStorageBufferBindingSize);
300 | function vocabChunkSizeCalc(vocab_size, n_embd, splits, maxStorageBufferBindingSize) {
301 | // Possibly could be better? Needs actual benchmarking. Sizes below are in elements; the limit is in bytes, hence the BYTES_PER_ELEMENT factors.
302 | const optimisticSize = Math.ceil(vocab_size / splits / 4) * 4 * n_embd;
303 | const pessimisticSize = Math.floor(vocab_size / splits / 4) * 4 * n_embd;
304 | let vocab_chunk_size = optimisticSize;
305 | if (optimisticSize * Float32Array.BYTES_PER_ELEMENT > maxStorageBufferBindingSize) {
306 | vocab_chunk_size = pessimisticSize;
307 | if (pessimisticSize * Float32Array.BYTES_PER_ELEMENT * splits < tokenParam) {
308 | return vocabChunkSizeCalc(vocab_size, n_embd, splits + 1, maxStorageBufferBindingSize);
309 | }
310 | }
311 | return { vocab_chunk_size: vocab_chunk_size / n_embd, splits };
312 | }
313 | const { vocab_chunk_size, splits } = vocabChunkSizeCalc(params.vocab_size, params.n_embd, minSplits, this.device.limits.maxStorageBufferBindingSize);
314 | if (splits > minSplits) console.warn(`Non-optimal number of vocab splits. Optimal: ${minSplits}, Selected: ${splits}`);
315 |
316 | // Set derived parameters
317 | params.vocab_chunk_size = vocab_chunk_size;
318 | params.vocab_chunk_instances = splits;
319 | params.head_size = params.n_embd / params.n_head;
320 | params.hidden_size = params.n_embd * 4;
321 | params.attention_scale = 1 / Math.sqrt(params.n_embd / params.n_head);
322 | params.bias = params.bias == undefined ? true : params.bias;
323 |
324 | // Check for overflow in buffers larger than maxStorageBufferBindingSize
325 | const maxBufferSize = this.device.limits.maxStorageBufferBindingSize / 4;
326 | if (params.n_embd * params.n_ctx > maxBufferSize) console.warn("Model load failed: n_embd * n_ctx must be less than maxStorageBufferBindingSize.");
327 | if (params.n_embd * params.hidden_size > maxBufferSize)
328 | console.warn("Model load failed: n_embd * hidden_size must be less than maxStorageBufferBindingSize.");
329 | if (params.n_ctx * params.n_ctx * params.n_head > maxBufferSize)
330 | console.warn("Model load failed: n_ctx * n_ctx * n_head must be less than maxStorageBufferBindingSize.");
331 | if (params.n_embd * params.n_embd * 3 > maxBufferSize)
332 | console.warn("Model load failed: n_embd * n_embd * 3 must be less than maxStorageBufferBindingSize.");
333 |
334 | console.log("Params:", params);
335 |
336 | return params;
337 | }
338 |
339 | async loadEmbeddings(params, weightsFolder) {
340 | console.log("Loading token embeddings...");
341 | const embeddingWeights = await fetchBin(`${weightsFolder}/transformer.wte.weight_gpt.bin`);
342 |
343 | // Chunks are stored in row-major order and are of dimensions n_embd x vocab_chunk_size.
344 | // Embedding weights are imported in column-major order and are of dimensions vocab_size x n_embd.
345 | // We pre-transpose the chunk for the deEmbedding process for the matmul. Could do this on GPU later.
346 | const embeddingsBuffers = [];
347 | const deEmbeddingsBuffers = [];
348 | for (let i = 0; i < params.vocab_chunk_instances; i++) {
349 | console.log(`Loading deEmbedding chunk ${i + 1}/${params.vocab_chunk_instances}...`);
350 | const offset = i * params.vocab_chunk_size;
351 | let size = params.vocab_chunk_size;
352 |
353 | const paddedArray = new Float32Array(params.vocab_chunk_size * params.n_embd);
354 | if (i === params.vocab_chunk_instances - 1) {
355 | size = params.vocab_size - offset;
356 | paddedArray.set(zeros((params.vocab_chunk_size * params.vocab_chunk_instances - params.vocab_size) * params.n_embd), size * params.n_embd); // TypedArray.set takes (array, offset): zero-pad the tail of the final chunk.
357 | }
358 | paddedArray.set(embeddingWeights.subarray(offset * params.n_embd, offset * params.n_embd + size * params.n_embd));
359 |
360 | embeddingsBuffers.push(this.initTensor(paddedArray, [params.vocab_chunk_size, params.n_embd], ["copy_from"]));
361 |
362 | const chunk = transpose(paddedArray, params.vocab_chunk_size, params.n_embd); // Use GPU perhaps?
363 | deEmbeddingsBuffers.push(this.initTensor(chunk, [params.n_embd, params.vocab_chunk_size], ["storage"]));
364 | }
365 |
366 | return { embeddingsBuffers, deEmbeddingsBuffers };
367 | }
368 |
369 | async loadPositionalEmbeddings(params, weightsFolder) {
370 | console.log("Loading positional embeddings...");
371 | const posEmbeddings = await fetchBin(`${weightsFolder}/transformer.wpe.weight_gpt.bin`);
372 | const posEmbdBuffer = this.initTensor(posEmbeddings, [params.n_ctx, params.n_embd], ["copy_from"]);
373 |
374 | return { posEmbdBuffer };
375 | }
376 |
377 | async loadFinalLayerNorm(params, weightsFolder) {
378 | console.log("Loading final norm...");
379 | const prefix = `${weightsFolder}/transformer.ln_f.`;
380 |
381 | const tensorPromises = [
382 | this.fetchAndInitTensor(`${prefix}weight_gpt.bin`, [params.n_embd], ["storage"]),
383 | this.fetchAndInitTensor(`${prefix}bias_gpt.bin`, [params.n_embd], ["storage"]),
384 | ];
385 |
386 | const [normGammaBuffer, normBetaBuffer] = await Promise.all(tensorPromises);
387 |
388 | return { normGammaBuffer, normBetaBuffer };
389 | }
390 |
391 | async loadLayers(params, weightsFolder) {
392 | console.log("Loading layers...");
393 | const layerPromises = [];
394 |
395 | for (let i = 0; i < params.n_layer; i++) {
396 | layerPromises.push(this.loadLayer(params, weightsFolder, i));
397 | }
398 |
399 | const layer_buffers = await Promise.all(layerPromises);
400 | return layer_buffers;
401 | }
402 |
403 | async loadLayer(params, weightsFolder, layerIndex) {
404 | console.log("Starting to load layer...", layerIndex);
405 | const prefix = `${weightsFolder}transformer.h.${layerIndex}.`;
406 |
407 | // Create an array of promises for fetching and initializing the tensors
408 | const tensorPromises = [
409 | this.fetchAndInitTensor(`${prefix}ln_1.weight_gpt.bin`, [params.n_embd], ["storage"]),
410 | this.fetchAndInitTensor(`${prefix}ln_1.bias_gpt.bin`, [params.n_embd], ["storage"]),
411 | this.fetchAndSplitQKVWeightTensors(`${prefix}attn.c_attn.weight_gpt.bin`, [params.n_embd, 3 * params.n_embd], ["storage"]),
412 | this.fetchAndSplitQKVBiasTensors(`${prefix}attn.c_attn.bias_gpt.bin`, [params.n_embd], ["storage"]),
413 | this.fetchAndInitTensor(`${prefix}attn.c_proj.weight_gpt.bin`, [params.n_embd, params.n_embd], ["storage"]),
414 | this.fetchAndInitTensor(`${prefix}attn.c_proj.bias_gpt.bin`, [params.n_embd], ["storage"]),
415 | this.fetchAndInitTensor(`${prefix}ln_2.weight_gpt.bin`, [params.n_embd], ["storage"]),
416 | this.fetchAndInitTensor(`${prefix}ln_2.bias_gpt.bin`, [params.n_embd], ["storage"]),
417 | this.fetchAndInitTensor(`${prefix}mlp.c_fc.weight_gpt.bin`, [params.n_embd, params.hidden_size], ["storage"]),
418 | this.fetchAndInitTensor(`${prefix}mlp.c_fc.bias_gpt.bin`, [params.hidden_size], ["storage"]),
419 | this.fetchAndInitTensor(`${prefix}mlp.c_proj.weight_gpt.bin`, [params.hidden_size, params.n_embd], ["storage"]),
420 | this.fetchAndInitTensor(`${prefix}mlp.c_proj.bias_gpt.bin`, [params.n_embd], ["storage"]),
421 | ];
422 |
423 | // Wait for all tensors to be fetched and initialized
424 | const [
425 | normAttentionGammaBuffer,
426 | normAttentionBetaBuffer,
427 | qkvWeightArray,
428 | qkvBiasArray,
429 | linearWeightsBuffer,
430 | linearBiasBuffer,
431 | normLinearGammaBuffer,
432 | normLinearBetaBuffer,
433 | firstLayerWeightsBuffer,
434 | firstLayerBiasBuffer,
435 | secondLayerWeightsBuffer,
436 | secondLayerBiasBuffer,
437 | ] = await Promise.all(tensorPromises);
438 |
439 | // Process the fetched data and return the layer buffers
440 | return {
441 | normAttentionGammaBuffer,
442 | normAttentionBetaBuffer,
443 | qkvWeightArray,
444 | qkvBiasArray,
445 | linearWeightsBuffer,
446 | linearBiasBuffer,
447 | normLinearGammaBuffer,
448 | normLinearBetaBuffer,
449 | firstLayerWeightsBuffer,
450 | firstLayerBiasBuffer,
451 | secondLayerWeightsBuffer,
452 | secondLayerBiasBuffer,
453 | };
454 | }
455 |
456 | async fetchAndSplitQKVWeightTensors(url, dims, ops) {
457 | const data = transpose(await fetchBin(url), dims[0], dims[1]);
458 |
459 | const qWeights = transpose(data.subarray(0, dims[0] * dims[0]), dims[0], dims[0]);
460 | const kWeights = transpose(data.subarray(dims[0] * dims[0], dims[0] * dims[0] * 2), dims[0], dims[0]);
461 | const vWeights = transpose(data.subarray(dims[0] * dims[0] * 2, dims[0] * dims[0] * 3), dims[0], dims[0]);
462 |
463 | const qWeightsBuffer = this.initTensor(qWeights, [dims[0], dims[0]], ops);
464 | const kWeightsBuffer = this.initTensor(kWeights, [dims[0], dims[0]], ops);
465 | const vWeightsBuffer = this.initTensor(vWeights, [dims[0], dims[0]], ops);
466 |
467 | return [qWeightsBuffer, kWeightsBuffer, vWeightsBuffer];
468 | }
469 |
470 | async fetchAndSplitQKVBiasTensors(url, dims, ops) {
471 | const data = await fetchBin(url);
472 |
473 | const qBias = data.subarray(0, dims[0]);
474 | const kBias = data.subarray(dims[0], dims[0] * 2);
475 | const vBias = data.subarray(dims[0] * 2, dims[0] * 3);
476 |
477 | const qBiasBuffer = this.initTensor(qBias, [dims[0]], ops);
478 | const kBiasBuffer = this.initTensor(kBias, [dims[0]], ops);
479 | const vBiasBuffer = this.initTensor(vBias, [dims[0]], ops);
480 |
481 | return [qBiasBuffer, kBiasBuffer, vBiasBuffer];
482 | }
483 |
484 | async fetchAndInitTensor(url, dims, ops) {
485 | console.log("Fetching and initializing tensor...", url);
486 | const data = await fetchBin(url);
487 | return this.initTensor(data, dims, ops);
488 | }
489 |
490 | initTensor(data, dims, ops) {
491 | const buffer = this.device.createBuffer({
492 | size: this.bufferSize(dims[0], dims[1] || 1, dims[2] || 1),
493 | usage: ops.map((u) => bufferUsageDict[u]).reduce((a, b) => a | b),
494 | mappedAtCreation: true,
495 | });
496 | new Float32Array(buffer.getMappedRange()).set(data);
497 | buffer.unmap();
498 | this.unloadDeletionStack.push(buffer);
499 | return buffer;
500 | }
501 |
502 | unloadBuffers() {
503 | this.unloadDeletionStack.map((buffer) => buffer.destroy());
504 | this.unloadDeletionStack = [];
505 | }
506 |
507 | bufferSize(dimX, dimY = 1, dimZ = 1) {
508 | const size = Math.ceil((dimX * dimY * dimZ * Float32Array.BYTES_PER_ELEMENT) / this.minBufferOffset) * this.minBufferOffset;
509 | if (size > this.device.limits.maxStorageBufferBindingSize)
510 | console.warn("Warning: Buffer size calc result exceeds GPU limit, are you using this value for a tensor size?", dimX, dimY, dimZ, size);
511 | return size;
512 | }
513 | }
514 |
--------------------------------------------------------------------------------
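
To make loadParameters' derived values concrete: with the standard GPT-2 117M architecture numbers (n_embd 768, n_head 12, vocab_size 50257), the math works out as below, and the de-embedding weights are what force vocab chunking under WebGPU's default storage-binding limit. A sketch under those assumptions, not code from the repo:

```js
// Derived parameters, as computed in loadParameters (GPT-2 117M values).
const n_embd = 768, n_head = 12, vocab_size = 50257;
const head_size = n_embd / n_head;                // 64
const hidden_size = n_embd * 4;                   // 3072, the MLP width
const attention_scale = 1 / Math.sqrt(head_size); // 0.125, i.e. 1 / sqrt(d_k)

// Why vocab chunking exists: the (de-)embedding matrix in f32 is
// 50257 * 768 * 4 bytes ≈ 154 MB, which exceeds the WebGPU default
// maxStorageBufferBindingSize of 134217728 bytes (128 MiB).
const tokenParamBytes = vocab_size * n_embd * Float32Array.BYTES_PER_ELEMENT;
const minSplits = Math.ceil(tokenParamBytes / 134217728); // 2 chunks minimum
console.log({ head_size, hidden_size, attention_scale, minSplits });
```
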
/other/conversion_scripts/README.md:
--------------------------------------------------------------------------------
1 | # Running custom models on WebGPU
2 |
3 | It's fairly easy to run custom models on WebGPU. At the moment, I only support PyTorch models via the scripts below, but it should be fairly simple to export other model weights to work here.
4 |
5 | Importing weights requires you to export the transformer weights as a series of individual .bin files. Pardon the somewhat inconvenient process: loading files of this size into Javascript requires some clever engineering.
6 |
7 | An example structure with only two layers. Each matrix is collapsed into a row-major 1-dimensional array.
8 |
9 | ```
10 | transformer.wte.weight.bin: [65, 128]
11 | transformer.wpe.weight.bin: [64, 128]
12 | transformer.h.0.ln_1.weight.bin: [128]
13 | transformer.h.0.ln_1.bias.bin: [128]
14 | transformer.h.0.attn.c_attn.weight.bin: [384, 128]
15 | transformer.h.0.attn.c_attn.bias.bin: [384]
16 | transformer.h.0.attn.c_proj.weight.bin: [128, 128]
17 | transformer.h.0.attn.c_proj.bias.bin: [128]
18 | transformer.h.0.ln_2.weight.bin: [128]
19 | transformer.h.0.ln_2.bias.bin: [128]
20 | transformer.h.0.mlp.c_fc.weight.bin: [512, 128]
21 | transformer.h.0.mlp.c_fc.bias.bin: [512]
22 | transformer.h.0.mlp.c_proj.weight.bin: [128, 512]
23 | transformer.h.0.mlp.c_proj.bias.bin: [128]
24 | transformer.h.1.ln_1.weight.bin: [128]
25 | transformer.h.1.ln_1.bias.bin: [128]
26 | transformer.h.1.attn.c_attn.weight.bin: [384, 128]
27 | transformer.h.1.attn.c_attn.bias.bin: [384]
28 | transformer.h.1.attn.c_proj.weight.bin: [128, 128]
29 | transformer.h.1.attn.c_proj.bias.bin: [128]
30 | transformer.h.1.ln_2.weight.bin: [128]
31 | transformer.h.1.ln_2.bias.bin: [128]
32 | transformer.h.1.mlp.c_fc.weight.bin: [512, 128]
33 | transformer.h.1.mlp.c_fc.bias.bin: [512]
34 | transformer.h.1.mlp.c_proj.weight.bin: [128, 512]
35 | transformer.h.1.mlp.c_proj.bias.bin: [128]
36 | transformer.ln_f.weight.bin: [128]
37 | transformer.ln_f.bias.bin: [128]
38 | lm_head.weight.bin: [65, 128]
39 | ```
40 |
41 | I've included an export script for PyTorch models. Quite simply, take the model's state_dict() and export each tensor into an individual file. If you want to export pre-trained GPT models, you'll need to reformat the parameters slightly to work correctly.
42 |
--------------------------------------------------------------------------------
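
On the JavaScript side, each exported .bin is read back exactly as described above: raw little-endian float32 in row-major order (this mirrors fetchBin in globals.js). A sketch using the [384, 128] c_attn weight from the two-layer example; the fetch path is illustrative:

```js
// Read one exported tensor back into JS (mirrors fetchBin in globals.js).
const resp = await fetch("transformer.h.0.attn.c_attn.weight.bin");
const data = new Float32Array(await resp.arrayBuffer()); // 384 * 128 floats
const rows = 384, cols = 128;               // shape from the listing above
const value = (r, c) => data[r * cols + c]; // row-major indexing
console.log(data.length === rows * cols, value(0, 0));
```
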
/other/conversion_scripts/ckpt.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0hq/WebGPT/a83cdc8d46e8d140b55d87089482999580b64a3d/other/conversion_scripts/ckpt.pt
--------------------------------------------------------------------------------
/other/conversion_scripts/convert_checkpoint_pytorch.py:
--------------------------------------------------------------------------------
1 | import json
2 | import struct
3 | import torch
4 | import os
5 |
6 | transposed = ['attn.c_attn.weight', 'attn.c_proj.weight',
7 |               'mlp.c_fc.weight', 'mlp.c_proj.weight']
8 |
9 |
10 | def save_weights_to_bin_files(checkpoint, folder_name):
11 |     for key, value in checkpoint['model'].items():
12 |         print(f"{key}: {value.shape}")
13 |         if key.startswith('_orig_mod.'):
14 |             continue
15 |         with open(os.path.join(folder_name, f"{key}_gpt.bin"), 'wb') as file:
16 |             values = value.cpu().numpy()
17 |             # Only use this if using old minGPT model.
18 |             # if any(key.endswith(w) for w in transposed):
19 |             #     values = values.T
20 |
21 |             for single_value in values.flatten():
22 |                 file.write(struct.pack('<f', single_value))

[The remainder of this script, convert_pretrained_pytorch.py, sample_shakespeare_ckpt.pt, and the start of int8-gemm.js were not preserved in this dump.]

--------------------------------------------------------------------------------
/other/int8-gemm.js:
--------------------------------------------------------------------------------
55 | matrix[i] = (((packedValue << 24) >> 24) / 127.0) * absmax;
56 | matrix[i + 1] = (((packedValue << 16) >> 24) / 127.0) * absmax;
57 | matrix[i + 2] = (((packedValue << 8) >> 24) / 127.0) * absmax;
58 | matrix[i + 3] = ((packedValue >> 24) / 127.0) * absmax;
59 | }
60 |
61 | return matrix;
62 | }
63 |
64 | const qa = quantizeMatrix(A, M, K);
65 | const qb = quantizeMatrix(B, K, N);
66 |
67 | const quantizedA = qa.quantizedMatrix;
68 | const quantizedB = qb.quantizedMatrix;
69 |
70 | const dqB = dequantizeMatrix(quantizedB, qb.absmax, K, N);
71 |
72 | // for (let i = 0; i < 10; i++) {
73 | // console.log(B[i], dqB[i]);
74 | // }
75 |
76 | const absmax = Math.max(qa.absmax, qb.absmax);
77 |
78 | // Naive CPU implementation of matrix multiplication
79 | function multiplyMatrices(A, B, C, M, N, K) {
80 | for (let i = 0; i < M; i++) {
81 | for (let j = 0; j < N; j++) {
82 | let sum = 0;
83 | for (let k = 0; k < K; k++) {
84 | sum += A[i * K + k] * B[k * N + j];
85 | }
86 | C[i * N + j] = sum;
87 | }
88 | }
89 | }
90 |
91 | async function run() {
92 | // Create WebGPU device and queue
93 | const adapter = await navigator.gpu.requestAdapter();
94 | const device = await adapter.requestDevice();
95 | const queue = device.queue;
96 |
97 | // Create buffers for matrices A, B, and C
98 | const aBuffer = device.createBuffer({
99 | size: A.byteLength,
100 | usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST,
101 | });
102 | const bBuffer = device.createBuffer({
103 | size: quantizedB.byteLength,
104 | usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST,
105 | });
106 | const cBuffer = device.createBuffer({
107 | size: C.byteLength,
108 | usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC,
109 | });
110 |
111 | // Copy matrices A and B to their respective buffers
112 | queue.writeBuffer(aBuffer, 0, A);
113 | queue.writeBuffer(bBuffer, 0, quantizedB);
114 |
115 | // Create bind group layout and bind group
116 |
117 | const shaderCode = `
118 |
119 | @group(0) @binding(0) var<storage, read> array_a: array<vec4<f32>>;
120 | @group(0) @binding(1) var<storage, read> array_b: array<i32>;
121 |
122 | @group(0) @binding(2) var<storage, read_write> array_c: array<vec4<f32>>;
123 |
124 | const absmax = ${absmax};
125 |
126 | fn unpackInt8x4(value: i32) -> vec4<f32> {
127 | let x = f32((value << 24) >> 24) / 127.0 * absmax;
128 | let y = f32(((value << 16) >> 24)) / 127.0 * absmax;
129 | let z = f32(((value << 8) >> 24)) / 127.0 * absmax;
130 | let w = f32(((value >> 24))) / 127.0 * absmax;
131 | return vec4<f32>(x, y, z, w);
132 | }
133 |
134 | @compute @workgroup_size(${workgroupSizeX}, ${workgroupSizeY})
135 | fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
136 | var M: u32 = ${M};
137 | var N: u32 = ${N};
138 | var ND4: u32 = ${Math.ceil(N / 4)};
139 | var KD4: u32 = ${Math.ceil(K / 4)};
140 | var x: u32 = global_id.x;
141 | var y: u32 = global_id.y;
142 |
143 | if (x * 8 >= N || y * 4 >= M) {
144 | return;
145 | }
146 |
147 | var sum00: vec4<f32> = vec4<f32>();
148 | var sum01: vec4<f32> = vec4<f32>();
149 | var sum02: vec4<f32> = vec4<f32>();
150 | var sum03: vec4<f32> = vec4<f32>();
151 | var sum10: vec4<f32> = vec4<f32>();
152 | var sum11: vec4<f32> = vec4<f32>();
153 | var sum12: vec4<f32> = vec4<f32>();
154 | var sum13: vec4<f32> = vec4<f32>();
155 |
156 | for(var k: u32 = 0u; k < KD4; k = k + 1u) {
157 | var arow0: vec4<f32> = array_a[(y * 4u + 0u) * KD4 + k];
158 | var arow1: vec4<f32> = array_a[(y * 4u + 1u) * KD4 + k];
159 | var arow2: vec4<f32> = array_a[(y * 4u + 2u) * KD4 + k];
160 | var arow3: vec4<f32> = array_a[(y * 4u + 3u) * KD4 + k];
161 | var brow: vec4<f32>;
162 |
163 | brow = unpackInt8x4(array_b[(k * 4u + 0u) * ND4 + x * 2u + 0u]);
164 | sum00 = vec4<f32>(arow0.x) * brow + sum00;
165 | sum01 = vec4<f32>(arow1.x) * brow + sum01;
166 | sum02 = vec4<f32>(arow2.x) * brow + sum02;
167 | sum03 = vec4<f32>(arow3.x) * brow + sum03;
168 |
169 | brow = unpackInt8x4(array_b[(k * 4u + 0u) * ND4 + x * 2u + 1u]);
170 | sum10 = vec4<f32>(arow0.x) * brow + sum10;
171 | sum11 = vec4<f32>(arow1.x) * brow + sum11;
172 | sum12 = vec4<f32>(arow2.x) * brow + sum12;
173 | sum13 = vec4<f32>(arow3.x) * brow + sum13;
174 |
175 | brow = unpackInt8x4(array_b[(k * 4u + 1u) * ND4 + x * 2u + 0u]);
176 | sum00 = vec4<f32>(arow0.y) * brow + sum00;
177 | sum01 = vec4<f32>(arow1.y) * brow + sum01;
178 | sum02 = vec4<f32>(arow2.y) * brow + sum02;
179 | sum03 = vec4<f32>(arow3.y) * brow + sum03;
180 |
181 | brow = unpackInt8x4(array_b[(k * 4u + 1u) * ND4 + x * 2u + 1u]);
182 | sum10 = vec4<f32>(arow0.y) * brow + sum10;
183 | sum11 = vec4<f32>(arow1.y) * brow + sum11;
184 | sum12 = vec4<f32>(arow2.y) * brow + sum12;
185 | sum13 = vec4<f32>(arow3.y) * brow + sum13;
186 |
187 | brow = unpackInt8x4(array_b[(k * 4u + 2u) * ND4 + x * 2u + 0u]);
188 | sum00 = vec4<f32>(arow0.z) * brow + sum00;
189 | sum01 = vec4<f32>(arow1.z) * brow + sum01;
190 | sum02 = vec4<f32>(arow2.z) * brow + sum02;
191 | sum03 = vec4<f32>(arow3.z) * brow + sum03;
192 |
193 | brow = unpackInt8x4(array_b[(k * 4u + 2u) * ND4 + x * 2u + 1u]);
194 | sum10 = vec4<f32>(arow0.z) * brow + sum10;
195 | sum11 = vec4<f32>(arow1.z) * brow + sum11;
196 | sum12 = vec4<f32>(arow2.z) * brow + sum12;
197 | sum13 = vec4<f32>(arow3.z) * brow + sum13;
198 |
199 | brow = unpackInt8x4(array_b[(k * 4u + 3u) * ND4 + x * 2u + 0u]);
200 | sum00 = vec4<f32>(arow0.w) * brow + sum00;
201 | sum01 = vec4<f32>(arow1.w) * brow + sum01;
202 | sum02 = vec4<f32>(arow2.w) * brow + sum02;
203 | sum03 = vec4<f32>(arow3.w) * brow + sum03;
204 |
205 | brow = unpackInt8x4(array_b[(k * 4u + 3u) * ND4 + x * 2u + 1u]);
206 | sum10 = vec4<f32>(arow0.w) * brow + sum10;
207 | sum11 = vec4<f32>(arow1.w) * brow + sum11;
208 | sum12 = vec4<f32>(arow2.w) * brow + sum12;
209 | sum13 = vec4<f32>(arow3.w) * brow + sum13;
210 | }
211 |
212 | if (y * 4u + 0u < M) {
213 | array_c[x * 2u + 0u + (y * 4u + 0u) * ND4] = sum00;
214 | array_c[x * 2u + 1u + (y * 4u + 0u) * ND4] = sum10;
215 | }
216 | if (y * 4u + 1u < M) {
217 | array_c[x * 2u + 0u + (y * 4u + 1u) * ND4] = sum01;
218 | array_c[x * 2u + 1u + (y * 4u + 1u) * ND4] = sum11;
219 | }
220 | if (y * 4u + 2u < M) {
221 | array_c[x * 2u + 0u + (y * 4u + 2u) * ND4] = sum02;
222 | array_c[x * 2u + 1u + (y * 4u + 2u) * ND4] = sum12;
223 | }
224 | if (y * 4u + 3u < M) {
225 | array_c[x * 2u + 0u + (y * 4u + 3u) * ND4] = sum03;
226 | array_c[x * 2u + 1u + (y * 4u + 3u) * ND4] = sum13;
227 | }
228 | }
229 | `;
230 |
231 | const shaderModule = device.createShaderModule({
232 | code: shaderCode,
233 | });
234 |
235 | const bindGroupLayout = device.createBindGroupLayout({
236 | entries: [
237 | {
238 | binding: 0,
239 | visibility: GPUShaderStage.COMPUTE,
240 | buffer: {
241 | type: "read-only-storage",
242 | },
243 | },
244 | {
245 | binding: 1,
246 | visibility: GPUShaderStage.COMPUTE,
247 | buffer: {
248 | type: "read-only-storage",
249 | },
250 | },
251 | {
252 | binding: 2,
253 | visibility: GPUShaderStage.COMPUTE,
254 | buffer: {
255 | type: "storage",
256 | },
257 | },
258 | ],
259 | });
260 |
261 | const bindGroup = device.createBindGroup({
262 | layout: bindGroupLayout,
263 | entries: [
264 | {
265 | binding: 0,
266 | resource: {
267 | buffer: aBuffer,
268 | },
269 | },
270 | {
271 | binding: 1,
272 | resource: {
273 | buffer: bBuffer,
274 | },
275 | },
276 | {
277 | binding: 2,
278 | resource: {
279 | buffer: cBuffer,
280 | },
281 | },
282 | ],
283 | });
284 |
285 | const pipelineLayout = device.createPipelineLayout({
286 | bindGroupLayouts: [bindGroupLayout],
287 | });
288 |
289 | const pipeline = device.createComputePipeline({
290 | layout: pipelineLayout,
291 | compute: {
292 | module: shaderModule,
293 | entryPoint: "main",
294 | },
295 | });
296 | const encoder = device.createCommandEncoder();
297 | const passEncoder = encoder.beginComputePass();
298 |
299 | // Dispatch the compute kernel
300 | passEncoder.setPipeline(pipeline);
301 | passEncoder.setBindGroup(0, bindGroup);
302 | passEncoder.dispatchWorkgroups(workgroupSizeX, workgroupSizeY, 1);
303 | passEncoder.end();
304 |
305 | const readBuffer = device.createBuffer({
306 | size: C.byteLength,
307 | usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ,
308 | });
309 |
310 | // Copy matrix C from the GPU to the CPU
311 | encoder.copyBufferToBuffer(cBuffer, 0, readBuffer, 0, C.byteLength);
312 |
313 | device.queue.submit([encoder.finish()]);
314 |
315 | await readBuffer.mapAsync(GPUMapMode.READ);
316 | const readBufferData = new Float32Array(readBuffer.getMappedRange());
317 |
318 | const C_cpu = new Float32Array(M * N);
319 | multiplyMatrices(A, B, C_cpu, M, N, K);
320 |
321 | for (let i = 0; i < M * N; i++) {
322 | if (Math.abs(C_cpu[i] - readBufferData[i]) > 0.1) {
323 | console.error("CPU and GPU results differ at index", i);
324 | console.error("CPU:", C_cpu[i], "GPU:", readBufferData[i]);
325 | break;
326 | }
327 | // } else {
328 | // console.log("CPU and GPU results are the same at index", i);
329 | // console.log("CPU:", C_cpu[i], "GPU:", readBufferData[i]);
330 | // }
331 | }
332 |
333 | let mae = 0;
334 | for (let i = 0; i < M * N; i++) {
335 | mae += Math.abs(C_cpu[i] - readBufferData[i]);
336 | }
337 | mae /= M * N;
338 | console.log("Mean Absolute Error:", mae);
339 |
340 | const NUM_RUNS = 100;
341 |
342 | //warmup
343 |
344 | for (let i = 0; i < NUM_RUNS; i++) {
345 | // Dispatch the compute kernel
346 | const encoder = device.createCommandEncoder();
347 | const passEncoder = encoder.beginComputePass();
348 |
349 | // Dispatch the compute kernel
350 | passEncoder.setPipeline(pipeline);
351 | passEncoder.setBindGroup(0, bindGroup);
352 | passEncoder.dispatchWorkgroups(workgroupSizeX, workgroupSizeY, 1);
353 |
354 | passEncoder.end();
355 |
356 | const readBuffer = device.createBuffer({
357 | size: C.byteLength,
358 | usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ,
359 | });
360 |
361 | // Copy matrix C from the GPU to the CPU
362 | encoder.copyBufferToBuffer(cBuffer, 0, readBuffer, 0, C.byteLength);
363 | device.queue.submit([encoder.finish()]); } // Submit added: without it, the warmup work is encoded but never executed.
364 |
365 | // Run GPU kernel NUM_RUNS times and measure time
366 | let totalTime = 0;
367 | for (let i = 0; i < NUM_RUNS; i++) {
368 | const start = performance.now();
369 |
370 | // Dispatch the compute kernel
371 | const encoder = device.createCommandEncoder();
372 | const passEncoder = encoder.beginComputePass();
373 |
374 | // Dispatch the compute kernel
375 | passEncoder.setPipeline(pipeline);
376 | passEncoder.setBindGroup(0, bindGroup);
377 | passEncoder.dispatchWorkgroups(M / workgroupSizeX, N / workgroupSizeY, 1);
378 |
379 | passEncoder.end();
380 |
381 | const readBuffer = device.createBuffer({
382 | size: C.byteLength,
383 | usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ,
384 | });
385 |
386 | // Copy matrix C from the GPU to the CPU
387 | encoder.copyBufferToBuffer(cBuffer, 0, readBuffer, 0, C.byteLength);
388 | device.queue.submit([encoder.finish()]); await device.queue.onSubmittedWorkDone(); // Submit and wait so GPU execution is included in the timing.
389 | const end = performance.now();
390 | totalTime += end - start;
391 | }
392 | const averageTime = totalTime / NUM_RUNS;
393 | console.log(`Average time per run: ${averageTime.toFixed(2)} ms`);
394 | // print flops
395 |
396 | const flops = (2 * M * N * K) / (averageTime / 1e3); // averageTime is in ms; convert to seconds for FLOPS.
397 | console.log(`GFLOPS: ${flops / 1e9}`);
398 | }
399 |
400 | run();
401 |
--------------------------------------------------------------------------------
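
The kernel above leans on shift-based sign extension to unpack four int8 values from each i32 of the quantized matrix. A plain-JS mirror of that bit manipulation for reference; the packInt8x4 helper and the absmax value here are illustrative additions, not part of the original file:

```js
const absmax = 1.0; // illustrative; the shader bakes in Math.max(qa.absmax, qb.absmax)

// Mirror of the WGSL unpackInt8x4: shift a byte to the top of the i32, then
// arithmetic-shift right to sign-extend it back down, and rescale by absmax / 127.
function unpackInt8x4(value) {
  return [
    ((value << 24) >> 24) / 127.0 * absmax,
    ((value << 16) >> 24) / 127.0 * absmax,
    ((value << 8) >> 24) / 127.0 * absmax,
    (value >> 24) / 127.0 * absmax,
  ];
}

// Inverse: byte 0 holds the first lane, byte 3 the last, matching the unpack order.
function packInt8x4(a, b, c, d) {
  return (a & 0xff) | ((b & 0xff) << 8) | ((c & 0xff) << 16) | (d << 24);
}

console.log(unpackInt8x4(packInt8x4(-127, 0, 64, 127))); // ≈ [-1, 0, 0.504, 1]
```
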
/other/misc/files.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0hq/WebGPT/a83cdc8d46e8d140b55d87089482999580b64a3d/other/misc/files.png
--------------------------------------------------------------------------------
/other/misc/header.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0hq/WebGPT/a83cdc8d46e8d140b55d87089482999580b64a3d/other/misc/header.png
--------------------------------------------------------------------------------
/other/scratchpad.js:
--------------------------------------------------------------------------------
1 | class Instruction {
2 | constructor(device) {
3 | this.device = device;
4 | this.bufferDeletionStack = [];
5 | this.unloadDeletionStack = [];
6 |
7 | this.initBindGroups();
8 | }
9 |
10 | initBindGroup(layout, buffers, label = "") {
11 | return this.device.createBindGroup({
12 | layout,
13 | entries: buffers.map((buffer, i) => ({
14 | binding: i,
15 | resource: { buffer },
16 | })),
17 | label,
18 | });
19 | }
20 |
21 | initBuffer(ops, row, col = 1, noDelete = false) {
22 | const buffer = this.device.createBuffer({
23 | size: this.bufferSize(row, col),
24 | usage: ops.map((u) => bufferUsageDict[u]).reduce((a, b) => a | b),
25 | });
26 | if (!noDelete) this.bufferDeletionStack.push(buffer);
27 | else this.unloadDeletionStack.push(buffer);
28 | return buffer;
29 | }
30 |
31 | bufferSize(dimA, dimB = 1) {
32 | return dimA * dimB * Float32Array.BYTES_PER_ELEMENT; // f32 bytes; no alignment rounding needed here
33 | }
34 |
35 | initBindGroups() {
36 | const bg = (types) =>
37 | this.device.createBindGroupLayout({
38 | entries: types.map((entry, i) => ({
39 | binding: i,
40 | visibility: GPUShaderStage.COMPUTE,
41 | buffer: { type: entry },
42 | })),
43 | });
44 |
45 | this.r_r_r_Layout = bg(["read-only-storage", "read-only-storage", "read-only-storage"]);
46 | this.r_r_Layout = bg(["read-only-storage", "read-only-storage"]);
47 | this.r_Layout = bg(["read-only-storage"]);
48 | this.u_s_Layout = bg(["uniform", "storage"]);
49 | this.u_s_s_s_Layout = bg(["uniform", "storage", "storage", "storage"]);
50 | }
51 |
52 | initPipeline(code, bindGroupLayouts, label = "", constants = {}) {
53 | return this.device.createComputePipeline({
54 | layout: this.device.createPipelineLayout({ bindGroupLayouts }),
55 | compute: {
56 | module: this.device.createShaderModule({ code }),
57 | entryPoint: "main",
58 | constants,
59 | },
60 | label,
61 | });
62 | }
63 |
64 | unloadBuffers() {
65 | this.unloadDeletionStack.map((buffer) => buffer.destroy());
66 | this.unloadDeletionStack = [];
67 | }
68 |
69 | destroyBuffers() {
70 | this.bufferDeletionStack.map((buffer) => buffer.destroy());
71 | this.bufferDeletionStack = [];
72 | }
73 | }
74 |
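// Lifecycle note: the two deletion stacks give buffers different lifetimes.
// Buffers created with noDelete = false land on bufferDeletionStack and are
// freed after each pass by destroyBuffers(); noDelete = true buffers (e.g.
// model weights) land on unloadDeletionStack and survive until unloadBuffers().
// Subclasses like FastMatMul below build their pipeline once, then hand back
// { resultBuf, pass } descriptors for the caller to encode and submit.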
75 | class FastMatMul extends Instruction {
76 | constructor(device) {
77 | super(device);
78 | this.name = "fastMatMul";
79 | this.pipelineCache = new Map();
80 | }
81 |
82 | getPipeline(rows) {
83 | const div4 = rows % 4 === 0;
84 | const pipelineCacheKey = div4 ? "fastMatMulNoCheck" : "fastMatMul";
85 | if (this.pipelineCache.has(pipelineCacheKey)) {
86 | return this.pipelineCache.get(pipelineCacheKey);
87 | }
88 | const kernel = div4 ? this.fastMatMulNoCheck : this.fastMatMul;
89 | const pipeline = this.initPipeline(kernel, [this.u_s_Layout, this.r_r_Layout], pipelineCacheKey);
90 | this.pipelineCache.set(pipelineCacheKey, pipeline);
91 | return pipeline;
92 | }
93 |
94 | newInstance(rows, cols, shared, bufA, bufB) {
95 | const pipeline = this.getPipeline(rows);
96 | const uniformBuffer = this.initBuffer(["uniform", "copy_to"], 4);
97 | const resultBuf = this.initBuffer(["storage", "copy_from"], rows, cols);
98 | const opBindGroup = this.initBindGroup(this.u_s_Layout, [uniformBuffer, resultBuf], "opBindGroup");
99 | const inputBindGroup = this.initBindGroup(this.r_r_Layout, [bufA, bufB], "inputBindGroup");
100 | const workgroups = { x: wgSize(cols, 64), y: wgSize(rows, 32) };
101 | this.device.queue.writeBuffer(uniformBuffer, 0, new Uint32Array([rows, cols, Math.ceil(cols / 4), Math.ceil(shared / 4)]));
102 |
103 | return {
104 | resultBuf,
105 | pass: {
106 | pipeline,
107 | groups: [opBindGroup, inputBindGroup],
108 | workgroups,
109 | },
110 | };
111 | }
112 |
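// The uniform written above packs CMeta in declaration order:
// [M, N, ND4 = ceil(N / 4), KD4 = ceil(K / 4)], i.e. the output-column and
// shared dimensions in units of vec4. For the 10x10x10 case used by TestGPT
// below this is Uint32Array([10, 10, 3, 3]) and, assuming wgSize(dim, n) is
// Math.ceil(dim / n), a single 8x8 workgroup is dispatched.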
113 | fastMatMul = `
114 | struct CMeta {
115 | M: u32,
116 | N: u32,
117 | ND4: u32,
118 | KD4: u32,
119 | }
120 |
121 | @group(1) @binding(0) var<storage,read> array_a: array<vec4<f32>>;
122 | @group(1) @binding(1) var<storage,read> array_b: array<vec4<f32>>;
123 |
124 | @group(0) @binding(0) var<uniform> cmeta: CMeta;
125 | @group(0) @binding(1) var<storage,read_write> array_c: array<vec4<f32>>;
126 |
127 | @compute @workgroup_size(8, 8)
128 | fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
129 | var M: u32 = cmeta.M;
130 | var N: u32 = cmeta.N;
131 | var ND4: u32 = cmeta.ND4;
132 | var KD4: u32 = cmeta.KD4;
133 | var x: u32 = global_id.x;
134 | var y: u32 = global_id.y;
135 |
136 | if (x * 8 >= N || y * 4 >= M) {
137 | return;
138 | }
139 |
140 | var sum00: vec4<f32> = vec4<f32>();
141 | var sum01: vec4<f32> = vec4<f32>();
142 | var sum02: vec4<f32> = vec4<f32>();
143 | var sum03: vec4<f32> = vec4<f32>();
144 | var sum10: vec4<f32> = vec4<f32>();
145 | var sum11: vec4<f32> = vec4<f32>();
146 | var sum12: vec4<f32> = vec4<f32>();
147 | var sum13: vec4<f32> = vec4<f32>();
148 |
149 | for(var k: u32 = 0u; k < KD4; k = k + 1u) {
150 | var arow0: vec4<f32> = array_a[(y * 4u + 0u) * KD4 + k];
151 | var arow1: vec4<f32> = array_a[(y * 4u + 1u) * KD4 + k];
152 | var arow2: vec4<f32> = array_a[(y * 4u + 2u) * KD4 + k];
153 | var arow3: vec4<f32> = array_a[(y * 4u + 3u) * KD4 + k];
154 | var brow: vec4<f32>;
155 |
156 | brow = array_b[(k * 4u + 0u) * ND4 + x * 2u + 0u];
157 | sum00 = vec4<f32>(arow0.x) * brow + sum00;
158 | sum01 = vec4<f32>(arow1.x) * brow + sum01;
159 | sum02 = vec4<f32>(arow2.x) * brow + sum02;
160 | sum03 = vec4<f32>(arow3.x) * brow + sum03;
161 |
162 | brow = array_b[(k * 4u + 0u) * ND4 + x * 2u + 1u];
163 | sum10 = vec4<f32>(arow0.x) * brow + sum10;
164 | sum11 = vec4<f32>(arow1.x) * brow + sum11;
165 | sum12 = vec4<f32>(arow2.x) * brow + sum12;
166 | sum13 = vec4<f32>(arow3.x) * brow + sum13;
167 |
168 | brow = array_b[(k * 4u + 1u) * ND4 + x * 2u + 0u];
169 | sum00 = vec4<f32>(arow0.y) * brow + sum00;
170 | sum01 = vec4<f32>(arow1.y) * brow + sum01;
171 | sum02 = vec4<f32>(arow2.y) * brow + sum02;
172 | sum03 = vec4<f32>(arow3.y) * brow + sum03;
173 |
174 | brow = array_b[(k * 4u + 1u) * ND4 + x * 2u + 1u];
175 | sum10 = vec4<f32>(arow0.y) * brow + sum10;
176 | sum11 = vec4<f32>(arow1.y) * brow + sum11;
177 | sum12 = vec4<f32>(arow2.y) * brow + sum12;
178 | sum13 = vec4<f32>(arow3.y) * brow + sum13;
179 |
180 | brow = array_b[(k * 4u + 2u) * ND4 + x * 2u + 0u];
181 | sum00 = vec4<f32>(arow0.z) * brow + sum00;
182 | sum01 = vec4<f32>(arow1.z) * brow + sum01;
183 | sum02 = vec4<f32>(arow2.z) * brow + sum02;
184 | sum03 = vec4<f32>(arow3.z) * brow + sum03;
185 |
186 | brow = array_b[(k * 4u + 2u) * ND4 + x * 2u + 1u];
187 | sum10 = vec4<f32>(arow0.z) * brow + sum10;
188 | sum11 = vec4<f32>(arow1.z) * brow + sum11;
189 | sum12 = vec4<f32>(arow2.z) * brow + sum12;
190 | sum13 = vec4<f32>(arow3.z) * brow + sum13;
191 |
192 | brow = array_b[(k * 4u + 3u) * ND4 + x * 2u + 0u];
193 | sum00 = vec4<f32>(arow0.w) * brow + sum00;
194 | sum01 = vec4<f32>(arow1.w) * brow + sum01;
195 | sum02 = vec4<f32>(arow2.w) * brow + sum02;
196 | sum03 = vec4<f32>(arow3.w) * brow + sum03;
197 |
198 | brow = array_b[(k * 4u + 3u) * ND4 + x * 2u + 1u];
199 | sum10 = vec4<f32>(arow0.w) * brow + sum10;
200 | sum11 = vec4<f32>(arow1.w) * brow + sum11;
201 | sum12 = vec4<f32>(arow2.w) * brow + sum12;
202 | sum13 = vec4<f32>(arow3.w) * brow + sum13;
203 | }
204 |
205 | if (y * 4u + 0u < M) {
206 | array_c[x * 2u + 0u + (y * 4u + 0u) * ND4] = sum00;
207 | array_c[x * 2u + 1u + (y * 4u + 0u) * ND4] = sum10;
208 | }
209 | if (y * 4u + 1u < M) {
210 | array_c[x * 2u + 0u + (y * 4u + 1u) * ND4] = sum01;
211 | array_c[x * 2u + 1u + (y * 4u + 1u) * ND4] = sum11;
212 | }
213 | if (y * 4u + 2u < M) {
214 | array_c[x * 2u + 0u + (y * 4u + 2u) * ND4] = sum02;
215 | array_c[x * 2u + 1u + (y * 4u + 2u) * ND4] = sum12;
216 | }
217 | if (y * 4u + 3u < M) {
218 | array_c[x * 2u + 0u + (y * 4u + 3u) * ND4] = sum03;
219 | array_c[x * 2u + 1u + (y * 4u + 3u) * ND4] = sum13;
220 | }
221 | }
222 | `;
223 |
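// Register-tiled kernel: each invocation owns a 4-row x 8-column tile of C,
// held in eight vec4<f32> accumulators (sum00..sum13: first digit = column
// vec4 within the tile, second digit = row). Each k iteration consumes a 4x4
// block of A (arow0..arow3) and a 4x8 block of B (eight brow loads), keeping
// all arithmetic in vec4 form. With @workgroup_size(8, 8), one workgroup
// covers 32 rows x 64 columns, which is why newInstance dispatches
// wgSize(cols, 64) x wgSize(rows, 32) workgroups. The guarded stores at the
// end keep a partial bottom tile from writing past M rows.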
224 | fastMatMulNoCheck = `
225 | struct CMeta {
226 | M: u32,
227 | N: u32,
228 | ND4: u32,
229 | KD4: u32,
230 | }
231 |
232 | @group(1) @binding(0) var<storage,read> array_a: array<vec4<f32>>;
233 | @group(1) @binding(1) var<storage,read> array_b: array<vec4<f32>>;
234 |
235 | @group(0) @binding(0) var<uniform> cmeta: CMeta;
236 | @group(0) @binding(1) var<storage,read_write> array_c: array<vec4<f32>>;
237 |
238 | @compute @workgroup_size(8, 8)
239 | fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
240 | var M: u32 = cmeta.M;
241 | var N: u32 = cmeta.N;
242 | var ND4: u32 = cmeta.ND4;
243 | var KD4: u32 = cmeta.KD4;
244 | var x: u32 = global_id.x;
245 | var y: u32 = global_id.y;
246 |
247 | if (x * 8 >= N || y * 4 >= M) {
248 | return;
249 | }
250 |
251 | var sum00: vec4<f32> = vec4<f32>();
252 | var sum01: vec4<f32> = vec4<f32>();
253 | var sum02: vec4<f32> = vec4<f32>();
254 | var sum03: vec4<f32> = vec4<f32>();
255 | var sum10: vec4<f32> = vec4<f32>();
256 | var sum11: vec4<f32> = vec4<f32>();
257 | var sum12: vec4<f32> = vec4<f32>();
258 | var sum13: vec4<f32> = vec4<f32>();
259 |
260 | for(var k: u32 = 0u; k < KD4; k = k + 1u) {
261 | var arow0: vec4<f32> = array_a[(y * 4u + 0u) * KD4 + k];
262 | var arow1: vec4<f32> = array_a[(y * 4u + 1u) * KD4 + k];
263 | var arow2: vec4<f32> = array_a[(y * 4u + 2u) * KD4 + k];
264 | var arow3: vec4<f32> = array_a[(y * 4u + 3u) * KD4 + k];
265 | var brow: vec4<f32>;
266 |
267 | brow = array_b[(k * 4u + 0u) * ND4 + x * 2u + 0u];
268 | sum00 = vec4<f32>(arow0.x) * brow + sum00;
269 | sum01 = vec4<f32>(arow1.x) * brow + sum01;
270 | sum02 = vec4<f32>(arow2.x) * brow + sum02;
271 | sum03 = vec4<f32>(arow3.x) * brow + sum03;
272 |
273 | brow = array_b[(k * 4u + 0u) * ND4 + x * 2u + 1u];
274 | sum10 = vec4<f32>(arow0.x) * brow + sum10;
275 | sum11 = vec4<f32>(arow1.x) * brow + sum11;
276 | sum12 = vec4<f32>(arow2.x) * brow + sum12;
277 | sum13 = vec4<f32>(arow3.x) * brow + sum13;
278 |
279 | brow = array_b[(k * 4u + 1u) * ND4 + x * 2u + 0u];
280 | sum00 = vec4<f32>(arow0.y) * brow + sum00;
281 | sum01 = vec4<f32>(arow1.y) * brow + sum01;
282 | sum02 = vec4<f32>(arow2.y) * brow + sum02;
283 | sum03 = vec4<f32>(arow3.y) * brow + sum03;
284 |
285 | brow = array_b[(k * 4u + 1u) * ND4 + x * 2u + 1u];
286 | sum10 = vec4<f32>(arow0.y) * brow + sum10;
287 | sum11 = vec4<f32>(arow1.y) * brow + sum11;
288 | sum12 = vec4<f32>(arow2.y) * brow + sum12;
289 | sum13 = vec4<f32>(arow3.y) * brow + sum13;
290 |
291 | brow = array_b[(k * 4u + 2u) * ND4 + x * 2u + 0u];
292 | sum00 = vec4<f32>(arow0.z) * brow + sum00;
293 | sum01 = vec4<f32>(arow1.z) * brow + sum01;
294 | sum02 = vec4<f32>(arow2.z) * brow + sum02;
295 | sum03 = vec4<f32>(arow3.z) * brow + sum03;
296 |
297 | brow = array_b[(k * 4u + 2u) * ND4 + x * 2u + 1u];
298 | sum10 = vec4<f32>(arow0.z) * brow + sum10;
299 | sum11 = vec4<f32>(arow1.z) * brow + sum11;
300 | sum12 = vec4<f32>(arow2.z) * brow + sum12;
301 | sum13 = vec4<f32>(arow3.z) * brow + sum13;
302 |
303 | brow = array_b[(k * 4u + 3u) * ND4 + x * 2u + 0u];
304 | sum00 = vec4<f32>(arow0.w) * brow + sum00;
305 | sum01 = vec4<f32>(arow1.w) * brow + sum01;
306 | sum02 = vec4<f32>(arow2.w) * brow + sum02;
307 | sum03 = vec4<f32>(arow3.w) * brow + sum03;
308 |
309 | brow = array_b[(k * 4u + 3u) * ND4 + x * 2u + 1u];
310 | sum10 = vec4<f32>(arow0.w) * brow + sum10;
311 | sum11 = vec4<f32>(arow1.w) * brow + sum11;
312 | sum12 = vec4<f32>(arow2.w) * brow + sum12;
313 | sum13 = vec4<f32>(arow3.w) * brow + sum13;
314 | }
315 |
316 | array_c[x * 2u + 0u + (y * 4u + 0u) * ND4] = sum00;
317 | array_c[x * 2u + 1u + (y * 4u + 0u) * ND4] = sum10;
318 | array_c[x * 2u + 0u + (y * 4u + 1u) * ND4] = sum01;
319 | array_c[x * 2u + 1u + (y * 4u + 1u) * ND4] = sum11;
320 | array_c[x * 2u + 0u + (y * 4u + 2u) * ND4] = sum02;
321 | array_c[x * 2u + 1u + (y * 4u + 2u) * ND4] = sum12;
322 | array_c[x * 2u + 0u + (y * 4u + 3u) * ND4] = sum03;
323 | array_c[x * 2u + 1u + (y * 4u + 3u) * ND4] = sum13;
324 | }
325 | `;
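// Same inner loop as fastMatMul above; the only difference is that the final
// stores are unguarded, so this variant is only correct when M % 4 == 0.
// getPipeline() selects it by checking rows % 4 === 0, which saves four
// per-invocation row-bounds branches.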
326 | }
327 |
328 | class TestGPT {
329 | constructor(folder, type, doAttentionCache = false) {
330 | this.folder = folder;
331 | this.tokenizerType = type;
332 | this.initialized = false;
333 |
334 | this.device;
335 | this.model;
336 | this.tokenizer;
337 | this.params;
338 | this.minBufferOffset = 1;
339 | this.doAttentionCache = doAttentionCache;
340 |
341 | this.defaultPrompt;
342 | this.defaultTopK;
343 | this.defaultTemperature;
344 | this.defaultTokens;
345 |
346 | this.bufferDeletionStack = [];
347 | this.unloadDeletionStack = [];
348 | }
349 |
350 | async initialize() {
351 | if (this.initialized) return console.error("Model already initialized");
352 | if (!navigator.gpu) throw new Error("WebGPU is not supported");
353 |
354 | const adapter = await navigator.gpu.requestAdapter();
355 | this.device = await adapter.requestDevice();
356 |
357 | this.matMulOperation = new FastMatMul(this.device);
358 |
359 | const dimM = 10;
360 | const dimN = 10;
361 | const demo = new Float32Array(dimM * dimN);
362 | for (let i = 0; i < dimM * dimN; i++) demo[i] = 1;
363 | const weights1 = this.initTensor(demo, [dimM, dimN], ["storage", "copy_from"]);
364 | // const weights2 = this.initTensor(demo, [dimM, dimN], ["storage", "copy_from"]);
365 | this.inputBuffer = this.initBuffer(["storage", "copy_from", "copy_to"], dimM, dimN);
366 |
367 | this.computePasses = [];
368 | let intermediateBuffer = this.inputBuffer;
369 | for (let i = 0; i < 10; i++) {
370 | let { pass, resultBuf } = this.matMulOperation.newInstance(10, 10, 10, intermediateBuffer, weights1);
371 | intermediateBuffer = resultBuf;
372 | this.computePasses.push(pass);
373 | }
374 | this.resultBuffer = intermediateBuffer;
375 | this.outputBuffer = this.initBuffer(["map_read", "copy_to"], dimM, dimN);
376 |
377 | this.initialized = true;
378 | }
379 |
380 | async test() {
381 | const dimM = 10;
382 | const dimN = 10;
383 | const matrixA = new Float32Array(dimM * dimN);
384 | for (let i = 0; i < dimM * dimN; i++) matrixA[i] = i * 0.1;
385 |
386 | this.device.queue.writeBuffer(this.inputBuffer, 0, matrixA);
387 |
388 | const commandEncoder = this.device.createCommandEncoder();
389 | for (const pass of this.computePasses) {
390 | const passEncoder = commandEncoder.beginComputePass();
391 | passEncoder.setPipeline(pass.pipeline);
392 | for (let i = 0; i < pass.groups.length; i++) passEncoder.setBindGroup(i, pass.groups[i]);
393 | passEncoder.dispatchWorkgroups(pass.workgroups.x, pass.workgroups.y);
394 | passEncoder.end();
395 | }
396 | commandEncoder.copyBufferToBuffer(this.resultBuffer, 0, this.outputBuffer, 0, this.bufferSize(dimM, dimN));
397 | this.device.queue.submit([commandEncoder.finish()]);
398 |
399 | await this.outputBuffer.mapAsync(GPUMapMode.READ);
400 | const output = this.outputBuffer.getMappedRange();
401 | const outputArray = new Float32Array(output).slice(0); // Prevent destruction.
402 | console.log(outputArray, formatAsMatrix(outputArray, dimM, dimN));
403 |
404 | this.destroyBuffers();
405 | }
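// Readback pattern: WebGPU forbids MAP_READ on STORAGE buffers, so the result
// is copied into the map_read outputBuffer first, mapped, then copied out via
// slice(0) so the values survive once destroyBuffers() destroys the mapped
// buffer.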
406 |
407 | initBindGroup(layout, buffers) {
408 | return this.device.createBindGroup({
409 | layout,
410 | entries: buffers.map((buffer, i) => ({
411 | binding: i,
412 | resource: { buffer },
413 | })),
414 | });
415 | }
416 |
417 | initOutputBuffer(commandEncoder, buffer, row, col) {
418 | const outputBuffer = this.initBuffer(["map_read", "copy_to"], row, col);
419 | commandEncoder.copyBufferToBuffer(buffer, 0, outputBuffer, 0, this.bufferSize(row, col));
420 | return outputBuffer;
421 | }
422 |
423 | initBuffer(ops, row, col = 1, noDelete = false) {
424 | const buffer = this.device.createBuffer({
425 | size: this.bufferSize(row, col),
426 | usage: ops.map((u) => bufferUsageDict[u]).reduce((a, b) => a | b),
427 | });
428 | if (!noDelete) this.bufferDeletionStack.push(buffer);
429 | else this.unloadDeletionStack.push(buffer);
430 | return buffer;
431 | }
432 |
433 | initTensor(data, dims, ops) {
434 | const buffer = this.device.createBuffer({
435 | size: this.bufferSize(dims[0], dims[1], dims[2] || 1),
436 | usage: ops.map((u) => bufferUsageDict[u]).reduce((a, b) => a | b),
437 | mappedAtCreation: true,
438 | });
439 | const array = new Float32Array(buffer.getMappedRange());
440 | array.set(data);
441 | buffer.unmap();
442 | this.unloadDeletionStack.push(buffer);
443 | return buffer;
444 | }
445 |
446 | bufferSize(dimX, dimY = 1, dimZ = 1) {
447 | return Math.ceil((dimX * dimY * dimZ * Float32Array.BYTES_PER_ELEMENT) / this.minBufferOffset) * this.minBufferOffset;
448 | }
449 |
450 | unloadBuffers() {
451 | this.unloadDeletionStack.map((buffer) => buffer.destroy());
452 | this.unloadDeletionStack = [];
453 | }
454 |
455 | destroyBuffers() {
456 | this.bufferDeletionStack.map((buffer) => buffer.destroy());
457 | this.bufferDeletionStack = [];
458 | }
459 |
460 | initBindGroups() {
461 | const bg = (types) =>
462 | this.device.createBindGroupLayout({
463 | entries: types.map((entry, i) => ({
464 | binding: i,
465 | visibility: GPUShaderStage.COMPUTE,
466 | buffer: { type: entry },
467 | })),
468 | });
469 |
470 | this.r_r_r_Layout = bg(["read-only-storage", "read-only-storage", "read-only-storage"]);
471 | this.r_r_Layout = bg(["read-only-storage", "read-only-storage"]);
472 | this.r_Layout = bg(["read-only-storage"]);
473 | this.u_s_Layout = bg(["uniform", "storage"]);
474 | this.u_s_s_s_Layout = bg(["uniform", "storage", "storage", "storage"]);
475 | }
476 |
477 | async initPipelines() {
478 | const p = (code, bindGroupLayouts) => {
479 | return this.device.createComputePipelineAsync({
480 | layout: this.device.createPipelineLayout({ bindGroupLayouts }),
481 | compute: {
482 | module: this.device.createShaderModule({ code }),
483 | entryPoint: "main",
484 | },
485 | });
486 | };
487 | }
488 | }
489 |
490 | async function test() {
491 | const GPU = new TestGPT();
492 | await GPU.initialize();
493 | await GPU.test();
494 | }
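// Presumably invoked by hand from the devtools console, e.g.:
//   test().catch(console.error);
// It needs a WebGPU-capable browser; initialize() throws "WebGPU is not
// supported" otherwise.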
495 |
496 | /*
497 |
498 |
499 | Fast row-add shader, kept here for reference:
500 | struct BMeta {
501 | M: u32,
502 | N: u32,
503 | ND4: u32,
504 | }
505 |
506 | @group(1) @binding(0) var<storage,read> array_matrix: array<vec4<f32>>;
507 | @group(1) @binding(1) var<storage,read> array_bias: array<vec4<f32>>;
508 | @group(0) @binding(0) var<uniform> bmeta: BMeta;
509 | @group(0) @binding(1) var<storage,read_write> array_output: array<vec4<f32>>;
510 |
511 | @compute @workgroup_size(8,8)
512 | fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
513 | var col: u32 = global_id.x;
514 | var row: u32 = global_id.y;
515 | var ND4: u32 = bmeta.ND4;
516 | var M: u32 = bmeta.M;
517 |
518 | if (row >= M || col >= ND4) {
519 | return;
520 | }
521 |
522 | array_output[row * ND4 + col] = array_matrix[row * ND4 + col] + array_bias[col];
523 | }
524 |
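Dispatch notes for the row-add shader above: each invocation adds one vec4 of
bias to one vec4 of the matrix, and array_bias is indexed by column only, so
the same bias row is broadcast down all M rows. A matching dispatch would be
dispatchWorkgroups(Math.ceil(ND4 / 8), Math.ceil(M / 8)) with ND4 = ceil(N / 4):
x walks the vec4 columns, y walks the rows, and the early return trims the
ragged edge.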
525 | class FastMatMulBlockClass extends Block {
526 | constructor() {
527 | super();
528 | this.name = "fastMatMul";
529 | this.pipelineCache = new Map();
530 | }
531 |
532 | getPipeline(rows) {
533 | const div4 = rows % 4 === 0;
534 | const pipelineCacheKey = div4 ? "fastMatMulNoCheck" : "fastMatMul";
535 | if (this.pipelineCache.has(pipelineCacheKey)) return this.pipelineCache.get(pipelineCacheKey);
536 | const kernel = div4 ? this.fastMatMulNoCheck : this.fastMatMul;
537 | const pipeline = this.initPipeline(kernel, [this.u_s_Layout, this.r_r_Layout], `${this.name}_Pipeline_${pipelineCacheKey}`);
538 | this.pipelineCache.set(pipelineCacheKey, pipeline);
539 | return pipeline;
540 | }
541 |
542 | newInstance(rows, cols, shared, bufA, bufB) {
543 | const pipeline = this.getPipeline(rows);
544 | const uniformBuffer = this.initBuffer(["uniform", "copy_to"], [4]);
545 | const resultBuffer = this.initBuffer(["storage", "copy_from"], [rows, cols]);
546 | const opBindGroup = this.initBindGroup(this.u_s_Layout, [uniformBuffer, resultBuffer], `${this.name}_OpG`);
547 | const inputBindGroup = this.initBindGroup(this.r_r_Layout, [bufA, bufB], `${this.name}_InputG`);
548 | const workgroups = { x: wgSize(cols, 64), y: wgSize(rows, 32) };
549 | this.device.queue.writeBuffer(uniformBuffer, 0, new Uint32Array([rows, cols, Math.ceil(cols / 4), Math.ceil(shared / 4)]));
550 |
551 | return {
552 | resultBuffer,
553 | passes: [
554 | {
555 | flag: "compute",
556 | pipeline,
557 | groups: [opBindGroup, inputBindGroup],
558 | workgroups,
559 | },
560 | ],
561 | };
562 | }
563 |
564 | fastMatMul = `
565 | struct CMeta {
566 | M: u32,
567 | N: u32,
568 | ND4: u32,
569 | KD4: u32,
570 | }
571 |
572 | @group(1) @binding(0) var<storage,read> array_a: array<vec4<f32>>;
573 | @group(1) @binding(1) var<storage,read> array_b: array<vec4<f32>>;
574 |
575 | @group(0) @binding(0) var<uniform> cmeta: CMeta;
576 | @group(0) @binding(1) var<storage,read_write> array_c: array<vec4<f32>>;
577 |
578 | @compute @workgroup_size(8, 8)
579 | fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
580 | var M: u32 = cmeta.M;
581 | var N: u32 = cmeta.N;
582 | var ND4: u32 = cmeta.ND4;
583 | var KD4: u32 = cmeta.KD4;
584 | var x: u32 = global_id.x;
585 | var y: u32 = global_id.y;
586 |
587 | if (x * 8 >= N || y * 4 >= M) {
588 | return;
589 | }
590 |
591 | var sum00: vec4<f32> = vec4<f32>();
592 | var sum01: vec4<f32> = vec4<f32>();
593 | var sum02: vec4<f32> = vec4<f32>();
594 | var sum03: vec4<f32> = vec4<f32>();
595 | var sum10: vec4<f32> = vec4<f32>();
596 | var sum11: vec4<f32> = vec4<f32>();
597 | var sum12: vec4<f32> = vec4<f32>();
598 | var sum13: vec4<f32> = vec4<f32>();
599 |
600 | for(var k: u32 = 0u; k < KD4; k = k + 1u) {
601 | var arow0: vec4<f32> = array_a[(y * 4u + 0u) * KD4 + k];
602 | var arow1: vec4<f32> = array_a[(y * 4u + 1u) * KD4 + k];
603 | var arow2: vec4<f32> = array_a[(y * 4u + 2u) * KD4 + k];
604 | var arow3: vec4<f32> = array_a[(y * 4u + 3u) * KD4 + k];
605 | var brow: vec4<f32>;
606 |
607 | brow = array_b[(k * 4u + 0u) * ND4 + x * 2u + 0u];
608 | sum00 = vec4<f32>(arow0.x) * brow + sum00;
609 | sum01 = vec4<f32>(arow1.x) * brow + sum01;
610 | sum02 = vec4<f32>(arow2.x) * brow + sum02;
611 | sum03 = vec4<f32>(arow3.x) * brow + sum03;
612 |
613 | brow = array_b[(k * 4u + 0u) * ND4 + x * 2u + 1u];
614 | sum10 = vec4<f32>(arow0.x) * brow + sum10;
615 | sum11 = vec4<f32>(arow1.x) * brow + sum11;
616 | sum12 = vec4<f32>(arow2.x) * brow + sum12;
617 | sum13 = vec4<f32>(arow3.x) * brow + sum13;
618 |
619 | brow = array_b[(k * 4u + 1u) * ND4 + x * 2u + 0u];
620 | sum00 = vec4<f32>(arow0.y) * brow + sum00;
621 | sum01 = vec4<f32>(arow1.y) * brow + sum01;
622 | sum02 = vec4<f32>(arow2.y) * brow + sum02;
623 | sum03 = vec4<f32>(arow3.y) * brow + sum03;
624 |
625 | brow = array_b[(k * 4u + 1u) * ND4 + x * 2u + 1u];
626 | sum10 = vec4<f32>(arow0.y) * brow + sum10;
627 | sum11 = vec4<f32>(arow1.y) * brow + sum11;
628 | sum12 = vec4<f32>(arow2.y) * brow + sum12;
629 | sum13 = vec4<f32>(arow3.y) * brow + sum13;
630 |
631 | brow = array_b[(k * 4u + 2u) * ND4 + x * 2u + 0u];
632 | sum00 = vec4<f32>(arow0.z) * brow + sum00;
633 | sum01 = vec4<f32>(arow1.z) * brow + sum01;
634 | sum02 = vec4<f32>(arow2.z) * brow + sum02;
635 | sum03 = vec4<f32>(arow3.z) * brow + sum03;
636 |
637 | brow = array_b[(k * 4u + 2u) * ND4 + x * 2u + 1u];
638 | sum10 = vec4<f32>(arow0.z) * brow + sum10;
639 | sum11 = vec4<f32>(arow1.z) * brow + sum11;
640 | sum12 = vec4<f32>(arow2.z) * brow + sum12;
641 | sum13 = vec4<f32>(arow3.z) * brow + sum13;
642 |
643 | brow = array_b[(k * 4u + 3u) * ND4 + x * 2u + 0u];
644 | sum00 = vec4<f32>(arow0.w) * brow + sum00;
645 | sum01 = vec4<f32>(arow1.w) * brow + sum01;
646 | sum02 = vec4<f32>(arow2.w) * brow + sum02;
647 | sum03 = vec4<f32>(arow3.w) * brow + sum03;
648 |
649 | brow = array_b[(k * 4u + 3u) * ND4 + x * 2u + 1u];
650 | sum10 = vec4<f32>(arow0.w) * brow + sum10;
651 | sum11 = vec4<f32>(arow1.w) * brow + sum11;
652 | sum12 = vec4<f32>(arow2.w) * brow + sum12;
653 | sum13 = vec4<f32>(arow3.w) * brow + sum13;