├── .github └── workflows │ ├── ci.yml │ ├── ci_gpu.yml │ └── ci_tests.yml ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── LICENSE ├── Makefile ├── README.md ├── dev ├── cpu │ └── matmul_forward.c ├── cuda │ ├── Makefile │ ├── README.md │ ├── adamw.cu │ ├── attention_backward.cu │ ├── attention_forward.cu │ ├── benchmark_on_modal.py │ ├── classifier_fused.cu │ ├── common.h │ ├── crossentropy_forward.cu │ ├── crossentropy_softmax_backward.cu │ ├── encoder_backward.cu │ ├── encoder_forward.cu │ ├── fused_residual_forward.cu │ ├── gelu_backward.cu │ ├── gelu_forward.cu │ ├── global_norm.cu │ ├── layernorm_backward.cu │ ├── layernorm_forward.cu │ ├── matmul_backward.cu │ ├── matmul_backward_bias.cu │ ├── matmul_forward.cu │ ├── nccl_all_reduce.cu │ ├── permute.cu │ ├── residual_forward.cu │ ├── softmax_forward.cu │ └── trimat_forward.cu ├── data │ ├── README.md │ ├── data_common.py │ ├── edu_fineweb.sh │ ├── fineweb.py │ ├── fineweb.sh │ ├── hellaswag.py │ ├── mmlu.py │ ├── tinyshakespeare.py │ └── tinystories.py ├── download_starter_pack.sh ├── eval │ ├── README.md │ ├── export_hf.py │ ├── run_eval.sh │ └── summarize_eval.py ├── loss_checker_ci.py ├── test │ ├── Makefile │ ├── device_file_io.cu │ ├── test_dataloader.c │ └── test_outlier_detector.c ├── unistd.h └── vislog.ipynb ├── doc └── layernorm │ ├── layernorm.c │ ├── layernorm.md │ └── layernorm.py ├── llmc ├── CMakeLists.txt ├── adamw.cuh ├── attention.cuh ├── cublas_common.h ├── cuda_common.h ├── cuda_utils.cuh ├── cudnn_att.cpp ├── cudnn_att.h ├── dataloader.h ├── encoder.cuh ├── fused_classifier.cuh ├── gelu.cuh ├── global_norm.cuh ├── layernorm.cuh ├── logger.h ├── matmul.cuh ├── mfu.h ├── outlier_detector.h ├── rand.h ├── sampler.h ├── schedulers.h ├── tokenizer.h ├── utils.h └── zero.cuh ├── llmcpp ├── CMakeLists.txt ├── README.md ├── cuda_profile_util.hpp ├── gpt.hpp ├── gpt2.hpp ├── gpt_optim.cpp ├── gpt_optim.cu ├── gpt_test.cpp ├── gpt_test.cu ├── nn.hpp ├── nn_test.cpp ├── nn_test.cu ├── optim.hpp ├── optim_test.cpp ├── tensor_types.hpp ├── tensor_util.hpp ├── test_eigen_cpu.cpp ├── test_eigen_gpu.cu ├── test_gpt2.cpp ├── train_gpt2.cpp └── train_gpt2.cu ├── profile_gpt2.cu ├── profile_gpt2cu.py ├── requirements.txt ├── scripts ├── README.md ├── multi_node │ ├── run_gpt2_124M_fs.sbatch │ ├── run_gpt2_124M_mpi.sh │ └── run_gpt2_124M_tcp.sbatch ├── pyrun_gpt2_124M.sh ├── run_gpt2_124M.sh ├── run_gpt2_1558M.sh ├── run_gpt2_350M.sh ├── run_gpt2_774M.sh └── run_gpt3_125M.sh ├── test_gpt2.c ├── test_gpt2.cu ├── test_gpt2_fp32.cu ├── train_gpt2.c ├── train_gpt2.cu ├── train_gpt2.py ├── train_gpt2_fp32.cu └── train_llama3.py /.github/workflows/ci_gpu.yml: -------------------------------------------------------------------------------- 1 | name: GPU Builds and Tests 2 | 3 | on: 4 | create: 5 | workflow_dispatch: 6 | push: 7 | branches: 8 | - master 9 | pull_request: 10 | branches: 11 | - master 12 | 13 | jobs: 14 | build-and-test-gpu: 15 | runs-on: ubicloud-gpu-standard-1-latest 16 | 17 | steps: 18 | - name: Checkout code 19 | uses: actions/checkout@v4 20 | 21 | - name: Install OpenMP 22 | run: sudo apt-get update && sudo apt-get install -y libomp-dev 23 | 24 | - name: Install dependencies 25 | run: pip install -r requirements.txt 26 | 27 | - name: Run preprocessing 28 | run: python dev/data/tinyshakespeare.py 29 | 30 | - name: Train model 31 | run: python train_gpt2.py 32 | 33 | - name: Compile training and testing program 34 | run: make test_gpt2cu train_gpt2cu test_gpt2fp32cu train_gpt2fp32cu 35 | 36 | - name: Train model 
(With OpenMP) 37 | run: OMP_NUM_THREADS=8 ./train_gpt2cu 38 | 39 | - name: Train model (FP32) with gpt2_124M.bin 40 | run: | 41 | PRECISION=FP32 make train_gpt2cu 42 | ./train_gpt2cu -b 1 -t 64 -d 256 -l 0.0001 -v 200 -s 200 -a 1 -x 10 -r 0 -f 0 -e "gpt2_124M.bin" 43 | 44 | - name: Test for percent loss differential for FP32 45 | run: | 46 | PRECISION=FP32 make train_gpt2cu 47 | ./train_gpt2cu -b 1 -t 64 -d 256 -l 0.0001 -v 200 -s 200 -a 1 -x 10 -r 0 -f 0 -e "gpt2_124M.bin" > train_gpt2cu_fp32_precision.txt 48 | python dev/loss_checker_ci.py -f train_gpt2cu_fp32_precision.txt -s 20 -e 28 -a 5.0 49 | 50 | - name: Build FP32 precision 51 | run: PRECISION=FP32 make test_gpt2cu profile_gpt2cu 52 | 53 | - name: Run default 54 | run: ./test_gpt2cu 55 | 56 | - name: Run no recompute GeLU 57 | run: ./test_gpt2cu -r 0 58 | 59 | - name: Run recompute LN 60 | run: ./test_gpt2cu -r 2 61 | 62 | - name: Build BF16 precision 63 | run: PRECISION=BF16 make train_gpt2cu test_gpt2cu profile_gpt2cu 64 | 65 | - name: Run default 66 | run: ./test_gpt2cu 67 | 68 | - name: Run no recompute GeLU 69 | run: ./test_gpt2cu -r 0 70 | 71 | - name: Run no master weights 72 | run: ./test_gpt2cu -w 0 73 | 74 | - name: Run recompute LN 75 | run: ./test_gpt2cu -r 2 76 | 77 | - name: Train model fp32 (With OpenMP) 78 | run: OMP_NUM_THREADS=8 ./train_gpt2fp32cu 79 | 80 | - name: Execute testing program (With OpenMP) 81 | run: OMP_NUM_THREADS=8 ./test_gpt2cu 82 | 83 | - name: Execute testing program fp32 (With OpenMP) 84 | run: OMP_NUM_THREADS=8 ./test_gpt2fp32cu 85 | 86 | - name: Compile training and testing program without OpenMP 87 | run: NO_OMP=1 make test_gpt2cu train_gpt2cu test_gpt2fp32cu train_gpt2fp32cu 88 | 89 | - name: Train model (No OpenMP) 90 | run: NO_OMP=1 ./train_gpt2cu 91 | 92 | - name: Train model fp32 (No OpenMP) 93 | run: NO_OMP=1 ./train_gpt2fp32cu 94 | 95 | - name: Execute testing program (No OpenMP) 96 | run: ./test_gpt2cu -b 32 97 | 98 | - name: Execute testing program fp32 (No OpenMP) 99 | run: ./test_gpt2fp32cu 100 | 101 | - name: Install cuDNN-frontend 102 | run: 103 | git clone https://github.com/NVIDIA/cudnn-frontend.git 104 | 105 | - name: Build with cuDNN 106 | run: USE_CUDNN=1 make test_gpt2cu train_gpt2cu test_gpt2fp32cu train_gpt2fp32cu 107 | 108 | - name: Train model with cuDNN 109 | run: ./train_gpt2cu 110 | 111 | - name: Train model fp32 with cuDNN 112 | run: ./train_gpt2fp32cu 113 | 114 | - name: Execute testing program with cuDNN 115 | run: ./test_gpt2cu 116 | 117 | - name: Execute testing program fp32 with cuDNN 118 | run: ./test_gpt2fp32cu 119 | 120 | unit-tests-gpu: 121 | runs-on: ubicloud-gpu-standard-1-latest 122 | 123 | steps: 124 | - name: Checkout code 125 | uses: actions/checkout@v4 126 | 127 | - name: Test Device<->File IO 128 | run: cd dev/test && nvcc -o device_file_io device_file_io.cu && ./device_file_io 129 | -------------------------------------------------------------------------------- /.github/workflows/ci_tests.yml: -------------------------------------------------------------------------------- 1 | name: Unit, Static and other Tests 2 | 3 | on: 4 | create: 5 | workflow_dispatch: 6 | push: 7 | branches: 8 | - master 9 | pull_request: 10 | branches: 11 | - master 12 | 13 | jobs: 14 | dataloader_test: 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - name: Checkout code 19 | uses: actions/checkout@v4 20 | 21 | - name: test the dataloader without / with sanitize address 22 | run: | 23 | cd dev/test 24 | make PRECISION=BF16 test_dataloader 25 | ./test_dataloader 26 | 
make clean 27 | make PRECISION=BF16 TEST_CFLAGS="-fsanitize=address -fno-omit-frame-pointer" test_dataloader 28 | ./test_dataloader 29 | 30 | ptx_and_sass_files: 31 | runs-on: ubuntu-latest 32 | container: 33 | image: nvidia/cuda:12.4.1-devel-ubuntu22.04 34 | 35 | steps: 36 | - name: Checkout code 37 | uses: actions/checkout@v4 38 | 39 | - name: Install OpenMP and OpenMPI 40 | run: apt-get update && apt-get install -y libomp-dev libopenmpi-dev 41 | 42 | - name: Generate ptx/sass files and upload them to persistent storage 43 | run: | 44 | mkdir -p dev/cuda/ptx_sass_logs 45 | make train_gpt2cu 46 | cuobjdump --dump-ptx train_gpt2cu > dev/cuda/train_gpt2cu.ptx 47 | cuobjdump --dump-sass train_gpt2cu > dev/cuda/train_gpt2cu.sass 48 | cd dev/cuda 49 | make -j all_ptx 50 | make -j all_sass 51 | cp *.ptx ptx_sass_logs/ 52 | cp *.sass ptx_sass_logs/ 53 | ls ptx_sass_logs/ 54 | 55 | - name: Generate ptx/sass files for A100 and upload them to persistent storage 56 | run: | 57 | mkdir -p dev/cuda/ptx_sass_logs_A100 58 | make train_gpt2cu GPU_COMPUTE_CAPABILITY=80 59 | cuobjdump --dump-ptx train_gpt2cu > dev/cuda/train_gpt2cu.ptx 60 | cuobjdump --dump-sass train_gpt2cu > dev/cuda/train_gpt2cu.sass 61 | cd dev/cuda 62 | make -j GPU_COMPUTE_CAPABILITY=80 all_ptx 63 | make -j GPU_COMPUTE_CAPABILITY=80 all_sass 64 | cp *.ptx ptx_sass_logs_A100/ 65 | cp *.sass ptx_sass_logs_A100/ 66 | ls ptx_sass_logs_A100/ 67 | 68 | - name: Generate ptx/sass files for H100 and upload them to persistent storage 69 | run: | 70 | mkdir -p dev/cuda/ptx_sass_logs_H100 71 | make train_gpt2cu GPU_COMPUTE_CAPABILITY=90 72 | cuobjdump --dump-ptx train_gpt2cu > dev/cuda/train_gpt2cu.ptx 73 | cuobjdump --dump-sass train_gpt2cu > dev/cuda/train_gpt2cu.sass 74 | cd dev/cuda 75 | make -j GPU_COMPUTE_CAPABILITY=90 all_ptx 76 | make -j GPU_COMPUTE_CAPABILITY=90 all_sass 77 | cp *.ptx ptx_sass_logs_H100/ 78 | cp *.sass ptx_sass_logs_H100/ 79 | ls ptx_sass_logs_H100/ 80 | 81 | - name: Upload ptx/sass files 82 | uses: actions/upload-artifact@v4 83 | with: 84 | name: ptx_sass_files 85 | path: dev/cuda/ptx_sass_logs/ 86 | retention-days: 30 # days to retain 87 | 88 | - name: Upload ptx/sass files for A100 89 | uses: actions/upload-artifact@v4 90 | with: 91 | name: ptx_sass_files_A100 92 | path: dev/cuda/ptx_sass_logs_A100/ 93 | retention-days: 30 # days to retain 94 | 95 | - name: Upload ptx/sass files for H100 96 | uses: actions/upload-artifact@v4 97 | with: 98 | name: ptx_sass_files_H100 99 | path: dev/cuda/ptx_sass_logs_H100/ 100 | retention-days: 30 # days to retain -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # dot files and such 2 | .vscode 3 | .venv 4 | 5 | # .bin files generated by Python 6 | *.bin 7 | 8 | # data directories 9 | dev/data/__pycache__/ 10 | dev/data/fineweb10B/ 11 | dev/data/hellaswag/ 12 | dev/data/mmlu/ 13 | dev/data/tinyshakespeare/ 14 | dev/data/tinystories/ 15 | 16 | # binaries 17 | test_gpt2 18 | test_gpt2cu 19 | test_gpt2fp32cu 20 | train_gpt2 21 | train_gpt2cu 22 | train_gpt2fp32cu 23 | profile_gpt2cu 24 | dev/cuda/*_forward 25 | dev/cuda/*_backward 26 | dev/cuda/classifier_fused 27 | dev/cuda/adamw 28 | dev/cuda/matmul_backward_bias 29 | dev/cuda/nccl_all_reduce 30 | dev/cuda/global_norm 31 | *.obj 32 | *.exe 33 | *.o 34 | 35 | # log files 36 | *.log 37 | 38 | # clion files 39 | .idea 40 | cmake-build-* 41 | build 42 | 
-------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third_party/abseil-cpp"] 2 | path = third_party/abseil-cpp 3 | url = https://github.com/abseil/abseil-cpp.git 4 | [submodule "third_party/eigen"] 5 | path = third_party/eigen 6 | url = https://gitlab.com/libeigen/eigen.git 7 | [submodule "third_party/googletest"] 8 | path = third_party/googletest 9 | url = https://github.com/google/googletest.git 10 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.14) 2 | project(llm.cpp LANGUAGES C CXX CUDA) 3 | 4 | set(CMAKE_CXX_STANDARD 17) 5 | set(CMAKE_CUDA_STANDARD 17) 6 | set(BUILD_SHARED_LIBS OFF) 7 | # add_compile_options(-Ofast -march=native) 8 | # set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Ofast -march=native") 9 | # set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Ofast -march=native") 10 | 11 | find_program(CCACHE_PROGRAM ccache) 12 | if (CCACHE_PROGRAM) 13 | set(CMAKE_C_COMPILER_LAUNCHER ccache) 14 | set(CMAKE_CXX_COMPILER_LAUNCHER ccache) 15 | set(CMAKE_CUDA_COMPILER_LAUNCHER ccache) 16 | endif () 17 | 18 | enable_testing() 19 | include_directories(.) 20 | 21 | # Abseil 22 | set(ABSL_PROPAGATE_CXX_STD ON) 23 | add_subdirectory(third_party/abseil-cpp) 24 | 25 | # GoogleTest 26 | add_subdirectory(third_party/googletest) 27 | 28 | # Eigen 29 | set(EIGEN3_INCLUDE_DIR third_party/eigen) 30 | add_definitions(-DEIGEN_DONT_PARALLELIZE) 31 | #add_definitions(-DEIGEN_DONT_VECTORIZE) 32 | add_definitions(-DEIGEN_USE_THREADS) 33 | include_directories(${EIGEN3_INCLUDE_DIR}) 34 | 35 | add_subdirectory(llmc) 36 | add_subdirectory(llmcpp) 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Andrej Karpathy 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # llm.cpp 2 | 项目 fork 自 karpathy 的 [llm.c](https://github.com/karpathy/llm.c),使用 C++(with Eigen) 来复现 GPT-2,支持 CPU/CUDA 计算。 3 | - 所有的计算部分都通过 Eigen Tensor 完成,所以同样一份代码通过简单地切换 Device 就可完成 CPU/CUDA 的计算 4 | - 这里实现的 GPT-2 与 PyTorch 版本是完全对齐的 5 | - 值得注意的是,CPU 版本比 PyTorch 快大约 20%,但是 GPU 版本比 PyTorch GPU 慢得多,主要原因是 Eigen 的 Tensor 不支持 BatchMatmul 6 | 7 | 8 | This repo is forked from karpathy's [llm.c](https://github.com/karpathy/llm.c), using C++ (with Eigen) to reproduce GPT-2. 9 | 10 | - All calculations are done through the Eigen Tensor Module, so the same code can be used for CPU/CUDA calculations by simply switching the Device. 11 | - Currently, this repo has reproduced GPT-2 and the results are completely aligned with the PyTorch version. 12 | - It is worth noting that CPU calculations are about 20% faster than PyTorch, while GPU calculations are still far behind PyTorch on the GPU, mainly because the Eigen Tensor Module does not support BatchMatmul. 13 | 14 | ## quick start (CPU) 15 | 16 | ```bash 17 | pip install -r requirements.txt 18 | python dev/data/tinyshakespeare.py 19 | python train_gpt2.py 20 | mkdir build && cd build 21 | cmake .. 22 | make train_gpt2_cpu 23 | cd ../ 24 | ./build/llmcpp/train_gpt2_cpu 25 | ``` 26 | 27 | The above lines 28 | - (1) download the [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset, 29 | tokenize it with the GPT-2 Tokenizer 30 | - (2) download and save the GPT-2 (124M) weights 31 | - (3) init from them in C++ and train for 40 steps on tinyshakespeare with AdamW (using batch size 4, context length only 64), evaluate validation loss, and sample some text. The output looks like this on my LMDE3 (Intel® Core™ i7-10700K CPU @ 3.80GHz × 8): 32 | 33 | ``` 34 | [GPT-2] 35 | max_seq_len: 1024 36 | vocab_size: 50257 37 | padded_vocab_size: 50304 38 | num_layers: 12 39 | num_heads: 12 40 | channels: 768 41 | num_parameters: 124475904(474 MB) 42 | train dataset num_batches: 1192 43 | val dataset num_batches: 128 44 | num_activations: 82723584(315 MB) 45 | val loss 5.325413 46 | step 0: train loss 5.356086 (took 786.515755 ms) 47 | step 1: train loss 4.300581 (took 677.340087 ms) 48 | step 2: train loss 4.623053 (took 674.843167 ms) 49 | step 3: train loss 4.599307 (took 673.189660 ms) 50 | ... (truncated) ... 51 | step 39: train loss 3.972404 (took 749.386021 ms) 52 | val loss 4.017484 53 | generating: 54 | --- 55 | Requinetarius, 56 | Which; supreme, but 57 | Commands jest in vain for ever. 58 | 59 | <|endoftext|>Lady: 60 | No, heavens, 61 | I were not to haste 62 | To retire valorously and look nobly in the face, 63 | Before this 64 | UNHISILIUS UNDERDEINTS 65 | 66 | --- 67 | step 40: train loss 4.378605 (took 692.830391 ms) 68 | final 40 iters avg: 692.974 ms 69 | ``` 70 | 71 | ## quick start (1 GPU, fp32 only) 72 | ```bash 73 | mkdir build && cd build 74 | cmake .. 75 | make train_gpt2_gpu 76 | cd ../ 77 | ./build/llmcpp/train_gpt2_gpu 78 | ``` 79 | 80 | 81 | ## datasets 82 | 83 | The data files inside `/dev/data/(dataset).py` are responsible for downloading, tokenizing and saving the tokens to .bin files, easily readable from C. 
So for example when you run: 84 | 85 | ```bash 86 | python dev/data/tinyshakespeare.py 87 | ``` 88 | 89 | We download and tokenize the [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset. The output of this looks like this: 90 | 91 | ``` 92 | writing 32,768 tokens to ./dev/data/tinyshakespeare/tiny_shakespeare_val.bin 93 | writing 305,260 tokens to ./dev/data/tinyshakespeare/tiny_shakespeare_train.bin 94 | ``` 95 | 96 | The .bin files contain a short header (1024 bytes) and then a stream of tokens in uint16, indicating the token ids with the GPT-2 tokenizer. More datasets are available in `/dev/data`. 97 | 98 | ## test 99 | 100 | I am also attaching a simple unit test for making sure our C++ code agrees with the PyTorch code. On the CPU as an example, compile and run with: 101 | 102 | ```bash 103 | mkdir build && cd build 104 | cmake .. 105 | make test_gpt2_cpu 106 | cd ../ 107 | ./build/llmcpp/test_gpt2_cpu 108 | ``` 109 | 110 | This now loads the `gpt2_124M_debug_state.bin` file that gets written by train_gpt2.py, runs a forward pass, compares the logits and loss with the PyTorch reference implementation, then it does 10 iterations of training with Adam and makes sure the losses match PyTorch. 111 | This tests both the fp32 path and the mixed precision path. The test should pass and print `overall okay: 1`. 112 | 113 | 114 | ## license 115 | 116 | MIT 117 | -------------------------------------------------------------------------------- /dev/cuda/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for building dev/cuda kernels 2 | # Collects all the make commands in one file but each file also 3 | # has the compile and run commands in the header comments section. 4 | 5 | # Find nvcc (NVIDIA CUDA compiler) 6 | NVCC := $(shell which nvcc 2>/dev/null) 7 | ifeq ($(NVCC),) 8 | $(error nvcc not found.) 
9 | endif 10 | 11 | ifneq ($(CI),true) # if not in CI, then use the GPU query 12 | ifndef GPU_COMPUTE_CAPABILITY # set to defaults if: make GPU_COMPUTE_CAPABILITY= 13 | GPU_COMPUTE_CAPABILITY = $(shell __nvcc_device_query) # assume if NVCC is present, then this likely is too 14 | GPU_COMPUTE_CAPABILITY := $(strip $(GPU_COMPUTE_CAPABILITY)) 15 | endif 16 | endif 17 | 18 | # Compiler flags 19 | ifeq ($(GPU_COMPUTE_CAPABILITY),) # set to defaults if: make GPU_COMPUTE_CAPABILITY= 20 | CFLAGS = -O3 --use_fast_math 21 | else 22 | CFLAGS = -O3 --use_fast_math --generate-code arch=compute_$(GPU_COMPUTE_CAPABILITY),code=[compute_$(GPU_COMPUTE_CAPABILITY),sm_$(GPU_COMPUTE_CAPABILITY)] 23 | endif 24 | 25 | NVCCFLAGS = -lcublas -lcublasLt -std=c++17 26 | MPI_PATHS = -I/usr/lib/x86_64-linux-gnu/openmpi/include -L/usr/lib/x86_64-linux-gnu/openmpi/lib/ 27 | 28 | # Default rule for our CUDA files 29 | %: %.cu 30 | $(NVCC) $(CFLAGS) $(NVCCFLAGS) $< -o $@ 31 | 32 | # Build all targets 33 | TARGETS = adamw attention_backward attention_forward classifier_fused crossentropy_forward crossentropy_softmax_backward encoder_backward encoder_forward gelu_backward gelu_forward layernorm_backward layernorm_forward matmul_backward matmul_backward_bias matmul_forward nccl_all_reduce residual_forward softmax_forward trimat_forward fused_residual_forward global_norm permute 34 | 35 | all: $(TARGETS) 36 | all_ptx: $(TARGETS:%=%.ptx) 37 | all_sass: $(TARGETS:%=%.sass) 38 | 39 | # Individual targets: forward pass 40 | attention_forward: attention_forward.cu 41 | classifier_fused: classifier_fused.cu 42 | crossentropy_forward: crossentropy_forward.cu 43 | encoder_forward: encoder_forward.cu 44 | gelu_forward: gelu_forward.cu 45 | layernorm_forward: layernorm_forward.cu 46 | fused_residual_forward: fused_residual_forward.cu 47 | residual_forward: residual_forward.cu 48 | softmax_forward: softmax_forward.cu 49 | trimat_forward: trimat_forward.cu 50 | # matmul fwd/bwd also uses OpenMP (optionally) and cuBLASLt libs 51 | matmul_forward: matmul_forward.cu 52 | $(NVCC) $(CFLAGS) $(NVCCFLAGS) -Xcompiler -fopenmp matmul_forward.cu -o matmul_forward 53 | 54 | # Individual targets: backward pass 55 | attention_backward: attention_backward.cu 56 | crossentropy_softmax_backward: crossentropy_softmax_backward.cu 57 | encoder_backward: encoder_backward.cu 58 | gelu_backward: gelu_backward.cu 59 | layernorm_backward: layernorm_backward.cu 60 | matmul_backward_bias: matmul_backward_bias.cu 61 | matmul_backward: matmul_backward.cu 62 | $(NVCC) $(CFLAGS) $(NVCCFLAGS) -Xcompiler -fopenmp matmul_backward.cu -o matmul_backward 63 | 64 | # Update kernels 65 | adamw: adamw.cu 66 | global_norm: global_norm.cu 67 | 68 | permute: permute.cu 69 | 70 | # NCCL communication kernels 71 | nccl_all_reduce: nccl_all_reduce.cu 72 | $(NVCC) -lmpi -lnccl $(NVCCFLAGS) $(MPI_PATHS) nccl_all_reduce.cu -o nccl_all_reduce 73 | 74 | # Generate PTX using cuobjdump 75 | %.ptx: % 76 | cuobjdump --dump-ptx $< > $@ 77 | 78 | # Generate SASS using cuobjdump 79 | %.sass: % 80 | cuobjdump --dump-sass $< > $@ 81 | 82 | # Run all targets 83 | run_all: all 84 | @for target in $(TARGETS); do \ 85 | echo "\n========================================"; \ 86 | echo "Running $$target ..."; \ 87 | echo "========================================\n"; \ 88 | ./$$target; \ 89 | done 90 | 91 | # Clean up 92 | clean: 93 | rm -f $(TARGETS) *.ptx *.sass 94 | -------------------------------------------------------------------------------- /dev/cuda/README.md: 
-------------------------------------------------------------------------------- 1 | # dev/cuda 2 | 3 | This directory is scratch space for developing various versions of the needed CUDA kernels. Each file develops a kernel, and usually multiple versions of that kernel that differ in running time and in code or time complexity. 4 | 5 | See the top of each file for how to compile and run the kernel. Alternatively, the commands are also all grouped in the `Makefile` in this directory for convenience. 6 | 7 | For example, we can look at the top of `layernorm_forward.cu` to build the forward pass kernels for the LayerNorm: 8 | 9 | ```bash 10 | nvcc -O3 --use_fast_math -lcublas -lcublasLt layernorm_forward.cu -o layernorm_forward 11 | ``` 12 | 13 | or simply 14 | 15 | ```bash 16 | make layernorm_forward 17 | ``` 18 | 19 | The comments at the top then document the different versions of this kernel available; these are usually ordered by increasing complexity and decreasing running time. For example, after inspecting the comments at the top of the file, we can run the most naive kernel as: 20 | 21 | ```bash 22 | ./layernorm_forward 1 23 | ``` 24 | 25 | You'll see that this first forwards the reference code on the CPU, then it runs kernel 1 on the GPU, compares the results to check for correctness, and then runs a number of configurations of this kernel (most often and most notably the block size) to time the kernel in these launch configurations. We can then run one of the faster kernels (kernel 4) instead: 26 | 27 | ```bash 28 | ./layernorm_forward 4 29 | ``` 30 | 31 | You'll see that this matches all the CPU results but runs much, much faster. The typical process from here on is to copy-paste the kernel that ran fastest, adjust it manually (e.g. to hardcode the best block size) and drop it into the training code file, e.g. `train_gpt2.cu`. 32 | 33 | To add a new version of a kernel, add the kernel to the corresponding file and adjust the docs. To add a new kernel, add the new file and adjust the Makefile. Run `make clean` to clean up binaries from your directory. 34 | 35 | If you do not have a GPU or are having trouble with CUDA dependencies, you can run the benchmarks on the [Modal platform](http://modal.com). For example, to run the benchmark for the attention forward pass on an A100 GPU with 80GB of memory, you can run the following command: 36 | 37 | ```bash 38 | GPU_MEM=80 modal run benchmark_on_modal.py --compile-command "nvcc -O3 --use_fast_math attention_forward.cu -o attention_forward -lcublas" --run-command "./attention_forward 1" 39 | ``` 40 | -------------------------------------------------------------------------------- /dev/cuda/benchmark_on_modal.py: -------------------------------------------------------------------------------- 1 | """ 2 | Script for running benchmarks on the Modal platform. 3 | This is useful for folks who do not have access to expensive GPUs locally. 4 | Example usage for cuda kernels: 5 | GPU_MEM=80 modal run benchmark_on_modal.py \ 6 | --compile-command "nvcc -O3 --use_fast_math attention_forward.cu -o attention_forward -lcublas" \ 7 | --run-command "./attention_forward 1" 8 | OR if you want to use cuDNN etc. 
9 | 10 | 11 | For training the gpt2 model with cuDNN use: 12 | GPU_MEM=80 modal run dev/cuda/benchmark_on_modal.py \ 13 | --compile-command "make train_gpt2cu USE_CUDNN=1" 14 | --run-command "./train_gpt2cu -i dev/data/tinyshakespeare/tiny_shakespeare_train.bin -j dev/data/tinyshakespeare/tiny_shakespeare_val.bin -v 250 -s 250 -g 144 -f shakespeare.log -b 4" 15 | 16 | 17 | For profiling using nsight system: 18 | GPU_MEM=80 modal run dev/cuda/benchmark_on_modal.py \ 19 | --compile-command "make train_gpt2cu USE_CUDNN=1" \ 20 | --run-command "nsys profile --cuda-graph-trace=graph --python-backtrace=cuda --cuda-memory-usage=true \ 21 | ./train_gpt2cu -i dev/data/tinyshakespeare/tiny_shakespeare_train.bin \ 22 | -j dev/data/tinyshakespeare/tiny_shakespeare_val.bin -v 250 -s 250 -g 144 -f shakespeare.log -b 4" 23 | 24 | For more nsys profiling specifics and command options, take a look at: https://docs.nvidia.com/nsight-systems/2024.2/UserGuide/ 25 | -> To profile the report using a GUI, download NVIDIA NSight System GUI version (this software can run on all OS, so you download it locally) 26 | 27 | NOTE: Currently there is a bug in the profiling using nsight system which produces a unrecognized GPU UUId error on the command line but it 28 | does not actually interfere with the model training and validation. The report (that you download) is still generated and can be viewed from Nsight Systems 29 | """ 30 | import subprocess 31 | import os 32 | import sys 33 | import datetime 34 | 35 | import modal 36 | from modal import Image, Stub 37 | GPU_NAME_TO_MODAL_CLASS_MAP = { 38 | "H100": modal.gpu.H100, 39 | "A100": modal.gpu.A100, 40 | "A10G": modal.gpu.A10G, 41 | } 42 | N_GPUS = int(os.environ.get("N_GPUS", 1)) 43 | GPU_MEM = int(os.environ.get("GPU_MEM", 40)) 44 | GPU_NAME = os.environ.get("GPU_NAME", "A100") 45 | GPU_CONFIG = GPU_NAME_TO_MODAL_CLASS_MAP[GPU_NAME](count=N_GPUS, size=str(GPU_MEM) + 'GB') 46 | 47 | APP_NAME = "llm.c benchmark run" 48 | 49 | image = ( 50 | Image.from_registry("totallyvyom/cuda-env:latest-2") 51 | .pip_install("huggingface_hub==0.20.3", "hf-transfer==0.1.5") 52 | .env( 53 | dict( 54 | HUGGINGFACE_HUB_CACHE="/pretrained", 55 | HF_HUB_ENABLE_HF_TRANSFER="1", 56 | TQDM_DISABLE="true", 57 | ) 58 | ) 59 | .run_commands( 60 | "wget -q https://github.com/Kitware/CMake/releases/download/v3.28.1/cmake-3.28.1-Linux-x86_64.sh", 61 | "bash cmake-3.28.1-Linux-x86_64.sh --skip-license --prefix=/usr/local", 62 | "rm cmake-3.28.1-Linux-x86_64.sh", 63 | "ln -s /usr/local/bin/cmake /usr/bin/cmake",) 64 | .run_commands( 65 | "apt-get install -y --allow-change-held-packages libcudnn8 libcudnn8-dev", 66 | "apt-get install -y openmpi-bin openmpi-doc libopenmpi-dev kmod sudo", 67 | "git clone https://github.com/NVIDIA/cudnn-frontend.git /root/cudnn-frontend", 68 | "cd /root/cudnn-frontend && mkdir build && cd build && cmake .. 
&& make" 69 | ) 70 | .run_commands( 71 | "wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin && \ 72 | mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \ 73 | apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \ 74 | add-apt-repository \"deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /\" && \ 75 | apt-get update" 76 | ).run_commands( 77 | "apt-get install -y nsight-systems-2023.3.3" 78 | ) 79 | ) 80 | 81 | stub = modal.App(APP_NAME) 82 | 83 | def execute_command(command: str): 84 | command_args = command.split(" ") 85 | print(f"{command_args = }") 86 | subprocess.run(command_args, stdout=sys.stdout, stderr=subprocess.STDOUT) 87 | 88 | @stub.function( 89 | gpu=GPU_CONFIG, 90 | image=image, 91 | allow_concurrent_inputs=4, 92 | container_idle_timeout=900, 93 | mounts=[modal.Mount.from_local_dir("./", remote_path="/root/")], 94 | # Instead of 'cuda-env' put your volume name that you create from 'modal volume create {volume-name}' 95 | # This enables the profiling reports to be saved on the volume that you can download by using: 96 | # 'modal volume get {volume-name} {/output_file_name} 97 | # For example right now, when profiling using this command "nsys profile --trace=cuda,nvtx --cuda-graph-trace=graph --python-backtrace=cuda --cuda-memory-usage=true" you would get your report 98 | # using in a directory in your volume, where the name contains the timestamp unique id. 99 | # This script will generate a "report1_{timestamp} folder in volume" 100 | # and you can download it with 'modal volume get {volume-name} report1_{timestamp} 101 | volumes={"/cuda-env": modal.Volume.from_name("cuda-env")}, 102 | ) 103 | def run_benchmark(compile_command: str, run_command: str): 104 | execute_command("pwd") 105 | execute_command("ls") 106 | execute_command(compile_command) 107 | execute_command(run_command) 108 | # Use this section if you want to profile using nsight system and install the reports on your volume to be locally downloaded 109 | timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") 110 | 111 | execute_command("mkdir report1_" + timestamp) 112 | execute_command("mv /root/report1.nsys-rep /root/report1_" + timestamp + "/") 113 | execute_command("mv /root/report1.qdstrm /root/report1_" + timestamp + "/") 114 | execute_command("mv /root/report1_" + timestamp + "/" + " /cuda-env/") 115 | 116 | return None 117 | 118 | @stub.local_entrypoint() 119 | def inference_main(compile_command: str, run_command: str): 120 | results = run_benchmark.remote(compile_command, run_command) 121 | return results -------------------------------------------------------------------------------- /dev/cuda/crossentropy_forward.cu: -------------------------------------------------------------------------------- 1 | /* 2 | Kernels for crossentropy forward pass. 
3 | 4 | Compile example: 5 | nvcc -O3 --use_fast_math -lcublas -lcublasLt crossentropy_forward.cu -o crossentropy_forward 6 | 7 | version 1 is a straight-forward port from CPU code to kernel, parallel over B,T 8 | ./crossentropy_forward 1 9 | */ 10 | 11 | #include 12 | #include 13 | #include 14 | #include "common.h" 15 | 16 | // ---------------------------------------------------------------------------- 17 | // CPU code reference 18 | 19 | void crossentropy_forward_cpu(float* losses, 20 | const float* probs, const int* targets, 21 | int B, int T, int V) { 22 | // output: losses is (B,T) of the individual losses at each position 23 | // input: probs are (B,T,V) of the probabilities 24 | // input: targets is (B,T) of integers giving the correct index in logits 25 | for (int b = 0; b < B; b++) { 26 | for (int t = 0; t < T; t++) { 27 | // loss = -log(probs[target]) 28 | const float* probs_bt = probs + b * T * V + t * V; 29 | int ix = targets[b * T + t]; 30 | losses[b * T + t] = -logf(probs_bt[ix]); 31 | } 32 | } 33 | } 34 | 35 | // ---------------------------------------------------------------------------- 36 | // GPU kernels 37 | 38 | __global__ void crossentropy_forward_kernel1(float* losses, 39 | const float* probs, const int* targets, 40 | int B, int T, int V) { 41 | int i = blockIdx.x * blockDim.x + threadIdx.x; 42 | if (i < B * T) { 43 | int b = i / T; 44 | int t = i % T; 45 | const float* probs_bt = probs + b * T * V + t * V; 46 | int ix = targets[b * T + t]; 47 | losses[b * T + t] = -logf(probs_bt[ix]); 48 | } 49 | } 50 | 51 | // ---------------------------------------------------------------------------- 52 | // kernel launcher 53 | 54 | void crossentropy_forward1(float* losses, 55 | const float* probs, const int* targets, 56 | int B, int T, int V, 57 | const int block_size) { 58 | const int N = B * T; 59 | const int grid_size = ceil_div(N, block_size); 60 | crossentropy_forward_kernel1<<>>(losses, probs, targets, B, T, V); 61 | cudaCheck(cudaGetLastError()); 62 | } 63 | 64 | // kernel version dispatch 65 | void crossentropy_forward(int kernel_num, 66 | float* losses, 67 | const float* probs, const int* targets, 68 | int B, int T, int V, 69 | const int block_size) { 70 | switch (kernel_num) { 71 | case 1: 72 | crossentropy_forward1(losses, probs, targets, B, T, V, block_size); 73 | break; 74 | default: 75 | printf("Invalid kernel number\n"); 76 | exit(1); 77 | } 78 | } 79 | 80 | // ---------------------------------------------------------------------------- 81 | 82 | int main(int argc, char **argv) { 83 | srand(0); 84 | 85 | int B = 8; 86 | int T = 1024; 87 | int V = 50257; 88 | 89 | int deviceIdx = 0; 90 | cudaCheck(cudaSetDevice(deviceIdx)); 91 | 92 | // create host memory of random numbers 93 | float* out = (float*)malloc(B * T * sizeof(float)); 94 | float* probs = make_random_float_01(B * T * V); 95 | int* targets = make_random_int(B * T, V); 96 | 97 | // move to GPU 98 | float* d_out; 99 | float* d_probs; 100 | int* d_targets; 101 | cudaCheck(cudaMalloc(&d_out, B * T * sizeof(float))); 102 | cudaCheck(cudaMalloc(&d_probs, B * T * V * sizeof(float))); 103 | cudaCheck(cudaMalloc(&d_targets, B * T * sizeof(int))); 104 | cudaCheck(cudaMemcpy(d_probs, probs, B * T * V * sizeof(float), cudaMemcpyHostToDevice)); 105 | cudaCheck(cudaMemcpy(d_targets, targets, B * T * sizeof(int), cudaMemcpyHostToDevice)); 106 | 107 | // read kernel_num from command line 108 | int kernel_num = 1; 109 | if (argc > 1) { 110 | kernel_num = atoi(argv[1]); 111 | } 112 | printf("Using kernel %d\n", 
kernel_num); 113 | 114 | // first check the correctness of the kernel 115 | crossentropy_forward_cpu(out, probs, targets, B, T, V); 116 | // time the kernel at different block sizes 117 | int block_sizes[] = {32, 64, 128, 256, 512, 1024}; 118 | 119 | for (int j = 0; j < sizeof(block_sizes) / sizeof(int); j++) { 120 | int block_size = block_sizes[j]; 121 | printf("Checking block size %d.\n", block_size); 122 | crossentropy_forward(kernel_num, d_out, d_probs, d_targets, B, T, V, block_size); 123 | validate_result(d_out, out, "out", B * T, 1e-5f); 124 | } 125 | 126 | printf("All results match. Starting benchmarks.\n\n"); 127 | 128 | for (int j = 0; j < sizeof(block_sizes) / sizeof(int); j++) { 129 | int block_size = block_sizes[j]; 130 | 131 | int repeat_times = 1000; 132 | float elapsed_time = benchmark_kernel(repeat_times, crossentropy_forward, 133 | kernel_num, d_out, d_probs, d_targets, 134 | B, T, V, block_size); 135 | 136 | printf("block_size %4d | time %.4f ms | per token %.2f ns\n", block_size, elapsed_time, elapsed_time * 1'000'000 / (B*T)); 137 | } 138 | 139 | // free memory 140 | free(out); 141 | free(probs); 142 | free(targets); 143 | cudaCheck(cudaFree(d_out)); 144 | cudaCheck(cudaFree(d_probs)); 145 | cudaCheck(cudaFree(d_targets)); 146 | 147 | return 0; 148 | } -------------------------------------------------------------------------------- /dev/cuda/crossentropy_softmax_backward.cu: -------------------------------------------------------------------------------- 1 | /* 2 | Kernels for crossentropy forward pass. 3 | 4 | Compile example: 5 | nvcc -O3 --use_fast_math -lcublas -lcublasLt crossentropy_softmax_backward.cu -o crossentropy_softmax_backward 6 | 7 | version 1 is a straight-forward port from CPU code to kernel, parallel over B,T 8 | ./crossentropy_softmax_backward 1 9 | */ 10 | 11 | #include 12 | #include 13 | #include 14 | #include "common.h" 15 | 16 | // ---------------------------------------------------------------------------- 17 | // CPU code reference 18 | 19 | void crossentropy_softmax_backward_cpu(float* dlogits, 20 | const float* dlosses, const float* probs, const int* targets, 21 | int B, int T, int V) { 22 | // backwards through both softmax and crossentropy 23 | for (int b = 0; b < B; b++) { 24 | for (int t = 0; t < T; t++) { 25 | float* dlogits_bt = dlogits + b * T * V + t * V; 26 | const float* probs_bt = probs + b * T * V + t * V; 27 | float dloss = dlosses[b * T + t]; 28 | int ix = targets[b * T + t]; 29 | for (int i = 0; i < V; i++) { 30 | float p = probs_bt[i]; 31 | float indicator = i == ix ? 1.0f : 0.0f; 32 | dlogits_bt[i] += (p - indicator) * dloss; 33 | } 34 | } 35 | } 36 | } 37 | 38 | // ---------------------------------------------------------------------------- 39 | // GPU kernels 40 | 41 | // naive kernel that just parallelizes over B,T,V 42 | __global__ void crossentropy_softmax_backward_kernel1(float* dlogits, 43 | const float* dlosses, const float* probs, const int* targets, 44 | int B, int T, int V) { 45 | int i = blockIdx.x * blockDim.x + threadIdx.x; 46 | if (i < B * T * V) { 47 | int b = i / (T * V); 48 | int t = (i / V) % T; 49 | int v = i % V; 50 | float* dlogits_bt = dlogits + b * T * V + t * V; 51 | const float* probs_bt = probs + b * T * V + t * V; 52 | float dloss = dlosses[b * T + t]; 53 | int ix = targets[b * T + t]; 54 | float p = probs_bt[v]; 55 | float indicator = v == ix ? 
1.0f : 0.0f; 56 | dlogits_bt[v] += (p - indicator) * dloss; 57 | } 58 | } 59 | 60 | // ---------------------------------------------------------------------------- 61 | // kernel launcher 62 | 63 | void crossentropy_softmax_backward1(float* dlogits, 64 | const float* dlosses, const float* probs, const int* targets, 65 | int B, int T, int V, 66 | const int block_size) { 67 | const int N = B * T * V; 68 | const int grid_size = ceil_div(N, block_size); 69 | crossentropy_softmax_backward_kernel1<<>>(dlogits, dlosses, probs, targets, B, T, V); 70 | cudaCheck(cudaGetLastError()); 71 | } 72 | 73 | // kernel version dispatch 74 | void crossentropy_softmax_backward(int kernel_num, 75 | float* dlogits, 76 | const float* dlosses, const float* probs, const int* targets, 77 | int B, int T, int V, 78 | const int block_size) { 79 | switch (kernel_num) { 80 | case 1: 81 | crossentropy_softmax_backward1(dlogits, dlosses, probs, targets, B, T, V, block_size); 82 | break; 83 | default: 84 | printf("Invalid kernel number\n"); 85 | exit(1); 86 | } 87 | } 88 | 89 | // ---------------------------------------------------------------------------- 90 | 91 | int main(int argc, char **argv) { 92 | srand(0); 93 | 94 | int B = 8; 95 | int T = 1024; 96 | int V = 50257; 97 | 98 | int deviceIdx = 0; 99 | cudaCheck(cudaSetDevice(deviceIdx)); 100 | 101 | // create host memory of random numbers 102 | float* probs = make_random_float_01(B * T * V); 103 | int* targets = make_random_int(B * T, V); 104 | float* dlosses = make_random_float(B * T); 105 | float* dlogits = make_zeros_float(B * T * V); 106 | 107 | // move to GPU 108 | float* d_probs; 109 | int* d_targets; 110 | float* d_dlosses; 111 | float* d_dlogits; 112 | cudaCheck(cudaMalloc(&d_probs, B * T * V * sizeof(float))); 113 | cudaCheck(cudaMalloc(&d_targets, B * T * sizeof(int))); 114 | cudaCheck(cudaMalloc(&d_dlosses, B * T * sizeof(float))); 115 | cudaCheck(cudaMalloc(&d_dlogits, B * T * V * sizeof(float))); 116 | cudaCheck(cudaMemcpy(d_probs, probs, B * T * V * sizeof(float), cudaMemcpyHostToDevice)); 117 | cudaCheck(cudaMemcpy(d_targets, targets, B * T * sizeof(int), cudaMemcpyHostToDevice)); 118 | cudaCheck(cudaMemcpy(d_dlosses, dlosses, B * T * sizeof(float), cudaMemcpyHostToDevice)); 119 | 120 | // read kernel_num from command line 121 | int kernel_num = 1; 122 | if (argc > 1) { 123 | kernel_num = atoi(argv[1]); 124 | } 125 | printf("Using kernel %d\n", kernel_num); 126 | 127 | // first check the correctness of the kernel 128 | crossentropy_softmax_backward_cpu(dlogits, dlosses, probs, targets, B, T, V); 129 | 130 | // time the kernel at different block sizes 131 | int block_sizes[] = {32, 64, 128, 256, 512, 1024}; 132 | 133 | for (int j = 0; j < sizeof(block_sizes) / sizeof(int); j++) { 134 | int block_size = block_sizes[j]; 135 | cudaCheck(cudaMemset(d_dlogits, 0, B * T * V * sizeof(float))); 136 | printf("Checking block size %d.\n", block_size); 137 | crossentropy_softmax_backward(kernel_num, d_dlogits, d_dlosses, d_probs, d_targets, B, T, V, block_size); 138 | validate_result(d_dlogits, dlogits, "dlogits", B * T * V, 1e-5f); 139 | } 140 | 141 | printf("All results match. 
Starting benchmarks.\n\n"); 142 | 143 | for (int j = 0; j < sizeof(block_sizes) / sizeof(int); j++) { 144 | int block_size = block_sizes[j]; 145 | 146 | int repeat_times = 100; 147 | float elapsed_time = benchmark_kernel(repeat_times, crossentropy_softmax_backward, 148 | kernel_num, d_dlogits, d_dlosses, d_probs, d_targets, 149 | B, T, V, block_size); 150 | 151 | printf("block_size %4d | time %.4f ms | per token %.2f µs\n", block_size, elapsed_time, elapsed_time * 1'000 / (B*T)); 152 | } 153 | 154 | // free memory 155 | free(probs); 156 | free(targets); 157 | free(dlosses); 158 | free(dlogits); 159 | cudaCheck(cudaFree(d_probs)); 160 | cudaCheck(cudaFree(d_targets)); 161 | cudaCheck(cudaFree(d_dlosses)); 162 | cudaCheck(cudaFree(d_dlogits)); 163 | 164 | return 0; 165 | } -------------------------------------------------------------------------------- /dev/cuda/gelu_forward.cu: -------------------------------------------------------------------------------- 1 | /* 2 | Kernels for gelu forward pass. 3 | 4 | Compile example: 5 | nvcc -O3 --use_fast_math -lcublas -lcublasLt gelu_forward.cu -o gelu_forward 6 | 7 | If encountering "error: identifier "M_PI" is undefined", add the following lines to the top of the file: 8 | 9 | #define _USE_MATH_DEFINES 10 | #include OR #include 11 | 12 | version 1 is naive CPU port 13 | ./gelu_forward 1 14 | 15 | version 2 is bfloat16 with the Packed128 data structure 16 | ./gelu_forward 2 17 | */ 18 | 19 | #include 20 | #include 21 | #include 22 | 23 | #define ENABLE_BF16 24 | #include "common.h" 25 | 26 | // ---------------------------------------------------------------------------- 27 | // CPU code reference 28 | 29 | #define GELU_SCALING_FACTOR sqrtf(2.0f / M_PI) 30 | 31 | void gelu_forward_cpu(float* out, const float* inp, int N) { 32 | for (int i = 0; i < N; i++) { 33 | float x = inp[i]; 34 | float cube = 0.044715f * x * x * x; 35 | out[i] = 0.5f * x * (1.0f + tanhf(GELU_SCALING_FACTOR * (x + cube))); 36 | } 37 | } 38 | 39 | // ---------------------------------------------------------------------------- 40 | // GPU kernels 41 | 42 | // elementwise ops are nice and ez 43 | __global__ void gelu_forward_kernel1(floatX* out, const floatX* inp, int N) { 44 | int i = blockIdx.x * blockDim.x + threadIdx.x; 45 | if (i < N) { 46 | float xi = inp[i]; 47 | float cube = 0.044715f * xi * xi * xi; 48 | out[i] = 0.5f * xi * (1.0f + tanhf(GELU_SCALING_FACTOR * (xi + cube))); 49 | } 50 | } 51 | 52 | // elementwise ops are nice and ez 53 | __global__ void gelu_forward_kernel2(floatX* out, const floatX* inp, int N) { 54 | int i = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size; 55 | if (i < N) { 56 | x128 packed_out; 57 | x128 packed_inp = load128cs(inp + i); // load and do not keep in cache 58 | for(int k = 0; k < packed_inp.size; ++k) { 59 | float xi = (float)packed_inp[k]; 60 | float cube = 0.044715f * xi * xi * xi; 61 | packed_out[k] = (floatX)(0.5f * xi * (1.0f + tanhf(GELU_SCALING_FACTOR * (xi + cube)))); 62 | } 63 | // store instead of storecs (without cache streaming) in case it is useful for the 64 | // data to be in the cache for the next operation after this GeLU 65 | store128(out + i, packed_out); 66 | } 67 | } 68 | 69 | // ---------------------------------------------------------------------------- 70 | // kernel launcher 71 | 72 | void gelu_forward1(floatX* out, const floatX* inp, int N, const int block_size) { 73 | const int grid_size = ceil_div(N, block_size); 74 | gelu_forward_kernel1<<>>(out, inp, N); 75 | cudaCheck(cudaGetLastError()); 76 | } 77 | 78 
| void gelu_forward2(floatX* out, const floatX* inp, int N, const int block_size) { 79 | const int grid_size = ceil_div(N, block_size * x128::size); 80 | gelu_forward_kernel2<<>>(out, inp, N); 81 | cudaCheck(cudaGetLastError()); 82 | } 83 | 84 | // kernel version dispatch 85 | void gelu_forward(int kernel_num, 86 | floatX* out, 87 | const floatX* inp, 88 | int B, int T, int C, 89 | int block_size) { 90 | switch (kernel_num) { 91 | case 1: 92 | gelu_forward1(out, inp, B * T * C, block_size); 93 | break; 94 | case 2: 95 | gelu_forward2(out, inp, B * T * C, block_size); 96 | break; 97 | default: 98 | printf("Invalid kernel number\n"); 99 | exit(1); 100 | } 101 | } 102 | 103 | // ---------------------------------------------------------------------------- 104 | 105 | int main(int argc, const char **argv) { 106 | setup_main(); 107 | 108 | int B = 8; 109 | int T = 1024; 110 | int C = 768; 111 | 112 | // create host memory of random numbers 113 | float* out = (float*)malloc(B * T * C * sizeof(float)); 114 | float* inp = make_random_float(B * T * C); 115 | 116 | // read kernel_num from command line 117 | int kernel_num = 1; 118 | if (argc > 1) { 119 | kernel_num = atoi(argv[1]); 120 | } 121 | printf("Using kernel %d\n", kernel_num); 122 | 123 | // first check the correctness of the kernel 124 | gelu_forward_cpu(out, inp, B * T * C); 125 | 126 | // move to GPU 127 | floatX* d_out; 128 | floatX* d_inp; 129 | cudaCheck(cudaMalloc(&d_out, B * T * C * sizeof(floatX))); 130 | cudaCheck(cudaMalloc(&d_inp, B * T * C * sizeof(floatX))); 131 | cudaCheck(memcpy_convert(d_inp, inp, B * T * C)); 132 | 133 | // time the kernel at different block sizes 134 | int block_sizes[] = {32, 64, 128, 256, 512, 1024}; 135 | for (int j = 0; j < sizeof(block_sizes) / sizeof(int); j++) { 136 | int block_size = block_sizes[j]; 137 | printf("Checking block size %d.\n", block_size); 138 | gelu_forward(kernel_num, d_out, d_inp, B, T, C, block_size); 139 | #if !defined(ENABLE_BF16) && !defined(ENABLE_FP16) 140 | float tol = 1e-5; 141 | #else 142 | float tol = 1e-2f; 143 | #endif 144 | validate_result(d_out, out, "out", B * T * C, tol); 145 | } 146 | 147 | printf("All results match. Starting benchmarks.\n\n"); 148 | 149 | for (int j = 0; j < sizeof(block_sizes) / sizeof(int); j++) { 150 | int block_size = block_sizes[j]; 151 | 152 | int repeat_times = 1000; 153 | 154 | float elapsed_time = benchmark_kernel(repeat_times, gelu_forward, 155 | kernel_num, d_out, d_inp, 156 | B, T, C, block_size); 157 | 158 | // napkin math: estimate the memory bandwidth achieved 159 | // for each (B,T,C) output element, we do 1 read and 1 write, 4 bytes each 160 | // and e.g. A100 40GB PCIe is advertised at 1,555GB/s 161 | long memory_ops = B * T * C * 2 * (int)sizeof(floatX); 162 | float memory_bandwidth = memory_ops / elapsed_time / 1e6; 163 | 164 | printf("block_size %4d | time %.4f ms | bandwidth %.2f GB/s\n", block_size, elapsed_time, memory_bandwidth); 165 | } 166 | 167 | // free memory 168 | free(out); 169 | free(inp); 170 | 171 | cudaCheck(cudaFree(d_out)); 172 | cudaCheck(cudaFree(d_inp)); 173 | return 0; 174 | } -------------------------------------------------------------------------------- /dev/cuda/residual_forward.cu: -------------------------------------------------------------------------------- 1 | /* 2 | Kernels for residual forward pass. 
3 | 4 | Compile example: 5 | nvcc -O3 --use_fast_math -lcublas -lcublasLt residual_forward.cu -o residual_forward 6 | 7 | version 1 is naive port from CPU code to kernel 8 | ./residual_forward 1 9 | version 2 packs input into 128 bit memory reads 10 | ./residual_forward 2 11 | */ 12 | 13 | #include 14 | #include 15 | #include 16 | 17 | #define ENABLE_BF16 18 | #include "common.h" 19 | 20 | // ---------------------------------------------------------------------------- 21 | // CPU code reference lol 22 | 23 | void residual_forward_cpu(float* out, const float* inp1, const float* inp2, int N) { 24 | for (int i = 0; i < N; i++) { 25 | out[i] = inp1[i] + inp2[i]; 26 | } 27 | } 28 | 29 | // ---------------------------------------------------------------------------- 30 | // GPU kernels 31 | 32 | // elementwise ops are nice and ez 33 | __global__ void residual_forward_kernel1(floatX* out, const floatX* inp1, const floatX* inp2, int N) { 34 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 35 | if (idx < N) { 36 | out[idx] = (floatX)((float)inp1[idx] + (float)inp2[idx]); 37 | } 38 | } 39 | 40 | __global__ void residual_forward_kernel2(floatX* out, const floatX* inp1, const floatX* inp2, int N) { 41 | int idx = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size; 42 | if (idx < N) { 43 | x128 packed_out; 44 | x128 packed_inp1 = load128cs(inp1 + idx); 45 | x128 packed_inp2 = load128cs(inp2 + idx); 46 | for (int k = 0; k < packed_inp1.size; ++k) 47 | { 48 | packed_out[k] = (floatX)((float)packed_inp1[k] + (float)packed_inp2[k]); 49 | } 50 | store128(out + idx, packed_out); 51 | } 52 | } 53 | 54 | // ---------------------------------------------------------------------------- 55 | // kernel launcher 56 | 57 | void residual_forward1(floatX* out, const floatX* inp1, const floatX* inp2, int N, const int block_size) { 58 | const int grid_size = ceil_div(N, block_size); 59 | residual_forward_kernel1<<>>(out, inp1, inp2, N); 60 | cudaCheck(cudaGetLastError()); 61 | } 62 | 63 | void residual_forward2(floatX* out, const floatX* inp1, const floatX* inp2, int N, const int block_size) { 64 | const int grid_size = ceil_div(N, (int)(block_size * x128::size)); 65 | residual_forward_kernel2<<>>(out, inp1, inp2, N); 66 | cudaCheck(cudaGetLastError()); 67 | } 68 | 69 | // kernel version dispatch 70 | void residual_forward(int kernel_num, 71 | floatX* out, 72 | const floatX* inp1, 73 | const floatX* inp2, 74 | int N, 75 | int block_size) { 76 | switch (kernel_num) { 77 | case 1: 78 | residual_forward1(out, inp1, inp2, N, block_size); 79 | break; 80 | case 2: 81 | residual_forward2(out, inp1, inp2, N, block_size); 82 | break; 83 | default: 84 | printf("Invalid kernel number\n"); 85 | exit(1); 86 | } 87 | } 88 | 89 | // ---------------------------------------------------------------------------- 90 | 91 | int main(int argc, char **argv) { 92 | setup_main(); 93 | 94 | int B = 8; 95 | int T = 1024; 96 | int C = 768; 97 | 98 | // create host memory of random numbers 99 | float* out = (float*)malloc(B * T * C * sizeof(float)); 100 | float* inp1 = make_random_float(B * T * C); 101 | float* inp2 = make_random_float(B * T * C); 102 | 103 | // move to GPU 104 | floatX* d_out; 105 | floatX* d_inp1; 106 | floatX* d_inp2; 107 | cudaCheck(cudaMalloc(&d_out, B * T * C * sizeof(floatX))); 108 | cudaCheck(cudaMalloc(&d_inp1, B * T * C * sizeof(floatX))); 109 | cudaCheck(cudaMalloc(&d_inp2, B * T * C * sizeof(floatX))); 110 | cudaCheck(memcpy_convert(d_inp1, inp1, B * T * C)); 111 | cudaCheck(memcpy_convert(d_inp2, inp2, B * T * C)); 
112 | 113 | // read kernel_num from command line 114 | int kernel_num = 1; 115 | if (argc > 1) { 116 | kernel_num = atoi(argv[1]); 117 | } 118 | printf("Using kernel %d\n", kernel_num); 119 | 120 | // first check the correctness of the kernel 121 | residual_forward_cpu(out, inp1, inp2, B * T * C); 122 | 123 | 124 | // time the kernel at different block sizes 125 | int block_sizes[] = {32, 64, 128, 256, 512, 1024}; 126 | 127 | for (int j = 0; j < sizeof(block_sizes) / sizeof(int); j++) { 128 | int block_size = block_sizes[j]; 129 | printf("Checking block size %d.\n", block_size); 130 | residual_forward(kernel_num, d_out, d_inp1, d_inp2, B * T * C, block_size); 131 | #if !defined(ENABLE_BF16) && !defined(ENABLE_FP16) 132 | float tol = 1e-5; 133 | #else 134 | float tol = 1e-2f; 135 | #endif 136 | validate_result(d_out, out, "out", B * T * C, tol); 137 | } 138 | 139 | printf("All results match. Starting benchmarks.\n\n"); 140 | 141 | for (int j = 0; j < sizeof(block_sizes) / sizeof(int); j++) { 142 | int block_size = block_sizes[j]; 143 | 144 | int repeat_times = 1000; 145 | float elapsed_time = benchmark_kernel(repeat_times, residual_forward, 146 | kernel_num, d_out, d_inp1, d_inp2, B * T * C, block_size 147 | ); 148 | 149 | // napkin math: estimate the memory bandwidth achieved 150 | // for each (B,T,C) output element, we do 2 read and 1 write, 4 bytes each 151 | // and e.g. A100 40GB PCIe is advertised at 1,555GB/s 152 | long memory_ops = B * T * C * 3 * 4; 153 | float memory_bandwidth = memory_ops / elapsed_time / 1e6; 154 | 155 | printf("block_size %4d | time %.4f ms | bandwidth %.2f GB/s\n", block_size, elapsed_time, memory_bandwidth); 156 | } 157 | 158 | // free memory 159 | free(out); 160 | free(inp1); 161 | free(inp2); 162 | cudaCheck(cudaFree(d_out)); 163 | cudaCheck(cudaFree(d_inp1)); 164 | cudaCheck(cudaFree(d_inp2)); 165 | 166 | return 0; 167 | } 168 | -------------------------------------------------------------------------------- /dev/data/README.md: -------------------------------------------------------------------------------- 1 | # dev/data organization 2 | 3 | The idea is that each dataset has a .py file here in the root of `dev/data`, and each dataset then creates a directory here, and writes and caches anything inside that directory. So for example: 4 | 5 | - running `python tinystories.py` will create a directory `tinystories` with its .bin files inside it 6 | - running `python tinyshakespeare.py` will create a directory `tinyshakespeare` with its .bin files inside it 7 | 8 | And so on. This way we can nicely organize multiple datasets here, share common utilities between them, and then point the .py/.c code in the root of the project accordingly to these. 
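As a quick reference for what these .bin token files look like on disk (mirroring `write_datafile` in `data_common.py` below): a header of 256 int32s whose first three entries are a magic number (20240520), a version (1), and the token count, followed by the tokens themselves as uint16. A minimal reader sketch in Python — the helper name `read_datafile` is chosen here purely for illustration and is not part of the repo — could look like:

```python
import numpy as np

def read_datafile(filename):
    # read back a token .bin file written by write_datafile (sketch)
    with open(filename, "rb") as f:
        header = np.frombuffer(f.read(256 * 4), dtype=np.int32)  # 1024-byte header
        assert header[0] == 20240520, "magic number mismatch"
        assert header[1] == 1, "unsupported version"
        num_tokens = int(header[2])
        tokens = np.frombuffer(f.read(num_tokens * 2), dtype=np.uint16)
    return tokens
```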
9 | -------------------------------------------------------------------------------- /dev/data/data_common.py: -------------------------------------------------------------------------------- 1 | """ 2 | Common utilities for the datasets 3 | """ 4 | 5 | import requests 6 | from tqdm import tqdm 7 | import numpy as np 8 | 9 | 10 | def download_file(url: str, fname: str, chunk_size=1024): 11 | """Helper function to download a file from a given url""" 12 | resp = requests.get(url, stream=True) 13 | total = int(resp.headers.get("content-length", 0)) 14 | with open(fname, "wb") as file, tqdm( 15 | desc=fname, 16 | total=total, 17 | unit="iB", 18 | unit_scale=True, 19 | unit_divisor=1024, 20 | ) as bar: 21 | for data in resp.iter_content(chunk_size=chunk_size): 22 | size = file.write(data) 23 | bar.update(size) 24 | 25 | 26 | def write_datafile(filename, toks): 27 | """ 28 | Saves token data as a .bin file, for reading in C. 29 | - First comes a header with 256 int32s 30 | - The tokens follow, each as a uint16 31 | """ 32 | assert len(toks) < 2**31, "token count too large" # ~2.1B tokens 33 | # construct the header 34 | header = np.zeros(256, dtype=np.int32) 35 | header[0] = 20240520 # magic 36 | header[1] = 1 # version 37 | header[2] = len(toks) # number of tokens after the 256*4 bytes of header (each 2 bytes as uint16) 38 | # construct the tokens numpy array, if not already 39 | if not isinstance(toks, np.ndarray) or not toks.dtype == np.uint16: 40 | # validate that no token exceeds a uint16 41 | maxtok = 2**16 42 | assert all(0 <= t < maxtok for t in toks), "token dictionary too large for uint16" 43 | toks_np = np.array(toks, dtype=np.uint16) 44 | else: 45 | toks_np = toks 46 | # write to file 47 | print(f"writing {len(toks):,} tokens to {filename}") 48 | with open(filename, "wb") as f: 49 | f.write(header.tobytes()) 50 | f.write(toks_np.tobytes()) 51 | 52 | def write_evalfile(filename, datas): 53 | """ 54 | Saves eval data as a .bin file, for reading in C. 55 | Used for multiple-choice style evals, e.g. HellaSwag and MMLU 56 | - First comes a header with 256 int32s 57 | - The examples follow, each example is a stream of uint16_t: 58 | - delimiter of 2**16-1, i.e. 65,535 59 | - , bytes encoding this example, allowing efficient skip to next 60 | - , the index of the example in the dataset 61 | -