├── requirements.txt ├── .gitmodules ├── dev ├── data │ ├── README.md │ ├── edu_fineweb.sh │ ├── fineweb.sh │ ├── tinyshakespeare.py │ ├── tinystories.py │ ├── data_common.py │ ├── fineweb.py │ └── mmlu.py ├── eval │ ├── summarize_eval.py │ ├── README.md │ └── run_eval.sh ├── test │ ├── test_outlier_detector.c │ ├── device_file_io.cu │ └── Makefile ├── download_starter_pack.sh ├── cuda │ ├── README.md │ ├── Makefile │ ├── crossentropy_forward.cu │ ├── residual_forward.cu │ ├── benchmark_on_modal.py │ ├── gelu_forward.cu │ └── crossentropy_softmax_backward.cu ├── loss_checker_ci.py ├── unistd.h └── vislog.ipynb ├── .gitignore ├── llmcpp ├── cuda_profile_util.hpp ├── test_eigen_cpu.cpp ├── test_eigen_gpu.cu ├── tensor_util.hpp ├── gpt_optim.cpp ├── optim.hpp ├── CMakeLists.txt ├── gpt_optim.cu ├── tensor_types.hpp ├── optim_test.cpp ├── README.md └── gpt2.hpp ├── llmc ├── CMakeLists.txt ├── cudnn_att.h ├── sampler.h ├── cublas_common.h ├── logger.h ├── outlier_detector.h ├── gelu.cuh ├── global_norm.cuh ├── tokenizer.h ├── schedulers.h └── adamw.cuh ├── scripts ├── pyrun_gpt2_124M.sh ├── run_gpt2_124M.sh ├── run_gpt2_1558M.sh ├── run_gpt3_125M.sh ├── run_gpt2_350M.sh ├── run_gpt2_774M.sh ├── multi_node │ ├── run_gpt2_124M_mpi.sh │ ├── run_gpt2_124M_fs.sbatch │ └── run_gpt2_124M_tcp.sbatch └── README.md ├── CMakeLists.txt ├── LICENSE ├── doc └── layernorm │ ├── layernorm.py │ └── layernorm.c ├── profile_gpt2.cu ├── .github └── workflows │ ├── ci_tests.yml │ └── ci_gpu.yml └── README.md /requirements.txt: -------------------------------------------------------------------------------- 1 | tqdm 2 | numpy<2 3 | torch 4 | tiktoken 5 | transformers 6 | datasets 7 | requests 8 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third_party/abseil-cpp"] 2 | path = third_party/abseil-cpp 3 | url = https://github.com/abseil/abseil-cpp.git 4 | [submodule "third_party/eigen"] 5 | path = third_party/eigen 6 | url = https://gitlab.com/libeigen/eigen.git 7 | [submodule "third_party/googletest"] 8 | path = third_party/googletest 9 | url = https://github.com/google/googletest.git 10 | -------------------------------------------------------------------------------- /dev/data/README.md: -------------------------------------------------------------------------------- 1 | # dev/data organization 2 | 3 | The idea is that each dataset has a .py file here in the root of `dev/data`, and each dataset then creates a directory here, and writes and caches anything inside that directory. So for example: 4 | 5 | - running `python tinystories.py` will create a directory `tinystories` with its .bin files inside it 6 | - running `python tinyshakespeare.py` will create a directory `tinyshakespeare` with its .bin files inside it 7 | 8 | And so on. This way we can nicely organize multiple datasets here, share common utilities between them, and then point the .py/.c code in the root of the project accordingly to these. 
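A rough sketch of that pattern is below (the `mydataset` name and URL are hypothetical placeholders; `tinyshakespeare.py` is a real instance, and `download_file` / `write_datafile` are the shared helpers in `data_common.py`):

```python
# minimal sketch of a dev/data dataset script; "mydataset" and the URL are
# hypothetical placeholders -- see tinyshakespeare.py for a real, working instance
import os
import tiktoken
from data_common import download_file, write_datafile  # shared helpers in this directory

DATA_CACHE_DIR = os.path.join(os.path.dirname(__file__), "mydataset")

if __name__ == "__main__":
    os.makedirs(DATA_CACHE_DIR, exist_ok=True)
    # 1) download the raw text into this dataset's own directory
    txt_path = os.path.join(DATA_CACHE_DIR, "mydataset.txt")
    download_file("https://example.com/mydataset.txt", txt_path)
    # 2) tokenize with the GPT-2 tokenizer
    enc = tiktoken.get_encoding("gpt2")
    tokens = enc.encode_ordinary(open(txt_path, "r", encoding="utf-8").read())
    # 3) write the tokens as a .bin file, again inside the dataset's directory
    write_datafile(os.path.join(DATA_CACHE_DIR, "mydataset_train.bin"), tokens)
```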
9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # dot files and such 2 | .vscode 3 | .venv 4 | 5 | # .bin files generated by Python 6 | *.bin 7 | 8 | # data directories 9 | dev/data/__pycache__/ 10 | dev/data/fineweb10B/ 11 | dev/data/hellaswag/ 12 | dev/data/mmlu/ 13 | dev/data/tinyshakespeare/ 14 | dev/data/tinystories/ 15 | 16 | # binaries 17 | test_gpt2 18 | test_gpt2cu 19 | test_gpt2fp32cu 20 | train_gpt2 21 | train_gpt2cu 22 | train_gpt2fp32cu 23 | profile_gpt2cu 24 | dev/cuda/*_forward 25 | dev/cuda/*_backward 26 | dev/cuda/classifier_fused 27 | dev/cuda/adamw 28 | dev/cuda/matmul_backward_bias 29 | dev/cuda/nccl_all_reduce 30 | dev/cuda/global_norm 31 | *.obj 32 | *.exe 33 | *.o 34 | 35 | # log files 36 | *.log 37 | 38 | # clion files 39 | .idea 40 | cmake-build-* 41 | build 42 | -------------------------------------------------------------------------------- /llmcpp/cuda_profile_util.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LLM_CPP_LLMCPP_CUDA_PROFILE_UTIL_HPP_ 2 | #define LLM_CPP_LLMCPP_CUDA_PROFILE_UTIL_HPP_ 3 | 4 | #include 5 | #include 6 | 7 | // Profiler utils 8 | class NvtxRange { 9 | public: 10 | NvtxRange(const char* s) { nvtxRangePush(s); } 11 | NvtxRange(const char* prefix, const char* s) { 12 | std::string message = std::string(prefix) + "::" + std::string(s); 13 | nvtxRangePush(message.c_str()); 14 | } 15 | NvtxRange(const std::string& base_str, int number) { 16 | std::string range_string = base_str + " " + std::to_string(number); 17 | nvtxRangePush(range_string.c_str()); 18 | } 19 | ~NvtxRange() { nvtxRangePop(); } 20 | }; 21 | #define NVTX_RANGE_FN(prefix) NvtxRange nvtx_range(prefix, __FUNCTION__) 22 | 23 | #endif // LLM_CPP_LLMCPP_CUDA_PROFILE_UTIL_HPP_ 24 | -------------------------------------------------------------------------------- /llmc/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | # OpenMP 3 | find_package(OpenMP) 4 | if (OpenMP_FOUND) 5 | add_compile_definitions(OMP) 6 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fopenmp") 7 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") 8 | endif () 9 | 10 | # llm.c 11 | add_executable(train_gpt2 ../train_gpt2.c) 12 | target_link_libraries(train_gpt2 m ${OpenMP_CXX_LIBRARIES}) 13 | 14 | add_executable(test_gpt2 ../test_gpt2.c) 15 | target_link_libraries(test_gpt2 m ${OpenMP_CXX_LIBRARIES}) 16 | 17 | if (CUDA_FOUND) 18 | add_compile_definitions(ENABLE_FP32) 19 | add_executable(train_gpt2cu train_gpt2.cu) 20 | set_target_properties(train_gpt2cu PROPERTIES 21 | CUDA_SEPARABLE_COMPILATION ON 22 | CUDA_ARCHITECTURES "61;70;75" 23 | ) 24 | target_link_libraries(train_gpt2cu ${CUDA_LIBRARIES} cublas cublasLt) 25 | endif () 26 | -------------------------------------------------------------------------------- /llmc/cudnn_att.h: -------------------------------------------------------------------------------- 1 | /* 2 | cuDNN (flash) attention 3 | */ 4 | #ifndef CUDNN_ATT_H 5 | #define CUDNN_ATT_H 6 | 7 | #include "cuda_common.h" 8 | 9 | // forward declarations of functions defined in cudnn_att.cpp 10 | void create_cudnn(); 11 | void destroy_cudnn(); 12 | void attention_forward_cudnn(floatX* out, // output: (B, T, NH, HS) 13 | float* stats, // output for backward pass: (B, NH, T) 14 | floatX* inp, // input: (B, T, 3, NH, HS) QKV 15 | int B, int T, int NH, int C, cudaStream_t stream); 16 
| 17 | void attention_backward_cudnn(floatX* dqkvr, // output 18 | floatX* dout, floatX* qkvr, floatX* o, float* stats, // inputs 19 | int B, int T, int NH, int C, cudaStream_t stream); 20 | 21 | #endif // CUDNN_ATT_H -------------------------------------------------------------------------------- /scripts/pyrun_gpt2_124M.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # the same as scripts/run_gpt2_124M.sh but with PyTorch 4 | 5 | # if you wish to train on just a single GPU, simply skip the torchrun part, i.e. 6 | # python train_gpt2.py ... (all the other arguments the same) 7 | torchrun --standalone --nproc_per_node=8 train_gpt2.py \ 8 | --input_bin "dev/data/fineweb10B/fineweb_train_*.bin" \ 9 | --input_val_bin "dev/data/fineweb10B/fineweb_val_*.bin" \ 10 | --val_loss_every 250 \ 11 | --sample_every 0 \ 12 | --output_dir pylog_gpt2_124M \ 13 | --write_tensors 0 \ 14 | --model d12 \ 15 | --batch_size 32 \ 16 | --sequence_length 1024 \ 17 | --total_batch_size 524288 \ 18 | --dtype bfloat16 \ 19 | --compile 1 \ 20 | --tensorcores 1 \ 21 | --flash 1 \ 22 | --num_iterations 18865 \ 23 | --weight_decay 0.1 \ 24 | --zero_stage 1 \ 25 | --learning_rate 0.0006 \ 26 | --warmup_iters 700 \ 27 | --learning_rate_decay_frac 0.0 \ 28 | --overfit_single_batch 0 29 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.14) 2 | project(llm.cpp LANGUAGES C CXX CUDA) 3 | 4 | set(CMAKE_CXX_STANDARD 17) 5 | set(CMAKE_CUDA_STANDARD 17) 6 | set(BUILD_SHARED_LIBS OFF) 7 | # add_compile_options(-Ofast -march=native) 8 | # set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Ofast -march=native") 9 | # set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Ofast -march=native") 10 | 11 | find_program(CCACHE_PROGRAM ccache) 12 | if (CCACHE_PROGRAM) 13 | set(CMAKE_C_COMPILER_LAUNCHER ccache) 14 | set(CMAKE_CXX_COMPILER_LAUNCHER ccache) 15 | set(CMAKE_CUDA_COMPILER_LAUNCHER ccache) 16 | endif () 17 | 18 | enable_testing() 19 | include_directories(.) 20 | 21 | # Abseil 22 | set(ABSL_PROPAGATE_CXX_STD ON) 23 | add_subdirectory(third_party/abseil-cpp) 24 | 25 | # GoogleTest 26 | add_subdirectory(third_party/googletest) 27 | 28 | # Eigen 29 | set(EIGEN3_INCLUDE_DIR third_party/eigen) 30 | add_definitions(-DEIGEN_DONT_PARALLELIZE) 31 | #add_definitions(-DEIGEN_DONT_VECTORIZE) 32 | add_definitions(-DEIGEN_USE_THREADS) 33 | include_directories(${EIGEN3_INCLUDE_DIR}) 34 | 35 | add_subdirectory(llmc) 36 | add_subdirectory(llmcpp) 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Andrej Karpathy 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /dev/eval/summarize_eval.py: -------------------------------------------------------------------------------- 1 | # example run command 2 | # python dev/eval/summarize_eval.py lm-evaluation-harness/results/result774M 3 | # this script is optional, the run_eval.sh should already print these 4 | # but this script can be used to re-print them 5 | 6 | import json, sys 7 | 8 | RESULT = sys.argv[1] 9 | print("-"*40) 10 | 11 | key = {"arc_challenge_25shot.json": "acc_norm", 12 | "gsm8k_5shot.json": "acc", 13 | "hellaswag_10shot.json": "acc_norm", 14 | "mmlu_5shot.json": "acc", 15 | "truthfulqa_0shot.json": "mc2", 16 | "winogrande_5shot.json": "acc" 17 | } 18 | 19 | total = 0 20 | for test in ["arc_challenge_25shot.json", "gsm8k_5shot.json", "hellaswag_10shot.json", "mmlu_5shot.json", "truthfulqa_0shot.json", "winogrande_5shot.json"]: 21 | data = json.loads(open("./%s/%s"%(RESULT, test)).read()) 22 | r_count = 0 23 | r_total = 0 24 | for test_name in data['results']: 25 | r_count += 1 26 | r_total += data['results'][test_name][key[test]] 27 | score = (r_total*100)/r_count 28 | print(f"{test:<30} : {score:.4f}") 29 | total += score 30 | average = total / 6.0 31 | print("-"*40) 32 | print(f"Average Score : {average:.4f}") 33 | -------------------------------------------------------------------------------- /llmc/sampler.h: -------------------------------------------------------------------------------- 1 | /* 2 | Implements a simple Sampler, used during model inference to sample tokens. 3 | */ 4 | #ifndef SAMPLER_H 5 | #define SAMPLER_H 6 | 7 | #include 8 | 9 | // Simple xorshift RNG 10 | unsigned int random_u32(unsigned long long *state) { 11 | // xorshift rng: https://en.wikipedia.org/wiki/Xorshift#xorshift.2A 12 | *state ^= *state >> 12; 13 | *state ^= *state << 25; 14 | *state ^= *state >> 27; 15 | return (*state * 0x2545F4914F6CDD1Dull) >> 32; 16 | } 17 | 18 | float random_f32(unsigned long long *state) { // random float32 in [0,1) 19 | return (random_u32(state) >> 8) / 16777216.0f; 20 | } 21 | 22 | int sample_softmax(const float* logits, int n, float coin) { 23 | // sample index from logits (converted to probabilities using softmax) 24 | // coin is a random number in [0, 1), usually from random_f32() 25 | double norm = 0; 26 | for (int i = 0; i < n; i++) { 27 | norm += expf(logits[i]); 28 | } 29 | // instead of dividing all exp(logits), we can just multiply coin. 
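    // i.e. we want the smallest i such that (exp(logits[0]) + ... + exp(logits[i])) / norm > coin;
    // scaling coin by norm once is equivalent and avoids dividing every partial sum by norm.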
30 | coin *= norm; 31 | float cdf = 0.0f; 32 | for (int i = 0; i < n; i++) { 33 | cdf += expf(logits[i]); 34 | if (coin < cdf) { 35 | return i; 36 | } 37 | } 38 | return n - 1; // in case of rounding errors 39 | } 40 | 41 | #endif -------------------------------------------------------------------------------- /scripts/run_gpt2_124M.sh: -------------------------------------------------------------------------------- 1 | # GPT-2 (124M) repro on FineWeb 2 | # 124M parameter model on 10B tokens 3 | # => 6 * 124e6 * 10e9 = 7.44e18 ~= 7e18 capability model 4 | # 18,865 steps of 524,288 tokens/step 5 | # on 8X A100 80GB SXM ($14/hr) steps in ~300ms/iter 6 | # => training time 18,865 * 300ms = 94.3 min ~= $20 7 | 8 | make train_gpt2cu USE_CUDNN=1 9 | out_dir="log_gpt2_124M" 10 | done_file="$out_dir/DONE_00018865" 11 | 12 | # in case the training stalls or crashes, loop to resume (-y 1) 13 | while true; do 14 | 15 | # exit condition is that optimization has finished 16 | if [ -f "$done_file" ]; then 17 | echo "File $done_file exists. Exiting the loop." 18 | break 19 | fi 20 | 21 | # run python dev/data/fineweb.py --version 10B to prepro data 22 | # run python dev/data/hellaswag.py to prepro hellaswag eval 23 | mpirun -np 8 ./train_gpt2cu \ 24 | -i "dev/data/fineweb10B/fineweb_train_*.bin" \ 25 | -j "dev/data/fineweb10B/fineweb_val_*.bin" \ 26 | -o $out_dir \ 27 | -v 250 -s 20000 -g 144 \ 28 | -h 1 \ 29 | -b 64 -t 1024 \ 30 | -d 524288 \ 31 | -r 0 \ 32 | -z 1 \ 33 | -c 0.1 \ 34 | -l 0.0006 \ 35 | -q 0.0 \ 36 | -u 700 \ 37 | -n 5000 \ 38 | -y 1 \ 39 | -e "d12" 40 | 41 | sleep 1 42 | done 43 | -------------------------------------------------------------------------------- /scripts/run_gpt2_1558M.sh: -------------------------------------------------------------------------------- 1 | # GPT-2 (1558M) repro on FineWeb-EDU 2 | # 1558M parameter model on 32B tokens 3 | # => 6 * 1558e6 * 32e9 = 6.966e20 ~= 3e20 capability model 4 | # 32,000 steps on ~1M tokens/step (1,048,576 to be precise) 5 | # on 8X H100 80GB SXM ($28/hr) steps in 2.80s/iter 6 | # => training time 32,000 steps * 2.7s => 24 hours ~= 1 day ~= $672 7 | 8 | make train_gpt2cu USE_CUDNN=1 9 | out_dir="log_gpt2_1558M" 10 | done_file="$out_dir/DONE_00032000" 11 | 12 | # in case the training stalls or crashes, loop to resume (-y 1) 13 | while true; do 14 | 15 | # exit condition is that optimization has finished 16 | if [ -f "$done_file" ]; then 17 | echo "File $done_file exists. Exiting the loop." 
18 | break 19 | fi 20 | 21 | mpirun -np 8 ./train_gpt2cu \ 22 | -i "dev/data/edu_fineweb100B/edu_fineweb_train_*.bin" \ 23 | -j "dev/data/edu_fineweb100B/edu_fineweb_val_*.bin" \ 24 | -o $out_dir \ 25 | -v 250 -s 300000 -g 384 \ 26 | -h 1 \ 27 | -b 16 -t 1024 \ 28 | -d 1048576 \ 29 | -r 0 \ 30 | -z 1 \ 31 | -c 0.1 \ 32 | -k "cosine" \ 33 | -l 0.0006 \ 34 | -q 0.1 \ 35 | -u 700 \ 36 | -n 2000 \ 37 | -x 32000 \ 38 | -ge 1 \ 39 | -y 1 \ 40 | -e "d48" 41 | 42 | sleep 1 43 | done 44 | -------------------------------------------------------------------------------- /llmc/cublas_common.h: -------------------------------------------------------------------------------- 1 | /* 2 | cuBLAS related utils 3 | */ 4 | #ifndef CUBLAS_COMMON_H 5 | #define CUBLAS_COMMON_H 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | // ---------------------------------------------------------------------------- 14 | // cuBLAS Precision settings 15 | 16 | #if defined(ENABLE_FP32) 17 | #define CUBLAS_LOWP CUDA_R_32F 18 | #elif defined(ENABLE_FP16) 19 | #define CUBLAS_LOWP CUDA_R_16F 20 | #else // default to bfloat16 21 | #define CUBLAS_LOWP CUDA_R_16BF 22 | #endif 23 | 24 | // ---------------------------------------------------------------------------- 25 | // cuBLAS globals for workspace, handle, settings 26 | 27 | // Hardcoding workspace to 32MiB but only Hopper needs 32 (for others 4 is OK) 28 | const size_t cublaslt_workspace_size = 32 * 1024 * 1024; 29 | void* cublaslt_workspace = NULL; 30 | cublasComputeType_t cublas_compute = CUBLAS_COMPUTE_32F; 31 | cublasLtHandle_t cublaslt_handle; 32 | 33 | // ---------------------------------------------------------------------------- 34 | // Error checking 35 | 36 | // cuBLAS error checking 37 | void cublasCheck(cublasStatus_t status, const char *file, int line) 38 | { 39 | if (status != CUBLAS_STATUS_SUCCESS) { 40 | printf("[cuBLAS ERROR]: %d %s %d\n", status, file, line); 41 | exit(EXIT_FAILURE); 42 | } 43 | } 44 | #define cublasCheck(status) { cublasCheck((status), __FILE__, __LINE__); } 45 | 46 | #endif // CUBLAS_COMMON_H -------------------------------------------------------------------------------- /scripts/run_gpt3_125M.sh: -------------------------------------------------------------------------------- 1 | # GPT-3 (125M) repro, but using FineWeb 2 | # 125M parameter model on 300B tokens 3 | # note context length: 1024 -> 2048 for GPT-3 4 | # => 6 * 125e6 * 300e9 = ~= 2.25e20 capability model 5 | # 572,204 steps of 524,288 tokens/step => 300B 6 | # on 8X A100 80GB SXM ($14/hr) steps in ~150ms/iter 7 | # => training time 572,204 * 150ms ~= 24 hours ~= $336 8 | 9 | make train_gpt2cu USE_CUDNN=1 10 | out_dir="log_gpt3_125M" 11 | done_file="$out_dir/DONE_00572204" 12 | 13 | while true; do 14 | 15 | # exit condition is that optimization has finished 16 | if [ -f "$done_file" ]; then 17 | echo "File $done_file exists. Exiting the loop." 
18 | break 19 | fi 20 | 21 | mpirun -np 8 ./train_gpt2cu \ 22 | -i "dev/data/fineweb100B/fineweb_train_*.bin" \ 23 | -j "dev/data/fineweb100B/fineweb_val_*.bin" \ 24 | -o $out_dir \ 25 | -v 250 -s 20000 -g 144 \ 26 | -h 1 \ 27 | -b 32 -t 2048 \ 28 | -d 524288 \ 29 | -r 0 \ 30 | -z 1 \ 31 | -c 0.1 \ 32 | -l 0.0006 \ 33 | -q 0.1 \ 34 | -u 700 \ 35 | -n 10000 \ 36 | -nk 5 \ 37 | -nm 50000 \ 38 | -ge 1 \ 39 | -sl 7.0 \ 40 | -sg 7.0 \ 41 | -y 1 \ 42 | -x 572204 \ 43 | -e "gpt3:c768" 44 | 45 | sleep 1 46 | done 47 | -------------------------------------------------------------------------------- /scripts/run_gpt2_350M.sh: -------------------------------------------------------------------------------- 1 | # GPT-2 (350M) repro on FineWeb 2 | # 350M parameter model on ~30B tokens 3 | # => 6 * 350e6 * 31.5e9 = 6.615e19 ~= 7e19 capability model (10X 124M) 4 | # 60K steps on 524,288 tokens/step 5 | # on 8X A100 80GB SXM ($14/hr) steps in ~820ms/iter 6 | # => training time 60,000 steps * 820ms = 13.7 hours ~= $200 (10X 124M) 7 | 8 | make train_gpt2cu USE_CUDNN=1 9 | out_dir="log_gpt2_350M" 10 | done_file="$out_dir/DONE_00060000" 11 | 12 | # in case the training stalls or crashes, loop to resume (-y 1) 13 | while true; do 14 | 15 | # exit condition is that optimization has finished 16 | if [ -f "$done_file" ]; then 17 | echo "File $done_file exists. Exiting the loop." 18 | break 19 | fi 20 | 21 | # run python dev/data/fineweb.py --version 100B to prepro data 22 | # run python dev/data/hellaswag.py to prepro hellaswag eval 23 | mpirun -np 8 ./train_gpt2cu \ 24 | -i "dev/data/fineweb100B/fineweb_train_*.bin" \ 25 | -j "dev/data/fineweb100B/fineweb_val_*.bin" \ 26 | -o $out_dir \ 27 | -v 250 -s 100000 -g 144 \ 28 | -h 1 \ 29 | -b 64 -t 1024 \ 30 | -d 524288 \ 31 | -r 0 \ 32 | -z 1 \ 33 | -c 0.1 \ 34 | -l 0.0003 \ 35 | -q 0.0 \ 36 | -u 700 \ 37 | -n 2000 \ 38 | -x 60000 \ 39 | -y 1 \ 40 | -e "d24" 41 | 42 | sleep 1 43 | done 44 | -------------------------------------------------------------------------------- /scripts/run_gpt2_774M.sh: -------------------------------------------------------------------------------- 1 | # GPT-2 (774M) repro on FineWeb 2 | # 774M parameter model on ~150B tokens 3 | # => 6 * 774e6 * 150e9 = 6.966e20 ~= 7e20 capability model (10X 350M) 4 | # => 286,102 steps on 524,288 tokens/step 5 | # on 8X A100 80GB SXM ($14/hr) steps in ~1.7s/iter 6 | # => training time 286,102 steps * 1.7s = 135 hours ~= 5.6 days ~= $2000 (10X 124M) 7 | 8 | make train_gpt2cu USE_CUDNN=1 9 | out_dir="log_gpt2_774M" 10 | done_file="$out_dir/DONE_00286102" 11 | 12 | # in case the training stalls or crashes, loop to resume (-y 1) 13 | while true; do 14 | 15 | # exit condition is that optimization has finished 16 | if [ -f "$done_file" ]; then 17 | echo "File $done_file exists. Exiting the loop." 
18 | break 19 | fi 20 | 21 | # run python dev/data/fineweb.py --version 100B to prepro data 22 | # run python dev/data/hellaswag.py to prepro hellaswag eval 23 | mpirun -np 8 ./train_gpt2cu \ 24 | -i "dev/data/fineweb100B/fineweb_train_*.bin" \ 25 | -j "dev/data/fineweb100B/fineweb_val_*.bin" \ 26 | -o $out_dir \ 27 | -v 250 -s 300000 -g 144 \ 28 | -h 1 \ 29 | -b 32 -t 1024 \ 30 | -d 524288 \ 31 | -r 0 \ 32 | -z 1 \ 33 | -c 0.1 \ 34 | -l 0.00025 \ 35 | -q 0.0 \ 36 | -u 700 \ 37 | -n 4000 \ 38 | -x 286102 \ 39 | -y 1 \ 40 | -e "d36" 41 | 42 | sleep 1 43 | done 44 | -------------------------------------------------------------------------------- /scripts/multi_node/run_gpt2_124M_mpi.sh: -------------------------------------------------------------------------------- 1 | 2 | make train_gpt2cu USE_CUDNN=1 3 | 4 | # NOTE: change the following to match your system 5 | binary_path="/home/ubuntu/llm.c/train_gpt2cu" 6 | out_dir="/ephemeral/data/fineweb/log_gpt2_124M_multi" 7 | train_data_path='/ephemeral/data/fineweb/bin_10B/fineweb_train_*.bin' 8 | val_data_path='/ephemeral/data/fineweb/bin_10B/fineweb_val_*.bin' 9 | # You can find these names either in `/etc/hosts`` file or in the terminal (user@host:~$). 10 | host1="h100-node-1-0" # master and worker node 11 | host2="h100-node-1-1" # worker node 12 | 13 | # In case the file system is shared this is a no-op. 14 | # Otherwise, we need to copy the binary to all nodes. 15 | scp -r $binary_path $USER@$host2:$binary_path 16 | 17 | # Use this for NCCL debugging if you run into issues 18 | # export NCCL_DEBUG=INFO 19 | # export NCCL_DEBUG_SUBSYS=ALL 20 | export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 21 | 22 | # Optimization flags 23 | export NCCL_NET_GDR_LEVEL=2 # use GPUDirect RDMA - allows for direct memory access between GPUs across different nodes by bypassing the CPU 24 | export NCCL_IB_DISABLE=0 # use InfiniBand if available 25 | 26 | # NOTE: change the following environment variables to match your system - or comment them out if you don't need them 27 | export NCCL_SOCKET_IFNAME=ens17 28 | export OMPI_MCA_btl_tcp_if_include=ens17 29 | export NCCL_P2P_LEVEL=PXB 30 | 31 | mpirun -np 16 --host $host1:8,$host2:8 \ 32 | $binary_path \ 33 | -i "$train_data_path" \ 34 | -j "$val_data_path" \ 35 | -o $out_dir \ 36 | -v 250 -s 20000 -g 144 \ 37 | -h 1 \ 38 | -b 64 -t 1024 \ 39 | -d 2097152 \ 40 | -r 0 \ 41 | -z 1 \ 42 | -c 0.1 \ 43 | -l 0.0006 \ 44 | -q 0.1 \ 45 | -u 700 \ 46 | -n 1000 \ 47 | -y 0 \ 48 | -e d12 \ 49 | -pi "mpi" \ 50 | -------------------------------------------------------------------------------- /dev/test/test_outlier_detector.c: -------------------------------------------------------------------------------- 1 | /* 2 | Tests our OutlierDetector 3 | 4 | compile and run as (from dev/test directory) 5 | gcc -O3 -I../../llmc -o test_outlier_detector test_outlier_detector.c -lm && ./test_outlier_detector 6 | */ 7 | 8 | #include 9 | #include "../../llmc/outlier_detector.h" 10 | 11 | int main(void) { 12 | OutlierDetector detector; 13 | init_detector(&detector); 14 | 15 | srand(1337); // init rng 16 | 17 | // generate OUTLIER_DETECTOR_WINDOW_SIZE * 2 random numbers between -1 and 1 18 | for (int i = 0; i < OUTLIER_DETECTOR_WINDOW_SIZE * 2; i++) { 19 | double val = (double)rand() / RAND_MAX * 2 - 1; // Random number between -1 and 1 20 | double zscore = update_detector(&detector, val); 21 | 22 | printf("Step %d: Value = %.4f, zscore = %.4f\n", i, val, zscore); 23 | 24 | // check that the first OUTLIER_DETECTOR_WINDOW_SIZE values return nan 25 | 
if (i < OUTLIER_DETECTOR_WINDOW_SIZE) { 26 | if (!isnan(zscore)) { 27 | printf("Error: Expected nan, got %.4f\n", zscore); 28 | return EXIT_FAILURE; 29 | } 30 | } else { 31 | // check that the zscore is within reasonable bounds 32 | if (zscore < -3.0 || zscore > 3.0) { 33 | printf("Error: Z-score %.4f is outside of expected range\n", zscore); 34 | return EXIT_FAILURE; 35 | } 36 | } 37 | } 38 | 39 | // simulate an outlier 40 | double outlier = 10.0; // <--- loss spike 41 | double zscore = update_detector(&detector, outlier); 42 | printf("Outlier Step: Value = %.4f, zscore = %.4f\n", outlier, zscore); 43 | 44 | // check that the z-score here is large 45 | if (zscore < 5.0) { 46 | printf("Error: Z-score %.4f is not large enough for an outlier\n", zscore); 47 | return EXIT_FAILURE; 48 | } 49 | 50 | printf("OK\n"); 51 | return EXIT_SUCCESS; 52 | } 53 | -------------------------------------------------------------------------------- /llmcpp/test_eigen_cpu.cpp: -------------------------------------------------------------------------------- 1 | #include "nn.hpp" 2 | 3 | using Tensor1D = Eigen::Tensor; 4 | using Tensor2D = Eigen::Tensor; 5 | using Tensor3D = Eigen::Tensor; 6 | using Tensor4D = Eigen::Tensor; 7 | 8 | int main(int argc, char** argv) { 9 | std::cout << "sizeof Tensor1D : " << sizeof(Tensor1D) << std::endl; 10 | std::cout << "sizeof Tensor2D : " << sizeof(Tensor2D) << std::endl; 11 | std::cout << "sizeof Tensor3D : " << sizeof(Tensor3D) << std::endl; 12 | std::cout << "sizeof Tensor4D : " << sizeof(Tensor4D) << std::endl; 13 | 14 | std::cout << "sizeof map Tensor1D : " << sizeof(Eigen::TensorMap) 15 | << std::endl; 16 | std::cout << "sizeof map Tensor2D : " << sizeof(Eigen::TensorMap) 17 | << std::endl; 18 | std::cout << "sizeof map Tensor3D : " << sizeof(Eigen::TensorMap) 19 | << std::endl; 20 | std::cout << "sizeof map Tensor4D : " << sizeof(Eigen::TensorMap) 21 | << std::endl; 22 | 23 | Eigen::setNbThreads(4); 24 | nn::ManualSeed(42); 25 | int B = 4, T = 64, C = 768, vocab_size = 50304; 26 | std::vector x(B * T * C), lm_head(C * vocab_size), 27 | y(B * T * vocab_size); 28 | nn::NormalFill(absl::MakeSpan(x)); 29 | nn::NormalFill(absl::MakeSpan(lm_head)); 30 | 31 | auto xm = MakeConstMatrix(x.data(), B * T, C); 32 | auto lm_headm = MakeConstMatrix(lm_head.data(), C, vocab_size); 33 | auto ym = MakeMatrix(y.data(), B * T, vocab_size); 34 | 35 | auto start = std::chrono::steady_clock::now(); 36 | for (int i = 0; i < 10; ++i) { 37 | nn::MatMul::Forward(xm, lm_headm, ym); 38 | } 39 | auto end = std::chrono::steady_clock::now(); 40 | std::cout << "avg: " 41 | << std::chrono::duration_cast( 42 | (end - start)) 43 | .count() / 44 | 10 45 | << std::endl; 46 | 47 | return 0; 48 | } 49 | -------------------------------------------------------------------------------- /llmc/logger.h: -------------------------------------------------------------------------------- 1 | /* 2 | Implements a simple logger that writes log files in the output directory. 3 | The Logger object is stateless and uses append mode to write to log files. 
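Each logging call below appends a single line of text; the record formats are:
  "s:%d eval:%.4f"                    (logger_log_eval)
  "s:%d tel:%.4f"                     (logger_log_val)
  "s:%d trl:%.4f lr:%.6f norm:%.2f"   (logger_log_train)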
4 | */ 5 | #ifndef LOGGER_H 6 | #define LOGGER_H 7 | 8 | #include 9 | #include 10 | #include 11 | // defines: fopenCheck, freadCheck, fcloseCheck, fseekCheck, mallocCheck 12 | #include "utils.h" 13 | 14 | typedef struct { 15 | int active; 16 | char output_log_file[512]; 17 | } Logger; 18 | 19 | void logger_init(Logger *logger, const char *log_dir, int process_rank, int resume) { 20 | // currently, only rank 0 writes logs 21 | logger->active = 0; 22 | if (log_dir != NULL && process_rank == 0) { 23 | logger->active = 1; 24 | assert(strlen(log_dir) < 500); // being a bit lazy, could relax later 25 | snprintf(logger->output_log_file, 512, "%s/main.log", log_dir); 26 | if (resume == 0) { 27 | // wipe any existing logfile clean if we're starting fresh 28 | FILE *logfile = fopenCheck(logger->output_log_file, "w"); 29 | fclose(logfile); 30 | } 31 | } 32 | } 33 | 34 | void logger_log_eval(Logger *logger, int step, float val) { 35 | if (logger->active == 1) { 36 | FILE *logfile = fopenCheck(logger->output_log_file, "a"); 37 | fprintf(logfile, "s:%d eval:%.4f\n", step, val); 38 | fclose(logfile); 39 | } 40 | } 41 | 42 | void logger_log_val(Logger *logger, int step, float val_loss) { 43 | if (logger->active == 1) { 44 | FILE *logfile = fopenCheck(logger->output_log_file, "a"); 45 | fprintf(logfile, "s:%d tel:%.4f\n", step, val_loss); 46 | fclose(logfile); 47 | } 48 | } 49 | 50 | void logger_log_train(Logger *logger, int step, float train_loss, float learning_rate, float grad_norm) { 51 | if (logger->active == 1) { 52 | FILE *logfile = fopenCheck(logger->output_log_file, "a"); 53 | fprintf(logfile, "s:%d trl:%.4f lr:%.6f norm:%.2f\n", step, train_loss, learning_rate, grad_norm); 54 | fclose(logfile); 55 | } 56 | } 57 | 58 | #endif -------------------------------------------------------------------------------- /dev/test/device_file_io.cu: -------------------------------------------------------------------------------- 1 | /* 2 | Tests device <-> file IO functions 3 | 4 | compile and run as (from dev/test directory) 5 | nvcc -o device_file_io device_file_io.cu && ./device_file_io 6 | */ 7 | 8 | 9 | #include "../../llmc/cuda_common.h" 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | void test(size_t nelem, size_t wt_buf_size, size_t rd_buf_size) { 16 | 17 | float* data; 18 | cudaCheck(cudaMalloc(&data, nelem*sizeof(float))); 19 | 20 | // generate random array 21 | std::vector random_data(nelem); 22 | std::mt19937 rng(42); 23 | std::uniform_real_distribution dist(-100.f, 100.f); 24 | std::generate(random_data.begin(), random_data.end(), [&](){ return dist(rng); }); 25 | 26 | cudaCheck(cudaMemcpy(data, random_data.data(), random_data.size()*sizeof(float), cudaMemcpyHostToDevice)); 27 | 28 | cudaStream_t stream; 29 | cudaStreamCreate(&stream); 30 | 31 | FILE* tmp = fopenCheck("tmp.bin", "w"); 32 | device_to_file(tmp, data, nelem * sizeof(float), wt_buf_size, stream); 33 | fcloseCheck(tmp); 34 | 35 | 36 | float* reload; 37 | cudaCheck(cudaMalloc(&reload, nelem*sizeof(float))); 38 | 39 | tmp = fopenCheck("tmp.bin", "r"); 40 | file_to_device(reload, tmp, nelem * sizeof(float), rd_buf_size, stream); 41 | fcloseCheck(tmp); 42 | 43 | std::vector cmp(nelem); 44 | cudaCheck(cudaMemcpy(cmp.data(), reload, nelem * sizeof(float), cudaMemcpyDeviceToHost)); 45 | for(int i = 0; i < nelem; ++i) { 46 | if(random_data[i] != cmp[i]) { 47 | fprintf(stderr, "FAIL: Mismatch at position %d: %f vs %f\n", i, random_data[i], cmp[i]); 48 | remove("tmp.bin"); 49 | exit(EXIT_FAILURE); 50 | } 51 | } 52 | 53 | 
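    // clean up: free both device buffers and delete the temporary file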
cudaCheck(cudaFree(reload)); 54 | cudaCheck(cudaFree(data)); 55 | remove("tmp.bin"); 56 | } 57 | 58 | int main() { 59 | test(1025, 10000, 10000); // buffers larger than data 60 | test(1025, 1024, 513); // different and smaller 61 | test(500, 500*sizeof(float), 62 | 500*sizeof(float)); // exact match 63 | test(125'000, 10000, 10000); // large array 64 | } -------------------------------------------------------------------------------- /doc/layernorm/layernorm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | eps = 1e-5 4 | 5 | class LayerNorm: 6 | 7 | @staticmethod 8 | def forward(x, w, b): 9 | B, T, C = x.size() 10 | mean = x.sum(-1, keepdim=True) / C # B,T,1 11 | xshift = x - mean # B,T,C 12 | var = (xshift**2).sum(-1, keepdim=True) / C # B,T,1 13 | rstd = (var + eps) ** -0.5 # B,T,1 14 | norm = xshift * rstd # B,T,C 15 | out = norm * w + b # B,T,C 16 | 17 | cache = (x, w, mean, rstd) 18 | return out, cache 19 | 20 | @staticmethod 21 | def backward(dout, cache): 22 | x, w, mean, rstd = cache 23 | # recompute the norm (save memory at the cost of compute) 24 | norm = (x - mean) * rstd 25 | # gradients for weights, bias 26 | db = dout.sum((0, 1)) 27 | dw = (dout * norm).sum((0, 1)) 28 | # gradients for input 29 | dnorm = dout * w 30 | dx = dnorm - dnorm.mean(-1, keepdim=True) - norm * (dnorm * norm).mean(-1, keepdim=True) 31 | dx *= rstd 32 | return dx, dw, db 33 | 34 | # create a small dummy example and check w.r.t PyTorch backward 35 | B = 2 36 | T = 3 37 | C = 4 38 | x = torch.randn(B, T, C, requires_grad=True) 39 | w = torch.randn(C, requires_grad=True) 40 | b = torch.randn(C, requires_grad=True) 41 | out, cache = LayerNorm.forward(x, w, b) 42 | 43 | dout = torch.randn(B, T, C) 44 | dx, dw, db = LayerNorm.backward(dout, cache) 45 | 46 | # compare to PyTorch autograd 47 | fakeloss = (out * dout).sum() 48 | fakeloss.backward() 49 | print("dx error:", (x.grad - dx).abs().max().item()) 50 | print("dw error:", (w.grad - dw).abs().max().item()) 51 | print("db error:", (b.grad - db).abs().max().item()) 52 | 53 | # for reference checking in C also 54 | x, w, mean, rstd = cache 55 | 56 | def write(tensor, handle): 57 | handle.write(tensor.detach().numpy().astype("float32").tobytes()) 58 | 59 | # Write to file 60 | with open('ln.bin', 'wb') as file: 61 | write(x, file) # (B, T, C) 62 | write(w, file) # (C, ) 63 | write(b, file) # (C, ) 64 | write(out, file) # (B, T, C) 65 | write(mean, file) # (B, T) 66 | write(rstd, file) # (B, T) 67 | write(dout, file) # (B, T, C) 68 | write(dx, file) # (B, T, C) 69 | write(dw, file) # (C, ) 70 | write(db, file) # (C, ) 71 | -------------------------------------------------------------------------------- /dev/data/edu_fineweb.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Downloads the FineWeb-Edu 100B dataset, but in an already tokenized format in .bin files 4 | # Example: ./edu_fineweb.sh 100 5 | # would download 100 shards 6 | # Default is all shards 7 | # Make sure to run this from current directory, i.e. inside ./dev/data! 
8 | 9 | # Check if MAX_SHARDS is provided as positional first arg, otherwise default to 1024 10 | if [ $# -eq 0 ]; then 11 | MAX_SHARDS=1001 12 | else 13 | MAX_SHARDS=$1 14 | fi 15 | 16 | if [ $MAX_SHARDS -gt 1001 ]; then 17 | MAX_SHARDS=1001 18 | fi 19 | 20 | # Base URLs 21 | TRAIN_BASE_URL="https://huggingface.co/datasets/karpathy/fineweb-edu-100B-gpt2-token-shards/resolve/main/edu_fineweb_train_" 22 | VAL_URL="https://huggingface.co/datasets/karpathy/fineweb-edu-100B-gpt2-token-shards/resolve/main/edu_fineweb_val_000000.bin" 23 | 24 | # Directory to save files 25 | SAVE_DIR="edu_fineweb100B" 26 | 27 | # Create the directory if it doesn't exist 28 | mkdir -p "$SAVE_DIR" 29 | 30 | download() { 31 | local FILE_URL=$1 32 | local FILE_NAME=$(basename $FILE_URL | cut -d'?' -f1) 33 | local FILE_PATH="${SAVE_DIR}/${FILE_NAME}" 34 | curl -s -L -o "$FILE_PATH" "$FILE_URL" 35 | echo "Downloaded $FILE_NAME to $SAVE_DIR" 36 | } 37 | 38 | # Function to manage parallel jobs 39 | run_in_parallel() { 40 | local max_jobs=$1 41 | shift 42 | local commands=("$@") 43 | local job_count=0 44 | 45 | for cmd in "${commands[@]}"; do 46 | eval "$cmd" & 47 | ((job_count++)) 48 | if (( job_count >= max_jobs )); then 49 | wait -n 50 | ((job_count--)) 51 | fi 52 | done 53 | 54 | # Wait for any remaining jobs to finish 55 | wait 56 | } 57 | 58 | # Export the function so it's available in subshells 59 | export -f download 60 | 61 | # Download the validation shard 62 | download "$VAL_URL" & 63 | 64 | # Generate train file shard download commands 65 | train_commands=() 66 | for i in $(seq -f "%06g" 1 $MAX_SHARDS); do 67 | FILE_URL="${TRAIN_BASE_URL}${i}.bin?download=true" 68 | train_commands+=("download \"$FILE_URL\"") 69 | done 70 | 71 | # Run the train file commands in parallel 72 | run_in_parallel 40 "${train_commands[@]}" 73 | echo "The val shard and first $MAX_SHARDS train shards of FineWebEdu100B files downloaded in $SAVE_DIR" 74 | -------------------------------------------------------------------------------- /dev/data/fineweb.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Downloads the FineWeb100B dataset, but in an already tokenized format in .bin files 4 | # Example: ./fineweb.sh 100 5 | # would download 100 shards 6 | # Default is all shards 7 | 8 | # Check if MAX_SHARDS is provided as positional first arg, otherwise default to 1024 9 | if [ $# -eq 0 ]; then 10 | MAX_SHARDS=1028 11 | else 12 | MAX_SHARDS=$1 13 | fi 14 | 15 | # Ensure MAX_SHARDS is not greater than 1028 16 | if [ $MAX_SHARDS -gt 1028 ]; then 17 | MAX_SHARDS=1028 18 | fi 19 | 20 | # Base URLs 21 | TRAIN_BASE_URL="https://huggingface.co/datasets/chrisdryden/FineWebTokenizedGPT2/resolve/main/fineweb_train_" 22 | VAL_URL="https://huggingface.co/datasets/chrisdryden/FineWebTokenizedGPT2/resolve/main/fineweb_val_000000.bin?download=true" 23 | 24 | # Directory to save files 25 | SAVE_DIR="fineweb100B" 26 | 27 | # Create the directory if it doesn't exist 28 | mkdir -p "$SAVE_DIR" 29 | 30 | # Function to download, decompress, and delete files 31 | download() { 32 | local FILE_URL=$1 33 | local FILE_NAME=$(basename $FILE_URL | cut -d'?' 
-f1) 34 | local FILE_PATH="${SAVE_DIR}/${FILE_NAME}" 35 | 36 | # Download the file 37 | curl -s -L -o "$FILE_PATH" "$FILE_URL" 38 | echo "Downloaded $FILE_NAME to $SAVE_DIR" 39 | } 40 | 41 | # Function to manage parallel jobs 42 | run_in_parallel() { 43 | local max_jobs=$1 44 | shift 45 | local commands=("$@") 46 | local job_count=0 47 | 48 | for cmd in "${commands[@]}"; do 49 | eval "$cmd" & 50 | ((job_count++)) 51 | if (( job_count >= max_jobs )); then 52 | wait -n 53 | ((job_count--)) 54 | fi 55 | done 56 | 57 | # Wait for any remaining jobs to finish 58 | wait 59 | } 60 | 61 | # Export the function so it's available in subshells 62 | export -f download 63 | 64 | # Download 65 | download "$VAL_URL" & 66 | 67 | # Generate train file commands 68 | train_commands=() 69 | for i in $(seq -f "%06g" 1 $MAX_SHARDS); do 70 | FILE_URL="${TRAIN_BASE_URL}${i}.bin?download=true" 71 | train_commands+=("download \"$FILE_URL\"") 72 | done 73 | 74 | # Run the train file commands in parallel 75 | run_in_parallel 40 "${train_commands[@]}" 76 | 77 | echo "The val shard and first $MAX_SHARDS train shards of FineWeb100B files downloaded in $SAVE_DIR" 78 | -------------------------------------------------------------------------------- /dev/download_starter_pack.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Get the directory of the script 4 | SCRIPT_DIR=$(dirname "$(realpath "$0")") 5 | 6 | # Base URL 7 | BASE_URL="https://huggingface.co/datasets/karpathy/llmc-starter-pack/resolve/main/" 8 | 9 | # Directory paths based on script location 10 | SAVE_DIR_PARENT="$SCRIPT_DIR/.." 11 | SAVE_DIR_TINY="$SCRIPT_DIR/data/tinyshakespeare" 12 | SAVE_DIR_HELLA="$SCRIPT_DIR/data/hellaswag" 13 | 14 | # Create the directories if they don't exist 15 | mkdir -p "$SAVE_DIR_TINY" 16 | mkdir -p "$SAVE_DIR_HELLA" 17 | 18 | # Files to download 19 | FILES=( 20 | "gpt2_124M.bin" 21 | "gpt2_124M_bf16.bin" 22 | "gpt2_124M_debug_state.bin" 23 | "gpt2_tokenizer.bin" 24 | "tiny_shakespeare_train.bin" 25 | "tiny_shakespeare_val.bin" 26 | "hellaswag_val.bin" 27 | ) 28 | 29 | # Function to download files to the appropriate directory 30 | download_file() { 31 | local FILE_NAME=$1 32 | local FILE_URL="${BASE_URL}${FILE_NAME}?download=true" 33 | local FILE_PATH 34 | 35 | # Determine the save directory based on the file name 36 | if [[ "$FILE_NAME" == tiny_shakespeare* ]]; then 37 | FILE_PATH="${SAVE_DIR_TINY}/${FILE_NAME}" 38 | elif [[ "$FILE_NAME" == hellaswag* ]]; then 39 | FILE_PATH="${SAVE_DIR_HELLA}/${FILE_NAME}" 40 | else 41 | FILE_PATH="${SAVE_DIR_PARENT}/${FILE_NAME}" 42 | fi 43 | 44 | # Download the file 45 | curl -s -L -o "$FILE_PATH" "$FILE_URL" 46 | echo "Downloaded $FILE_NAME to $FILE_PATH" 47 | } 48 | 49 | # Export the function so it's available in subshells 50 | export -f download_file 51 | 52 | # Generate download commands 53 | download_commands=() 54 | for FILE in "${FILES[@]}"; do 55 | download_commands+=("download_file \"$FILE\"") 56 | done 57 | 58 | # Function to manage parallel jobs in increments of a given size 59 | run_in_parallel() { 60 | local batch_size=$1 61 | shift 62 | local i=0 63 | local command 64 | 65 | for command; do 66 | eval "$command" & 67 | ((i = (i + 1) % batch_size)) 68 | if [ "$i" -eq 0 ]; then 69 | wait 70 | fi 71 | done 72 | 73 | # Wait for any remaining jobs to finish 74 | wait 75 | } 76 | 77 | # Run the download commands in parallel in batches of 2 78 | run_in_parallel 6 "${download_commands[@]}" 79 | 80 | echo "All files downloaded 
and saved in their respective directories" -------------------------------------------------------------------------------- /llmcpp/test_eigen_gpu.cu: -------------------------------------------------------------------------------- 1 | //#define EIGEN_USE_GPU 2 | 3 | #include "gpt.hpp" 4 | // #include "optim.hpp" 5 | 6 | #include "Eigen/Core" 7 | #include "unsupported/Eigen/CXX11/Tensor" 8 | 9 | using Tensor1D = Eigen::Tensor; 10 | using Tensor2D = Eigen::Tensor; 11 | using Tensor3D = Eigen::Tensor; 12 | using Tensor4D = Eigen::Tensor; 13 | 14 | int main(int argc, char** argv) { 15 | nn::ManualSeed(42); 16 | int B = 4, T = 64, C = 768, vocab_size = 50304; 17 | std::vector x(B * T * C), lm_head(C * vocab_size), 18 | y(B * T * vocab_size); 19 | nn::NormalFill(absl::MakeSpan(x)); 20 | nn::NormalFill(absl::MakeSpan(lm_head)); 21 | Eigen::GpuStreamDevice stream; 22 | Eigen::GpuDevice gpu_device(&stream); 23 | // Eigen::ThreadPool thread_pool(16); 24 | // Eigen::ThreadPoolDevice gpu_device(&thread_pool, 12); 25 | 26 | float *dx, *dy, *dlm_head; 27 | dx = static_cast(gpu_device.allocate(sizeof(float) * B * T * C)); 28 | dlm_head = 29 | static_cast(gpu_device.allocate(sizeof(float) * C * vocab_size)); 30 | dy = static_cast( 31 | gpu_device.allocate(sizeof(float) * B * T * vocab_size)); 32 | gpu_device.memcpyHostToDevice(dx, x.data(), sizeof(float) * B * T * C); 33 | gpu_device.memcpyHostToDevice(dlm_head, lm_head.data(), 34 | sizeof(float) * C * vocab_size); 35 | gpu_device.memcpyHostToDevice(dy, y.data(), 36 | sizeof(float) * B * T * vocab_size); 37 | 38 | auto xm = Eigen::TensorMap(dx, B * T, C); 39 | auto lm_headm = Eigen::TensorMap(dlm_head, C, vocab_size); 40 | auto ym = Eigen::TensorMap(dy, B * T, vocab_size); 41 | 42 | auto start = std::chrono::steady_clock::now(); 43 | for (int i = 0; i < 10; ++i) { 44 | Eigen::array, 1> product_dims = { 45 | Eigen::IndexPair(1, 0)}; 46 | ym.device(gpu_device) = xm.contract(lm_headm, product_dims); 47 | // nn::MatMul::Forward(xm, lm_headm, ym); 48 | } 49 | gpu_device.synchronize(); 50 | auto end = std::chrono::steady_clock::now(); 51 | std::cout << "avg: " 52 | << std::chrono::duration_cast( 53 | (end - start)) 54 | .count() / 55 | 10 56 | << std::endl; 57 | 58 | return 0; 59 | } 60 | -------------------------------------------------------------------------------- /dev/cuda/README.md: -------------------------------------------------------------------------------- 1 | # dev/cuda 2 | 3 | This directory is scratch space for developing various versions of the needed CUDA kernels. Each file develops a kernel, and usually multiple versions of that kernel that could have different running times and of different code or time complexity. 4 | 5 | See the top of each file for how to compile and run the kernel. Alternatively, the commands are also all grouped in the `Makefile` in this directory for convenience. 6 | 7 | For example, we can look at the top of `layernorm_forward.cu` to build the forward pass kernels for the LayerNorm: 8 | 9 | ```bash 10 | nvcc -O3 --use_fast_math -lcublas -lcublasLt layernorm_forward.cu -o layernorm_forward 11 | ``` 12 | 13 | or simply 14 | 15 | ```bash 16 | make layernorm_forward 17 | ``` 18 | 19 | The comments at the top then document the different versions of this kernel available, usually these are in increasing complexity and decreasing running times. 
For example, inspecting the comments at the top of the file, we can run the most naive kernel as:
20 | 
21 | ```bash
22 | ./layernorm_forward 1
23 | ```
24 | 
25 | You'll see that this first runs the reference code on the CPU, then it runs kernel 1 on the GPU, compares the results to check for correctness, and then runs a number of configurations of this kernel (most often and most notably the block size), to time the kernel in these launch configurations. We can then run one of the faster kernels (kernel 4) instead:
26 | 
27 | ```bash
28 | ./layernorm_forward 4
29 | ```
30 | 
31 | You'll see that this matches all the CPU results but runs much, much faster. The typical process from here on is that we copy-paste the kernel that ran fastest, adjust it manually (e.g. to hardcode the best block size), and drop it into the training code file, e.g. `train_gpt2.cu`.
32 | 
33 | To add a new version of a kernel, add the kernel to the corresponding file and adjust the docs. To add a new kernel, add the new file and adjust the Makefile. Run `make clean` to clean up binaries from your directory.
34 | 
35 | If you do not have a GPU or are having trouble with CUDA dependencies, you can run the benchmarks on the [Modal platform](http://modal.com). For example, to run the benchmark for the attention forward pass on an A100 GPU with 80GB of memory, you can run the following command:
36 | 
37 | ```bash
38 | GPU_MEM=80 modal run benchmark_on_modal.py --compile-command "nvcc -O3 --use_fast_math attention_forward.cu -o attention_forward -lcublas" --run-command "./attention_forward 1"
39 | ```
40 | 
--------------------------------------------------------------------------------
/dev/data/tinyshakespeare.py:
--------------------------------------------------------------------------------
1 | """
2 | Downloads and tokenizes the TinyShakespeare dataset.
3 | - The download is from Github.
4 | - The tokenization is done with the GPT-2 tokenizer using tiktoken.
5 | 
6 | The output is written to a newly created tinyshakespeare/ folder.
7 | The script prints:
8 | 
9 | Saved 32768 tokens to tinyshakespeare/tiny_shakespeare_val.bin
10 | Saved 305260 tokens to tinyshakespeare/tiny_shakespeare_train.bin
11 | 
12 | And runs in a few seconds depending on your internet
13 | connection and computer. The .bin files are raw byte
14 | streams of int32 numbers indicating the token ids.
15 | """ 16 | 17 | import os 18 | import tiktoken 19 | import numpy as np 20 | from data_common import download_file, write_datafile 21 | 22 | # ----------------------------------------------------------------------------- 23 | DATA_CACHE_DIR = os.path.join(os.path.dirname(__file__), "tinyshakespeare") 24 | 25 | enc = tiktoken.get_encoding("gpt2") 26 | encode = lambda s: enc.encode(s, allowed_special={'<|endoftext|>'}) 27 | 28 | def download(): 29 | """Downloads the TinyShakespeare dataset to DATA_CACHE_DIR""" 30 | os.makedirs(DATA_CACHE_DIR, exist_ok=True) 31 | # download the TinyShakespeare dataset, unless it's already downloaded 32 | data_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt" 33 | data_filename = os.path.join(DATA_CACHE_DIR, "tiny_shakespeare.txt") 34 | if not os.path.exists(data_filename): 35 | print(f"Downloading {data_url} to {data_filename}...") 36 | download_file(data_url, data_filename) 37 | else: 38 | print(f"{data_filename} already exists, skipping download...") 39 | 40 | def tokenize(): 41 | data_filename = os.path.join(DATA_CACHE_DIR, "tiny_shakespeare.txt") 42 | text = open(data_filename, 'r').read() 43 | # let's treat every person's statement in the dialog as a separate document 44 | text = "<|endoftext|>" + text 45 | text = text.replace('\n\n', '\n\n<|endoftext|>') 46 | # encode the text 47 | tokens = encode(text) 48 | # let's take the first 32,768 tokens as the validation split (~10%) 49 | val_tokens = tokens[:32768] 50 | train_tokens = tokens[32768:] 51 | # save to file 52 | val_filename = os.path.join(DATA_CACHE_DIR, "tiny_shakespeare_val.bin") 53 | train_filename = os.path.join(DATA_CACHE_DIR, "tiny_shakespeare_train.bin") 54 | write_datafile(val_filename, val_tokens) 55 | write_datafile(train_filename, train_tokens) 56 | 57 | if __name__ == "__main__": 58 | download() 59 | tokenize() 60 | -------------------------------------------------------------------------------- /llmcpp/tensor_util.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LLM_CPP_LLMCPP_TENSOR_UTIL_HPP_ 2 | #define LLM_CPP_LLMCPP_TENSOR_UTIL_HPP_ 3 | 4 | #include "absl/container/inlined_vector.h" 5 | #include "tensor_types.hpp" 6 | 7 | using floatX = float; 8 | 9 | // Raw pointer -> Flat 10 | template 11 | typename TTypes::Flat MakeFlat(T* t, int length) { 12 | return {t, length}; 13 | } 14 | template 15 | typename TTypes::ConstFlat MakeConstFlat(T* t, int length) { 16 | return {t, length}; 17 | } 18 | template 19 | typename TTypes::ConstFlat MakeConstFlat(const T* t, int length) { 20 | return {t, length}; 21 | } 22 | 23 | // Raw pointer -> Matrix 24 | template 25 | typename TTypes::Matrix MakeMatrix(T* t, int rows, int cols) { 26 | return {t, rows, cols}; 27 | } 28 | template 29 | typename TTypes::ConstMatrix MakeConstMatrix(T* t, int rows, int cols) { 30 | return {t, rows, cols}; 31 | } 32 | template 33 | typename TTypes::ConstMatrix MakeConstMatrix(const T* t, int rows, 34 | int cols) { 35 | return {t, rows, cols}; 36 | } 37 | 38 | // Raw pointer -> 3D Tensor 39 | template 40 | typename TTypes::Tensor Make3DTensor(T* t, int dim0, int dim1, int dim2) { 41 | return {t, dim0, dim1, dim2}; 42 | } 43 | template 44 | typename TTypes::ConstTensor MakeConst3DTensor(T* t, int dim0, int dim1, 45 | int dim2) { 46 | return {t, dim0, dim1, dim2}; 47 | } 48 | template 49 | typename TTypes::ConstTensor MakeConst3DTensor(const T* t, int dim0, 50 | int dim1, int dim2) { 51 | return {t, dim0, dim1, dim2}; 52 | } 
53 | 54 | // Raw pointer -> 4D Tensor 55 | template 56 | typename TTypes::Tensor Make4DTensor(T* t, int dim0, int dim1, int dim2, 57 | int dim3) { 58 | return {t, dim0, dim1, dim2, dim3}; 59 | } 60 | template 61 | typename TTypes::ConstTensor MakeConst4DTensor(T* t, int dim0, int dim1, 62 | int dim2, int dim3) { 63 | return {t, dim0, dim1, dim2, dim3}; 64 | } 65 | template 66 | typename TTypes::ConstTensor MakeConst4DTensor(const T* t, int dim0, 67 | int dim1, int dim2, 68 | int dim3) { 69 | return {t, dim0, dim1, dim2, dim3}; 70 | } 71 | 72 | #endif // LLM_CPP_LLMCPP_TENSOR_UTIL_HPP_ 73 | -------------------------------------------------------------------------------- /llmcpp/gpt_optim.cpp: -------------------------------------------------------------------------------- 1 | #include "gpt.hpp" 2 | #include "optim.hpp" 3 | 4 | int main(int argc, char** argv) { 5 | /* 6 | torch.set_printoptions(precision=6) 7 | torch.manual_seed(42) 8 | config = GPTConfig(block_size=8, n_embd=16, n_head=4, n_layer=8, vocab_size=100) 9 | gpt2 = GPT(config=config) 10 | B, T, C = 4, 8, 16 11 | idx = torch.LongTensor([[35, 28, 51, 9, 81, 41, 30, 22], 12 | [99, 91, 96, 20, 99, 46, 85, 63], 13 | [ 0, 78, 75, 43, 94, 99, 78, 93], 14 | [14, 42, 54, 11, 63, 42, 99, 48]]) 15 | targets = torch.LongTensor([[28, 51, 9, 81, 41, 30, 22, 99], 16 | [91, 96, 20, 99, 46, 85, 63, 0], 17 | [78, 75, 43, 94, 99, 78, 93, 14], 18 | [42, 54, 11, 63, 42, 99, 48, 0]]) 19 | optimizer = torch.optim.SGD(gpt2.parameters(), 20 | lr=1e-2) 21 | for i in range(10): 22 | logit, loss = gpt2(idx, targets) 23 | optimizer.zero_grad() 24 | loss.backward() 25 | optimizer.step() 26 | print('loss', loss) 27 | */ 28 | 29 | Eigen::setNbThreads(4); 30 | nn::ManualSeed(42); 31 | int block_size = 8, n_embd = 16, n_head = 4, n_layer = 8, vocab_size = 100; 32 | int B = 4, T = block_size, C = n_embd, nh = n_head, hs = n_embd / nh; 33 | gpt::GPT gpt(block_size, vocab_size, vocab_size, n_layer, n_head, n_embd); 34 | 35 | std::vector idx = {35, 28, 51, 9, 81, 41, 30, 22, 99, 91, 96, 36 | 20, 99, 46, 85, 63, 0, 78, 75, 43, 94, 99, 37 | 78, 93, 14, 42, 54, 11, 63, 42, 99, 48}; 38 | auto idx_m = TTypes::ConstMatrix(idx.data(), B, T); 39 | std::vector logits(B * T * vocab_size); 40 | auto logits_3d = Make3DTensor(logits.data(), B, T, vocab_size); 41 | 42 | std::vector target = {28, 51, 9, 81, 41, 30, 22, 99, 91, 96, 20, 43 | 99, 46, 85, 63, 0, 78, 75, 43, 94, 99, 78, 44 | 93, 14, 42, 54, 11, 63, 42, 99, 48, 0}; 45 | auto target_m = TTypes::ConstMatrix(target.data(), B, T); 46 | 47 | std::vector parameters; 48 | gpt.Parameters(¶meters); 49 | auto optimizer = optim::SGD(parameters, 1e-2f); 50 | float expected_loss[] = { 51 | 4.691669, 4.668904, 4.646729, 4.625142, 4.604129, 52 | 4.583667, 4.563725, 4.544271, 4.525268, 4.506680, 53 | }; 54 | for (int step = 0; step < 10; ++step) { 55 | float loss = 0.0; 56 | gpt.ForwardCPU(idx_m, target_m, logits_3d, &loss); 57 | optimizer.ZeroGrad(); 58 | gpt.BackwardCPU(idx_m, target_m); 59 | optimizer.Step(); 60 | fprintf(stdout, "Step %d, loss = %.6f\n", step, loss); 61 | CHECK(std::abs(loss - expected_loss[step]) < 1e-5); 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /dev/eval/README.md: -------------------------------------------------------------------------------- 1 | # eleuther eval readme 2 | 3 | The goal here is to run the Eleuther Eval harness exactly in the same way as that used in the [huggingface LLM 
Leaderboard](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard).
4 | 
5 | The starting point is a `.bin` file trained by llm.c. We now have to export it to a huggingface model and then evaluate it.
6 | 
7 | To export the model, use [export_hf.py](export_hf.py). See its documentation up top. Example usage, from this directory:
8 | 
9 | ```bash
10 | cd dev/eval
11 | python export_hf.py --input model.bin --output output_dir
12 | ```
13 | 
14 | Where you point to your model .bin file, and huggingface files get written to output_dir. The script can optionally also upload to huggingface hub. One more post-processing step that is advisable is to go into the `output_dir`, open up the `config.json` there and add one more entry into the json object:
15 | 
16 | ```
17 | "_attn_implementation": "flash_attention_2"
18 | ```
19 | 
20 | This enables FlashAttention 2. We had trouble evaluating in bfloat16 without using FlashAttention 2 (the scores are much lower, and this was never fully resolved). This is a temporary hack/workaround.
21 | 
22 | Now that we have the model in huggingface format, we download the Eleuther Eval Harness repo and run it. Head over to the parent/root directory of the llm.c repo and:
23 | 
24 | ```bash
25 | git clone https://github.com/EleutherAI/lm-evaluation-harness/
26 | cd lm-evaluation-harness
27 | git checkout b281b0921b636bc36ad05c0b0b0763bd6dd43463
28 | pip install -e .
29 | ```
30 | 
31 | And then run the run_eval.sh script:
32 | 
33 | ```bash
34 | ./dev/eval/run_eval.sh output_dir results_dir
35 | ```
36 | 
37 | Where output_dir can either be the local output dir (above) or a huggingface repo name. This will write eval json objects to `./lm-evaluation-harness/results/results_dir`. It will also print the results to the console, e.g. for a 774M model we see:
38 | 
39 | ```
40 | ----------------------------------------
41 | arc_challenge_25shot.json : 30.4608
42 | gsm8k_5shot.json : 0.1516
43 | hellaswag_10shot.json : 57.8072
44 | mmlu_5shot.json : 25.8682
45 | truthfulqa_0shot.json : 35.7830
46 | winogrande_5shot.json : 59.3528
47 | ----------------------------------------
48 | Average Score : 34.9039
49 | ```
50 | 
51 | But you can also re-print these results later by running `summarize_eval.py`:
52 | 
53 | ```bash
54 | python dev/eval/summarize_eval.py lm-evaluation-harness/results/results_dir
55 | ```
56 | 
57 | The same information will be printed again.
58 | 
59 | For some reason, the evaluation is quite expensive and runs for somewhere around 1-3 hours, even though it should only take a few minutes at most. This has not been satisfactorily resolved so far.
--------------------------------------------------------------------------------
/llmc/outlier_detector.h:
--------------------------------------------------------------------------------
1 | /*
2 | Simple OutlierDetector that we can use to monitor the loss and grad norm.
3 | Internally, it keeps track of a window of measurements and each time we
4 | add a measurement, it returns the z-score of the new value with respect to
5 | the window of measurements. This can be used to detect outliers in the data.
6 | 
7 | We use double so that the detector doesn't drift too much, because we
8 | update the mean and variance with += on each step for efficiency. We could
9 | reconsider this choice in the future, as the compute cost here is minimal.
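Typical usage (see dev/test/test_outlier_detector.c for a full test):

  OutlierDetector detector;
  init_detector(&detector);
  double z = update_detector(&detector, loss);

update_detector returns nan until OUTLIER_DETECTOR_WINDOW_SIZE values have been
seen; after that it returns the z-score of the new value w.r.t. the sliding window.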
10 | */ 11 | 12 | #include <stdio.h> 13 | #include <math.h> 14 | 15 | // use compile-time constant for window size to avoid dynamic memory allocations 16 | #define OUTLIER_DETECTOR_WINDOW_SIZE 128 17 | 18 | typedef struct { 19 | double buffer[OUTLIER_DETECTOR_WINDOW_SIZE]; 20 | int count; 21 | int index; 22 | double sum; 23 | double sum_sq; 24 | } OutlierDetector; 25 | 26 | void init_detector(OutlierDetector *detector) { 27 | for (int i = 0; i < OUTLIER_DETECTOR_WINDOW_SIZE; i++) { 28 | detector->buffer[i] = 0.0; 29 | } 30 | detector->count = 0; 31 | detector->index = 0; 32 | detector->sum = 0.0; 33 | detector->sum_sq = 0.0; 34 | } 35 | 36 | double update_detector(OutlierDetector *detector, double new_value) { 37 | 38 | if (detector->count < OUTLIER_DETECTOR_WINDOW_SIZE) { 39 | // here we are still building up a window of observations 40 | detector->buffer[detector->count] = new_value; 41 | detector->sum += new_value; 42 | detector->sum_sq += new_value * new_value; 43 | detector->count++; 44 | return nan(""); // not enough data yet 45 | 46 | } else { 47 | // we've filled the window, so now we can start detecting outliers 48 | 49 | // pop the oldest value from the window 50 | double old_value = detector->buffer[detector->index]; 51 | detector->sum -= old_value; 52 | detector->sum_sq -= old_value * old_value; 53 | // push the new value into the window 54 | detector->buffer[detector->index] = new_value; 55 | detector->sum += new_value; 56 | detector->sum_sq += new_value * new_value; 57 | // move the index to the next position 58 | detector->index = (detector->index + 1) % OUTLIER_DETECTOR_WINDOW_SIZE; 59 | // calculate the z-score of the new value 60 | double mean = detector->sum / OUTLIER_DETECTOR_WINDOW_SIZE; 61 | double variance = (detector->sum_sq / OUTLIER_DETECTOR_WINDOW_SIZE) - (mean * mean); 62 | double std_dev = sqrt(variance); 63 | if (std_dev == 0.0) { 64 | return 0.0; 65 | } 66 | double z = (new_value - mean) / std_dev; 67 | 68 | return z; 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /llmcpp/optim.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LLM_CPP__OPTIM_HPP_ 2 | #define LLM_CPP__OPTIM_HPP_ 3 | 4 | #include "nn.hpp" 5 | 6 | namespace optim { 7 | 8 | struct SGD { 9 | SGD(std::vector<nn::Parameter*> parameters, float lr) 10 | : parameters_(std::move(parameters)), lr_(lr) {} 11 | 12 | void ZeroGrad() { 13 | for (nn::Parameter* parameter : parameters_) { 14 | parameter->ZeroGrad(); 15 | } 16 | } 17 | 18 | void Step() { 19 | for (nn::Parameter* parameter : parameters_) { 20 | auto param = parameter->flat(); 21 | auto grad = parameter->flat_grad(); 22 | param.device(nn::g_device) -= lr_ * grad; 23 | } 24 | } 25 | 26 | private: 27 | std::vector<nn::Parameter*> parameters_; 28 | float lr_; 29 | }; 30 | 31 | struct AdamW { 32 | AdamW(std::vector<nn::Parameter*> parameters, float lr, float beta1 = 0.9f, 33 | float beta2 = 0.999f, float eps = 1e-8f, float weight_decay = 0.0f) 34 | : parameters_(std::move(parameters)), 35 | lr_(lr), 36 | beta1_(beta1), 37 | beta2_(beta2), 38 | eps_(eps), 39 | weight_decay_(weight_decay) { 40 | for (const auto& parameter : parameters_) { 41 | m_.emplace_back( 42 | std::make_unique<nn::Parameter>(nn::DT_FLOAT, parameter->size())); 43 | v_.emplace_back( 44 | std::make_unique<nn::Parameter>(nn::DT_FLOAT, parameter->size())); 45 | } 46 | } 47 | 48 | void ZeroGrad() { 49 | for (nn::Parameter* parameter : parameters_) { 50 | parameter->ZeroGrad(); 51 | } 52 | } 53 | 54 | void Step(int t) { 55 | for (size_t i = 0; i < parameters_.size(); ++i) { 56 | auto
parameter = parameters_[i]->flat(); 57 | auto grad = parameters_[i]->flat_grad(); 58 | auto m = m_[i]->flat(); 59 | auto v = v_[i]->flat(); 60 | 61 | // update the first moment (momentum) 62 | m.device(nn::g_device) = beta1_ * m + (1.0f - beta1_) * grad; 63 | // update the second moment (RMSprop) 64 | v.device(nn::g_device) = beta2_ * v + (1.0f - beta2_) * grad * grad; 65 | // bias-correct both moments 66 | auto m_hat = m / (1.0f - static_cast<float>(std::pow(beta1_, t))); 67 | auto v_hat = v / (1.0f - static_cast<float>(std::pow(beta2_, t))); 68 | 69 | // update 70 | parameter.device(nn::g_device) -= 71 | lr_ * (m_hat / (v_hat.sqrt() + eps_) + weight_decay_ * parameter); 72 | } 73 | } 74 | 75 | private: 76 | std::vector<nn::Parameter*> parameters_; 77 | std::vector<std::unique_ptr<nn::Parameter>> m_; 78 | std::vector<std::unique_ptr<nn::Parameter>> v_; 79 | float lr_; 80 | float beta1_; 81 | float beta2_; 82 | float eps_; 83 | float weight_decay_; 84 | }; 85 | 86 | } // namespace optim 87 | 88 | #endif // LLM_CPP__OPTIM_HPP_ 89 | -------------------------------------------------------------------------------- /llmcpp/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # llm.cpp 2 | add_library(nn nn.hpp) 3 | target_link_libraries(nn 4 | absl::strings absl::log absl::check pthread) 5 | 6 | add_library(gpt gpt.hpp) 7 | target_link_libraries(gpt nn) 8 | 9 | add_library(gpt2 gpt2.hpp) 10 | target_link_libraries(gpt2 gpt) 11 | 12 | add_library(optim optim.hpp) 13 | target_link_libraries(optim nn) 14 | 15 | add_executable(test_gpt2_cpu test_gpt2.cpp) 16 | target_link_libraries(test_gpt2_cpu gpt2 optim) 17 | target_compile_options(test_gpt2_cpu PRIVATE -Ofast -march=native) 18 | 19 | add_executable(train_gpt2_cpu train_gpt2.cpp) 20 | target_link_libraries(train_gpt2_cpu 21 | gpt2 optim 22 | profiler 23 | ) 24 | target_compile_options(train_gpt2_cpu PRIVATE -Ofast -march=native) 25 | 26 | add_executable(nn_test nn_test.cpp) 27 | target_link_libraries(nn_test nn GTest::gtest_main) 28 | 29 | add_executable(optim_test optim_test.cpp) 30 | target_link_libraries(optim_test nn GTest::gtest_main) 31 | 32 | add_executable(gpt_test gpt_test.cpp) 33 | target_link_libraries(gpt_test gpt GTest::gtest_main) 34 | 35 | add_executable(gpt_optim gpt_optim.cpp) 36 | target_link_libraries(gpt_optim gpt) 37 | 38 | add_executable(test_eigen_cpu test_eigen_cpu.cpp) 39 | target_link_libraries(test_eigen_cpu nn) 40 | target_compile_options(test_eigen_cpu PRIVATE -Ofast -march=native) 41 | 42 | set(CMAKE_CUDA_ARCHITECTURES 60 61 70 75) 43 | find_package(CUDA) 44 | if (CUDA_FOUND) 45 | add_library(nn_gpu nn.hpp) 46 | target_compile_definitions(nn_gpu PUBLIC EIGEN_USE_GPU) 47 | target_link_libraries(nn_gpu 48 | absl::strings absl::log absl::check 49 | ${CUDA_LIBRARIES} 50 | ) 51 | 52 | add_library(gpt_gpu gpt.hpp) 53 | target_link_libraries(gpt_gpu 54 | nn_gpu 55 | ) 56 | 57 | add_executable(test_eigen_gpu test_eigen_gpu.cu) 58 | target_compile_definitions(test_eigen_gpu PRIVATE EIGEN_USE_GPU) 59 | target_link_libraries(test_eigen_gpu 60 | nn_gpu 61 | ) 62 | # target_compile_options(test_eigen_gpu PRIVATE -Xcompiler=-Ofast,-march=native) 63 | 64 | # nn_test_gpu 65 | add_executable(nn_test_gpu nn_test.cu) 66 | target_link_libraries(nn_test_gpu 67 | nn_gpu 68 | GTest::gtest_main 69 | ) 70 | 71 | # gpt_test_gpu 72 | add_executable(gpt_test_gpu gpt_test.cu) 73 | target_link_libraries(gpt_test_gpu 74 | gpt_gpu 75 | GTest::gtest_main 76 | ) 77 | 78 | # gpt_optim_gpu 79 | add_executable(gpt_optim_gpu gpt_optim.cu) 80 | target_link_libraries(gpt_optim_gpu 81 | gpt_gpu 82 |
) 83 | 84 | # train_gpt2_gpu 85 | add_executable(train_gpt2_gpu train_gpt2.cu) 86 | target_link_libraries(train_gpt2_gpu 87 | gpt_gpu 88 | ) 89 | target_compile_options(train_gpt2_gpu PRIVATE -O3) 90 | endif () 91 | -------------------------------------------------------------------------------- /scripts/README.md: -------------------------------------------------------------------------------- 1 | # scripts 2 | 3 | These shell scripts hold the exact llm.c commands that reproduce the GPT-2 and GPT-3 runs. 4 | 5 | ### pytorch reference runs 6 | 7 | For all pyrun scripts, current restrictions: 8 | 9 | - does not write checkpoint, only logs of the train/val losses 10 | - does not evaluate hellaswag accuracy 11 | - cannot "resume training" (i.e. the `-y 1` flag) 12 | 13 | ### memory considerations 14 | 15 | In any of these scripts, if you are running out of memory on your GPU you'll want to meddle with two flags: the recompute setting `-r` and the microbatch size `-b`. Recompute throws away some activations during the forward pass and then recomputes them during the backward pass. This reduces the amount of memory we need to store and cache during the forward pass, but then increases the amount of computation we need to do during the backward pass. The microbatch size controls the number of token streams that are processed in a single forward/backward pass in parallel. Decreasing this number means we need to store less memory per microbatch, but then we have to increase the number of loops in the gradient accumulation to meet the same desired total batch size (see the short sketch at the bottom of this file for how these quantities relate). 16 | 17 | Long story short, try `-r 1` (recompute GeLU, trading off speed and memory) to conserve some memory. If that doesn't help, start dividing the micro batch size until things fit. For example if the default is `-b 64`, try `-b 32`, and then 16, 8, etc. until things fit. Once they do fit, experiment with dialing back the recompute flag `-r 0` to get some speed back. Alternatively to `-b`, if your application doesn't need a very long context length, you can dial back the max sequence length using `-t`. For example GPT-2 uses `-t 1024` and GPT-3 uses `-t 2048`. Your application may tolerate a lower context length. 18 | 19 | ### multi-gpu considerations 20 | 21 | It might be that you only have one GPU and not a whole box of them. Every script is fairly easy to change for just a single GPU. For llm.c, simply change line 1 to line 2 and leave everything else the same: 22 | 23 | ```bash 24 | mpirun -np 8 ./train_gpt2cu \ 25 | ./train_gpt2cu \ 26 | ``` 27 | 28 | For PyTorch, the same thing: 29 | 30 | ```bash 31 | torchrun --standalone --nproc_per_node=8 train_gpt2.py \ 32 | python train_gpt2.py \ 33 | ``` 34 | 35 | Both of these scripts automatically detect how many GPUs are available and adjust the gradient accumulation inner loop of the optimization accordingly, so the results come out the same, up to floating point error. Of course, you'll have to wait proportionally longer for the optimization to finish. 36 | 37 | To run on multiple nodes of GPUs, have a look at this pending [PR](https://github.com/karpathy/llm.c/pull/426), alternatively for llm.c try something like this: 38 | 39 | ```bash 40 | mpirun -np 16 --host node1:8,node2:8 ./train_gpt2cu ... 41 | ``` 42 | 43 | For PyTorch follow the torchrun docs.
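To make the memory and multi-GPU notes above concrete, here is a small sketch (in Python, purely for illustration; the function and variable names are not taken from the codebase) of how the number of gradient accumulation loops falls out of the total batch size `-d` (in tokens), the micro batch size `-b`, the sequence length `-t`, and the number of GPUs:

```python
# Hedged sketch: -d is the desired total batch size in tokens per optimizer step.
def grad_accum_steps(total_batch_size, micro_batch, seq_len, num_gpus):
    tokens_per_fwdbwd = micro_batch * seq_len * num_gpus  # tokens per forward/backward
    assert total_batch_size % tokens_per_fwdbwd == 0, "-d must divide evenly by -b * -t * num_gpus"
    return total_batch_size // tokens_per_fwdbwd

# e.g. the multi-node GPT-2 124M scripts use -d 2097152 -b 64 -t 1024 on 16 GPUs:
print(grad_accum_steps(2_097_152, 64, 1024, 16))  # -> 2 accumulation loops per step
```

So halving `-b` (or `-t`, or the number of GPUs) simply doubles the number of inner accumulation loops, while the effective batch size, and therefore the optimization result, stays the same.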
44 | -------------------------------------------------------------------------------- /llmc/gelu.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | (Approximate) GeLU non-linearity layer 3 | */ 4 | #include <assert.h> 5 | // llmc internal imports 6 | #include "cuda_common.h" 7 | #include "cuda_utils.cuh" 8 | 9 | // ---------------------------------------------------------------------------- 10 | // CUDA kernels 11 | 12 | #define GELU_SCALING_FACTOR sqrtf(2.0f / M_PI) 13 | __global__ void gelu_forward_kernel2(floatX* out, const floatX* inp) { 14 | int idx = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size; 15 | 16 | x128 packed_out; 17 | x128 packed_inp = load128cs(inp + idx); // load and do not keep in cache 18 | for(int k = 0; k < packed_inp.size; ++k) { 19 | float xi = (float)packed_inp[k]; 20 | float cube = 0.044715f * xi * xi * xi; 21 | packed_out[k] = (floatX)(0.5f * xi * (1.0f + tanhf(GELU_SCALING_FACTOR * (xi + cube)))); 22 | } 23 | // store instead of storecs (without cache streaming) in case it is useful for the 24 | // data to be in the cache for the next operation after this GeLU 25 | store128(out + idx, packed_out); 26 | } 27 | 28 | __global__ void gelu_backward_inplace_kernel(floatX* d_in_out, const floatX* inp) { 29 | int idx = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size; 30 | 31 | x128 packed_dinp; 32 | x128 packed_inp = load128cs(inp + idx); 33 | x128 packed_dout = load128(d_in_out + idx); 34 | for (int k = 0; k < packed_inp.size; ++k) { 35 | float x = (float)packed_inp[k]; 36 | float cube = 0.044715f * x * x * x; 37 | float tanh_arg = GELU_SCALING_FACTOR * (x + cube); 38 | float tanh_out = tanhf(tanh_arg); 39 | float coshf_out = coshf(tanh_arg); 40 | float sech_out = 1.0f / (coshf_out * coshf_out); 41 | float local_grad = 0.5f * (1.0f + tanh_out) + x * 0.5f * sech_out * GELU_SCALING_FACTOR * (1.0f + 3.0f * 0.044715f * x * x); 42 | packed_dinp[k] = (floatX)(local_grad * (float)packed_dout[k]); 43 | } 44 | store128(d_in_out + idx, packed_dinp); 45 | } 46 | 47 | // ---------------------------------------------------------------------------- 48 | // kernel launchers 49 | 50 | void gelu_forward(floatX* out, const floatX* inp, int N, cudaStream_t stream) { 51 | NVTX_RANGE_FN(); 52 | const int block_size = 512; 53 | assert(N % (block_size * x128::size) == 0); 54 | const int grid_size = CEIL_DIV(N, block_size * x128::size); 55 | gelu_forward_kernel2<<<grid_size, block_size, 0, stream>>>(out, inp); 56 | cudaCheck(cudaGetLastError()); 57 | } 58 | 59 | void gelu_backward_inplace(floatX* d_in_out, const floatX* inp, const int N, cudaStream_t stream) { 60 | NVTX_RANGE_FN(); 61 | const int block_size = 128; 62 | assert(N % (block_size * x128::size) == 0); 63 | const int grid_size = CEIL_DIV(N, block_size * x128::size); 64 | gelu_backward_inplace_kernel<<<grid_size, block_size, 0, stream>>>(d_in_out, inp); 65 | cudaCheck(cudaGetLastError()); 66 | } 67 | -------------------------------------------------------------------------------- /llmcpp/gpt_optim.cu: -------------------------------------------------------------------------------- 1 | #include "gpt.hpp" 2 | #include "optim.hpp" 3 | 4 | int main(int argc, char** argv) { 5 | /* 6 | torch.set_printoptions(precision=6) 7 | torch.manual_seed(42) 8 | config = GPTConfig(block_size=8, n_embd=16, n_head=4, n_layer=8, vocab_size=100) 9 | gpt2 = GPT(config=config) 10 | B, T, C = 4, 8, 16 11 | idx = torch.LongTensor([[35, 28, 51, 9, 81, 41, 30, 22], 12 | [99, 91, 96, 20, 99, 46, 85, 63], 13 | [ 0, 78, 75, 43, 94, 99, 78, 93], 14 | [14, 42, 54, 11, 63, 42, 99, 48]])
15 | targets = torch.LongTensor([[28, 51, 9, 81, 41, 30, 22, 99], 16 | [91, 96, 20, 99, 46, 85, 63, 0], 17 | [78, 75, 43, 94, 99, 78, 93, 14], 18 | [42, 54, 11, 63, 42, 99, 48, 0]]) 19 | optimizer = torch.optim.SGD(gpt2.parameters(), 20 | lr=1e-2) 21 | for i in range(10): 22 | logit, loss = gpt2(idx, targets) 23 | optimizer.zero_grad() 24 | loss.backward() 25 | optimizer.step() 26 | print('loss', loss) 27 | */ 28 | 29 | Eigen::setNbThreads(4); 30 | nn::ManualSeed(42); 31 | int block_size = 8, n_embd = 16, n_head = 4, n_layer = 8, vocab_size = 100; 32 | int B = 4, T = block_size, C = n_embd, nh = n_head, hs = n_embd / nh; 33 | gpt::GPT gpt(block_size, vocab_size, vocab_size, n_layer, n_head, n_embd); 34 | 35 | std::vector<int> idx = {35, 28, 51, 9, 81, 41, 30, 22, 99, 91, 96, 36 | 20, 99, 46, 85, 63, 0, 78, 75, 43, 94, 99, 37 | 78, 93, 14, 42, 54, 11, 63, 42, 99, 48}; 38 | auto idx_m = TTypes<int>::ConstMatrix(idx.data(), B, T); 39 | nn::Parameter d_logits(nn::DT_FLOAT, B * T * vocab_size); 40 | auto logits_3d = d_logits.tensor_3d(B, T, vocab_size); 41 | 42 | std::vector<int> target = {28, 51, 9, 81, 41, 30, 22, 99, 91, 96, 20, 43 | 99, 46, 85, 63, 0, 78, 75, 43, 94, 99, 78, 44 | 93, 14, 42, 54, 11, 63, 42, 99, 48, 0}; 45 | std::vector<float> label(B * T * vocab_size, 0.f); 46 | nn::OntHot(MakeConstFlat(target.data(), target.size()), 47 | MakeMatrix(label.data(), target.size(), vocab_size)); 48 | nn::Parameter d_label(nn::DT_FLOAT, label.size()); 49 | nn::g_device.memcpyHostToDevice(d_label.data(), label.data(), 50 | sizeof(float) * label.size()); 51 | nn::g_device.synchronize(); 52 | auto label_3d = d_label.const_tensor_3d(B, T, vocab_size); 53 | 54 | std::vector<nn::Parameter*> parameters; 55 | gpt.Parameters(&parameters); 56 | auto optimizer = optim::SGD(parameters, 1e-2f); 57 | float expected_loss[] = { 58 | 4.691669, 4.668904, 4.646729, 4.625142, 4.604129, 59 | 4.583667, 4.563725, 4.544271, 4.525268, 4.506680, 60 | }; 61 | for (int step = 0; step < 10; ++step) { 62 | float loss = 0.0f; 63 | gpt.ForwardGPU(idx_m, label_3d, logits_3d, &loss); 64 | optimizer.ZeroGrad(); 65 | gpt.BackwardGPU(idx_m); 66 | optimizer.Step(); 67 | fprintf(stdout, "Step %d, loss = %.6f\n", step, loss); 68 | CHECK(std::abs(loss - expected_loss[step]) < 1e-5); 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /profile_gpt2.cu: -------------------------------------------------------------------------------- 1 | /* 2 | This code is a convenience tool for profiling the CUDA kernels in the training 3 | loop of train_gpt2.cu. Compile: 4 | 5 | make profile_gpt2cu NO_MULTI_GPU=1 6 | 7 | And then e.g. use ncu from NVIDIA. The CLI docs for example: 8 | https://docs.nvidia.com/nsight-compute/NsightComputeCli/ 9 | 10 | TLDR run like: 11 | 12 | sudo ncu --set full --import-source yes -o profile -f ./profile_gpt2cu 13 | 14 | This: 15 | - `--set full` means we'll collect A LOT of metrics. take out for less 16 | - `--import-source yes` means we'll get the source code in the profile 17 | - `-o profile` writes the results into file profile.ncu-rep 18 | - `-f` forces overwrite of the profile.ncu-rep file 19 | - `./profile_gpt2cu` is the executable we want to profile 20 | 21 | This writes results into profile.ncu-rep output file. 22 | You can open this up in NVIDIA Nsight Compute UI. 23 | For example, I have NVIDIA Nsight Compute installed on my Mac, and I rsync 24 | the profile.ncu-rep from a cloud box to local to pretty view.
25 | */ 26 | 27 | #define TESTING 28 | #include "train_gpt2.cu" 29 | 30 | int main(int argc, char *argv[]) { 31 | char nccl_init_method[256] = "mpi"; // "tcp" or "fs" or "mpi" 32 | int num_processes = -1; // doesn't matter when using MPI 33 | int process_rank = -1; // doesn't matter when using MPI 34 | int gpus_per_node = -1; // doesn't matter when using MPI 35 | char server_ip[256] = ""; // doesn't matter when using MPI 36 | char fs_path[256] = ""; // doesn't matter when using MPI 37 | multi_gpu_config = multi_gpu_config_init(num_processes, process_rank, gpus_per_node, server_ip, fs_path, nccl_init_method); 38 | common_start(true, true); 39 | 40 | // build the GPT-2 model from a checkpoint 41 | GPT2 model; 42 | gpt2_init_common(&model); 43 | gpt2_build_from_checkpoint(&model, "gpt2_124M_bf16.bin"); 44 | 45 | int B = 24; // if program OOMs decrease this number, e.g. all the way down to 4 or etc 46 | int T = 1024; // if even that OOMs move on to this one. keep them nice and powers of 2 47 | printf("batch size: %d\n", B); 48 | printf("sequence length: %d\n", T); 49 | 50 | int* x = (int*)mallocCheck(B * T * sizeof(int)); 51 | int* y = (int*)mallocCheck(B * T * sizeof(int)); 52 | for(int i = 0; i < B * T; ++i) { 53 | x[i] = i % model.config.vocab_size; 54 | y[i] = i % model.config.vocab_size; 55 | } 56 | 57 | // override number of layers to 1 because all layers repeat the same kernels, only profile once 58 | model.config.num_layers = 1; 59 | set_zero_configs(&multi_gpu_config, 0, model.num_parameters); 60 | 61 | gpt2_allocate_state(&model, B, T); 62 | // do a training step 63 | gpt2_forward(&model, x, B, T); 64 | gpt2_backward_and_reduce(&model, x, y, 1, 0); 65 | float grad_norm = gpt2_calculate_grad_norm(&model, &multi_gpu_config); 66 | float grad_scale = (grad_norm > 1.0f) ? 1.0f / grad_norm : 1.0f; 67 | gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.0f, grad_scale, 1, &multi_gpu_config); 68 | cudaCheck(cudaDeviceSynchronize()); // finish all CUDA work to get correct precise timings 69 | 70 | // free 71 | gpt2_free(&model); 72 | common_free(model); 73 | return 0; 74 | } 75 | -------------------------------------------------------------------------------- /dev/loss_checker_ci.py: -------------------------------------------------------------------------------- 1 | # Description: A script to compare numbers in a file with fixed values and check for accuracy within a specified percent difference. 
2 | # Usage: python loss_checker_ci.py -f -s -e -a 3 | # Example: python dev/loss_checker_ci.py -f train_gpt2cu_fp32_precision.txt -s 20 -e 28 -a 10.0 4 | import sys 5 | import argparse 6 | 7 | def read_numbers_from_file(file_path, col_start, col_end): 8 | try: 9 | numbers = [] 10 | with open(file_path, 'r') as file: 11 | lines = file.readlines() 12 | start_index = None 13 | for i, line in enumerate(lines): 14 | if "step 1/10" in line: 15 | start_index = i 16 | break 17 | 18 | if start_index is None: 19 | print("Error: Could not find the string 'step 1/10' in the file.") 20 | return None 21 | 22 | # Read 10 rows starting from the identified start row 23 | for line in lines[start_index:start_index + 10]: 24 | # Extracting the specified columns 25 | number = float(line[col_start:col_end].strip()) 26 | numbers.append(number) 27 | return numbers 28 | except Exception as e: 29 | print(f"Error reading the file: {e}") 30 | return None 31 | 32 | def compare_numbers(read_values, fixed_values, percent_accuracy): 33 | for i in range(len(read_values)): 34 | read_value = read_values[i] 35 | fixed_value = fixed_values[i] 36 | percent_difference = ((read_value - fixed_value) / fixed_value) * 100 37 | print(f"Fixed Value: {fixed_value}, Read Value: {read_value}, Percent Difference: {percent_difference:.2f}%") 38 | if abs(percent_difference) > percent_accuracy: 39 | print(f"Error: Percent difference {percent_difference:.2f}% exceeds the allowed accuracy of {percent_accuracy}%") 40 | return 1 41 | print("Success: All values are within the allowed accuracy.") 42 | return 0 43 | 44 | def main(): 45 | parser = argparse.ArgumentParser(description='Compare numbers in a file with fixed values.') 46 | parser.add_argument('-f', '--file', required=True, help='Path to the input file') 47 | parser.add_argument('-s', '--col_start', type=int, required=True, help='Starting column index (0-based)') 48 | parser.add_argument('-e', '--col_end', type=int, required=True, help='Ending column index (0-based)') 49 | parser.add_argument('-a', '--percent_accuracy', type=float, required=True, help='Allowed percent accuracy for comparison') 50 | 51 | args = parser.parse_args() 52 | 53 | # Read numbers from file 54 | read_values = read_numbers_from_file(args.file, args.col_start, args.col_end) 55 | if read_values is None: 56 | return 1 57 | 58 | # Use values from test_gpt2.cu for fp32 precision 59 | fixed_values = [5.270009,4.060681,3.320085,2.717550,2.181066,1.653923,1.168050,0.736873,0.401021,0.187493]; 60 | 61 | # Compare the numbers and check accuracy 62 | result = compare_numbers(read_values, fixed_values, args.percent_accuracy) 63 | return result 64 | 65 | if __name__ == "__main__": 66 | sys.exit(main()) 67 | -------------------------------------------------------------------------------- /scripts/multi_node/run_gpt2_124M_fs.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=llmc-multinode # job name 3 | #SBATCH --output=/home/ubuntu/llm.c/scripts/multi_node/%x_%j_%t.log # output file 4 | #SBATCH --error=/home/ubuntu/llm.c/scripts/multi_node/%x_%j_%t.err # error file 5 | #SBATCH --partition=llmc # Specify the GPU partition 6 | #SBATCH --ntasks=16 # total number of processes to launch on all nodes 7 | #SBATCH --nodes=2 # total number of nodes 8 | #SBATCH --ntasks-per-node=8 # assuming each node has 8 gpus 9 | #SBATCH --gres=gpu:8 # request 8 gpus from each node 10 | 11 | # NOTE: change the above slurm arguments to match your system! 
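# (A hedged example of adapting the header above: it assumes 2 nodes with 8 GPUs each, so --ntasks = --nodes * --ntasks-per-node = 16.
# On, say, a 4-node cluster with 4 GPUs per node you would instead use --nodes=4 --ntasks-per-node=4 --ntasks=16 --gres=gpu:4,
# i.e. always keep one task per GPU.)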
12 | # Run with `sbatch ` 13 | 14 | make train_gpt2cu USE_CUDNN=1 NO_USE_MPI=1 15 | 16 | # NOTE: change the following to match your system 17 | binary_path="/home/ubuntu/llm.c/train_gpt2cu" 18 | out_dir="/ephemeral/data/fineweb/log_gpt2_124M_multi" 19 | train_data_path='/ephemeral/data/fineweb/bin_10B/fineweb_train_*.bin' 20 | val_data_path='/ephemeral/data/fineweb/bin_10B/fineweb_val_*.bin' 21 | sync_fs_path=$out_dir # needs to be a shared filesystem path that all nodes can access 22 | 23 | # In case the file system is shared this is a no-op. 24 | # Otherwise, we need to copy the binary to all nodes. 25 | current_user=$USER 26 | hosts=$(scontrol show hostnames $SLURM_JOB_NODELIST) # get the hostnames of the allocated nodes 27 | current_host=$(hostname) 28 | for host in $hosts; do 29 | if [ $host == $current_host ]; then 30 | continue 31 | fi 32 | echo "copying $binary_path to $current_user@$host" 33 | scp -r $binary_path $current_user@$host:$binary_path 34 | done 35 | 36 | # Use this for NCCL debugging if you run into issues 37 | # export NCCL_DEBUG=INFO 38 | # export NCCL_DEBUG_SUBSYS=ALL 39 | export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 40 | 41 | # Optimization flags 42 | export NCCL_NET_GDR_LEVEL=2 # use GPUDirect RDMA - allows for direct memory access between GPUs across different nodes by bypassing the CPU 43 | export NCCL_IB_DISABLE=0 # use InfiniBand if available 44 | 45 | # NOTE: change the following environment variables to match your system - or comment them out if you don't need them 46 | export NCCL_SOCKET_IFNAME=ens17 47 | export OMPI_MCA_btl_tcp_if_include=ens17 48 | export NCCL_P2P_LEVEL=PXB 49 | 50 | if [ -z "$SLURM_JOB_ID" ]; then 51 | echo "Make sure you're running in a SLURM environment. Did you forget to run with sbatch? Aborting." 52 | exit 1 53 | else 54 | DATESTRING=`date "+%Y-%m-%dT%H:%M:%S"` 55 | echo "Running in a SLURM environment (job ID: $SLURM_JOB_ID, user: $current_user)" 56 | echo "Running on hosts: $(echo $(scontrol show hostname))" 57 | echo "$DATESTRING" 58 | fi 59 | 60 | srun -l -u bash -c " 61 | $binary_path \ 62 | -i '$train_data_path' \ 63 | -j '$val_data_path' \ 64 | -o $out_dir \ 65 | -v 250 -s 20000 -g 144 \ 66 | -h 1 \ 67 | -b 64 -t 1024 \ 68 | -d 2097152 \ 69 | -r 0 \ 70 | -z 1 \ 71 | -c 0.1 \ 72 | -l 0.0006 \ 73 | -q 0.0 \ 74 | -u 700 \ 75 | -n 5000 \ 76 | -y 1 \ 77 | -e d12 \ 78 | -pn \$SLURM_NTASKS \ 79 | -pr \$SLURM_PROCID \ 80 | -pg \$SLURM_NTASKS_PER_NODE \ 81 | -pf $sync_fs_path \ 82 | -pi "fs" \ 83 | " 84 | 85 | echo "$DATESTRING" -------------------------------------------------------------------------------- /scripts/multi_node/run_gpt2_124M_tcp.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=llmc-multinode # job name 3 | #SBATCH --output=/home/ubuntu/llm.c/scripts/multi_node/%x_%j_%t.log # output file 4 | #SBATCH --error=/home/ubuntu/llm.c/scripts/multi_node/%x_%j_%t.err # error file 5 | #SBATCH --partition=llmc # Specify the GPU partition 6 | #SBATCH --ntasks=16 # total number of processes to launch on all nodes 7 | #SBATCH --nodes=2 # total number of nodes 8 | #SBATCH --ntasks-per-node=8 # assuming each node has 8 gpus 9 | #SBATCH --gres=gpu:8 # request 8 gpus from each node 10 | 11 | # NOTE: change the above slurm arguments to match your system! 
12 | # Run with `sbatch ` 13 | 14 | make train_gpt2cu USE_CUDNN=1 NO_USE_MPI=1 15 | 16 | # NOTE: change the following to match your system 17 | binary_path="/home/ubuntu/llm.c/train_gpt2cu" 18 | out_dir="/ephemeral/data/fineweb/log_gpt2_124M_multi" 19 | train_data_path='/ephemeral/data/fineweb/bin_10B/fineweb_train_*.bin' 20 | val_data_path='/ephemeral/data/fineweb/bin_10B/fineweb_val_*.bin' 21 | # NOTE: change the server_ip to the IP address of the machine that is running process zero 22 | server_ip="10.0.1.220" 23 | 24 | # In case the file system is shared this is a no-op. 25 | # Otherwise, we need to copy the binary to all nodes. 26 | current_user=$USER 27 | hosts=$(scontrol show hostnames $SLURM_JOB_NODELIST) # get the hostnames of the allocated nodes 28 | current_host=$(hostname) 29 | for host in $hosts; do 30 | if [ $host == $current_host ]; then 31 | continue 32 | fi 33 | echo "copying $binary_path to $current_user@$host" 34 | scp -r $binary_path $current_user@$host:$binary_path 35 | done 36 | 37 | # Use this for NCCL debugging if you run into issues 38 | # export NCCL_DEBUG=INFO 39 | # export NCCL_DEBUG_SUBSYS=ALL 40 | export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 41 | 42 | # Optimization flags 43 | export NCCL_NET_GDR_LEVEL=2 # use GPUDirect RDMA - allows for direct memory access between GPUs across different nodes by bypassing the CPU 44 | export NCCL_IB_DISABLE=0 # use InfiniBand if available 45 | 46 | # NOTE: change the following environment variables to match your system - or comment them out if you don't need them 47 | export NCCL_SOCKET_IFNAME=ens17 48 | export OMPI_MCA_btl_tcp_if_include=ens17 49 | export NCCL_P2P_LEVEL=PXB 50 | 51 | if [ -z "$SLURM_JOB_ID" ]; then 52 | echo "Make sure you're running in a SLURM environment. Did you forget to run with sbatch? Aborting." 53 | exit 1 54 | else 55 | DATESTRING=`date "+%Y-%m-%dT%H:%M:%S"` 56 | echo "Running in a SLURM environment (job ID: $SLURM_JOB_ID, user: $current_user)" 57 | echo "Running on hosts: $(echo $(scontrol show hostname))" 58 | echo "$DATESTRING" 59 | fi 60 | 61 | srun -l -u bash -c " 62 | $binary_path \ 63 | -i '$train_data_path' \ 64 | -j '$val_data_path' \ 65 | -o $out_dir \ 66 | -v 250 -s 20000 -g 144 \ 67 | -h 1 \ 68 | -b 64 -t 1024 \ 69 | -d 2097152 \ 70 | -r 0 \ 71 | -z 1 \ 72 | -c 0.1 \ 73 | -l 0.0006 \ 74 | -q 0.0 \ 75 | -u 700 \ 76 | -n 5000 \ 77 | -y 1 \ 78 | -e d12 \ 79 | -pn \$SLURM_NTASKS \ 80 | -pr \$SLURM_PROCID \ 81 | -pg \$SLURM_NTASKS_PER_NODE \ 82 | -ps $server_ip \ 83 | -pi "tcp" \ 84 | " 85 | 86 | echo "$DATESTRING" 87 | -------------------------------------------------------------------------------- /dev/cuda/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for building dev/cuda kernels 2 | # Collects all the make commands in one file but each file also 3 | # has the compile and run commands in the header comments section. 4 | 5 | # Find nvcc (NVIDIA CUDA compiler) 6 | NVCC := $(shell which nvcc 2>/dev/null) 7 | ifeq ($(NVCC),) 8 | $(error nvcc not found.) 
9 | endif 10 | 11 | ifneq ($(CI),true) # if not in CI, then use the GPU query 12 | ifndef GPU_COMPUTE_CAPABILITY # set to defaults if: make GPU_COMPUTE_CAPABILITY= 13 | GPU_COMPUTE_CAPABILITY = $(shell __nvcc_device_query) # assume if NVCC is present, then this likely is too 14 | GPU_COMPUTE_CAPABILITY := $(strip $(GPU_COMPUTE_CAPABILITY)) 15 | endif 16 | endif 17 | 18 | # Compiler flags 19 | ifeq ($(GPU_COMPUTE_CAPABILITY),) # set to defaults if: make GPU_COMPUTE_CAPABILITY= 20 | CFLAGS = -O3 --use_fast_math 21 | else 22 | CFLAGS = -O3 --use_fast_math --generate-code arch=compute_$(GPU_COMPUTE_CAPABILITY),code=[compute_$(GPU_COMPUTE_CAPABILITY),sm_$(GPU_COMPUTE_CAPABILITY)] 23 | endif 24 | 25 | NVCCFLAGS = -lcublas -lcublasLt -std=c++17 26 | MPI_PATHS = -I/usr/lib/x86_64-linux-gnu/openmpi/include -L/usr/lib/x86_64-linux-gnu/openmpi/lib/ 27 | 28 | # Default rule for our CUDA files 29 | %: %.cu 30 | $(NVCC) $(CFLAGS) $(NVCCFLAGS) $< -o $@ 31 | 32 | # Build all targets 33 | TARGETS = adamw attention_backward attention_forward classifier_fused crossentropy_forward crossentropy_softmax_backward encoder_backward encoder_forward gelu_backward gelu_forward layernorm_backward layernorm_forward matmul_backward matmul_backward_bias matmul_forward nccl_all_reduce residual_forward softmax_forward trimat_forward fused_residual_forward global_norm permute 34 | 35 | all: $(TARGETS) 36 | all_ptx: $(TARGETS:%=%.ptx) 37 | all_sass: $(TARGETS:%=%.sass) 38 | 39 | # Individual targets: forward pass 40 | attention_forward: attention_forward.cu 41 | classifier_fused: classifier_fused.cu 42 | crossentropy_forward: crossentropy_forward.cu 43 | encoder_forward: encoder_forward.cu 44 | gelu_forward: gelu_forward.cu 45 | layernorm_forward: layernorm_forward.cu 46 | fused_residual_forward: fused_residual_forward.cu 47 | residual_forward: residual_forward.cu 48 | softmax_forward: softmax_forward.cu 49 | trimat_forward: trimat_forward.cu 50 | # matmul fwd/bwd also uses OpenMP (optionally) and cuBLASLt libs 51 | matmul_forward: matmul_forward.cu 52 | $(NVCC) $(CFLAGS) $(NVCCFLAGS) -Xcompiler -fopenmp matmul_forward.cu -o matmul_forward 53 | 54 | # Individual targets: backward pass 55 | attention_backward: attention_backward.cu 56 | crossentropy_softmax_backward: crossentropy_softmax_backward.cu 57 | encoder_backward: encoder_backward.cu 58 | gelu_backward: gelu_backward.cu 59 | layernorm_backward: layernorm_backward.cu 60 | matmul_backward_bias: matmul_backward_bias.cu 61 | matmul_backward: matmul_backward.cu 62 | $(NVCC) $(CFLAGS) $(NVCCFLAGS) -Xcompiler -fopenmp matmul_backward.cu -o matmul_backward 63 | 64 | # Update kernels 65 | adamw: adamw.cu 66 | global_norm: global_norm.cu 67 | 68 | permute: permute.cu 69 | 70 | # NCCL communication kernels 71 | nccl_all_reduce: nccl_all_reduce.cu 72 | $(NVCC) -lmpi -lnccl $(NVCCFLAGS) $(MPI_PATHS) nccl_all_reduce.cu -o nccl_all_reduce 73 | 74 | # Generate PTX using cuobjdump 75 | %.ptx: % 76 | cuobjdump --dump-ptx $< > $@ 77 | 78 | # Generate SASS using cuobjdump 79 | %.sass: % 80 | cuobjdump --dump-sass $< > $@ 81 | 82 | # Run all targets 83 | run_all: all 84 | @for target in $(TARGETS); do \ 85 | echo "\n========================================"; \ 86 | echo "Running $$target ..."; \ 87 | echo "========================================\n"; \ 88 | ./$$target; \ 89 | done 90 | 91 | # Clean up 92 | clean: 93 | rm -f $(TARGETS) *.ptx *.sass 94 | -------------------------------------------------------------------------------- /llmcpp/tensor_types.hpp: 
-------------------------------------------------------------------------------- 1 | #ifndef LLM_CPP_LLMCPP_TENSOR_TYPES_HPP_ 2 | #define LLM_CPP_LLMCPP_TENSOR_TYPES_HPP_ 3 | 4 | #include "Eigen/Dense" 5 | #include "unsupported/Eigen/CXX11/Tensor" 6 | 7 | // Helper to define Tensor types given that the scalar is of type T. 8 | template <typename T, int NDIMS = 1, typename IndexType = Eigen::DenseIndex> 9 | struct TTypes { 10 | // Rank-<NDIMS> tensor of scalar type T. 11 | typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, IndexType>, 12 | Eigen::Aligned> 13 | Tensor; 14 | typedef Eigen::TensorMap< 15 | Eigen::Tensor<const T, NDIMS, Eigen::RowMajor, IndexType>, Eigen::Aligned> 16 | ConstTensor; 17 | 18 | // Unaligned Rank-<NDIMS> tensor of scalar type T. 19 | typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, IndexType> > 20 | UnalignedTensor; 21 | typedef Eigen::TensorMap< 22 | Eigen::Tensor<const T, NDIMS, Eigen::RowMajor, IndexType> > 23 | UnalignedConstTensor; 24 | 25 | typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, int>, 26 | Eigen::Aligned> 27 | Tensor32Bit; 28 | 29 | // Scalar tensor (implemented as a rank-0 tensor) of scalar type T. 30 | typedef Eigen::TensorMap< 31 | Eigen::TensorFixedSize<T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>, 32 | Eigen::Aligned> 33 | Scalar; 34 | typedef Eigen::TensorMap<Eigen::TensorFixedSize<const T, Eigen::Sizes<>, 35 | Eigen::RowMajor, IndexType>, 36 | Eigen::Aligned> 37 | ConstScalar; 38 | 39 | // Unaligned Scalar tensor of scalar type T. 40 | typedef Eigen::TensorMap< 41 | Eigen::TensorFixedSize<T, Eigen::Sizes<>, Eigen::RowMajor, IndexType> > 42 | UnalignedScalar; 43 | typedef Eigen::TensorMap<Eigen::TensorFixedSize<const T, Eigen::Sizes<>, 44 | Eigen::RowMajor, IndexType> > 45 | UnalignedConstScalar; 46 | 47 | // Rank-1 tensor (vector) of scalar type T. 48 | typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>, 49 | Eigen::Aligned> 50 | Flat; 51 | typedef Eigen::TensorMap< 52 | Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned> 53 | ConstFlat; 54 | typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>, 55 | Eigen::Aligned> 56 | Vec; 57 | typedef Eigen::TensorMap< 58 | Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned> 59 | ConstVec; 60 | 61 | // Unaligned Rank-1 tensor (vector) of scalar type T. 62 | typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType> > 63 | UnalignedFlat; 64 | typedef Eigen::TensorMap< 65 | Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType> > 66 | UnalignedConstFlat; 67 | typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType> > 68 | UnalignedVec; 69 | typedef Eigen::TensorMap< 70 | Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType> > 71 | UnalignedConstVec; 72 | 73 | // Rank-2 tensor (matrix) of scalar type T. 74 | typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, IndexType>, 75 | Eigen::Aligned> 76 | Matrix; 77 | typedef Eigen::TensorMap< 78 | Eigen::Tensor<const T, 2, Eigen::RowMajor, IndexType>, Eigen::Aligned> 79 | ConstMatrix; 80 | 81 | // Unaligned Rank-2 tensor (matrix) of scalar type T.
82 | typedef Eigen::TensorMap > 83 | UnalignedMatrix; 84 | typedef Eigen::TensorMap< 85 | Eigen::Tensor > 86 | UnalignedConstMatrix; 87 | }; 88 | 89 | #endif // LLM_CPP_LLMCPP_TENSOR_TYPES_HPP_ 90 | -------------------------------------------------------------------------------- /.github/workflows/ci_tests.yml: -------------------------------------------------------------------------------- 1 | name: Unit, Static and other Tests 2 | 3 | on: 4 | create: 5 | workflow_dispatch: 6 | push: 7 | branches: 8 | - master 9 | pull_request: 10 | branches: 11 | - master 12 | 13 | jobs: 14 | dataloader_test: 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - name: Checkout code 19 | uses: actions/checkout@v4 20 | 21 | - name: test the dataloader without / with sanitize address 22 | run: | 23 | cd dev/test 24 | make PRECISION=BF16 test_dataloader 25 | ./test_dataloader 26 | make clean 27 | make PRECISION=BF16 TEST_CFLAGS="-fsanitize=address -fno-omit-frame-pointer" test_dataloader 28 | ./test_dataloader 29 | 30 | ptx_and_sass_files: 31 | runs-on: ubuntu-latest 32 | container: 33 | image: nvidia/cuda:12.4.1-devel-ubuntu22.04 34 | 35 | steps: 36 | - name: Checkout code 37 | uses: actions/checkout@v4 38 | 39 | - name: Install OpenMP and OpenMPI 40 | run: apt-get update && apt-get install -y libomp-dev libopenmpi-dev 41 | 42 | - name: Generate ptx/sass files and upload them to persistent storage 43 | run: | 44 | mkdir -p dev/cuda/ptx_sass_logs 45 | make train_gpt2cu 46 | cuobjdump --dump-ptx train_gpt2cu > dev/cuda/train_gpt2cu.ptx 47 | cuobjdump --dump-sass train_gpt2cu > dev/cuda/train_gpt2cu.sass 48 | cd dev/cuda 49 | make -j all_ptx 50 | make -j all_sass 51 | cp *.ptx ptx_sass_logs/ 52 | cp *.sass ptx_sass_logs/ 53 | ls ptx_sass_logs/ 54 | 55 | - name: Generate ptx/sass files for A100 and upload them to persistent storage 56 | run: | 57 | mkdir -p dev/cuda/ptx_sass_logs_A100 58 | make train_gpt2cu GPU_COMPUTE_CAPABILITY=80 59 | cuobjdump --dump-ptx train_gpt2cu > dev/cuda/train_gpt2cu.ptx 60 | cuobjdump --dump-sass train_gpt2cu > dev/cuda/train_gpt2cu.sass 61 | cd dev/cuda 62 | make -j GPU_COMPUTE_CAPABILITY=80 all_ptx 63 | make -j GPU_COMPUTE_CAPABILITY=80 all_sass 64 | cp *.ptx ptx_sass_logs_A100/ 65 | cp *.sass ptx_sass_logs_A100/ 66 | ls ptx_sass_logs_A100/ 67 | 68 | - name: Generate ptx/sass files for H100 and upload them to persistent storage 69 | run: | 70 | mkdir -p dev/cuda/ptx_sass_logs_H100 71 | make train_gpt2cu GPU_COMPUTE_CAPABILITY=90 72 | cuobjdump --dump-ptx train_gpt2cu > dev/cuda/train_gpt2cu.ptx 73 | cuobjdump --dump-sass train_gpt2cu > dev/cuda/train_gpt2cu.sass 74 | cd dev/cuda 75 | make -j GPU_COMPUTE_CAPABILITY=90 all_ptx 76 | make -j GPU_COMPUTE_CAPABILITY=90 all_sass 77 | cp *.ptx ptx_sass_logs_H100/ 78 | cp *.sass ptx_sass_logs_H100/ 79 | ls ptx_sass_logs_H100/ 80 | 81 | - name: Upload ptx/sass files 82 | uses: actions/upload-artifact@v4 83 | with: 84 | name: ptx_sass_files 85 | path: dev/cuda/ptx_sass_logs/ 86 | retention-days: 30 # days to retain 87 | 88 | - name: Upload ptx/sass files for A100 89 | uses: actions/upload-artifact@v4 90 | with: 91 | name: ptx_sass_files_A100 92 | path: dev/cuda/ptx_sass_logs_A100/ 93 | retention-days: 30 # days to retain 94 | 95 | - name: Upload ptx/sass files for H100 96 | uses: actions/upload-artifact@v4 97 | with: 98 | name: ptx_sass_files_H100 99 | path: dev/cuda/ptx_sass_logs_H100/ 100 | retention-days: 30 # days to retain -------------------------------------------------------------------------------- /llmc/global_norm.cuh: 
-------------------------------------------------------------------------------- 1 | /* 2 | Global norm, used in gradient clipping 3 | */ 4 | #include 5 | #include 6 | #include 7 | // llmc internal imports 8 | #include "cuda_common.h" 9 | #include "cuda_utils.cuh" 10 | 11 | // ---------------------------------------------------------------------------- 12 | // CUDA kernels 13 | 14 | template 15 | __device__ float global_norm_squared_for_range(const T* data, size_t count) { 16 | size_t index = blockIdx.x * blockDim.x + threadIdx.x; 17 | size_t grid_width = blockDim.x * gridDim.x; 18 | float accumulator = 0.f; 19 | for(size_t i = index; i < count; i += grid_width) { 20 | accumulator += (float)data[i] * (float)data[i]; 21 | } 22 | // block-level reduce 23 | return blockReduce(accumulator); 24 | } 25 | 26 | template 27 | __global__ void global_norm_squared_kernel(float* out, const T* data, size_t count, ptrdiff_t stride) { 28 | float block_sum = global_norm_squared_for_range(data + blockIdx.y * stride, count); 29 | // each block accumulates its partial sum to out[out_index] 30 | // we want to avoid using atomic add here so we combine this kernel with another kernel call 31 | // that sums up the partial block sums 32 | if(threadIdx.x == 0) { 33 | size_t out_index = blockIdx.y * gridDim.x + blockIdx.x; 34 | out[out_index] = out[out_index] + block_sum; 35 | } 36 | } 37 | 38 | __global__ void global_norm_aggregate_kernel(float* out, size_t grid_size) { 39 | size_t index = threadIdx.x; 40 | // grab block sums from the previous kernel, use 0. as the neutral sum element 41 | float block_sum = (index < grid_size) ? out[index] : 0.f; 42 | float sum = blockReduce(block_sum); 43 | if(threadIdx.x == 0) { 44 | out[0] = sum; // out[0] ends up with the final norm squared 45 | } 46 | } 47 | 48 | // ---------------------------------------------------------------------------- 49 | // kernel launcher 50 | 51 | // Helper function determines the maximum number of block sums 52 | int get_max_num_block_sums(int* num_slices_all, int numel) { 53 | // NOTE: this needs to be kept in sync with `global_norm_squared` below. 54 | const int block_size = 512; 55 | const int grid_size = deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount / block_size; 56 | assert(grid_size > 0); 57 | int max_num_block_sums = 0; 58 | for (int i = 0; i < numel; i++) { 59 | int num_slices = num_slices_all[i]; 60 | const int gx = CEIL_DIV(grid_size, num_slices); 61 | const int gy = num_slices; 62 | max_num_block_sums = max(max_num_block_sums, gx * gy); 63 | } 64 | 65 | return max_num_block_sums; 66 | } 67 | 68 | template 69 | void global_norm_squared(float* out, const T* values, size_t count, ptrdiff_t stride, int num_slices, int max_num_block_sums, bool reset, cudaStream_t stream) { 70 | const int block_size = 512; 71 | // launch just enough blocks to fill the grid. deliberately no DIV_CEIL. 72 | // having one block less than possible is a tiny performance hit, having 73 | // one block too many is catastrophic, since it only can start once all the other 74 | // blocks finish. anyway, I think cuda_threads_per_SM should be a multiple of 512 75 | // on all gpus, so the division really is going to be exact. 
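// As a concrete (hedged) illustration: on an A100, maxThreadsPerMultiProcessor is 2048 and
// multiProcessorCount is 108, so with block_size = 512 this comes out to 2048 * 108 / 512 = 432
// resident blocks, which the launch below then splits across num_slices in the grid's y-dimension.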
76 | const int grid_size = deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount / block_size; 77 | assert(grid_size > 0); // gives a better error than letting the call below fail 78 | 79 | const int gx = CEIL_DIV(grid_size, num_slices); 80 | const int gy = num_slices; 81 | 82 | assert(gx * gy < 1024); // we want to later accumulate the block sums in a single block 83 | 84 | if (reset) { 85 | cudaCheck(cudaMemsetAsync(out, 0, max_num_block_sums * sizeof(float), stream)); 86 | } 87 | global_norm_squared_kernel<<>>(out, values, count, stride); 88 | cudaCheck(cudaGetLastError()); 89 | } 90 | -------------------------------------------------------------------------------- /llmcpp/optim_test.cpp: -------------------------------------------------------------------------------- 1 | #include "optim.hpp" 2 | #include "gtest/gtest.h" 3 | 4 | TEST(Optimizer, SGD) { 5 | /* 6 | torch.set_printoptions(precision=6) 7 | torch.manual_seed(42) 8 | m = nn.Linear(3, 2) 9 | optimizer = torch.optim.SGD(m.parameters(), lr=0.01) 10 | x = torch.randn(4, 3) 11 | for _ in range(10): 12 | y = m(x) 13 | loss = torch.sum(y) 14 | optimizer.zero_grad() 15 | loss.backward() 16 | optimizer.step() 17 | */ 18 | 19 | nn::ManualSeed(42); 20 | int B = 4, in_features = 3, out_features = 2; 21 | nn::Linear m(in_features, out_features, true); 22 | 23 | // forward 24 | std::vector x(B * in_features), y(B * out_features); 25 | nn::NormalFill(absl::MakeSpan(x)); 26 | auto xm = MakeConstMatrix(x.data(), B, in_features); 27 | auto ym = MakeMatrix(y.data(), B, out_features); 28 | 29 | // optimizer 30 | std::vector parameters; 31 | m.Parameters(¶meters); 32 | optim::SGD sgd(parameters, 0.01); 33 | 34 | // backward 35 | std::vector y_grad(y.size(), 1.0f); 36 | std::vector x_grad(x.size(), 0.f); 37 | auto y_gradm = MakeConstMatrix(y_grad.data(), B, out_features); 38 | auto x_gradm = MakeMatrix(x_grad.data(), B, in_features); 39 | 40 | int step = 10; 41 | for (int i = 0; i < step; ++i) { 42 | m.Forward(xm, ym); 43 | sgd.ZeroGrad(); 44 | m.Backward(xm, y_gradm, x_gradm); 45 | sgd.Step(); 46 | } 47 | 48 | auto weight = m.weight_->span(); 49 | auto bias = m.bias_->span(); 50 | std::vector expected_weight = {0.732981, 0.469633, -0.589639, 51 | 0.821935, -0.136072, -0.337878}; 52 | std::vector expected_bias = {-0.681086, -0.060932}; 53 | for (size_t i = 0; i < expected_weight.size(); ++i) { 54 | EXPECT_NEAR(expected_weight[i], weight[i], 1e-5); 55 | } 56 | for (size_t i = 0; i < expected_bias.size(); ++i) { 57 | EXPECT_NEAR(expected_bias[i], bias[i], 1e-5); 58 | } 59 | } 60 | 61 | TEST(Optimizer, AdamW) { 62 | /* 63 | torch.set_printoptions(precision=6) 64 | torch.manual_seed(42) 65 | m = nn.Linear(3, 2) 66 | optimizer = torch.optim.AdamW(m.parameters(), lr=0.01, betas=(0.9, 0.999), 67 | eps=1e-8, weight_decay=0.001) 68 | x = torch.randn(4, 3) 69 | for _ in range(10): 70 | y = m(x) 71 | loss = torch.sum(y) 72 | optimizer.zero_grad() 73 | loss.backward() 74 | optimizer.step() 75 | */ 76 | 77 | nn::ManualSeed(42); 78 | int B = 4, in_features = 3, out_features = 2; 79 | nn::Linear m(in_features, out_features, true); 80 | 81 | // forward 82 | std::vector x(B * in_features), y(B * out_features); 83 | nn::NormalFill(absl::MakeSpan(x)); 84 | auto xm = MakeConstMatrix(x.data(), B, in_features); 85 | auto ym = MakeMatrix(y.data(), B, out_features); 86 | 87 | // optimizer 88 | std::vector parameters; 89 | m.Parameters(¶meters); 90 | optim::AdamW adam_w(parameters, 0.01f, 0.9f, 0.999f, 1e-8f, 0.001f); 91 | 92 | // backward 93 | 
std::vector y_grad(y.size(), 1.0f); 94 | std::vector x_grad(x.size(), 0.f); 95 | auto y_gradm = MakeConstMatrix(y_grad.data(), B, out_features); 96 | auto x_gradm = MakeMatrix(x_grad.data(), B, in_features); 97 | 98 | int step = 10; 99 | for (int i = 0; i < step; ++i) { 100 | m.Forward(xm, ym); 101 | adam_w.ZeroGrad(); 102 | m.Backward(xm, y_gradm, x_gradm); 103 | adam_w.Step(i + 1); 104 | } 105 | 106 | auto weight = m.weight_->span(); 107 | auto bias = m.bias_->span(); 108 | std::vector expected_weight = {0.541358, 0.379162, -0.235239, 109 | 0.630303, -0.226482, 0.016497}; 110 | std::vector expected_bias = {-0.381053, 0.239038}; 111 | for (size_t i = 0; i < expected_weight.size(); ++i) { 112 | EXPECT_NEAR(expected_weight[i], weight[i], 1e-5); 113 | } 114 | for (size_t i = 0; i < expected_bias.size(); ++i) { 115 | EXPECT_NEAR(expected_bias[i], bias[i], 1e-5); 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /llmc/tokenizer.h: -------------------------------------------------------------------------------- 1 | /* 2 | Defines the GPT-2 Tokenizer. 3 | Only supports decoding, i.e.: tokens (integers) -> strings 4 | This is all we need for unconditional generation. 5 | If we wanted to later prompt the model, we'd have to add decoding. 6 | Which could be tricky in C because of the regex involved, to look into later. 7 | */ 8 | 9 | #include 10 | #include 11 | #include 12 | // our own utilities 13 | // defines fopenCheck, freadCheck, fcloseCheck, fseekCheck, mallocCheck 14 | #include "utils.h" 15 | 16 | // ---------------------------------------------------------------------------- 17 | 18 | typedef struct { 19 | uint32_t vocab_size; 20 | char **token_table; 21 | int init_ok; 22 | int eot_token; // <|endoftext|> token id 23 | } Tokenizer; 24 | 25 | void safe_printf(const char *piece) { 26 | // the tokens are raw bytes, and we we only want to print the printable ones 27 | // many bytes can be various control codes, backspace, etc. 
28 | if (piece == NULL) { return; } 29 | if (piece[0] == '\0') { return; } 30 | // handle individual byte tokens 31 | // every token is asserted to be at least one byte so doing piece[1] is ok 32 | if (piece[1] == '\0') { 33 | unsigned char byte_val = piece[0]; 34 | if (!(isprint(byte_val) || isspace(byte_val))) { 35 | return; // weird byte, don't print it 36 | } 37 | } 38 | printf("%s", piece); 39 | } 40 | 41 | void tokenizer_init(Tokenizer *tokenizer, const char *filename) { 42 | FILE *file = fopen(filename, "rb"); 43 | if (file == NULL) { 44 | // try to be more helpful as we just added this feature, erase later 45 | printf("---\n"); 46 | printf("WARNING: Failed to open the tokenizer file %s\n", filename); 47 | printf("The Tokenizer is a new feature added April 14 2024.\n"); 48 | printf("Re-run `python train_gpt2.py` to write it\n"); 49 | printf("---\n"); 50 | tokenizer->init_ok = 0; 51 | return; 52 | } 53 | // read in the header 54 | uint32_t header[256]; 55 | freadCheck(header, sizeof(uint32_t), 256, file); 56 | assert(header[0] == 20240328); 57 | int version = header[1]; 58 | tokenizer->vocab_size = header[2]; 59 | if (version == 1) { 60 | // version 1 didn't include the EOT token id 61 | // so we assume it is 50256, the EOT in GPT-2 62 | assert(tokenizer->vocab_size == 50257); // let's be defensive here 63 | tokenizer->eot_token = 50256; 64 | } else if (version == 2) { 65 | tokenizer->eot_token = header[3]; 66 | } else { 67 | fprintf(stderr, "Tokenizer model file %s has bad version: %d\n", filename, version); 68 | exit(EXIT_FAILURE); 69 | } 70 | // read in all the tokens 71 | unsigned char length; 72 | tokenizer->token_table = (char **)mallocCheck(tokenizer->vocab_size * sizeof(char *)); 73 | for (uint32_t i = 0; i < tokenizer->vocab_size; i++) { 74 | freadCheck(&length, sizeof(unsigned char), 1, file); 75 | assert(length > 0); // every token should be at least one character 76 | char *token_bytes = (char *)mallocCheck(length + 1); 77 | freadCheck(token_bytes, sizeof(char), length, file); 78 | token_bytes[length] = '\0'; // Add null terminator for printing 79 | tokenizer->token_table[i] = token_bytes; 80 | } 81 | // cleanups 82 | fcloseCheck(file); 83 | tokenizer->init_ok = 1; 84 | } 85 | 86 | const char *tokenizer_decode(Tokenizer *tokenizer, uint32_t token_id) { 87 | if (tokenizer->init_ok == 0) { 88 | return NULL; 89 | } 90 | if (token_id < tokenizer->vocab_size) { 91 | return tokenizer->token_table[token_id]; 92 | } else { 93 | printf("invalid token id %u!\n", token_id); 94 | return NULL; 95 | } 96 | } 97 | 98 | void tokenizer_free(Tokenizer *tokenizer) { 99 | if (tokenizer->init_ok) { 100 | for (uint32_t i = 0; i < tokenizer->vocab_size; i++) { 101 | free(tokenizer->token_table[i]); 102 | } 103 | free(tokenizer->token_table); 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /.github/workflows/ci_gpu.yml: -------------------------------------------------------------------------------- 1 | name: GPU Builds and Tests 2 | 3 | on: 4 | create: 5 | workflow_dispatch: 6 | push: 7 | branches: 8 | - master 9 | pull_request: 10 | branches: 11 | - master 12 | 13 | jobs: 14 | build-and-test-gpu: 15 | runs-on: ubicloud-gpu-standard-1-latest 16 | 17 | steps: 18 | - name: Checkout code 19 | uses: actions/checkout@v4 20 | 21 | - name: Install OpenMP 22 | run: sudo apt-get update && sudo apt-get install -y libomp-dev 23 | 24 | - name: Install dependencies 25 | run: pip install -r requirements.txt 26 | 27 | - name: Run preprocessing 28 | run: 
python dev/data/tinyshakespeare.py 29 | 30 | - name: Train model 31 | run: python train_gpt2.py 32 | 33 | - name: Compile training and testing program 34 | run: make test_gpt2cu train_gpt2cu test_gpt2fp32cu train_gpt2fp32cu 35 | 36 | - name: Train model (With OpenMP) 37 | run: OMP_NUM_THREADS=8 ./train_gpt2cu 38 | 39 | - name: Train model (FP32) with gpt2_124M.bin 40 | run: | 41 | PRECISION=FP32 make train_gpt2cu 42 | ./train_gpt2cu -b 1 -t 64 -d 256 -l 0.0001 -v 200 -s 200 -a 1 -x 10 -r 0 -f 0 -e "gpt2_124M.bin" 43 | 44 | - name: Test for percent loss differential for FP32 45 | run: | 46 | PRECISION=FP32 make train_gpt2cu 47 | ./train_gpt2cu -b 1 -t 64 -d 256 -l 0.0001 -v 200 -s 200 -a 1 -x 10 -r 0 -f 0 -e "gpt2_124M.bin" > train_gpt2cu_fp32_precision.txt 48 | python dev/loss_checker_ci.py -f train_gpt2cu_fp32_precision.txt -s 20 -e 28 -a 5.0 49 | 50 | - name: Build FP32 precision 51 | run: PRECISION=FP32 make test_gpt2cu profile_gpt2cu 52 | 53 | - name: Run default 54 | run: ./test_gpt2cu 55 | 56 | - name: Run no recompute GeLU 57 | run: ./test_gpt2cu -r 0 58 | 59 | - name: Run recompute LN 60 | run: ./test_gpt2cu -r 2 61 | 62 | - name: Build BF16 precision 63 | run: PRECISION=BF16 make train_gpt2cu test_gpt2cu profile_gpt2cu 64 | 65 | - name: Run default 66 | run: ./test_gpt2cu 67 | 68 | - name: Run no recompute GeLU 69 | run: ./test_gpt2cu -r 0 70 | 71 | - name: Run no master weights 72 | run: ./test_gpt2cu -w 0 73 | 74 | - name: Run recompute LN 75 | run: ./test_gpt2cu -r 2 76 | 77 | - name: Train model fp32 (With OpenMP) 78 | run: OMP_NUM_THREADS=8 ./train_gpt2fp32cu 79 | 80 | - name: Execute testing program (With OpenMP) 81 | run: OMP_NUM_THREADS=8 ./test_gpt2cu 82 | 83 | - name: Execute testing program fp32 (With OpenMP) 84 | run: OMP_NUM_THREADS=8 ./test_gpt2fp32cu 85 | 86 | - name: Compile training and testing program without OpenMP 87 | run: NO_OMP=1 make test_gpt2cu train_gpt2cu test_gpt2fp32cu train_gpt2fp32cu 88 | 89 | - name: Train model (No OpenMP) 90 | run: NO_OMP=1 ./train_gpt2cu 91 | 92 | - name: Train model fp32 (No OpenMP) 93 | run: NO_OMP=1 ./train_gpt2fp32cu 94 | 95 | - name: Execute testing program (No OpenMP) 96 | run: ./test_gpt2cu -b 32 97 | 98 | - name: Execute testing program fp32 (No OpenMP) 99 | run: ./test_gpt2fp32cu 100 | 101 | - name: Install cuDNN-frontend 102 | run: 103 | git clone https://github.com/NVIDIA/cudnn-frontend.git 104 | 105 | - name: Build with cuDNN 106 | run: USE_CUDNN=1 make test_gpt2cu train_gpt2cu test_gpt2fp32cu train_gpt2fp32cu 107 | 108 | - name: Train model with cuDNN 109 | run: ./train_gpt2cu 110 | 111 | - name: Train model fp32 with cuDNN 112 | run: ./train_gpt2fp32cu 113 | 114 | - name: Execute testing program with cuDNN 115 | run: ./test_gpt2cu 116 | 117 | - name: Execute testing program fp32 with cuDNN 118 | run: ./test_gpt2fp32cu 119 | 120 | unit-tests-gpu: 121 | runs-on: ubicloud-gpu-standard-1-latest 122 | 123 | steps: 124 | - name: Checkout code 125 | uses: actions/checkout@v4 126 | 127 | - name: Test Device<->File IO 128 | run: cd dev/test && nvcc -o device_file_io device_file_io.cu && ./device_file_io 129 | -------------------------------------------------------------------------------- /dev/data/tinystories.py: -------------------------------------------------------------------------------- 1 | """ 2 | Downloads and tokenizes the TinyStories dataset. 3 | - The download is from HuggingFace datasets. 4 | - The tokenization is GPT-2 tokenizer with tiktoken 5 | 6 | The output is written to a newly created tinystories/ folder. 
7 | The script prints: 8 | 9 | Tokenizing val split... 10 | Saved 19043638 tokens to tinystories/TinyStories_val.bin 11 | Tokenizing train split... 12 | Saved 925653391 tokens to tinystories/TinyStories_train.bin 13 | 14 | And runs in 1-2 minutes two depending on your internet 15 | connection and computer. The .bin files are raw byte 16 | streams of int32 numbers indicating the token ids. 17 | """ 18 | 19 | import os 20 | import glob 21 | import json 22 | import random 23 | import requests 24 | from tqdm import tqdm 25 | from concurrent.futures import ProcessPoolExecutor, as_completed 26 | import tiktoken 27 | import numpy as np 28 | from data_common import download_file, write_datafile 29 | 30 | # ----------------------------------------------------------------------------- 31 | DATA_CACHE_DIR = os.path.join(os.path.dirname(__file__), "tinystories") 32 | 33 | enc = tiktoken.get_encoding("gpt2") 34 | encode = lambda s: enc.encode_ordinary(s) 35 | 36 | def download(): 37 | """Downloads the TinyStories dataset to DATA_CACHE_DIR""" 38 | os.makedirs(DATA_CACHE_DIR, exist_ok=True) 39 | 40 | # download the TinyStories dataset, unless it's already downloaded 41 | data_url = "https://huggingface.co/datasets/roneneldan/TinyStories/resolve/main/TinyStories_all_data.tar.gz" 42 | data_filename = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data.tar.gz") 43 | if not os.path.exists(data_filename): 44 | print(f"Downloading {data_url} to {data_filename}...") 45 | download_file(data_url, data_filename) 46 | else: 47 | print(f"{data_filename} already exists, skipping download...") 48 | 49 | # unpack the tar.gz file into all the data shards (json files) 50 | data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data") 51 | if not os.path.exists(data_dir): 52 | os.makedirs(data_dir, exist_ok=True) 53 | print(f"Unpacking {data_filename}...") 54 | os.system(f"tar -xzf {data_filename} -C {data_dir}") 55 | else: 56 | print(f"{data_dir} already exists, skipping unpacking...") 57 | 58 | # print a single example just for debugging and such 59 | shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json"))) 60 | print("Download done.") 61 | print(f"Number of shards: {len(shard_filenames)}") 62 | # with open(shard_filenames[0], "r") as f: 63 | # data = json.load(f) 64 | # print(f"Example story:\n{data[0]}") 65 | 66 | def process_shard(shard_index, shard_filename): 67 | with open(shard_filename, "r") as f: 68 | data = json.load(f) 69 | eot = enc._special_tokens['<|endoftext|>'] # end of text token 70 | rng = random.Random(1337 + shard_index) 71 | rng.shuffle(data) 72 | all_tokens = [] 73 | for example in data: 74 | text = example["story"] 75 | text = text.strip() # get rid of leading/trailing whitespace 76 | tokens = encode(text) 77 | all_tokens.append(eot) 78 | all_tokens.extend(tokens) 79 | return all_tokens 80 | 81 | def tokenize(): 82 | # shard 0 will be the val split, rest is train 83 | data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data") 84 | shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json"))) 85 | val_shards = [shard_filenames[0]] 86 | train_shards = shard_filenames[1:] 87 | for split_name, split_shards in [("val", val_shards), ("train", train_shards)]: 88 | 89 | print(f"Tokenizing {split_name} split...") 90 | all_tokens = [] 91 | with ProcessPoolExecutor() as executor: 92 | futures = [executor.submit(process_shard, shard_index, shard_filename) 93 | for shard_index, shard_filename in enumerate(split_shards)] 94 | for future in as_completed(futures): 95 | 
all_tokens.extend(future.result()) 96 | 97 | split_filename = os.path.join(DATA_CACHE_DIR, f"TinyStories_{split_name}.bin") 98 | write_datafile(split_filename, all_tokens) 99 | 100 | if __name__ == "__main__": 101 | download() 102 | tokenize() 103 | 104 | # Prints: 105 | # Tokenizing val split... 106 | # Saved 19043638 tokens to tinystories/TinyStories_val.bin 107 | # Tokenizing train split... 108 | # Saved 925653391 tokens to tinystories/TinyStories_train.bin 109 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # llm.cpp 2 | 项目 fork 自 karpathy 的 [llm.c](https://github.com/karpathy/llm.c),使用 C++(with Eigen) 来复现 GPT-2,支持 CPU/CUDA 计算。 3 | - 所有的计算部分都通过 Eigen Tensor 完成,所以同样一份代码通过简单地切换 Device 就可完成 CPU/CUDA 的计算 4 | - 这里实现的 GPT-2 与 PyTorch 版本是完全对齐的 5 | - 值得注意的是,CPU 版本比 PyTorch 快大约 20%,但是 GPU 版本比 PyTorch GPU 慢得多,主要原因是 Eigen 的 Tensor 不支持 BatchMatmul 6 | 7 | 8 | This repo is forked from karpathy's [llm.c](https://github.com/karpathy/llm.c), using C++ (with Eigen) to reproduce GPT-2. 9 | 10 | - All calculations are done through the Eigen Tensor Module, so the same code can be used for CPU/CUDA calculations by simply switching the Device. 11 | - Currently, this repo has reproduced GPT-2 and the results are completely aligned with the PyTorch version. 12 | - It is worth noting that CPU calculations are about 20% faster than PyTorch, while GPU calculations are still far behind PyTorch's GPU due to the difficulty of Eigen Tensor Module to support BatchMatmul. 13 | 14 | ## quick start (CPU) 15 | 16 | ```bash 17 | pip install -r requirements.txt 18 | python dev/data/tinyshakespeare.py 19 | python train_gpt2.py 20 | mkdir build && cd build 21 | cmake .. 22 | make train_gpt2_cpu 23 | cd ../ 24 | ./build/llmcpp/train_gpt2_cpu 25 | ``` 26 | 27 | The above lines 28 | - (1) download the [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset, 29 | tokenize it with the GPT-2 Tokenizer 30 | - (2) download and save the GPT-2 (124M) weights 31 | - (3) init from them in C++ and train for 40 steps on tinyshakespeare with AdamW (using batch size 4, context length only 64), evaluate validation loss, and sample some text. The output looks like this on my LMDE3 (Intel® Core™ i7-10700K CPU @ 3.80GHz × 8): 32 | 33 | ``` 34 | [GPT-2] 35 | max_seq_len: 1024 36 | vocab_size: 50257 37 | padded_vocab_size: 50304 38 | num_layers: 12 39 | num_heads: 12 40 | channels: 768 41 | num_parameters: 124475904(474 MB) 42 | train dataset num_batches: 1192 43 | val dataset num_batches: 128 44 | num_activations: 82723584(315 MB) 45 | val loss 5.325413 46 | step 0: train loss 5.356086 (took 786.515755 ms) 47 | step 1: train loss 4.300581 (took 677.340087 ms) 48 | step 2: train loss 4.623053 (took 674.843167 ms) 49 | step 3: train loss 4.599307 (took 673.189660 ms) 50 | ... (truncated) ... 51 | step 39: train loss 3.972404 (took 749.386021 ms) 52 | val loss 4.017484 53 | generating: 54 | --- 55 | Requinetarius, 56 | Which; supreme, but 57 | Commands jest in vain for ever. 58 | 59 | <|endoftext|>Lady: 60 | No, heavens, 61 | I were not to haste 62 | To retire valorously and look nobly in the face, 63 | Before this 64 | UNHISILIUS UNDERDEINTS 65 | 66 | --- 67 | step 40: train loss 4.378605 (took 692.830391 ms) 68 | final 40 iters avg: 692.974 ms 69 | ``` 70 | 71 | ## quick start (1 GPU, fp32 only) 72 | ```bash 73 | mkdir build && cd build 74 | cmake ..
75 | make train_gpt2_gpu 76 | cd ../ 77 | ./build/llmcpp/train_gpt2_gpu 78 | ``` 79 | 80 | 81 | ## datasets 82 | 83 | The data files inside `/dev/data/(dataset).py` are responsible for downloading, tokenizing and saving the tokens to .bin files, readable easily from C. So for example when you run: 84 | 85 | ```bash 86 | python dev/data/tinyshakespeare.py 87 | ``` 88 | 89 | We download and tokenize the [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset. The output of this looks like this: 90 | 91 | ``` 92 | writing 32,768 tokens to ./dev/data/tinyshakespeare/tiny_shakespeare_val.bin 93 | writing 305,260 tokens to ./dev/data/tinyshakespeare/tiny_shakespeare_train.bin 94 | ``` 95 | 96 | The .bin files contain a short header (1024 bytes) and then a stream of tokens in uint16, indicating the token ids with the GPT-2 tokenizer. More datasets are available in `/dev/data`. 97 | 98 | ## test 99 | 100 | I am also attaching a simple unit test for making sure our C++ code agrees with the PyTorch code. On the CPU as an example, compile and run with: 101 | 102 | ```bash 103 | mkdir build && cd build 104 | cmake .. 105 | make test_gpt2_cpu 106 | cd ../ 107 | ./build/llmcpp/test_gpt2_cpu 108 | ``` 109 | 110 | This now loads the `gpt2_124M_debug_state.bin` file that gets written by train_gpt2.py, runs a forward pass, compares the logits and loss with the PyTorch reference implementation, then it does 10 iterations of training with Adam and makes sure the losses match PyTorch. 111 | This tests both the fp32 path and the mixed precision path. The test should pass and print `overall okay: 1`. 112 | 113 | 114 | ## license 115 | 116 | MIT 117 | -------------------------------------------------------------------------------- /llmcpp/README.md: -------------------------------------------------------------------------------- 1 | # llm.cpp 2 | 项目 fork 自 karpathy 的 [llm.c](https://github.com/karpathy/llm.c),使用 C++(with Eigen) 来复现 GPT-2,支持 CPU/CUDA 计算。 3 | - 所有的计算部分都通过 Eigen Tensor 完成,所以同样一份代码通过简单地切换 Device 就可完成 CPU/CUDA 的计算 4 | - 这里实现的 GPT-2 与 PyTorch 版本是完全对齐的 5 | - 值得注意的是,CPU 版本比 PyTorch 快大约 20%,但是 GPU 版本比 PyTorch GPU 慢得多,主要原因是 Eigen 的 Tensor 不支持 BatchMatmul 6 | 7 | 8 | This repo is forked from karpathy's [llm.c](https://github.com/karpathy/llm.c), using C++ (with Eigen) to reproduce GPT-2. 9 | 10 | - All calculations are done through the Eigen Tensor Module, so the same code can be used for CPU/CUDA calculations by simply switching the Device. 11 | - Currently, this repo has reproduced GPT-2 and the results are completely aligned with the PyTorch version. 12 | - It is worth noting that CPU calculations are about 20% faster than PyTorch, while GPU calculations are still far behind PyTorch's GPU due to the difficulty of Eigen Tensor Module to support BatchMatmul. 13 | 14 | ## quick start (CPU) 15 | 16 | ```bash 17 | pip install -r requirements.txt 18 | python dev/data/tinyshakespeare.py 19 | python train_gpt2.py 20 | mkdir build && cd build 21 | cmake .. 
22 | make train_gpt2_cpu 23 | cd ../ 24 | ./build/llmcpp/train_gpt2_cpu 25 | ``` 26 | 27 | The above lines 28 | - (1) download the [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset, 29 | tokenize it with the GPT-2 Tokenizer 30 | - (2) download and save the GPT-2 (124M) weights 31 | - (3) init from them in C++ and train for 40 steps on tinyshakespeare with AdamW (using batch size 4, context length only 64), evaluate validation loss, and sample some text. The output looks like this on my LMDE3 (Intel® Core™ i7-10700K CPU @ 3.80GHz × 8): 32 | 33 | ``` 34 | [GPT-2] 35 | max_seq_len: 1024 36 | vocab_size: 50257 37 | padded_vocab_size: 50304 38 | num_layers: 12 39 | num_heads: 12 40 | channels: 768 41 | num_parameters: 124475904(474 MB) 42 | train dataset num_batches: 1192 43 | val dataset num_batches: 128 44 | num_activations: 82723584(315 MB) 45 | val loss 5.325413 46 | step 0: train loss 5.356086 (took 786.515755 ms) 47 | step 1: train loss 4.300581 (took 677.340087 ms) 48 | step 2: train loss 4.623053 (took 674.843167 ms) 49 | step 3: train loss 4.599307 (took 673.189660 ms) 50 | ... (truncated) ... 51 | step 39: train loss 3.972404 (took 749.386021 ms) 52 | val loss 4.017484 53 | generating: 54 | --- 55 | Requinetarius, 56 | Which; supreme, but 57 | Commands jest in vain for ever. 58 | 59 | <|endoftext|>Lady: 60 | No, heavens, 61 | I were not to haste 62 | To retire valorously and look nobly in the face, 63 | Before this 64 | UNHISILIUS UNDERDEINTS 65 | 66 | --- 67 | step 40: train loss 4.378605 (took 692.830391 ms) 68 | final 40 iters avg: 692.974 ms 69 | ``` 70 | 71 | ## quick start (1 GPU, fp32 only) 72 | ```bash 73 | mkdir build && cd build 74 | cmake .. 75 | make train_gpt2_gpu 76 | cd ../ 77 | ./build/llmcpp/train_gpt2_gpu 78 | ``` 79 | 80 | 81 | ## datasets 82 | 83 | The data files inside `/dev/data/(dataset).py` are responsible for downloading, tokenizing and saving the tokens to .bin files, readable easily from C. So for example when you run: 84 | 85 | ```bash 86 | python dev/data/tinyshakespeare.py 87 | ``` 88 | 89 | We download and tokenize the [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset. The output of this looks like this: 90 | 91 | ``` 92 | writing 32,768 tokens to ./dev/data/tinyshakespeare/tiny_shakespeare_val.bin 93 | writing 305,260 tokens to ./dev/data/tinyshakespeare/tiny_shakespeare_train.bin 94 | ``` 95 | 96 | The .bin files contain a short header (1024 bytes) and then a stream of tokens in uint16, indicating the token ids with the GPT-2 tokenizer. More datasets are available in `/dev/data`. 97 | 98 | ## test 99 | 100 | I am also attaching a simple unit test for making sure our C++ code agrees with the PyTorch code. On the CPU as an example, compile and run with: 101 | 102 | ```bash 103 | mkdir build && cd build 104 | cmake .. 105 | make test_gpt2_cpu 106 | cd ../ 107 | ./build/llmcpp/test_gpt2_cpu 108 | ``` 109 | 110 | This now loads the `gpt2_124M_debug_state.bin` file that gets written by train_gpt2.py, runs a forward pass, compares the logits and loss with the PyTorch reference implementation, then it does 10 iterations of training with Adam and makes sure the losses match PyTorch. 111 | This tests both the fp32 path and the mixed precision path. The test should pass and print `overall okay: 1`.
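For intuition, the comparison boils down to an element-wise tolerance check between the C++ outputs and the PyTorch reference tensors. The sketch below is a minimal illustration of that idea only; the function name `check_tensor`, the `1e-2` tolerance, and the toy values are assumptions for the example, not code taken from `test_gpt2_cpu`:

```cpp
// Minimal sketch of a tolerance check in the spirit of the unit test.
// Assumed for illustration: the name check_tensor, the 1e-2 tolerance,
// and the toy values; none of these are the actual test_gpt2_cpu sources.
#include <cmath>
#include <cstdio>
#include <vector>

// return true if every element of `actual` is within `tol` of `expected`
bool check_tensor(const std::vector<float>& actual,
                  const std::vector<float>& expected,
                  float tol, const char* label) {
    if (actual.size() != expected.size()) {
        std::printf("%s: size mismatch\n", label);
        return false;
    }
    bool ok = true;
    for (size_t i = 0; i < actual.size(); ++i) {
        if (std::fabs(actual[i] - expected[i]) > tol) { ok = false; }
    }
    std::printf("%s: %s\n", label, ok ? "OK" : "NOT OK");
    return ok;
}

int main() {
    // toy stand-ins for logits from the C++ forward pass vs. the PyTorch reference
    std::vector<float> cpp_logits   = {0.10f, -1.20f, 3.40f};
    std::vector<float> torch_logits = {0.10f, -1.20f, 3.41f};
    bool overall_ok = check_tensor(cpp_logits, torch_logits, 1e-2f, "logits");
    std::printf("overall okay: %d\n", overall_ok ? 1 : 0);
    return overall_ok ? 0 : 1;
}
```

The real test applies the same idea to the full logits tensor and to the loss at each of the 10 training steps, and reports `overall okay: 1` when every check passes.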
112 | 113 | 114 | ## license 115 | 116 | MIT 117 | -------------------------------------------------------------------------------- /llmc/schedulers.h: -------------------------------------------------------------------------------- 1 | /* 2 | Implements various learning rate schedulers. 3 | */ 4 | #ifndef SCHEDULERS_H 5 | #define SCHEDULERS_H 6 | 7 | #include <assert.h> 8 | #include <math.h> 9 | #include <string.h> 10 | 11 | typedef struct { 12 | const char* type; 13 | float learning_rate; 14 | int warmup_iterations; 15 | int train_num_batches; 16 | float final_learning_rate_frac; 17 | } LearningRateScheduler; 18 | 19 | void lr_scheduler_init(LearningRateScheduler *scheduler, const char* scheduler_type, float learning_rate, int warmup_iterations, int train_num_batches, float final_learning_rate_frac) { 20 | scheduler->type = scheduler_type; 21 | scheduler->learning_rate = learning_rate; 22 | scheduler->warmup_iterations = warmup_iterations; 23 | scheduler->train_num_batches = train_num_batches; 24 | scheduler->final_learning_rate_frac = final_learning_rate_frac; 25 | } 26 | 27 | // cosine: warmup linearly to max LR, then cosine decay to LR * final_learning_rate_frac 28 | float get_learning_rate_cosine(LearningRateScheduler *scheduler, int step) { 29 | float lr = scheduler->learning_rate; 30 | if (step < scheduler->warmup_iterations) { 31 | lr = scheduler->learning_rate * ((float)(step + 1)) / scheduler->warmup_iterations; 32 | } else { 33 | float decay_ratio = ((float)(step - scheduler->warmup_iterations)) / (scheduler->train_num_batches - scheduler->warmup_iterations); 34 | assert(0.0f <= decay_ratio && decay_ratio <= 1.0f); 35 | float coeff = 0.5f * (1.0f + cosf(M_PI * decay_ratio)); // coeff starts at 1 and goes to 0 36 | assert(0.0f <= coeff && coeff <= 1.0f); 37 | float min_lr = scheduler->learning_rate * scheduler->final_learning_rate_frac; 38 | lr = min_lr + coeff * (scheduler->learning_rate - min_lr); 39 | } 40 | return lr; 41 | } 42 | 43 | // linear: warmup linearly to max LR, then decay linearly to LR * final_learning_rate_frac 44 | float get_learning_rate_linear(LearningRateScheduler *scheduler, int step) { 45 | float lr = scheduler->learning_rate; 46 | if (step < scheduler->warmup_iterations) { 47 | lr = scheduler->learning_rate * ((float)(step + 1)) / scheduler->warmup_iterations; 48 | } else { 49 | float decay_ratio = ((float)(step - scheduler->warmup_iterations)) / (scheduler->train_num_batches - scheduler->warmup_iterations); 50 | assert(0.0f <= decay_ratio && decay_ratio <= 1.0f); 51 | float min_lr = scheduler->learning_rate * scheduler->final_learning_rate_frac; 52 | lr = scheduler->learning_rate - decay_ratio * (scheduler->learning_rate - min_lr); 53 | } 54 | return lr; 55 | } 56 | 57 | // constant 58 | float get_learning_rate_constant(LearningRateScheduler *scheduler, int step) { 59 | return scheduler->learning_rate; 60 | } 61 | 62 | // wsd schedule: warmup linearly, keep constant, last 20% decay using 1 - sqrt decay to final_frac (should be 0.0) 63 | // https://arxiv.org/abs/2405.18392 64 | float get_learning_rate_wsd(LearningRateScheduler *scheduler, int step) { 65 | int decay_point = (int)(0.8f * scheduler->train_num_batches); 66 | float max_lr = scheduler->learning_rate; 67 | float lr = max_lr; 68 | if (step < scheduler->warmup_iterations) { 69 | float decay_ratio = ((float)(step + 1)) / scheduler->warmup_iterations; 70 | lr = max_lr * decay_ratio; 71 | } else if (step < decay_point) { 72 | // noop, keep lr constant 73 | } else { 74 | float decay_ratio = ((float)(step - decay_point)) /
(scheduler->train_num_batches - decay_point); 75 | assert(0.0f <= decay_ratio && decay_ratio <= 1.0f); 76 | float min_lr = max_lr * scheduler->final_learning_rate_frac; 77 | return min_lr + (1.0f - sqrtf(decay_ratio)) * (max_lr - min_lr); 78 | } 79 | return lr; 80 | } 81 | 82 | // return the learning rate at a given step 83 | float get_learning_rate(LearningRateScheduler *scheduler, int step) { 84 | float step_learning_rate; 85 | if (strcmp(scheduler->type, "cosine") == 0) { 86 | step_learning_rate = get_learning_rate_cosine(scheduler, step); 87 | } else if (strcmp(scheduler->type, "linear") == 0) { 88 | step_learning_rate = get_learning_rate_linear(scheduler, step); 89 | } else if (strcmp(scheduler->type, "constant") == 0) { 90 | step_learning_rate = get_learning_rate_constant(scheduler, step); 91 | } else if (strcmp(scheduler->type, "wsd") == 0) { 92 | step_learning_rate = get_learning_rate_wsd(scheduler, step); 93 | } else { 94 | fprintf(stderr, "Unknown learning rate scheduler type: %s\n", scheduler->type); 95 | exit(EXIT_FAILURE); 96 | } 97 | return step_learning_rate; 98 | } 99 | 100 | #endif // SCHEDULERS_H 101 | -------------------------------------------------------------------------------- /dev/eval/run_eval.sh: -------------------------------------------------------------------------------- 1 | # https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard 2 | # (See About tab -> REPRODUCIBILITY) 3 | 4 | # This script is intended to be run from the parent/root directory of llm.c repo. 5 | 6 | # Clone the evaluation harness: 7 | 8 | # git clone https://github.com/EleutherAI/lm-evaluation-harness/ 9 | # cd lm-evaluation-harness 10 | # git checkout b281b0921b636bc36ad05c0b0b0763bd6dd43463 11 | # pip install -e . 12 | 13 | # Then return to the parent directory and run this script 14 | 15 | # cd .. 
16 | # ./dev/eval/run_eval.sh [model_name] [result_name] 17 | 18 | # where model_name is either a HF model such as openai-community/gpt2 or a local path such as ./gpt2-124M-run1 19 | # and result_name is the name of the folder under lm-evaluation-harness/results to store the evaluations 20 | 21 | # Since the evals can take a couple of hours to run, depending on the model size, you may wish to 22 | # run within a "screen" session or by using nohup to run the script: 23 | 24 | # nohup ./dev/eval/run_eval.sh [model_name] [result_name] > run.txt 2> err.txt & 25 | 26 | if [ -z "$1" ]; then 27 | echo "Error: missing HuggingFace model name or path to local model" 28 | echo "./run_eval.sh hf_account/model_name my_result" 29 | exit 1 30 | fi 31 | if [ -z "$2" ]; then 32 | echo "Error: missing output name for results" 33 | echo "./run_eval.sh hf_account/model_name my_result" 34 | exit 1 35 | fi 36 | 37 | export MODEL="$(realpath -s "$1")" 38 | export RESULT="$2" 39 | echo "Evaluating model $MODEL" 40 | echo "Saving results to ./lm-evaluation-harness/results/$RESULT" 41 | 42 | cd lm-evaluation-harness 43 | 44 | python main.py --model hf-causal-experimental --model_args pretrained=$MODEL,use_accelerate=True,trust_remote_code=True --tasks truthfulqa_mc --batch_size 1 --no_cache --write_out --output_path results/$RESULT/truthfulqa_0shot.json --device cuda 45 | python main.py --model hf-causal-experimental --model_args pretrained=$MODEL,use_accelerate=True,trust_remote_code=True --tasks winogrande --batch_size 1 --no_cache --write_out --output_path results/$RESULT/winogrande_5shot.json --device cuda --num_fewshot 5 46 | python main.py --model hf-causal-experimental --model_args pretrained=$MODEL,use_accelerate=True,trust_remote_code=True --tasks arc_challenge --batch_size 1 --no_cache --write_out --output_path results/$RESULT/arc_challenge_25shot.json --device cuda --num_fewshot 25 47 | python main.py --model hf-causal-experimental --model_args pretrained=$MODEL,use_accelerate=True,trust_remote_code=True --tasks hellaswag --batch_size 1 --no_cache --write_out --output_path results/$RESULT/hellaswag_10shot.json --device cuda --num_fewshot 10 48 | python main.py --model hf-causal-experimental --model_args pretrained=$MODEL,use_accelerate=True,trust_remote_code=True --tasks gsm8k --batch_size 1 --no_cache --write_out --output_path results/$RESULT/gsm8k_5shot.json --device cuda --num_fewshot 5 49 | python main.py --model hf-causal-experimental --model_args pretrained=$MODEL,use_accelerate=True,trust_remote_code=True --tasks 
hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions --batch_size 1 --no_cache --write_out --output_path results/$RESULT/mmlu_5shot.json --device cuda --num_fewshot 5 50 | 51 | cd .. 52 | python dev/eval/summarize_eval.py lm-evaluation-harness/results/$RESULT 53 | -------------------------------------------------------------------------------- /dev/data/data_common.py: -------------------------------------------------------------------------------- 1 | """ 2 | Common utilities for the datasets 3 | """ 4 | 5 | import requests 6 | from tqdm import tqdm 7 | import numpy as np 8 | 9 | 10 | def download_file(url: str, fname: str, chunk_size=1024): 11 | """Helper function to download a file from a given url""" 12 | resp = requests.get(url, stream=True) 13 | total = int(resp.headers.get("content-length", 0)) 14 | with open(fname, "wb") as file, tqdm( 15 | desc=fname, 16 | total=total, 17 | unit="iB", 18 | unit_scale=True, 19 | unit_divisor=1024, 20 | ) as bar: 21 | for data in resp.iter_content(chunk_size=chunk_size): 22 | size = file.write(data) 23 | bar.update(size) 24 | 25 | 26 | def write_datafile(filename, toks): 27 | """ 28 | Saves token data as a .bin file, for reading in C. 
29 | - First comes a header with 256 int32s 30 | - The tokens follow, each as a uint16 31 | """ 32 | assert len(toks) < 2**31, "token count too large" # ~2.1B tokens 33 | # construct the header 34 | header = np.zeros(256, dtype=np.int32) 35 | header[0] = 20240520 # magic 36 | header[1] = 1 # version 37 | header[2] = len(toks) # number of tokens after the 256*4 bytes of header (each 2 bytes as uint16) 38 | # construct the tokens numpy array, if not already 39 | if not isinstance(toks, np.ndarray) or not toks.dtype == np.uint16: 40 | # validate that no token exceeds a uint16 41 | maxtok = 2**16 42 | assert all(0 <= t < maxtok for t in toks), "token dictionary too large for uint16" 43 | toks_np = np.array(toks, dtype=np.uint16) 44 | else: 45 | toks_np = toks 46 | # write to file 47 | print(f"writing {len(toks):,} tokens to {filename}") 48 | with open(filename, "wb") as f: 49 | f.write(header.tobytes()) 50 | f.write(toks_np.tobytes()) 51 | 52 | def write_evalfile(filename, datas): 53 | """ 54 | Saves eval data as a .bin file, for reading in C. 55 | Used for multiple-choice style evals, e.g. HellaSwag and MMLU 56 | - First comes a header with 256 int32s 57 | - The examples follow, each example is a stream of uint16_t: 58 | - <START_EXAMPLE> delimiter of 2**16-1, i.e. 65,535 59 | - <EXAMPLE_BYTES>, bytes encoding this example, allowing efficient skip to next 60 | - <EXAMPLE_INDEX>, the index of the example in the dataset 61 | -