├── .github └── workflows │ └── build.yml ├── .gitignore ├── CMakeLists.txt ├── LICENSE ├── Makefile ├── README.md ├── convert-pth-to-ggml.py ├── ggml.c ├── ggml.h ├── main.cpp ├── quantize.cpp ├── quantize.sh ├── screencast.gif ├── utils.cpp └── utils.h /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | workflow_dispatch: # allows manual triggering 5 | inputs: 6 | create_release: 7 | description: 'Create new release' 8 | required: true 9 | type: boolean 10 | push: 11 | paths: ['.github/workflows/**', 'CMakeLists.txt', 'Makefile', '**.h', '*.c', '**.cpp'] 12 | pull_request: 13 | types: [opened, synchronize, edited, reopened, review_requested, ready_for_review] 14 | paths: ['CMakeLists.txt', 'Makefile', '**.h', '*.c', '**.cpp'] 15 | 16 | env: 17 | BRANCH_NAME: ${{ github.head_ref || github.ref_name }} 18 | 19 | jobs: 20 | ubuntu-latest: 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - name: Clone 25 | id: checkout 26 | uses: actions/checkout@v1 27 | 28 | - name: Dependencies 29 | id: depends 30 | run: | 31 | sudo apt-get update 32 | sudo apt-get install build-essential 33 | 34 | - name: Build 35 | id: make_build 36 | run: | 37 | make 38 | 39 | - name: Archive production artifacts 40 | uses: actions/upload-artifact@v3 41 | with: 42 | name: ubuntu 43 | path: | 44 | chat 45 | 46 | 47 | macOS-latest: 48 | runs-on: macOS-latest 49 | 50 | steps: 51 | - name: Clone 52 | id: checkout 53 | uses: actions/checkout@v1 54 | 55 | - name: Dependencies 56 | id: depends 57 | run: | 58 | brew update 59 | 60 | - name: Build 61 | id: make_build 62 | run: | 63 | make 64 | 65 | - name: Archive production artifacts 66 | uses: actions/upload-artifact@v3 67 | with: 68 | name: macos 69 | path: | 70 | chat 71 | 72 | # macos-arm64: 73 | # runs-on: macos-arm64 74 | 75 | # steps: 76 | # - name: Clone 77 | # id: checkout 78 | # uses: actions/checkout@v1 79 | 80 | # - name: Dependencies 81 | # id: depends 82 | # run: | 83 | # brew update 84 | 85 | # - name: Build 86 | # id: make_build 87 | # run: | 88 | # make 89 | 90 | # - name: Archive production artifacts 91 | # uses: actions/upload-artifact@v3 92 | # with: 93 | # name: macos 94 | # path: | 95 | # chat 96 | 97 | windows-latest: 98 | runs-on: windows-latest 99 | 100 | steps: 101 | - name: Clone 102 | id: checkout 103 | uses: actions/checkout@v1 104 | 105 | - name: Build 106 | id: cmake_build 107 | run: | 108 | mkdir build 109 | cd build 110 | cmake .. 111 | cmake --build . 
--config Release 112 | 113 | - name: Set commit hash variables 114 | id: commit 115 | if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} 116 | uses: pr-mpt/actions-commit-hash@v2 117 | 118 | - name: Pack artifacts 119 | id: pack_artifacts 120 | if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} 121 | run: | 122 | 7z a alpaca-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip .\build\Release\* 123 | 124 | - name: Create release 125 | id: create_release 126 | if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} 127 | uses: zendesk/action-create-release@v1 128 | env: 129 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 130 | with: 131 | tag_name: ${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }} 132 | 133 | - name: Upload release 134 | id: upload_release 135 | if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} 136 | uses: actions/upload-release-asset@v1 137 | env: 138 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 139 | with: 140 | upload_url: ${{ steps.create_release.outputs.upload_url }} 141 | asset_path: .\alpaca-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip 142 | asset_name: alpaca-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip 143 | asset_content_type: application/octet-stream 144 | 145 | # ubuntu-latest-gcc: 146 | # runs-on: ubuntu-latest 147 | # 148 | # strategy: 149 | # matrix: 150 | # build: [Debug, Release] 151 | # 152 | # steps: 153 | # - name: Clone 154 | # uses: actions/checkout@v1 155 | # 156 | # - name: Dependencies 157 | # run: | 158 | # sudo apt-get update 159 | # sudo apt-get install build-essential 160 | # sudo apt-get install cmake 161 | # 162 | # - name: Configure 163 | # run: cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }} 164 | # 165 | # - name: Build 166 | # run: | 167 | # make 168 | # 169 | # ubuntu-latest-clang: 170 | # runs-on: ubuntu-latest 171 | # 172 | # strategy: 173 | # matrix: 174 | # build: [Debug, Release] 175 | # 176 | # steps: 177 | # - name: Clone 178 | # uses: actions/checkout@v1 179 | # 180 | # - name: Dependencies 181 | # run: | 182 | # sudo apt-get update 183 | # sudo apt-get install build-essential 184 | # sudo apt-get install cmake 185 | # 186 | # - name: Configure 187 | # run: cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang 188 | # 189 | # - name: Build 190 | # run: | 191 | # make 192 | # 193 | # ubuntu-latest-gcc-sanitized: 194 | # runs-on: ubuntu-latest 195 | # 196 | # strategy: 197 | # matrix: 198 | # sanitizer: [ADDRESS, THREAD, UNDEFINED] 199 | # 200 | # steps: 201 | # - name: Clone 202 | # uses: actions/checkout@v1 203 | # 204 | # - name: Dependencies 205 | # run: | 206 | # sudo apt-get update 207 | # sudo apt-get install build-essential 208 | # sudo apt-get install cmake 209 | # 210 | # - name: Configure 211 | # run: cmake . 
-DCMAKE_BUILD_TYPE=Debug -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON 212 | # 213 | # - name: Build 214 | # run: | 215 | # make 216 | # 217 | # windows: 218 | # runs-on: windows-latest 219 | # 220 | # strategy: 221 | # matrix: 222 | # build: [Release] 223 | # arch: [Win32, x64] 224 | # include: 225 | # - arch: Win32 226 | # s2arc: x86 227 | # - arch: x64 228 | # s2arc: x64 229 | # 230 | # steps: 231 | # - name: Clone 232 | # uses: actions/checkout@v1 233 | # 234 | # - name: Add msbuild to PATH 235 | # uses: microsoft/setup-msbuild@v1 236 | # 237 | # - name: Configure 238 | # run: > 239 | # cmake -S . -B ./build -A ${{ matrix.arch }} 240 | # -DCMAKE_BUILD_TYPE=${{ matrix.build }} 241 | # 242 | # - name: Build 243 | # run: | 244 | # cd ./build 245 | # msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }} 246 | # 247 | # - name: Upload binaries 248 | # uses: actions/upload-artifact@v1 249 | # with: 250 | # name: llama-bin-${{ matrix.arch }} 251 | # path: build/bin/${{ matrix.build }} 252 | # 253 | # windows-blas: 254 | # runs-on: windows-latest 255 | # 256 | # strategy: 257 | # matrix: 258 | # build: [Release] 259 | # arch: [Win32, x64] 260 | # blas: [ON] 261 | # include: 262 | # - arch: Win32 263 | # obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x86.zip 264 | # s2arc: x86 265 | # - arch: x64 266 | # obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x64.zip 267 | # s2arc: x64 268 | # 269 | # steps: 270 | # - name: Clone 271 | # uses: actions/checkout@v1 272 | # 273 | # - name: Add msbuild to PATH 274 | # uses: microsoft/setup-msbuild@v1 275 | # 276 | # - name: Fetch OpenBLAS 277 | # if: matrix.blas == 'ON' 278 | # run: | 279 | # C:/msys64/usr/bin/wget.exe -qO blas.zip ${{ matrix.obzip }} 280 | # 7z x blas.zip -oblas -y 281 | # copy blas/include/cblas.h . 282 | # copy blas/include/openblas_config.h . 283 | # echo "blasdir=$env:GITHUB_WORKSPACE/blas" >> $env:GITHUB_ENV 284 | # 285 | # - name: Configure 286 | # run: > 287 | # cmake -S . -B ./build -A ${{ matrix.arch }} 288 | # -DCMAKE_BUILD_TYPE=${{ matrix.build }} 289 | # -DLLAMA_SUPPORT_OPENBLAS=${{ matrix.blas }} 290 | # -DCMAKE_LIBRARY_PATH="$env:blasdir/lib" 291 | # 292 | # - name: Build 293 | # run: | 294 | # cd ./build 295 | # msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }} 296 | # 297 | # - name: Copy libopenblas.dll 298 | # if: matrix.blas == 'ON' 299 | # run: copy "$env:blasdir/bin/libopenblas.dll" build/bin/${{ matrix.build }} 300 | # 301 | # - name: Upload binaries 302 | # if: matrix.blas == 'ON' 303 | # uses: actions/upload-artifact@v1 304 | # with: 305 | # name: llama-blas-bin-${{ matrix.arch }} 306 | # path: build/bin/${{ matrix.build }} 307 | # 308 | # emscripten: 309 | # runs-on: ubuntu-latest 310 | # 311 | # strategy: 312 | # matrix: 313 | # build: [Release] 314 | # 315 | # steps: 316 | # - name: Clone 317 | # uses: actions/checkout@v1 318 | # 319 | # - name: Dependencies 320 | # run: | 321 | # wget -q https://github.com/emscripten-core/emsdk/archive/master.tar.gz 322 | # tar -xvf master.tar.gz 323 | # emsdk-master/emsdk update 324 | # emsdk-master/emsdk install latest 325 | # emsdk-master/emsdk activate latest 326 | # 327 | # - name: Configure 328 | # run: echo "tmp" 329 | # 330 | # - name: Build 331 | # run: | 332 | # pushd emsdk-master 333 | # source ./emsdk_env.sh 334 | # popd 335 | # emcmake cmake . 
-DCMAKE_BUILD_TYPE=${{ matrix.build }} 336 | # make 337 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /chat 2 | 3 | *.o 4 | *.a 5 | .cache/ 6 | .vs/ 7 | .vscode/ 8 | .DS_Store 9 | 10 | build/ 11 | build-em/ 12 | build-debug/ 13 | build-release/ 14 | build-static/ 15 | build-no-accel/ 16 | build-sanitize-addr/ 17 | build-sanitize-thread/ 18 | 19 | models/* 20 | *.bin 21 | 22 | /main 23 | /quantize 24 | 25 | arm_neon.h 26 | compile_commands.json 27 | 28 | # Windows CMake files 29 | *.vcxproj 30 | *.filters 31 | *.cmake 32 | *.sln 33 | x64/ 34 | Debug/ 35 | Release/ 36 | CMakeFiles/ 37 | CMakeCache.txt 38 | *.dir/ 39 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.8) 2 | project("alpaca.cpp") 3 | 4 | set(CMAKE_CXX_STANDARD 20) 5 | set(CMAKE_CXX_STANDARD_REQUIRED true) 6 | set(CMAKE_C_STANDARD 11) 7 | 8 | if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) 9 | set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) 10 | set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") 11 | endif() 12 | 13 | option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON) 14 | option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF) 15 | 16 | option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF) 17 | option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF) 18 | option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF) 19 | 20 | if (APPLE) 21 | option(LLAMA_NO_ACCELERATE "llama: disable Accelerate framework" OFF) 22 | option(LLAMA_NO_AVX "llama: disable AVX" OFF) 23 | option(LLAMA_NO_AVX2 "llama: disable AVX2" OFF) 24 | option(LLAMA_NO_FMA "llama: disable FMA" OFF) 25 | endif() 26 | 27 | if (NOT MSVC) 28 | if (LLAMA_SANITIZE_THREAD) 29 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=thread") 30 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=thread") 31 | endif() 32 | 33 | if (LLAMA_SANITIZE_ADDRESS) 34 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -fno-omit-frame-pointer") 35 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer") 36 | endif() 37 | 38 | if (LLAMA_SANITIZE_UNDEFINED) 39 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=undefined") 40 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined") 41 | endif() 42 | endif() 43 | 44 | if (APPLE AND NOT LLAMA_NO_ACCELERATE) 45 | find_library(ACCELERATE_FRAMEWORK Accelerate) 46 | if (ACCELERATE_FRAMEWORK) 47 | message(STATUS "Accelerate framework found") 48 | 49 | set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK}) 50 | set(LLAMA_EXTRA_FLAGS ${LLAMA_EXTRA_FLAGS} -DGGML_USE_ACCELERATE) 51 | else() 52 | message(WARNING "Accelerate framework not found") 53 | endif() 54 | endif() 55 | 56 | if (LLAMA_ALL_WARNINGS) 57 | if (NOT MSVC) 58 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} \ 59 | -Wall \ 60 | -Wextra \ 61 | -Wpedantic \ 62 | -Wshadow \ 63 | -Wcast-qual \ 64 | -Wstrict-prototypes \ 65 | -Wpointer-arith \ 66 | -Wno-unused-function \ 67 | ") 68 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} \ 69 | -Wall \ 70 | -Wextra \ 71 | -Wpedantic \ 72 | -Wcast-qual \ 73 | ") 74 | else() 75 | # todo : msvc 76 | endif() 77 | endif() 78 | 79 | message(STATUS 
"CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") 80 | 81 | if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") 82 | message(STATUS "ARM detected") 83 | else() 84 | message(STATUS "x86 detected") 85 | if (MSVC) 86 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2") 87 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2") 88 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2") 89 | else() 90 | if(NOT LLAMA_NO_AVX) 91 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx") 92 | endif() 93 | if(NOT LLAMA_NO_AVX2) 94 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2") 95 | endif() 96 | if(NOT LLAMA_NO_FMA) 97 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma") 98 | endif() 99 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c") 100 | endif() 101 | endif() 102 | 103 | # if (LLAMA_PERF) 104 | # set(LLAMA_EXTRA_FLAGS ${LLAMA_EXTRA_FLAGS} -DGGML_PERF) 105 | # endif() 106 | 107 | add_executable(main 108 | main.cpp 109 | utils.cpp 110 | utils.h) 111 | 112 | add_executable(quantize 113 | quantize.cpp 114 | utils.cpp 115 | utils.h) 116 | 117 | add_library(ggml 118 | ggml.c 119 | ggml.h) 120 | 121 | target_compile_definitions(ggml PUBLIC ${LLAMA_EXTRA_FLAGS}) 122 | target_compile_definitions(main PUBLIC ${LLAMA_EXTRA_FLAGS}) 123 | target_compile_definitions(quantize PUBLIC ${LLAMA_EXTRA_FLAGS}) 124 | 125 | target_link_libraries(ggml PRIVATE ${LLAMA_EXTRA_LIBS}) 126 | target_include_directories(ggml PUBLIC .) 127 | target_link_libraries(quantize PRIVATE ggml) 128 | target_link_libraries(main PRIVATE ggml) 129 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Georgi Gerganov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ifndef UNAME_S 2 | UNAME_S := $(shell uname -s) 3 | endif 4 | 5 | ifndef UNAME_P 6 | UNAME_P := $(shell uname -p) 7 | endif 8 | 9 | ifndef UNAME_M 10 | UNAME_M := $(shell uname -m) 11 | endif 12 | 13 | CCV := $(shell $(CC) --version | head -n 1) 14 | CXXV := $(shell $(CXX) --version | head -n 1) 15 | 16 | # Mac OS + Arm can report x86_64 17 | # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789 18 | ifeq ($(UNAME_S),Darwin) 19 | ifneq ($(UNAME_P),arm) 20 | SYSCTL_M := $(shell sysctl -n hw.optional.arm64) 21 | ifeq ($(SYSCTL_M),1) 22 | # UNAME_P := arm 23 | # UNAME_M := arm64 24 | warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789) 25 | endif 26 | endif 27 | endif 28 | 29 | # 30 | # Compile flags 31 | # 32 | 33 | CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC 34 | CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC 35 | LDFLAGS = 36 | 37 | # OS specific 38 | # TODO: support Windows 39 | ifeq ($(UNAME_S),Linux) 40 | CFLAGS += -pthread 41 | CXXFLAGS += -pthread 42 | endif 43 | ifeq ($(UNAME_S),Darwin) 44 | CFLAGS += -pthread 45 | CXXFLAGS += -pthread 46 | endif 47 | ifeq ($(UNAME_S),FreeBSD) 48 | CFLAGS += -pthread 49 | CXXFLAGS += -pthread 50 | endif 51 | ifeq ($(UNAME_S),NetBSD) 52 | CFLAGS += -pthread 53 | CXXFLAGS += -pthread 54 | endif 55 | ifeq ($(UNAME_S),Haiku) 56 | CFLAGS += -pthread 57 | CXXFLAGS += -pthread 58 | endif 59 | 60 | # Architecture specific 61 | # TODO: probably these flags need to be tweaked on some architectures 62 | # feel free to update the Makefile for your architecture and send a pull request or issue 63 | ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686)) 64 | ifeq ($(UNAME_S),Darwin) 65 | CFLAGS += -mf16c 66 | AVX1_M := $(shell sysctl machdep.cpu.features) 67 | ifneq (,$(findstring FMA,$(AVX1_M))) 68 | CFLAGS += -mfma 69 | endif 70 | ifneq (,$(findstring AVX1.0,$(AVX1_M))) 71 | CFLAGS += -mavx 72 | endif 73 | AVX2_M := $(shell sysctl machdep.cpu.leaf7_features) 74 | ifneq (,$(findstring AVX2,$(AVX2_M))) 75 | CFLAGS += -mavx2 76 | endif 77 | else ifeq ($(UNAME_S),Linux) 78 | AVX1_M := $(shell grep "avx " /proc/cpuinfo) 79 | ifneq (,$(findstring avx,$(AVX1_M))) 80 | CFLAGS += -mavx 81 | endif 82 | AVX2_M := $(shell grep "avx2 " /proc/cpuinfo) 83 | ifneq (,$(findstring avx2,$(AVX2_M))) 84 | CFLAGS += -mavx2 85 | endif 86 | FMA_M := $(shell grep "fma " /proc/cpuinfo) 87 | ifneq (,$(findstring fma,$(FMA_M))) 88 | CFLAGS += -mfma 89 | endif 90 | F16C_M := $(shell grep "f16c " /proc/cpuinfo) 91 | ifneq (,$(findstring f16c,$(F16C_M))) 92 | CFLAGS += -mf16c 93 | endif 94 | SSE3_M := $(shell grep "sse3 " /proc/cpuinfo) 95 | ifneq (,$(findstring sse3,$(SSE3_M))) 96 | CFLAGS += -msse3 97 | endif 98 | else ifeq ($(UNAME_S),Haiku) 99 | AVX1_M := $(shell sysinfo -cpu | grep "AVX ") 100 | ifneq (,$(findstring avx,$(AVX1_M))) 101 | CFLAGS += -mavx 102 | endif 103 | AVX2_M := $(shell sysinfo -cpu | grep "AVX2 ") 104 | ifneq (,$(findstring avx2,$(AVX2_M))) 105 | CFLAGS += -mavx2 106 | endif 107 | FMA_M := $(shell sysinfo -cpu | grep "FMA ") 108 | ifneq (,$(findstring fma,$(FMA_M))) 109 | CFLAGS += -mfma 110 | endif 111 | F16C_M := $(shell sysinfo -cpu | grep "F16C ") 112 | ifneq (,$(findstring 
f16c,$(F16C_M))) 113 | CFLAGS += -mf16c 114 | endif 115 | else 116 | CFLAGS += -mfma -mf16c -mavx -mavx2 117 | endif 118 | endif 119 | ifeq ($(UNAME_M),amd64) 120 | CFLAGS += -mavx -mavx2 -mfma -mf16c 121 | endif 122 | ifneq ($(filter ppc64%,$(UNAME_M)),) 123 | POWER9_M := $(shell grep "POWER9" /proc/cpuinfo) 124 | ifneq (,$(findstring POWER9,$(POWER9_M))) 125 | CFLAGS += -mpower9-vector 126 | endif 127 | # Require c++23's std::byteswap for big-endian support. 128 | ifeq ($(UNAME_M),ppc64) 129 | CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN 130 | endif 131 | endif 132 | ifndef LLAMA_NO_ACCELERATE 133 | # Mac M1 - include Accelerate framework 134 | ifeq ($(UNAME_S),Darwin) 135 | CFLAGS += -DGGML_USE_ACCELERATE 136 | LDFLAGS += -framework Accelerate 137 | endif 138 | endif 139 | ifdef LLAMA_OPENBLAS 140 | CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas 141 | LDFLAGS += -lopenblas 142 | endif 143 | ifdef LLAMA_GPROF 144 | CFLAGS += -pg 145 | CXXFLAGS += -pg 146 | endif 147 | ifneq ($(filter aarch64%,$(UNAME_M)),) 148 | CFLAGS += -mcpu=native 149 | CXXFLAGS += -mcpu=native 150 | endif 151 | ifneq ($(filter armv6%,$(UNAME_M)),) 152 | # Raspberry Pi 1, 2, 3 153 | CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access 154 | endif 155 | ifneq ($(filter armv7%,$(UNAME_M)),) 156 | # Raspberry Pi 4 157 | CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations 158 | endif 159 | ifneq ($(filter armv8%,$(UNAME_M)),) 160 | # Raspberry Pi 4 161 | CFLAGS += -mfp16-format=ieee -mno-unaligned-access 162 | endif 163 | 164 | # 165 | # Print build information 166 | # 167 | 168 | $(info I llama.cpp build info: ) 169 | $(info I UNAME_S: $(UNAME_S)) 170 | $(info I UNAME_P: $(UNAME_P)) 171 | $(info I UNAME_M: $(UNAME_M)) 172 | $(info I CFLAGS: $(CFLAGS)) 173 | $(info I CXXFLAGS: $(CXXFLAGS)) 174 | $(info I LDFLAGS: $(LDFLAGS)) 175 | $(info I CC: $(CCV)) 176 | $(info I CXX: $(CXXV)) 177 | $(info ) 178 | 179 | default: main quantize 180 | 181 | # 182 | # Build library 183 | # 184 | 185 | ggml.o: ggml.c ggml.h 186 | $(CC) $(CFLAGS) -c ggml.c -o ggml.o 187 | 188 | utils.o: utils.cpp utils.h 189 | $(CXX) $(CXXFLAGS) -c utils.cpp -o utils.o 190 | 191 | clean: 192 | rm -f *.o main quantize 193 | 194 | main: main.cpp ggml.o utils.o 195 | $(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o -o main $(LDFLAGS) 196 | ./main -h 197 | 198 | 199 | quantize: quantize.cpp ggml.o utils.o 200 | $(CXX) $(CXXFLAGS) quantize.cpp ggml.o utils.o -o quantize $(LDFLAGS) 201 | 202 | # 203 | # Tests 204 | # 205 | 206 | .PHONY: tests 207 | tests: 208 | bash ./tests/run-tests.sh 209 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Alpaca.cpp 2 | 3 | Run a fast ChatGPT-like model locally on your device. The screencast below is not sped up and running on an M2 Macbook Air with 4GB of weights. 4 | 5 | 6 | [![asciicast](screencast.gif)](https://asciinema.org/a/dfJ8QXZ4u978Ona59LPEldtKK) 7 | 8 | 9 | This combines the [LLaMA foundation model](https://github.com/facebookresearch/llama) with an [open reproduction](https://github.com/tloen/alpaca-lora) of [Stanford Alpaca](https://github.com/tatsu-lab/stanford_alpaca) a fine-tuning of the base model to obey instructions (akin to the [RLHF](https://huggingface.co/blog/rlhf) used to train ChatGPT) and a set of modifications to [llama.cpp](https://github.com/ggerganov/llama.cpp) to add a chat interface. 
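The Makefile shown earlier also honors a few optional variables (LLAMA_OPENBLAS, LLAMA_GPROF, LLAMA_NO_ACCELERATE). A hedged example of setting them on the command line, assuming that Makefile is used unmodified and that the corresponding libraries (e.g. OpenBLAS headers under /usr/local/include/openblas) are already installed:

```sh
# link ggml against OpenBLAS instead of the default code paths
make clean && make LLAMA_OPENBLAS=1

# build with gprof profiling instrumentation
make clean && make LLAMA_GPROF=1
```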
10 | 11 | ## Get started 12 | 13 | ```sh 14 | git clone https://github.com/antimatter15/alpaca.cpp 15 | cd alpaca.cpp 16 | 17 | make chat 18 | ./chat 19 | ``` 20 | 21 | You can download the weights for `ggml-alpaca-7b-q4.bin` with BitTorrent `magnet:?xt=urn:btih:5aaceaec63b03e51a98f04fd5c42320b2a033010&dn=ggml-alpaca-7b-q4.bin&tr=udp%3A%2F%2Ftracker.opentrackr.org%3A1337%2Fannounce&tr=udp%3A%2F%2Fopentracker.i2p.rocks%3A6969%2Fannounce` 22 | 23 | 24 | Alternatively you can download them with IPFS. 25 | 26 | ``` 27 | # any of these will work 28 | curl -o ggml-alpaca-7b-q4.bin -C - https://gateway.estuary.tech/gw/ipfs/QmQ1bf2BTnYxq73MFJWu1B7bQ2UD6qG7D7YDCxhTndVkPC 29 | curl -o ggml-alpaca-7b-q4.bin -C - https://ipfs.io/ipfs/QmQ1bf2BTnYxq73MFJWu1B7bQ2UD6qG7D7YDCxhTndVkPC 30 | curl -o ggml-alpaca-7b-q4.bin -C - https://cloudflare-ipfs.com/ipfs/QmQ1bf2BTnYxq73MFJWu1B7bQ2UD6qG7D7YDCxhTndVkPC 31 | ``` 32 | 33 | Save the `ggml-alpaca-7b-q4.bin` file in the same directory as your `./chat` executable. 34 | 35 | The weights are based on the published fine-tunes from `alpaca-lora`, converted back into a pytorch checkpoint with a [modified script](https://github.com/tloen/alpaca-lora/pull/19) and then quantized with llama.cpp the regular way. 36 | 37 | ## Windows Setup 38 | 39 | - Download and install CMake: 40 | - Download and install `git`. If you've never used git before, consider a GUI client like 41 | - Clone this repo using your git client of choice (for GitHub Desktop, go to File -> Clone repository -> From URL and paste `https://github.com/antimatter15/alpaca.cpp` in as the URL) 42 | - Open a Windows Terminal inside the folder you cloned the repository to 43 | - Run the following commands one by one: 44 | 45 | ```ps1 46 | cmake . 47 | cmake --build . --config Release 48 | ``` 49 | 50 | - Download the weights via any of the links in "Get started" above, and save the file as `ggml-alpaca-7b-q4.bin` in the main Alpaca directory. 51 | - In the terminal window, run this command: 52 | ```ps1 53 | .\Release\chat.exe 54 | ``` 55 | - (You can add other launch options like `--n 8` as preferred onto the same line) 56 | - You can now type to the AI in the terminal and it will reply. Enjoy! 57 | 58 | ## 13B 59 | 60 | TODO: write more docs here (PRs welcome) 61 | 62 | Torrent: `magnet:?xt=urn:btih:053b3d54d2e77ff020ebddf51dad681f2a651071&dn=ggml-alpaca-13b-q4.bin&tr=udp%3A%2F%2Ftracker.opentrackr.org%3A1337%2Fannounce&tr=udp%3A%2F%2Fopentracker.i2p.rocks%3A6969%2Fannounce&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A6969%2Fannounce&tr=udp%3A%2F%2F9.rarbg.com%3A2810%2Fannounce` 63 | 64 | 65 | ``` 66 | ./chat -m ggml-alpaca-13b-q4.bin 67 | ``` 68 | 69 | ## Credit 70 | 71 | This combines [Facebook's LLaMA](https://github.com/facebookresearch/llama), [Stanford Alpaca](https://crfm.stanford.edu/2023/03/13/alpaca.html), [alpaca-lora](https://github.com/tloen/alpaca-lora) and [corresponding weights](https://huggingface.co/tloen/alpaca-lora-7b/tree/main) by Eric Wang (which uses [Jason Phang's implementation of LLaMA](https://github.com/huggingface/transformers/pull/21955) on top of Hugging Face Transformers), and [llama.cpp](https://github.com/ggerganov/llama.cpp) by Georgi Gerganov. The chat implementation is based on Matvey Soloviev's [Interactive Mode](https://github.com/ggerganov/llama.cpp/pull/61) for llama.cpp. Inspired by [Simon Willison's](https://til.simonwillison.net/llms/llama-7b-m2) getting started guide for LLaMA. 
[Andy Matuschak](https://twitter.com/andy_matuschak/status/1636769182066053120)'s thread on adapting this to 13B, using fine tuning weights by [Sam Witteveen](https://huggingface.co/samwit/alpaca13B-lora). 72 | 73 | 74 | ## Disclaimer 75 | 76 | Note that the model weights are only to be used for research purposes, as they are derivative of LLaMA, and uses the published instruction data from the Stanford Alpaca project which is generated by OpenAI, which itself disallows the usage of its outputs to train competing models. 77 | 78 | 79 | -------------------------------------------------------------------------------- /convert-pth-to-ggml.py: -------------------------------------------------------------------------------- 1 | # Convert a LLaMA model checkpoint to a ggml compatible file 2 | # 3 | # Load the model using Torch 4 | # Iterate over all variables and write them to a binary file. 5 | # 6 | # For each variable, write the following: 7 | # - Number of dimensions (int) 8 | # - Name length (int) 9 | # - Dimensions (int[n_dims]) 10 | # - Name (char[name_length]) 11 | # - Data (float[n_dims]) 12 | # 13 | # By default, the bigger matrices are converted to 16-bit floats. 14 | # This can be disabled by adding the "use-f32" CLI argument. 15 | # 16 | # At the start of the ggml file we write the model parameters 17 | # and vocabulary. 18 | # 19 | 20 | import sys 21 | import json 22 | import struct 23 | import numpy as np 24 | import torch 25 | from sentencepiece import SentencePieceProcessor 26 | 27 | if len(sys.argv) < 3: 28 | print("Usage: convert-ckpt-to-ggml.py dir-model ftype\n") 29 | print(" ftype == 0 -> float32") 30 | print(" ftype == 1 -> float16") 31 | sys.exit(1) 32 | 33 | # output in the same directory as the model 34 | dir_model = sys.argv[1] 35 | 36 | fname_hparams = sys.argv[1] + "/params.json" 37 | fname_tokenizer = sys.argv[1] + "/../tokenizer.model" 38 | 39 | def get_n_parts(dim): 40 | if dim == 4096: 41 | return 1 42 | elif dim == 5120: 43 | return 2 44 | elif dim == 6656: 45 | return 4 46 | elif dim == 8192: 47 | return 8 48 | else: 49 | print("Invalid dim: " + str(dim)) 50 | sys.exit(1) 51 | 52 | # possible data types 53 | # ftype == 0 -> float32 54 | # ftype == 1 -> float16 55 | # 56 | # map from ftype to string 57 | ftype_str = ["f32", "f16"] 58 | 59 | ftype = 1 60 | if len(sys.argv) > 2: 61 | ftype = int(sys.argv[2]) 62 | if ftype < 0 or ftype > 1: 63 | print("Invalid ftype: " + str(ftype)) 64 | sys.exit(1) 65 | fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin" 66 | 67 | with open(fname_hparams, "r") as f: 68 | hparams = json.load(f) 69 | 70 | tokenizer = SentencePieceProcessor(fname_tokenizer) 71 | 72 | hparams.update({"vocab_size": tokenizer.vocab_size()}) 73 | 74 | n_parts = get_n_parts(hparams["dim"]) 75 | 76 | print(hparams) 77 | print('n_parts = ', n_parts) 78 | 79 | for p in range(n_parts): 80 | print('Processing part ', p) 81 | 82 | #fname_model = sys.argv[1] + "/consolidated.00.pth" 83 | fname_model = sys.argv[1] + "/consolidated.0" + str(p) + ".pth" 84 | fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin" 85 | if (p > 0): 86 | fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin" + "." 
+ str(p) 87 | 88 | model = torch.load(fname_model, map_location="cpu") 89 | 90 | fout = open(fname_out, "wb") 91 | 92 | fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex 93 | fout.write(struct.pack("i", hparams["vocab_size"])) 94 | fout.write(struct.pack("i", hparams["dim"])) 95 | fout.write(struct.pack("i", hparams["multiple_of"])) 96 | fout.write(struct.pack("i", hparams["n_heads"])) 97 | fout.write(struct.pack("i", hparams["n_layers"])) 98 | fout.write(struct.pack("i", hparams["dim"] // hparams["n_heads"])) # rot (obsolete) 99 | fout.write(struct.pack("i", ftype)) 100 | 101 | # Is this correct?? 102 | for i in range(tokenizer.vocab_size()): 103 | if tokenizer.is_unknown(i): 104 | # "" token (translated as ??) 105 | text = " \u2047 ".encode("utf-8") 106 | fout.write(struct.pack("i", len(text))) 107 | fout.write(text) 108 | elif tokenizer.is_control(i): 109 | # ""/"" tokens 110 | fout.write(struct.pack("i", 0)) 111 | elif tokenizer.is_byte(i): 112 | # "" tokens (which may be invalid UTF-8) 113 | piece = tokenizer.id_to_piece(i) 114 | if len(piece) != 6: 115 | print("Invalid token: " + piece) 116 | sys.exit(1) 117 | byte_value = int(piece[3:-1], 16) 118 | fout.write(struct.pack("i", 1)) 119 | fout.write(struct.pack("B", byte_value)) 120 | else: 121 | # normal token. Uses U+2581 (LOWER ONE EIGHTH BLOCK) to represent spaces. 122 | text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8") 123 | fout.write(struct.pack("i", len(text))) 124 | fout.write(text) 125 | 126 | for k, v in model.items(): 127 | name = k 128 | shape = v.shape 129 | 130 | # skip layers.X.attention.inner_attention.rope.freqs 131 | if name[-5:] == "freqs": 132 | continue 133 | 134 | print("Processing variable: " + name + " with shape: ", shape, " and type: ", v.dtype) 135 | 136 | #data = tf.train.load_variable(dir_model, name).squeeze() 137 | data = v.numpy().squeeze() 138 | n_dims = len(data.shape); 139 | 140 | # for efficiency - transpose some matrices 141 | # "model/h.*/attn/c_attn/w" 142 | # "model/h.*/attn/c_proj/w" 143 | # "model/h.*/mlp/c_fc/w" 144 | # "model/h.*/mlp/c_proj/w" 145 | #if name[-14:] == "/attn/c_attn/w" or \ 146 | # name[-14:] == "/attn/c_proj/w" or \ 147 | # name[-11:] == "/mlp/c_fc/w" or \ 148 | # name[-13:] == "/mlp/c_proj/w": 149 | # print(" Transposing") 150 | # data = data.transpose() 151 | 152 | dshape = data.shape 153 | 154 | # default type is fp16 155 | ftype_cur = 1 156 | if ftype == 0 or n_dims == 1: 157 | print(" Converting to float32") 158 | data = data.astype(np.float32) 159 | ftype_cur = 0 160 | 161 | # header 162 | sname = name.encode('utf-8') 163 | fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur)) 164 | for i in range(n_dims): 165 | fout.write(struct.pack("i", dshape[n_dims - 1 - i])) 166 | fout.write(sname); 167 | 168 | # data 169 | data.tofile(fout) 170 | 171 | # I hope this deallocates the memory .. 172 | model = None 173 | 174 | fout.close() 175 | 176 | print("Done. Output file: " + fname_out + ", (part ", p, ")") 177 | print("") 178 | -------------------------------------------------------------------------------- /ggml.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | // 4 | // GGML Tensor Library 5 | // 6 | // This documentation is still a work in progress. 
7 | // If you wish some specific topics to be covered, feel free to drop a comment: 8 | // 9 | // https://github.com/ggerganov/whisper.cpp/issues/40 10 | // 11 | // ## Overview 12 | // 13 | // This library implements: 14 | // 15 | // - a set of tensor operations 16 | // - automatic differentiation 17 | // - basic optimization algorithms 18 | // 19 | // The aim of this library is to provide a minimalistic approach for various machine learning tasks. This includes, 20 | // but is not limited to, the following: 21 | // 22 | // - linear regression 23 | // - support vector machines 24 | // - neural networks 25 | // 26 | // The library allows the user to define a certain function using the available tensor operations. This function 27 | // definition is represented internally via a computation graph. Each tensor operation in the function definition 28 | // corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the 29 | // function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized 30 | // using one of the available optimization algorithms. 31 | // 32 | // For example, here we define the function: f(x) = a*x^2 + b 33 | // 34 | // { 35 | // struct ggml_init_params params = { 36 | // .mem_size = 16*1024*1024, 37 | // .mem_buffer = NULL, 38 | // }; 39 | // 40 | // // memory allocation happens here 41 | // struct ggml_context * ctx = ggml_init(params); 42 | // 43 | // struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); 44 | // 45 | // ggml_set_param(ctx, x); // x is an input variable 46 | // 47 | // struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); 48 | // struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); 49 | // struct ggml_tensor * x2 = ggml_mul(ctx, x, x); 50 | // struct ggml_tensor * f = ggml_add(ctx, ggml_mul(ctx, a, x2), b); 51 | // 52 | // ... 53 | // } 54 | // 55 | // Notice that the function definition above does not involve any actual computation. The computation is performed only 56 | // when the user explicitly requests it. For example, to compute the function's value at x = 2.0: 57 | // 58 | // { 59 | // ... 60 | // 61 | // struct ggml_cgraph gf = ggml_build_forward(f); 62 | // 63 | // // set the input variable and parameter values 64 | // ggml_set_f32(x, 2.0f); 65 | // ggml_set_f32(a, 3.0f); 66 | // ggml_set_f32(b, 4.0f); 67 | // 68 | // ggml_graph_compute(ctx0, &gf); 69 | // 70 | // printf("f = %f\n", ggml_get_f32_1d(f, 0)); 71 | // 72 | // ... 73 | // } 74 | // 75 | // The actual computation is performed in the ggml_graph_compute() function. 76 | // 77 | // The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the 78 | // ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know 79 | // in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory 80 | // and after defining the computation graph, call the ggml_used_mem() function to find out how much memory was 81 | // actually needed. 82 | // 83 | // The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic 84 | // differentiation and optimization algorithms. 85 | // 86 | // The described approach allows to define the function graph once and then compute its forward or backward graphs 87 | // multiple times. 
All computations will use the same memory buffer allocated in the ggml_init() function. This way 88 | // the user can avoid the memory allocation overhead at runtime. 89 | // 90 | // The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class 91 | // citizens, but in theory the library can be extended to support FP8 and integer data types. 92 | // 93 | // Each tensor operation produces a new tensor. Initially the library was envisioned to support only the use of unary 94 | // and binary operations. Most of the available operations fall into one of these two categories. With time, it became 95 | // clear that the library needs to support more complex operations. The way to support these operations is not clear 96 | // yet, but a few examples are demonstrated in the following operations: 97 | // 98 | // - ggml_permute() 99 | // - ggml_conv_1d_1s() 100 | // - ggml_conv_1d_2s() 101 | // 102 | // For each tensor operator, the library implements a forward and backward computation function. The forward function 103 | // computes the output tensor value given the input tensor values. The backward function computes the adjoint of the 104 | // input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a 105 | // calculus class, or watch the following video: 106 | // 107 | // What is Automatic Differentiation? 108 | // https://www.youtube.com/watch?v=wG_nF1awSSY 109 | // 110 | // 111 | // ## Tensor data (struct ggml_tensor) 112 | // 113 | // The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of 114 | // the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains 115 | // pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example: 116 | // 117 | // { 118 | // struct ggml_tensor * c = ggml_add(ctx, a, b); 119 | // 120 | // assert(c->src[0] == a); 121 | // assert(c->src[1] == b); 122 | // } 123 | // 124 | // The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the 125 | // number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This allows 126 | // to store tensors that are not contiguous in memory, which is useful for operations such as transposition and 127 | // permutation. All tensor operations have to take the stride into account and not assume that the tensor is 128 | // contiguous in memory. 129 | // 130 | // The data of the tensor is accessed via the "data" pointer. For example: 131 | // 132 | // { 133 | // struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3); 134 | // 135 | // // a[1, 2] = 1.0f; 136 | // *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f; 137 | // 138 | // // a[2, 0] = 2.0f; 139 | // *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f; 140 | // 141 | // ... 142 | // } 143 | // 144 | // Alternatively, there are helper functions, such as ggml_get_f32_1d() and ggml_set_f32_1d() that can be used. 
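//
// As a hedged sketch (an addition to these notes, not part of the original text), the automatic differentiation
// described above can be driven with the functions declared later in this header. Continuing the earlier
// f(x) = a*x^2 + b example, where x was marked as an input variable with ggml_set_param(), the backward graph is
// derived from the forward graph, the output gradient is seeded, and the input gradient is read back. Only
// ggml_build_forward(), ggml_build_backward(), ggml_graph_compute(), ggml_graph_reset(), ggml_set_f32(),
// ggml_get_f32_1d() and the per-tensor "grad" pointer are used; the exact driving sequence is an assumption and
// may differ between versions.
//
// {
//     ...
//
//     struct ggml_cgraph gf = ggml_build_forward (f);
//     struct ggml_cgraph gb = ggml_build_backward(ctx, &gf, false);
//
//     ggml_set_f32(x, 2.0f);
//     ggml_set_f32(a, 3.0f);
//     ggml_set_f32(b, 4.0f);
//
//     ggml_graph_compute(ctx, &gf);        // forward pass: f = 3*2^2 + 4 = 16
//
//     ggml_graph_reset(&gf);               // zero all gradients in the graph
//     ggml_set_f32(f->grad, 1.0f);         // seed df/df = 1
//     ggml_graph_compute(ctx, &gb);        // backward pass
//
//     printf("df/dx = %f\n", ggml_get_f32_1d(x->grad, 0));   // expect 2*a*x = 12
//
//     ...
// }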
145 | // 146 | // ## The matrix multiplication operator (ggml_mul_mat) 147 | // 148 | // TODO 149 | // 150 | // 151 | // ## Multi-threading 152 | // 153 | // TODO 154 | // 155 | // 156 | // ## Overview of ggml.c 157 | // 158 | // TODO 159 | // 160 | // 161 | // ## SIMD optimizations 162 | // 163 | // TODO 164 | // 165 | // 166 | // ## Debugging ggml 167 | // 168 | // TODO 169 | // 170 | // 171 | 172 | #ifdef __cplusplus 173 | extern "C" { 174 | #endif 175 | 176 | #include 177 | #include 178 | #include 179 | 180 | #define GGML_MAX_DIMS 4 181 | #define GGML_MAX_NODES 4096 182 | #define GGML_MAX_PARAMS 16 183 | #define GGML_MAX_CONTEXTS 64 184 | #define GGML_MAX_OPT 4 185 | 186 | #ifdef __ARM_NEON 187 | // we use the built-in 16-bit float type 188 | typedef __fp16 ggml_fp16_t; 189 | #else 190 | typedef uint16_t ggml_fp16_t; 191 | #endif 192 | 193 | // convert FP16 <-> FP32 194 | float ggml_fp16_to_fp32(ggml_fp16_t x); 195 | ggml_fp16_t ggml_fp32_to_fp16(float x); 196 | 197 | struct ggml_object; 198 | struct ggml_context; 199 | 200 | enum ggml_type { 201 | GGML_TYPE_Q4_0, 202 | GGML_TYPE_Q4_1, 203 | GGML_TYPE_I8, 204 | GGML_TYPE_I16, 205 | GGML_TYPE_I32, 206 | GGML_TYPE_F16, 207 | GGML_TYPE_F32, 208 | GGML_TYPE_COUNT, 209 | }; 210 | 211 | // available tensor operations: 212 | enum ggml_op { 213 | GGML_OP_NONE = 0, 214 | 215 | GGML_OP_DUP, 216 | GGML_OP_ADD, 217 | GGML_OP_SUB, 218 | GGML_OP_MUL, 219 | GGML_OP_DIV, 220 | GGML_OP_SQR, 221 | GGML_OP_SQRT, 222 | GGML_OP_SUM, 223 | GGML_OP_MEAN, 224 | GGML_OP_REPEAT, 225 | GGML_OP_ABS, 226 | GGML_OP_SGN, 227 | GGML_OP_NEG, 228 | GGML_OP_STEP, 229 | GGML_OP_RELU, 230 | GGML_OP_GELU, 231 | GGML_OP_SILU, 232 | GGML_OP_NORM, // normalize 233 | GGML_OP_RMS_NORM, 234 | 235 | GGML_OP_MUL_MAT, 236 | 237 | GGML_OP_SCALE, 238 | GGML_OP_CPY, 239 | GGML_OP_RESHAPE, 240 | GGML_OP_VIEW, 241 | GGML_OP_PERMUTE, 242 | GGML_OP_TRANSPOSE, 243 | GGML_OP_GET_ROWS, 244 | GGML_OP_DIAG_MASK_INF, 245 | GGML_OP_SOFT_MAX, 246 | GGML_OP_ROPE, 247 | GGML_OP_CONV_1D_1S, 248 | GGML_OP_CONV_1D_2S, 249 | 250 | GGML_OP_FLASH_ATTN, 251 | GGML_OP_FLASH_FF, 252 | 253 | GGML_OP_COUNT, 254 | }; 255 | 256 | // n-dimensional tensor 257 | struct ggml_tensor { 258 | enum ggml_type type; 259 | 260 | int n_dims; 261 | int ne[GGML_MAX_DIMS]; // number of elements 262 | size_t nb[GGML_MAX_DIMS]; // stride in bytes: 263 | // nb[0] = sizeof(type) 264 | // nb[1] = nb[0] * ne[0] + padding 265 | // nb[i] = nb[i-1] * ne[i-1] 266 | 267 | // compute data 268 | enum ggml_op op; 269 | 270 | bool is_param; 271 | 272 | struct ggml_tensor * grad; 273 | struct ggml_tensor * src0; 274 | struct ggml_tensor * src1; 275 | struct ggml_tensor * opt[GGML_MAX_OPT]; 276 | 277 | // thread scheduling 278 | int n_tasks; 279 | 280 | // performance 281 | int perf_runs; 282 | int64_t perf_cycles; 283 | int64_t perf_time_us; 284 | 285 | void * data; 286 | char padding[8]; 287 | }; 288 | 289 | // computation graph 290 | struct ggml_cgraph { 291 | int n_nodes; 292 | int n_leafs; 293 | int n_threads; 294 | 295 | size_t work_size; 296 | struct ggml_tensor * work; 297 | 298 | struct ggml_tensor * nodes[GGML_MAX_NODES]; 299 | struct ggml_tensor * grads[GGML_MAX_NODES]; 300 | struct ggml_tensor * leafs[GGML_MAX_NODES]; 301 | 302 | // performance 303 | int perf_runs; 304 | int64_t perf_cycles; 305 | int64_t perf_time_us; 306 | }; 307 | 308 | // scratch buffer 309 | struct ggml_scratch { 310 | size_t offs; 311 | size_t size; 312 | void * data; 313 | }; 314 | 315 | struct ggml_init_params { 316 | // memory pool 317 | size_t mem_size; 
// bytes 318 | void * mem_buffer; // if NULL, memory will be allocated internally 319 | }; 320 | 321 | void ggml_time_init(void); // call this once at the beginning of the program 322 | int64_t ggml_time_ms(void); 323 | int64_t ggml_time_us(void); 324 | int64_t ggml_cycles(void); 325 | int64_t ggml_cycles_per_ms(void); 326 | 327 | void ggml_print_object (const struct ggml_object * obj); 328 | void ggml_print_objects(const struct ggml_context * ctx); 329 | 330 | int ggml_nelements(const struct ggml_tensor * tensor); 331 | size_t ggml_nbytes (const struct ggml_tensor * tensor); 332 | 333 | int ggml_blck_size (enum ggml_type type); 334 | size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block 335 | float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float 336 | 337 | size_t ggml_element_size(const struct ggml_tensor * tensor); 338 | 339 | struct ggml_context * ggml_init(struct ggml_init_params params); 340 | void ggml_free(struct ggml_context * ctx); 341 | 342 | size_t ggml_used_mem(const struct ggml_context * ctx); 343 | 344 | size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch); 345 | 346 | struct ggml_tensor * ggml_new_tensor( 347 | struct ggml_context * ctx, 348 | enum ggml_type type, 349 | int n_dims, 350 | const int *ne); 351 | 352 | struct ggml_tensor * ggml_new_tensor_1d( 353 | struct ggml_context * ctx, 354 | enum ggml_type type, 355 | int ne0); 356 | 357 | struct ggml_tensor * ggml_new_tensor_2d( 358 | struct ggml_context * ctx, 359 | enum ggml_type type, 360 | int ne0, 361 | int ne1); 362 | 363 | struct ggml_tensor * ggml_new_tensor_3d( 364 | struct ggml_context * ctx, 365 | enum ggml_type type, 366 | int ne0, 367 | int ne1, 368 | int ne2); 369 | 370 | struct ggml_tensor * ggml_new_tensor_4d( 371 | struct ggml_context * ctx, 372 | enum ggml_type type, 373 | int ne0, 374 | int ne1, 375 | int ne2, 376 | int ne3); 377 | 378 | struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value); 379 | struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value); 380 | 381 | struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src); 382 | struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src); 383 | 384 | struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); 385 | struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value); 386 | struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value); 387 | 388 | int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i); 389 | void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value); 390 | 391 | float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i); 392 | void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value); 393 | 394 | void * ggml_get_data (const struct ggml_tensor * tensor); 395 | float * ggml_get_data_f32(const struct ggml_tensor * tensor); 396 | 397 | // 398 | // operations on tensors with backpropagation 399 | // 400 | 401 | struct ggml_tensor * ggml_dup( 402 | struct ggml_context * ctx, 403 | struct ggml_tensor * a); 404 | 405 | struct ggml_tensor * ggml_add( 406 | struct ggml_context * ctx, 407 | struct ggml_tensor * a, 408 | struct ggml_tensor * b); 409 | 410 | struct ggml_tensor * ggml_sub( 411 | struct ggml_context * ctx, 412 | struct ggml_tensor * a, 413 | struct ggml_tensor * b); 414 | 415 | struct ggml_tensor * ggml_mul( 416 | struct 
ggml_context * ctx, 417 | struct ggml_tensor * a, 418 | struct ggml_tensor * b); 419 | 420 | struct ggml_tensor * ggml_div( 421 | struct ggml_context * ctx, 422 | struct ggml_tensor * a, 423 | struct ggml_tensor * b); 424 | 425 | struct ggml_tensor * ggml_sqr( 426 | struct ggml_context * ctx, 427 | struct ggml_tensor * a); 428 | 429 | struct ggml_tensor * ggml_sqrt( 430 | struct ggml_context * ctx, 431 | struct ggml_tensor * a); 432 | 433 | // return scalar 434 | // TODO: compute sum along rows 435 | struct ggml_tensor * ggml_sum( 436 | struct ggml_context * ctx, 437 | struct ggml_tensor * a); 438 | 439 | // mean along rows 440 | struct ggml_tensor * ggml_mean( 441 | struct ggml_context * ctx, 442 | struct ggml_tensor * a); 443 | 444 | // if a is the same shape as b, and a is not parameter, return a 445 | // otherwise, return a new tensor: repeat(a) to fit in b 446 | struct ggml_tensor * ggml_repeat( 447 | struct ggml_context * ctx, 448 | struct ggml_tensor * a, 449 | struct ggml_tensor * b); 450 | 451 | struct ggml_tensor * ggml_abs( 452 | struct ggml_context * ctx, 453 | struct ggml_tensor * a); 454 | 455 | struct ggml_tensor * ggml_sgn( 456 | struct ggml_context * ctx, 457 | struct ggml_tensor * a); 458 | 459 | struct ggml_tensor * ggml_neg( 460 | struct ggml_context * ctx, 461 | struct ggml_tensor * a); 462 | 463 | struct ggml_tensor * ggml_step( 464 | struct ggml_context * ctx, 465 | struct ggml_tensor * a); 466 | 467 | struct ggml_tensor * ggml_relu( 468 | struct ggml_context * ctx, 469 | struct ggml_tensor * a); 470 | 471 | // TODO: double-check this computation is correct 472 | struct ggml_tensor * ggml_gelu( 473 | struct ggml_context * ctx, 474 | struct ggml_tensor * a); 475 | 476 | struct ggml_tensor * ggml_silu( 477 | struct ggml_context * ctx, 478 | struct ggml_tensor * a); 479 | 480 | // normalize along rows 481 | // TODO: eps is hardcoded to 1e-5 for now 482 | struct ggml_tensor * ggml_norm( 483 | struct ggml_context * ctx, 484 | struct ggml_tensor * a); 485 | 486 | struct ggml_tensor * ggml_rms_norm( 487 | struct ggml_context * ctx, 488 | struct ggml_tensor * a); 489 | 490 | // A: m rows, n columns 491 | // B: p rows, n columns (i.e. 
we transpose it internally) 492 | // result is m columns, p rows 493 | struct ggml_tensor * ggml_mul_mat( 494 | struct ggml_context * ctx, 495 | struct ggml_tensor * a, 496 | struct ggml_tensor * b); 497 | 498 | // 499 | // operations on tensors without backpropagation 500 | // 501 | 502 | // in-place, returns view(a) 503 | struct ggml_tensor * ggml_scale( 504 | struct ggml_context * ctx, 505 | struct ggml_tensor * a, 506 | struct ggml_tensor * b); 507 | 508 | // a -> b, return view(b) 509 | struct ggml_tensor * ggml_cpy( 510 | struct ggml_context * ctx, 511 | struct ggml_tensor * a, 512 | struct ggml_tensor * b); 513 | 514 | // return view(a), b specifies the new shape 515 | // TODO: when we start computing gradient, make a copy instead of view 516 | struct ggml_tensor * ggml_reshape( 517 | struct ggml_context * ctx, 518 | struct ggml_tensor * a, 519 | struct ggml_tensor * b); 520 | 521 | // return view(a) 522 | // TODO: when we start computing gradient, make a copy instead of view 523 | struct ggml_tensor * ggml_reshape_2d( 524 | struct ggml_context * ctx, 525 | struct ggml_tensor * a, 526 | int ne0, 527 | int ne1); 528 | 529 | // return view(a) 530 | // TODO: when we start computing gradient, make a copy instead of view 531 | struct ggml_tensor * ggml_reshape_3d( 532 | struct ggml_context * ctx, 533 | struct ggml_tensor * a, 534 | int ne0, 535 | int ne1, 536 | int ne2); 537 | 538 | // offset in bytes 539 | struct ggml_tensor * ggml_view_1d( 540 | struct ggml_context * ctx, 541 | struct ggml_tensor * a, 542 | int ne0, 543 | size_t offset); 544 | 545 | struct ggml_tensor * ggml_view_2d( 546 | struct ggml_context * ctx, 547 | struct ggml_tensor * a, 548 | int ne0, 549 | int ne1, 550 | size_t nb1, // row stride in bytes 551 | size_t offset); 552 | 553 | struct ggml_tensor * ggml_permute( 554 | struct ggml_context * ctx, 555 | struct ggml_tensor * a, 556 | int axis0, 557 | int axis1, 558 | int axis2, 559 | int axis3); 560 | 561 | // alias for ggml_permute(ctx, a, 1, 0, 2, 3) 562 | struct ggml_tensor * ggml_transpose( 563 | struct ggml_context * ctx, 564 | struct ggml_tensor * a); 565 | 566 | struct ggml_tensor * ggml_get_rows( 567 | struct ggml_context * ctx, 568 | struct ggml_tensor * a, 569 | struct ggml_tensor * b); 570 | 571 | // set elements above the diagonal to -INF 572 | // in-place, returns view(a) 573 | struct ggml_tensor * ggml_diag_mask_inf( 574 | struct ggml_context * ctx, 575 | struct ggml_tensor * a, 576 | int n_past); 577 | 578 | // in-place, returns view(a) 579 | struct ggml_tensor * ggml_soft_max( 580 | struct ggml_context * ctx, 581 | struct ggml_tensor * a); 582 | 583 | // rotary position embedding 584 | // in-place, returns view(a) 585 | // if mode == 1, skip n_past elements 586 | // TODO: avoid creating a new tensor every time 587 | struct ggml_tensor * ggml_rope( 588 | struct ggml_context * ctx, 589 | struct ggml_tensor * a, 590 | int n_past, 591 | int n_dims, 592 | int mode); 593 | 594 | // padding = 1 595 | // TODO: we don't support extra parameters for now 596 | // that's why we are hard-coding the stride, padding, and dilation 597 | // not great .. 
598 | struct ggml_tensor * ggml_conv_1d_1s( 599 | struct ggml_context * ctx, 600 | struct ggml_tensor * a, 601 | struct ggml_tensor * b); 602 | 603 | struct ggml_tensor * ggml_conv_1d_2s( 604 | struct ggml_context * ctx, 605 | struct ggml_tensor * a, 606 | struct ggml_tensor * b); 607 | 608 | struct ggml_tensor * ggml_flash_attn( 609 | struct ggml_context * ctx, 610 | struct ggml_tensor * q, 611 | struct ggml_tensor * k, 612 | struct ggml_tensor * v, 613 | bool masked); 614 | 615 | struct ggml_tensor * ggml_flash_ff( 616 | struct ggml_context * ctx, 617 | struct ggml_tensor * a, 618 | struct ggml_tensor * b0, 619 | struct ggml_tensor * b1, 620 | struct ggml_tensor * c0, 621 | struct ggml_tensor * c1); 622 | 623 | // 624 | // automatic differentiation 625 | // 626 | 627 | void ggml_set_param( 628 | struct ggml_context * ctx, 629 | struct ggml_tensor * tensor); 630 | 631 | void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); 632 | 633 | struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor); 634 | struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep); 635 | 636 | void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph); 637 | void ggml_graph_reset (struct ggml_cgraph * cgraph); 638 | 639 | // print info and performance information for the graph 640 | void ggml_graph_print(const struct ggml_cgraph * cgraph); 641 | 642 | // dump the graph into a file using the dot format 643 | void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename); 644 | 645 | // 646 | // optimization 647 | // 648 | 649 | // optimization methods 650 | enum ggml_opt_type { 651 | GGML_OPT_ADAM, 652 | GGML_OPT_LBFGS, 653 | }; 654 | 655 | // linesearch methods 656 | enum ggml_linesearch { 657 | GGML_LINESEARCH_DEFAULT = 1, 658 | 659 | GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0, 660 | GGML_LINESEARCH_BACKTRACKING_WOLFE = 1, 661 | GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2, 662 | }; 663 | 664 | // optimization return values 665 | enum ggml_opt_result { 666 | GGML_OPT_OK = 0, 667 | GGML_OPT_DID_NOT_CONVERGE, 668 | GGML_OPT_NO_CONTEXT, 669 | GGML_OPT_INVALID_WOLFE, 670 | GGML_OPT_FAIL, 671 | 672 | GGML_LINESEARCH_FAIL = -128, 673 | GGML_LINESEARCH_MINIMUM_STEP, 674 | GGML_LINESEARCH_MAXIMUM_STEP, 675 | GGML_LINESEARCH_MAXIMUM_ITERATIONS, 676 | GGML_LINESEARCH_INVALID_PARAMETERS, 677 | }; 678 | 679 | // optimization parameters 680 | // 681 | // see ggml.c (ggml_opt_default_params) for default values 682 | // 683 | struct ggml_opt_params { 684 | enum ggml_opt_type type; 685 | 686 | int n_threads; 687 | 688 | // delta-based convergence test 689 | // 690 | // if past == 0 - disabled 691 | // if past > 0: 692 | // stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|) 693 | // 694 | int past; 695 | float delta; 696 | 697 | // maximum number of iterations without improvement 698 | // 699 | // if 0 - disabled 700 | // if > 0: 701 | // assume convergence if no cost improvement in this number of iterations 702 | // 703 | int max_no_improvement; 704 | 705 | bool print_forward_graph; 706 | bool print_backward_graph; 707 | 708 | // ADAM parameters 709 | struct { 710 | int n_iter; 711 | 712 | float alpha; // learning rate 713 | float beta1; 714 | float beta2; 715 | float eps; // epsilon for numerical stability 716 | float eps_f; // epsilon for convergence test 717 | float eps_g; // epsilon for convergence test 718 | } adam; 719 | 720 | // LBFGS parameters 721 | struct { 722 | 
int m; // number of corrections to approximate the inv. Hessian 723 | int n_iter; 724 | int max_linesearch; 725 | 726 | float eps; // convergence tolerance 727 | float ftol; // line search tolerance 728 | float wolfe; 729 | float min_step; 730 | float max_step; 731 | 732 | enum ggml_linesearch linesearch; 733 | } lbfgs; 734 | }; 735 | 736 | struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type); 737 | 738 | // optimize the function defined by the tensor f 739 | enum ggml_opt_result ggml_opt( 740 | struct ggml_context * ctx, 741 | struct ggml_opt_params params, 742 | struct ggml_tensor * f); 743 | 744 | // 745 | // system info 746 | // 747 | 748 | int ggml_cpu_has_avx(void); 749 | int ggml_cpu_has_avx2(void); 750 | int ggml_cpu_has_avx512(void); 751 | int ggml_cpu_has_fma(void); 752 | int ggml_cpu_has_neon(void); 753 | int ggml_cpu_has_arm_fma(void); 754 | int ggml_cpu_has_f16c(void); 755 | int ggml_cpu_has_fp16_va(void); 756 | int ggml_cpu_has_wasm_simd(void); 757 | int ggml_cpu_has_blas(void); 758 | int ggml_cpu_has_sse3(void); 759 | int ggml_cpu_has_vsx(void); 760 | 761 | #ifdef __cplusplus 762 | } 763 | #endif 764 | -------------------------------------------------------------------------------- /main.cpp: -------------------------------------------------------------------------------- 1 | #define NOMINMAX 2 | #include "ggml.h" 3 | 4 | #include "utils.h" 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) 16 | #include 17 | #include 18 | #elif defined (_WIN32) 19 | #include 20 | #include 21 | #endif 22 | 23 | #define ANSI_COLOR_RED "\x1b[31m" 24 | #define ANSI_COLOR_GREEN "\x1b[32m" 25 | #define ANSI_COLOR_YELLOW "\x1b[33m" 26 | #define ANSI_COLOR_BLUE "\x1b[34m" 27 | #define ANSI_COLOR_MAGENTA "\x1b[35m" 28 | #define ANSI_COLOR_CYAN "\x1b[36m" 29 | #define ANSI_COLOR_RESET "\x1b[0m" 30 | #define ANSI_BOLD "\x1b[1m" 31 | 32 | // determine number of model parts based on the dimension 33 | static const std::map LLAMA_N_PARTS = { 34 | { 4096, 1 }, 35 | { 5120, 1 }, 36 | { 6656, 1 }, 37 | { 8192, 1 }, 38 | }; 39 | 40 | // default hparams (LLaMA 7B) 41 | struct llama_hparams { 42 | int32_t n_vocab = 32000; 43 | int32_t n_ctx = 512; // this is provided as user input? 
44 | int32_t n_embd = 4096; 45 | int32_t n_mult = 256; 46 | int32_t n_head = 32; 47 | int32_t n_layer = 32; 48 | int32_t n_rot = 64; 49 | int32_t f16 = 1; 50 | }; 51 | 52 | struct llama_layer { 53 | // normalization 54 | struct ggml_tensor * attention_norm; 55 | 56 | // attention 57 | struct ggml_tensor * wq; 58 | struct ggml_tensor * wk; 59 | struct ggml_tensor * wv; 60 | struct ggml_tensor * wo; 61 | 62 | // normalization 63 | struct ggml_tensor * ffn_norm; 64 | 65 | // ff 66 | struct ggml_tensor * w1; 67 | struct ggml_tensor * w2; 68 | struct ggml_tensor * w3; 69 | }; 70 | 71 | struct llama_model { 72 | llama_hparams hparams; 73 | 74 | struct ggml_tensor * tok_embeddings; 75 | 76 | struct ggml_tensor * norm; 77 | struct ggml_tensor * output; 78 | 79 | std::vector layers; 80 | 81 | // key + value memory 82 | struct ggml_tensor * memory_k; 83 | struct ggml_tensor * memory_v; 84 | 85 | // 86 | struct ggml_context * ctx; 87 | std::map tensors; 88 | }; 89 | 90 | // load the model's weights from a file 91 | bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) { 92 | fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); 93 | 94 | std::vector f_buf(1024*1024); 95 | 96 | auto fin = std::ifstream(fname, std::ios::binary); 97 | fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size()); 98 | if (!fin) { 99 | fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); 100 | return false; 101 | } 102 | 103 | // verify magic 104 | { 105 | uint32_t magic; 106 | fin.read((char *) &magic, sizeof(magic)); 107 | if (magic != 0x67676d6c) { 108 | fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); 109 | return false; 110 | } 111 | } 112 | 113 | int n_ff = 0; 114 | int n_parts = 0; 115 | 116 | // load hparams 117 | { 118 | auto & hparams = model.hparams; 119 | 120 | fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); 121 | //fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); 122 | fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); 123 | fin.read((char *) &hparams.n_mult, sizeof(hparams.n_mult)); 124 | fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); 125 | fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); 126 | fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot)); 127 | fin.read((char *) &hparams.f16, sizeof(hparams.f16)); 128 | 129 | hparams.n_ctx = n_ctx; 130 | 131 | n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult; 132 | n_parts = LLAMA_N_PARTS.at(hparams.n_embd); 133 | 134 | // fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab); 135 | // fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx); 136 | // fprintf(stderr, "%s: n_embd = %d\n", __func__, hparams.n_embd); 137 | // fprintf(stderr, "%s: n_mult = %d\n", __func__, hparams.n_mult); 138 | // fprintf(stderr, "%s: n_head = %d\n", __func__, hparams.n_head); 139 | // fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer); 140 | // fprintf(stderr, "%s: n_rot = %d\n", __func__, hparams.n_rot); 141 | // fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16); 142 | // fprintf(stderr, "%s: n_ff = %d\n", __func__, n_ff); 143 | // fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts); 144 | } 145 | 146 | // load vocab 147 | { 148 | const int32_t n_vocab = model.hparams.n_vocab; 149 | 150 | if (n_vocab != model.hparams.n_vocab) { 151 | fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", 152 | 
__func__, fname.c_str(), n_vocab, model.hparams.n_vocab); 153 | return false; 154 | } 155 | 156 | std::string word; 157 | for (int i = 0; i < n_vocab; i++) { 158 | uint32_t len; 159 | fin.read((char *) &len, sizeof(len)); 160 | 161 | word.resize(len); 162 | fin.read((char *) word.data(), len); 163 | 164 | vocab.token_to_id[word] = i; 165 | vocab.id_to_token[i] = word; 166 | 167 | //if (i < 30000) { 168 | // fprintf(stderr, "%s: vocab[%d] = '%s'\n", __func__, i, word.c_str()); 169 | //} 170 | } 171 | } 172 | 173 | // for the big tensors, we have the option to store the data in 16-bit floats or quantized 174 | // in order to save memory and also to speed up the computation 175 | ggml_type wtype = GGML_TYPE_COUNT; 176 | switch (model.hparams.f16) { 177 | case 0: wtype = GGML_TYPE_F32; break; 178 | case 1: wtype = GGML_TYPE_F16; break; 179 | case 2: wtype = GGML_TYPE_Q4_0; break; 180 | case 3: wtype = GGML_TYPE_Q4_1; break; 181 | default: 182 | { 183 | fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n", 184 | __func__, fname.c_str(), model.hparams.f16); 185 | return false; 186 | } 187 | } 188 | 189 | const ggml_type wtype2 = GGML_TYPE_F32; 190 | 191 | auto & ctx = model.ctx; 192 | 193 | size_t ctx_size = 0; 194 | 195 | { 196 | const auto & hparams = model.hparams; 197 | 198 | const int n_embd = hparams.n_embd; 199 | const int n_layer = hparams.n_layer; 200 | const int n_ctx = hparams.n_ctx; 201 | const int n_vocab = hparams.n_vocab; 202 | 203 | ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // tok_embeddings 204 | 205 | ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // norm 206 | 207 | ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // output 208 | 209 | ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // attention_norm 210 | 211 | ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wq 212 | ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wk 213 | ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wv 214 | ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wo 215 | 216 | ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ffn_norm 217 | 218 | ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w1 219 | ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2 220 | ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3 221 | 222 | ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k 223 | ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v 224 | 225 | ctx_size += (5 + 10*n_layer)*256; // object overhead 226 | 227 | fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); 228 | } 229 | 230 | // create the ggml context 231 | { 232 | struct ggml_init_params params = { 233 | /*.mem_size =*/ ctx_size, 234 | /*.mem_buffer =*/ NULL, 235 | }; 236 | 237 | model.ctx = ggml_init(params); 238 | if (!model.ctx) { 239 | fprintf(stderr, "%s: ggml_init() failed\n", __func__); 240 | return false; 241 | } 242 | } 243 | 244 | // prepare memory for the weights 245 | { 246 | const auto & hparams = model.hparams; 247 | 248 | const int n_embd = hparams.n_embd; 249 | const int n_layer = hparams.n_layer; 250 | const int n_ctx = hparams.n_ctx; 251 | const int n_vocab = hparams.n_vocab; 252 | 253 | model.layers.resize(n_layer); 254 | 255 | model.tok_embeddings = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); 256 | 257 | model.norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); 258 | model.output = 
ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); 259 | 260 | // map by name 261 | model.tensors["tok_embeddings.weight"] = model.tok_embeddings; 262 | 263 | model.tensors["norm.weight"] = model.norm; 264 | model.tensors["output.weight"] = model.output; 265 | 266 | for (int i = 0; i < n_layer; ++i) { 267 | auto & layer = model.layers[i]; 268 | 269 | layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); 270 | 271 | layer.wq = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); 272 | layer.wk = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); 273 | layer.wv = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); 274 | layer.wo = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); 275 | 276 | layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); 277 | 278 | layer.w1 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff); 279 | layer.w2 = ggml_new_tensor_2d(ctx, wtype, n_ff, n_embd); 280 | layer.w3 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff); 281 | 282 | // map by name 283 | model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = layer.attention_norm; 284 | 285 | model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = layer.wq; 286 | model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = layer.wk; 287 | model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = layer.wv; 288 | model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = layer.wo; 289 | 290 | model.tensors["layers." + std::to_string(i) + ".ffn_norm.weight"] = layer.ffn_norm; 291 | 292 | model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = layer.w1; 293 | model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = layer.w2; 294 | model.tensors["layers." + std::to_string(i) + ".feed_forward.w3.weight"] = layer.w3; 295 | } 296 | } 297 | 298 | // key + value memory 299 | { 300 | const auto & hparams = model.hparams; 301 | 302 | const int n_embd = hparams.n_embd; 303 | const int n_layer = hparams.n_layer; 304 | const int n_ctx = hparams.n_ctx; 305 | 306 | const int n_mem = n_layer*n_ctx; 307 | const int n_elements = n_embd*n_mem; 308 | 309 | model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); 310 | model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); 311 | 312 | const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); 313 | 314 | fprintf(stderr, "%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); 315 | } 316 | 317 | const size_t file_offset = fin.tellg(); 318 | 319 | fin.close(); 320 | 321 | std::vector tmp; 322 | 323 | for (int i = 0; i < n_parts; ++i) { 324 | const int part_id = i; 325 | //const int part_id = n_parts - i - 1; 326 | 327 | std::string fname_part = fname; 328 | if (i > 0) { 329 | fname_part += "." 
+ std::to_string(i); 330 | } 331 | 332 | fprintf(stderr, "%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str()); 333 | 334 | fin = std::ifstream(fname_part, std::ios::binary); 335 | fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size()); 336 | fin.seekg(file_offset); 337 | 338 | // load weights 339 | { 340 | int n_tensors = 0; 341 | size_t total_size = 0; 342 | 343 | fprintf(stderr, "%s: ", __func__); 344 | 345 | while (true) { 346 | int32_t n_dims; 347 | int32_t length; 348 | int32_t ftype; 349 | 350 | fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); 351 | fin.read(reinterpret_cast(&length), sizeof(length)); 352 | fin.read(reinterpret_cast(&ftype), sizeof(ftype)); 353 | 354 | if (fin.eof()) { 355 | break; 356 | } 357 | 358 | int32_t nelements = 1; 359 | int32_t ne[2] = { 1, 1 }; 360 | for (int i = 0; i < n_dims; ++i) { 361 | fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); 362 | nelements *= ne[i]; 363 | } 364 | 365 | std::string name(length, 0); 366 | fin.read(&name[0], length); 367 | 368 | if (model.tensors.find(name.data()) == model.tensors.end()) { 369 | fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data()); 370 | return false; 371 | } 372 | 373 | // split_type = 0: split by columns 374 | // split_type = 1: split by rows 375 | int split_type = 0; 376 | 377 | // split_type = 0: 378 | // regex: 379 | // - tok_embeddings.* 380 | // - layers.*.attention.wo.weight 381 | // - layers.*.feed_forward.w2.weight 382 | 383 | // split_type = 1: 384 | // regex: 385 | // - output.* 386 | // - layers.*.attention.wq.weight 387 | // - layers.*.attention.wk.weight 388 | // - layers.*.attention.wv.weight 389 | // - layers.*.feed_forward.w1.weight 390 | // - layers.*.feed_forward.w3.weight 391 | if (name.find("tok_embeddings") != std::string::npos) { 392 | split_type = 0; 393 | } else if (name.find("layers") != std::string::npos) { 394 | if (name.find("attention.wo.weight") != std::string::npos) { 395 | split_type = 0; 396 | } else if (name.find("feed_forward.w2.weight") != std::string::npos) { 397 | split_type = 0; 398 | } else { 399 | split_type = 1; 400 | } 401 | } else if (name.find("output") != std::string::npos) { 402 | split_type = 1; 403 | } 404 | 405 | auto tensor = model.tensors[name.data()]; 406 | 407 | if (n_dims == 1) { 408 | if (ggml_nelements(tensor) != nelements) { 409 | fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); 410 | return false; 411 | } 412 | } else { 413 | if (ggml_nelements(tensor)/n_parts != nelements) { 414 | fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); 415 | return false; 416 | } 417 | } 418 | 419 | if (n_dims == 1) { 420 | if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { 421 | fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", 422 | __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]); 423 | return false; 424 | } 425 | } else { 426 | if (split_type == 0) { 427 | if (tensor->ne[0]/n_parts != ne[0] || tensor->ne[1] != ne[1]) { 428 | fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", 429 | __func__, name.data(), tensor->ne[0]/n_parts, tensor->ne[1], ne[0], ne[1]); 430 | return false; 431 | } 432 | } else { 433 | if (tensor->ne[0] != ne[0] || tensor->ne[1]/n_parts != ne[1]) { 434 | fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", 435 | __func__, name.data(), 
tensor->ne[0], tensor->ne[1]/n_parts, ne[0], ne[1]); 436 | return false; 437 | } 438 | } 439 | } 440 | 441 | if (0) { 442 | static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", }; 443 | fprintf(stderr, "%24s - [%5d, %5d], type = %6s, split = %d\n", name.data(), ne[0], ne[1], ftype_str[ftype], split_type); 444 | } 445 | 446 | size_t bpe = 0; 447 | 448 | switch (ftype) { 449 | case 0: bpe = ggml_type_size(GGML_TYPE_F32); break; 450 | case 1: bpe = ggml_type_size(GGML_TYPE_F16); break; 451 | case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break; 452 | case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break; 453 | default: 454 | { 455 | fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype); 456 | return false; 457 | } 458 | }; 459 | 460 | if (n_dims == 1 || n_parts == 1) { 461 | if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { 462 | fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", 463 | __func__, name.data(), ggml_nbytes(tensor), nelements*bpe); 464 | return false; 465 | } 466 | 467 | if (part_id == 0) { 468 | fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); 469 | } else { 470 | fin.seekg(ggml_nbytes(tensor), std::ios::cur); 471 | } 472 | 473 | total_size += ggml_nbytes(tensor); 474 | } else { 475 | if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)/n_parts) { 476 | fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", 477 | __func__, name.data(), ggml_nbytes(tensor)/n_parts, nelements*bpe); 478 | return false; 479 | } 480 | 481 | if (split_type == 0) { 482 | const int np0 = ne[0]; 483 | 484 | const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type); 485 | assert(row_size == tensor->nb[1]); 486 | 487 | for (int i1 = 0; i1 < ne[1]; ++i1) { 488 | const size_t offset_row = i1*row_size; 489 | const size_t offset = offset_row + ((part_id*np0)/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type); 490 | fin.read(reinterpret_cast(tensor->data) + offset, row_size/n_parts); 491 | } 492 | } else { 493 | const int np1 = ne[1]; 494 | 495 | const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type); 496 | 497 | for (int i1 = 0; i1 < ne[1]; ++i1) { 498 | const size_t offset_row = (i1 + part_id*np1)*row_size; 499 | fin.read(reinterpret_cast(tensor->data) + offset_row, row_size); 500 | } 501 | } 502 | 503 | total_size += ggml_nbytes(tensor)/n_parts; 504 | } 505 | 506 | //fprintf(stderr, "%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0); 507 | if (++n_tensors % 8 == 0) { 508 | fprintf(stderr, "."); 509 | fflush(stderr); 510 | } 511 | } 512 | 513 | fprintf(stderr, " done\n"); 514 | 515 | fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors); 516 | } 517 | 518 | fin.close(); 519 | } 520 | 521 | return true; 522 | } 523 | 524 | // evaluate the transformer 525 | // 526 | // - model: the model 527 | // - n_threads: number of threads to use 528 | // - n_past: the context size so far 529 | // - embd_inp: the embeddings of the tokens in the context 530 | // - embd_w: the predicted logits for the next token 531 | // 532 | // The GPT-J model requires about 16MB of memory per input token. 
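//
// A minimal usage sketch (illustrative only; it mirrors how main() drives this
// function further down in this file, and the local variable names are just
// for the example):
//
//   std::vector<gpt_vocab::id> embd = ::llama_tokenize(vocab, params.prompt, true);
//   std::vector<float> logits;
//   size_t mem_per_token = 0;
//
//   // warm-up call on a few dummy tokens to measure memory use per token
//   llama_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
//
//   // actual evaluation; afterwards logits holds n_vocab values for the last token
//   if (!llama_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
//       fprintf(stderr, "failed to predict\n");
//   }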
533 | // 534 | bool llama_eval( 535 | const llama_model & model, 536 | const int n_threads, 537 | const int n_past, 538 | const std::vector & embd_inp, 539 | std::vector & embd_w, 540 | size_t & mem_per_token) { 541 | const int N = embd_inp.size(); 542 | 543 | const auto & hparams = model.hparams; 544 | 545 | const int n_embd = hparams.n_embd; 546 | const int n_layer = hparams.n_layer; 547 | const int n_ctx = hparams.n_ctx; 548 | const int n_head = hparams.n_head; 549 | const int n_vocab = hparams.n_vocab; 550 | const int n_rot = hparams.n_embd/hparams.n_head; 551 | 552 | const int d_key = n_embd/n_head; 553 | 554 | // TODO: check if this size scales with n_ctx linearly and remove constant. somehow I feel it wasn't the case 555 | // static size_t buf_size = hparams.n_ctx*1024*1024; 556 | static size_t buf_size = 512u*1024*1024; 557 | static void * buf = malloc(buf_size); 558 | 559 | if (mem_per_token > 0 && mem_per_token*N > buf_size) { 560 | const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead 561 | //fprintf(stderr, "\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); 562 | 563 | // reallocate 564 | buf_size = buf_size_new; 565 | buf = realloc(buf, buf_size); 566 | if (buf == nullptr) { 567 | fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); 568 | return false; 569 | } 570 | } 571 | 572 | struct ggml_init_params params = { 573 | /*.mem_size =*/ buf_size, 574 | /*.mem_buffer =*/ buf, 575 | }; 576 | 577 | struct ggml_context * ctx0 = ggml_init(params); 578 | ggml_cgraph gf = {}; 579 | gf.n_threads = n_threads; 580 | 581 | struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); 582 | memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); 583 | 584 | struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd); 585 | 586 | for (int il = 0; il < n_layer; ++il) { 587 | struct ggml_tensor * inpSA = inpL; 588 | 589 | struct ggml_tensor * cur; 590 | 591 | // norm 592 | { 593 | cur = ggml_rms_norm(ctx0, inpL); 594 | 595 | // cur = attention_norm*cur 596 | cur = ggml_mul(ctx0, 597 | ggml_repeat(ctx0, model.layers[il].attention_norm, cur), 598 | cur); 599 | } 600 | 601 | // self-attention 602 | { 603 | struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); 604 | struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); 605 | struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); 606 | 607 | // store key and value to memory 608 | if (N >= 1) { 609 | struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); 610 | struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); 611 | 612 | ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); 613 | ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); 614 | } 615 | 616 | // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) 617 | struct ggml_tensor * Q = 618 | ggml_permute(ctx0, 619 | ggml_rope(ctx0, 620 | ggml_cpy(ctx0, 621 | Qcur, 622 | ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), 623 | n_past, n_rot, 0), 624 | 0, 2, 1, 3); 625 | 626 | // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) 627 | struct ggml_tensor * K = 628 | ggml_permute(ctx0, 629 | ggml_rope(ctx0, 630 | ggml_reshape_3d(ctx0, 631 | ggml_view_1d(ctx0, model.memory_k, (n_past + 
N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), 632 | n_embd/n_head, n_head, n_past + N), 633 | n_past, n_rot, 1), 634 | 0, 2, 1, 3); 635 | 636 | // K * Q 637 | struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); 638 | 639 | // KQ_scaled = KQ / sqrt(n_embd/n_head) 640 | struct ggml_tensor * KQ_scaled = 641 | ggml_scale(ctx0, 642 | KQ, 643 | ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) 644 | ); 645 | 646 | // KQ_masked = mask_past(KQ_scaled) 647 | struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); 648 | 649 | // KQ = soft_max(KQ_masked) 650 | struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); 651 | 652 | // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() 653 | struct ggml_tensor * V_trans = 654 | ggml_permute(ctx0, 655 | ggml_reshape_3d(ctx0, 656 | ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), 657 | n_embd/n_head, n_head, n_past + N), 658 | 1, 2, 0, 3); 659 | 660 | // KQV = transpose(V) * KQ_soft_max 661 | struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); 662 | 663 | // KQV_merged = KQV.permute(0, 2, 1, 3) 664 | struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); 665 | 666 | // cur = KQV_merged.contiguous().view(n_embd, N) 667 | cur = ggml_cpy(ctx0, 668 | KQV_merged, 669 | ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); 670 | 671 | // projection (no bias) 672 | cur = ggml_mul_mat(ctx0, 673 | model.layers[il].wo, 674 | cur); 675 | } 676 | 677 | struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); 678 | 679 | // feed-forward network 680 | { 681 | // norm 682 | { 683 | cur = ggml_rms_norm(ctx0, inpFF); 684 | 685 | // cur = ffn_norm*cur 686 | cur = ggml_mul(ctx0, 687 | ggml_repeat(ctx0, model.layers[il].ffn_norm, cur), 688 | cur); 689 | } 690 | 691 | struct ggml_tensor * tmp = ggml_mul_mat(ctx0, 692 | model.layers[il].w3, 693 | cur); 694 | 695 | 696 | cur = ggml_mul_mat(ctx0, 697 | model.layers[il].w1, 698 | cur); 699 | 700 | // SILU activation 701 | cur = ggml_silu(ctx0, cur); 702 | 703 | cur = ggml_mul(ctx0, cur, tmp); 704 | 705 | cur = ggml_mul_mat(ctx0, 706 | model.layers[il].w2, 707 | cur); 708 | } 709 | 710 | cur = ggml_add(ctx0, cur, inpFF); 711 | 712 | // input for next layer 713 | inpL = cur; 714 | } 715 | 716 | // norm 717 | { 718 | inpL = ggml_rms_norm(ctx0, inpL); 719 | 720 | // inpL = norm*inpL 721 | inpL = ggml_mul(ctx0, 722 | ggml_repeat(ctx0, model.norm, inpL), 723 | inpL); 724 | } 725 | 726 | // lm_head 727 | { 728 | inpL = ggml_mul_mat(ctx0, model.output, inpL); 729 | } 730 | 731 | // logits -> probs 732 | //inpL = ggml_soft_max(ctx0, inpL); 733 | 734 | // run the computation 735 | ggml_build_forward_expand(&gf, inpL); 736 | ggml_graph_compute (ctx0, &gf); 737 | 738 | //if (n_past%100 == 0) { 739 | // ggml_graph_print (&gf); 740 | // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); 741 | //} 742 | 743 | //embd_w.resize(n_vocab*N); 744 | //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); 745 | 746 | // return result for just the last token 747 | embd_w.resize(n_vocab); 748 | memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); 749 | 750 | if (mem_per_token == 0) { 751 | mem_per_token = ggml_used_mem(ctx0)/N; 752 | } 753 | //fprintf(stderr, "used_mem = %zu\n", ggml_used_mem(ctx0)); 754 | 755 | ggml_free(ctx0); 756 | 757 | return true; 758 | } 759 | 760 | static bool is_interacting = false; 761 | 762 | #if 
defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) 763 | void sigint_handler(int signo) { 764 | printf(ANSI_COLOR_RESET); 765 | if (signo == SIGINT) { 766 | if (!is_interacting) { 767 | is_interacting=true; 768 | } else { 769 | _exit(130); 770 | } 771 | } 772 | } 773 | #endif 774 | 775 | const char * llama_print_system_info(void) { 776 | static std::string s; 777 | 778 | s = ""; 779 | s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | "; 780 | s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | "; 781 | s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | "; 782 | s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | "; 783 | s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | "; 784 | s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | "; 785 | s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | "; 786 | s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | "; 787 | s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | "; 788 | s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | "; 789 | s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | "; 790 | s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; 791 | 792 | return s.c_str(); 793 | } 794 | 795 | int main(int argc, char ** argv) { 796 | ggml_time_init(); 797 | const int64_t t_main_start_us = ggml_time_us(); 798 | 799 | gpt_params params; 800 | 801 | params.temp = 0.1f; 802 | params.top_p = 0.95f; 803 | params.n_ctx = 2048; 804 | params.model = "ggml-alpaca-7b-q4.bin"; 805 | 806 | if (gpt_params_parse(argc, argv, params) == false) { 807 | return 1; 808 | } 809 | 810 | if (params.seed < 0) { 811 | params.seed = time(NULL); 812 | } 813 | 814 | fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); 815 | 816 | std::mt19937 rng(params.seed); 817 | // if (params.prompt.empty()) { 818 | // params.prompt = gpt_random_prompt(rng); 819 | // } 820 | 821 | // params.prompt = R"(// this function checks if the number n is prime 822 | //bool is_prime(int n) {)"; 823 | 824 | int64_t t_load_us = 0; 825 | 826 | gpt_vocab vocab; 827 | llama_model model; 828 | 829 | // load the model 830 | { 831 | const int64_t t_start_us = ggml_time_us(); 832 | if (!llama_model_load(params.model, model, vocab, params.n_ctx)) { 833 | fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); 834 | return 1; 835 | } 836 | 837 | t_load_us = ggml_time_us() - t_start_us; 838 | } 839 | 840 | // print system information 841 | { 842 | fprintf(stderr, "\n"); 843 | fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", 844 | params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); 845 | } 846 | 847 | int n_past = 0; 848 | 849 | int64_t t_sample_us = 0; 850 | int64_t t_predict_us = 0; 851 | 852 | std::vector logits; 853 | 854 | // Add a space in front of the first character to match OG llama tokenizer behavior 855 | // params.prompt.insert(0, 1, ' '); 856 | // tokenize the prompt 857 | std::vector embd_inp = ::llama_tokenize(vocab, params.prompt, true); 858 | 859 | params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); 860 | 861 | 862 | // tokenize the reverse prompt 863 | std::vector antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false); 864 | 865 | fprintf(stderr, "\n"); 866 | fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); 867 | fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, 
embd_inp.size()); 868 | for (int i = 0; i < (int) embd_inp.size(); i++) { 869 | fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str()); 870 | } 871 | fprintf(stderr, "\n"); 872 | 873 | if (params.interactive) { 874 | #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) 875 | struct sigaction sigint_action; 876 | sigint_action.sa_handler = sigint_handler; 877 | sigemptyset (&sigint_action.sa_mask); 878 | sigint_action.sa_flags = 0; 879 | sigaction(SIGINT, &sigint_action, NULL); 880 | #elif defined (_WIN32) 881 | signal(SIGINT, sigint_handler); 882 | 883 | // Windows console ANSI color fix 884 | DWORD mode = 0; 885 | HANDLE hConsole = GetStdHandle(STD_OUTPUT_HANDLE); 886 | if (hConsole && hConsole != INVALID_HANDLE_VALUE && GetConsoleMode(hConsole, &mode)) 887 | SetConsoleMode(hConsole, mode | ENABLE_VIRTUAL_TERMINAL_PROCESSING); 888 | #endif 889 | 890 | fprintf(stderr, "%s: interactive mode on.\n", __func__); 891 | 892 | if(antiprompt_inp.size()) { 893 | fprintf(stderr, "%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str()); 894 | fprintf(stderr, "%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size()); 895 | for (int i = 0; i < (int) antiprompt_inp.size(); i++) { 896 | fprintf(stderr, "%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str()); 897 | } 898 | fprintf(stderr, "\n"); 899 | } 900 | } 901 | fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty); 902 | fprintf(stderr, "\n\n"); 903 | 904 | std::vector embd; 905 | 906 | // determine the required inference memory per token: 907 | size_t mem_per_token = 0; 908 | llama_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); 909 | 910 | int last_n_size = params.repeat_last_n; 911 | std::vector last_n_tokens(last_n_size); 912 | std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); 913 | 914 | 915 | if (params.interactive) { 916 | fprintf(stderr, "== Running in chat mode. 
==\n" 917 | #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) 918 | " - Press Ctrl+C to interject at any time.\n" 919 | #endif 920 | " - Press Return to return control to LLaMA.\n" 921 | " - If you want to submit another line, end your input in '\\'.\n"); 922 | } 923 | 924 | // we may want to slide the input window along with the context, but for now we restrict to the context length 925 | int remaining_tokens = model.hparams.n_ctx - embd_inp.size(); 926 | int input_consumed = 0; 927 | bool input_noecho = false; 928 | 929 | // prompt user immediately after the starting prompt has been loaded 930 | if (params.interactive_start) { 931 | is_interacting = true; 932 | } 933 | 934 | // set the color for the prompt which will be output initially 935 | if (params.use_color) { 936 | printf(ANSI_COLOR_YELLOW); 937 | } 938 | 939 | 940 | 941 | while (remaining_tokens > 0) { 942 | // predict 943 | if (embd.size() > 0) { 944 | const int64_t t_start_us = ggml_time_us(); 945 | 946 | if (!llama_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) { 947 | fprintf(stderr, "Failed to predict\n"); 948 | return 1; 949 | } 950 | 951 | t_predict_us += ggml_time_us() - t_start_us; 952 | } 953 | 954 | n_past += embd.size(); 955 | embd.clear(); 956 | 957 | if (embd_inp.size() <= input_consumed) { 958 | // out of user input, sample next token 959 | const float top_k = params.top_k; 960 | const float top_p = params.top_p; 961 | const float temp = params.temp; 962 | const float repeat_penalty = params.repeat_penalty; 963 | 964 | const int n_vocab = model.hparams.n_vocab; 965 | 966 | gpt_vocab::id id = 0; 967 | 968 | { 969 | const int64_t t_start_sample_us = ggml_time_us(); 970 | 971 | id = llama_sample_top_p_top_k(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens, repeat_penalty, top_k, top_p, temp, rng); 972 | 973 | last_n_tokens.erase(last_n_tokens.begin()); 974 | last_n_tokens.push_back(id); 975 | 976 | t_sample_us += ggml_time_us() - t_start_sample_us; 977 | } 978 | 979 | // add it to the context 980 | embd.push_back(id); 981 | 982 | // echo this to console 983 | input_noecho = false; 984 | 985 | // decrement remaining sampling budget 986 | --remaining_tokens; 987 | } else { 988 | // some user input remains from prompt or interaction, forward it to processing 989 | while (embd_inp.size() > input_consumed) { 990 | // fprintf(stderr, "%6d -> '%s'\n", embd_inp[input_consumed], vocab.id_to_token.at(embd_inp[input_consumed]).c_str()); 991 | 992 | embd.push_back(embd_inp[input_consumed]); 993 | last_n_tokens.erase(last_n_tokens.begin()); 994 | last_n_tokens.push_back(embd_inp[input_consumed]); 995 | ++input_consumed; 996 | if (embd.size() > params.n_batch) { 997 | break; 998 | } 999 | } 1000 | 1001 | // reset color to default if we there is no pending user input 1002 | if (!input_noecho && params.use_color && embd_inp.size() == input_consumed) { 1003 | printf(ANSI_COLOR_RESET); 1004 | } 1005 | } 1006 | 1007 | // display text 1008 | if (!input_noecho) { 1009 | for (auto id : embd) { 1010 | printf("%s", vocab.id_to_token[id].c_str()); 1011 | } 1012 | fflush(stdout); 1013 | } 1014 | 1015 | // in interactive mode, and not currently processing queued inputs; 1016 | // check if we should prompt the user for more 1017 | if (params.interactive && embd_inp.size() <= input_consumed) { 1018 | // check for reverse prompt 1019 | if (antiprompt_inp.size() && std::equal(antiprompt_inp.rbegin(), antiprompt_inp.rend(), last_n_tokens.rbegin())) { 1020 | // reverse prompt 
found 1021 | is_interacting = true; 1022 | } 1023 | if (is_interacting) { 1024 | // currently being interactive 1025 | bool another_line=true; 1026 | while (another_line) { 1027 | fflush(stdout); 1028 | char buf[256] = {0}; 1029 | int n_read; 1030 | if(params.use_color) printf(ANSI_BOLD ANSI_COLOR_GREEN); 1031 | if (scanf("%255[^\n]%n%*c", buf, &n_read) <= 0) { 1032 | // presumable empty line, consume the newline 1033 | if (scanf("%*c") <= 0) { /*ignore*/ } 1034 | n_read=0; 1035 | } 1036 | if(params.use_color) printf(ANSI_COLOR_RESET); 1037 | 1038 | if (n_read > 0 && buf[n_read-1]=='\\') { 1039 | another_line = true; 1040 | buf[n_read-1] = '\n'; 1041 | buf[n_read] = 0; 1042 | } else { 1043 | another_line = false; 1044 | buf[n_read] = '\n'; 1045 | buf[n_read+1] = 0; 1046 | } 1047 | 1048 | std::vector line_inp = ::llama_tokenize(vocab, buf, false); 1049 | embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); 1050 | remaining_tokens -= line_inp.size(); 1051 | 1052 | input_noecho = true; // do not echo this again 1053 | } 1054 | 1055 | is_interacting = false; 1056 | } 1057 | } 1058 | 1059 | // end of text token 1060 | if (embd.back() == 2) { 1061 | fprintf(stderr, " [end of text]\n"); 1062 | break; 1063 | } 1064 | } 1065 | 1066 | #if defined (_WIN32) 1067 | signal(SIGINT, SIG_DFL); 1068 | #endif 1069 | 1070 | // report timing 1071 | { 1072 | const int64_t t_main_end_us = ggml_time_us(); 1073 | 1074 | fprintf(stderr, "\n\n"); 1075 | fprintf(stderr, "%s: mem per token = %8zu bytes\n", __func__, mem_per_token); 1076 | fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); 1077 | fprintf(stderr, "%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); 1078 | fprintf(stderr, "%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past); 1079 | fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); 1080 | } 1081 | 1082 | ggml_free(model.ctx); 1083 | 1084 | if (params.use_color) { 1085 | printf(ANSI_COLOR_RESET); 1086 | } 1087 | 1088 | return 0; 1089 | } 1090 | -------------------------------------------------------------------------------- /quantize.cpp: -------------------------------------------------------------------------------- 1 | #include "ggml.h" 2 | 3 | #include "utils.h" 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | // TODO: move somewhere else 16 | #define QK 32 17 | 18 | // default hparams (LLaMA76B) 19 | struct llama_hparams { 20 | int32_t n_vocab = 32000; 21 | int32_t n_ctx = 512; // this is provided as user input? 
22 | int32_t n_embd = 4096; 23 | int32_t n_mult = 256; 24 | int32_t n_head = 32; 25 | int32_t n_layer = 32; 26 | int32_t n_rot = 64; 27 | int32_t f16 = 1; 28 | }; 29 | 30 | 31 | // quantize a model 32 | bool llama_model_quantize(const std::string & fname_inp, const std::string & fname_out, int itype) { 33 | ggml_type type = GGML_TYPE_Q4_1; 34 | 35 | switch (itype) { 36 | case 2: type = GGML_TYPE_Q4_0; break; 37 | case 3: type = GGML_TYPE_Q4_1; break; 38 | default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1; 39 | }; 40 | 41 | if (type != GGML_TYPE_Q4_0 && type != GGML_TYPE_Q4_1) { 42 | fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type); 43 | return false; 44 | } 45 | 46 | gpt_vocab vocab; 47 | 48 | printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str()); 49 | 50 | auto finp = std::ifstream(fname_inp, std::ios::binary); 51 | if (!finp) { 52 | fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str()); 53 | return false; 54 | } 55 | 56 | auto fout = std::ofstream(fname_out, std::ios::binary); 57 | if (!fout) { 58 | fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str()); 59 | return false; 60 | } 61 | 62 | // verify magic 63 | { 64 | uint32_t magic; 65 | finp.read((char *) &magic, sizeof(magic)); 66 | if (magic != 0x67676d6c) { 67 | fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str()); 68 | return false; 69 | } 70 | 71 | fout.write((char *) &magic, sizeof(magic)); 72 | } 73 | 74 | llama_hparams hparams; 75 | 76 | // load hparams 77 | { 78 | finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); 79 | //finp.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); 80 | finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); 81 | finp.read((char *) &hparams.n_mult, sizeof(hparams.n_mult)); 82 | finp.read((char *) &hparams.n_head, sizeof(hparams.n_head)); 83 | finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); 84 | finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot)); 85 | finp.read((char *) &hparams.f16, sizeof(hparams.f16)); 86 | 87 | printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); 88 | printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); 89 | printf("%s: n_embd = %d\n", __func__, hparams.n_embd); 90 | printf("%s: n_mult = %d\n", __func__, hparams.n_mult); 91 | printf("%s: n_head = %d\n", __func__, hparams.n_head); 92 | printf("%s: n_layer = %d\n", __func__, hparams.n_layer); 93 | printf("%s: f16 = %d\n", __func__, hparams.f16); 94 | 95 | fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); 96 | //fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); 97 | fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd)); 98 | fout.write((char *) &hparams.n_mult, sizeof(hparams.n_mult)); 99 | fout.write((char *) &hparams.n_head, sizeof(hparams.n_head)); 100 | fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer)); 101 | fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot)); 102 | fout.write((char *) &itype, sizeof(hparams.f16)); 103 | } 104 | 105 | // load vocab 106 | { 107 | const int32_t n_vocab = hparams.n_vocab; 108 | 109 | if (n_vocab != hparams.n_vocab) { 110 | fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", 111 | __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab); 112 | return false; 113 | } 114 | 115 | std::string word; 116 | for (int i = 0; i < n_vocab; i++) { 117 | uint32_t len; 118 | finp.read ((char *) &len, sizeof(len)); 119 | 
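// the 32-bit length that was just read is mirrored to the output file as-is,
// followed below by the raw token bytes; the token string is also recorded in
// the vocab maps (token_to_id / id_to_token)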
fout.write((char *) &len, sizeof(len)); 120 | 121 | word.resize(len); 122 | finp.read ((char *) word.data(), len); 123 | fout.write((char *) word.data(), len); 124 | 125 | vocab.token_to_id[word] = i; 126 | vocab.id_to_token[i] = word; 127 | } 128 | } 129 | 130 | // load weights 131 | { 132 | size_t total_size_org = 0; 133 | size_t total_size_new = 0; 134 | 135 | std::vector work; 136 | 137 | std::vector data_u8; 138 | std::vector data_f16; 139 | std::vector data_f32; 140 | 141 | std::vector hist_all(1 << 4, 0); 142 | 143 | while (true) { 144 | int32_t n_dims; 145 | int32_t length; 146 | int32_t ftype; 147 | 148 | finp.read(reinterpret_cast(&n_dims), sizeof(n_dims)); 149 | finp.read(reinterpret_cast(&length), sizeof(length)); 150 | finp.read(reinterpret_cast(&ftype), sizeof(ftype)); 151 | 152 | if (finp.eof()) { 153 | break; 154 | } 155 | 156 | int32_t nelements = 1; 157 | int32_t ne[2] = { 1, 1 }; 158 | for (int i = 0; i < n_dims; ++i) { 159 | finp.read (reinterpret_cast(&ne[i]), sizeof(ne[i])); 160 | nelements *= ne[i]; 161 | } 162 | 163 | std::string name(length, 0); 164 | finp.read (&name[0], length); 165 | 166 | { 167 | static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", }; 168 | printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]); 169 | } 170 | 171 | // regexes of tensor names to be quantized 172 | const std::vector k_names = { 173 | ".*weight", 174 | }; 175 | 176 | bool quantize = false; 177 | for (const auto & s : k_names) { 178 | if (std::regex_match(name, std::regex(s))) { 179 | quantize = true; 180 | break; 181 | } 182 | } 183 | 184 | // quantize only 2D tensors 185 | quantize &= (n_dims == 2); 186 | 187 | if (quantize) { 188 | if (ftype != 0 && ftype != 1) { 189 | fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype); 190 | return false; 191 | } 192 | 193 | if (ftype == 1) { 194 | data_f16.resize(nelements); 195 | finp.read(reinterpret_cast(data_f16.data()), nelements * sizeof(ggml_fp16_t)); 196 | data_f32.resize(nelements); 197 | for (int i = 0; i < nelements; ++i) { 198 | data_f32[i] = ggml_fp16_to_fp32(data_f16[i]); 199 | } 200 | } else { 201 | data_f32.resize(nelements); 202 | finp.read(reinterpret_cast(data_f32.data()), nelements * sizeof(float)); 203 | } 204 | 205 | ftype = itype; 206 | } else { 207 | const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t); 208 | 209 | data_u8.resize(nelements*bpe); 210 | finp.read(reinterpret_cast(data_u8.data()), nelements * bpe); 211 | } 212 | 213 | fout.write(reinterpret_cast(&n_dims), sizeof(n_dims)); 214 | fout.write(reinterpret_cast(&length), sizeof(length)); 215 | fout.write(reinterpret_cast(&ftype), sizeof(ftype)); 216 | for (int i = 0; i < n_dims; ++i) { 217 | fout.write(reinterpret_cast(&ne[i]), sizeof(ne[i])); 218 | } 219 | fout.write(&name[0], length); 220 | 221 | if (quantize) { 222 | printf("quantizing .. 
"); 223 | work.resize(nelements); // for quantization 224 | 225 | size_t cur_size = 0; 226 | std::vector hist_cur(1 << 4, 0); 227 | 228 | switch (type) { 229 | case GGML_TYPE_Q4_0: 230 | { 231 | cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], QK, hist_cur.data()); 232 | } break; 233 | case GGML_TYPE_Q4_1: 234 | { 235 | cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], QK, hist_cur.data()); 236 | } break; 237 | default: 238 | { 239 | fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type); 240 | return false; 241 | } 242 | } 243 | 244 | fout.write(reinterpret_cast(work.data()), cur_size); 245 | total_size_new += cur_size; 246 | 247 | printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0); 248 | for (int i = 0; i < hist_cur.size(); ++i) { 249 | hist_all[i] += hist_cur[i]; 250 | } 251 | 252 | for (int i = 0; i < hist_cur.size(); ++i) { 253 | printf("%5.3f ", hist_cur[i] / (float)nelements); 254 | } 255 | printf("\n"); 256 | } else { 257 | printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0); 258 | fout.write(reinterpret_cast(data_u8.data()), data_u8.size()); 259 | total_size_new += data_u8.size(); 260 | } 261 | 262 | total_size_org += nelements * sizeof(float); 263 | } 264 | 265 | printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); 266 | printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0); 267 | 268 | { 269 | int64_t sum_all = 0; 270 | for (int i = 0; i < hist_all.size(); ++i) { 271 | sum_all += hist_all[i]; 272 | } 273 | 274 | printf("%s: hist: ", __func__); 275 | for (int i = 0; i < hist_all.size(); ++i) { 276 | printf("%5.3f ", hist_all[i] / (float)sum_all); 277 | } 278 | printf("\n"); 279 | } 280 | } 281 | 282 | finp.close(); 283 | fout.close(); 284 | 285 | return true; 286 | } 287 | 288 | // usage: 289 | // ./llama-quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type 290 | // 291 | int main(int argc, char ** argv) { 292 | ggml_time_init(); 293 | if (argc != 4) { 294 | fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]); 295 | fprintf(stderr, " type = 2 - q4_0\n"); 296 | fprintf(stderr, " type = 3 - q4_1\n"); 297 | return 1; 298 | } 299 | 300 | // needed to initialize f16 tables 301 | { 302 | struct ggml_init_params params = { 0, NULL }; 303 | struct ggml_context * ctx = ggml_init(params); 304 | ggml_free(ctx); 305 | } 306 | 307 | const std::string fname_inp = argv[1]; 308 | const std::string fname_out = argv[2]; 309 | 310 | const int itype = atoi(argv[3]); 311 | 312 | const int64_t t_main_start_us = ggml_time_us(); 313 | 314 | int64_t t_quantize_us = 0; 315 | 316 | // load the model 317 | { 318 | const int64_t t_start_us = ggml_time_us(); 319 | 320 | if (!llama_model_quantize(fname_inp, fname_out, itype)) { 321 | fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str()); 322 | return 1; 323 | } 324 | 325 | t_quantize_us = ggml_time_us() - t_start_us; 326 | } 327 | 328 | // report timing 329 | { 330 | const int64_t t_main_end_us = ggml_time_us(); 331 | 332 | printf("\n"); 333 | printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f); 334 | printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); 335 | } 336 | 337 | return 0; 338 | } 339 | -------------------------------------------------------------------------------- /quantize.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if ! [[ "$1" =~ ^[0-9]{1,2}B$ ]]; then 4 | echo 5 | echo "Usage: quantize.sh 7B|13B|30B|65B [--remove-f16]" 6 | echo 7 | exit 1 8 | fi 9 | 10 | for i in `ls models/$1/ggml-model-f16.bin*`; do 11 | ./quantize "$i" "${i/f16/q4_0}" 2 12 | if [[ "$2" == "--remove-f16" ]]; then 13 | rm "$i" 14 | fi 15 | done 16 | -------------------------------------------------------------------------------- /screencast.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ItsPi3141/alpaca.cpp/779a873fb2ac2c40b4595c8ad4e93bf6ce133b14/screencast.gif -------------------------------------------------------------------------------- /utils.cpp: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #if defined(_MSC_VER) || defined(__MINGW32__) 13 | #include // using malloc.h with MSC/MINGW 14 | #elif !defined(__FreeBSD__) && !defined(__NetBSD__) 15 | #include 16 | #endif 17 | 18 | bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { 19 | for (int i = 1; i < argc; i++) { 20 | std::string arg = argv[i]; 21 | 22 | if (arg == "-s" || arg == "--seed") { 23 | params.seed = std::stoi(argv[++i]); 24 | } else if (arg == "-t" || arg == "--threads") { 25 | params.n_threads = std::stoi(argv[++i]); 26 | } else if (arg == "-p" || arg == "--prompt") { 27 | params.prompt = argv[++i]; 28 | } else if (arg == "-f" || arg == "--file") { 29 | 30 | std::ifstream file(argv[++i]); 31 | 32 | std::copy(std::istreambuf_iterator(file), 33 | std::istreambuf_iterator(), 34 | back_inserter(params.prompt)); 35 | 36 | } else if (arg == "-n" || arg == "--n_predict") { 37 | params.n_predict = std::stoi(argv[++i]); 38 | } else if (arg == "--top_k") { 39 | params.top_k = std::stoi(argv[++i]); 40 | } else if (arg == "-c" || arg == "--ctx_size") { 41 | params.n_ctx = std::stoi(argv[++i]); 42 | } else if (arg == "--top_p") { 43 | params.top_p = std::stof(argv[++i]); 44 | } else if (arg == "--temp") { 45 | params.temp = std::stof(argv[++i]); 46 | } else if (arg == "--repeat_last_n") { 47 | params.repeat_last_n = std::stoi(argv[++i]); 48 | } else if (arg == "--repeat_penalty") { 49 | params.repeat_penalty = std::stof(argv[++i]); 50 | } else if (arg == "-b" || arg == "--batch_size") { 51 | params.n_batch = std::stoi(argv[++i]); 52 | } else if (arg == "-m" || arg == "--model") { 53 | params.model = argv[++i]; 54 | } else if (arg == "-i" || arg == "--interactive") { 55 | params.interactive = true; 56 | } else if (arg == "--interactive-start") { 57 | params.interactive = true; 58 | params.interactive_start = true; 59 | } else if (arg == "--color") { 60 | params.use_color = true; 61 | } else if (arg == "-r" || arg == "--reverse-prompt") { 62 | params.antiprompt = argv[++i]; 63 | } else if (arg == "-h" || arg == "--help") { 64 | gpt_print_usage(argc, argv, params); 65 | exit(0); 66 | } else { 67 | fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); 68 | gpt_print_usage(argc, argv, params); 69 | exit(0); 70 | } 71 | } 72 | 73 | return true; 74 | } 75 | 76 | void gpt_print_usage(int argc, char ** argv, const gpt_params & params) { 77 | fprintf(stderr, "usage: %s [options]\n", argv[0]); 78 | fprintf(stderr, "\n"); 79 | fprintf(stderr, "options:\n"); 80 | fprintf(stderr, " -h, --help show this help 
message and exit\n"); 81 | fprintf(stderr, " -i, --interactive run in interactive mode\n"); 82 | fprintf(stderr, " --interactive-start run in interactive mode and poll user input at startup\n"); 83 | fprintf(stderr, " -r PROMPT, --reverse-prompt PROMPT\n"); 84 | fprintf(stderr, " in interactive mode, poll user input upon seeing PROMPT\n"); 85 | fprintf(stderr, " --color colorise output to distinguish prompt and user input from generations\n"); 86 | fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n"); 87 | fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); 88 | fprintf(stderr, " -p PROMPT, --prompt PROMPT\n"); 89 | fprintf(stderr, " prompt to start generation with (default: random)\n"); 90 | fprintf(stderr, " -f FNAME, --file FNAME\n"); 91 | fprintf(stderr, " prompt file to start generation.\n"); 92 | fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict); 93 | fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k); 94 | fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p); 95 | fprintf(stderr, " --repeat_last_n N last n tokens to consider for penalize (default: %d)\n", params.repeat_last_n); 96 | fprintf(stderr, " --repeat_penalty N penalize repeat sequence of tokens (default: %.1f)\n", params.repeat_penalty); 97 | fprintf(stderr, " -c N, --ctx_size N size of the prompt context (default: %d)\n", params.n_ctx); 98 | fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp); 99 | fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch); 100 | fprintf(stderr, " -m FNAME, --model FNAME\n"); 101 | fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); 102 | fprintf(stderr, "\n"); 103 | } 104 | 105 | std::string gpt_random_prompt(std::mt19937 & rng) { 106 | const int r = rng() % 10; 107 | switch (r) { 108 | case 0: return "So"; 109 | case 1: return "Once upon a time"; 110 | case 2: return "When"; 111 | case 3: return "The"; 112 | case 4: return "After"; 113 | case 5: return "If"; 114 | case 6: return "import"; 115 | case 7: return "He"; 116 | case 8: return "She"; 117 | case 9: return "They"; 118 | default: return "To"; 119 | } 120 | 121 | return "The"; 122 | } 123 | 124 | void replace(std::string & str, const std::string & needle, const std::string & replacement) { 125 | size_t pos = 0; 126 | while ((pos = str.find(needle, pos)) != std::string::npos) { 127 | str.replace(pos, needle.length(), replacement); 128 | pos += replacement.length(); 129 | } 130 | } 131 | 132 | std::map json_parse(const std::string & fname) { 133 | std::map result; 134 | 135 | // read file into string 136 | std::string json; 137 | { 138 | std::ifstream ifs(fname); 139 | if (!ifs) { 140 | fprintf(stderr, "Failed to open %s\n", fname.c_str()); 141 | exit(1); 142 | } 143 | 144 | json = std::string((std::istreambuf_iterator(ifs)), 145 | (std::istreambuf_iterator())); 146 | } 147 | 148 | if (json[0] != '{') { 149 | return result; 150 | } 151 | 152 | // parse json 153 | { 154 | bool has_key = false; 155 | bool in_token = false; 156 | 157 | std::string str_key = ""; 158 | std::string str_val = ""; 159 | 160 | int n = json.size(); 161 | for (int i = 1; i < n; ++i) { 162 | if (!in_token) { 163 | if (json[i] == ' ') continue; 164 | if (json[i] == '"') { 165 | in_token = true; 166 | continue; 167 | } 168 | } else { 169 | if (json[i] == '\\' && i+1 < n) { 170 | if (has_key == false) { 171 
| str_key += json[i]; 172 | } else { 173 | str_val += json[i]; 174 | } 175 | ++i; 176 | } else if (json[i] == '"') { 177 | if (has_key == false) { 178 | has_key = true; 179 | ++i; 180 | while (json[i] == ' ') ++i; 181 | ++i; // : 182 | while (json[i] == ' ') ++i; 183 | if (json[i] != '\"') { 184 | while (json[i] != ',' && json[i] != '}') { 185 | str_val += json[i++]; 186 | } 187 | has_key = false; 188 | } else { 189 | in_token = true; 190 | continue; 191 | } 192 | } else { 193 | has_key = false; 194 | } 195 | 196 | ::replace(str_key, "\\u0120", " " ); // \u0120 -> space 197 | ::replace(str_key, "\\u010a", "\n"); // \u010a -> new line 198 | ::replace(str_key, "\\\"", "\""); // \\\" -> " 199 | 200 | try { 201 | result[str_key] = std::stoi(str_val); 202 | } catch (...) { 203 | //fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str()); 204 | 205 | } 206 | str_key = ""; 207 | str_val = ""; 208 | in_token = false; 209 | continue; 210 | } 211 | if (has_key == false) { 212 | str_key += json[i]; 213 | } else { 214 | str_val += json[i]; 215 | } 216 | } 217 | } 218 | } 219 | 220 | return result; 221 | } 222 | 223 | std::vector gpt_tokenize(const gpt_vocab & vocab, const std::string & text) { 224 | std::vector words; 225 | 226 | // first split the text into words 227 | { 228 | std::string str = text; 229 | std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"; 230 | 231 | std::regex re(pat); 232 | std::smatch m; 233 | 234 | while (std::regex_search(str, m, re)) { 235 | for (auto x : m) { 236 | words.push_back(x); 237 | } 238 | str = m.suffix(); 239 | } 240 | } 241 | 242 | // find the longest tokens that form the words: 243 | std::vector tokens; 244 | for (const auto & word : words) { 245 | if (word.size() == 0) continue; 246 | 247 | int i = 0; 248 | int n = word.size(); 249 | while (i < n) { 250 | int j = n; 251 | while (j > i) { 252 | auto it = vocab.token_to_id.find(word.substr(i, j-i)); 253 | if (it != vocab.token_to_id.end()) { 254 | tokens.push_back(it->second); 255 | i = j; 256 | break; 257 | } 258 | --j; 259 | } 260 | if (i == n) { 261 | break; 262 | } 263 | if (j == i) { 264 | auto sub = word.substr(i, 1); 265 | if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) { 266 | tokens.push_back(vocab.token_to_id.at(sub)); 267 | } else { 268 | fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data()); 269 | } 270 | ++i; 271 | } 272 | } 273 | } 274 | 275 | return tokens; 276 | } 277 | 278 | // TODO: Calculate this constant from the vocabulary 279 | #define MAX_TOKEN_LEN 18 280 | // SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece 281 | std::vector llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) { 282 | std::vector res; 283 | std::vector score; 284 | std::vector prev; 285 | int len = text.length(); 286 | 287 | score.resize(len + 1); 288 | prev.resize(len + 1); 289 | 290 | // Forward pass 291 | for (int i = 0; i < len; i++) { 292 | int max_len = std::min(len - i, MAX_TOKEN_LEN); 293 | for (int sub_len = 1; sub_len <= len - i; sub_len++) { 294 | auto sub = text.substr(i, sub_len); 295 | auto token = vocab.token_to_id.find(sub); 296 | if (token != vocab.token_to_id.end()) { 297 | int token_score = sub.length() * sub.length(); 298 | int local_score = score[i] + token_score; 299 | int next = i + sub_len; 300 | if (score[next] < local_score) { 301 | score[next] = local_score; 302 | prev[next] = 
(*token).second; 303 | } 304 | } 305 | } 306 | } 307 | 308 | // Backward pass 309 | int i = len; 310 | while (i > 0) { 311 | gpt_vocab::id token_id = prev[i]; 312 | if (token_id == 0) { 313 | // TODO: Return error or something more meaningful 314 | printf("failed to tokenize string!\n"); 315 | break; 316 | } 317 | res.push_back(token_id); 318 | auto token = (*vocab.id_to_token.find(token_id)).second; 319 | i -= token.length(); 320 | } 321 | 322 | if (bos) { 323 | res.push_back(1); // TODO: replace with vocab.bos 324 | } 325 | 326 | // Pieces are in reverse order so correct that 327 | std::reverse(res.begin(), res.end()); 328 | 329 | return res; 330 | } 331 | 332 | bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) { 333 | printf("%s: loading vocab from '%s'\n", __func__, fname.c_str()); 334 | 335 | vocab.token_to_id = ::json_parse(fname); 336 | 337 | for (const auto & kv : vocab.token_to_id) { 338 | vocab.id_to_token[kv.second] = kv.first; 339 | } 340 | 341 | printf("%s: vocab size = %d\n", __func__, (int) vocab.token_to_id.size()); 342 | 343 | // print the vocabulary 344 | //for (auto kv : vocab.token_to_id) { 345 | // printf("'%s' -> %d\n", kv.first.data(), kv.second); 346 | //} 347 | 348 | return true; 349 | } 350 | 351 | 352 | void sample_top_k(std::vector> & logits_id, int top_k) { 353 | // find the top K tokens 354 | std::partial_sort( 355 | logits_id.begin(), 356 | logits_id.begin() + top_k, logits_id.end(), 357 | [](const std::pair & a, const std::pair & b) { 358 | return a.first > b.first; 359 | }); 360 | 361 | logits_id.resize(top_k); 362 | } 363 | 364 | gpt_vocab::id llama_sample_top_p_top_k( 365 | const gpt_vocab & vocab, 366 | const float * logits, 367 | std::vector & last_n_tokens, 368 | double repeat_penalty, 369 | int top_k, 370 | double top_p, 371 | double temp, 372 | std::mt19937 & rng) { 373 | int n_logits = vocab.id_to_token.size(); 374 | 375 | std::vector> logits_id; 376 | logits_id.reserve(n_logits); 377 | 378 | { 379 | const double scale = 1.0/temp; 380 | for (int i = 0; i < n_logits; ++i) { 381 | // repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858) 382 | // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main 383 | if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) { 384 | // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability 385 | if (logits[i] < 0.0) { 386 | logits_id.push_back(std::make_pair(logits[i]*scale*repeat_penalty, i)); 387 | } else { 388 | logits_id.push_back(std::make_pair(logits[i]*scale/repeat_penalty, i)); 389 | } 390 | } else { 391 | logits_id.push_back(std::make_pair(logits[i]*scale, i)); 392 | } 393 | } 394 | } 395 | 396 | sample_top_k(logits_id, top_k); 397 | 398 | double maxl = -INFINITY; 399 | for (const auto & kv : logits_id) { 400 | maxl = std::max(maxl, kv.first); 401 | } 402 | 403 | // compute probs for the top K tokens 404 | std::vector probs; 405 | probs.reserve(logits_id.size()); 406 | 407 | double sum = 0.0; 408 | for (const auto & kv : logits_id) { 409 | double p = exp(kv.first - maxl); 410 | probs.push_back(p); 411 | sum += p; 412 | } 413 | 414 | // normalize the probs 415 | for (auto & p : probs) { 416 | p /= sum; 417 | } 418 | 419 | if (top_p < 1.0f) { 420 | double cumsum = 0.0f; 421 | for (int i = 0; i < (int) probs.size(); i++) { 422 | cumsum += probs[i]; 423 | if (cumsum >= top_p) { 424 | probs.resize(i + 1); 425 | logits_id.resize(i + 1); 426 | break; 427 | } 428 | } 429 | 430 | cumsum = 
431 |         for (int i = 0; i < (int) probs.size(); i++) {
432 |             probs[i] *= cumsum;
433 |         }
434 |     }
435 | 
436 |     //printf("\n");
437 |     //for (int i = 0; i < (int) 10; i++) {
438 |     //    printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
439 |     //}
440 |     //printf("\n\n");
441 |     //exit(0);
442 | 
443 |     std::discrete_distribution<> dist(probs.begin(), probs.end());
444 |     int idx = dist(rng);
445 | 
446 |     return logits_id[idx].second;
447 | }
448 | 
449 | 
450 | size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
451 |     const int nb = k / qk;
452 |     const size_t bs = (sizeof(float) + sizeof(uint8_t)*qk/2);
453 |     const size_t row_size = nb*bs;
454 | 
455 |     assert(k % qk == 0);
456 | 
457 |     const size_t pp_size = qk / 2;
458 |     uint8_t *pp = static_cast<uint8_t *>(alloca(pp_size));
459 | 
460 |     char * pdst = (char *) dst;
461 | 
462 |     for (int j = 0; j < n; j += k) {
463 |         uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs);
464 |         uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float));
465 | 
466 |         for (int i = 0; i < nb; i++) {
467 |             float amax = 0.0f; // absolute max
468 | 
469 |             {
470 |                 for (int l = 0; l < qk; l++) {
471 |                     const float v = src[j + i*qk + l];
472 |                     amax = std::max(amax, fabsf(v));
473 |                 }
474 | 
475 |                 const float d = amax / ((1 << 3) - 1);
476 |                 const float id = d ? 1.0f/d : 0.0f;
477 | 
478 |                 *(float *) pd = d;
479 |                 pd += bs;
480 | 
481 |                 for (int l = 0; l < qk; l += 2) {
482 |                     const float v0 = (src[j + i*qk + l + 0])*id;
483 |                     const float v1 = (src[j + i*qk + l + 1])*id;
484 | 
485 |                     const uint8_t vi0 = ((int8_t) (round(v0))) + 8;
486 |                     const uint8_t vi1 = ((int8_t) (round(v1))) + 8;
487 | 
488 |                     assert(vi0 >= 0 && vi0 < 16);
489 |                     assert(vi1 >= 0 && vi1 < 16);
490 | 
491 |                     hist[vi0]++;
492 |                     hist[vi1]++;
493 | 
494 |                     pp[l/2] = vi0 | (vi1 << 4);
495 |                 }
496 | 
497 |                 memcpy(pb, pp, pp_size);
498 |                 pb += bs;
499 |             }
500 |         }
501 |     }
502 | 
503 |     return (n/k)*row_size;
504 | }
505 | 
506 | size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
507 |     const int nb = k / qk;
508 |     const size_t row_size = nb*(2*sizeof(float) + sizeof(uint8_t)*qk/2);
509 | 
510 |     assert(k % qk == 0);
511 | 
512 |     const size_t pp_size = qk / 2;
513 |     uint8_t *pp = static_cast<uint8_t *>(alloca(pp_size));
514 | 
515 |     char * pdst = (char *) dst;
516 | 
517 |     for (int j = 0; j < n; j += k) {
518 |         float * pm = (float *) (pdst + (j/k)*row_size);
519 |         float * pd = (float *) (pm + nb);
520 |         uint8_t * pb = (uint8_t *) (pd + nb);
521 | 
522 |         //printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb);
523 | 
524 |         for (int i = 0; i < nb; i++) {
525 |             float min = std::numeric_limits<float>::max();
526 |             float max = std::numeric_limits<float>::min();
527 | 
528 |             {
529 |                 for (int l = 0; l < qk; l++) {
530 |                     const float v = src[j + i*qk + l];
531 |                     if (v < min) min = v;
532 |                     if (v > max) max = v;
533 |                 }
534 | 
535 |                 const float d = (max - min) / ((1 << 4) - 1);
536 |                 const float id = d ? 1.0f/d : 0.0f;
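                // Q4_1 block encoding: the block minimum goes into pm[i] and the scale d
                // into pd[i]; each value is then stored below as the 4-bit index
                // round((v - min)/d) in [0, 15], packed two per byte into pb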
537 | 
538 |                 pm[i] = min;
539 |                 pd[i] = d;
540 | 
541 |                 for (int l = 0; l < qk; l += 2) {
542 |                     const float v0 = (src[j + i*qk + l + 0] - min)*id;
543 |                     const float v1 = (src[j + i*qk + l + 1] - min)*id;
544 | 
545 |                     const uint8_t vi0 = round(v0);
546 |                     const uint8_t vi1 = round(v1);
547 | 
548 |                     assert(vi0 >= 0 && vi0 < 16);
549 |                     assert(vi1 >= 0 && vi1 < 16);
550 | 
551 |                     hist[vi0]++;
552 |                     hist[vi1]++;
553 | 
554 |                     pp[l/2] = vi0 | (vi1 << 4);
555 |                 }
556 | 
557 |                 memcpy(pb + i*qk/2, pp, pp_size);
558 |             }
559 |         }
560 |     }
561 | 
562 |     return (n/k)*row_size;
563 | }
564 | 
--------------------------------------------------------------------------------
/utils.h:
--------------------------------------------------------------------------------
1 | // Various helper functions and utilities
2 | 
3 | #pragma once
4 | 
5 | #include <string>
6 | #include <map>
7 | #include <vector>
8 | #include <random>
9 | #include <thread>
10 | 
11 | //
12 | // CLI argument parsing
13 | //
14 | 
15 | struct gpt_params {
16 |     int32_t seed      = -1; // RNG seed
17 |     int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
18 |     int32_t n_predict = 128; // new tokens to predict
19 |     int32_t repeat_last_n = 64; // last n tokens to penalize
20 |     int32_t n_ctx = 512; // context size
21 | 
22 |     // sampling parameters
23 |     int32_t top_k = 40;
24 |     float   top_p = 0.95f;
25 |     float   temp  = 0.80f;
26 |     float   repeat_penalty = 1.30f;
27 | 
28 |     int32_t n_batch = 8; // batch size for prompt processing
29 | 
30 |     std::string model = "models/lamma-7B/ggml-model.bin"; // model path
31 |     std::string prompt;
32 | 
33 |     bool use_color = false; // use color to distinguish generations and inputs
34 | 
35 |     bool interactive = false; // interactive mode
36 |     bool interactive_start = false; // reverse prompt immediately
37 |     std::string antiprompt = ""; // string upon seeing which more user input is prompted
38 | };
39 | 
40 | bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
41 | 
42 | void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
43 | 
44 | std::string gpt_random_prompt(std::mt19937 & rng);
45 | 
46 | //
47 | // Vocab utils
48 | //
49 | 
50 | struct gpt_vocab {
51 |     using id    = int32_t;
52 |     using token = std::string;
53 | 
54 |     std::map<token, id> token_to_id;
55 |     std::map<id, token> id_to_token;
56 | };
57 | 
58 | void replace(std::string & str, const std::string & needle, const std::string & replacement);
59 | 
60 | // poor-man's JSON parsing
61 | std::map<std::string, int32_t> json_parse(const std::string & fname);
62 | 
63 | // split text into tokens
64 | //
65 | // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
66 | //
67 | // Regex (Python):
68 | // r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
69 | //
70 | // Regex (C++):
71 | // R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
72 | //
73 | std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
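// e.g. the regex above splits "I'm 25 years old!" into the pieces
// ["I", "'m", " 25", " years", " old", "!"], which gpt_tokenize then maps to ids via the vocab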
74 | 
75 | // TODO: this is probably wrong, but I cannot figure out how this tokenizer works ..
76 | // ref: https://github.com/google/sentencepiece
77 | std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos);
78 | 
79 | // load the tokens from encoder.json
80 | bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
81 | 
82 | // sample next token given probabilities for each embedding
83 | //
84 | //   - consider only the top K tokens
85 | //   - from them, consider only the top tokens with cumulative probability > P
86 | //
87 | gpt_vocab::id llama_sample_top_p_top_k(
88 |         const gpt_vocab & vocab,
89 |         const float * logits,
90 |         std::vector<gpt_vocab::id> & last_n_tokens,
91 |         double repeat_penalty,
92 |         int top_k,
93 |         double top_p,
94 |         double temp,
95 |         std::mt19937 & rng);
96 | 
97 | // filter to top K tokens from list of logits
98 | void sample_top_k(std::vector<std::pair<double, gpt_vocab::id>> & logits_id, int top_k);
99 | 
100 | //
101 | // Quantization
102 | //
103 | 
104 | size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist);
105 | size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist);
106 | 
--------------------------------------------------------------------------------
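A minimal sketch (not one of the repository files) of how the declarations in utils.h compose: load a vocab, tokenize a prompt, and sample one token. The vocab path and the zero-filled stand-in logits are assumptions for illustration; in main.cpp the logits come from evaluating the model.

// usage_sketch.cpp (illustrative only)
#include "utils.h"

#include <cstdio>
#include <random>
#include <vector>

int main() {
    gpt_vocab vocab;
    if (!gpt_vocab_init("models/vocab.json", vocab)) { // hypothetical vocab path
        return 1;
    }

    // tokenize the prompt; bos = true prepends the BOS token (id 1)
    const std::vector<gpt_vocab::id> tokens = llama_tokenize(vocab, "Hello world", true);
    printf("prompt has %d tokens\n", (int) tokens.size());

    // stand-in logits; a real program takes these from the model's last evaluation
    std::vector<float> logits(vocab.id_to_token.size(), 0.0f);

    std::vector<gpt_vocab::id> last_n_tokens(64, 0); // recent history for the repeat penalty
    std::mt19937 rng(0);

    const gpt_vocab::id next = llama_sample_top_p_top_k(
            vocab, logits.data(), last_n_tokens,
            /*repeat_penalty=*/1.30, /*top_k=*/40, /*top_p=*/0.95, /*temp=*/0.80, rng);

    printf("sampled token id: %d\n", next);
    return 0;
}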