├── .github └── workflows │ └── build.yml ├── .gitignore ├── CMakeLists.txt ├── LICENSE ├── Makefile ├── README.md ├── convert-pth-to-ggml.py ├── ggml.c ├── ggml.h ├── main.cpp ├── quantize.cpp ├── quantize.sh ├── screencast.gif ├── utils.cpp └── utils.h /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | workflow_dispatch: # allows manual triggering 5 | inputs: 6 | create_release: 7 | description: 'Create new release' 8 | required: true 9 | type: boolean 10 | push: 11 | paths: ['.github/workflows/**', 'CMakeLists.txt', 'Makefile', '**.h', '*.c', '**.cpp'] 12 | pull_request: 13 | types: [opened, synchronize, edited, reopened, review_requested, ready_for_review] 14 | paths: ['CMakeLists.txt', 'Makefile', '**.h', '*.c', '**.cpp'] 15 | 16 | env: 17 | BRANCH_NAME: ${{ github.head_ref || github.ref_name }} 18 | 19 | jobs: 20 | ubuntu-latest: 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - name: Clone 25 | id: checkout 26 | uses: actions/checkout@v1 27 | 28 | - name: Dependencies 29 | id: depends 30 | run: | 31 | sudo apt-get update 32 | sudo apt-get install build-essential 33 | 34 | - name: Build 35 | id: make_build 36 | run: | 37 | make 38 | 39 | - name: Archive production artifacts 40 | uses: actions/upload-artifact@v3 41 | with: 42 | name: ubuntu 43 | path: | 44 | chat 45 | 46 | 47 | macOS-latest: 48 | runs-on: macOS-latest 49 | 50 | steps: 51 | - name: Clone 52 | id: checkout 53 | uses: actions/checkout@v1 54 | 55 | - name: Dependencies 56 | id: depends 57 | run: | 58 | brew update 59 | 60 | - name: Build 61 | id: make_build 62 | run: | 63 | make 64 | 65 | - name: Archive production artifacts 66 | uses: actions/upload-artifact@v3 67 | with: 68 | name: macos 69 | path: | 70 | chat 71 | 72 | # macos-arm64: 73 | # runs-on: macos-arm64 74 | 75 | # steps: 76 | # - name: Clone 77 | # id: checkout 78 | # uses: actions/checkout@v1 79 | 80 | # - name: Dependencies 81 | # id: depends 82 | # run: | 83 | # brew update 84 | 85 | # - name: Build 86 | # id: make_build 87 | # run: | 88 | # make 89 | 90 | # - name: Archive production artifacts 91 | # uses: actions/upload-artifact@v3 92 | # with: 93 | # name: macos 94 | # path: | 95 | # chat 96 | 97 | windows-latest: 98 | runs-on: windows-latest 99 | 100 | steps: 101 | - name: Clone 102 | id: checkout 103 | uses: actions/checkout@v1 104 | 105 | - name: Build 106 | id: cmake_build 107 | run: | 108 | mkdir build 109 | cd build 110 | cmake .. 111 | cmake --build . 
--config Release 112 | 113 | - name: Set commit hash variables 114 | id: commit 115 | if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} 116 | uses: pr-mpt/actions-commit-hash@v2 117 | 118 | - name: Pack artifacts 119 | id: pack_artifacts 120 | if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} 121 | run: | 122 | 7z a alpaca-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip .\build\Release\* 123 | 124 | - name: Create release 125 | id: create_release 126 | if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} 127 | uses: zendesk/action-create-release@v1 128 | env: 129 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 130 | with: 131 | tag_name: ${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }} 132 | 133 | - name: Upload release 134 | id: upload_release 135 | if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} 136 | uses: actions/upload-release-asset@v1 137 | env: 138 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 139 | with: 140 | upload_url: ${{ steps.create_release.outputs.upload_url }} 141 | asset_path: .\alpaca-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip 142 | asset_name: alpaca-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip 143 | asset_content_type: application/octet-stream 144 | 145 | # ubuntu-latest-gcc: 146 | # runs-on: ubuntu-latest 147 | # 148 | # strategy: 149 | # matrix: 150 | # build: [Debug, Release] 151 | # 152 | # steps: 153 | # - name: Clone 154 | # uses: actions/checkout@v1 155 | # 156 | # - name: Dependencies 157 | # run: | 158 | # sudo apt-get update 159 | # sudo apt-get install build-essential 160 | # sudo apt-get install cmake 161 | # 162 | # - name: Configure 163 | # run: cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }} 164 | # 165 | # - name: Build 166 | # run: | 167 | # make 168 | # 169 | # ubuntu-latest-clang: 170 | # runs-on: ubuntu-latest 171 | # 172 | # strategy: 173 | # matrix: 174 | # build: [Debug, Release] 175 | # 176 | # steps: 177 | # - name: Clone 178 | # uses: actions/checkout@v1 179 | # 180 | # - name: Dependencies 181 | # run: | 182 | # sudo apt-get update 183 | # sudo apt-get install build-essential 184 | # sudo apt-get install cmake 185 | # 186 | # - name: Configure 187 | # run: cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang 188 | # 189 | # - name: Build 190 | # run: | 191 | # make 192 | # 193 | # ubuntu-latest-gcc-sanitized: 194 | # runs-on: ubuntu-latest 195 | # 196 | # strategy: 197 | # matrix: 198 | # sanitizer: [ADDRESS, THREAD, UNDEFINED] 199 | # 200 | # steps: 201 | # - name: Clone 202 | # uses: actions/checkout@v1 203 | # 204 | # - name: Dependencies 205 | # run: | 206 | # sudo apt-get update 207 | # sudo apt-get install build-essential 208 | # sudo apt-get install cmake 209 | # 210 | # - name: Configure 211 | # run: cmake . 
-DCMAKE_BUILD_TYPE=Debug -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON 212 | # 213 | # - name: Build 214 | # run: | 215 | # make 216 | # 217 | # windows: 218 | # runs-on: windows-latest 219 | # 220 | # strategy: 221 | # matrix: 222 | # build: [Release] 223 | # arch: [Win32, x64] 224 | # include: 225 | # - arch: Win32 226 | # s2arc: x86 227 | # - arch: x64 228 | # s2arc: x64 229 | # 230 | # steps: 231 | # - name: Clone 232 | # uses: actions/checkout@v1 233 | # 234 | # - name: Add msbuild to PATH 235 | # uses: microsoft/setup-msbuild@v1 236 | # 237 | # - name: Configure 238 | # run: > 239 | # cmake -S . -B ./build -A ${{ matrix.arch }} 240 | # -DCMAKE_BUILD_TYPE=${{ matrix.build }} 241 | # 242 | # - name: Build 243 | # run: | 244 | # cd ./build 245 | # msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }} 246 | # 247 | # - name: Upload binaries 248 | # uses: actions/upload-artifact@v1 249 | # with: 250 | # name: llama-bin-${{ matrix.arch }} 251 | # path: build/bin/${{ matrix.build }} 252 | # 253 | # windows-blas: 254 | # runs-on: windows-latest 255 | # 256 | # strategy: 257 | # matrix: 258 | # build: [Release] 259 | # arch: [Win32, x64] 260 | # blas: [ON] 261 | # include: 262 | # - arch: Win32 263 | # obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x86.zip 264 | # s2arc: x86 265 | # - arch: x64 266 | # obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x64.zip 267 | # s2arc: x64 268 | # 269 | # steps: 270 | # - name: Clone 271 | # uses: actions/checkout@v1 272 | # 273 | # - name: Add msbuild to PATH 274 | # uses: microsoft/setup-msbuild@v1 275 | # 276 | # - name: Fetch OpenBLAS 277 | # if: matrix.blas == 'ON' 278 | # run: | 279 | # C:/msys64/usr/bin/wget.exe -qO blas.zip ${{ matrix.obzip }} 280 | # 7z x blas.zip -oblas -y 281 | # copy blas/include/cblas.h . 282 | # copy blas/include/openblas_config.h . 283 | # echo "blasdir=$env:GITHUB_WORKSPACE/blas" >> $env:GITHUB_ENV 284 | # 285 | # - name: Configure 286 | # run: > 287 | # cmake -S . -B ./build -A ${{ matrix.arch }} 288 | # -DCMAKE_BUILD_TYPE=${{ matrix.build }} 289 | # -DLLAMA_SUPPORT_OPENBLAS=${{ matrix.blas }} 290 | # -DCMAKE_LIBRARY_PATH="$env:blasdir/lib" 291 | # 292 | # - name: Build 293 | # run: | 294 | # cd ./build 295 | # msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }} 296 | # 297 | # - name: Copy libopenblas.dll 298 | # if: matrix.blas == 'ON' 299 | # run: copy "$env:blasdir/bin/libopenblas.dll" build/bin/${{ matrix.build }} 300 | # 301 | # - name: Upload binaries 302 | # if: matrix.blas == 'ON' 303 | # uses: actions/upload-artifact@v1 304 | # with: 305 | # name: llama-blas-bin-${{ matrix.arch }} 306 | # path: build/bin/${{ matrix.build }} 307 | # 308 | # emscripten: 309 | # runs-on: ubuntu-latest 310 | # 311 | # strategy: 312 | # matrix: 313 | # build: [Release] 314 | # 315 | # steps: 316 | # - name: Clone 317 | # uses: actions/checkout@v1 318 | # 319 | # - name: Dependencies 320 | # run: | 321 | # wget -q https://github.com/emscripten-core/emsdk/archive/master.tar.gz 322 | # tar -xvf master.tar.gz 323 | # emsdk-master/emsdk update 324 | # emsdk-master/emsdk install latest 325 | # emsdk-master/emsdk activate latest 326 | # 327 | # - name: Configure 328 | # run: echo "tmp" 329 | # 330 | # - name: Build 331 | # run: | 332 | # pushd emsdk-master 333 | # source ./emsdk_env.sh 334 | # popd 335 | # emcmake cmake . 
-DCMAKE_BUILD_TYPE=${{ matrix.build }} 336 | # make 337 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /chat 2 | 3 | *.o 4 | *.a 5 | .cache/ 6 | .vs/ 7 | .vscode/ 8 | .DS_Store 9 | 10 | build/ 11 | build-em/ 12 | build-debug/ 13 | build-release/ 14 | build-static/ 15 | build-no-accel/ 16 | build-sanitize-addr/ 17 | build-sanitize-thread/ 18 | 19 | models/* 20 | *.bin 21 | 22 | /main 23 | /quantize 24 | 25 | arm_neon.h 26 | compile_commands.json 27 | 28 | # Windows CMake files 29 | *.vcxproj 30 | *.filters 31 | *.cmake 32 | *.sln 33 | x64/ 34 | Debug/ 35 | Release/ 36 | CMakeFiles/ 37 | CMakeCache.txt 38 | *.dir/ 39 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.8) 2 | project("alpaca.cpp") 3 | 4 | set(CMAKE_CXX_STANDARD 20) 5 | set(CMAKE_CXX_STANDARD_REQUIRED true) 6 | set(CMAKE_C_STANDARD 11) 7 | 8 | if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) 9 | set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) 10 | set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") 11 | endif() 12 | 13 | option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON) 14 | option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF) 15 | 16 | option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF) 17 | option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF) 18 | option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF) 19 | 20 | if (APPLE) 21 | option(LLAMA_NO_ACCELERATE "llama: disable Accelerate framework" OFF) 22 | option(LLAMA_NO_AVX "llama: disable AVX" OFF) 23 | option(LLAMA_NO_AVX2 "llama: disable AVX2" OFF) 24 | option(LLAMA_NO_FMA "llama: disable FMA" OFF) 25 | endif() 26 | 27 | if (NOT MSVC) 28 | if (LLAMA_SANITIZE_THREAD) 29 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=thread") 30 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=thread") 31 | endif() 32 | 33 | if (LLAMA_SANITIZE_ADDRESS) 34 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -fno-omit-frame-pointer") 35 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer") 36 | endif() 37 | 38 | if (LLAMA_SANITIZE_UNDEFINED) 39 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=undefined") 40 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined") 41 | endif() 42 | endif() 43 | 44 | if (APPLE AND NOT LLAMA_NO_ACCELERATE) 45 | find_library(ACCELERATE_FRAMEWORK Accelerate) 46 | if (ACCELERATE_FRAMEWORK) 47 | message(STATUS "Accelerate framework found") 48 | 49 | set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK}) 50 | set(LLAMA_EXTRA_FLAGS ${LLAMA_EXTRA_FLAGS} -DGGML_USE_ACCELERATE) 51 | else() 52 | message(WARNING "Accelerate framework not found") 53 | endif() 54 | endif() 55 | 56 | if (LLAMA_ALL_WARNINGS) 57 | if (NOT MSVC) 58 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} \ 59 | -Wall \ 60 | -Wextra \ 61 | -Wpedantic \ 62 | -Wshadow \ 63 | -Wcast-qual \ 64 | -Wstrict-prototypes \ 65 | -Wpointer-arith \ 66 | -Wno-unused-function \ 67 | ") 68 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} \ 69 | -Wall \ 70 | -Wextra \ 71 | -Wpedantic \ 72 | -Wcast-qual \ 73 | ") 74 | else() 75 | # todo : msvc 76 | endif() 77 | endif() 78 | 79 | message(STATUS 
"CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") 80 | 81 | if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") 82 | message(STATUS "ARM detected") 83 | else() 84 | message(STATUS "x86 detected") 85 | if (MSVC) 86 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2") 87 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2") 88 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2") 89 | else() 90 | if(NOT LLAMA_NO_AVX) 91 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx") 92 | endif() 93 | if(NOT LLAMA_NO_AVX2) 94 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2") 95 | endif() 96 | if(NOT LLAMA_NO_FMA) 97 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma") 98 | endif() 99 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c") 100 | endif() 101 | endif() 102 | 103 | # if (LLAMA_PERF) 104 | # set(LLAMA_EXTRA_FLAGS ${LLAMA_EXTRA_FLAGS} -DGGML_PERF) 105 | # endif() 106 | 107 | add_executable(main 108 | main.cpp 109 | utils.cpp 110 | utils.h) 111 | 112 | add_executable(quantize 113 | quantize.cpp 114 | utils.cpp 115 | utils.h) 116 | 117 | add_library(ggml 118 | ggml.c 119 | ggml.h) 120 | 121 | target_compile_definitions(ggml PUBLIC ${LLAMA_EXTRA_FLAGS}) 122 | target_compile_definitions(main PUBLIC ${LLAMA_EXTRA_FLAGS}) 123 | target_compile_definitions(quantize PUBLIC ${LLAMA_EXTRA_FLAGS}) 124 | 125 | target_link_libraries(ggml PRIVATE ${LLAMA_EXTRA_LIBS}) 126 | target_include_directories(ggml PUBLIC .) 127 | target_link_libraries(quantize PRIVATE ggml) 128 | target_link_libraries(main PRIVATE ggml) 129 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Georgi Gerganov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ifndef UNAME_S 2 | UNAME_S := $(shell uname -s) 3 | endif 4 | 5 | ifndef UNAME_P 6 | UNAME_P := $(shell uname -p) 7 | endif 8 | 9 | ifndef UNAME_M 10 | UNAME_M := $(shell uname -m) 11 | endif 12 | 13 | CCV := $(shell $(CC) --version | head -n 1) 14 | CXXV := $(shell $(CXX) --version | head -n 1) 15 | 16 | # Mac OS + Arm can report x86_64 17 | # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789 18 | ifeq ($(UNAME_S),Darwin) 19 | ifneq ($(UNAME_P),arm) 20 | SYSCTL_M := $(shell sysctl -n hw.optional.arm64) 21 | ifeq ($(SYSCTL_M),1) 22 | # UNAME_P := arm 23 | # UNAME_M := arm64 24 | warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789) 25 | endif 26 | endif 27 | endif 28 | 29 | # 30 | # Compile flags 31 | # 32 | 33 | CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC 34 | CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC 35 | LDFLAGS = 36 | 37 | # OS specific 38 | # TODO: support Windows 39 | ifeq ($(UNAME_S),Linux) 40 | CFLAGS += -pthread 41 | CXXFLAGS += -pthread 42 | endif 43 | ifeq ($(UNAME_S),Darwin) 44 | CFLAGS += -pthread 45 | CXXFLAGS += -pthread 46 | endif 47 | ifeq ($(UNAME_S),FreeBSD) 48 | CFLAGS += -pthread 49 | CXXFLAGS += -pthread 50 | endif 51 | ifeq ($(UNAME_S),NetBSD) 52 | CFLAGS += -pthread 53 | CXXFLAGS += -pthread 54 | endif 55 | ifeq ($(UNAME_S),Haiku) 56 | CFLAGS += -pthread 57 | CXXFLAGS += -pthread 58 | endif 59 | 60 | # Architecture specific 61 | # TODO: probably these flags need to be tweaked on some architectures 62 | # feel free to update the Makefile for your architecture and send a pull request or issue 63 | ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686)) 64 | ifeq ($(UNAME_S),Darwin) 65 | CFLAGS += -mf16c 66 | AVX1_M := $(shell sysctl machdep.cpu.features) 67 | ifneq (,$(findstring FMA,$(AVX1_M))) 68 | CFLAGS += -mfma 69 | endif 70 | ifneq (,$(findstring AVX1.0,$(AVX1_M))) 71 | CFLAGS += -mavx 72 | endif 73 | AVX2_M := $(shell sysctl machdep.cpu.leaf7_features) 74 | ifneq (,$(findstring AVX2,$(AVX2_M))) 75 | CFLAGS += -mavx2 76 | endif 77 | else ifeq ($(UNAME_S),Linux) 78 | AVX1_M := $(shell grep "avx " /proc/cpuinfo) 79 | ifneq (,$(findstring avx,$(AVX1_M))) 80 | CFLAGS += -mavx 81 | endif 82 | AVX2_M := $(shell grep "avx2 " /proc/cpuinfo) 83 | ifneq (,$(findstring avx2,$(AVX2_M))) 84 | CFLAGS += -mavx2 85 | endif 86 | FMA_M := $(shell grep "fma " /proc/cpuinfo) 87 | ifneq (,$(findstring fma,$(FMA_M))) 88 | CFLAGS += -mfma 89 | endif 90 | F16C_M := $(shell grep "f16c " /proc/cpuinfo) 91 | ifneq (,$(findstring f16c,$(F16C_M))) 92 | CFLAGS += -mf16c 93 | endif 94 | SSE3_M := $(shell grep "sse3 " /proc/cpuinfo) 95 | ifneq (,$(findstring sse3,$(SSE3_M))) 96 | CFLAGS += -msse3 97 | endif 98 | else ifeq ($(UNAME_S),Haiku) 99 | AVX1_M := $(shell sysinfo -cpu | grep "AVX ") 100 | ifneq (,$(findstring avx,$(AVX1_M))) 101 | CFLAGS += -mavx 102 | endif 103 | AVX2_M := $(shell sysinfo -cpu | grep "AVX2 ") 104 | ifneq (,$(findstring avx2,$(AVX2_M))) 105 | CFLAGS += -mavx2 106 | endif 107 | FMA_M := $(shell sysinfo -cpu | grep "FMA ") 108 | ifneq (,$(findstring fma,$(FMA_M))) 109 | CFLAGS += -mfma 110 | endif 111 | F16C_M := $(shell sysinfo -cpu | grep "F16C ") 112 | ifneq (,$(findstring 
f16c,$(F16C_M))) 113 | CFLAGS += -mf16c 114 | endif 115 | else 116 | CFLAGS += -mfma -mf16c -mavx -mavx2 117 | endif 118 | endif 119 | ifeq ($(UNAME_M),amd64) 120 | CFLAGS += -mavx -mavx2 -mfma -mf16c 121 | endif 122 | ifneq ($(filter ppc64%,$(UNAME_M)),) 123 | POWER9_M := $(shell grep "POWER9" /proc/cpuinfo) 124 | ifneq (,$(findstring POWER9,$(POWER9_M))) 125 | CFLAGS += -mpower9-vector 126 | endif 127 | # Require c++23's std::byteswap for big-endian support. 128 | ifeq ($(UNAME_M),ppc64) 129 | CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN 130 | endif 131 | endif 132 | ifndef LLAMA_NO_ACCELERATE 133 | # Mac M1 - include Accelerate framework 134 | ifeq ($(UNAME_S),Darwin) 135 | CFLAGS += -DGGML_USE_ACCELERATE 136 | LDFLAGS += -framework Accelerate 137 | endif 138 | endif 139 | ifdef LLAMA_OPENBLAS 140 | CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas 141 | LDFLAGS += -lopenblas 142 | endif 143 | ifdef LLAMA_GPROF 144 | CFLAGS += -pg 145 | CXXFLAGS += -pg 146 | endif 147 | ifneq ($(filter aarch64%,$(UNAME_M)),) 148 | CFLAGS += -mcpu=native 149 | CXXFLAGS += -mcpu=native 150 | endif 151 | ifneq ($(filter armv6%,$(UNAME_M)),) 152 | # Raspberry Pi 1, 2, 3 153 | CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access 154 | endif 155 | ifneq ($(filter armv7%,$(UNAME_M)),) 156 | # Raspberry Pi 4 157 | CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations 158 | endif 159 | ifneq ($(filter armv8%,$(UNAME_M)),) 160 | # Raspberry Pi 4 161 | CFLAGS += -mfp16-format=ieee -mno-unaligned-access 162 | endif 163 | 164 | # 165 | # Print build information 166 | # 167 | 168 | $(info I llama.cpp build info: ) 169 | $(info I UNAME_S: $(UNAME_S)) 170 | $(info I UNAME_P: $(UNAME_P)) 171 | $(info I UNAME_M: $(UNAME_M)) 172 | $(info I CFLAGS: $(CFLAGS)) 173 | $(info I CXXFLAGS: $(CXXFLAGS)) 174 | $(info I LDFLAGS: $(LDFLAGS)) 175 | $(info I CC: $(CCV)) 176 | $(info I CXX: $(CXXV)) 177 | $(info ) 178 | 179 | default: main quantize 180 | 181 | # 182 | # Build library 183 | # 184 | 185 | ggml.o: ggml.c ggml.h 186 | $(CC) $(CFLAGS) -c ggml.c -o ggml.o 187 | 188 | utils.o: utils.cpp utils.h 189 | $(CXX) $(CXXFLAGS) -c utils.cpp -o utils.o 190 | 191 | clean: 192 | rm -f *.o main quantize 193 | 194 | main: main.cpp ggml.o utils.o 195 | $(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o -o main $(LDFLAGS) 196 | ./main -h 197 | 198 | 199 | quantize: quantize.cpp ggml.o utils.o 200 | $(CXX) $(CXXFLAGS) quantize.cpp ggml.o utils.o -o quantize $(LDFLAGS) 201 | 202 | # 203 | # Tests 204 | # 205 | 206 | .PHONY: tests 207 | tests: 208 | bash ./tests/run-tests.sh 209 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Alpaca.cpp 2 | 3 | Run a fast ChatGPT-like model locally on your device. The screencast below is not sped up and running on an M2 Macbook Air with 4GB of weights. 4 | 5 | 6 | [![asciicast](screencast.gif)](https://asciinema.org/a/dfJ8QXZ4u978Ona59LPEldtKK) 7 | 8 | 9 | This combines the [LLaMA foundation model](https://github.com/facebookresearch/llama) with an [open reproduction](https://github.com/tloen/alpaca-lora) of [Stanford Alpaca](https://github.com/tatsu-lab/stanford_alpaca) a fine-tuning of the base model to obey instructions (akin to the [RLHF](https://huggingface.co/blog/rlhf) used to train ChatGPT) and a set of modifications to [llama.cpp](https://github.com/ggerganov/llama.cpp) to add a chat interface. 
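The Makefile shown earlier also honors a few optional variables (LLAMA_OPENBLAS, LLAMA_GPROF, LLAMA_NO_ACCELERATE). A hedged example of setting them on the command line, assuming that Makefile is used unmodified and that the corresponding libraries (e.g. OpenBLAS headers under /usr/local/include/openblas) are already installed:

```sh
# link ggml against OpenBLAS instead of the default code paths
make clean && make LLAMA_OPENBLAS=1

# build with gprof profiling instrumentation
make clean && make LLAMA_GPROF=1
```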
10 | 11 | ## Get started 12 | 13 | ```sh 14 | git clone https://github.com/antimatter15/alpaca.cpp 15 | cd alpaca.cpp 16 | 17 | make chat 18 | ./chat 19 | ``` 20 | 21 | You can download the weights for `ggml-alpaca-7b-q4.bin` with BitTorrent `magnet:?xt=urn:btih:5aaceaec63b03e51a98f04fd5c42320b2a033010&dn=ggml-alpaca-7b-q4.bin&tr=udp%3A%2F%2Ftracker.opentrackr.org%3A1337%2Fannounce&tr=udp%3A%2F%2Fopentracker.i2p.rocks%3A6969%2Fannounce` 22 | 23 | 24 | Alternatively you can download them with IPFS. 25 | 26 | ``` 27 | # any of these will work 28 | curl -o ggml-alpaca-7b-q4.bin -C - https://gateway.estuary.tech/gw/ipfs/QmQ1bf2BTnYxq73MFJWu1B7bQ2UD6qG7D7YDCxhTndVkPC 29 | curl -o ggml-alpaca-7b-q4.bin -C - https://ipfs.io/ipfs/QmQ1bf2BTnYxq73MFJWu1B7bQ2UD6qG7D7YDCxhTndVkPC 30 | curl -o ggml-alpaca-7b-q4.bin -C - https://cloudflare-ipfs.com/ipfs/QmQ1bf2BTnYxq73MFJWu1B7bQ2UD6qG7D7YDCxhTndVkPC 31 | ``` 32 | 33 | Save the `ggml-alpaca-7b-q4.bin` file in the same directory as your `./chat` executable. 34 | 35 | The weights are based on the published fine-tunes from `alpaca-lora`, converted back into a pytorch checkpoint with a [modified script](https://github.com/tloen/alpaca-lora/pull/19) and then quantized with llama.cpp the regular way. 36 | 37 | ## Windows Setup 38 | 39 | - Download and install CMake: 40 | - Download and install `git`. If you've never used git before, consider a GUI client like 41 | - Clone this repo using your git client of choice (for GitHub Desktop, go to File -> Clone repository -> From URL and paste `https://github.com/antimatter15/alpaca.cpp` in as the URL) 42 | - Open a Windows Terminal inside the folder you cloned the repository to 43 | - Run the following commands one by one: 44 | 45 | ```ps1 46 | cmake . 47 | cmake --build . --config Release 48 | ``` 49 | 50 | - Download the weights via any of the links in "Get started" above, and save the file as `ggml-alpaca-7b-q4.bin` in the main Alpaca directory. 51 | - In the terminal window, run this command: 52 | ```ps1 53 | .\Release\chat.exe 54 | ``` 55 | - (You can add other launch options like `--n 8` as preferred onto the same line) 56 | - You can now type to the AI in the terminal and it will reply. Enjoy! 57 | 58 | ## 13B 59 | 60 | TODO: write more docs here (PRs welcome) 61 | 62 | Torrent: `magnet:?xt=urn:btih:053b3d54d2e77ff020ebddf51dad681f2a651071&dn=ggml-alpaca-13b-q4.bin&tr=udp%3A%2F%2Ftracker.opentrackr.org%3A1337%2Fannounce&tr=udp%3A%2F%2Fopentracker.i2p.rocks%3A6969%2Fannounce&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A6969%2Fannounce&tr=udp%3A%2F%2F9.rarbg.com%3A2810%2Fannounce` 63 | 64 | 65 | ``` 66 | ./chat -m ggml-alpaca-13b-q4.bin 67 | ``` 68 | 69 | ## Credit 70 | 71 | This combines [Facebook's LLaMA](https://github.com/facebookresearch/llama), [Stanford Alpaca](https://crfm.stanford.edu/2023/03/13/alpaca.html), [alpaca-lora](https://github.com/tloen/alpaca-lora) and [corresponding weights](https://huggingface.co/tloen/alpaca-lora-7b/tree/main) by Eric Wang (which uses [Jason Phang's implementation of LLaMA](https://github.com/huggingface/transformers/pull/21955) on top of Hugging Face Transformers), and [llama.cpp](https://github.com/ggerganov/llama.cpp) by Georgi Gerganov. The chat implementation is based on Matvey Soloviev's [Interactive Mode](https://github.com/ggerganov/llama.cpp/pull/61) for llama.cpp. Inspired by [Simon Willison's](https://til.simonwillison.net/llms/llama-7b-m2) getting started guide for LLaMA. 
[Andy Matuschak](https://twitter.com/andy_matuschak/status/1636769182066053120)'s thread on adapting this to 13B, using fine tuning weights by [Sam Witteveen](https://huggingface.co/samwit/alpaca13B-lora). 72 | 73 | 74 | ## Disclaimer 75 | 76 | Note that the model weights are only to be used for research purposes, as they are derivative of LLaMA, and uses the published instruction data from the Stanford Alpaca project which is generated by OpenAI, which itself disallows the usage of its outputs to train competing models. 77 | 78 | 79 | -------------------------------------------------------------------------------- /convert-pth-to-ggml.py: -------------------------------------------------------------------------------- 1 | # Convert a LLaMA model checkpoint to a ggml compatible file 2 | # 3 | # Load the model using Torch 4 | # Iterate over all variables and write them to a binary file. 5 | # 6 | # For each variable, write the following: 7 | # - Number of dimensions (int) 8 | # - Name length (int) 9 | # - Dimensions (int[n_dims]) 10 | # - Name (char[name_length]) 11 | # - Data (float[n_dims]) 12 | # 13 | # By default, the bigger matrices are converted to 16-bit floats. 14 | # This can be disabled by adding the "use-f32" CLI argument. 15 | # 16 | # At the start of the ggml file we write the model parameters 17 | # and vocabulary. 18 | # 19 | 20 | import sys 21 | import json 22 | import struct 23 | import numpy as np 24 | import torch 25 | from sentencepiece import SentencePieceProcessor 26 | 27 | if len(sys.argv) < 3: 28 | print("Usage: convert-ckpt-to-ggml.py dir-model ftype\n") 29 | print(" ftype == 0 -> float32") 30 | print(" ftype == 1 -> float16") 31 | sys.exit(1) 32 | 33 | # output in the same directory as the model 34 | dir_model = sys.argv[1] 35 | 36 | fname_hparams = sys.argv[1] + "/params.json" 37 | fname_tokenizer = sys.argv[1] + "/../tokenizer.model" 38 | 39 | def get_n_parts(dim): 40 | if dim == 4096: 41 | return 1 42 | elif dim == 5120: 43 | return 2 44 | elif dim == 6656: 45 | return 4 46 | elif dim == 8192: 47 | return 8 48 | else: 49 | print("Invalid dim: " + str(dim)) 50 | sys.exit(1) 51 | 52 | # possible data types 53 | # ftype == 0 -> float32 54 | # ftype == 1 -> float16 55 | # 56 | # map from ftype to string 57 | ftype_str = ["f32", "f16"] 58 | 59 | ftype = 1 60 | if len(sys.argv) > 2: 61 | ftype = int(sys.argv[2]) 62 | if ftype < 0 or ftype > 1: 63 | print("Invalid ftype: " + str(ftype)) 64 | sys.exit(1) 65 | fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin" 66 | 67 | with open(fname_hparams, "r") as f: 68 | hparams = json.load(f) 69 | 70 | tokenizer = SentencePieceProcessor(fname_tokenizer) 71 | 72 | hparams.update({"vocab_size": tokenizer.vocab_size()}) 73 | 74 | n_parts = get_n_parts(hparams["dim"]) 75 | 76 | print(hparams) 77 | print('n_parts = ', n_parts) 78 | 79 | for p in range(n_parts): 80 | print('Processing part ', p) 81 | 82 | #fname_model = sys.argv[1] + "/consolidated.00.pth" 83 | fname_model = sys.argv[1] + "/consolidated.0" + str(p) + ".pth" 84 | fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin" 85 | if (p > 0): 86 | fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin" + "." 
+ str(p) 87 | 88 | model = torch.load(fname_model, map_location="cpu") 89 | 90 | fout = open(fname_out, "wb") 91 | 92 | fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex 93 | fout.write(struct.pack("i", hparams["vocab_size"])) 94 | fout.write(struct.pack("i", hparams["dim"])) 95 | fout.write(struct.pack("i", hparams["multiple_of"])) 96 | fout.write(struct.pack("i", hparams["n_heads"])) 97 | fout.write(struct.pack("i", hparams["n_layers"])) 98 | fout.write(struct.pack("i", hparams["dim"] // hparams["n_heads"])) # rot (obsolete) 99 | fout.write(struct.pack("i", ftype)) 100 | 101 | # Is this correct?? 102 | for i in range(tokenizer.vocab_size()): 103 | if tokenizer.is_unknown(i): 104 | # "" token (translated as ??) 105 | text = " \u2047 ".encode("utf-8") 106 | fout.write(struct.pack("i", len(text))) 107 | fout.write(text) 108 | elif tokenizer.is_control(i): 109 | # ""/"" tokens 110 | fout.write(struct.pack("i", 0)) 111 | elif tokenizer.is_byte(i): 112 | # "" tokens (which may be invalid UTF-8) 113 | piece = tokenizer.id_to_piece(i) 114 | if len(piece) != 6: 115 | print("Invalid token: " + piece) 116 | sys.exit(1) 117 | byte_value = int(piece[3:-1], 16) 118 | fout.write(struct.pack("i", 1)) 119 | fout.write(struct.pack("B", byte_value)) 120 | else: 121 | # normal token. Uses U+2581 (LOWER ONE EIGHTH BLOCK) to represent spaces. 122 | text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8") 123 | fout.write(struct.pack("i", len(text))) 124 | fout.write(text) 125 | 126 | for k, v in model.items(): 127 | name = k 128 | shape = v.shape 129 | 130 | # skip layers.X.attention.inner_attention.rope.freqs 131 | if name[-5:] == "freqs": 132 | continue 133 | 134 | print("Processing variable: " + name + " with shape: ", shape, " and type: ", v.dtype) 135 | 136 | #data = tf.train.load_variable(dir_model, name).squeeze() 137 | data = v.numpy().squeeze() 138 | n_dims = len(data.shape); 139 | 140 | # for efficiency - transpose some matrices 141 | # "model/h.*/attn/c_attn/w" 142 | # "model/h.*/attn/c_proj/w" 143 | # "model/h.*/mlp/c_fc/w" 144 | # "model/h.*/mlp/c_proj/w" 145 | #if name[-14:] == "/attn/c_attn/w" or \ 146 | # name[-14:] == "/attn/c_proj/w" or \ 147 | # name[-11:] == "/mlp/c_fc/w" or \ 148 | # name[-13:] == "/mlp/c_proj/w": 149 | # print(" Transposing") 150 | # data = data.transpose() 151 | 152 | dshape = data.shape 153 | 154 | # default type is fp16 155 | ftype_cur = 1 156 | if ftype == 0 or n_dims == 1: 157 | print(" Converting to float32") 158 | data = data.astype(np.float32) 159 | ftype_cur = 0 160 | 161 | # header 162 | sname = name.encode('utf-8') 163 | fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur)) 164 | for i in range(n_dims): 165 | fout.write(struct.pack("i", dshape[n_dims - 1 - i])) 166 | fout.write(sname); 167 | 168 | # data 169 | data.tofile(fout) 170 | 171 | # I hope this deallocates the memory .. 172 | model = None 173 | 174 | fout.close() 175 | 176 | print("Done. Output file: " + fname_out + ", (part ", p, ")") 177 | print("") 178 | -------------------------------------------------------------------------------- /ggml.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | // 4 | // GGML Tensor Library 5 | // 6 | // This documentation is still a work in progress. 
7 | // If you wish some specific topics to be covered, feel free to drop a comment: 8 | // 9 | // https://github.com/ggerganov/whisper.cpp/issues/40 10 | // 11 | // ## Overview 12 | // 13 | // This library implements: 14 | // 15 | // - a set of tensor operations 16 | // - automatic differentiation 17 | // - basic optimization algorithms 18 | // 19 | // The aim of this library is to provide a minimalistic approach for various machine learning tasks. This includes, 20 | // but is not limited to, the following: 21 | // 22 | // - linear regression 23 | // - support vector machines 24 | // - neural networks 25 | // 26 | // The library allows the user to define a certain function using the available tensor operations. This function 27 | // definition is represented internally via a computation graph. Each tensor operation in the function definition 28 | // corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the 29 | // function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized 30 | // using one of the available optimization algorithms. 31 | // 32 | // For example, here we define the function: f(x) = a*x^2 + b 33 | // 34 | // { 35 | // struct ggml_init_params params = { 36 | // .mem_size = 16*1024*1024, 37 | // .mem_buffer = NULL, 38 | // }; 39 | // 40 | // // memory allocation happens here 41 | // struct ggml_context * ctx = ggml_init(params); 42 | // 43 | // struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); 44 | // 45 | // ggml_set_param(ctx, x); // x is an input variable 46 | // 47 | // struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); 48 | // struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); 49 | // struct ggml_tensor * x2 = ggml_mul(ctx, x, x); 50 | // struct ggml_tensor * f = ggml_add(ctx, ggml_mul(ctx, a, x2), b); 51 | // 52 | // ... 53 | // } 54 | // 55 | // Notice that the function definition above does not involve any actual computation. The computation is performed only 56 | // when the user explicitly requests it. For example, to compute the function's value at x = 2.0: 57 | // 58 | // { 59 | // ... 60 | // 61 | // struct ggml_cgraph gf = ggml_build_forward(f); 62 | // 63 | // // set the input variable and parameter values 64 | // ggml_set_f32(x, 2.0f); 65 | // ggml_set_f32(a, 3.0f); 66 | // ggml_set_f32(b, 4.0f); 67 | // 68 | // ggml_graph_compute(ctx0, &gf); 69 | // 70 | // printf("f = %f\n", ggml_get_f32_1d(f, 0)); 71 | // 72 | // ... 73 | // } 74 | // 75 | // The actual computation is performed in the ggml_graph_compute() function. 76 | // 77 | // The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the 78 | // ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know 79 | // in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory 80 | // and after defining the computation graph, call the ggml_used_mem() function to find out how much memory was 81 | // actually needed. 82 | // 83 | // The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic 84 | // differentiation and optimization algorithms. 85 | // 86 | // The described approach allows to define the function graph once and then compute its forward or backward graphs 87 | // multiple times. 
All computations will use the same memory buffer allocated in the ggml_init() function. This way 88 | // the user can avoid the memory allocation overhead at runtime. 89 | // 90 | // The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class 91 | // citizens, but in theory the library can be extended to support FP8 and integer data types. 92 | // 93 | // Each tensor operation produces a new tensor. Initially the library was envisioned to support only the use of unary 94 | // and binary operations. Most of the available operations fall into one of these two categories. With time, it became 95 | // clear that the library needs to support more complex operations. The way to support these operations is not clear 96 | // yet, but a few examples are demonstrated in the following operations: 97 | // 98 | // - ggml_permute() 99 | // - ggml_conv_1d_1s() 100 | // - ggml_conv_1d_2s() 101 | // 102 | // For each tensor operator, the library implements a forward and backward computation function. The forward function 103 | // computes the output tensor value given the input tensor values. The backward function computes the adjoint of the 104 | // input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a 105 | // calculus class, or watch the following video: 106 | // 107 | // What is Automatic Differentiation? 108 | // https://www.youtube.com/watch?v=wG_nF1awSSY 109 | // 110 | // 111 | // ## Tensor data (struct ggml_tensor) 112 | // 113 | // The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of 114 | // the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains 115 | // pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example: 116 | // 117 | // { 118 | // struct ggml_tensor * c = ggml_add(ctx, a, b); 119 | // 120 | // assert(c->src[0] == a); 121 | // assert(c->src[1] == b); 122 | // } 123 | // 124 | // The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the 125 | // number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This allows 126 | // to store tensors that are not contiguous in memory, which is useful for operations such as transposition and 127 | // permutation. All tensor operations have to take the stride into account and not assume that the tensor is 128 | // contiguous in memory. 129 | // 130 | // The data of the tensor is accessed via the "data" pointer. For example: 131 | // 132 | // { 133 | // struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3); 134 | // 135 | // // a[1, 2] = 1.0f; 136 | // *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f; 137 | // 138 | // // a[2, 0] = 2.0f; 139 | // *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f; 140 | // 141 | // ... 142 | // } 143 | // 144 | // Alternatively, there are helper functions, such as ggml_get_f32_1d() and ggml_set_f32_1d() that can be used. 
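//
// As a hedged sketch (an addition to these notes, not part of the original text), the automatic differentiation
// described above can be driven with the functions declared later in this header. Continuing the earlier
// f(x) = a*x^2 + b example, where x was marked as an input variable with ggml_set_param(), the backward graph is
// derived from the forward graph, the output gradient is seeded, and the input gradient is read back. Only
// ggml_build_forward(), ggml_build_backward(), ggml_graph_compute(), ggml_graph_reset(), ggml_set_f32(),
// ggml_get_f32_1d() and the per-tensor "grad" pointer are used; the exact driving sequence is an assumption and
// may differ between versions.
//
// {
//     ...
//
//     struct ggml_cgraph gf = ggml_build_forward (f);
//     struct ggml_cgraph gb = ggml_build_backward(ctx, &gf, false);
//
//     ggml_set_f32(x, 2.0f);
//     ggml_set_f32(a, 3.0f);
//     ggml_set_f32(b, 4.0f);
//
//     ggml_graph_compute(ctx, &gf);        // forward pass: f = 3*2^2 + 4 = 16
//
//     ggml_graph_reset(&gf);               // zero all gradients in the graph
//     ggml_set_f32(f->grad, 1.0f);         // seed df/df = 1
//     ggml_graph_compute(ctx, &gb);        // backward pass
//
//     printf("df/dx = %f\n", ggml_get_f32_1d(x->grad, 0));   // expect 2*a*x = 12
//
//     ...
// }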
145 | // 146 | // ## The matrix multiplication operator (ggml_mul_mat) 147 | // 148 | // TODO 149 | // 150 | // 151 | // ## Multi-threading 152 | // 153 | // TODO 154 | // 155 | // 156 | // ## Overview of ggml.c 157 | // 158 | // TODO 159 | // 160 | // 161 | // ## SIMD optimizations 162 | // 163 | // TODO 164 | // 165 | // 166 | // ## Debugging ggml 167 | // 168 | // TODO 169 | // 170 | // 171 | 172 | #ifdef __cplusplus 173 | extern "C" { 174 | #endif 175 | 176 | #include 177 | #include 178 | #include 179 | 180 | #define GGML_MAX_DIMS 4 181 | #define GGML_MAX_NODES 4096 182 | #define GGML_MAX_PARAMS 16 183 | #define GGML_MAX_CONTEXTS 64 184 | #define GGML_MAX_OPT 4 185 | 186 | #ifdef __ARM_NEON 187 | // we use the built-in 16-bit float type 188 | typedef __fp16 ggml_fp16_t; 189 | #else 190 | typedef uint16_t ggml_fp16_t; 191 | #endif 192 | 193 | // convert FP16 <-> FP32 194 | float ggml_fp16_to_fp32(ggml_fp16_t x); 195 | ggml_fp16_t ggml_fp32_to_fp16(float x); 196 | 197 | struct ggml_object; 198 | struct ggml_context; 199 | 200 | enum ggml_type { 201 | GGML_TYPE_Q4_0, 202 | GGML_TYPE_Q4_1, 203 | GGML_TYPE_I8, 204 | GGML_TYPE_I16, 205 | GGML_TYPE_I32, 206 | GGML_TYPE_F16, 207 | GGML_TYPE_F32, 208 | GGML_TYPE_COUNT, 209 | }; 210 | 211 | // available tensor operations: 212 | enum ggml_op { 213 | GGML_OP_NONE = 0, 214 | 215 | GGML_OP_DUP, 216 | GGML_OP_ADD, 217 | GGML_OP_SUB, 218 | GGML_OP_MUL, 219 | GGML_OP_DIV, 220 | GGML_OP_SQR, 221 | GGML_OP_SQRT, 222 | GGML_OP_SUM, 223 | GGML_OP_MEAN, 224 | GGML_OP_REPEAT, 225 | GGML_OP_ABS, 226 | GGML_OP_SGN, 227 | GGML_OP_NEG, 228 | GGML_OP_STEP, 229 | GGML_OP_RELU, 230 | GGML_OP_GELU, 231 | GGML_OP_SILU, 232 | GGML_OP_NORM, // normalize 233 | GGML_OP_RMS_NORM, 234 | 235 | GGML_OP_MUL_MAT, 236 | 237 | GGML_OP_SCALE, 238 | GGML_OP_CPY, 239 | GGML_OP_RESHAPE, 240 | GGML_OP_VIEW, 241 | GGML_OP_PERMUTE, 242 | GGML_OP_TRANSPOSE, 243 | GGML_OP_GET_ROWS, 244 | GGML_OP_DIAG_MASK_INF, 245 | GGML_OP_SOFT_MAX, 246 | GGML_OP_ROPE, 247 | GGML_OP_CONV_1D_1S, 248 | GGML_OP_CONV_1D_2S, 249 | 250 | GGML_OP_FLASH_ATTN, 251 | GGML_OP_FLASH_FF, 252 | 253 | GGML_OP_COUNT, 254 | }; 255 | 256 | // n-dimensional tensor 257 | struct ggml_tensor { 258 | enum ggml_type type; 259 | 260 | int n_dims; 261 | int ne[GGML_MAX_DIMS]; // number of elements 262 | size_t nb[GGML_MAX_DIMS]; // stride in bytes: 263 | // nb[0] = sizeof(type) 264 | // nb[1] = nb[0] * ne[0] + padding 265 | // nb[i] = nb[i-1] * ne[i-1] 266 | 267 | // compute data 268 | enum ggml_op op; 269 | 270 | bool is_param; 271 | 272 | struct ggml_tensor * grad; 273 | struct ggml_tensor * src0; 274 | struct ggml_tensor * src1; 275 | struct ggml_tensor * opt[GGML_MAX_OPT]; 276 | 277 | // thread scheduling 278 | int n_tasks; 279 | 280 | // performance 281 | int perf_runs; 282 | int64_t perf_cycles; 283 | int64_t perf_time_us; 284 | 285 | void * data; 286 | char padding[8]; 287 | }; 288 | 289 | // computation graph 290 | struct ggml_cgraph { 291 | int n_nodes; 292 | int n_leafs; 293 | int n_threads; 294 | 295 | size_t work_size; 296 | struct ggml_tensor * work; 297 | 298 | struct ggml_tensor * nodes[GGML_MAX_NODES]; 299 | struct ggml_tensor * grads[GGML_MAX_NODES]; 300 | struct ggml_tensor * leafs[GGML_MAX_NODES]; 301 | 302 | // performance 303 | int perf_runs; 304 | int64_t perf_cycles; 305 | int64_t perf_time_us; 306 | }; 307 | 308 | // scratch buffer 309 | struct ggml_scratch { 310 | size_t offs; 311 | size_t size; 312 | void * data; 313 | }; 314 | 315 | struct ggml_init_params { 316 | // memory pool 317 | size_t mem_size; 
// bytes 318 | void * mem_buffer; // if NULL, memory will be allocated internally 319 | }; 320 | 321 | void ggml_time_init(void); // call this once at the beginning of the program 322 | int64_t ggml_time_ms(void); 323 | int64_t ggml_time_us(void); 324 | int64_t ggml_cycles(void); 325 | int64_t ggml_cycles_per_ms(void); 326 | 327 | void ggml_print_object (const struct ggml_object * obj); 328 | void ggml_print_objects(const struct ggml_context * ctx); 329 | 330 | int ggml_nelements(const struct ggml_tensor * tensor); 331 | size_t ggml_nbytes (const struct ggml_tensor * tensor); 332 | 333 | int ggml_blck_size (enum ggml_type type); 334 | size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block 335 | float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float 336 | 337 | size_t ggml_element_size(const struct ggml_tensor * tensor); 338 | 339 | struct ggml_context * ggml_init(struct ggml_init_params params); 340 | void ggml_free(struct ggml_context * ctx); 341 | 342 | size_t ggml_used_mem(const struct ggml_context * ctx); 343 | 344 | size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch); 345 | 346 | struct ggml_tensor * ggml_new_tensor( 347 | struct ggml_context * ctx, 348 | enum ggml_type type, 349 | int n_dims, 350 | const int *ne); 351 | 352 | struct ggml_tensor * ggml_new_tensor_1d( 353 | struct ggml_context * ctx, 354 | enum ggml_type type, 355 | int ne0); 356 | 357 | struct ggml_tensor * ggml_new_tensor_2d( 358 | struct ggml_context * ctx, 359 | enum ggml_type type, 360 | int ne0, 361 | int ne1); 362 | 363 | struct ggml_tensor * ggml_new_tensor_3d( 364 | struct ggml_context * ctx, 365 | enum ggml_type type, 366 | int ne0, 367 | int ne1, 368 | int ne2); 369 | 370 | struct ggml_tensor * ggml_new_tensor_4d( 371 | struct ggml_context * ctx, 372 | enum ggml_type type, 373 | int ne0, 374 | int ne1, 375 | int ne2, 376 | int ne3); 377 | 378 | struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value); 379 | struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value); 380 | 381 | struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src); 382 | struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src); 383 | 384 | struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); 385 | struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value); 386 | struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value); 387 | 388 | int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i); 389 | void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value); 390 | 391 | float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i); 392 | void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value); 393 | 394 | void * ggml_get_data (const struct ggml_tensor * tensor); 395 | float * ggml_get_data_f32(const struct ggml_tensor * tensor); 396 | 397 | // 398 | // operations on tensors with backpropagation 399 | // 400 | 401 | struct ggml_tensor * ggml_dup( 402 | struct ggml_context * ctx, 403 | struct ggml_tensor * a); 404 | 405 | struct ggml_tensor * ggml_add( 406 | struct ggml_context * ctx, 407 | struct ggml_tensor * a, 408 | struct ggml_tensor * b); 409 | 410 | struct ggml_tensor * ggml_sub( 411 | struct ggml_context * ctx, 412 | struct ggml_tensor * a, 413 | struct ggml_tensor * b); 414 | 415 | struct ggml_tensor * ggml_mul( 416 | struct 
ggml_context * ctx, 417 | struct ggml_tensor * a, 418 | struct ggml_tensor * b); 419 | 420 | struct ggml_tensor * ggml_div( 421 | struct ggml_context * ctx, 422 | struct ggml_tensor * a, 423 | struct ggml_tensor * b); 424 | 425 | struct ggml_tensor * ggml_sqr( 426 | struct ggml_context * ctx, 427 | struct ggml_tensor * a); 428 | 429 | struct ggml_tensor * ggml_sqrt( 430 | struct ggml_context * ctx, 431 | struct ggml_tensor * a); 432 | 433 | // return scalar 434 | // TODO: compute sum along rows 435 | struct ggml_tensor * ggml_sum( 436 | struct ggml_context * ctx, 437 | struct ggml_tensor * a); 438 | 439 | // mean along rows 440 | struct ggml_tensor * ggml_mean( 441 | struct ggml_context * ctx, 442 | struct ggml_tensor * a); 443 | 444 | // if a is the same shape as b, and a is not parameter, return a 445 | // otherwise, return a new tensor: repeat(a) to fit in b 446 | struct ggml_tensor * ggml_repeat( 447 | struct ggml_context * ctx, 448 | struct ggml_tensor * a, 449 | struct ggml_tensor * b); 450 | 451 | struct ggml_tensor * ggml_abs( 452 | struct ggml_context * ctx, 453 | struct ggml_tensor * a); 454 | 455 | struct ggml_tensor * ggml_sgn( 456 | struct ggml_context * ctx, 457 | struct ggml_tensor * a); 458 | 459 | struct ggml_tensor * ggml_neg( 460 | struct ggml_context * ctx, 461 | struct ggml_tensor * a); 462 | 463 | struct ggml_tensor * ggml_step( 464 | struct ggml_context * ctx, 465 | struct ggml_tensor * a); 466 | 467 | struct ggml_tensor * ggml_relu( 468 | struct ggml_context * ctx, 469 | struct ggml_tensor * a); 470 | 471 | // TODO: double-check this computation is correct 472 | struct ggml_tensor * ggml_gelu( 473 | struct ggml_context * ctx, 474 | struct ggml_tensor * a); 475 | 476 | struct ggml_tensor * ggml_silu( 477 | struct ggml_context * ctx, 478 | struct ggml_tensor * a); 479 | 480 | // normalize along rows 481 | // TODO: eps is hardcoded to 1e-5 for now 482 | struct ggml_tensor * ggml_norm( 483 | struct ggml_context * ctx, 484 | struct ggml_tensor * a); 485 | 486 | struct ggml_tensor * ggml_rms_norm( 487 | struct ggml_context * ctx, 488 | struct ggml_tensor * a); 489 | 490 | // A: m rows, n columns 491 | // B: p rows, n columns (i.e. 
we transpose it internally) 492 | // result is m columns, p rows 493 | struct ggml_tensor * ggml_mul_mat( 494 | struct ggml_context * ctx, 495 | struct ggml_tensor * a, 496 | struct ggml_tensor * b); 497 | 498 | // 499 | // operations on tensors without backpropagation 500 | // 501 | 502 | // in-place, returns view(a) 503 | struct ggml_tensor * ggml_scale( 504 | struct ggml_context * ctx, 505 | struct ggml_tensor * a, 506 | struct ggml_tensor * b); 507 | 508 | // a -> b, return view(b) 509 | struct ggml_tensor * ggml_cpy( 510 | struct ggml_context * ctx, 511 | struct ggml_tensor * a, 512 | struct ggml_tensor * b); 513 | 514 | // return view(a), b specifies the new shape 515 | // TODO: when we start computing gradient, make a copy instead of view 516 | struct ggml_tensor * ggml_reshape( 517 | struct ggml_context * ctx, 518 | struct ggml_tensor * a, 519 | struct ggml_tensor * b); 520 | 521 | // return view(a) 522 | // TODO: when we start computing gradient, make a copy instead of view 523 | struct ggml_tensor * ggml_reshape_2d( 524 | struct ggml_context * ctx, 525 | struct ggml_tensor * a, 526 | int ne0, 527 | int ne1); 528 | 529 | // return view(a) 530 | // TODO: when we start computing gradient, make a copy instead of view 531 | struct ggml_tensor * ggml_reshape_3d( 532 | struct ggml_context * ctx, 533 | struct ggml_tensor * a, 534 | int ne0, 535 | int ne1, 536 | int ne2); 537 | 538 | // offset in bytes 539 | struct ggml_tensor * ggml_view_1d( 540 | struct ggml_context * ctx, 541 | struct ggml_tensor * a, 542 | int ne0, 543 | size_t offset); 544 | 545 | struct ggml_tensor * ggml_view_2d( 546 | struct ggml_context * ctx, 547 | struct ggml_tensor * a, 548 | int ne0, 549 | int ne1, 550 | size_t nb1, // row stride in bytes 551 | size_t offset); 552 | 553 | struct ggml_tensor * ggml_permute( 554 | struct ggml_context * ctx, 555 | struct ggml_tensor * a, 556 | int axis0, 557 | int axis1, 558 | int axis2, 559 | int axis3); 560 | 561 | // alias for ggml_permute(ctx, a, 1, 0, 2, 3) 562 | struct ggml_tensor * ggml_transpose( 563 | struct ggml_context * ctx, 564 | struct ggml_tensor * a); 565 | 566 | struct ggml_tensor * ggml_get_rows( 567 | struct ggml_context * ctx, 568 | struct ggml_tensor * a, 569 | struct ggml_tensor * b); 570 | 571 | // set elements above the diagonal to -INF 572 | // in-place, returns view(a) 573 | struct ggml_tensor * ggml_diag_mask_inf( 574 | struct ggml_context * ctx, 575 | struct ggml_tensor * a, 576 | int n_past); 577 | 578 | // in-place, returns view(a) 579 | struct ggml_tensor * ggml_soft_max( 580 | struct ggml_context * ctx, 581 | struct ggml_tensor * a); 582 | 583 | // rotary position embedding 584 | // in-place, returns view(a) 585 | // if mode == 1, skip n_past elements 586 | // TODO: avoid creating a new tensor every time 587 | struct ggml_tensor * ggml_rope( 588 | struct ggml_context * ctx, 589 | struct ggml_tensor * a, 590 | int n_past, 591 | int n_dims, 592 | int mode); 593 | 594 | // padding = 1 595 | // TODO: we don't support extra parameters for now 596 | // that's why we are hard-coding the stride, padding, and dilation 597 | // not great .. 
598 | struct ggml_tensor * ggml_conv_1d_1s( 599 | struct ggml_context * ctx, 600 | struct ggml_tensor * a, 601 | struct ggml_tensor * b); 602 | 603 | struct ggml_tensor * ggml_conv_1d_2s( 604 | struct ggml_context * ctx, 605 | struct ggml_tensor * a, 606 | struct ggml_tensor * b); 607 | 608 | struct ggml_tensor * ggml_flash_attn( 609 | struct ggml_context * ctx, 610 | struct ggml_tensor * q, 611 | struct ggml_tensor * k, 612 | struct ggml_tensor * v, 613 | bool masked); 614 | 615 | struct ggml_tensor * ggml_flash_ff( 616 | struct ggml_context * ctx, 617 | struct ggml_tensor * a, 618 | struct ggml_tensor * b0, 619 | struct ggml_tensor * b1, 620 | struct ggml_tensor * c0, 621 | struct ggml_tensor * c1); 622 | 623 | // 624 | // automatic differentiation 625 | // 626 | 627 | void ggml_set_param( 628 | struct ggml_context * ctx, 629 | struct ggml_tensor * tensor); 630 | 631 | void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); 632 | 633 | struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor); 634 | struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep); 635 | 636 | void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph); 637 | void ggml_graph_reset (struct ggml_cgraph * cgraph); 638 | 639 | // print info and performance information for the graph 640 | void ggml_graph_print(const struct ggml_cgraph * cgraph); 641 | 642 | // dump the graph into a file using the dot format 643 | void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename); 644 | 645 | // 646 | // optimization 647 | // 648 | 649 | // optimization methods 650 | enum ggml_opt_type { 651 | GGML_OPT_ADAM, 652 | GGML_OPT_LBFGS, 653 | }; 654 | 655 | // linesearch methods 656 | enum ggml_linesearch { 657 | GGML_LINESEARCH_DEFAULT = 1, 658 | 659 | GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0, 660 | GGML_LINESEARCH_BACKTRACKING_WOLFE = 1, 661 | GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2, 662 | }; 663 | 664 | // optimization return values 665 | enum ggml_opt_result { 666 | GGML_OPT_OK = 0, 667 | GGML_OPT_DID_NOT_CONVERGE, 668 | GGML_OPT_NO_CONTEXT, 669 | GGML_OPT_INVALID_WOLFE, 670 | GGML_OPT_FAIL, 671 | 672 | GGML_LINESEARCH_FAIL = -128, 673 | GGML_LINESEARCH_MINIMUM_STEP, 674 | GGML_LINESEARCH_MAXIMUM_STEP, 675 | GGML_LINESEARCH_MAXIMUM_ITERATIONS, 676 | GGML_LINESEARCH_INVALID_PARAMETERS, 677 | }; 678 | 679 | // optimization parameters 680 | // 681 | // see ggml.c (ggml_opt_default_params) for default values 682 | // 683 | struct ggml_opt_params { 684 | enum ggml_opt_type type; 685 | 686 | int n_threads; 687 | 688 | // delta-based convergence test 689 | // 690 | // if past == 0 - disabled 691 | // if past > 0: 692 | // stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|) 693 | // 694 | int past; 695 | float delta; 696 | 697 | // maximum number of iterations without improvement 698 | // 699 | // if 0 - disabled 700 | // if > 0: 701 | // assume convergence if no cost improvement in this number of iterations 702 | // 703 | int max_no_improvement; 704 | 705 | bool print_forward_graph; 706 | bool print_backward_graph; 707 | 708 | // ADAM parameters 709 | struct { 710 | int n_iter; 711 | 712 | float alpha; // learning rate 713 | float beta1; 714 | float beta2; 715 | float eps; // epsilon for numerical stability 716 | float eps_f; // epsilon for convergence test 717 | float eps_g; // epsilon for convergence test 718 | } adam; 719 | 720 | // LBFGS parameters 721 | struct { 722 | 
int m; // number of corrections to approximate the inv. Hessian 723 | int n_iter; 724 | int max_linesearch; 725 | 726 | float eps; // convergence tolerance 727 | float ftol; // line search tolerance 728 | float wolfe; 729 | float min_step; 730 | float max_step; 731 | 732 | enum ggml_linesearch linesearch; 733 | } lbfgs; 734 | }; 735 | 736 | struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type); 737 | 738 | // optimize the function defined by the tensor f 739 | enum ggml_opt_result ggml_opt( 740 | struct ggml_context * ctx, 741 | struct ggml_opt_params params, 742 | struct ggml_tensor * f); 743 | 744 | // 745 | // system info 746 | // 747 | 748 | int ggml_cpu_has_avx(void); 749 | int ggml_cpu_has_avx2(void); 750 | int ggml_cpu_has_avx512(void); 751 | int ggml_cpu_has_fma(void); 752 | int ggml_cpu_has_neon(void); 753 | int ggml_cpu_has_arm_fma(void); 754 | int ggml_cpu_has_f16c(void); 755 | int ggml_cpu_has_fp16_va(void); 756 | int ggml_cpu_has_wasm_simd(void); 757 | int ggml_cpu_has_blas(void); 758 | int ggml_cpu_has_sse3(void); 759 | int ggml_cpu_has_vsx(void); 760 | 761 | #ifdef __cplusplus 762 | } 763 | #endif 764 | -------------------------------------------------------------------------------- /main.cpp: -------------------------------------------------------------------------------- 1 | #define NOMINMAX 2 | #include "ggml.h" 3 | 4 | #include "utils.h" 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) 16 | #include 17 | #include 18 | #elif defined (_WIN32) 19 | #include 20 | #include 21 | #endif 22 | 23 | #define ANSI_COLOR_RED "\x1b[31m" 24 | #define ANSI_COLOR_GREEN "\x1b[32m" 25 | #define ANSI_COLOR_YELLOW "\x1b[33m" 26 | #define ANSI_COLOR_BLUE "\x1b[34m" 27 | #define ANSI_COLOR_MAGENTA "\x1b[35m" 28 | #define ANSI_COLOR_CYAN "\x1b[36m" 29 | #define ANSI_COLOR_RESET "\x1b[0m" 30 | #define ANSI_BOLD "\x1b[1m" 31 | 32 | // determine number of model parts based on the dimension 33 | static const std::map LLAMA_N_PARTS = { 34 | { 4096, 1 }, 35 | { 5120, 1 }, 36 | { 6656, 1 }, 37 | { 8192, 1 }, 38 | }; 39 | 40 | // default hparams (LLaMA 7B) 41 | struct llama_hparams { 42 | int32_t n_vocab = 32000; 43 | int32_t n_ctx = 512; // this is provided as user input? 
44 | int32_t n_embd = 4096; 45 | int32_t n_mult = 256; 46 | int32_t n_head = 32; 47 | int32_t n_layer = 32; 48 | int32_t n_rot = 64; 49 | int32_t f16 = 1; 50 | }; 51 | 52 | struct llama_layer { 53 | // normalization 54 | struct ggml_tensor * attention_norm; 55 | 56 | // attention 57 | struct ggml_tensor * wq; 58 | struct ggml_tensor * wk; 59 | struct ggml_tensor * wv; 60 | struct ggml_tensor * wo; 61 | 62 | // normalization 63 | struct ggml_tensor * ffn_norm; 64 | 65 | // ff 66 | struct ggml_tensor * w1; 67 | struct ggml_tensor * w2; 68 | struct ggml_tensor * w3; 69 | }; 70 | 71 | struct llama_model { 72 | llama_hparams hparams; 73 | 74 | struct ggml_tensor * tok_embeddings; 75 | 76 | struct ggml_tensor * norm; 77 | struct ggml_tensor * output; 78 | 79 | std::vector layers; 80 | 81 | // key + value memory 82 | struct ggml_tensor * memory_k; 83 | struct ggml_tensor * memory_v; 84 | 85 | // 86 | struct ggml_context * ctx; 87 | std::map tensors; 88 | }; 89 | 90 | // load the model's weights from a file 91 | bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) { 92 | fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); 93 | 94 | std::vector f_buf(1024*1024); 95 | 96 | auto fin = std::ifstream(fname, std::ios::binary); 97 | fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size()); 98 | if (!fin) { 99 | fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); 100 | return false; 101 | } 102 | 103 | // verify magic 104 | { 105 | uint32_t magic; 106 | fin.read((char *) &magic, sizeof(magic)); 107 | if (magic != 0x67676d6c) { 108 | fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); 109 | return false; 110 | } 111 | } 112 | 113 | int n_ff = 0; 114 | int n_parts = 0; 115 | 116 | // load hparams 117 | { 118 | auto & hparams = model.hparams; 119 | 120 | fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); 121 | //fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); 122 | fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); 123 | fin.read((char *) &hparams.n_mult, sizeof(hparams.n_mult)); 124 | fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); 125 | fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); 126 | fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot)); 127 | fin.read((char *) &hparams.f16, sizeof(hparams.f16)); 128 | 129 | hparams.n_ctx = n_ctx; 130 | 131 | n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult; 132 | n_parts = LLAMA_N_PARTS.at(hparams.n_embd); 133 | 134 | // fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab); 135 | // fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx); 136 | // fprintf(stderr, "%s: n_embd = %d\n", __func__, hparams.n_embd); 137 | // fprintf(stderr, "%s: n_mult = %d\n", __func__, hparams.n_mult); 138 | // fprintf(stderr, "%s: n_head = %d\n", __func__, hparams.n_head); 139 | // fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer); 140 | // fprintf(stderr, "%s: n_rot = %d\n", __func__, hparams.n_rot); 141 | // fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16); 142 | // fprintf(stderr, "%s: n_ff = %d\n", __func__, n_ff); 143 | // fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts); 144 | } 145 | 146 | // load vocab 147 | { 148 | const int32_t n_vocab = model.hparams.n_vocab; 149 | 150 | if (n_vocab != model.hparams.n_vocab) { 151 | fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", 152 | 
__func__, fname.c_str(), n_vocab, model.hparams.n_vocab); 153 | return false; 154 | } 155 | 156 | std::string word; 157 | for (int i = 0; i < n_vocab; i++) { 158 | uint32_t len; 159 | fin.read((char *) &len, sizeof(len)); 160 | 161 | word.resize(len); 162 | fin.read((char *) word.data(), len); 163 | 164 | vocab.token_to_id[word] = i; 165 | vocab.id_to_token[i] = word; 166 | 167 | //if (i < 30000) { 168 | // fprintf(stderr, "%s: vocab[%d] = '%s'\n", __func__, i, word.c_str()); 169 | //} 170 | } 171 | } 172 | 173 | // for the big tensors, we have the option to store the data in 16-bit floats or quantized 174 | // in order to save memory and also to speed up the computation 175 | ggml_type wtype = GGML_TYPE_COUNT; 176 | switch (model.hparams.f16) { 177 | case 0: wtype = GGML_TYPE_F32; break; 178 | case 1: wtype = GGML_TYPE_F16; break; 179 | case 2: wtype = GGML_TYPE_Q4_0; break; 180 | case 3: wtype = GGML_TYPE_Q4_1; break; 181 | default: 182 | { 183 | fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n", 184 | __func__, fname.c_str(), model.hparams.f16); 185 | return false; 186 | } 187 | } 188 | 189 | const ggml_type wtype2 = GGML_TYPE_F32; 190 | 191 | auto & ctx = model.ctx; 192 | 193 | size_t ctx_size = 0; 194 | 195 | { 196 | const auto & hparams = model.hparams; 197 | 198 | const int n_embd = hparams.n_embd; 199 | const int n_layer = hparams.n_layer; 200 | const int n_ctx = hparams.n_ctx; 201 | const int n_vocab = hparams.n_vocab; 202 | 203 | ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // tok_embeddings 204 | 205 | ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // norm 206 | 207 | ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // output 208 | 209 | ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // attention_norm 210 | 211 | ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wq 212 | ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wk 213 | ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wv 214 | ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wo 215 | 216 | ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ffn_norm 217 | 218 | ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w1 219 | ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2 220 | ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3 221 | 222 | ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k 223 | ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v 224 | 225 | ctx_size += (5 + 10*n_layer)*256; // object overhead 226 | 227 | fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); 228 | } 229 | 230 | // create the ggml context 231 | { 232 | struct ggml_init_params params = { 233 | /*.mem_size =*/ ctx_size, 234 | /*.mem_buffer =*/ NULL, 235 | }; 236 | 237 | model.ctx = ggml_init(params); 238 | if (!model.ctx) { 239 | fprintf(stderr, "%s: ggml_init() failed\n", __func__); 240 | return false; 241 | } 242 | } 243 | 244 | // prepare memory for the weights 245 | { 246 | const auto & hparams = model.hparams; 247 | 248 | const int n_embd = hparams.n_embd; 249 | const int n_layer = hparams.n_layer; 250 | const int n_ctx = hparams.n_ctx; 251 | const int n_vocab = hparams.n_vocab; 252 | 253 | model.layers.resize(n_layer); 254 | 255 | model.tok_embeddings = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); 256 | 257 | model.norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); 258 | model.output = 
ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); 259 | 260 | // map by name 261 | model.tensors["tok_embeddings.weight"] = model.tok_embeddings; 262 | 263 | model.tensors["norm.weight"] = model.norm; 264 | model.tensors["output.weight"] = model.output; 265 | 266 | for (int i = 0; i < n_layer; ++i) { 267 | auto & layer = model.layers[i]; 268 | 269 | layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); 270 | 271 | layer.wq = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); 272 | layer.wk = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); 273 | layer.wv = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); 274 | layer.wo = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); 275 | 276 | layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); 277 | 278 | layer.w1 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff); 279 | layer.w2 = ggml_new_tensor_2d(ctx, wtype, n_ff, n_embd); 280 | layer.w3 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff); 281 | 282 | // map by name 283 | model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = layer.attention_norm; 284 | 285 | model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = layer.wq; 286 | model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = layer.wk; 287 | model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = layer.wv; 288 | model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = layer.wo; 289 | 290 | model.tensors["layers." + std::to_string(i) + ".ffn_norm.weight"] = layer.ffn_norm; 291 | 292 | model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = layer.w1; 293 | model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = layer.w2; 294 | model.tensors["layers." + std::to_string(i) + ".feed_forward.w3.weight"] = layer.w3; 295 | } 296 | } 297 | 298 | // key + value memory 299 | { 300 | const auto & hparams = model.hparams; 301 | 302 | const int n_embd = hparams.n_embd; 303 | const int n_layer = hparams.n_layer; 304 | const int n_ctx = hparams.n_ctx; 305 | 306 | const int n_mem = n_layer*n_ctx; 307 | const int n_elements = n_embd*n_mem; 308 | 309 | model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); 310 | model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); 311 | 312 | const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); 313 | 314 | fprintf(stderr, "%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); 315 | } 316 | 317 | const size_t file_offset = fin.tellg(); 318 | 319 | fin.close(); 320 | 321 | std::vector tmp; 322 | 323 | for (int i = 0; i < n_parts; ++i) { 324 | const int part_id = i; 325 | //const int part_id = n_parts - i - 1; 326 | 327 | std::string fname_part = fname; 328 | if (i > 0) { 329 | fname_part += "." 
+ std::to_string(i); 330 | } 331 | 332 | fprintf(stderr, "%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str()); 333 | 334 | fin = std::ifstream(fname_part, std::ios::binary); 335 | fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size()); 336 | fin.seekg(file_offset); 337 | 338 | // load weights 339 | { 340 | int n_tensors = 0; 341 | size_t total_size = 0; 342 | 343 | fprintf(stderr, "%s: ", __func__); 344 | 345 | while (true) { 346 | int32_t n_dims; 347 | int32_t length; 348 | int32_t ftype; 349 | 350 | fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); 351 | fin.read(reinterpret_cast(&length), sizeof(length)); 352 | fin.read(reinterpret_cast(&ftype), sizeof(ftype)); 353 | 354 | if (fin.eof()) { 355 | break; 356 | } 357 | 358 | int32_t nelements = 1; 359 | int32_t ne[2] = { 1, 1 }; 360 | for (int i = 0; i < n_dims; ++i) { 361 | fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); 362 | nelements *= ne[i]; 363 | } 364 | 365 | std::string name(length, 0); 366 | fin.read(&name[0], length); 367 | 368 | if (model.tensors.find(name.data()) == model.tensors.end()) { 369 | fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data()); 370 | return false; 371 | } 372 | 373 | // split_type = 0: split by columns 374 | // split_type = 1: split by rows 375 | int split_type = 0; 376 | 377 | // split_type = 0: 378 | // regex: 379 | // - tok_embeddings.* 380 | // - layers.*.attention.wo.weight 381 | // - layers.*.feed_forward.w2.weight 382 | 383 | // split_type = 1: 384 | // regex: 385 | // - output.* 386 | // - layers.*.attention.wq.weight 387 | // - layers.*.attention.wk.weight 388 | // - layers.*.attention.wv.weight 389 | // - layers.*.feed_forward.w1.weight 390 | // - layers.*.feed_forward.w3.weight 391 | if (name.find("tok_embeddings") != std::string::npos) { 392 | split_type = 0; 393 | } else if (name.find("layers") != std::string::npos) { 394 | if (name.find("attention.wo.weight") != std::string::npos) { 395 | split_type = 0; 396 | } else if (name.find("feed_forward.w2.weight") != std::string::npos) { 397 | split_type = 0; 398 | } else { 399 | split_type = 1; 400 | } 401 | } else if (name.find("output") != std::string::npos) { 402 | split_type = 1; 403 | } 404 | 405 | auto tensor = model.tensors[name.data()]; 406 | 407 | if (n_dims == 1) { 408 | if (ggml_nelements(tensor) != nelements) { 409 | fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); 410 | return false; 411 | } 412 | } else { 413 | if (ggml_nelements(tensor)/n_parts != nelements) { 414 | fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); 415 | return false; 416 | } 417 | } 418 | 419 | if (n_dims == 1) { 420 | if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { 421 | fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", 422 | __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]); 423 | return false; 424 | } 425 | } else { 426 | if (split_type == 0) { 427 | if (tensor->ne[0]/n_parts != ne[0] || tensor->ne[1] != ne[1]) { 428 | fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", 429 | __func__, name.data(), tensor->ne[0]/n_parts, tensor->ne[1], ne[0], ne[1]); 430 | return false; 431 | } 432 | } else { 433 | if (tensor->ne[0] != ne[0] || tensor->ne[1]/n_parts != ne[1]) { 434 | fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", 435 | __func__, name.data(), 
tensor->ne[0], tensor->ne[1]/n_parts, ne[0], ne[1]); 436 | return false; 437 | } 438 | } 439 | } 440 | 441 | if (0) { 442 | static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", }; 443 | fprintf(stderr, "%24s - [%5d, %5d], type = %6s, split = %d\n", name.data(), ne[0], ne[1], ftype_str[ftype], split_type); 444 | } 445 | 446 | size_t bpe = 0; 447 | 448 | switch (ftype) { 449 | case 0: bpe = ggml_type_size(GGML_TYPE_F32); break; 450 | case 1: bpe = ggml_type_size(GGML_TYPE_F16); break; 451 | case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break; 452 | case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break; 453 | default: 454 | { 455 | fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype); 456 | return false; 457 | } 458 | }; 459 | 460 | if (n_dims == 1 || n_parts == 1) { 461 | if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { 462 | fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", 463 | __func__, name.data(), ggml_nbytes(tensor), nelements*bpe); 464 | return false; 465 | } 466 | 467 | if (part_id == 0) { 468 | fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); 469 | } else { 470 | fin.seekg(ggml_nbytes(tensor), std::ios::cur); 471 | } 472 | 473 | total_size += ggml_nbytes(tensor); 474 | } else { 475 | if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)/n_parts) { 476 | fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", 477 | __func__, name.data(), ggml_nbytes(tensor)/n_parts, nelements*bpe); 478 | return false; 479 | } 480 | 481 | if (split_type == 0) { 482 | const int np0 = ne[0]; 483 | 484 | const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type); 485 | assert(row_size == tensor->nb[1]); 486 | 487 | for (int i1 = 0; i1 < ne[1]; ++i1) { 488 | const size_t offset_row = i1*row_size; 489 | const size_t offset = offset_row + ((part_id*np0)/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type); 490 | fin.read(reinterpret_cast(tensor->data) + offset, row_size/n_parts); 491 | } 492 | } else { 493 | const int np1 = ne[1]; 494 | 495 | const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type); 496 | 497 | for (int i1 = 0; i1 < ne[1]; ++i1) { 498 | const size_t offset_row = (i1 + part_id*np1)*row_size; 499 | fin.read(reinterpret_cast(tensor->data) + offset_row, row_size); 500 | } 501 | } 502 | 503 | total_size += ggml_nbytes(tensor)/n_parts; 504 | } 505 | 506 | //fprintf(stderr, "%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0); 507 | if (++n_tensors % 8 == 0) { 508 | fprintf(stderr, "."); 509 | fflush(stderr); 510 | } 511 | } 512 | 513 | fprintf(stderr, " done\n"); 514 | 515 | fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors); 516 | } 517 | 518 | fin.close(); 519 | } 520 | 521 | return true; 522 | } 523 | 524 | // evaluate the transformer 525 | // 526 | // - model: the model 527 | // - n_threads: number of threads to use 528 | // - n_past: the context size so far 529 | // - embd_inp: the embeddings of the tokens in the context 530 | // - embd_w: the predicted logits for the next token 531 | // 532 | // The GPT-J model requires about 16MB of memory per input token. 
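//
// A minimal usage sketch (illustrative only; it mirrors how main() drives this
// function further down in this file, and the local variable names are just
// for the example):
//
//   std::vector<gpt_vocab::id> embd = ::llama_tokenize(vocab, params.prompt, true);
//   std::vector<float> logits;
//   size_t mem_per_token = 0;
//
//   // warm-up call on a few dummy tokens to measure memory use per token
//   llama_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
//
//   // actual evaluation; afterwards logits holds n_vocab values for the last token
//   if (!llama_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
//       fprintf(stderr, "failed to predict\n");
//   }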
533 | // 534 | bool llama_eval( 535 | const llama_model & model, 536 | const int n_threads, 537 | const int n_past, 538 | const std::vector & embd_inp, 539 | std::vector & embd_w, 540 | size_t & mem_per_token) { 541 | const int N = embd_inp.size(); 542 | 543 | const auto & hparams = model.hparams; 544 | 545 | const int n_embd = hparams.n_embd; 546 | const int n_layer = hparams.n_layer; 547 | const int n_ctx = hparams.n_ctx; 548 | const int n_head = hparams.n_head; 549 | const int n_vocab = hparams.n_vocab; 550 | const int n_rot = hparams.n_embd/hparams.n_head; 551 | 552 | const int d_key = n_embd/n_head; 553 | 554 | // TODO: check if this size scales with n_ctx linearly and remove constant. somehow I feel it wasn't the case 555 | // static size_t buf_size = hparams.n_ctx*1024*1024; 556 | static size_t buf_size = 512u*1024*1024; 557 | static void * buf = malloc(buf_size); 558 | 559 | if (mem_per_token > 0 && mem_per_token*N > buf_size) { 560 | const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead 561 | //fprintf(stderr, "\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); 562 | 563 | // reallocate 564 | buf_size = buf_size_new; 565 | buf = realloc(buf, buf_size); 566 | if (buf == nullptr) { 567 | fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); 568 | return false; 569 | } 570 | } 571 | 572 | struct ggml_init_params params = { 573 | /*.mem_size =*/ buf_size, 574 | /*.mem_buffer =*/ buf, 575 | }; 576 | 577 | struct ggml_context * ctx0 = ggml_init(params); 578 | ggml_cgraph gf = {}; 579 | gf.n_threads = n_threads; 580 | 581 | struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); 582 | memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); 583 | 584 | struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd); 585 | 586 | for (int il = 0; il < n_layer; ++il) { 587 | struct ggml_tensor * inpSA = inpL; 588 | 589 | struct ggml_tensor * cur; 590 | 591 | // norm 592 | { 593 | cur = ggml_rms_norm(ctx0, inpL); 594 | 595 | // cur = attention_norm*cur 596 | cur = ggml_mul(ctx0, 597 | ggml_repeat(ctx0, model.layers[il].attention_norm, cur), 598 | cur); 599 | } 600 | 601 | // self-attention 602 | { 603 | struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); 604 | struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); 605 | struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); 606 | 607 | // store key and value to memory 608 | if (N >= 1) { 609 | struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); 610 | struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); 611 | 612 | ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); 613 | ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); 614 | } 615 | 616 | // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) 617 | struct ggml_tensor * Q = 618 | ggml_permute(ctx0, 619 | ggml_rope(ctx0, 620 | ggml_cpy(ctx0, 621 | Qcur, 622 | ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), 623 | n_past, n_rot, 0), 624 | 0, 2, 1, 3); 625 | 626 | // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) 627 | struct ggml_tensor * K = 628 | ggml_permute(ctx0, 629 | ggml_rope(ctx0, 630 | ggml_reshape_3d(ctx0, 631 | ggml_view_1d(ctx0, model.memory_k, (n_past + 
N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), 632 | n_embd/n_head, n_head, n_past + N), 633 | n_past, n_rot, 1), 634 | 0, 2, 1, 3); 635 | 636 | // K * Q 637 | struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); 638 | 639 | // KQ_scaled = KQ / sqrt(n_embd/n_head) 640 | struct ggml_tensor * KQ_scaled = 641 | ggml_scale(ctx0, 642 | KQ, 643 | ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) 644 | ); 645 | 646 | // KQ_masked = mask_past(KQ_scaled) 647 | struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); 648 | 649 | // KQ = soft_max(KQ_masked) 650 | struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); 651 | 652 | // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() 653 | struct ggml_tensor * V_trans = 654 | ggml_permute(ctx0, 655 | ggml_reshape_3d(ctx0, 656 | ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), 657 | n_embd/n_head, n_head, n_past + N), 658 | 1, 2, 0, 3); 659 | 660 | // KQV = transpose(V) * KQ_soft_max 661 | struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); 662 | 663 | // KQV_merged = KQV.permute(0, 2, 1, 3) 664 | struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); 665 | 666 | // cur = KQV_merged.contiguous().view(n_embd, N) 667 | cur = ggml_cpy(ctx0, 668 | KQV_merged, 669 | ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); 670 | 671 | // projection (no bias) 672 | cur = ggml_mul_mat(ctx0, 673 | model.layers[il].wo, 674 | cur); 675 | } 676 | 677 | struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); 678 | 679 | // feed-forward network 680 | { 681 | // norm 682 | { 683 | cur = ggml_rms_norm(ctx0, inpFF); 684 | 685 | // cur = ffn_norm*cur 686 | cur = ggml_mul(ctx0, 687 | ggml_repeat(ctx0, model.layers[il].ffn_norm, cur), 688 | cur); 689 | } 690 | 691 | struct ggml_tensor * tmp = ggml_mul_mat(ctx0, 692 | model.layers[il].w3, 693 | cur); 694 | 695 | 696 | cur = ggml_mul_mat(ctx0, 697 | model.layers[il].w1, 698 | cur); 699 | 700 | // SILU activation 701 | cur = ggml_silu(ctx0, cur); 702 | 703 | cur = ggml_mul(ctx0, cur, tmp); 704 | 705 | cur = ggml_mul_mat(ctx0, 706 | model.layers[il].w2, 707 | cur); 708 | } 709 | 710 | cur = ggml_add(ctx0, cur, inpFF); 711 | 712 | // input for next layer 713 | inpL = cur; 714 | } 715 | 716 | // norm 717 | { 718 | inpL = ggml_rms_norm(ctx0, inpL); 719 | 720 | // inpL = norm*inpL 721 | inpL = ggml_mul(ctx0, 722 | ggml_repeat(ctx0, model.norm, inpL), 723 | inpL); 724 | } 725 | 726 | // lm_head 727 | { 728 | inpL = ggml_mul_mat(ctx0, model.output, inpL); 729 | } 730 | 731 | // logits -> probs 732 | //inpL = ggml_soft_max(ctx0, inpL); 733 | 734 | // run the computation 735 | ggml_build_forward_expand(&gf, inpL); 736 | ggml_graph_compute (ctx0, &gf); 737 | 738 | //if (n_past%100 == 0) { 739 | // ggml_graph_print (&gf); 740 | // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); 741 | //} 742 | 743 | //embd_w.resize(n_vocab*N); 744 | //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); 745 | 746 | // return result for just the last token 747 | embd_w.resize(n_vocab); 748 | memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); 749 | 750 | if (mem_per_token == 0) { 751 | mem_per_token = ggml_used_mem(ctx0)/N; 752 | } 753 | //fprintf(stderr, "used_mem = %zu\n", ggml_used_mem(ctx0)); 754 | 755 | ggml_free(ctx0); 756 | 757 | return true; 758 | } 759 | 760 | static bool is_interacting = false; 761 | 762 | #if 
defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) 763 | void sigint_handler(int signo) { 764 | printf(ANSI_COLOR_RESET); 765 | if (signo == SIGINT) { 766 | if (!is_interacting) { 767 | is_interacting=true; 768 | } else { 769 | _exit(130); 770 | } 771 | } 772 | } 773 | #endif 774 | 775 | const char * llama_print_system_info(void) { 776 | static std::string s; 777 | 778 | s = ""; 779 | s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | "; 780 | s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | "; 781 | s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | "; 782 | s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | "; 783 | s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | "; 784 | s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | "; 785 | s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | "; 786 | s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | "; 787 | s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | "; 788 | s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | "; 789 | s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | "; 790 | s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; 791 | 792 | return s.c_str(); 793 | } 794 | 795 | int main(int argc, char ** argv) { 796 | ggml_time_init(); 797 | const int64_t t_main_start_us = ggml_time_us(); 798 | 799 | gpt_params params; 800 | 801 | params.temp = 0.1f; 802 | params.top_p = 0.95f; 803 | params.n_ctx = 2048; 804 | params.model = "ggml-alpaca-7b-q4.bin"; 805 | 806 | if (gpt_params_parse(argc, argv, params) == false) { 807 | return 1; 808 | } 809 | 810 | if (params.seed < 0) { 811 | params.seed = time(NULL); 812 | } 813 | 814 | fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); 815 | 816 | std::mt19937 rng(params.seed); 817 | // if (params.prompt.empty()) { 818 | // params.prompt = gpt_random_prompt(rng); 819 | // } 820 | 821 | // params.prompt = R"(// this function checks if the number n is prime 822 | //bool is_prime(int n) {)"; 823 | 824 | int64_t t_load_us = 0; 825 | 826 | gpt_vocab vocab; 827 | llama_model model; 828 | 829 | // load the model 830 | { 831 | const int64_t t_start_us = ggml_time_us(); 832 | if (!llama_model_load(params.model, model, vocab, params.n_ctx)) { 833 | fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); 834 | return 1; 835 | } 836 | 837 | t_load_us = ggml_time_us() - t_start_us; 838 | } 839 | 840 | // print system information 841 | { 842 | fprintf(stderr, "\n"); 843 | fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", 844 | params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); 845 | } 846 | 847 | int n_past = 0; 848 | 849 | int64_t t_sample_us = 0; 850 | int64_t t_predict_us = 0; 851 | 852 | std::vector logits; 853 | 854 | // Add a space in front of the first character to match OG llama tokenizer behavior 855 | // params.prompt.insert(0, 1, ' '); 856 | // tokenize the prompt 857 | std::vector embd_inp = ::llama_tokenize(vocab, params.prompt, true); 858 | 859 | params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); 860 | 861 | 862 | // tokenize the reverse prompt 863 | std::vector antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false); 864 | 865 | fprintf(stderr, "\n"); 866 | fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); 867 | fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, 
embd_inp.size()); 868 | for (int i = 0; i < (int) embd_inp.size(); i++) { 869 | fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str()); 870 | } 871 | fprintf(stderr, "\n"); 872 | 873 | if (params.interactive) { 874 | #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) 875 | struct sigaction sigint_action; 876 | sigint_action.sa_handler = sigint_handler; 877 | sigemptyset (&sigint_action.sa_mask); 878 | sigint_action.sa_flags = 0; 879 | sigaction(SIGINT, &sigint_action, NULL); 880 | #elif defined (_WIN32) 881 | signal(SIGINT, sigint_handler); 882 | 883 | // Windows console ANSI color fix 884 | DWORD mode = 0; 885 | HANDLE hConsole = GetStdHandle(STD_OUTPUT_HANDLE); 886 | if (hConsole && hConsole != INVALID_HANDLE_VALUE && GetConsoleMode(hConsole, &mode)) 887 | SetConsoleMode(hConsole, mode | ENABLE_VIRTUAL_TERMINAL_PROCESSING); 888 | #endif 889 | 890 | fprintf(stderr, "%s: interactive mode on.\n", __func__); 891 | 892 | if(antiprompt_inp.size()) { 893 | fprintf(stderr, "%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str()); 894 | fprintf(stderr, "%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size()); 895 | for (int i = 0; i < (int) antiprompt_inp.size(); i++) { 896 | fprintf(stderr, "%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str()); 897 | } 898 | fprintf(stderr, "\n"); 899 | } 900 | } 901 | fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty); 902 | fprintf(stderr, "\n\n"); 903 | 904 | std::vector embd; 905 | 906 | // determine the required inference memory per token: 907 | size_t mem_per_token = 0; 908 | llama_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); 909 | 910 | int last_n_size = params.repeat_last_n; 911 | std::vector last_n_tokens(last_n_size); 912 | std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); 913 | 914 | 915 | if (params.interactive) { 916 | fprintf(stderr, "== Running in chat mode. 
==\n" 917 | #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) 918 | " - Press Ctrl+C to interject at any time.\n" 919 | #endif 920 | " - Press Return to return control to LLaMA.\n" 921 | " - If you want to submit another line, end your input in '\\'.\n"); 922 | } 923 | 924 | // we may want to slide the input window along with the context, but for now we restrict to the context length 925 | int remaining_tokens = model.hparams.n_ctx - embd_inp.size(); 926 | int input_consumed = 0; 927 | bool input_noecho = false; 928 | 929 | // prompt user immediately after the starting prompt has been loaded 930 | if (params.interactive_start) { 931 | is_interacting = true; 932 | } 933 | 934 | // set the color for the prompt which will be output initially 935 | if (params.use_color) { 936 | printf(ANSI_COLOR_YELLOW); 937 | } 938 | 939 | 940 | 941 | while (remaining_tokens > 0) { 942 | // predict 943 | if (embd.size() > 0) { 944 | const int64_t t_start_us = ggml_time_us(); 945 | 946 | if (!llama_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) { 947 | fprintf(stderr, "Failed to predict\n"); 948 | return 1; 949 | } 950 | 951 | t_predict_us += ggml_time_us() - t_start_us; 952 | } 953 | 954 | n_past += embd.size(); 955 | embd.clear(); 956 | 957 | if (embd_inp.size() <= input_consumed) { 958 | // out of user input, sample next token 959 | const float top_k = params.top_k; 960 | const float top_p = params.top_p; 961 | const float temp = params.temp; 962 | const float repeat_penalty = params.repeat_penalty; 963 | 964 | const int n_vocab = model.hparams.n_vocab; 965 | 966 | gpt_vocab::id id = 0; 967 | 968 | { 969 | const int64_t t_start_sample_us = ggml_time_us(); 970 | 971 | id = llama_sample_top_p_top_k(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens, repeat_penalty, top_k, top_p, temp, rng); 972 | 973 | last_n_tokens.erase(last_n_tokens.begin()); 974 | last_n_tokens.push_back(id); 975 | 976 | t_sample_us += ggml_time_us() - t_start_sample_us; 977 | } 978 | 979 | // add it to the context 980 | embd.push_back(id); 981 | 982 | // echo this to console 983 | input_noecho = false; 984 | 985 | // decrement remaining sampling budget 986 | --remaining_tokens; 987 | } else { 988 | // some user input remains from prompt or interaction, forward it to processing 989 | while (embd_inp.size() > input_consumed) { 990 | // fprintf(stderr, "%6d -> '%s'\n", embd_inp[input_consumed], vocab.id_to_token.at(embd_inp[input_consumed]).c_str()); 991 | 992 | embd.push_back(embd_inp[input_consumed]); 993 | last_n_tokens.erase(last_n_tokens.begin()); 994 | last_n_tokens.push_back(embd_inp[input_consumed]); 995 | ++input_consumed; 996 | if (embd.size() > params.n_batch) { 997 | break; 998 | } 999 | } 1000 | 1001 | // reset color to default if we there is no pending user input 1002 | if (!input_noecho && params.use_color && embd_inp.size() == input_consumed) { 1003 | printf(ANSI_COLOR_RESET); 1004 | } 1005 | } 1006 | 1007 | // display text 1008 | if (!input_noecho) { 1009 | for (auto id : embd) { 1010 | printf("%s", vocab.id_to_token[id].c_str()); 1011 | } 1012 | fflush(stdout); 1013 | } 1014 | 1015 | // in interactive mode, and not currently processing queued inputs; 1016 | // check if we should prompt the user for more 1017 | if (params.interactive && embd_inp.size() <= input_consumed) { 1018 | // check for reverse prompt 1019 | if (antiprompt_inp.size() && std::equal(antiprompt_inp.rbegin(), antiprompt_inp.rend(), last_n_tokens.rbegin())) { 1020 | // reverse prompt 
found 1021 | is_interacting = true; 1022 | } 1023 | if (is_interacting) { 1024 | // currently being interactive 1025 | bool another_line=true; 1026 | while (another_line) { 1027 | fflush(stdout); 1028 | char buf[256] = {0}; 1029 | int n_read; 1030 | if(params.use_color) printf(ANSI_BOLD ANSI_COLOR_GREEN); 1031 | if (scanf("%255[^\n]%n%*c", buf, &n_read) <= 0) { 1032 | // presumable empty line, consume the newline 1033 | if (scanf("%*c") <= 0) { /*ignore*/ } 1034 | n_read=0; 1035 | } 1036 | if(params.use_color) printf(ANSI_COLOR_RESET); 1037 | 1038 | if (n_read > 0 && buf[n_read-1]=='\\') { 1039 | another_line = true; 1040 | buf[n_read-1] = '\n'; 1041 | buf[n_read] = 0; 1042 | } else { 1043 | another_line = false; 1044 | buf[n_read] = '\n'; 1045 | buf[n_read+1] = 0; 1046 | } 1047 | 1048 | std::vector line_inp = ::llama_tokenize(vocab, buf, false); 1049 | embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); 1050 | remaining_tokens -= line_inp.size(); 1051 | 1052 | input_noecho = true; // do not echo this again 1053 | } 1054 | 1055 | is_interacting = false; 1056 | } 1057 | } 1058 | 1059 | // end of text token 1060 | if (embd.back() == 2) { 1061 | fprintf(stderr, " [end of text]\n"); 1062 | break; 1063 | } 1064 | } 1065 | 1066 | #if defined (_WIN32) 1067 | signal(SIGINT, SIG_DFL); 1068 | #endif 1069 | 1070 | // report timing 1071 | { 1072 | const int64_t t_main_end_us = ggml_time_us(); 1073 | 1074 | fprintf(stderr, "\n\n"); 1075 | fprintf(stderr, "%s: mem per token = %8zu bytes\n", __func__, mem_per_token); 1076 | fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); 1077 | fprintf(stderr, "%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); 1078 | fprintf(stderr, "%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past); 1079 | fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); 1080 | } 1081 | 1082 | ggml_free(model.ctx); 1083 | 1084 | if (params.use_color) { 1085 | printf(ANSI_COLOR_RESET); 1086 | } 1087 | 1088 | return 0; 1089 | } 1090 | -------------------------------------------------------------------------------- /quantize.cpp: -------------------------------------------------------------------------------- 1 | #include "ggml.h" 2 | 3 | #include "utils.h" 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | // TODO: move somewhere else 16 | #define QK 32 17 | 18 | // default hparams (LLaMA76B) 19 | struct llama_hparams { 20 | int32_t n_vocab = 32000; 21 | int32_t n_ctx = 512; // this is provided as user input? 
22 | int32_t n_embd = 4096; 23 | int32_t n_mult = 256; 24 | int32_t n_head = 32; 25 | int32_t n_layer = 32; 26 | int32_t n_rot = 64; 27 | int32_t f16 = 1; 28 | }; 29 | 30 | 31 | // quantize a model 32 | bool llama_model_quantize(const std::string & fname_inp, const std::string & fname_out, int itype) { 33 | ggml_type type = GGML_TYPE_Q4_1; 34 | 35 | switch (itype) { 36 | case 2: type = GGML_TYPE_Q4_0; break; 37 | case 3: type = GGML_TYPE_Q4_1; break; 38 | default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1; 39 | }; 40 | 41 | if (type != GGML_TYPE_Q4_0 && type != GGML_TYPE_Q4_1) { 42 | fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type); 43 | return false; 44 | } 45 | 46 | gpt_vocab vocab; 47 | 48 | printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str()); 49 | 50 | auto finp = std::ifstream(fname_inp, std::ios::binary); 51 | if (!finp) { 52 | fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str()); 53 | return false; 54 | } 55 | 56 | auto fout = std::ofstream(fname_out, std::ios::binary); 57 | if (!fout) { 58 | fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str()); 59 | return false; 60 | } 61 | 62 | // verify magic 63 | { 64 | uint32_t magic; 65 | finp.read((char *) &magic, sizeof(magic)); 66 | if (magic != 0x67676d6c) { 67 | fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str()); 68 | return false; 69 | } 70 | 71 | fout.write((char *) &magic, sizeof(magic)); 72 | } 73 | 74 | llama_hparams hparams; 75 | 76 | // load hparams 77 | { 78 | finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); 79 | //finp.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); 80 | finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); 81 | finp.read((char *) &hparams.n_mult, sizeof(hparams.n_mult)); 82 | finp.read((char *) &hparams.n_head, sizeof(hparams.n_head)); 83 | finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); 84 | finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot)); 85 | finp.read((char *) &hparams.f16, sizeof(hparams.f16)); 86 | 87 | printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); 88 | printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); 89 | printf("%s: n_embd = %d\n", __func__, hparams.n_embd); 90 | printf("%s: n_mult = %d\n", __func__, hparams.n_mult); 91 | printf("%s: n_head = %d\n", __func__, hparams.n_head); 92 | printf("%s: n_layer = %d\n", __func__, hparams.n_layer); 93 | printf("%s: f16 = %d\n", __func__, hparams.f16); 94 | 95 | fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); 96 | //fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); 97 | fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd)); 98 | fout.write((char *) &hparams.n_mult, sizeof(hparams.n_mult)); 99 | fout.write((char *) &hparams.n_head, sizeof(hparams.n_head)); 100 | fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer)); 101 | fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot)); 102 | fout.write((char *) &itype, sizeof(hparams.f16)); 103 | } 104 | 105 | // load vocab 106 | { 107 | const int32_t n_vocab = hparams.n_vocab; 108 | 109 | if (n_vocab != hparams.n_vocab) { 110 | fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", 111 | __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab); 112 | return false; 113 | } 114 | 115 | std::string word; 116 | for (int i = 0; i < n_vocab; i++) { 117 | uint32_t len; 118 | finp.read ((char *) &len, sizeof(len)); 119 | 
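// the 32-bit length that was just read is mirrored to the output file as-is,
// followed below by the raw token bytes; the token string is also recorded in
// the vocab maps (token_to_id / id_to_token)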
fout.write((char *) &len, sizeof(len)); 120 | 121 | word.resize(len); 122 | finp.read ((char *) word.data(), len); 123 | fout.write((char *) word.data(), len); 124 | 125 | vocab.token_to_id[word] = i; 126 | vocab.id_to_token[i] = word; 127 | } 128 | } 129 | 130 | // load weights 131 | { 132 | size_t total_size_org = 0; 133 | size_t total_size_new = 0; 134 | 135 | std::vector work; 136 | 137 | std::vector data_u8; 138 | std::vector data_f16; 139 | std::vector data_f32; 140 | 141 | std::vector hist_all(1 << 4, 0); 142 | 143 | while (true) { 144 | int32_t n_dims; 145 | int32_t length; 146 | int32_t ftype; 147 | 148 | finp.read(reinterpret_cast(&n_dims), sizeof(n_dims)); 149 | finp.read(reinterpret_cast(&length), sizeof(length)); 150 | finp.read(reinterpret_cast(&ftype), sizeof(ftype)); 151 | 152 | if (finp.eof()) { 153 | break; 154 | } 155 | 156 | int32_t nelements = 1; 157 | int32_t ne[2] = { 1, 1 }; 158 | for (int i = 0; i < n_dims; ++i) { 159 | finp.read (reinterpret_cast(&ne[i]), sizeof(ne[i])); 160 | nelements *= ne[i]; 161 | } 162 | 163 | std::string name(length, 0); 164 | finp.read (&name[0], length); 165 | 166 | { 167 | static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", }; 168 | printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]); 169 | } 170 | 171 | // regexes of tensor names to be quantized 172 | const std::vector k_names = { 173 | ".*weight", 174 | }; 175 | 176 | bool quantize = false; 177 | for (const auto & s : k_names) { 178 | if (std::regex_match(name, std::regex(s))) { 179 | quantize = true; 180 | break; 181 | } 182 | } 183 | 184 | // quantize only 2D tensors 185 | quantize &= (n_dims == 2); 186 | 187 | if (quantize) { 188 | if (ftype != 0 && ftype != 1) { 189 | fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype); 190 | return false; 191 | } 192 | 193 | if (ftype == 1) { 194 | data_f16.resize(nelements); 195 | finp.read(reinterpret_cast(data_f16.data()), nelements * sizeof(ggml_fp16_t)); 196 | data_f32.resize(nelements); 197 | for (int i = 0; i < nelements; ++i) { 198 | data_f32[i] = ggml_fp16_to_fp32(data_f16[i]); 199 | } 200 | } else { 201 | data_f32.resize(nelements); 202 | finp.read(reinterpret_cast(data_f32.data()), nelements * sizeof(float)); 203 | } 204 | 205 | ftype = itype; 206 | } else { 207 | const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t); 208 | 209 | data_u8.resize(nelements*bpe); 210 | finp.read(reinterpret_cast(data_u8.data()), nelements * bpe); 211 | } 212 | 213 | fout.write(reinterpret_cast(&n_dims), sizeof(n_dims)); 214 | fout.write(reinterpret_cast(&length), sizeof(length)); 215 | fout.write(reinterpret_cast(&ftype), sizeof(ftype)); 216 | for (int i = 0; i < n_dims; ++i) { 217 | fout.write(reinterpret_cast(&ne[i]), sizeof(ne[i])); 218 | } 219 | fout.write(&name[0], length); 220 | 221 | if (quantize) { 222 | printf("quantizing .. 
"); 223 | work.resize(nelements); // for quantization 224 | 225 | size_t cur_size = 0; 226 | std::vector hist_cur(1 << 4, 0); 227 | 228 | switch (type) { 229 | case GGML_TYPE_Q4_0: 230 | { 231 | cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], QK, hist_cur.data()); 232 | } break; 233 | case GGML_TYPE_Q4_1: 234 | { 235 | cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], QK, hist_cur.data()); 236 | } break; 237 | default: 238 | { 239 | fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type); 240 | return false; 241 | } 242 | } 243 | 244 | fout.write(reinterpret_cast(work.data()), cur_size); 245 | total_size_new += cur_size; 246 | 247 | printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0); 248 | for (int i = 0; i < hist_cur.size(); ++i) { 249 | hist_all[i] += hist_cur[i]; 250 | } 251 | 252 | for (int i = 0; i < hist_cur.size(); ++i) { 253 | printf("%5.3f ", hist_cur[i] / (float)nelements); 254 | } 255 | printf("\n"); 256 | } else { 257 | printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0); 258 | fout.write(reinterpret_cast(data_u8.data()), data_u8.size()); 259 | total_size_new += data_u8.size(); 260 | } 261 | 262 | total_size_org += nelements * sizeof(float); 263 | } 264 | 265 | printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); 266 | printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0); 267 | 268 | { 269 | int64_t sum_all = 0; 270 | for (int i = 0; i < hist_all.size(); ++i) { 271 | sum_all += hist_all[i]; 272 | } 273 | 274 | printf("%s: hist: ", __func__); 275 | for (int i = 0; i < hist_all.size(); ++i) { 276 | printf("%5.3f ", hist_all[i] / (float)sum_all); 277 | } 278 | printf("\n"); 279 | } 280 | } 281 | 282 | finp.close(); 283 | fout.close(); 284 | 285 | return true; 286 | } 287 | 288 | // usage: 289 | // ./llama-quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type 290 | // 291 | int main(int argc, char ** argv) { 292 | ggml_time_init(); 293 | if (argc != 4) { 294 | fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]); 295 | fprintf(stderr, " type = 2 - q4_0\n"); 296 | fprintf(stderr, " type = 3 - q4_1\n"); 297 | return 1; 298 | } 299 | 300 | // needed to initialize f16 tables 301 | { 302 | struct ggml_init_params params = { 0, NULL }; 303 | struct ggml_context * ctx = ggml_init(params); 304 | ggml_free(ctx); 305 | } 306 | 307 | const std::string fname_inp = argv[1]; 308 | const std::string fname_out = argv[2]; 309 | 310 | const int itype = atoi(argv[3]); 311 | 312 | const int64_t t_main_start_us = ggml_time_us(); 313 | 314 | int64_t t_quantize_us = 0; 315 | 316 | // load the model 317 | { 318 | const int64_t t_start_us = ggml_time_us(); 319 | 320 | if (!llama_model_quantize(fname_inp, fname_out, itype)) { 321 | fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str()); 322 | return 1; 323 | } 324 | 325 | t_quantize_us = ggml_time_us() - t_start_us; 326 | } 327 | 328 | // report timing 329 | { 330 | const int64_t t_main_end_us = ggml_time_us(); 331 | 332 | printf("\n"); 333 | printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f); 334 | printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); 335 | } 336 | 337 | return 0; 338 | } 339 | -------------------------------------------------------------------------------- /quantize.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if ! [[ "$1" =~ ^[0-9]{1,2}B$ ]]; then 4 | echo 5 | echo "Usage: quantize.sh 7B|13B|30B|65B [--remove-f16]" 6 | echo 7 | exit 1 8 | fi 9 | 10 | for i in `ls models/$1/ggml-model-f16.bin*`; do 11 | ./quantize "$i" "${i/f16/q4_0}" 2 12 | if [[ "$2" == "--remove-f16" ]]; then 13 | rm "$i" 14 | fi 15 | done 16 | -------------------------------------------------------------------------------- /screencast.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ItsPi3141/alpaca.cpp/779a873fb2ac2c40b4595c8ad4e93bf6ce133b14/screencast.gif -------------------------------------------------------------------------------- /utils.cpp: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #if defined(_MSC_VER) || defined(__MINGW32__) 13 | #include // using malloc.h with MSC/MINGW 14 | #elif !defined(__FreeBSD__) && !defined(__NetBSD__) 15 | #include 16 | #endif 17 | 18 | bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { 19 | for (int i = 1; i < argc; i++) { 20 | std::string arg = argv[i]; 21 | 22 | if (arg == "-s" || arg == "--seed") { 23 | params.seed = std::stoi(argv[++i]); 24 | } else if (arg == "-t" || arg == "--threads") { 25 | params.n_threads = std::stoi(argv[++i]); 26 | } else if (arg == "-p" || arg == "--prompt") { 27 | params.prompt = argv[++i]; 28 | } else if (arg == "-f" || arg == "--file") { 29 | 30 | std::ifstream file(argv[++i]); 31 | 32 | std::copy(std::istreambuf_iterator(file), 33 | std::istreambuf_iterator(), 34 | back_inserter(params.prompt)); 35 | 36 | } else if (arg == "-n" || arg == "--n_predict") { 37 | params.n_predict = std::stoi(argv[++i]); 38 | } else if (arg == "--top_k") { 39 | params.top_k = std::stoi(argv[++i]); 40 | } else if (arg == "-c" || arg == "--ctx_size") { 41 | params.n_ctx = std::stoi(argv[++i]); 42 | } else if (arg == "--top_p") { 43 | params.top_p = std::stof(argv[++i]); 44 | } else if (arg == "--temp") { 45 | params.temp = std::stof(argv[++i]); 46 | } else if (arg == "--repeat_last_n") { 47 | params.repeat_last_n = std::stoi(argv[++i]); 48 | } else if (arg == "--repeat_penalty") { 49 | params.repeat_penalty = std::stof(argv[++i]); 50 | } else if (arg == "-b" || arg == "--batch_size") { 51 | params.n_batch = std::stoi(argv[++i]); 52 | } else if (arg == "-m" || arg == "--model") { 53 | params.model = argv[++i]; 54 | } else if (arg == "-i" || arg == "--interactive") { 55 | params.interactive = true; 56 | } else if (arg == "--interactive-start") { 57 | params.interactive = true; 58 | params.interactive_start = true; 59 | } else if (arg == "--color") { 60 | params.use_color = true; 61 | } else if (arg == "-r" || arg == "--reverse-prompt") { 62 | params.antiprompt = argv[++i]; 63 | } else if (arg == "-h" || arg == "--help") { 64 | gpt_print_usage(argc, argv, params); 65 | exit(0); 66 | } else { 67 | fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); 68 | gpt_print_usage(argc, argv, params); 69 | exit(0); 70 | } 71 | } 72 | 73 | return true; 74 | } 75 | 76 | void gpt_print_usage(int argc, char ** argv, const gpt_params & params) { 77 | fprintf(stderr, "usage: %s [options]\n", argv[0]); 78 | fprintf(stderr, "\n"); 79 | fprintf(stderr, "options:\n"); 80 | fprintf(stderr, " -h, --help show this help 
message and exit\n"); 81 | fprintf(stderr, " -i, --interactive run in interactive mode\n"); 82 | fprintf(stderr, " --interactive-start run in interactive mode and poll user input at startup\n"); 83 | fprintf(stderr, " -r PROMPT, --reverse-prompt PROMPT\n"); 84 | fprintf(stderr, " in interactive mode, poll user input upon seeing PROMPT\n"); 85 | fprintf(stderr, " --color colorise output to distinguish prompt and user input from generations\n"); 86 | fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n"); 87 | fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); 88 | fprintf(stderr, " -p PROMPT, --prompt PROMPT\n"); 89 | fprintf(stderr, " prompt to start generation with (default: random)\n"); 90 | fprintf(stderr, " -f FNAME, --file FNAME\n"); 91 | fprintf(stderr, " prompt file to start generation.\n"); 92 | fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict); 93 | fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k); 94 | fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p); 95 | fprintf(stderr, " --repeat_last_n N last n tokens to consider for penalize (default: %d)\n", params.repeat_last_n); 96 | fprintf(stderr, " --repeat_penalty N penalize repeat sequence of tokens (default: %.1f)\n", params.repeat_penalty); 97 | fprintf(stderr, " -c N, --ctx_size N size of the prompt context (default: %d)\n", params.n_ctx); 98 | fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp); 99 | fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch); 100 | fprintf(stderr, " -m FNAME, --model FNAME\n"); 101 | fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); 102 | fprintf(stderr, "\n"); 103 | } 104 | 105 | std::string gpt_random_prompt(std::mt19937 & rng) { 106 | const int r = rng() % 10; 107 | switch (r) { 108 | case 0: return "So"; 109 | case 1: return "Once upon a time"; 110 | case 2: return "When"; 111 | case 3: return "The"; 112 | case 4: return "After"; 113 | case 5: return "If"; 114 | case 6: return "import"; 115 | case 7: return "He"; 116 | case 8: return "She"; 117 | case 9: return "They"; 118 | default: return "To"; 119 | } 120 | 121 | return "The"; 122 | } 123 | 124 | void replace(std::string & str, const std::string & needle, const std::string & replacement) { 125 | size_t pos = 0; 126 | while ((pos = str.find(needle, pos)) != std::string::npos) { 127 | str.replace(pos, needle.length(), replacement); 128 | pos += replacement.length(); 129 | } 130 | } 131 | 132 | std::map json_parse(const std::string & fname) { 133 | std::map result; 134 | 135 | // read file into string 136 | std::string json; 137 | { 138 | std::ifstream ifs(fname); 139 | if (!ifs) { 140 | fprintf(stderr, "Failed to open %s\n", fname.c_str()); 141 | exit(1); 142 | } 143 | 144 | json = std::string((std::istreambuf_iterator(ifs)), 145 | (std::istreambuf_iterator())); 146 | } 147 | 148 | if (json[0] != '{') { 149 | return result; 150 | } 151 | 152 | // parse json 153 | { 154 | bool has_key = false; 155 | bool in_token = false; 156 | 157 | std::string str_key = ""; 158 | std::string str_val = ""; 159 | 160 | int n = json.size(); 161 | for (int i = 1; i < n; ++i) { 162 | if (!in_token) { 163 | if (json[i] == ' ') continue; 164 | if (json[i] == '"') { 165 | in_token = true; 166 | continue; 167 | } 168 | } else { 169 | if (json[i] == '\\' && i+1 < n) { 170 | if (has_key == false) { 171 
| str_key += json[i]; 172 | } else { 173 | str_val += json[i]; 174 | } 175 | ++i; 176 | } else if (json[i] == '"') { 177 | if (has_key == false) { 178 | has_key = true; 179 | ++i; 180 | while (json[i] == ' ') ++i; 181 | ++i; // : 182 | while (json[i] == ' ') ++i; 183 | if (json[i] != '\"') { 184 | while (json[i] != ',' && json[i] != '}') { 185 | str_val += json[i++]; 186 | } 187 | has_key = false; 188 | } else { 189 | in_token = true; 190 | continue; 191 | } 192 | } else { 193 | has_key = false; 194 | } 195 | 196 | ::replace(str_key, "\\u0120", " " ); // \u0120 -> space 197 | ::replace(str_key, "\\u010a", "\n"); // \u010a -> new line 198 | ::replace(str_key, "\\\"", "\""); // \\\" -> " 199 | 200 | try { 201 | result[str_key] = std::stoi(str_val); 202 | } catch (...) { 203 | //fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str()); 204 | 205 | } 206 | str_key = ""; 207 | str_val = ""; 208 | in_token = false; 209 | continue; 210 | } 211 | if (has_key == false) { 212 | str_key += json[i]; 213 | } else { 214 | str_val += json[i]; 215 | } 216 | } 217 | } 218 | } 219 | 220 | return result; 221 | } 222 | 223 | std::vector gpt_tokenize(const gpt_vocab & vocab, const std::string & text) { 224 | std::vector words; 225 | 226 | // first split the text into words 227 | { 228 | std::string str = text; 229 | std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"; 230 | 231 | std::regex re(pat); 232 | std::smatch m; 233 | 234 | while (std::regex_search(str, m, re)) { 235 | for (auto x : m) { 236 | words.push_back(x); 237 | } 238 | str = m.suffix(); 239 | } 240 | } 241 | 242 | // find the longest tokens that form the words: 243 | std::vector tokens; 244 | for (const auto & word : words) { 245 | if (word.size() == 0) continue; 246 | 247 | int i = 0; 248 | int n = word.size(); 249 | while (i < n) { 250 | int j = n; 251 | while (j > i) { 252 | auto it = vocab.token_to_id.find(word.substr(i, j-i)); 253 | if (it != vocab.token_to_id.end()) { 254 | tokens.push_back(it->second); 255 | i = j; 256 | break; 257 | } 258 | --j; 259 | } 260 | if (i == n) { 261 | break; 262 | } 263 | if (j == i) { 264 | auto sub = word.substr(i, 1); 265 | if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) { 266 | tokens.push_back(vocab.token_to_id.at(sub)); 267 | } else { 268 | fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data()); 269 | } 270 | ++i; 271 | } 272 | } 273 | } 274 | 275 | return tokens; 276 | } 277 | 278 | // TODO: Calculate this constant from the vocabulary 279 | #define MAX_TOKEN_LEN 18 280 | // SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece 281 | std::vector llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) { 282 | std::vector res; 283 | std::vector score; 284 | std::vector prev; 285 | int len = text.length(); 286 | 287 | score.resize(len + 1); 288 | prev.resize(len + 1); 289 | 290 | // Forward pass 291 | for (int i = 0; i < len; i++) { 292 | int max_len = std::min(len - i, MAX_TOKEN_LEN); 293 | for (int sub_len = 1; sub_len <= len - i; sub_len++) { 294 | auto sub = text.substr(i, sub_len); 295 | auto token = vocab.token_to_id.find(sub); 296 | if (token != vocab.token_to_id.end()) { 297 | int token_score = sub.length() * sub.length(); 298 | int local_score = score[i] + token_score; 299 | int next = i + sub_len; 300 | if (score[next] < local_score) { 301 | score[next] = local_score; 302 | prev[next] = 
(*token).second; 303 | } 304 | } 305 | } 306 | } 307 | 308 | // Backward pass 309 | int i = len; 310 | while (i > 0) { 311 | gpt_vocab::id token_id = prev[i]; 312 | if (token_id == 0) { 313 | // TODO: Return error or something more meaningful 314 | printf("failed to tokenize string!\n"); 315 | break; 316 | } 317 | res.push_back(token_id); 318 | auto token = (*vocab.id_to_token.find(token_id)).second; 319 | i -= token.length(); 320 | } 321 | 322 | if (bos) { 323 | res.push_back(1); // TODO: replace with vocab.bos 324 | } 325 | 326 | // Pieces are in reverse order so correct that 327 | std::reverse(res.begin(), res.end()); 328 | 329 | return res; 330 | } 331 | 332 | bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) { 333 | printf("%s: loading vocab from '%s'\n", __func__, fname.c_str()); 334 | 335 | vocab.token_to_id = ::json_parse(fname); 336 | 337 | for (const auto & kv : vocab.token_to_id) { 338 | vocab.id_to_token[kv.second] = kv.first; 339 | } 340 | 341 | printf("%s: vocab size = %d\n", __func__, (int) vocab.token_to_id.size()); 342 | 343 | // print the vocabulary 344 | //for (auto kv : vocab.token_to_id) { 345 | // printf("'%s' -> %d\n", kv.first.data(), kv.second); 346 | //} 347 | 348 | return true; 349 | } 350 | 351 | 352 | void sample_top_k(std::vector> & logits_id, int top_k) { 353 | // find the top K tokens 354 | std::partial_sort( 355 | logits_id.begin(), 356 | logits_id.begin() + top_k, logits_id.end(), 357 | [](const std::pair & a, const std::pair & b) { 358 | return a.first > b.first; 359 | }); 360 | 361 | logits_id.resize(top_k); 362 | } 363 | 364 | gpt_vocab::id llama_sample_top_p_top_k( 365 | const gpt_vocab & vocab, 366 | const float * logits, 367 | std::vector & last_n_tokens, 368 | double repeat_penalty, 369 | int top_k, 370 | double top_p, 371 | double temp, 372 | std::mt19937 & rng) { 373 | int n_logits = vocab.id_to_token.size(); 374 | 375 | std::vector> logits_id; 376 | logits_id.reserve(n_logits); 377 | 378 | { 379 | const double scale = 1.0/temp; 380 | for (int i = 0; i < n_logits; ++i) { 381 | // repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858) 382 | // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main 383 | if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) { 384 | // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability 385 | if (logits[i] < 0.0) { 386 | logits_id.push_back(std::make_pair(logits[i]*scale*repeat_penalty, i)); 387 | } else { 388 | logits_id.push_back(std::make_pair(logits[i]*scale/repeat_penalty, i)); 389 | } 390 | } else { 391 | logits_id.push_back(std::make_pair(logits[i]*scale, i)); 392 | } 393 | } 394 | } 395 | 396 | sample_top_k(logits_id, top_k); 397 | 398 | double maxl = -INFINITY; 399 | for (const auto & kv : logits_id) { 400 | maxl = std::max(maxl, kv.first); 401 | } 402 | 403 | // compute probs for the top K tokens 404 | std::vector probs; 405 | probs.reserve(logits_id.size()); 406 | 407 | double sum = 0.0; 408 | for (const auto & kv : logits_id) { 409 | double p = exp(kv.first - maxl); 410 | probs.push_back(p); 411 | sum += p; 412 | } 413 | 414 | // normalize the probs 415 | for (auto & p : probs) { 416 | p /= sum; 417 | } 418 | 419 | if (top_p < 1.0f) { 420 | double cumsum = 0.0f; 421 | for (int i = 0; i < (int) probs.size(); i++) { 422 | cumsum += probs[i]; 423 | if (cumsum >= top_p) { 424 | probs.resize(i + 1); 425 | logits_id.resize(i + 1); 426 | break; 427 | } 428 | } 429 | 430 | cumsum = 
431 |         for (int i = 0; i < (int) probs.size(); i++) {
432 |             probs[i] *= cumsum;
433 |         }
434 |     }
435 | 
436 |     //printf("\n");
437 |     //for (int i = 0; i < (int) 10; i++) {
438 |     //    printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
439 |     //}
440 |     //printf("\n\n");
441 |     //exit(0);
442 | 
443 |     std::discrete_distribution<> dist(probs.begin(), probs.end());
444 |     int idx = dist(rng);
445 | 
446 |     return logits_id[idx].second;
447 | }
448 | 
449 | 
450 | size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
451 |     const int nb = k / qk;
452 |     const size_t bs = (sizeof(float) + sizeof(uint8_t)*qk/2);
453 |     const size_t row_size = nb*bs;
454 | 
455 |     assert(k % qk == 0);
456 | 
457 |     const size_t pp_size = qk / 2;
458 |     uint8_t *pp = static_cast<uint8_t *>(alloca(pp_size));
459 | 
460 |     char * pdst = (char *) dst;
461 | 
462 |     for (int j = 0; j < n; j += k) {
463 |         uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs);
464 |         uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float));
465 | 
466 |         for (int i = 0; i < nb; i++) {
467 |             float amax = 0.0f; // absolute max
468 | 
469 |             {
470 |                 for (int l = 0; l < qk; l++) {
471 |                     const float v = src[j + i*qk + l];
472 |                     amax = std::max(amax, fabsf(v));
473 |                 }
474 | 
475 |                 const float d = amax / ((1 << 3) - 1);
476 |                 const float id = d ? 1.0f/d : 0.0f;
477 | 
478 |                 *(float *) pd = d;
479 |                 pd += bs;
480 | 
481 |                 for (int l = 0; l < qk; l += 2) {
482 |                     const float v0 = (src[j + i*qk + l + 0])*id;
483 |                     const float v1 = (src[j + i*qk + l + 1])*id;
484 | 
485 |                     const uint8_t vi0 = ((int8_t) (round(v0))) + 8;
486 |                     const uint8_t vi1 = ((int8_t) (round(v1))) + 8;
487 | 
488 |                     assert(vi0 >= 0 && vi0 < 16);
489 |                     assert(vi1 >= 0 && vi1 < 16);
490 | 
491 |                     hist[vi0]++;
492 |                     hist[vi1]++;
493 | 
494 |                     pp[l/2] = vi0 | (vi1 << 4);
495 |                 }
496 | 
497 |                 memcpy(pb, pp, pp_size);
498 |                 pb += bs;
499 |             }
500 |         }
501 |     }
502 | 
503 |     return (n/k)*row_size;
504 | }
505 | 
506 | size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
507 |     const int nb = k / qk;
508 |     const size_t row_size = nb*(2*sizeof(float) + sizeof(uint8_t)*qk/2);
509 | 
510 |     assert(k % qk == 0);
511 | 
512 |     const size_t pp_size = qk / 2;
513 |     uint8_t *pp = static_cast<uint8_t *>(alloca(pp_size));
514 | 
515 |     char * pdst = (char *) dst;
516 | 
517 |     for (int j = 0; j < n; j += k) {
518 |         float * pm = (float *) (pdst + (j/k)*row_size);
519 |         float * pd = (float *) (pm + nb);
520 |         uint8_t * pb = (uint8_t *) (pd + nb);
521 | 
522 |         //printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb);
523 | 
524 |         for (int i = 0; i < nb; i++) {
525 |             float min = std::numeric_limits<float>::max();
526 |             float max = std::numeric_limits<float>::min();
527 | 
528 |             {
529 |                 for (int l = 0; l < qk; l++) {
530 |                     const float v = src[j + i*qk + l];
531 |                     if (v < min) min = v;
532 |                     if (v > max) max = v;
533 |                 }
534 | 
535 |                 const float d = (max - min) / ((1 << 4) - 1);
536 |                 const float id = d ? 1.0f/d : 0.0f;
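                // Q4_1 block encoding: the block minimum goes into pm[i] and the scale d
                // into pd[i]; each value is then stored below as the 4-bit index
                // round((v - min)/d) in [0, 15], packed two per byte into pb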
537 | 
538 |                 pm[i] = min;
539 |                 pd[i] = d;
540 | 
541 |                 for (int l = 0; l < qk; l += 2) {
542 |                     const float v0 = (src[j + i*qk + l + 0] - min)*id;
543 |                     const float v1 = (src[j + i*qk + l + 1] - min)*id;
544 | 
545 |                     const uint8_t vi0 = round(v0);
546 |                     const uint8_t vi1 = round(v1);
547 | 
548 |                     assert(vi0 >= 0 && vi0 < 16);
549 |                     assert(vi1 >= 0 && vi1 < 16);
550 | 
551 |                     hist[vi0]++;
552 |                     hist[vi1]++;
553 | 
554 |                     pp[l/2] = vi0 | (vi1 << 4);
555 |                 }
556 | 
557 |                 memcpy(pb + i*qk/2, pp, pp_size);
558 |             }
559 |         }
560 |     }
561 | 
562 |     return (n/k)*row_size;
563 | }
564 | 
--------------------------------------------------------------------------------
/utils.h:
--------------------------------------------------------------------------------
1 | // Various helper functions and utilities
2 | 
3 | #pragma once
4 | 
5 | #include <string>
6 | #include <map>
7 | #include <vector>
8 | #include <random>
9 | #include <thread>
10 | 
11 | //
12 | // CLI argument parsing
13 | //
14 | 
15 | struct gpt_params {
16 |     int32_t seed      = -1; // RNG seed
17 |     int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
18 |     int32_t n_predict = 128; // new tokens to predict
19 |     int32_t repeat_last_n = 64; // last n tokens to penalize
20 |     int32_t n_ctx = 512; // context size
21 | 
22 |     // sampling parameters
23 |     int32_t top_k = 40;
24 |     float   top_p = 0.95f;
25 |     float   temp  = 0.80f;
26 |     float   repeat_penalty = 1.30f;
27 | 
28 |     int32_t n_batch = 8; // batch size for prompt processing
29 | 
30 |     std::string model = "models/lamma-7B/ggml-model.bin"; // model path
31 |     std::string prompt;
32 | 
33 |     bool use_color = false; // use color to distinguish generations and inputs
34 | 
35 |     bool interactive = false; // interactive mode
36 |     bool interactive_start = false; // reverse prompt immediately
37 |     std::string antiprompt = ""; // string upon seeing which more user input is prompted
38 | };
39 | 
40 | bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
41 | 
42 | void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
43 | 
44 | std::string gpt_random_prompt(std::mt19937 & rng);
45 | 
46 | //
47 | // Vocab utils
48 | //
49 | 
50 | struct gpt_vocab {
51 |     using id    = int32_t;
52 |     using token = std::string;
53 | 
54 |     std::map<token, id> token_to_id;
55 |     std::map<id, token> id_to_token;
56 | };
57 | 
58 | void replace(std::string & str, const std::string & needle, const std::string & replacement);
59 | 
60 | // poor-man's JSON parsing
61 | std::map<std::string, int32_t> json_parse(const std::string & fname);
62 | 
63 | // split text into tokens
64 | //
65 | // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
66 | //
67 | // Regex (Python):
68 | // r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
69 | //
70 | // Regex (C++):
71 | // R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
72 | //
73 | std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
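// e.g. the regex above splits "I'm 25 years old!" into the pieces
// ["I", "'m", " 25", " years", " old", "!"], which gpt_tokenize then maps to ids via the vocab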
74 | 
75 | // TODO: this is probably wrong, but I cannot figure out how this tokenizer works ..
76 | // ref: https://github.com/google/sentencepiece
77 | std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos);
78 | 
79 | // load the tokens from encoder.json
80 | bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
81 | 
82 | // sample next token given probabilities for each embedding
83 | //
84 | //   - consider only the top K tokens
85 | //   - from them, consider only the top tokens with cumulative probability > P
86 | //
87 | gpt_vocab::id llama_sample_top_p_top_k(
88 |         const gpt_vocab & vocab,
89 |         const float * logits,
90 |         std::vector<gpt_vocab::id> & last_n_tokens,
91 |         double repeat_penalty,
92 |         int top_k,
93 |         double top_p,
94 |         double temp,
95 |         std::mt19937 & rng);
96 | 
97 | // filter to top K tokens from list of logits
98 | void sample_top_k(std::vector<std::pair<double, gpt_vocab::id>> & logits_id, int top_k);
99 | 
100 | //
101 | // Quantization
102 | //
103 | 
104 | size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist);
105 | size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist);
106 | 
--------------------------------------------------------------------------------
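A minimal sketch (not one of the repository files) of how the declarations in utils.h compose: load a vocab, tokenize a prompt, and sample one token. The vocab path and the zero-filled stand-in logits are assumptions for illustration; in main.cpp the logits come from evaluating the model.

// usage_sketch.cpp (illustrative only)
#include "utils.h"

#include <cstdio>
#include <random>
#include <vector>

int main() {
    gpt_vocab vocab;
    if (!gpt_vocab_init("models/vocab.json", vocab)) { // hypothetical vocab path
        return 1;
    }

    // tokenize the prompt; bos = true prepends the BOS token (id 1)
    const std::vector<gpt_vocab::id> tokens = llama_tokenize(vocab, "Hello world", true);
    printf("prompt has %d tokens\n", (int) tokens.size());

    // stand-in logits; a real program takes these from the model's last evaluation
    std::vector<float> logits(vocab.id_to_token.size(), 0.0f);

    std::vector<gpt_vocab::id> last_n_tokens(64, 0); // recent history for the repeat penalty
    std::mt19937 rng(0);

    const gpt_vocab::id next = llama_sample_top_p_top_k(
            vocab, logits.data(), last_n_tokens,
            /*repeat_penalty=*/1.30, /*top_k=*/40, /*top_p=*/0.95, /*temp=*/0.80, rng);

    printf("sampled token id: %d\n", next);
    return 0;
}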