├── .github
│   └── workflows
│       └── build.yml
├── .gitignore
├── CMakeLists.txt
├── LICENSE
├── Makefile
├── README.md
├── convert-pth-to-ggml.py
├── ggml.c
├── ggml.h
├── main.cpp
├── quantize.cpp
├── quantize.sh
├── screencast.gif
├── utils.cpp
└── utils.h
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 |
3 | on:
4 | workflow_dispatch: # allows manual triggering
5 | inputs:
6 | create_release:
7 | description: 'Create new release'
8 | required: true
9 | type: boolean
10 | push:
11 | paths: ['.github/workflows/**', 'CMakeLists.txt', 'Makefile', '**.h', '*.c', '**.cpp']
12 | pull_request:
13 | types: [opened, synchronize, edited, reopened, review_requested, ready_for_review]
14 | paths: ['CMakeLists.txt', 'Makefile', '**.h', '*.c', '**.cpp']
15 |
16 | env:
17 | BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
18 |
19 | jobs:
20 | ubuntu-latest:
21 | runs-on: ubuntu-latest
22 |
23 | steps:
24 | - name: Clone
25 | id: checkout
26 | uses: actions/checkout@v1
27 |
28 | - name: Dependencies
29 | id: depends
30 | run: |
31 | sudo apt-get update
32 | sudo apt-get install build-essential
33 |
34 | - name: Build
35 | id: make_build
36 | run: |
37 | make
38 |
39 | - name: Archive production artifacts
40 | uses: actions/upload-artifact@v3
41 | with:
42 | name: ubuntu
43 | path: |
44 | chat
45 |
46 |
47 | macOS-latest:
48 | runs-on: macOS-latest
49 |
50 | steps:
51 | - name: Clone
52 | id: checkout
53 | uses: actions/checkout@v1
54 |
55 | - name: Dependencies
56 | id: depends
57 | run: |
58 | brew update
59 |
60 | - name: Build
61 | id: make_build
62 | run: |
63 | make
64 |
65 | - name: Archive production artifacts
66 | uses: actions/upload-artifact@v3
67 | with:
68 | name: macos
69 | path: |
70 | chat
71 |
72 | # macos-arm64:
73 | # runs-on: macos-arm64
74 |
75 | # steps:
76 | # - name: Clone
77 | # id: checkout
78 | # uses: actions/checkout@v1
79 |
80 | # - name: Dependencies
81 | # id: depends
82 | # run: |
83 | # brew update
84 |
85 | # - name: Build
86 | # id: make_build
87 | # run: |
88 | # make
89 |
90 | # - name: Archive production artifacts
91 | # uses: actions/upload-artifact@v3
92 | # with:
93 | # name: macos
94 | # path: |
95 | # chat
96 |
97 | windows-latest:
98 | runs-on: windows-latest
99 |
100 | steps:
101 | - name: Clone
102 | id: checkout
103 | uses: actions/checkout@v1
104 |
105 | - name: Build
106 | id: cmake_build
107 | run: |
108 | mkdir build
109 | cd build
110 | cmake ..
111 | cmake --build . --config Release
112 |
113 | - name: Set commit hash variables
114 | id: commit
115 | if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
116 | uses: pr-mpt/actions-commit-hash@v2
117 |
118 | - name: Pack artifacts
119 | id: pack_artifacts
120 | if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
121 | run: |
122 | 7z a alpaca-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip .\build\Release\*
123 |
124 | - name: Create release
125 | id: create_release
126 | if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
127 | uses: zendesk/action-create-release@v1
128 | env:
129 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
130 | with:
131 | tag_name: ${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}
132 |
133 | - name: Upload release
134 | id: upload_release
135 | if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
136 | uses: actions/upload-release-asset@v1
137 | env:
138 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
139 | with:
140 | upload_url: ${{ steps.create_release.outputs.upload_url }}
141 | asset_path: .\alpaca-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip
142 | asset_name: alpaca-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip
143 | asset_content_type: application/octet-stream
144 |
145 | # ubuntu-latest-gcc:
146 | # runs-on: ubuntu-latest
147 | #
148 | # strategy:
149 | # matrix:
150 | # build: [Debug, Release]
151 | #
152 | # steps:
153 | # - name: Clone
154 | # uses: actions/checkout@v1
155 | #
156 | # - name: Dependencies
157 | # run: |
158 | # sudo apt-get update
159 | # sudo apt-get install build-essential
160 | # sudo apt-get install cmake
161 | #
162 | # - name: Configure
163 | # run: cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
164 | #
165 | # - name: Build
166 | # run: |
167 | # make
168 | #
169 | # ubuntu-latest-clang:
170 | # runs-on: ubuntu-latest
171 | #
172 | # strategy:
173 | # matrix:
174 | # build: [Debug, Release]
175 | #
176 | # steps:
177 | # - name: Clone
178 | # uses: actions/checkout@v1
179 | #
180 | # - name: Dependencies
181 | # run: |
182 | # sudo apt-get update
183 | # sudo apt-get install build-essential
184 | # sudo apt-get install cmake
185 | #
186 | # - name: Configure
187 | # run: cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang
188 | #
189 | # - name: Build
190 | # run: |
191 | # make
192 | #
193 | # ubuntu-latest-gcc-sanitized:
194 | # runs-on: ubuntu-latest
195 | #
196 | # strategy:
197 | # matrix:
198 | # sanitizer: [ADDRESS, THREAD, UNDEFINED]
199 | #
200 | # steps:
201 | # - name: Clone
202 | # uses: actions/checkout@v1
203 | #
204 | # - name: Dependencies
205 | # run: |
206 | # sudo apt-get update
207 | # sudo apt-get install build-essential
208 | # sudo apt-get install cmake
209 | #
210 | # - name: Configure
211 | # run: cmake . -DCMAKE_BUILD_TYPE=Debug -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON
212 | #
213 | # - name: Build
214 | # run: |
215 | # make
216 | #
217 | # windows:
218 | # runs-on: windows-latest
219 | #
220 | # strategy:
221 | # matrix:
222 | # build: [Release]
223 | # arch: [Win32, x64]
224 | # include:
225 | # - arch: Win32
226 | # s2arc: x86
227 | # - arch: x64
228 | # s2arc: x64
229 | #
230 | # steps:
231 | # - name: Clone
232 | # uses: actions/checkout@v1
233 | #
234 | # - name: Add msbuild to PATH
235 | # uses: microsoft/setup-msbuild@v1
236 | #
237 | # - name: Configure
238 | # run: >
239 | # cmake -S . -B ./build -A ${{ matrix.arch }}
240 | # -DCMAKE_BUILD_TYPE=${{ matrix.build }}
241 | #
242 | # - name: Build
243 | # run: |
244 | # cd ./build
245 | # msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
246 | #
247 | # - name: Upload binaries
248 | # uses: actions/upload-artifact@v1
249 | # with:
250 | # name: llama-bin-${{ matrix.arch }}
251 | # path: build/bin/${{ matrix.build }}
252 | #
253 | # windows-blas:
254 | # runs-on: windows-latest
255 | #
256 | # strategy:
257 | # matrix:
258 | # build: [Release]
259 | # arch: [Win32, x64]
260 | # blas: [ON]
261 | # include:
262 | # - arch: Win32
263 | # obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x86.zip
264 | # s2arc: x86
265 | # - arch: x64
266 | # obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x64.zip
267 | # s2arc: x64
268 | #
269 | # steps:
270 | # - name: Clone
271 | # uses: actions/checkout@v1
272 | #
273 | # - name: Add msbuild to PATH
274 | # uses: microsoft/setup-msbuild@v1
275 | #
276 | # - name: Fetch OpenBLAS
277 | # if: matrix.blas == 'ON'
278 | # run: |
279 | # C:/msys64/usr/bin/wget.exe -qO blas.zip ${{ matrix.obzip }}
280 | # 7z x blas.zip -oblas -y
281 | # copy blas/include/cblas.h .
282 | # copy blas/include/openblas_config.h .
283 | # echo "blasdir=$env:GITHUB_WORKSPACE/blas" >> $env:GITHUB_ENV
284 | #
285 | # - name: Configure
286 | # run: >
287 | # cmake -S . -B ./build -A ${{ matrix.arch }}
288 | # -DCMAKE_BUILD_TYPE=${{ matrix.build }}
289 | # -DLLAMA_SUPPORT_OPENBLAS=${{ matrix.blas }}
290 | # -DCMAKE_LIBRARY_PATH="$env:blasdir/lib"
291 | #
292 | # - name: Build
293 | # run: |
294 | # cd ./build
295 | # msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
296 | #
297 | # - name: Copy libopenblas.dll
298 | # if: matrix.blas == 'ON'
299 | # run: copy "$env:blasdir/bin/libopenblas.dll" build/bin/${{ matrix.build }}
300 | #
301 | # - name: Upload binaries
302 | # if: matrix.blas == 'ON'
303 | # uses: actions/upload-artifact@v1
304 | # with:
305 | # name: llama-blas-bin-${{ matrix.arch }}
306 | # path: build/bin/${{ matrix.build }}
307 | #
308 | # emscripten:
309 | # runs-on: ubuntu-latest
310 | #
311 | # strategy:
312 | # matrix:
313 | # build: [Release]
314 | #
315 | # steps:
316 | # - name: Clone
317 | # uses: actions/checkout@v1
318 | #
319 | # - name: Dependencies
320 | # run: |
321 | # wget -q https://github.com/emscripten-core/emsdk/archive/master.tar.gz
322 | # tar -xvf master.tar.gz
323 | # emsdk-master/emsdk update
324 | # emsdk-master/emsdk install latest
325 | # emsdk-master/emsdk activate latest
326 | #
327 | # - name: Configure
328 | # run: echo "tmp"
329 | #
330 | # - name: Build
331 | # run: |
332 | # pushd emsdk-master
333 | # source ./emsdk_env.sh
334 | # popd
335 | # emcmake cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
336 | # make
337 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /chat
2 |
3 | *.o
4 | *.a
5 | .cache/
6 | .vs/
7 | .vscode/
8 | .DS_Store
9 |
10 | build/
11 | build-em/
12 | build-debug/
13 | build-release/
14 | build-static/
15 | build-no-accel/
16 | build-sanitize-addr/
17 | build-sanitize-thread/
18 |
19 | models/*
20 | *.bin
21 |
22 | /main
23 | /quantize
24 |
25 | arm_neon.h
26 | compile_commands.json
27 |
28 | # Windows CMake files
29 | *.vcxproj
30 | *.filters
31 | *.cmake
32 | *.sln
33 | x64/
34 | Debug/
35 | Release/
36 | CMakeFiles/
37 | CMakeCache.txt
38 | *.dir/
39 |
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.8)
2 | project("alpaca.cpp")
3 |
4 | set(CMAKE_CXX_STANDARD 20)
5 | set(CMAKE_CXX_STANDARD_REQUIRED true)
6 | set(CMAKE_C_STANDARD 11)
7 |
8 | if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
9 | set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
10 | set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
11 | endif()
12 |
13 | option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
14 | option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
15 |
16 | option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF)
17 | option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
18 | option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
19 |
20 | if (APPLE)
21 | option(LLAMA_NO_ACCELERATE "llama: disable Accelerate framework" OFF)
22 | option(LLAMA_NO_AVX "llama: disable AVX" OFF)
23 | option(LLAMA_NO_AVX2 "llama: disable AVX2" OFF)
24 | option(LLAMA_NO_FMA "llama: disable FMA" OFF)
25 | endif()
26 |
27 | if (NOT MSVC)
28 | if (LLAMA_SANITIZE_THREAD)
29 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=thread")
30 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=thread")
31 | endif()
32 |
33 | if (LLAMA_SANITIZE_ADDRESS)
34 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
35 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
36 | endif()
37 |
38 | if (LLAMA_SANITIZE_UNDEFINED)
39 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=undefined")
40 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined")
41 | endif()
42 | endif()
43 |
44 | if (APPLE AND NOT LLAMA_NO_ACCELERATE)
45 | find_library(ACCELERATE_FRAMEWORK Accelerate)
46 | if (ACCELERATE_FRAMEWORK)
47 | message(STATUS "Accelerate framework found")
48 |
49 | set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK})
50 | set(LLAMA_EXTRA_FLAGS ${LLAMA_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
51 | else()
52 | message(WARNING "Accelerate framework not found")
53 | endif()
54 | endif()
55 |
56 | if (LLAMA_ALL_WARNINGS)
57 | if (NOT MSVC)
58 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} \
59 | -Wall \
60 | -Wextra \
61 | -Wpedantic \
62 | -Wshadow \
63 | -Wcast-qual \
64 | -Wstrict-prototypes \
65 | -Wpointer-arith \
66 | -Wno-unused-function \
67 | ")
68 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} \
69 | -Wall \
70 | -Wextra \
71 | -Wpedantic \
72 | -Wcast-qual \
73 | ")
74 | else()
75 | # todo : msvc
76 | endif()
77 | endif()
78 |
79 | message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
80 |
81 | if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
82 | message(STATUS "ARM detected")
83 | else()
84 | message(STATUS "x86 detected")
85 | if (MSVC)
86 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
87 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2")
88 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2")
89 | else()
90 | if(NOT LLAMA_NO_AVX)
91 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
92 | endif()
93 | if(NOT LLAMA_NO_AVX2)
94 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
95 | endif()
96 | if(NOT LLAMA_NO_FMA)
97 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
98 | endif()
99 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
100 | endif()
101 | endif()
102 |
103 | # if (LLAMA_PERF)
104 | # set(LLAMA_EXTRA_FLAGS ${LLAMA_EXTRA_FLAGS} -DGGML_PERF)
105 | # endif()
106 |
107 | add_executable(main
108 | main.cpp
109 | utils.cpp
110 | utils.h)
111 |
112 | add_executable(quantize
113 | quantize.cpp
114 | utils.cpp
115 | utils.h)
116 |
117 | add_library(ggml
118 | ggml.c
119 | ggml.h)
120 |
121 | target_compile_definitions(ggml PUBLIC ${LLAMA_EXTRA_FLAGS})
122 | target_compile_definitions(main PUBLIC ${LLAMA_EXTRA_FLAGS})
123 | target_compile_definitions(quantize PUBLIC ${LLAMA_EXTRA_FLAGS})
124 |
125 | target_link_libraries(ggml PRIVATE ${LLAMA_EXTRA_LIBS})
126 | target_include_directories(ggml PUBLIC .)
127 | target_link_libraries(quantize PRIVATE ggml)
128 | target_link_libraries(main PRIVATE ggml)
129 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Georgi Gerganov
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | ifndef UNAME_S
2 | UNAME_S := $(shell uname -s)
3 | endif
4 |
5 | ifndef UNAME_P
6 | UNAME_P := $(shell uname -p)
7 | endif
8 |
9 | ifndef UNAME_M
10 | UNAME_M := $(shell uname -m)
11 | endif
12 |
13 | CCV := $(shell $(CC) --version | head -n 1)
14 | CXXV := $(shell $(CXX) --version | head -n 1)
15 |
16 | # Mac OS + Arm can report x86_64
17 | # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
18 | ifeq ($(UNAME_S),Darwin)
19 | ifneq ($(UNAME_P),arm)
20 | SYSCTL_M := $(shell sysctl -n hw.optional.arm64)
21 | ifeq ($(SYSCTL_M),1)
22 | # UNAME_P := arm
23 | # UNAME_M := arm64
24 | warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789)
25 | endif
26 | endif
27 | endif
28 |
29 | #
30 | # Compile flags
31 | #
32 |
33 | CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
34 | CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
35 | LDFLAGS =
36 |
37 | # OS specific
38 | # TODO: support Windows
39 | ifeq ($(UNAME_S),Linux)
40 | CFLAGS += -pthread
41 | CXXFLAGS += -pthread
42 | endif
43 | ifeq ($(UNAME_S),Darwin)
44 | CFLAGS += -pthread
45 | CXXFLAGS += -pthread
46 | endif
47 | ifeq ($(UNAME_S),FreeBSD)
48 | CFLAGS += -pthread
49 | CXXFLAGS += -pthread
50 | endif
51 | ifeq ($(UNAME_S),NetBSD)
52 | CFLAGS += -pthread
53 | CXXFLAGS += -pthread
54 | endif
55 | ifeq ($(UNAME_S),Haiku)
56 | CFLAGS += -pthread
57 | CXXFLAGS += -pthread
58 | endif
59 |
60 | # Architecture specific
61 | # TODO: probably these flags need to be tweaked on some architectures
62 | # feel free to update the Makefile for your architecture and send a pull request or issue
63 | ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
64 | ifeq ($(UNAME_S),Darwin)
65 | CFLAGS += -mf16c
66 | AVX1_M := $(shell sysctl machdep.cpu.features)
67 | ifneq (,$(findstring FMA,$(AVX1_M)))
68 | CFLAGS += -mfma
69 | endif
70 | ifneq (,$(findstring AVX1.0,$(AVX1_M)))
71 | CFLAGS += -mavx
72 | endif
73 | AVX2_M := $(shell sysctl machdep.cpu.leaf7_features)
74 | ifneq (,$(findstring AVX2,$(AVX2_M)))
75 | CFLAGS += -mavx2
76 | endif
77 | else ifeq ($(UNAME_S),Linux)
78 | AVX1_M := $(shell grep "avx " /proc/cpuinfo)
79 | ifneq (,$(findstring avx,$(AVX1_M)))
80 | CFLAGS += -mavx
81 | endif
82 | AVX2_M := $(shell grep "avx2 " /proc/cpuinfo)
83 | ifneq (,$(findstring avx2,$(AVX2_M)))
84 | CFLAGS += -mavx2
85 | endif
86 | FMA_M := $(shell grep "fma " /proc/cpuinfo)
87 | ifneq (,$(findstring fma,$(FMA_M)))
88 | CFLAGS += -mfma
89 | endif
90 | F16C_M := $(shell grep "f16c " /proc/cpuinfo)
91 | ifneq (,$(findstring f16c,$(F16C_M)))
92 | CFLAGS += -mf16c
93 | endif
94 | SSE3_M := $(shell grep "sse3 " /proc/cpuinfo)
95 | ifneq (,$(findstring sse3,$(SSE3_M)))
96 | CFLAGS += -msse3
97 | endif
98 | else ifeq ($(UNAME_S),Haiku)
99 | AVX1_M := $(shell sysinfo -cpu | grep "AVX ")
100 | ifneq (,$(findstring avx,$(AVX1_M)))
101 | CFLAGS += -mavx
102 | endif
103 | AVX2_M := $(shell sysinfo -cpu | grep "AVX2 ")
104 | ifneq (,$(findstring avx2,$(AVX2_M)))
105 | CFLAGS += -mavx2
106 | endif
107 | FMA_M := $(shell sysinfo -cpu | grep "FMA ")
108 | ifneq (,$(findstring fma,$(FMA_M)))
109 | CFLAGS += -mfma
110 | endif
111 | F16C_M := $(shell sysinfo -cpu | grep "F16C ")
112 | ifneq (,$(findstring f16c,$(F16C_M)))
113 | CFLAGS += -mf16c
114 | endif
115 | else
116 | CFLAGS += -mfma -mf16c -mavx -mavx2
117 | endif
118 | endif
119 | ifeq ($(UNAME_M),amd64)
120 | CFLAGS += -mavx -mavx2 -mfma -mf16c
121 | endif
122 | ifneq ($(filter ppc64%,$(UNAME_M)),)
123 | POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
124 | ifneq (,$(findstring POWER9,$(POWER9_M)))
125 | CFLAGS += -mpower9-vector
126 | endif
127 | # Require c++23's std::byteswap for big-endian support.
128 | ifeq ($(UNAME_M),ppc64)
129 | CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN
130 | endif
131 | endif
132 | ifndef LLAMA_NO_ACCELERATE
133 | # Mac M1 - include Accelerate framework
134 | ifeq ($(UNAME_S),Darwin)
135 | CFLAGS += -DGGML_USE_ACCELERATE
136 | LDFLAGS += -framework Accelerate
137 | endif
138 | endif
139 | ifdef LLAMA_OPENBLAS
140 | CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
141 | LDFLAGS += -lopenblas
142 | endif
143 | ifdef LLAMA_GPROF
144 | CFLAGS += -pg
145 | CXXFLAGS += -pg
146 | endif
147 | ifneq ($(filter aarch64%,$(UNAME_M)),)
148 | CFLAGS += -mcpu=native
149 | CXXFLAGS += -mcpu=native
150 | endif
151 | ifneq ($(filter armv6%,$(UNAME_M)),)
152 | # Raspberry Pi 1, 2, 3
153 | CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
154 | endif
155 | ifneq ($(filter armv7%,$(UNAME_M)),)
156 | # Raspberry Pi 4
157 | CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
158 | endif
159 | ifneq ($(filter armv8%,$(UNAME_M)),)
160 | # Raspberry Pi 4
161 | CFLAGS += -mfp16-format=ieee -mno-unaligned-access
162 | endif
163 |
164 | #
165 | # Print build information
166 | #
167 |
168 | $(info I llama.cpp build info: )
169 | $(info I UNAME_S: $(UNAME_S))
170 | $(info I UNAME_P: $(UNAME_P))
171 | $(info I UNAME_M: $(UNAME_M))
172 | $(info I CFLAGS: $(CFLAGS))
173 | $(info I CXXFLAGS: $(CXXFLAGS))
174 | $(info I LDFLAGS: $(LDFLAGS))
175 | $(info I CC: $(CCV))
176 | $(info I CXX: $(CXXV))
177 | $(info )
178 |
179 | default: main quantize
180 |
181 | #
182 | # Build library
183 | #
184 |
185 | ggml.o: ggml.c ggml.h
186 | $(CC) $(CFLAGS) -c ggml.c -o ggml.o
187 |
188 | utils.o: utils.cpp utils.h
189 | $(CXX) $(CXXFLAGS) -c utils.cpp -o utils.o
190 |
191 | clean:
192 | rm -f *.o main quantize
193 |
194 | main: main.cpp ggml.o utils.o
195 | $(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o -o main $(LDFLAGS)
196 | ./main -h
197 |
198 |
199 | quantize: quantize.cpp ggml.o utils.o
200 | $(CXX) $(CXXFLAGS) quantize.cpp ggml.o utils.o -o quantize $(LDFLAGS)
201 |
202 | #
203 | # Tests
204 | #
205 |
206 | .PHONY: tests
207 | tests:
208 | bash ./tests/run-tests.sh
209 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Alpaca.cpp
2 |
3 | Run a fast ChatGPT-like model locally on your device. The screencast below is not sped up; it is running on an M2 MacBook Air with 4GB of weights.
4 |
5 |
6 | [](https://asciinema.org/a/dfJ8QXZ4u978Ona59LPEldtKK)
7 |
8 |
9 | This combines the [LLaMA foundation model](https://github.com/facebookresearch/llama) with an [open reproduction](https://github.com/tloen/alpaca-lora) of [Stanford Alpaca](https://github.com/tatsu-lab/stanford_alpaca), a fine-tuning of the base model to obey instructions (akin to the [RLHF](https://huggingface.co/blog/rlhf) used to train ChatGPT), and a set of modifications to [llama.cpp](https://github.com/ggerganov/llama.cpp) that add a chat interface.
10 |
11 | ## Get started
12 |
13 | ```sh
14 | git clone https://github.com/antimatter15/alpaca.cpp
15 | cd alpaca.cpp
16 |
17 | make chat
18 | ./chat
19 | ```
20 |
21 | You can download the weights for `ggml-alpaca-7b-q4.bin` over BitTorrent: `magnet:?xt=urn:btih:5aaceaec63b03e51a98f04fd5c42320b2a033010&dn=ggml-alpaca-7b-q4.bin&tr=udp%3A%2F%2Ftracker.opentrackr.org%3A1337%2Fannounce&tr=udp%3A%2F%2Fopentracker.i2p.rocks%3A6969%2Fannounce`
22 |
23 |
24 | Alternatively, you can download them over IPFS.
25 |
26 | ```
27 | # any of these will work
28 | curl -o ggml-alpaca-7b-q4.bin -C - https://gateway.estuary.tech/gw/ipfs/QmQ1bf2BTnYxq73MFJWu1B7bQ2UD6qG7D7YDCxhTndVkPC
29 | curl -o ggml-alpaca-7b-q4.bin -C - https://ipfs.io/ipfs/QmQ1bf2BTnYxq73MFJWu1B7bQ2UD6qG7D7YDCxhTndVkPC
30 | curl -o ggml-alpaca-7b-q4.bin -C - https://cloudflare-ipfs.com/ipfs/QmQ1bf2BTnYxq73MFJWu1B7bQ2UD6qG7D7YDCxhTndVkPC
31 | ```
32 |
33 | Save the `ggml-alpaca-7b-q4.bin` file in the same directory as your `./chat` executable.
34 |
35 | The weights are based on the published fine-tunes from `alpaca-lora`, converted back into a PyTorch checkpoint with a [modified script](https://github.com/tloen/alpaca-lora/pull/19) and then quantized with llama.cpp in the usual way.
36 |
37 | ## Windows Setup
38 |
39 | - Download and install CMake: <https://cmake.org/download/>
40 | - Download and install `git`. If you've never used git before, consider a GUI client like <https://desktop.github.com/>
41 | - Clone this repo using your git client of choice (for GitHub Desktop, go to File -> Clone repository -> From URL and paste `https://github.com/antimatter15/alpaca.cpp` in as the URL)
42 | - Open a Windows Terminal inside the folder you cloned the repository to
43 | - Run the following commands one by one:
44 |
45 | ```ps1
46 | cmake .
47 | cmake --build . --config Release
48 | ```
49 |
50 | - Download the weights via any of the links in "Get started" above, and save the file as `ggml-alpaca-7b-q4.bin` in the main Alpaca directory.
51 | - In the terminal window, run this command:
52 | ```ps1
53 | .\Release\chat.exe
54 | ```
55 | - (You can append other launch options, like `--n 8`, to the same line as preferred)
56 | - You can now type to the AI in the terminal and it will reply. Enjoy!
57 |
58 | ## 13B
59 |
60 | TODO: write more docs here (PRs welcome)
61 |
62 | Torrent: `magnet:?xt=urn:btih:053b3d54d2e77ff020ebddf51dad681f2a651071&dn=ggml-alpaca-13b-q4.bin&tr=udp%3A%2F%2Ftracker.opentrackr.org%3A1337%2Fannounce&tr=udp%3A%2F%2Fopentracker.i2p.rocks%3A6969%2Fannounce&tr=udp%3A%2F%2Ftracker.openbittorrent.com%3A6969%2Fannounce&tr=udp%3A%2F%2F9.rarbg.com%3A2810%2Fannounce`
63 |
64 |
65 | ```
66 | ./chat -m ggml-alpaca-13b-q4.bin
67 | ```
68 |
69 | ## Credit
70 |
71 | This combines [Facebook's LLaMA](https://github.com/facebookresearch/llama), [Stanford Alpaca](https://crfm.stanford.edu/2023/03/13/alpaca.html), [alpaca-lora](https://github.com/tloen/alpaca-lora) and [corresponding weights](https://huggingface.co/tloen/alpaca-lora-7b/tree/main) by Eric Wang (which uses [Jason Phang's implementation of LLaMA](https://github.com/huggingface/transformers/pull/21955) on top of Hugging Face Transformers), and [llama.cpp](https://github.com/ggerganov/llama.cpp) by Georgi Gerganov. The chat implementation is based on Matvey Soloviev's [Interactive Mode](https://github.com/ggerganov/llama.cpp/pull/61) for llama.cpp. Inspired by [Simon Willison's](https://til.simonwillison.net/llms/llama-7b-m2) getting started guide for LLaMA. See also [Andy Matuschak](https://twitter.com/andy_matuschak/status/1636769182066053120)'s thread on adapting this to 13B, using fine-tuning weights by [Sam Witteveen](https://huggingface.co/samwit/alpaca13B-lora).
72 |
73 |
74 | ## Disclaimer
75 |
76 | Note that the model weights are only to be used for research purposes: they are derivative of LLaMA, and they use the published instruction data from the Stanford Alpaca project, which was generated with OpenAI's models; OpenAI's terms disallow using its outputs to train competing models.
77 |
78 |
79 |
--------------------------------------------------------------------------------
/convert-pth-to-ggml.py:
--------------------------------------------------------------------------------
1 | # Convert a LLaMA model checkpoint to a ggml compatible file
2 | #
3 | # Load the model using Torch
4 | # Iterate over all variables and write them to a binary file.
5 | #
6 | # For each variable, write the following:
7 | # - Number of dimensions (int)
8 | # - Name length (int)
9 | # - Dimensions (int[n_dims])
10 | # - Name (char[name_length])
11 | # - Data (the tensor elements: float32, or float16 for the larger matrices)
12 | #
13 | # By default, the bigger matrices are converted to 16-bit floats.
14 | # This can be disabled by adding the "use-f32" CLI argument.
15 | #
16 | # At the start of the ggml file we write the model parameters
17 | # and vocabulary.
18 | #
19 |
20 | import sys
21 | import json
22 | import struct
23 | import numpy as np
24 | import torch
25 | from sentencepiece import SentencePieceProcessor
26 |
27 | if len(sys.argv) < 3:
28 | print("Usage: convert-ckpt-to-ggml.py dir-model ftype\n")
29 | print(" ftype == 0 -> float32")
30 | print(" ftype == 1 -> float16")
31 | sys.exit(1)
32 |
33 | # output in the same directory as the model
34 | dir_model = sys.argv[1]
35 |
36 | fname_hparams = sys.argv[1] + "/params.json"
37 | fname_tokenizer = sys.argv[1] + "/../tokenizer.model"
38 |
39 | def get_n_parts(dim):
40 | if dim == 4096:
41 | return 1
42 | elif dim == 5120:
43 | return 2
44 | elif dim == 6656:
45 | return 4
46 | elif dim == 8192:
47 | return 8
48 | else:
49 | print("Invalid dim: " + str(dim))
50 | sys.exit(1)
51 |
52 | # possible data types
53 | # ftype == 0 -> float32
54 | # ftype == 1 -> float16
55 | #
56 | # map from ftype to string
57 | ftype_str = ["f32", "f16"]
58 |
59 | ftype = 1
60 | if len(sys.argv) > 2:
61 | ftype = int(sys.argv[2])
62 | if ftype < 0 or ftype > 1:
63 | print("Invalid ftype: " + str(ftype))
64 | sys.exit(1)
65 | fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
66 |
67 | with open(fname_hparams, "r") as f:
68 | hparams = json.load(f)
69 |
70 | tokenizer = SentencePieceProcessor(fname_tokenizer)
71 |
72 | hparams.update({"vocab_size": tokenizer.vocab_size()})
73 |
74 | n_parts = get_n_parts(hparams["dim"])
75 |
76 | print(hparams)
77 | print('n_parts = ', n_parts)
78 |
79 | for p in range(n_parts):
80 | print('Processing part ', p)
81 |
82 | #fname_model = sys.argv[1] + "/consolidated.00.pth"
83 | fname_model = sys.argv[1] + "/consolidated.0" + str(p) + ".pth"
84 | fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
85 | if (p > 0):
86 | fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin" + "." + str(p)
87 |
88 | model = torch.load(fname_model, map_location="cpu")
89 |
90 | fout = open(fname_out, "wb")
91 |
92 | fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
93 | fout.write(struct.pack("i", hparams["vocab_size"]))
94 | fout.write(struct.pack("i", hparams["dim"]))
95 | fout.write(struct.pack("i", hparams["multiple_of"]))
96 | fout.write(struct.pack("i", hparams["n_heads"]))
97 | fout.write(struct.pack("i", hparams["n_layers"]))
98 | fout.write(struct.pack("i", hparams["dim"] // hparams["n_heads"])) # rot (obsolete)
99 | fout.write(struct.pack("i", ftype))
100 |
101 | # Is this correct??
102 | for i in range(tokenizer.vocab_size()):
103 | if tokenizer.is_unknown(i):
104 | # "" token (translated as ??)
105 | text = " \u2047 ".encode("utf-8")
106 | fout.write(struct.pack("i", len(text)))
107 | fout.write(text)
108 | elif tokenizer.is_control(i):
109 | # ""/"" tokens
110 | fout.write(struct.pack("i", 0))
111 | elif tokenizer.is_byte(i):
112 | # "" tokens (which may be invalid UTF-8)
113 | piece = tokenizer.id_to_piece(i)
114 | if len(piece) != 6:
115 | print("Invalid token: " + piece)
116 | sys.exit(1)
117 | byte_value = int(piece[3:-1], 16)
118 | fout.write(struct.pack("i", 1))
119 | fout.write(struct.pack("B", byte_value))
120 | else:
121 | # normal token. Uses U+2581 (LOWER ONE EIGHTH BLOCK) to represent spaces.
122 | text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
123 | fout.write(struct.pack("i", len(text)))
124 | fout.write(text)
125 |
126 | for k, v in model.items():
127 | name = k
128 | shape = v.shape
129 |
130 | # skip layers.X.attention.inner_attention.rope.freqs
131 | if name[-5:] == "freqs":
132 | continue
133 |
134 | print("Processing variable: " + name + " with shape: ", shape, " and type: ", v.dtype)
135 |
136 | #data = tf.train.load_variable(dir_model, name).squeeze()
137 | data = v.numpy().squeeze()
138 | n_dims = len(data.shape)
139 |
140 | # for efficiency - transpose some matrices
141 | # "model/h.*/attn/c_attn/w"
142 | # "model/h.*/attn/c_proj/w"
143 | # "model/h.*/mlp/c_fc/w"
144 | # "model/h.*/mlp/c_proj/w"
145 | #if name[-14:] == "/attn/c_attn/w" or \
146 | # name[-14:] == "/attn/c_proj/w" or \
147 | # name[-11:] == "/mlp/c_fc/w" or \
148 | # name[-13:] == "/mlp/c_proj/w":
149 | # print(" Transposing")
150 | # data = data.transpose()
151 |
152 | dshape = data.shape
153 |
154 | # default type is fp16
155 | ftype_cur = 1
156 | if ftype == 0 or n_dims == 1:
157 | print(" Converting to float32")
158 | data = data.astype(np.float32)
159 | ftype_cur = 0
160 |
161 | # header
162 | sname = name.encode('utf-8')
163 | fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
164 | for i in range(n_dims):
165 | fout.write(struct.pack("i", dshape[n_dims - 1 - i]))
166 | fout.write(sname)
167 |
168 | # data
169 | data.tofile(fout)
170 |
171 | # I hope this deallocates the memory ..
172 | model = None
173 |
174 | fout.close()
175 |
176 | print("Done. Output file: " + fname_out + ", (part ", p, ")")
177 | print("")
178 |
--------------------------------------------------------------------------------
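The header comment of convert-pth-to-ggml.py fully specifies the on-disk layout, so a reader follows directly from it; note that for each variable the script actually emits a third header integer (the per-tensor data type) between the name length and the dimensions, which the comment's list omits. Below is a minimal C sketch of the reading side — not part of the repository, with the default file name an illustrative assumption. Like the writer, it assumes a little-endian host; it prints the hyperparameters and the metadata of the first tensor record.

```c
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

// read one little-endian int32, mirroring struct.pack("i", ...)
static int32_t read_i32(FILE *f) {
    int32_t v = 0;
    if (fread(&v, sizeof(v), 1, f) != 1) {
        fprintf(stderr, "unexpected end of file\n");
        exit(1);
    }
    return v;
}

int main(int argc, char **argv) {
    const char *fname = argc > 1 ? argv[1] : "ggml-model-f16.bin"; // hypothetical default
    FILE *f = fopen(fname, "rb");
    if (!f) { perror(fname); return 1; }

    if (read_i32(f) != 0x67676d6c) { // magic: "ggml" in hex
        fprintf(stderr, "bad magic\n");
        return 1;
    }

    // hyperparameters, in the order the converter writes them
    const int32_t n_vocab     = read_i32(f);
    const int32_t dim         = read_i32(f);
    const int32_t multiple_of = read_i32(f);
    const int32_t n_heads     = read_i32(f);
    const int32_t n_layers    = read_i32(f);
    const int32_t rot         = read_i32(f); // dim / n_heads (obsolete)
    const int32_t ftype       = read_i32(f); // 0 = f32, 1 = f16

    printf("n_vocab=%d dim=%d multiple_of=%d n_heads=%d n_layers=%d rot=%d ftype=%d\n",
           n_vocab, dim, multiple_of, n_heads, n_layers, rot, ftype);

    // vocabulary: each entry is an int32 byte length followed by that many bytes
    for (int32_t i = 0; i < n_vocab; i++) {
        fseek(f, read_i32(f), SEEK_CUR);
    }

    // first tensor record: n_dims, name length, data type, dims (reversed), name, data
    const int32_t n_dims    = read_i32(f);
    const int32_t name_len  = read_i32(f);
    const int32_t ftype_cur = read_i32(f);

    int32_t ne[2] = {1, 1}; // LLaMA weights are 1- or 2-dimensional after squeeze()
    for (int32_t i = 0; i < n_dims && i < 2; i++) ne[i] = read_i32(f);

    char name[256] = {0};
    fread(name, 1, (size_t)(name_len < 255 ? name_len : 255), f);
    printf("first tensor: %s, n_dims=%d, ne=[%d, %d], type=%s\n",
           name, n_dims, ne[0], ne[1], ftype_cur == 0 ? "f32" : "f16");

    fclose(f);
    return 0;
}
```

This is the same layout the model loader parses at startup, so the sketch doubles as a quick sanity check on a converted file.
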
/ggml.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | //
4 | // GGML Tensor Library
5 | //
6 | // This documentation is still a work in progress.
7 | // If you would like specific topics to be covered, feel free to drop a comment:
8 | //
9 | // https://github.com/ggerganov/whisper.cpp/issues/40
10 | //
11 | // ## Overview
12 | //
13 | // This library implements:
14 | //
15 | // - a set of tensor operations
16 | // - automatic differentiation
17 | // - basic optimization algorithms
18 | //
19 | // The aim of this library is to provide a minimalistic approach for various machine learning tasks. This includes,
20 | // but is not limited to, the following:
21 | //
22 | // - linear regression
23 | // - support vector machines
24 | // - neural networks
25 | //
26 | // The library allows the user to define a certain function using the available tensor operations. This function
27 | // definition is represented internally via a computation graph. Each tensor operation in the function definition
28 | // corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the
29 | // function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized
30 | // using one of the available optimization algorithms.
31 | //
32 | // For example, here we define the function: f(x) = a*x^2 + b
33 | //
34 | // {
35 | // struct ggml_init_params params = {
36 | // .mem_size = 16*1024*1024,
37 | // .mem_buffer = NULL,
38 | // };
39 | //
40 | // // memory allocation happens here
41 | // struct ggml_context * ctx = ggml_init(params);
42 | //
43 | // struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
44 | //
45 | // ggml_set_param(ctx, x); // x is an input variable
46 | //
47 | // struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
48 | // struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
49 | // struct ggml_tensor * x2 = ggml_mul(ctx, x, x);
50 | // struct ggml_tensor * f = ggml_add(ctx, ggml_mul(ctx, a, x2), b);
51 | //
52 | // ...
53 | // }
54 | //
55 | // Notice that the function definition above does not involve any actual computation. The computation is performed only
56 | // when the user explicitly requests it. For example, to compute the function's value at x = 2.0:
57 | //
58 | // {
59 | // ...
60 | //
61 | // struct ggml_cgraph gf = ggml_build_forward(f);
62 | //
63 | // // set the input variable and parameter values
64 | // ggml_set_f32(x, 2.0f);
65 | // ggml_set_f32(a, 3.0f);
66 | // ggml_set_f32(b, 4.0f);
67 | //
68 | // ggml_graph_compute(ctx0, &gf);
69 | //
70 | // printf("f = %f\n", ggml_get_f32_1d(f, 0));
71 | //
72 | // ...
73 | // }
74 | //
75 | // The actual computation is performed in the ggml_graph_compute() function.
76 | //
77 | // The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the
78 | // ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know
79 | // in advance how much memory you need for your computation. Alternatively, you can allocate a large enough buffer
80 | // and after defining the computation graph, call the ggml_used_mem() function to find out how much memory was
81 | // actually needed.
82 | //
83 | // The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic
84 | // differentiation and optimization algorithms.
85 | //
86 | // The described approach makes it possible to define the function graph once and then compute its forward or backward graphs
87 | // multiple times. All computations will use the same memory buffer allocated in the ggml_init() function. This way
88 | // the user can avoid the memory allocation overhead at runtime.
89 | //
90 | // The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class
91 | // citizens, but in theory the library can be extended to support FP8 and integer data types.
92 | //
93 | // Each tensor operation produces a new tensor. Initially the library was envisioned to support only the use of unary
94 | // and binary operations. Most of the available operations fall into one of these two categories. With time, it became
95 | // clear that the library needs to support more complex operations. The way to support these operations is not clear
96 | // yet, but a few examples are demonstrated in the following operations:
97 | //
98 | // - ggml_permute()
99 | // - ggml_conv_1d_1s()
100 | // - ggml_conv_1d_2s()
101 | //
102 | // For each tensor operator, the library implements a forward and backward computation function. The forward function
103 | // computes the output tensor value given the input tensor values. The backward function computes the adjoint of the
104 | // input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a
105 | // calculus class, or watch the following video:
106 | //
107 | // What is Automatic Differentiation?
108 | // https://www.youtube.com/watch?v=wG_nF1awSSY
109 | //
110 | //
111 | // ## Tensor data (struct ggml_tensor)
112 | //
113 | // The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of
114 | // the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains
115 | // pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example:
116 | //
117 | // {
118 | // struct ggml_tensor * c = ggml_add(ctx, a, b);
119 | //
120 | // assert(c->src[0] == a);
121 | // assert(c->src[1] == b);
122 | // }
123 | //
124 | // The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the
125 | // number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This makes it possible
126 | // to store tensors that are not contiguous in memory, which is useful for operations such as transposition and
127 | // permutation. All tensor operations have to take the stride into account and not assume that the tensor is
128 | // contiguous in memory.
129 | //
130 | // The data of the tensor is accessed via the "data" pointer. For example:
131 | //
132 | // {
133 | // struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
134 | //
135 | // // a[1, 2] = 1.0f;
136 | // *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f;
137 | //
138 | // // a[2, 0] = 2.0f;
139 | // *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f;
140 | //
141 | // ...
142 | // }
143 | //
144 | // Alternatively, there are helper functions, such as ggml_get_f32_1d() and ggml_set_f32_1d() that can be used.
145 | //
146 | // ## The matrix multiplication operator (ggml_mul_mat)
147 | //
148 | // TODO
149 | //
150 | //
151 | // ## Multi-threading
152 | //
153 | // TODO
154 | //
155 | //
156 | // ## Overview of ggml.c
157 | //
158 | // TODO
159 | //
160 | //
161 | // ## SIMD optimizations
162 | //
163 | // TODO
164 | //
165 | //
166 | // ## Debugging ggml
167 | //
168 | // TODO
169 | //
170 | //
171 |
172 | #ifdef __cplusplus
173 | extern "C" {
174 | #endif
175 |
176 | #include <stdint.h>
177 | #include <stddef.h>
178 | #include <stdbool.h>
179 |
180 | #define GGML_MAX_DIMS 4
181 | #define GGML_MAX_NODES 4096
182 | #define GGML_MAX_PARAMS 16
183 | #define GGML_MAX_CONTEXTS 64
184 | #define GGML_MAX_OPT 4
185 |
186 | #ifdef __ARM_NEON
187 | // we use the built-in 16-bit float type
188 | typedef __fp16 ggml_fp16_t;
189 | #else
190 | typedef uint16_t ggml_fp16_t;
191 | #endif
192 |
193 | // convert FP16 <-> FP32
194 | float ggml_fp16_to_fp32(ggml_fp16_t x);
195 | ggml_fp16_t ggml_fp32_to_fp16(float x);
196 |
197 | struct ggml_object;
198 | struct ggml_context;
199 |
200 | enum ggml_type {
201 | GGML_TYPE_Q4_0,
202 | GGML_TYPE_Q4_1,
203 | GGML_TYPE_I8,
204 | GGML_TYPE_I16,
205 | GGML_TYPE_I32,
206 | GGML_TYPE_F16,
207 | GGML_TYPE_F32,
208 | GGML_TYPE_COUNT,
209 | };
210 |
211 | // available tensor operations:
212 | enum ggml_op {
213 | GGML_OP_NONE = 0,
214 |
215 | GGML_OP_DUP,
216 | GGML_OP_ADD,
217 | GGML_OP_SUB,
218 | GGML_OP_MUL,
219 | GGML_OP_DIV,
220 | GGML_OP_SQR,
221 | GGML_OP_SQRT,
222 | GGML_OP_SUM,
223 | GGML_OP_MEAN,
224 | GGML_OP_REPEAT,
225 | GGML_OP_ABS,
226 | GGML_OP_SGN,
227 | GGML_OP_NEG,
228 | GGML_OP_STEP,
229 | GGML_OP_RELU,
230 | GGML_OP_GELU,
231 | GGML_OP_SILU,
232 | GGML_OP_NORM, // normalize
233 | GGML_OP_RMS_NORM,
234 |
235 | GGML_OP_MUL_MAT,
236 |
237 | GGML_OP_SCALE,
238 | GGML_OP_CPY,
239 | GGML_OP_RESHAPE,
240 | GGML_OP_VIEW,
241 | GGML_OP_PERMUTE,
242 | GGML_OP_TRANSPOSE,
243 | GGML_OP_GET_ROWS,
244 | GGML_OP_DIAG_MASK_INF,
245 | GGML_OP_SOFT_MAX,
246 | GGML_OP_ROPE,
247 | GGML_OP_CONV_1D_1S,
248 | GGML_OP_CONV_1D_2S,
249 |
250 | GGML_OP_FLASH_ATTN,
251 | GGML_OP_FLASH_FF,
252 |
253 | GGML_OP_COUNT,
254 | };
255 |
256 | // n-dimensional tensor
257 | struct ggml_tensor {
258 | enum ggml_type type;
259 |
260 | int n_dims;
261 | int ne[GGML_MAX_DIMS]; // number of elements
262 | size_t nb[GGML_MAX_DIMS]; // stride in bytes:
263 | // nb[0] = sizeof(type)
264 | // nb[1] = nb[0] * ne[0] + padding
265 | // nb[i] = nb[i-1] * ne[i-1]
266 |
267 | // compute data
268 | enum ggml_op op;
269 |
270 | bool is_param;
271 |
272 | struct ggml_tensor * grad;
273 | struct ggml_tensor * src0;
274 | struct ggml_tensor * src1;
275 | struct ggml_tensor * opt[GGML_MAX_OPT];
276 |
277 | // thread scheduling
278 | int n_tasks;
279 |
280 | // performance
281 | int perf_runs;
282 | int64_t perf_cycles;
283 | int64_t perf_time_us;
284 |
285 | void * data;
286 | char padding[8];
287 | };
288 |
289 | // computation graph
290 | struct ggml_cgraph {
291 | int n_nodes;
292 | int n_leafs;
293 | int n_threads;
294 |
295 | size_t work_size;
296 | struct ggml_tensor * work;
297 |
298 | struct ggml_tensor * nodes[GGML_MAX_NODES];
299 | struct ggml_tensor * grads[GGML_MAX_NODES];
300 | struct ggml_tensor * leafs[GGML_MAX_NODES];
301 |
302 | // performance
303 | int perf_runs;
304 | int64_t perf_cycles;
305 | int64_t perf_time_us;
306 | };
307 |
308 | // scratch buffer
309 | struct ggml_scratch {
310 | size_t offs;
311 | size_t size;
312 | void * data;
313 | };
314 |
315 | struct ggml_init_params {
316 | // memory pool
317 | size_t mem_size; // bytes
318 | void * mem_buffer; // if NULL, memory will be allocated internally
319 | };
320 |
321 | void ggml_time_init(void); // call this once at the beginning of the program
322 | int64_t ggml_time_ms(void);
323 | int64_t ggml_time_us(void);
324 | int64_t ggml_cycles(void);
325 | int64_t ggml_cycles_per_ms(void);
326 |
327 | void ggml_print_object (const struct ggml_object * obj);
328 | void ggml_print_objects(const struct ggml_context * ctx);
329 |
330 | int ggml_nelements(const struct ggml_tensor * tensor);
331 | size_t ggml_nbytes (const struct ggml_tensor * tensor);
332 |
333 | int ggml_blck_size (enum ggml_type type);
334 | size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
335 | float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
336 |
337 | size_t ggml_element_size(const struct ggml_tensor * tensor);
338 |
339 | struct ggml_context * ggml_init(struct ggml_init_params params);
340 | void ggml_free(struct ggml_context * ctx);
341 |
342 | size_t ggml_used_mem(const struct ggml_context * ctx);
343 |
344 | size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
345 |
346 | struct ggml_tensor * ggml_new_tensor(
347 | struct ggml_context * ctx,
348 | enum ggml_type type,
349 | int n_dims,
350 | const int *ne);
351 |
352 | struct ggml_tensor * ggml_new_tensor_1d(
353 | struct ggml_context * ctx,
354 | enum ggml_type type,
355 | int ne0);
356 |
357 | struct ggml_tensor * ggml_new_tensor_2d(
358 | struct ggml_context * ctx,
359 | enum ggml_type type,
360 | int ne0,
361 | int ne1);
362 |
363 | struct ggml_tensor * ggml_new_tensor_3d(
364 | struct ggml_context * ctx,
365 | enum ggml_type type,
366 | int ne0,
367 | int ne1,
368 | int ne2);
369 |
370 | struct ggml_tensor * ggml_new_tensor_4d(
371 | struct ggml_context * ctx,
372 | enum ggml_type type,
373 | int ne0,
374 | int ne1,
375 | int ne2,
376 | int ne3);
377 |
378 | struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
379 | struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
380 |
381 | struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
382 | struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
383 |
384 | struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
385 | struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
386 | struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
387 |
388 | int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
389 | void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
390 |
391 | float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
392 | void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
393 |
394 | void * ggml_get_data (const struct ggml_tensor * tensor);
395 | float * ggml_get_data_f32(const struct ggml_tensor * tensor);
396 |
397 | //
398 | // operations on tensors with backpropagation
399 | //
400 |
401 | struct ggml_tensor * ggml_dup(
402 | struct ggml_context * ctx,
403 | struct ggml_tensor * a);
404 |
405 | struct ggml_tensor * ggml_add(
406 | struct ggml_context * ctx,
407 | struct ggml_tensor * a,
408 | struct ggml_tensor * b);
409 |
410 | struct ggml_tensor * ggml_sub(
411 | struct ggml_context * ctx,
412 | struct ggml_tensor * a,
413 | struct ggml_tensor * b);
414 |
415 | struct ggml_tensor * ggml_mul(
416 | struct ggml_context * ctx,
417 | struct ggml_tensor * a,
418 | struct ggml_tensor * b);
419 |
420 | struct ggml_tensor * ggml_div(
421 | struct ggml_context * ctx,
422 | struct ggml_tensor * a,
423 | struct ggml_tensor * b);
424 |
425 | struct ggml_tensor * ggml_sqr(
426 | struct ggml_context * ctx,
427 | struct ggml_tensor * a);
428 |
429 | struct ggml_tensor * ggml_sqrt(
430 | struct ggml_context * ctx,
431 | struct ggml_tensor * a);
432 |
433 | // return scalar
434 | // TODO: compute sum along rows
435 | struct ggml_tensor * ggml_sum(
436 | struct ggml_context * ctx,
437 | struct ggml_tensor * a);
438 |
439 | // mean along rows
440 | struct ggml_tensor * ggml_mean(
441 | struct ggml_context * ctx,
442 | struct ggml_tensor * a);
443 |
444 | // if a is the same shape as b, and a is not a parameter, return a
445 | // otherwise, return a new tensor: repeat(a) to fit in b
446 | struct ggml_tensor * ggml_repeat(
447 | struct ggml_context * ctx,
448 | struct ggml_tensor * a,
449 | struct ggml_tensor * b);
450 |
451 | struct ggml_tensor * ggml_abs(
452 | struct ggml_context * ctx,
453 | struct ggml_tensor * a);
454 |
455 | struct ggml_tensor * ggml_sgn(
456 | struct ggml_context * ctx,
457 | struct ggml_tensor * a);
458 |
459 | struct ggml_tensor * ggml_neg(
460 | struct ggml_context * ctx,
461 | struct ggml_tensor * a);
462 |
463 | struct ggml_tensor * ggml_step(
464 | struct ggml_context * ctx,
465 | struct ggml_tensor * a);
466 |
467 | struct ggml_tensor * ggml_relu(
468 | struct ggml_context * ctx,
469 | struct ggml_tensor * a);
470 |
471 | // TODO: double-check this computation is correct
472 | struct ggml_tensor * ggml_gelu(
473 | struct ggml_context * ctx,
474 | struct ggml_tensor * a);
475 |
476 | struct ggml_tensor * ggml_silu(
477 | struct ggml_context * ctx,
478 | struct ggml_tensor * a);
479 |
480 | // normalize along rows
481 | // TODO: eps is hardcoded to 1e-5 for now
482 | struct ggml_tensor * ggml_norm(
483 | struct ggml_context * ctx,
484 | struct ggml_tensor * a);
485 |
486 | struct ggml_tensor * ggml_rms_norm(
487 | struct ggml_context * ctx,
488 | struct ggml_tensor * a);
489 |
490 | // A: m rows, n columns
491 | // B: p rows, n columns (i.e. we transpose it internally)
492 | // result is m columns, p rows
493 | struct ggml_tensor * ggml_mul_mat(
494 | struct ggml_context * ctx,
495 | struct ggml_tensor * a,
496 | struct ggml_tensor * b);
497 |
498 | //
499 | // operations on tensors without backpropagation
500 | //
501 |
502 | // in-place, returns view(a)
503 | struct ggml_tensor * ggml_scale(
504 | struct ggml_context * ctx,
505 | struct ggml_tensor * a,
506 | struct ggml_tensor * b);
507 |
508 | // a -> b, return view(b)
509 | struct ggml_tensor * ggml_cpy(
510 | struct ggml_context * ctx,
511 | struct ggml_tensor * a,
512 | struct ggml_tensor * b);
513 |
514 | // return view(a), b specifies the new shape
515 | // TODO: when we start computing gradient, make a copy instead of view
516 | struct ggml_tensor * ggml_reshape(
517 | struct ggml_context * ctx,
518 | struct ggml_tensor * a,
519 | struct ggml_tensor * b);
520 |
521 | // return view(a)
522 | // TODO: when we start computing gradient, make a copy instead of view
523 | struct ggml_tensor * ggml_reshape_2d(
524 | struct ggml_context * ctx,
525 | struct ggml_tensor * a,
526 | int ne0,
527 | int ne1);
528 |
529 | // return view(a)
530 | // TODO: when we start computing gradient, make a copy instead of view
531 | struct ggml_tensor * ggml_reshape_3d(
532 | struct ggml_context * ctx,
533 | struct ggml_tensor * a,
534 | int ne0,
535 | int ne1,
536 | int ne2);
537 |
538 | // offset in bytes
539 | struct ggml_tensor * ggml_view_1d(
540 | struct ggml_context * ctx,
541 | struct ggml_tensor * a,
542 | int ne0,
543 | size_t offset);
544 |
545 | struct ggml_tensor * ggml_view_2d(
546 | struct ggml_context * ctx,
547 | struct ggml_tensor * a,
548 | int ne0,
549 | int ne1,
550 | size_t nb1, // row stride in bytes
551 | size_t offset);
552 |
553 | struct ggml_tensor * ggml_permute(
554 | struct ggml_context * ctx,
555 | struct ggml_tensor * a,
556 | int axis0,
557 | int axis1,
558 | int axis2,
559 | int axis3);
560 |
561 | // alias for ggml_permute(ctx, a, 1, 0, 2, 3)
562 | struct ggml_tensor * ggml_transpose(
563 | struct ggml_context * ctx,
564 | struct ggml_tensor * a);
565 |
566 | struct ggml_tensor * ggml_get_rows(
567 | struct ggml_context * ctx,
568 | struct ggml_tensor * a,
569 | struct ggml_tensor * b);
570 |
571 | // set elements above the diagonal to -INF
572 | // in-place, returns view(a)
573 | struct ggml_tensor * ggml_diag_mask_inf(
574 | struct ggml_context * ctx,
575 | struct ggml_tensor * a,
576 | int n_past);
577 |
578 | // in-place, returns view(a)
579 | struct ggml_tensor * ggml_soft_max(
580 | struct ggml_context * ctx,
581 | struct ggml_tensor * a);
582 |
583 | // rotary position embedding
584 | // in-place, returns view(a)
585 | // if mode == 1, skip n_past elements
586 | // TODO: avoid creating a new tensor every time
587 | struct ggml_tensor * ggml_rope(
588 | struct ggml_context * ctx,
589 | struct ggml_tensor * a,
590 | int n_past,
591 | int n_dims,
592 | int mode);
593 |
594 | // padding = 1
595 | // TODO: we don't support extra parameters for now
596 | // that's why we are hard-coding the stride, padding, and dilation
597 | // not great ..
598 | struct ggml_tensor * ggml_conv_1d_1s(
599 | struct ggml_context * ctx,
600 | struct ggml_tensor * a,
601 | struct ggml_tensor * b);
602 |
603 | struct ggml_tensor * ggml_conv_1d_2s(
604 | struct ggml_context * ctx,
605 | struct ggml_tensor * a,
606 | struct ggml_tensor * b);
607 |
608 | struct ggml_tensor * ggml_flash_attn(
609 | struct ggml_context * ctx,
610 | struct ggml_tensor * q,
611 | struct ggml_tensor * k,
612 | struct ggml_tensor * v,
613 | bool masked);
614 |
615 | struct ggml_tensor * ggml_flash_ff(
616 | struct ggml_context * ctx,
617 | struct ggml_tensor * a,
618 | struct ggml_tensor * b0,
619 | struct ggml_tensor * b1,
620 | struct ggml_tensor * c0,
621 | struct ggml_tensor * c1);
622 |
623 | //
624 | // automatic differentiation
625 | //
626 |
627 | void ggml_set_param(
628 | struct ggml_context * ctx,
629 | struct ggml_tensor * tensor);
630 |
631 | void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
632 |
633 | struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
634 | struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
635 |
636 | void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
637 | void ggml_graph_reset (struct ggml_cgraph * cgraph);
638 |
639 | // print info and performance information for the graph
640 | void ggml_graph_print(const struct ggml_cgraph * cgraph);
641 |
642 | // dump the graph into a file using the dot format
643 | void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
644 |
645 | //
646 | // optimization
647 | //
648 |
649 | // optimization methods
650 | enum ggml_opt_type {
651 | GGML_OPT_ADAM,
652 | GGML_OPT_LBFGS,
653 | };
654 |
655 | // linesearch methods
656 | enum ggml_linesearch {
657 | GGML_LINESEARCH_DEFAULT = 1,
658 |
659 | GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0,
660 | GGML_LINESEARCH_BACKTRACKING_WOLFE = 1,
661 | GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
662 | };
663 |
664 | // optimization return values
665 | enum ggml_opt_result {
666 | GGML_OPT_OK = 0,
667 | GGML_OPT_DID_NOT_CONVERGE,
668 | GGML_OPT_NO_CONTEXT,
669 | GGML_OPT_INVALID_WOLFE,
670 | GGML_OPT_FAIL,
671 |
672 | GGML_LINESEARCH_FAIL = -128,
673 | GGML_LINESEARCH_MINIMUM_STEP,
674 | GGML_LINESEARCH_MAXIMUM_STEP,
675 | GGML_LINESEARCH_MAXIMUM_ITERATIONS,
676 | GGML_LINESEARCH_INVALID_PARAMETERS,
677 | };
678 |
679 | // optimization parameters
680 | //
681 | // see ggml.c (ggml_opt_default_params) for default values
682 | //
683 | struct ggml_opt_params {
684 | enum ggml_opt_type type;
685 |
686 | int n_threads;
687 |
688 | // delta-based convergence test
689 | //
690 | // if past == 0 - disabled
691 | // if past > 0:
692 | // stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
693 | //
694 | int past;
695 | float delta;
696 |
697 | // maximum number of iterations without improvement
698 | //
699 | // if 0 - disabled
700 | // if > 0:
701 | // assume convergence if no cost improvement in this number of iterations
702 | //
703 | int max_no_improvement;
704 |
705 | bool print_forward_graph;
706 | bool print_backward_graph;
707 |
708 | // ADAM parameters
709 | struct {
710 | int n_iter;
711 |
712 | float alpha; // learning rate
713 | float beta1;
714 | float beta2;
715 | float eps; // epsilon for numerical stability
716 | float eps_f; // epsilon for convergence test
717 | float eps_g; // epsilon for convergence test
718 | } adam;
719 |
720 | // LBFGS parameters
721 | struct {
722 | int m; // number of corrections to approximate the inv. Hessian
723 | int n_iter;
724 | int max_linesearch;
725 |
726 | float eps; // convergence tolerance
727 | float ftol; // line search tolerance
728 | float wolfe;
729 | float min_step;
730 | float max_step;
731 |
732 | enum ggml_linesearch linesearch;
733 | } lbfgs;
734 | };
735 |
736 | struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
737 |
738 | // optimize the function defined by the tensor f
739 | enum ggml_opt_result ggml_opt(
740 | struct ggml_context * ctx,
741 | struct ggml_opt_params params,
742 | struct ggml_tensor * f);
743 |
744 | //
745 | // system info
746 | //
747 |
748 | int ggml_cpu_has_avx(void);
749 | int ggml_cpu_has_avx2(void);
750 | int ggml_cpu_has_avx512(void);
751 | int ggml_cpu_has_fma(void);
752 | int ggml_cpu_has_neon(void);
753 | int ggml_cpu_has_arm_fma(void);
754 | int ggml_cpu_has_f16c(void);
755 | int ggml_cpu_has_fp16_va(void);
756 | int ggml_cpu_has_wasm_simd(void);
757 | int ggml_cpu_has_blas(void);
758 | int ggml_cpu_has_sse3(void);
759 | int ggml_cpu_has_vsx(void);
760 |
761 | #ifdef __cplusplus
762 | }
763 | #endif
764 |
--------------------------------------------------------------------------------
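To tie together the two fragments in the ggml.h overview comment, here is a complete program assembled solely from declarations in the header above: it defines f(x) = a*x^2 + b, computes f(2) with a = 3 and b = 4, and reports how much of the memory pool was used. This is a minimal sketch — the build line is an assumption based on this repository's Makefile outputs, and n_threads is set explicitly rather than relying on a default.

```c
// build sketch (assumes ggml.o built by this repo's Makefile):
//   cc -I. example.c ggml.o -o example -lm -lpthread
#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        .mem_size   = 16*1024*1024, // fixed pool; every tensor is allocated from it
        .mem_buffer = NULL,         // NULL -> ggml allocates the pool internally
    };

    struct ggml_context * ctx = ggml_init(params);

    // define f = a*x^2 + b; no computation happens yet
    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
    ggml_set_param(ctx, x); // mark x as an input variable

    struct ggml_tensor * a  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
    struct ggml_tensor * b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
    struct ggml_tensor * x2 = ggml_mul(ctx, x, x);
    struct ggml_tensor * f  = ggml_add(ctx, ggml_mul(ctx, a, x2), b);

    struct ggml_cgraph gf = ggml_build_forward(f);
    gf.n_threads = 1; // single-threaded for the sketch

    // set the input variable and parameter values, then run the graph
    ggml_set_f32(x, 2.0f);
    ggml_set_f32(a, 3.0f);
    ggml_set_f32(b, 4.0f);

    ggml_graph_compute(ctx, &gf);

    printf("f(2) = %f\n", ggml_get_f32_1d(f, 0)); // 3*2^2 + 4 = 16
    printf("used mem: %zu bytes\n", ggml_used_mem(ctx));

    ggml_free(ctx);
    return 0;
}
```

Because x was marked as a parameter, the same graph could in principle be extended with ggml_build_backward() for gradients, or the scalar f handed to ggml_opt() with ggml_opt_default_params(GGML_OPT_ADAM); both entry points are declared in the header above.
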
/main.cpp:
--------------------------------------------------------------------------------
1 | #define NOMINMAX
2 | #include "ggml.h"
3 |
4 | #include "utils.h"
5 |
6 | #include <cassert>
7 | #include <cmath>
8 | #include <cstdio>
9 | #include <cstring>
10 | #include <fstream>
11 | #include <iostream>