├── .clang-tidy ├── .devops ├── cloud-v-pipeline ├── full-cuda.Dockerfile ├── full-rocm.Dockerfile ├── full.Dockerfile ├── llama-cli-cann.Dockerfile ├── llama-cli-cuda.Dockerfile ├── llama-cli-intel.Dockerfile ├── llama-cli-rocm.Dockerfile ├── llama-cli-vulkan.Dockerfile ├── llama-cli.Dockerfile ├── llama-cpp-cuda.srpm.spec ├── llama-cpp.srpm.spec ├── llama-server-cuda.Dockerfile ├── llama-server-intel.Dockerfile ├── llama-server-rocm.Dockerfile ├── llama-server-vulkan.Dockerfile ├── llama-server.Dockerfile ├── nix │ ├── apps.nix │ ├── devshells.nix │ ├── docker.nix │ ├── jetson-support.nix │ ├── nixpkgs-instances.nix │ ├── package-gguf-py.nix │ ├── package.nix │ ├── python-scripts.nix │ ├── scope.nix │ └── sif.nix └── tools.sh ├── .dockerignore ├── .ecrc ├── .editorconfig ├── .flake8 ├── .github ├── ISSUE_TEMPLATE │ ├── 01-bug-low.yml │ ├── 02-bug-medium.yml │ ├── 03-bug-high.yml │ ├── 04-bug-critical.yml │ ├── 05-enhancement.yml │ ├── 06-research.yml │ ├── 07-refactor.yml │ └── config.yml ├── labeler.yml ├── pull_request_template.md └── workflows │ ├── bench.yml.disabled │ ├── build.yml │ ├── close-issue.yml │ ├── docker.yml │ ├── editorconfig.yml │ ├── gguf-publish.yml │ ├── labeler.yml │ ├── nix-ci-aarch64.yml │ ├── nix-ci.yml │ ├── nix-flake-update.yml │ ├── nix-publish-flake.yml │ ├── python-check-requirements.yml │ ├── python-lint.yml │ ├── python-type-check.yml │ └── server.yml ├── .gitignore ├── .gitmodules ├── .pre-commit-config.yaml ├── AUTHORS ├── CMakeLists.txt ├── CMakePresets.json ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── Package.swift ├── README.md ├── ci ├── README.md └── run.sh ├── cmake ├── arm64-windows-llvm.cmake ├── arm64-windows-msvc.cmake ├── build-info.cmake ├── git-vars.cmake ├── llama-config.cmake.in └── llama.pc.in ├── common ├── CMakeLists.txt ├── arg.cpp ├── arg.h ├── base64.hpp ├── build-info.cpp.in ├── cmake │ └── build-info-gen-cpp.cmake ├── common.cpp ├── common.h ├── console.cpp ├── console.h ├── json-schema-to-grammar.cpp ├── json-schema-to-grammar.h ├── json.hpp ├── log.cpp ├── log.h ├── ngram-cache.cpp ├── ngram-cache.h ├── sampling.cpp ├── sampling.h ├── stb_image.h ├── train.cpp └── train.h ├── convert_hf_to_gguf.py ├── convert_hf_to_gguf_update.py ├── convert_llama_ggml_to_gguf.py ├── convert_lora_to_gguf.py ├── docs ├── android.md ├── backend │ ├── BLIS.md │ ├── CANN.md │ └── SYCL.md ├── build.md ├── development │ ├── HOWTO-add-model.md │ ├── debugging-tests.md │ ├── llama-star │ │ ├── idea-arch.key │ │ └── idea-arch.pdf │ └── token_generation_performance_tips.md ├── docker.md └── install.md ├── flake.lock ├── flake.nix ├── ggml ├── .gitignore ├── CMakeLists.txt ├── cmake │ └── FindSIMD.cmake ├── include │ ├── ggml-alloc.h │ ├── ggml-backend.h │ ├── ggml-blas.h │ ├── ggml-cann.h │ ├── ggml-cuda.h │ ├── ggml-kompute.h │ ├── ggml-metal.h │ ├── ggml-rpc.h │ ├── ggml-sycl.h │ ├── ggml-vulkan.h │ └── ggml.h └── src │ ├── CMakeLists.txt │ ├── ggml-aarch64.c │ ├── ggml-aarch64.h │ ├── ggml-alloc.c │ ├── ggml-backend-impl.h │ ├── ggml-backend.cpp │ ├── ggml-blas.cpp │ ├── ggml-cann.cpp │ ├── ggml-cann │ ├── .clang-format │ ├── Doxyfile │ ├── acl_tensor.cpp │ ├── acl_tensor.h │ ├── aclnn_ops.cpp │ ├── aclnn_ops.h │ ├── common.h │ └── kernels │ │ ├── CMakeLists.txt │ │ ├── ascendc_kernels.h │ │ ├── dup.cpp │ │ ├── get_row_f16.cpp │ │ ├── get_row_f32.cpp │ │ ├── get_row_q4_0.cpp │ │ ├── get_row_q8_0.cpp │ │ ├── quantize_f16_q8_0.cpp │ │ ├── quantize_f32_q8_0.cpp │ │ └── quantize_float_to_q4_0.cpp │ ├── ggml-common.h │ ├── ggml-cpu-impl.h │ ├── 
ggml-cuda.cu │ ├── ggml-cuda │ ├── acc.cu │ ├── acc.cuh │ ├── arange.cu │ ├── arange.cuh │ ├── argmax.cu │ ├── argmax.cuh │ ├── argsort.cu │ ├── argsort.cuh │ ├── binbcast.cu │ ├── binbcast.cuh │ ├── clamp.cu │ ├── clamp.cuh │ ├── common.cuh │ ├── concat.cu │ ├── concat.cuh │ ├── conv-transpose-1d.cu │ ├── conv-transpose-1d.cuh │ ├── convert.cu │ ├── convert.cuh │ ├── count-equal.cu │ ├── count-equal.cuh │ ├── cpy.cu │ ├── cpy.cuh │ ├── cross-entropy-loss.cu │ ├── cross-entropy-loss.cuh │ ├── dequantize.cuh │ ├── diagmask.cu │ ├── diagmask.cuh │ ├── dmmv.cu │ ├── dmmv.cuh │ ├── fattn-common.cuh │ ├── fattn-tile-f16.cu │ ├── fattn-tile-f16.cuh │ ├── fattn-tile-f32.cu │ ├── fattn-tile-f32.cuh │ ├── fattn-vec-f16.cuh │ ├── fattn-vec-f32.cuh │ ├── fattn-wmma-f16.cuh │ ├── fattn.cu │ ├── fattn.cuh │ ├── getrows.cu │ ├── getrows.cuh │ ├── im2col.cu │ ├── im2col.cuh │ ├── mma.cuh │ ├── mmq.cu │ ├── mmq.cuh │ ├── mmvq.cu │ ├── mmvq.cuh │ ├── norm.cu │ ├── norm.cuh │ ├── opt-step-adamw.cu │ ├── opt-step-adamw.cuh │ ├── out-prod.cu │ ├── out-prod.cuh │ ├── pad.cu │ ├── pad.cuh │ ├── pool2d.cu │ ├── pool2d.cuh │ ├── quantize.cu │ ├── quantize.cuh │ ├── rope.cu │ ├── rope.cuh │ ├── rwkv-wkv.cu │ ├── rwkv-wkv.cuh │ ├── scale.cu │ ├── scale.cuh │ ├── softmax.cu │ ├── softmax.cuh │ ├── sum.cu │ ├── sum.cuh │ ├── sumrows.cu │ ├── sumrows.cuh │ ├── template-instances │ │ ├── fattn-vec-f16-instance-hs128-f16-f16.cu │ │ ├── fattn-vec-f16-instance-hs128-f16-q4_0.cu │ │ ├── fattn-vec-f16-instance-hs128-f16-q4_1.cu │ │ ├── fattn-vec-f16-instance-hs128-f16-q5_0.cu │ │ ├── fattn-vec-f16-instance-hs128-f16-q5_1.cu │ │ ├── fattn-vec-f16-instance-hs128-f16-q8_0.cu │ │ ├── fattn-vec-f16-instance-hs128-q4_0-f16.cu │ │ ├── fattn-vec-f16-instance-hs128-q4_0-q4_0.cu │ │ ├── fattn-vec-f16-instance-hs128-q4_0-q4_1.cu │ │ ├── fattn-vec-f16-instance-hs128-q4_0-q5_0.cu │ │ ├── fattn-vec-f16-instance-hs128-q4_0-q5_1.cu │ │ ├── fattn-vec-f16-instance-hs128-q4_0-q8_0.cu │ │ ├── fattn-vec-f16-instance-hs128-q4_1-f16.cu │ │ ├── fattn-vec-f16-instance-hs128-q4_1-q4_0.cu │ │ ├── fattn-vec-f16-instance-hs128-q4_1-q4_1.cu │ │ ├── fattn-vec-f16-instance-hs128-q4_1-q5_0.cu │ │ ├── fattn-vec-f16-instance-hs128-q4_1-q5_1.cu │ │ ├── fattn-vec-f16-instance-hs128-q4_1-q8_0.cu │ │ ├── fattn-vec-f16-instance-hs128-q5_0-f16.cu │ │ ├── fattn-vec-f16-instance-hs128-q5_0-q4_0.cu │ │ ├── fattn-vec-f16-instance-hs128-q5_0-q4_1.cu │ │ ├── fattn-vec-f16-instance-hs128-q5_0-q5_0.cu │ │ ├── fattn-vec-f16-instance-hs128-q5_0-q5_1.cu │ │ ├── fattn-vec-f16-instance-hs128-q5_0-q8_0.cu │ │ ├── fattn-vec-f16-instance-hs128-q5_1-f16.cu │ │ ├── fattn-vec-f16-instance-hs128-q5_1-q4_0.cu │ │ ├── fattn-vec-f16-instance-hs128-q5_1-q4_1.cu │ │ ├── fattn-vec-f16-instance-hs128-q5_1-q5_0.cu │ │ ├── fattn-vec-f16-instance-hs128-q5_1-q5_1.cu │ │ ├── fattn-vec-f16-instance-hs128-q5_1-q8_0.cu │ │ ├── fattn-vec-f16-instance-hs128-q8_0-f16.cu │ │ ├── fattn-vec-f16-instance-hs128-q8_0-q4_0.cu │ │ ├── fattn-vec-f16-instance-hs128-q8_0-q4_1.cu │ │ ├── fattn-vec-f16-instance-hs128-q8_0-q5_0.cu │ │ ├── fattn-vec-f16-instance-hs128-q8_0-q5_1.cu │ │ ├── fattn-vec-f16-instance-hs128-q8_0-q8_0.cu │ │ ├── fattn-vec-f16-instance-hs256-f16-f16.cu │ │ ├── fattn-vec-f16-instance-hs64-f16-f16.cu │ │ ├── fattn-vec-f16-instance-hs64-f16-q4_0.cu │ │ ├── fattn-vec-f16-instance-hs64-f16-q4_1.cu │ │ ├── fattn-vec-f16-instance-hs64-f16-q5_0.cu │ │ ├── fattn-vec-f16-instance-hs64-f16-q5_1.cu │ │ ├── fattn-vec-f16-instance-hs64-f16-q8_0.cu │ │ ├── fattn-vec-f32-instance-hs128-f16-f16.cu │ │ ├── 
fattn-vec-f32-instance-hs128-f16-q4_0.cu │ │ ├── fattn-vec-f32-instance-hs128-f16-q4_1.cu │ │ ├── fattn-vec-f32-instance-hs128-f16-q5_0.cu │ │ ├── fattn-vec-f32-instance-hs128-f16-q5_1.cu │ │ ├── fattn-vec-f32-instance-hs128-f16-q8_0.cu │ │ ├── fattn-vec-f32-instance-hs128-q4_0-f16.cu │ │ ├── fattn-vec-f32-instance-hs128-q4_0-q4_0.cu │ │ ├── fattn-vec-f32-instance-hs128-q4_0-q4_1.cu │ │ ├── fattn-vec-f32-instance-hs128-q4_0-q5_0.cu │ │ ├── fattn-vec-f32-instance-hs128-q4_0-q5_1.cu │ │ ├── fattn-vec-f32-instance-hs128-q4_0-q8_0.cu │ │ ├── fattn-vec-f32-instance-hs128-q4_1-f16.cu │ │ ├── fattn-vec-f32-instance-hs128-q4_1-q4_0.cu │ │ ├── fattn-vec-f32-instance-hs128-q4_1-q4_1.cu │ │ ├── fattn-vec-f32-instance-hs128-q4_1-q5_0.cu │ │ ├── fattn-vec-f32-instance-hs128-q4_1-q5_1.cu │ │ ├── fattn-vec-f32-instance-hs128-q4_1-q8_0.cu │ │ ├── fattn-vec-f32-instance-hs128-q5_0-f16.cu │ │ ├── fattn-vec-f32-instance-hs128-q5_0-q4_0.cu │ │ ├── fattn-vec-f32-instance-hs128-q5_0-q4_1.cu │ │ ├── fattn-vec-f32-instance-hs128-q5_0-q5_0.cu │ │ ├── fattn-vec-f32-instance-hs128-q5_0-q5_1.cu │ │ ├── fattn-vec-f32-instance-hs128-q5_0-q8_0.cu │ │ ├── fattn-vec-f32-instance-hs128-q5_1-f16.cu │ │ ├── fattn-vec-f32-instance-hs128-q5_1-q4_0.cu │ │ ├── fattn-vec-f32-instance-hs128-q5_1-q4_1.cu │ │ ├── fattn-vec-f32-instance-hs128-q5_1-q5_0.cu │ │ ├── fattn-vec-f32-instance-hs128-q5_1-q5_1.cu │ │ ├── fattn-vec-f32-instance-hs128-q5_1-q8_0.cu │ │ ├── fattn-vec-f32-instance-hs128-q8_0-f16.cu │ │ ├── fattn-vec-f32-instance-hs128-q8_0-q4_0.cu │ │ ├── fattn-vec-f32-instance-hs128-q8_0-q4_1.cu │ │ ├── fattn-vec-f32-instance-hs128-q8_0-q5_0.cu │ │ ├── fattn-vec-f32-instance-hs128-q8_0-q5_1.cu │ │ ├── fattn-vec-f32-instance-hs128-q8_0-q8_0.cu │ │ ├── fattn-vec-f32-instance-hs256-f16-f16.cu │ │ ├── fattn-vec-f32-instance-hs64-f16-f16.cu │ │ ├── fattn-vec-f32-instance-hs64-f16-q4_0.cu │ │ ├── fattn-vec-f32-instance-hs64-f16-q4_1.cu │ │ ├── fattn-vec-f32-instance-hs64-f16-q5_0.cu │ │ ├── fattn-vec-f32-instance-hs64-f16-q5_1.cu │ │ ├── fattn-vec-f32-instance-hs64-f16-q8_0.cu │ │ ├── fattn-wmma-f16-instance-kqfloat-cpb16.cu │ │ ├── fattn-wmma-f16-instance-kqfloat-cpb32.cu │ │ ├── fattn-wmma-f16-instance-kqhalf-cpb16.cu │ │ ├── fattn-wmma-f16-instance-kqhalf-cpb32.cu │ │ ├── fattn-wmma-f16-instance-kqhalf-cpb8.cu │ │ ├── generate_cu_files.py │ │ ├── mmq-instance-iq1_s.cu │ │ ├── mmq-instance-iq2_s.cu │ │ ├── mmq-instance-iq2_xs.cu │ │ ├── mmq-instance-iq2_xxs.cu │ │ ├── mmq-instance-iq3_s.cu │ │ ├── mmq-instance-iq3_xxs.cu │ │ ├── mmq-instance-iq4_nl.cu │ │ ├── mmq-instance-iq4_xs.cu │ │ ├── mmq-instance-q2_k.cu │ │ ├── mmq-instance-q3_k.cu │ │ ├── mmq-instance-q4_0.cu │ │ ├── mmq-instance-q4_1.cu │ │ ├── mmq-instance-q4_k.cu │ │ ├── mmq-instance-q5_0.cu │ │ ├── mmq-instance-q5_1.cu │ │ ├── mmq-instance-q5_k.cu │ │ ├── mmq-instance-q6_k.cu │ │ └── mmq-instance-q8_0.cu │ ├── tsembd.cu │ ├── tsembd.cuh │ ├── unary.cu │ ├── unary.cuh │ ├── upscale.cu │ ├── upscale.cuh │ ├── vecdotq.cuh │ └── vendors │ │ ├── cuda.h │ │ ├── hip.h │ │ └── musa.h │ ├── ggml-impl.h │ ├── ggml-kompute.cpp │ ├── ggml-metal.m │ ├── ggml-metal.metal │ ├── ggml-quants.c │ ├── ggml-quants.h │ ├── ggml-rpc.cpp │ ├── ggml-sycl.cpp │ ├── ggml-sycl │ ├── backend.hpp │ ├── common.cpp │ ├── common.hpp │ ├── concat.cpp │ ├── concat.hpp │ ├── conv.cpp │ ├── conv.hpp │ ├── convert.cpp │ ├── convert.hpp │ ├── dequantize.hpp │ ├── dmmv.cpp │ ├── dmmv.hpp │ ├── dpct │ │ └── helper.hpp │ ├── gemm.hpp │ ├── im2col.cpp │ ├── im2col.hpp │ ├── mmq.cpp │ ├── mmq.hpp │ ├── mmvq.cpp │ 
├── mmvq.hpp │ ├── norm.cpp │ ├── norm.hpp │ ├── presets.hpp │ ├── rope.cpp │ ├── rope.hpp │ ├── softmax.cpp │ ├── softmax.hpp │ ├── tsembd.cpp │ ├── tsembd.hpp │ └── vecdotq.hpp │ ├── ggml-vulkan.cpp │ ├── ggml.c │ ├── kompute-shaders │ ├── common.comp │ ├── op_add.comp │ ├── op_addrow.comp │ ├── op_cpy_f16_f16.comp │ ├── op_cpy_f16_f32.comp │ ├── op_cpy_f32_f16.comp │ ├── op_cpy_f32_f32.comp │ ├── op_diagmask.comp │ ├── op_gelu.comp │ ├── op_getrows.comp │ ├── op_getrows_f16.comp │ ├── op_getrows_f32.comp │ ├── op_getrows_q4_0.comp │ ├── op_getrows_q4_1.comp │ ├── op_getrows_q6_k.comp │ ├── op_mul.comp │ ├── op_mul_mat_f16.comp │ ├── op_mul_mat_mat_f32.comp │ ├── op_mul_mat_q4_0.comp │ ├── op_mul_mat_q4_1.comp │ ├── op_mul_mat_q6_k.comp │ ├── op_mul_mat_q8_0.comp │ ├── op_mul_mv_q_n.comp │ ├── op_mul_mv_q_n_pre.comp │ ├── op_norm.comp │ ├── op_relu.comp │ ├── op_rmsnorm.comp │ ├── op_rope_f16.comp │ ├── op_rope_f32.comp │ ├── op_scale.comp │ ├── op_scale_8.comp │ ├── op_silu.comp │ ├── op_softmax.comp │ └── rope_common.comp │ ├── llamafile │ ├── sgemm.cpp │ └── sgemm.h │ └── vulkan-shaders │ ├── CMakeLists.txt │ ├── acc.comp │ ├── add.comp │ ├── argsort.comp │ ├── clamp.comp │ ├── concat.comp │ ├── copy.comp │ ├── cos.comp │ ├── dequant_f32.comp │ ├── dequant_funcs.comp │ ├── dequant_head.comp │ ├── dequant_iq4_nl.comp │ ├── dequant_q2_k.comp │ ├── dequant_q3_k.comp │ ├── dequant_q4_0.comp │ ├── dequant_q4_1.comp │ ├── dequant_q4_k.comp │ ├── dequant_q5_0.comp │ ├── dequant_q5_1.comp │ ├── dequant_q5_k.comp │ ├── dequant_q6_k.comp │ ├── dequant_q8_0.comp │ ├── diag_mask_inf.comp │ ├── div.comp │ ├── gelu.comp │ ├── gelu_quick.comp │ ├── generic_binary_head.comp │ ├── generic_head.comp │ ├── generic_unary_head.comp │ ├── get_rows.comp │ ├── get_rows_quant.comp │ ├── group_norm.comp │ ├── im2col.comp │ ├── leaky_relu.comp │ ├── mul.comp │ ├── mul_mat_split_k_reduce.comp │ ├── mul_mat_vec.comp │ ├── mul_mat_vec_base.comp │ ├── mul_mat_vec_nc.comp │ ├── mul_mat_vec_p021.comp │ ├── mul_mat_vec_q2_k.comp │ ├── mul_mat_vec_q3_k.comp │ ├── mul_mat_vec_q4_k.comp │ ├── mul_mat_vec_q5_k.comp │ ├── mul_mat_vec_q6_k.comp │ ├── mul_mm.comp │ ├── norm.comp │ ├── pad.comp │ ├── relu.comp │ ├── repeat.comp │ ├── rms_norm.comp │ ├── rope_head.comp │ ├── rope_neox.comp │ ├── rope_norm.comp │ ├── scale.comp │ ├── silu.comp │ ├── sin.comp │ ├── soft_max.comp │ ├── square.comp │ ├── sum_rows.comp │ ├── tanh.comp │ ├── timestep_embedding.comp │ ├── types.comp │ ├── upscale.comp │ └── vulkan-shaders-gen.cpp ├── gguf-py ├── LICENSE ├── README.md ├── examples │ ├── reader.py │ └── writer.py ├── gguf │ ├── __init__.py │ ├── constants.py │ ├── gguf.py │ ├── gguf_reader.py │ ├── gguf_writer.py │ ├── lazy.py │ ├── metadata.py │ ├── py.typed │ ├── quants.py │ ├── tensor_mapping.py │ ├── utility.py │ └── vocab.py ├── pyproject.toml ├── scripts │ ├── __init__.py │ ├── gguf_convert_endian.py │ ├── gguf_dump.py │ ├── gguf_hash.py │ ├── gguf_new_metadata.py │ └── gguf_set_metadata.py └── tests │ ├── __init__.py │ ├── test_metadata.py │ └── test_quants.py ├── grammars ├── README.md ├── arithmetic.gbnf ├── c.gbnf ├── chess.gbnf ├── japanese.gbnf ├── json.gbnf ├── json_arr.gbnf └── list.gbnf ├── include └── llama.h ├── lac.cpp ├── lang-cli-src ├── config.cpp ├── config.h ├── console_manager.cpp ├── console_manager.h ├── file_manager.cpp ├── file_manager.h ├── model_manager.cpp ├── model_manager.h ├── output_parser.cpp ├── output_parser.h ├── shell_executor.cpp ├── shell_executor.h ├── str_parser.cpp └── str_parser.h ├── media 
├── llama-leader.jpeg ├── llama0-banner.png ├── llama0-logo.png ├── llama1-banner.png ├── llama1-logo.png ├── matmul.png └── matmul.svg ├── models ├── .editorconfig ├── ggml-vocab-aquila.gguf ├── ggml-vocab-baichuan.gguf ├── ggml-vocab-bert-bge.gguf ├── ggml-vocab-bert-bge.gguf.inp ├── ggml-vocab-bert-bge.gguf.out ├── ggml-vocab-chameleon.gguf.inp ├── ggml-vocab-chameleon.gguf.out ├── ggml-vocab-command-r.gguf ├── ggml-vocab-command-r.gguf.inp ├── ggml-vocab-command-r.gguf.out ├── ggml-vocab-deepseek-coder.gguf ├── ggml-vocab-deepseek-coder.gguf.inp ├── ggml-vocab-deepseek-coder.gguf.out ├── ggml-vocab-deepseek-llm.gguf ├── ggml-vocab-deepseek-llm.gguf.inp ├── ggml-vocab-deepseek-llm.gguf.out ├── ggml-vocab-falcon.gguf ├── ggml-vocab-falcon.gguf.inp ├── ggml-vocab-falcon.gguf.out ├── ggml-vocab-gpt-2.gguf ├── ggml-vocab-gpt-2.gguf.inp ├── ggml-vocab-gpt-2.gguf.out ├── ggml-vocab-gpt-neox.gguf ├── ggml-vocab-llama-bpe.gguf ├── ggml-vocab-llama-bpe.gguf.inp ├── ggml-vocab-llama-bpe.gguf.out ├── ggml-vocab-llama-spm.gguf ├── ggml-vocab-llama-spm.gguf.inp ├── ggml-vocab-llama-spm.gguf.out ├── ggml-vocab-mpt.gguf ├── ggml-vocab-mpt.gguf.inp ├── ggml-vocab-mpt.gguf.out ├── ggml-vocab-phi-3.gguf ├── ggml-vocab-phi-3.gguf.inp ├── ggml-vocab-phi-3.gguf.out ├── ggml-vocab-qwen2.gguf ├── ggml-vocab-qwen2.gguf.inp ├── ggml-vocab-qwen2.gguf.out ├── ggml-vocab-refact.gguf ├── ggml-vocab-refact.gguf.inp ├── ggml-vocab-refact.gguf.out ├── ggml-vocab-starcoder.gguf ├── ggml-vocab-starcoder.gguf.inp └── ggml-vocab-starcoder.gguf.out ├── mypy.ini ├── output.gif ├── pocs ├── CMakeLists.txt └── vdot │ ├── CMakeLists.txt │ ├── q8dot.cpp │ └── vdot.cpp ├── poetry.lock ├── prompts ├── LLM-questions.txt ├── alpaca.txt ├── assistant.txt ├── chat-with-baichuan.txt ├── chat-with-bob.txt ├── chat-with-qwen.txt ├── chat-with-vicuna-v0.txt ├── chat-with-vicuna-v1.txt ├── chat.txt ├── dan-modified.txt ├── dan.txt ├── mnemonics.txt ├── parallel-questions.txt └── reason-act.txt ├── pyproject.toml ├── pyrightconfig.json ├── requirements.txt ├── requirements ├── requirements-all.txt ├── requirements-compare-llama-bench.txt ├── requirements-convert_hf_to_gguf.txt ├── requirements-convert_hf_to_gguf_update.txt ├── requirements-convert_legacy_llama.txt ├── requirements-convert_llama_ggml_to_gguf.txt ├── requirements-convert_lora_to_gguf.txt ├── requirements-pydantic.txt └── requirements-test-tokenizer-random.txt ├── scripts ├── build-info.sh ├── check-requirements.sh ├── ci-run.sh ├── compare-commits.sh ├── compare-llama-bench.py ├── debug-test.sh ├── gen-authors.sh ├── gen-unicode-data.py ├── get-flags.mk ├── get-hellaswag.sh ├── get-pg.sh ├── get-wikitext-103.sh ├── get-wikitext-2.sh ├── get-winogrande.sh ├── hf.sh ├── install-oneapi.bat ├── pod-llama.sh ├── qnt-all.sh ├── run-all-perf.sh ├── run-all-ppl.sh ├── run-with-preset.py ├── server-llm.sh ├── sync-ggml-am.sh ├── sync-ggml.last ├── sync-ggml.sh ├── verify-checksum-models.py └── xxd.cmake ├── spm-headers ├── ggml-alloc.h ├── ggml-backend.h ├── ggml-metal.h ├── ggml.h └── llama.h ├── src ├── CMakeLists.txt ├── llama-grammar.cpp ├── llama-grammar.h ├── llama-impl.h ├── llama-sampling.cpp ├── llama-sampling.h ├── llama-vocab.cpp ├── llama-vocab.h ├── llama.cpp ├── unicode-data.cpp ├── unicode-data.h ├── unicode.cpp └── unicode.h └── tests ├── .gitignore ├── CMakeLists.txt ├── get-model.cpp ├── get-model.h ├── run-json-schema-to-grammar.mjs ├── test-arg-parser.cpp ├── test-autorelease.cpp ├── test-backend-ops.cpp ├── test-barrier.cpp ├── test-c.c ├── test-chat-template.cpp 
├── test-double-float.cpp ├── test-grad0.cpp ├── test-grammar-integration.cpp ├── test-grammar-parser.cpp ├── test-json-schema-to-grammar.cpp ├── test-llama-grammar.cpp ├── test-log.cpp ├── test-lora-conversion-inference.sh ├── test-model-load-cancel.cpp ├── test-opt.cpp ├── test-quantize-fns.cpp ├── test-quantize-perf.cpp ├── test-rope.cpp ├── test-sampling.cpp ├── test-tokenizer-0.cpp ├── test-tokenizer-0.py ├── test-tokenizer-0.sh ├── test-tokenizer-1-bpe.cpp ├── test-tokenizer-1-spm.cpp └── test-tokenizer-random.py /.clang-tidy: -------------------------------------------------------------------------------- 1 | --- 2 | Checks: > 3 | bugprone-*, 4 | -bugprone-easily-swappable-parameters, 5 | -bugprone-implicit-widening-of-multiplication-result, 6 | -bugprone-misplaced-widening-cast, 7 | -bugprone-narrowing-conversions, 8 | readability-*, 9 | -readability-avoid-unconditional-preprocessor-if, 10 | -readability-function-cognitive-complexity, 11 | -readability-identifier-length, 12 | -readability-implicit-bool-conversion, 13 | -readability-magic-numbers, 14 | -readability-uppercase-literal-suffix, 15 | -readability-simplify-boolean-expr, 16 | clang-analyzer-*, 17 | -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling, 18 | performance-*, 19 | portability-*, 20 | misc-*, 21 | -misc-const-correctness, 22 | -misc-non-private-member-variables-in-classes, 23 | -misc-no-recursion, 24 | FormatStyle: none 25 | -------------------------------------------------------------------------------- /.devops/cloud-v-pipeline: -------------------------------------------------------------------------------- 1 | node('x86_runner1'){ // Running on x86 runner containing latest vector qemu, latest vector gcc and all the necessary libraries 2 | stage('Cleanup'){ 3 | cleanWs() // Cleaning previous CI build in workspace 4 | } 5 | stage('checkout repo'){ 6 | retry(5){ // Retry if the cloning fails due to some reason 7 | checkout scm // Clone the repo on Runner 8 | } 9 | } 10 | stage('Compiling llama.cpp'){ 11 | sh'''#!/bin/bash 12 | make RISCV=1 RISCV_CROSS_COMPILE=1 # Compiling llama for RISC-V 13 | ''' 14 | } 15 | stage('Running llama.cpp'){ 16 | sh'''#!/bin/bash 17 | module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc 18 | qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64 19 | cat llama_log.txt # Printing results 20 | ''' 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /.devops/full-cuda.Dockerfile: -------------------------------------------------------------------------------- 1 | ARG UBUNTU_VERSION=22.04 2 | # This needs to generally match the container host's environment. 3 | ARG CUDA_VERSION=12.6.0 4 | # Target the CUDA build image 5 | ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} 6 | 7 | FROM ${BASE_CUDA_DEV_CONTAINER} AS build 8 | 9 | # CUDA architecture to build for (defaults to all supported archs) 10 | ARG CUDA_DOCKER_ARCH=default 11 | 12 | RUN apt-get update && \ 13 | apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1 14 | 15 | COPY requirements.txt requirements.txt 16 | COPY requirements requirements 17 | 18 | RUN pip install --upgrade pip setuptools wheel \ 19 | && pip install -r requirements.txt 20 | 21 | WORKDIR /app 22 | 23 | COPY . . 
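# As a usage sketch (the image tag, arch value, and model path below are illustrative
# assumptions, not part of this file), this image can be built and run roughly like:
#   docker build -t local/llama.cpp:full-cuda --build-arg CUDA_DOCKER_ARCH=86 -f .devops/full-cuda.Dockerfile .
#   docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/model.gguf -p "Hello" -n 64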
24 | 25 | # Use the default CUDA archs if not specified 26 | RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \ 27 | export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \ 28 | fi && \ 29 | cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ 30 | cmake --build build --config Release -j$(nproc) && \ 31 | cp build/bin/* . 32 | 33 | ENTRYPOINT ["/app/.devops/tools.sh"] 34 | -------------------------------------------------------------------------------- /.devops/full-rocm.Dockerfile: -------------------------------------------------------------------------------- 1 | ARG UBUNTU_VERSION=22.04 2 | 3 | # This needs to generally match the container host's environment. 4 | ARG ROCM_VERSION=5.6 5 | 6 | # Target the CUDA build image 7 | ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete 8 | 9 | FROM ${BASE_ROCM_DEV_CONTAINER} AS build 10 | 11 | # Unless otherwise specified, we make a fat build. 12 | # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 13 | # This is mostly tied to rocBLAS supported archs. 14 | ARG ROCM_DOCKER_ARCH="\ 15 | gfx803 \ 16 | gfx900 \ 17 | gfx906 \ 18 | gfx908 \ 19 | gfx90a \ 20 | gfx1010 \ 21 | gfx1030 \ 22 | gfx1100 \ 23 | gfx1101 \ 24 | gfx1102" 25 | 26 | COPY requirements.txt requirements.txt 27 | COPY requirements requirements 28 | 29 | RUN pip install --upgrade pip setuptools wheel \ 30 | && pip install -r requirements.txt 31 | 32 | WORKDIR /app 33 | 34 | COPY . . 35 | 36 | # Set nvcc architecture 37 | ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH} 38 | # Enable ROCm 39 | ENV GGML_HIPBLAS=1 40 | ENV CC=/opt/rocm/llvm/bin/clang 41 | ENV CXX=/opt/rocm/llvm/bin/clang++ 42 | 43 | # Enable cURL 44 | ENV LLAMA_CURL=1 45 | RUN apt-get update && \ 46 | apt-get install -y libcurl4-openssl-dev 47 | 48 | RUN make -j$(nproc) 49 | 50 | ENTRYPOINT ["/app/.devops/tools.sh"] 51 | -------------------------------------------------------------------------------- /.devops/full.Dockerfile: -------------------------------------------------------------------------------- 1 | ARG UBUNTU_VERSION=22.04 2 | 3 | FROM ubuntu:$UBUNTU_VERSION AS build 4 | 5 | RUN apt-get update && \ 6 | apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1 7 | 8 | COPY requirements.txt requirements.txt 9 | COPY requirements requirements 10 | 11 | RUN pip install --upgrade pip setuptools wheel \ 12 | && pip install -r requirements.txt 13 | 14 | WORKDIR /app 15 | 16 | COPY . . 17 | 18 | ENV LLAMA_CURL=1 19 | 20 | 21 | RUN make -j$(nproc) 22 | 23 | ENV LC_ALL=C.utf8 24 | 25 | ENTRYPOINT ["/app/.devops/tools.sh"] 26 | -------------------------------------------------------------------------------- /.devops/llama-cli-cuda.Dockerfile: -------------------------------------------------------------------------------- 1 | ARG UBUNTU_VERSION=22.04 2 | # This needs to generally match the container host's environment. 
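# It can be overridden at build time without editing this file, e.g. (illustrative value):
#   docker build --build-arg CUDA_VERSION=12.4.0 -f .devops/llama-cli-cuda.Dockerfile .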
3 | ARG CUDA_VERSION=12.6.0 4 | # Target the CUDA build image 5 | ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} 6 | # Target the CUDA runtime image 7 | ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} 8 | 9 | FROM ${BASE_CUDA_DEV_CONTAINER} AS build 10 | 11 | # CUDA architecture to build for (defaults to all supported archs) 12 | ARG CUDA_DOCKER_ARCH=default 13 | 14 | RUN apt-get update && \ 15 | apt-get install -y build-essential git cmake 16 | 17 | WORKDIR /app 18 | 19 | COPY . . 20 | 21 | # Use the default CUDA archs if not specified 22 | RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \ 23 | export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \ 24 | fi && \ 25 | cmake -B build -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ 26 | cmake --build build --config Release --target llama-cli -j$(nproc) 27 | 28 | FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime 29 | 30 | RUN apt-get update && \ 31 | apt-get install -y libgomp1 32 | 33 | COPY --from=build /app/build/ggml/src/libggml.so /libggml.so 34 | COPY --from=build /app/build/src/libllama.so /libllama.so 35 | COPY --from=build /app/build/bin/llama-cli /llama-cli 36 | 37 | ENTRYPOINT [ "/llama-cli" ] 38 | -------------------------------------------------------------------------------- /.devops/llama-cli-intel.Dockerfile: -------------------------------------------------------------------------------- 1 | ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04 2 | 3 | FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build 4 | 5 | ARG GGML_SYCL_F16=OFF 6 | RUN apt-get update && \ 7 | apt-get install -y git 8 | 9 | WORKDIR /app 10 | 11 | COPY . . 12 | 13 | RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \ 14 | echo "GGML_SYCL_F16 is set" && \ 15 | export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \ 16 | fi && \ 17 | echo "Building with static libs" && \ 18 | cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \ 19 | ${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \ 20 | cmake --build build --config Release --target llama-cli 21 | 22 | FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime 23 | 24 | COPY --from=build /app/build/bin/llama-cli /llama-cli 25 | 26 | ENV LC_ALL=C.utf8 27 | 28 | ENTRYPOINT [ "/llama-cli" ] 29 | -------------------------------------------------------------------------------- /.devops/llama-cli-rocm.Dockerfile: -------------------------------------------------------------------------------- 1 | ARG UBUNTU_VERSION=22.04 2 | 3 | # This needs to generally match the container host's environment. 4 | ARG ROCM_VERSION=5.6 5 | 6 | # Target the CUDA build image 7 | ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete 8 | 9 | FROM ${BASE_ROCM_DEV_CONTAINER} AS build 10 | 11 | # Unless otherwise specified, we make a fat build. 12 | # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 13 | # This is mostly tied to rocBLAS supported archs. 14 | ARG ROCM_DOCKER_ARCH="\ 15 | gfx803 \ 16 | gfx900 \ 17 | gfx906 \ 18 | gfx908 \ 19 | gfx90a \ 20 | gfx1010 \ 21 | gfx1030 \ 22 | gfx1100 \ 23 | gfx1101 \ 24 | gfx1102" 25 | 26 | COPY requirements.txt requirements.txt 27 | COPY requirements requirements 28 | 29 | RUN pip install --upgrade pip setuptools wheel \ 30 | && pip install -r requirements.txt 31 | 32 | WORKDIR /app 33 | 34 | COPY . . 
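# To target a single GPU instead of the fat arch list above, the build arg can be
# narrowed at build time, e.g. (illustrative value):
#   docker build --build-arg ROCM_DOCKER_ARCH=gfx1030 -f .devops/llama-cli-rocm.Dockerfile .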
35 | 36 | # Set nvcc architecture 37 | ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH} 38 | # Enable ROCm 39 | ENV GGML_HIPBLAS=1 40 | ENV CC=/opt/rocm/llvm/bin/clang 41 | ENV CXX=/opt/rocm/llvm/bin/clang++ 42 | 43 | RUN make -j$(nproc) llama-cli 44 | 45 | ENTRYPOINT [ "/app/llama-cli" ] 46 | -------------------------------------------------------------------------------- /.devops/llama-cli-vulkan.Dockerfile: -------------------------------------------------------------------------------- 1 | ARG UBUNTU_VERSION=jammy 2 | 3 | FROM ubuntu:$UBUNTU_VERSION AS build 4 | 5 | # Install build tools 6 | RUN apt update && apt install -y git build-essential cmake wget libgomp1 7 | 8 | # Install Vulkan SDK 9 | RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \ 10 | wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \ 11 | apt update -y && \ 12 | apt-get install -y vulkan-sdk 13 | 14 | # Build it 15 | WORKDIR /app 16 | COPY . . 17 | RUN cmake -B build -DGGML_VULKAN=1 && \ 18 | cmake --build build --config Release --target llama-cli 19 | 20 | # Clean up 21 | WORKDIR / 22 | RUN cp /app/build/bin/llama-cli /llama-cli && \ 23 | rm -rf /app 24 | 25 | ENV LC_ALL=C.utf8 26 | 27 | ENTRYPOINT [ "/llama-cli" ] 28 | -------------------------------------------------------------------------------- /.devops/llama-cli.Dockerfile: -------------------------------------------------------------------------------- 1 | ARG UBUNTU_VERSION=22.04 2 | 3 | FROM ubuntu:$UBUNTU_VERSION AS build 4 | 5 | RUN apt-get update && \ 6 | apt-get install -y build-essential git 7 | 8 | WORKDIR /app 9 | 10 | COPY . . 11 | 12 | RUN make -j$(nproc) llama-cli 13 | 14 | FROM ubuntu:$UBUNTU_VERSION AS runtime 15 | 16 | RUN apt-get update && \ 17 | apt-get install -y libgomp1 18 | 19 | COPY --from=build /app/llama-cli /llama-cli 20 | 21 | ENV LC_ALL=C.utf8 22 | 23 | ENTRYPOINT [ "/llama-cli" ] 24 | -------------------------------------------------------------------------------- /.devops/llama-server-intel.Dockerfile: -------------------------------------------------------------------------------- 1 | ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04 2 | 3 | FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build 4 | 5 | ARG GGML_SYCL_F16=OFF 6 | RUN apt-get update && \ 7 | apt-get install -y git libcurl4-openssl-dev 8 | 9 | WORKDIR /app 10 | 11 | COPY . . 
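# FP16 SYCL kernels are opt-in via the GGML_SYCL_F16 build arg declared above,
# e.g. (illustrative): docker build --build-arg GGML_SYCL_F16=ON -f .devops/llama-server-intel.Dockerfile .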
12 | 13 | RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \ 14 | echo "GGML_SYCL_F16 is set" && \ 15 | export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \ 16 | fi && \ 17 | echo "Building with dynamic libs" && \ 18 | cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \ 19 | cmake --build build --config Release --target llama-server 20 | 21 | FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime 22 | 23 | RUN apt-get update && \ 24 | apt-get install -y libcurl4-openssl-dev curl 25 | 26 | COPY --from=build /app/build/bin/llama-server /llama-server 27 | 28 | ENV LC_ALL=C.utf8 29 | # Must be set to 0.0.0.0 so it can listen to requests from host machine 30 | ENV LLAMA_ARG_HOST=0.0.0.0 31 | 32 | HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] 33 | 34 | ENTRYPOINT [ "/llama-server" ] 35 | -------------------------------------------------------------------------------- /.devops/llama-server-vulkan.Dockerfile: -------------------------------------------------------------------------------- 1 | ARG UBUNTU_VERSION=jammy 2 | 3 | FROM ubuntu:$UBUNTU_VERSION AS build 4 | 5 | # Install build tools 6 | RUN apt update && apt install -y git build-essential cmake wget 7 | 8 | # Install Vulkan SDK and cURL 9 | RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \ 10 | wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \ 11 | apt update -y && \ 12 | apt-get install -y vulkan-sdk libcurl4-openssl-dev curl 13 | 14 | # Build it 15 | WORKDIR /app 16 | COPY . . 17 | RUN cmake -B build -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \ 18 | cmake --build build --config Release --target llama-server 19 | 20 | # Clean up 21 | WORKDIR / 22 | RUN cp /app/build/bin/llama-server /llama-server && \ 23 | rm -rf /app 24 | 25 | ENV LC_ALL=C.utf8 26 | # Must be set to 0.0.0.0 so it can listen to requests from host machine 27 | ENV LLAMA_ARG_HOST=0.0.0.0 28 | 29 | HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] 30 | 31 | ENTRYPOINT [ "/llama-server" ] 32 | -------------------------------------------------------------------------------- /.devops/llama-server.Dockerfile: -------------------------------------------------------------------------------- 1 | ARG UBUNTU_VERSION=22.04 2 | 3 | FROM ubuntu:$UBUNTU_VERSION AS build 4 | 5 | RUN apt-get update && \ 6 | apt-get install -y build-essential git libcurl4-openssl-dev 7 | 8 | WORKDIR /app 9 | 10 | COPY . . 11 | 12 | ENV LLAMA_CURL=1 13 | 14 | RUN make -j$(nproc) llama-server 15 | 16 | FROM ubuntu:$UBUNTU_VERSION AS runtime 17 | 18 | RUN apt-get update && \ 19 | apt-get install -y libcurl4-openssl-dev libgomp1 curl 20 | 21 | COPY --from=build /app/llama-server /llama-server 22 | 23 | ENV LC_ALL=C.utf8 24 | # Must be set to 0.0.0.0 so it can listen to requests from host machine 25 | ENV LLAMA_ARG_HOST=0.0.0.0 26 | 27 | HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] 28 | 29 | ENTRYPOINT [ "/llama-server" ] 30 | -------------------------------------------------------------------------------- /.devops/nix/apps.nix: -------------------------------------------------------------------------------- 1 | { 2 | perSystem = 3 | { config, lib, ... 
}: 4 | { 5 | apps = 6 | let 7 | inherit (config.packages) default; 8 | binaries = [ 9 | "llama-cli" 10 | "llama-embedding" 11 | "llama-server" 12 | "llama-quantize" 13 | ]; 14 | mkApp = name: { 15 | type = "app"; 16 | program = "${default}/bin/${name}"; 17 | }; 18 | in 19 | lib.genAttrs binaries mkApp; 20 | }; 21 | } 22 | -------------------------------------------------------------------------------- /.devops/nix/docker.nix: -------------------------------------------------------------------------------- 1 | { 2 | lib, 3 | dockerTools, 4 | buildEnv, 5 | llama-cpp, 6 | interactive ? true, 7 | coreutils, 8 | }: 9 | 10 | # A tar that can be fed into `docker load`: 11 | # 12 | # $ nix build .#llamaPackages.docker 13 | # $ docker load < result 14 | 15 | # For details and variations cf. 16 | # - https://nixos.org/manual/nixpkgs/unstable/#ssec-pkgs-dockerTools-buildLayeredImage 17 | # - https://discourse.nixos.org/t/a-faster-dockertools-buildimage-prototype/16922 18 | # - https://nixery.dev/ 19 | 20 | # Approximate (compressed) sizes, at the time of writing, are: 21 | # 22 | # .#llamaPackages.docker: 125M; 23 | # .#llamaPackagesCuda.docker: 537M; 24 | # .#legacyPackages.aarch64-linux.llamaPackagesXavier.docker: 415M. 25 | 26 | dockerTools.buildLayeredImage { 27 | name = llama-cpp.pname; 28 | tag = "latest"; 29 | 30 | contents = 31 | [ llama-cpp ] 32 | ++ lib.optionals interactive [ 33 | coreutils 34 | dockerTools.binSh 35 | dockerTools.caCertificates 36 | ]; 37 | } 38 | -------------------------------------------------------------------------------- /.devops/nix/jetson-support.nix: -------------------------------------------------------------------------------- 1 | { inputs, ... }: 2 | { 3 | perSystem = 4 | { 5 | config, 6 | system, 7 | lib, 8 | pkgsCuda, 9 | ... 
10 | }: 11 | { 12 | legacyPackages = 13 | let 14 | caps.llamaPackagesXavier = "7.2"; 15 | caps.llamaPackagesOrin = "8.7"; 16 | caps.llamaPackagesTX2 = "6.2"; 17 | caps.llamaPackagesNano = "5.3"; 18 | 19 | pkgsFor = 20 | cap: 21 | import inputs.nixpkgs { 22 | inherit system; 23 | config = { 24 | cudaSupport = true; 25 | cudaCapabilities = [ cap ]; 26 | cudaEnableForwardCompat = false; 27 | inherit (pkgsCuda.config) allowUnfreePredicate; 28 | }; 29 | }; 30 | in 31 | builtins.mapAttrs (name: cap: (pkgsFor cap).callPackage ./scope.nix { }) caps; 32 | 33 | packages = lib.optionalAttrs (system == "aarch64-linux") { 34 | jetson-xavier = config.legacyPackages.llamaPackagesXavier.llama-cpp; 35 | jetson-orin = config.legacyPackages.llamaPackagesOrin.llama-cpp; 36 | jetson-nano = config.legacyPackages.llamaPackagesNano.llama-cpp; 37 | }; 38 | }; 39 | } 40 | -------------------------------------------------------------------------------- /.devops/nix/package-gguf-py.nix: -------------------------------------------------------------------------------- 1 | { 2 | lib, 3 | llamaVersion, 4 | numpy, 5 | tqdm, 6 | sentencepiece, 7 | pyyaml, 8 | poetry-core, 9 | buildPythonPackage, 10 | pytestCheckHook, 11 | }: 12 | 13 | buildPythonPackage { 14 | pname = "gguf"; 15 | version = llamaVersion; 16 | pyproject = true; 17 | nativeBuildInputs = [ poetry-core ]; 18 | propagatedBuildInputs = [ 19 | numpy 20 | tqdm 21 | sentencepiece 22 | pyyaml 23 | ]; 24 | src = lib.cleanSource ../../gguf-py; 25 | pythonImportsCheck = [ 26 | "numpy" 27 | "gguf" 28 | ]; 29 | nativeCheckInputs = [ pytestCheckHook ]; 30 | doCheck = true; 31 | meta = with lib; { 32 | description = "Python package for writing binary files in the GGUF format"; 33 | license = licenses.mit; 34 | maintainers = [ maintainers.ditsuke ]; 35 | }; 36 | } 37 | -------------------------------------------------------------------------------- /.devops/nix/scope.nix: -------------------------------------------------------------------------------- 1 | { 2 | lib, 3 | newScope, 4 | python3, 5 | llamaVersion ? "0.0.0", 6 | }: 7 | 8 | let 9 | pythonPackages = python3.pkgs; 10 | buildPythonPackage = pythonPackages.buildPythonPackage; 11 | numpy = pythonPackages.numpy; 12 | tqdm = pythonPackages.tqdm; 13 | sentencepiece = pythonPackages.sentencepiece; 14 | pyyaml = pythonPackages.pyyaml; 15 | poetry-core = pythonPackages.poetry-core; 16 | pytestCheckHook = pythonPackages.pytestCheckHook; 17 | in 18 | 19 | # We're using `makeScope` instead of just writing out an attrset 20 | # because it allows users to apply overlays later using `overrideScope'`. 21 | # Cf. https://noogle.dev/f/lib/makeScope 22 | 23 | lib.makeScope newScope (self: { 24 | inherit llamaVersion; 25 | gguf-py = self.callPackage ./package-gguf-py.nix { 26 | inherit 27 | buildPythonPackage 28 | numpy 29 | tqdm 30 | sentencepiece 31 | poetry-core 32 | pyyaml 33 | pytestCheckHook 34 | ; 35 | }; 36 | python-scripts = self.callPackage ./python-scripts.nix { inherit buildPythonPackage poetry-core; }; 37 | llama-cpp = self.callPackage ./package.nix { }; 38 | docker = self.callPackage ./docker.nix { }; 39 | docker-min = self.callPackage ./docker.nix { interactive = false; }; 40 | sif = self.callPackage ./sif.nix { }; 41 | }) 42 | -------------------------------------------------------------------------------- /.devops/nix/sif.nix: -------------------------------------------------------------------------------- 1 | { 2 | lib, 3 | singularity-tools, 4 | llama-cpp, 5 | bashInteractive, 6 | interactive ? 
false, 7 | }: 8 | 9 | let 10 | optionalInt = cond: x: if cond then x else 0; 11 | in 12 | singularity-tools.buildImage rec { 13 | inherit (llama-cpp) name; 14 | contents = [ llama-cpp ] ++ lib.optionals interactive [ bashInteractive ]; 15 | 16 | # These are excessive (but safe) for most variants. Building singularity 17 | # images requires superuser privileges, so we build them inside a VM in a 18 | # writable image of pre-determined size. 19 | # 20 | # ROCm is currently affected by https://github.com/NixOS/nixpkgs/issues/276846 21 | # 22 | # Expected image sizes: 23 | # - cpu/blas: 150M, 24 | # - cuda, all gencodes: 560M, 25 | diskSize = 4096 + optionalInt llama-cpp.useRocm 16384; 26 | memSize = diskSize; 27 | } 28 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.a 3 | .cache/ 4 | # Do not ignore .git directory, otherwise the reported build number will always be 0 5 | .github/ 6 | .gitignore 7 | .vs/ 8 | .vscode/ 9 | .DS_Store 10 | 11 | build*/ 12 | 13 | models/* 14 | 15 | /llama-cli 16 | /llama-quantize 17 | 18 | arm_neon.h 19 | compile_commands.json 20 | Dockerfile 21 | -------------------------------------------------------------------------------- /.ecrc: -------------------------------------------------------------------------------- 1 | { 2 | "Exclude": ["^\\.gitmodules$", "stb_image\\.h"], 3 | "Disable": { 4 | "IndentSize": true 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # https://EditorConfig.org 2 | 3 | # Top-most EditorConfig file 4 | root = true 5 | 6 | # Unix-style newlines with a newline ending every file, utf-8 charset 7 | [*] 8 | end_of_line = lf 9 | insert_final_newline = true 10 | trim_trailing_whitespace = true 11 | charset = utf-8 12 | indent_style = space 13 | indent_size = 4 14 | 15 | [Makefile] 16 | indent_style = tab 17 | 18 | [scripts/*.mk] 19 | indent_style = tab 20 | 21 | [prompts/*.txt] 22 | insert_final_newline = unset 23 | 24 | [examples/server/public/*] 25 | indent_size = 2 26 | 27 | [examples/llama.swiftui/llama.swiftui.xcodeproj/*] 28 | indent_style = tab 29 | 30 | [examples/cvector-generator/*.txt] 31 | trim_trailing_whitespace = unset 32 | insert_final_newline = unset 33 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 125 3 | ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503 4 | exclude = 5 | # Do not traverse examples 6 | examples, 7 | # Do not include package initializers 8 | __init__.py, 9 | # No need to traverse our git directory 10 | .git, 11 | # There's no value in checking cache directories 12 | __pycache__, 13 | # No need to include the build path 14 | build, 15 | # This contains builds that we don't want to check 16 | dist # This is generated with `python build .` for package releases 17 | # max-complexity = 10 18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/07-refactor.yml: -------------------------------------------------------------------------------- 1 | name: Refactor (Maintainers) 2 | description: Used to track refactoring opportunities 3 | title: "Refactor: " 4 | labels: ["refactor"] 5 | body: 6 | - 
type: markdown 7 | attributes: 8 | value: | 9 | Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered. 10 | Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too. 11 | 12 | - type: textarea 13 | id: background-description 14 | attributes: 15 | label: Background Description 16 | description: Please provide a detailed written description of the pain points you are trying to solve. 17 | placeholder: Detailed description behind your motivation to request refactor 18 | validations: 19 | required: true 20 | 21 | - type: textarea 22 | id: possible-approaches 23 | attributes: 24 | label: Possible Refactor Approaches 25 | description: If you have some idea of possible approaches to solve this problem. You may want to make it a todo list. 26 | placeholder: Your idea of possible refactoring opportunity/approaches 27 | validations: 28 | required: false 29 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: true 2 | contact_links: 3 | - name: Got an idea? 4 | url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas 5 | about: Pop it there. It may then become an enhancement ticket. 6 | - name: Got a question? 7 | url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a 8 | about: Ask a question there! 9 | - name: Want to contribute? 10 | url: https://github.com/ggerganov/llama.cpp/wiki/contribute 11 | about: Head to the contribution guide page of the wiki for areas you can help with 12 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | - [x] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md) 4 | - Self-reported review complexity: 5 | - [ ] Low 6 | - [ ] Medium 7 | - [ ] High 8 | -------------------------------------------------------------------------------- /.github/workflows/close-issue.yml: -------------------------------------------------------------------------------- 1 | name: Close inactive issues 2 | on: 3 | schedule: 4 | - cron: "42 0 * * *" 5 | 6 | # Fine-grant permission 7 | # https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token 8 | permissions: 9 | issues: write 10 | 11 | jobs: 12 | close-issues: 13 | runs-on: ubuntu-latest 14 | permissions: 15 | issues: write 16 | pull-requests: write 17 | steps: 18 | - uses: actions/stale@v5 19 | with: 20 | exempt-issue-labels: "refactor,help wanted,good first issue,research,bug" 21 | days-before-issue-stale: 30 22 | days-before-issue-close: 14 23 | stale-issue-label: "stale" 24 | close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale." 
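# Per the actions/stale documentation, the -1 values below opt pull requests out of
# stale handling entirely, so only issues are marked stale and auto-closed by this workflow.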
25 | days-before-pr-stale: -1 26 | days-before-pr-close: -1 27 | operations-per-run: 10000 28 | repo-token: ${{ secrets.GITHUB_TOKEN }} 29 | -------------------------------------------------------------------------------- /.github/workflows/editorconfig.yml: -------------------------------------------------------------------------------- 1 | name: EditorConfig Checker 2 | 3 | on: 4 | workflow_dispatch: # allows manual triggering 5 | inputs: 6 | create_release: 7 | description: 'Create new release' 8 | required: true 9 | type: boolean 10 | push: 11 | branches: 12 | - master 13 | pull_request: 14 | branches: 15 | - master 16 | 17 | concurrency: 18 | group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} 19 | cancel-in-progress: true 20 | 21 | jobs: 22 | editorconfig: 23 | runs-on: ubuntu-latest 24 | steps: 25 | - uses: actions/checkout@v4 26 | - uses: editorconfig-checker/action-editorconfig-checker@main 27 | - run: editorconfig-checker 28 | -------------------------------------------------------------------------------- /.github/workflows/gguf-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a GGUF release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | # See `gguf-py/README.md` for how to make a release. 5 | 6 | # This workflow uses actions that are not certified by GitHub. 7 | # They are provided by a third-party and are governed by 8 | # separate terms of service, privacy policy, and support 9 | # documentation. 10 | 11 | name: Upload Python Package 12 | 13 | on: 14 | workflow_dispatch: 15 | push: 16 | # Pattern matched against refs/tags 17 | tags: 18 | - 'gguf-v*' # Push events to every version tag 19 | 20 | 21 | jobs: 22 | deploy: 23 | 24 | runs-on: ubuntu-latest 25 | 26 | steps: 27 | - uses: actions/checkout@v4 28 | - name: Set up Python 29 | uses: actions/setup-python@v5 30 | with: 31 | python-version: '3.9.x' 32 | - name: Install dependencies 33 | run: | 34 | cd gguf-py 35 | python -m pip install poetry 36 | poetry install 37 | 38 | - name: Build package 39 | run: cd gguf-py && poetry build 40 | - name: Publish package 41 | uses: pypa/gh-action-pypi-publish@release/v1 42 | with: 43 | password: ${{ secrets.PYPI_API_TOKEN }} 44 | packages-dir: gguf-py/dist 45 | -------------------------------------------------------------------------------- /.github/workflows/labeler.yml: -------------------------------------------------------------------------------- 1 | name: "Pull Request Labeler" 2 | on: 3 | - pull_request_target 4 | 5 | jobs: 6 | labeler: 7 | permissions: 8 | contents: read 9 | pull-requests: write 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | with: 14 | repository: "ggerganov/llama.cpp" 15 | - uses: actions/labeler@v5 16 | with: 17 | configuration-path: '.github/labeler.yml' 18 | -------------------------------------------------------------------------------- /.github/workflows/nix-flake-update.yml: -------------------------------------------------------------------------------- 1 | name: update-flake-lock 2 | on: 3 | workflow_dispatch: 4 | schedule: 5 | - cron: '0 0 * * 0' # runs weekly on Sunday at 00:00 6 | 7 | jobs: 8 | lockfile: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Checkout repository 12 | uses: actions/checkout@v4 13 | - name: Install Nix 14 | uses: 
DeterminateSystems/nix-installer-action@main 15 | - name: Update flake.lock 16 | uses: DeterminateSystems/update-flake-lock@main 17 | with: 18 | pr-title: "nix: update flake.lock" 19 | pr-labels: | 20 | nix 21 | pr-reviewers: philiptaron,SomeoneSerge 22 | token: ${{ secrets.FLAKE_TOKEN }} 23 | -------------------------------------------------------------------------------- /.github/workflows/nix-publish-flake.yml: -------------------------------------------------------------------------------- 1 | # Make the flake discoverable on https://flakestry.dev and https://flakehub.com/flakes 2 | name: "Publish a flake to flakestry & flakehub" 3 | on: 4 | push: 5 | tags: 6 | - "*" 7 | workflow_dispatch: 8 | inputs: 9 | tag: 10 | description: "The existing tag to publish" 11 | type: "string" 12 | required: true 13 | jobs: 14 | flakestry-publish: 15 | runs-on: ubuntu-latest 16 | permissions: 17 | id-token: "write" 18 | contents: "read" 19 | steps: 20 | - uses: flakestry/flakestry-publish@main 21 | with: 22 | version: "${{ inputs.tag || github.ref_name }}" 23 | flakehub-publish: 24 | runs-on: "ubuntu-latest" 25 | permissions: 26 | id-token: "write" 27 | contents: "read" 28 | steps: 29 | - uses: "actions/checkout@v4" 30 | with: 31 | ref: "${{ (inputs.tag != null) && format('refs/tags/{0}', inputs.tag) || '' }}" 32 | - uses: "DeterminateSystems/nix-installer-action@main" 33 | - uses: "DeterminateSystems/flakehub-push@main" 34 | with: 35 | visibility: "public" 36 | tag: "${{ inputs.tag }}" 37 | -------------------------------------------------------------------------------- /.github/workflows/python-check-requirements.yml: -------------------------------------------------------------------------------- 1 | name: Python check requirements.txt 2 | 3 | on: 4 | push: 5 | paths: 6 | - '.github/workflows/python-check-requirements.yml' 7 | - 'scripts/check-requirements.sh' 8 | - 'convert*.py' 9 | - '**/requirements*.txt' 10 | pull_request: 11 | paths: 12 | - '.github/workflows/python-check-requirements.yml' 13 | - 'scripts/check-requirements.sh' 14 | - 'convert*.py' 15 | - '**/requirements*.txt' 16 | 17 | concurrency: 18 | group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} 19 | cancel-in-progress: true 20 | 21 | jobs: 22 | python-check-requirements: 23 | runs-on: ubuntu-latest 24 | name: check-requirements 25 | steps: 26 | - name: Check out source repository 27 | uses: actions/checkout@v4 28 | - name: Set up Python environment 29 | uses: actions/setup-python@v5 30 | with: 31 | python-version: "3.11" 32 | - name: Run check-requirements.sh script 33 | run: bash scripts/check-requirements.sh 34 | -------------------------------------------------------------------------------- /.github/workflows/python-lint.yml: -------------------------------------------------------------------------------- 1 | name: flake8 Lint 2 | 3 | on: [push, pull_request] 4 | 5 | concurrency: 6 | group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} 7 | cancel-in-progress: true 8 | 9 | jobs: 10 | flake8-lint: 11 | runs-on: ubuntu-latest 12 | name: Lint 13 | steps: 14 | - name: Check out source repository 15 | uses: actions/checkout@v4 16 | - name: Set up Python environment 17 | uses: actions/setup-python@v5 18 | with: 19 | python-version: "3.11" 20 | - name: flake8 Lint 21 | uses: py-actions/flake8@v2 22 | with: 23 | plugins: "flake8-no-print" 24 | -------------------------------------------------------------------------------- /.github/workflows/python-type-check.yml: 
-------------------------------------------------------------------------------- 1 | name: Python Type-Check 2 | 3 | on: 4 | push: 5 | paths: 6 | - '.github/workflows/python-type-check.yml' 7 | - 'pyrightconfig.json' 8 | - '**.py' 9 | - '**/requirements*.txt' 10 | pull_request: 11 | paths: 12 | - '.github/workflows/python-type-check.yml' 13 | - 'pyrightconfig.json' 14 | - '**.py' 15 | - '**/requirements*.txt' 16 | 17 | concurrency: 18 | group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} 19 | cancel-in-progress: true 20 | 21 | jobs: 22 | python-type-check: 23 | runs-on: ubuntu-latest 24 | name: pyright type-check 25 | steps: 26 | - name: Check out source repository 27 | uses: actions/checkout@v4 28 | - name: Set up Python environment 29 | uses: actions/setup-python@v5 30 | with: 31 | python-version: "3.11" 32 | - name: Install Python dependencies 33 | # TODO: use a venv 34 | run: pip install -r requirements/requirements-all.txt 35 | - name: Type-check with Pyright 36 | uses: jakebailey/pyright-action@v2 37 | with: 38 | version: 1.1.382 39 | level: warning 40 | warnings: true 41 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "kompute"] 2 | path = ggml/src/kompute 3 | url = https://github.com/nomic-ai/kompute.git 4 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | exclude: prompts/.*.txt 4 | repos: 5 | - repo: https://github.com/pre-commit/pre-commit-hooks 6 | rev: v4.6.0 7 | hooks: 8 | - id: trailing-whitespace 9 | - id: end-of-file-fixer 10 | - id: check-yaml 11 | - id: check-added-large-files 12 | - repo: https://github.com/PyCQA/flake8 13 | rev: 7.0.0 14 | hooks: 15 | - id: flake8 16 | additional_dependencies: [flake8-no-print] 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023-2024 The ggml authors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /ci/README.md: -------------------------------------------------------------------------------- 1 | # CI 2 | 3 | In addition to [Github Actions](https://github.com/ggerganov/llama.cpp/actions) `llama.cpp` uses a custom CI framework: 4 | 5 | https://github.com/ggml-org/ci 6 | 7 | It monitors the `master` branch for new commits and runs the 8 | [ci/run.sh](https://github.com/ggerganov/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us 9 | to execute heavier workloads compared to just using Github Actions. Also with time, the cloud instances will be scaled 10 | to cover various hardware architectures, including GPU and Apple Silicon instances. 11 | 12 | Collaborators can optionally trigger the CI run by adding the `ggml-ci` keyword to their commit message. 13 | Only the branches of this repo are monitored for this keyword. 14 | 15 | It is a good practice, before publishing changes to execute the full CI locally on your machine: 16 | 17 | ```bash 18 | mkdir tmp 19 | 20 | # CPU-only build 21 | bash ./ci/run.sh ./tmp/results ./tmp/mnt 22 | 23 | # with CUDA support 24 | GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt 25 | 26 | # with SYCL support 27 | source /opt/intel/oneapi/setvars.sh 28 | GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt 29 | ``` 30 | -------------------------------------------------------------------------------- /cmake/arm64-windows-llvm.cmake: -------------------------------------------------------------------------------- 1 | set( CMAKE_SYSTEM_NAME Windows ) 2 | set( CMAKE_SYSTEM_PROCESSOR arm64 ) 3 | 4 | set( target arm64-pc-windows-msvc ) 5 | 6 | set( CMAKE_C_COMPILER clang ) 7 | set( CMAKE_CXX_COMPILER clang++ ) 8 | 9 | set( CMAKE_C_COMPILER_TARGET ${target} ) 10 | set( CMAKE_CXX_COMPILER_TARGET ${target} ) 11 | 12 | set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" ) 13 | set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" ) 14 | 15 | set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" ) 16 | set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" ) 17 | -------------------------------------------------------------------------------- /cmake/arm64-windows-msvc.cmake: -------------------------------------------------------------------------------- 1 | set( CMAKE_SYSTEM_NAME Windows ) 2 | set( CMAKE_SYSTEM_PROCESSOR arm64 ) 3 | 4 | set( target arm64-pc-windows-msvc ) 5 | set( CMAKE_C_COMPILER_TARGET ${target} ) 6 | set( CMAKE_CXX_COMPILER_TARGET ${target} ) 7 | -------------------------------------------------------------------------------- /cmake/git-vars.cmake: -------------------------------------------------------------------------------- 1 | find_package(Git) 2 | 3 | # the commit's SHA1 4 | execute_process(COMMAND 5 | "${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8 6 | WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" 7 | OUTPUT_VARIABLE GIT_SHA1 8 | ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) 9 | 10 | # the date of the commit 11 | execute_process(COMMAND 12 | "${GIT_EXECUTABLE}" log -1 --format=%ad --date=local 13 | WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" 14 | OUTPUT_VARIABLE GIT_DATE 15 | ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) 16 | 17 | # the subject of the commit 18 | execute_process(COMMAND 19 | "${GIT_EXECUTABLE}" log -1 --format=%s 20 | WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" 21 | OUTPUT_VARIABLE 
GIT_COMMIT_SUBJECT 22 | ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) 23 | -------------------------------------------------------------------------------- /cmake/llama.pc.in: -------------------------------------------------------------------------------- 1 | prefix=@CMAKE_INSTALL_PREFIX@ 2 | exec_prefix=${prefix} 3 | libdir=${exec_prefix}/lib 4 | includedir=${prefix}/include 5 | 6 | Name: llama 7 | Description: Port of Facebook's LLaMA model in C/C++ 8 | Version: @PROJECT_VERSION@ 9 | Libs: -L${libdir} -lllama 10 | Cflags: -I${includedir} 11 | -------------------------------------------------------------------------------- /common/build-info.cpp.in: -------------------------------------------------------------------------------- 1 | int LLAMA_BUILD_NUMBER = @BUILD_NUMBER@; 2 | char const *LLAMA_COMMIT = "@BUILD_COMMIT@"; 3 | char const *LLAMA_COMPILER = "@BUILD_COMPILER@"; 4 | char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@"; 5 | -------------------------------------------------------------------------------- /common/cmake/build-info-gen-cpp.cmake: -------------------------------------------------------------------------------- 1 | include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake) 2 | 3 | set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in") 4 | set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp") 5 | 6 | # Only write the build info if it changed 7 | if(EXISTS ${OUTPUT_FILE}) 8 | file(READ ${OUTPUT_FILE} CONTENTS) 9 | string(REGEX MATCH "LLAMA_COMMIT = \"([^\"]*)\";" _ ${CONTENTS}) 10 | set(OLD_COMMIT ${CMAKE_MATCH_1}) 11 | string(REGEX MATCH "LLAMA_COMPILER = \"([^\"]*)\";" _ ${CONTENTS}) 12 | set(OLD_COMPILER ${CMAKE_MATCH_1}) 13 | string(REGEX MATCH "LLAMA_BUILD_TARGET = \"([^\"]*)\";" _ ${CONTENTS}) 14 | set(OLD_TARGET ${CMAKE_MATCH_1}) 15 | if ( 16 | NOT OLD_COMMIT STREQUAL BUILD_COMMIT OR 17 | NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR 18 | NOT OLD_TARGET STREQUAL BUILD_TARGET 19 | ) 20 | configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE}) 21 | endif() 22 | else() 23 | configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE}) 24 | endif() 25 | -------------------------------------------------------------------------------- /common/console.h: -------------------------------------------------------------------------------- 1 | // Console functions 2 | 3 | #pragma once 4 | 5 | #include <string> 6 | 7 | namespace console { 8 | enum display_t { 9 | reset = 0, 10 | prompt, 11 | user_input, 12 | error 13 | }; 14 | 15 | void init(bool use_simple_io, bool use_advanced_display); 16 | void cleanup(); 17 | void set_display(display_t display); 18 | bool readline(std::string & line, bool multiline_input); 19 | char32_t getchar32(); 20 | } 21 | -------------------------------------------------------------------------------- /common/json-schema-to-grammar.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "ggml.h" 4 | // Change JSON_ASSERT from assert() to GGML_ASSERT: 5 | #define JSON_ASSERT GGML_ASSERT 6 | #include "json.hpp" 7 | 8 | std::string json_schema_to_grammar(const nlohmann::ordered_json& schema); 9 | -------------------------------------------------------------------------------- /docs/development/llama-star/idea-arch.key: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/docs/development/llama-star/idea-arch.key
-------------------------------------------------------------------------------- /docs/development/llama-star/idea-arch.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/docs/development/llama-star/idea-arch.pdf -------------------------------------------------------------------------------- /docs/install.md: -------------------------------------------------------------------------------- 1 | # Install pre-built version of llama.cpp 2 | 3 | ## Homebrew 4 | 5 | On Mac and Linux, the Homebrew package manager can be used via 6 | 7 | ```sh 8 | brew install llama.cpp 9 | ``` 10 | The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668 11 | 12 | ## Nix 13 | 14 | On Mac and Linux, the Nix package manager can be used via 15 | 16 | ```sh 17 | nix profile install nixpkgs#llama-cpp 18 | ``` 19 | For flake-enabled installs. 20 | 21 | Or 22 | 23 | ```sh 24 | nix-env --file '<nixpkgs>' --install --attr llama-cpp 25 | ``` 26 | 27 | For non-flake-enabled installs. 28 | 29 | This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/llama-cpp/package.nix#L164). 30 | 31 | ## Flox 32 | 33 | On Mac and Linux, Flox can be used to install llama.cpp within a Flox environment via 34 | 35 | ```sh 36 | flox install llama-cpp 37 | ``` 38 | 39 | Flox follows the nixpkgs build of llama.cpp. 40 | -------------------------------------------------------------------------------- /ggml/.gitignore: -------------------------------------------------------------------------------- 1 | src/ggml-vulkan-shaders.hpp 2 | src/ggml-vulkan-shaders.cpp 3 | -------------------------------------------------------------------------------- /ggml/include/ggml-blas.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "ggml.h" 4 | #include "ggml-backend.h" 5 | 6 | 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | 11 | // backend API 12 | GGML_API ggml_backend_t ggml_backend_blas_init(void); 13 | 14 | GGML_API bool ggml_backend_is_blas(ggml_backend_t backend); 15 | 16 | // number of threads used for conversion to float 17 | // for openblas and blis, this will also set the number of threads used for blas operations 18 | GGML_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads); 19 | 20 | 21 | #ifdef __cplusplus 22 | } 23 | #endif 24 | -------------------------------------------------------------------------------- /ggml/include/ggml-kompute.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "ggml.h" 4 | #include "ggml-backend.h" 5 | 6 | #include <stdbool.h> 7 | #include <stddef.h> 8 | #include <stdint.h> 9 | 10 | #ifdef __cplusplus 11 | extern "C" { 12 | #endif 13 | 14 | struct ggml_vk_device { 15 | int index; 16 | int type; // same as VkPhysicalDeviceType 17 | size_t heapSize; 18 | const char * name; 19 | const char * vendor; 20 | int subgroupSize; 21 | uint64_t bufferAlignment; 22 | uint64_t maxAlloc; 23 | }; 24 | 25 | struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count); 26 | bool ggml_vk_get_device(struct ggml_vk_device * device, size_t memoryRequired, const char * name); 27 | bool ggml_vk_has_vulkan(void); 28 | bool ggml_vk_has_device(void); 29 | struct ggml_vk_device ggml_vk_current_device(void); 30
| 31 | // 32 | // backend API 33 | // 34 | 35 | // forward declaration 36 | typedef struct ggml_backend * ggml_backend_t; 37 | 38 | GGML_API ggml_backend_t ggml_backend_kompute_init(int device); 39 | 40 | GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend); 41 | 42 | GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device); 43 | 44 | #ifdef __cplusplus 45 | } 46 | #endif 47 | -------------------------------------------------------------------------------- /ggml/include/ggml-rpc.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "ggml.h" 4 | #include "ggml-backend.h" 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | 10 | #define GGML_RPC_MAX_SERVERS 16 11 | 12 | // backend API 13 | GGML_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint); 14 | GGML_API bool ggml_backend_is_rpc(ggml_backend_t backend); 15 | 16 | GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint); 17 | 18 | GGML_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total); 19 | 20 | GGML_API void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem); 21 | 22 | #ifdef __cplusplus 23 | } 24 | #endif 25 | -------------------------------------------------------------------------------- /ggml/include/ggml-vulkan.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "ggml.h" 4 | #include "ggml-backend.h" 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | 10 | #define GGML_VK_NAME "Vulkan" 11 | #define GGML_VK_MAX_DEVICES 16 12 | 13 | GGML_API void ggml_vk_instance_init(void); 14 | 15 | // backend API 16 | GGML_API ggml_backend_t ggml_backend_vk_init(size_t dev_num); 17 | 18 | GGML_API bool ggml_backend_is_vk(ggml_backend_t backend); 19 | GGML_API int ggml_backend_vk_get_device_count(void); 20 | GGML_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size); 21 | GGML_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total); 22 | 23 | GGML_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num); 24 | // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU 25 | GGML_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void); 26 | 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | -------------------------------------------------------------------------------- /ggml/src/ggml-cann/kernels/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if (NOT SOC_TYPE) 2 | set (SOC_TYPE "Ascend910B3") 3 | endif() 4 | 5 | file(GLOB SRC_FILES 6 | get_row_f32.cpp 7 | get_row_f16.cpp 8 | get_row_q4_0.cpp 9 | get_row_q8_0.cpp 10 | quantize_f32_q8_0.cpp 11 | quantize_f16_q8_0.cpp 12 | quantize_float_to_q4_0.cpp 13 | dup.cpp 14 | ) 15 | 16 | string(TOLOWER ${SOC_TYPE} SOC_VERSION) 17 | set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR}) 18 | set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim") 19 | 20 | if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) 21 | set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) 22 | elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake) 23 | set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake) 24 | else() 25 | 
message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the compiler package is installed.") 26 | endif() 27 | include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) 28 | 29 | ascendc_library(ascendc_kernels STATIC 30 | ${SRC_FILES} 31 | ) 32 | 33 | # ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP) 34 | -------------------------------------------------------------------------------- /ggml/src/ggml-cann/kernels/ascendc_kernels.h: -------------------------------------------------------------------------------- 1 | #ifndef ASCENDC_KERNELS_H 2 | #define ASCENDC_KERNELS_H 3 | 4 | #include "aclrtlaunch_ascendc_get_row_f32.h" 5 | #include "aclrtlaunch_ascendc_get_row_f16.h" 6 | #include "aclrtlaunch_ascendc_get_row_q8_0.h" 7 | #include "aclrtlaunch_ascendc_get_row_q4_0.h" 8 | 9 | #include "aclrtlaunch_ascendc_quantize_f32_q8_0.h" 10 | #include "aclrtlaunch_ascendc_quantize_f16_q8_0.h" 11 | #include "aclrtlaunch_ascendc_quantize_f16_to_q4_0.h" 12 | #include "aclrtlaunch_ascendc_quantize_f32_to_q4_0.h" 13 | 14 | #include "aclrtlaunch_ascendc_dup_by_rows_fp16.h" 15 | #include "aclrtlaunch_ascendc_dup_by_rows_fp32.h" 16 | #include "aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16.h" 17 | #include "aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32.h" 18 | 19 | #endif // ASCENDC_KERNELS_H 20 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/acc.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_ACC_BLOCK_SIZE 256 4 | 5 | void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/arange.cu: -------------------------------------------------------------------------------- 1 | #include "arange.cuh" 2 | 3 | static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) { 4 | // blockIDx.x: idx of ne0 / BLOCK_SIZE 5 | int nidx = threadIdx.x + blockIdx.x * blockDim.x; 6 | if (nidx >= ne0) { 7 | return; 8 | } 9 | dst[nidx] = start + step * nidx; 10 | } 11 | 12 | static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) { 13 | int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE; 14 | arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start, step); 15 | } 16 | 17 | void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { 18 | float * dst_d = (float *)dst->data; 19 | cudaStream_t stream = ctx.stream(); 20 | 21 | GGML_ASSERT(dst->type == GGML_TYPE_F32); 22 | 23 | float start; 24 | float stop; 25 | float step; 26 | memcpy(&start, (float *)dst->op_params + 0, sizeof(float)); 27 | memcpy(&stop, (float *)dst->op_params + 1, sizeof(float)); 28 | memcpy(&step, (float *)dst->op_params + 2, sizeof(float)); 29 | 30 | int64_t steps = (int64_t)ceil((stop - start) / step); 31 | GGML_ASSERT(ggml_nelements(dst) == steps); 32 | 33 | arange_f32_cuda(dst_d, dst->ne[0], start, step, stream); 34 | } 35 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/arange.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_ARANGE_BLOCK_SIZE 256 4 | 5 | void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | --------------------------------------------------------------------------------
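The small per-op files above and below (acc.cuh, arange.cu/arange.cuh, clamp, scale, and so on) all follow the same convention: a fixed block-size macro, a `ggml_cuda_op_*` entry point taking the backend context and the destination tensor, scalar parameters read back out of `dst->op_params`, and a 1-D grid sized by ceiling division so every element is covered. As a minimal sketch of that convention only (this is not a file from the repository, and `fill_f32` is a made-up op used purely for illustration):

```cuda
// Illustrative sketch, not part of the repository: a hypothetical "fill" op
// written in the same style as arange.cu / clamp.cu above.
#include <cuda_runtime.h>

#define CUDA_FILL_BLOCK_SIZE 256  // hypothetical block-size macro

static __global__ void fill_f32(float * dst, const float value, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    dst[i] = value;
}

static void fill_f32_cuda(float * dst, const float value, const int k, cudaStream_t stream) {
    // ceiling division so the final partial block still covers the tail elements
    const int num_blocks = (k + CUDA_FILL_BLOCK_SIZE - 1) / CUDA_FILL_BLOCK_SIZE;
    fill_f32<<<num_blocks, CUDA_FILL_BLOCK_SIZE, 0, stream>>>(dst, value, k);
}
```

--------------------------------------------------------------------------------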
/ggml/src/ggml-cuda/argmax.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 4 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/argsort.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 4 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/binbcast.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 4 | void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 5 | void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 7 | void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 8 | 9 | void ggml_cuda_op_repeat_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 10 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/clamp.cu: -------------------------------------------------------------------------------- 1 | #include "clamp.cuh" 2 | 3 | static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) { 4 | const int i = blockDim.x*blockIdx.x + threadIdx.x; 5 | 6 | if (i >= k) { 7 | return; 8 | } 9 | 10 | dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]); 11 | } 12 | 13 | static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) { 14 | const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE; 15 | clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k); 16 | } 17 | 18 | 19 | void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { 20 | const ggml_tensor * src0 = dst->src[0]; 21 | const float * src0_d = (const float *)src0->data; 22 | float * dst_d = (float *)dst->data; 23 | cudaStream_t stream = ctx.stream(); 24 | 25 | GGML_ASSERT(src0->type == GGML_TYPE_F32); 26 | GGML_ASSERT( dst->type == GGML_TYPE_F32); 27 | 28 | float min; 29 | float max; 30 | memcpy(&min, dst->op_params, sizeof(float)); 31 | memcpy(&max, (float *) dst->op_params + 1, sizeof(float)); 32 | 33 | clamp_f32_cuda(src0_d, dst_d, min, max, ggml_nelements(src0), stream); 34 | } 35 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/clamp.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_CLAMP_BLOCK_SIZE 256 4 | 5 | void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/concat.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_CONCAT_BLOCK_SIZE 256 4 | 5 | void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/conv-transpose-1d.cuh:
-------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE 256 4 | 5 | void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/convert.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_DEQUANTIZE_BLOCK_SIZE 256 4 | 5 | template<typename T> 6 | using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream); 7 | 8 | typedef to_t_cuda_t<float> to_fp32_cuda_t; 9 | typedef to_t_cuda_t<half> to_fp16_cuda_t; 10 | 11 | to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type); 12 | 13 | to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type); 14 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/count-equal.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_COUNT_EQUAL_CHUNK_SIZE 128 4 | 5 | void ggml_cuda_count_equal(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/cpy.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_CPY_BLOCK_SIZE 32 4 | 5 | void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1); 6 | 7 | void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 8 | 9 | void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1); 10 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/cross-entropy-loss.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE 256 4 | 5 | void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | 7 | void ggml_cuda_cross_entropy_loss_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 8 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/diagmask.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32 4 | 5 | void ggml_cuda_op_diag_mask_inf(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/dmmv.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | // dmmv = dequantize_mul_mat_vec 4 | 5 | // TODO: remove this?
6 | #ifndef GGML_CUDA_DMMV_X 7 | #define GGML_CUDA_DMMV_X 32 8 | #endif 9 | 10 | #ifndef GGML_CUDA_MMV_Y 11 | #define GGML_CUDA_MMV_Y 1 12 | #endif 13 | 14 | void ggml_cuda_op_dequantize_mul_mat_vec( 15 | ggml_backend_cuda_context & ctx, 16 | const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, 17 | const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, 18 | const int64_t src1_padded_row_size, cudaStream_t stream); 19 | 20 | bool ggml_cuda_dmmv_type_supported(ggml_type src0_type); 21 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/fattn-tile-f16.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 4 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/fattn-tile-f32.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 4 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/fattn.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 4 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/getrows.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_GET_ROWS_BLOCK_SIZE 256 4 | 5 | void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/im2col.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_IM2COL_BLOCK_SIZE 256 4 | 5 | void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/mmvq.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels. 
4 | 5 | void ggml_cuda_op_mul_mat_vec_q( 6 | ggml_backend_cuda_context & ctx, 7 | const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, 8 | const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, 9 | const int64_t src1_padded_row_size, cudaStream_t stream); 10 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/norm.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 4 | 5 | void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | 7 | void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 8 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/opt-step-adamw.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_OPT_STEP_ADAMW_BLOCK_SIZE 256 4 | 5 | void ggml_cuda_opt_step_adamw(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/out-prod.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 4 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/pad.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_PAD_BLOCK_SIZE 256 4 | 5 | void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/pool2d.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_POOL2D_BLOCK_SIZE 256 4 | 5 | void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/quantize.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common.cuh" 4 | #include "mmq.cuh" 5 | 6 | #include <cstdint> 7 | 8 | #define CUDA_QUANTIZE_BLOCK_SIZE 256 9 | #define CUDA_QUANTIZE_BLOCK_SIZE_MMQ 128 10 | 11 | static_assert(MATRIX_ROW_PADDING % CUDA_QUANTIZE_BLOCK_SIZE == 0, "Risk of out-of-bounds access."); 12 | static_assert(MATRIX_ROW_PADDING % (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ) == 0, "Risk of out-of-bounds access."); 13 | 14 | typedef void (*quantize_cuda_t)( 15 | const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded, 16 | const ggml_type type_x, cudaStream_t stream); 17 | 18 | void quantize_row_q8_1_cuda( 19 | const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded, 20 | const ggml_type type_x, cudaStream_t stream); 21 | 22 | void quantize_mmq_q8_1_cuda( 23 | const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded, 24 | const ggml_type type_x, cudaStream_t stream); 25 |
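The `quantize_cuda_t` function-pointer typedef above lets the caller select between the plain q8_1 quantizer and the MMQ-specific layout through a single indirection; both declared functions share its signature. A minimal sketch of such a selection, assuming a hypothetical `use_mmq` flag (the real dispatch lives elsewhere in the CUDA backend and is not shown here):

```cuda
// Illustrative sketch, not part of the repository: picking a quantization
// kernel through the quantize_cuda_t type declared in quantize.cuh above.
#include "quantize.cuh"

static quantize_cuda_t pick_quantize_kernel(bool use_mmq) {
    // Both functions match the quantize_cuda_t signature, so either one can be
    // stored in the same pointer and invoked identically by the caller.
    return use_mmq ? quantize_mmq_q8_1_cuda : quantize_row_q8_1_cuda;
}
```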
-------------------------------------------------------------------------------- /ggml/src/ggml-cuda/rope.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_ROPE_BLOCK_SIZE 256 4 | 5 | void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/rwkv-wkv.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_WKV_BLOCK_SIZE 64 4 | 5 | void ggml_cuda_op_rwkv_wkv(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/scale.cu: -------------------------------------------------------------------------------- 1 | #include "scale.cuh" 2 | 3 | static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) { 4 | const int i = blockDim.x*blockIdx.x + threadIdx.x; 5 | 6 | if (i >= k) { 7 | return; 8 | } 9 | 10 | dst[i] = scale * x[i]; 11 | } 12 | 13 | static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) { 14 | const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE; 15 | scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k); 16 | } 17 | 18 | void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { 19 | const ggml_tensor * src0 = dst->src[0]; 20 | const float * src0_d = (const float *)src0->data; 21 | float * dst_d = (float *)dst->data; 22 | cudaStream_t stream = ctx.stream(); 23 | 24 | GGML_ASSERT(src0->type == GGML_TYPE_F32); 25 | GGML_ASSERT( dst->type == GGML_TYPE_F32); 26 | 27 | float scale; 28 | memcpy(&scale, dst->op_params, sizeof(float)); 29 | 30 | scale_f32_cuda(src0_d, dst_d, scale, ggml_nelements(src0), stream); 31 | } 32 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/scale.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_SCALE_BLOCK_SIZE 256 4 | 5 | void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/softmax.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_SOFT_MAX_BLOCK_SIZE 1024 4 | 5 | void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/sum.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream); 4 | 5 | void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/sumrows.cu: -------------------------------------------------------------------------------- 1 | #include "sumrows.cuh" 2 | 3 | static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) { 4 | const int row = blockIdx.x; 5 | const int col = threadIdx.x; 6 | 7 | float sum = 0.0f; 8 | for (int i = col; i < ncols; i += blockDim.x) { 9 | sum += x[row *
ncols + i]; 10 | } 11 | 12 | sum = warp_reduce_sum(sum); 13 | 14 | if (col == 0) { 15 | dst[row] = sum; 16 | } 17 | } 18 | 19 | void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) { 20 | const dim3 block_dims(WARP_SIZE, 1, 1); 21 | const dim3 block_nums(nrows, 1, 1); 22 | k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols); 23 | } 24 | 25 | void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { 26 | const ggml_tensor * src0 = dst->src[0]; 27 | const float * src0_d = (const float *)src0->data; 28 | float * dst_d = (float *)dst->data; 29 | cudaStream_t stream = ctx.stream(); 30 | 31 | GGML_ASSERT(src0->type == GGML_TYPE_F32); 32 | GGML_ASSERT( dst->type == GGML_TYPE_F32); 33 | GGML_ASSERT(ggml_is_contiguous(src0)); 34 | 35 | const int64_t ncols = src0->ne[0]; 36 | const int64_t nrows = ggml_nrows(src0); 37 | 38 | sum_rows_f32_cuda(src0_d, dst_d, ncols, nrows, stream); 39 | } 40 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/sumrows.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream); 4 | 5 | void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually.
2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 
2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 
2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 
2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 
2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 
2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 
2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 
2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 
2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 
2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 
2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 
2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-wmma-f16.cuh" 4 | 5 | DECL_FATTN_WMMA_F16_CASE(64, 16, float); 6 | DECL_FATTN_WMMA_F16_CASE(80, 16, float); 7 | DECL_FATTN_WMMA_F16_CASE(96, 16, float); 8 | DECL_FATTN_WMMA_F16_CASE(112, 16, float); 9 | DECL_FATTN_WMMA_F16_CASE(128, 16, float); 10 | DECL_FATTN_WMMA_F16_CASE(256, 16, float); 11 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-wmma-f16.cuh" 4 | 5 | DECL_FATTN_WMMA_F16_CASE(64, 32, float); 6 | DECL_FATTN_WMMA_F16_CASE(80, 32, float); 7 | DECL_FATTN_WMMA_F16_CASE(96, 32, float); 8 | DECL_FATTN_WMMA_F16_CASE(112, 32, float); 9 | DECL_FATTN_WMMA_F16_CASE(128, 32, float); 10 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-wmma-f16.cuh" 4 | 5 | DECL_FATTN_WMMA_F16_CASE(64, 16, half); 6 | DECL_FATTN_WMMA_F16_CASE(80, 16, half); 7 | DECL_FATTN_WMMA_F16_CASE(96, 16, half); 8 | DECL_FATTN_WMMA_F16_CASE(112, 16, half); 9 | DECL_FATTN_WMMA_F16_CASE(128, 16, half); 10 | DECL_FATTN_WMMA_F16_CASE(256, 16, half); 11 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-wmma-f16.cuh" 4 | 5 | DECL_FATTN_WMMA_F16_CASE(64, 32, half); 6 | DECL_FATTN_WMMA_F16_CASE(80, 32, half); 7 | DECL_FATTN_WMMA_F16_CASE(96, 32, half); 8 | DECL_FATTN_WMMA_F16_CASE(112, 32, half); 9 | DECL_FATTN_WMMA_F16_CASE(128, 32, half); 10 | DECL_FATTN_WMMA_F16_CASE(256, 32, half); 11 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-wmma-f16.cuh" 4 | 5 | DECL_FATTN_WMMA_F16_CASE(64, 8, half); 6 | DECL_FATTN_WMMA_F16_CASE(96, 8, half); 7 | DECL_FATTN_WMMA_F16_CASE(128, 8, half); 8 | DECL_FATTN_WMMA_F16_CASE(256, 8, half); 9 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 
2 | 3 | #include "../mmq.cuh" 4 | 5 | DECL_MMQ_CASE(GGML_TYPE_IQ1_S); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../mmq.cuh" 4 | 5 | DECL_MMQ_CASE(GGML_TYPE_IQ2_S); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../mmq.cuh" 4 | 5 | DECL_MMQ_CASE(GGML_TYPE_IQ2_XS); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../mmq.cuh" 4 | 5 | DECL_MMQ_CASE(GGML_TYPE_IQ2_XXS); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../mmq.cuh" 4 | 5 | DECL_MMQ_CASE(GGML_TYPE_IQ3_S); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../mmq.cuh" 4 | 5 | DECL_MMQ_CASE(GGML_TYPE_IQ3_XXS); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../mmq.cuh" 4 | 5 | DECL_MMQ_CASE(GGML_TYPE_IQ4_NL); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../mmq.cuh" 4 | 5 | DECL_MMQ_CASE(GGML_TYPE_IQ4_XS); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../mmq.cuh" 4 | 5 | DECL_MMQ_CASE(GGML_TYPE_Q2_K); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 
2 | 3 | #include "../mmq.cuh" 4 | 5 | DECL_MMQ_CASE(GGML_TYPE_Q3_K); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../mmq.cuh" 4 | 5 | DECL_MMQ_CASE(GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../mmq.cuh" 4 | 5 | DECL_MMQ_CASE(GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../mmq.cuh" 4 | 5 | DECL_MMQ_CASE(GGML_TYPE_Q4_K); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../mmq.cuh" 4 | 5 | DECL_MMQ_CASE(GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../mmq.cuh" 4 | 5 | DECL_MMQ_CASE(GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../mmq.cuh" 4 | 5 | DECL_MMQ_CASE(GGML_TYPE_Q5_K); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../mmq.cuh" 4 | 5 | DECL_MMQ_CASE(GGML_TYPE_Q6_K); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 
2 | 3 | #include "../mmq.cuh" 4 | 5 | DECL_MMQ_CASE(GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/tsembd.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE 256 4 | 5 | void ggml_cuda_op_timestep_embedding(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/upscale.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_UPSCALE_BLOCK_SIZE 256 4 | 5 | void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/vendors/cuda.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <cuda_runtime.h> 4 | #include <cuda.h> 5 | #include <cublas_v2.h> 6 | #include <cuda_fp16.h> 7 | 8 | #if CUDART_VERSION < 11020 9 | #define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED 10 | #define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH 11 | #define CUBLAS_COMPUTE_16F CUDA_R_16F 12 | #define CUBLAS_COMPUTE_32F CUDA_R_32F 13 | #define cublasComputeType_t cudaDataType_t 14 | #endif // CUDART_VERSION < 11020 15 | -------------------------------------------------------------------------------- /ggml/src/ggml-sycl/backend.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // MIT license 3 | // Copyright (C) 2024 Intel Corporation 4 | // SPDX-License-Identifier: MIT 5 | // 6 | 7 | // 8 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 9 | // See https://llvm.org/LICENSE.txt for license information. 10 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 11 | // 12 | 13 | #ifndef GGML_SYCL_BACKEND_HPP 14 | #define GGML_SYCL_BACKEND_HPP 15 | 16 | #include "concat.hpp" 17 | #include "common.hpp" 18 | #include "conv.hpp" 19 | #include "convert.hpp" 20 | #include "dequantize.hpp" 21 | #include "dmmv.hpp" 22 | #include "mmq.hpp" 23 | #include "mmvq.hpp" 24 | #include "rope.hpp" 25 | #include "norm.hpp" 26 | #include "softmax.hpp" 27 | #include "tsembd.hpp" 28 | #include "im2col.hpp" 29 | 30 | #endif // GGML_SYCL_BACKEND_HPP 31 | -------------------------------------------------------------------------------- /ggml/src/ggml-sycl/concat.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // MIT license 3 | // Copyright (C) 2024 Intel Corporation 4 | // SPDX-License-Identifier: MIT 5 | // 6 | 7 | // 8 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 9 | // See https://llvm.org/LICENSE.txt for license information. 
10 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 11 | // 12 | 13 | #ifndef GGML_SYCL_CONCAT_HPP 14 | #define GGML_SYCL_CONCAT_HPP 15 | 16 | #include "common.hpp" 17 | 18 | void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, 19 | const ggml_tensor *src1, ggml_tensor *dst); 20 | 21 | #endif // GGML_SYCL_CONCAT_HPP 22 | -------------------------------------------------------------------------------- /ggml/src/ggml-sycl/conv.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // MIT license 3 | // Copyright (C) 2024 Intel Corporation 4 | // SPDX-License-Identifier: MIT 5 | // 6 | 7 | // 8 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 9 | // See https://llvm.org/LICENSE.txt for license information. 10 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 11 | // 12 | 13 | #ifndef GGML_SYCL_CONV_HPP 14 | #define GGML_SYCL_CONV_HPP 15 | 16 | #include "common.hpp" 17 | 18 | void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, 19 | const ggml_tensor *src1, ggml_tensor *dst); 20 | 21 | #endif // GGML_SYCL_CONV_HPP 22 | -------------------------------------------------------------------------------- /ggml/src/ggml-sycl/convert.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // MIT license 3 | // Copyright (C) 2024 Intel Corporation 4 | // SPDX-License-Identifier: MIT 5 | // 6 | 7 | // 8 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 9 | // See https://llvm.org/LICENSE.txt for license information. 10 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 11 | // 12 | 13 | #ifndef GGML_SYCL_CONVERT_HPP 14 | #define GGML_SYCL_CONVERT_HPP 15 | 16 | #include "common.hpp" 17 | 18 | template <typename T> 19 | using to_t_sycl_t = void (*)(const void *__restrict__ x, T *__restrict__ y, 20 | int64_t k, dpct::queue_ptr stream); 21 | typedef to_t_sycl_t<float> to_fp32_sycl_t; 22 | typedef to_t_sycl_t<sycl::half> to_fp16_sycl_t; 23 | 24 | to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type); 25 | to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type); 26 | 27 | #endif // GGML_SYCL_CONVERT_HPP 28 | -------------------------------------------------------------------------------- /ggml/src/ggml-sycl/dmmv.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // MIT license 3 | // Copyright (C) 2024 Intel Corporation 4 | // SPDX-License-Identifier: MIT 5 | // 6 | 7 | // 8 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 9 | // See https://llvm.org/LICENSE.txt for license information. 
10 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 11 | // 12 | 13 | #ifndef GGML_SYCL_DMMV_HPP 14 | #define GGML_SYCL_DMMV_HPP 15 | 16 | #include "common.hpp" 17 | 18 | 19 | void ggml_sycl_op_dequantize_mul_mat_vec( 20 | ggml_backend_sycl_context & ctx, 21 | const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, 22 | const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i, 23 | float *dst_dd_i, const int64_t row_low, const int64_t row_high, 24 | const int64_t src1_ncols, const int64_t src1_padded_row_size, 25 | const dpct::queue_ptr &stream); 26 | 27 | #endif // GGML_SYCL_DMMV_HPP 28 | -------------------------------------------------------------------------------- /ggml/src/ggml-sycl/im2col.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // MIT license 3 | // Copyright (C) 2024 Intel Corporation 4 | // SPDX-License-Identifier: MIT 5 | // 6 | 7 | // 8 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 9 | // See https://llvm.org/LICENSE.txt for license information. 10 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 11 | // 12 | 13 | #ifndef GGML_SYCL_IM2COL_HPP 14 | #define GGML_SYCL_IM2COL_HPP 15 | 16 | #include "common.hpp" 17 | 18 | void ggml_sycl_op_im2col( 19 | ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, 20 | ggml_tensor *dst, const float *src0_dd, const float *src1_dd, float *dst_dd, 21 | const queue_ptr &main_stream); 22 | 23 | #endif // GGML_SYCL_IM2COL_HPP 24 | -------------------------------------------------------------------------------- /ggml/src/ggml-sycl/mmq.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // MIT license 3 | // Copyright (C) 2024 Intel Corporation 4 | // SPDX-License-Identifier: MIT 5 | // 6 | 7 | // 8 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 9 | // See https://llvm.org/LICENSE.txt for license information. 10 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 11 | // 12 | 13 | #ifndef GGML_SYCL_MMQ_HPP 14 | #define GGML_SYCL_MMQ_HPP 15 | 16 | #include "common.hpp" 17 | 18 | void ggml_sycl_op_mul_mat_q( 19 | ggml_backend_sycl_context & ctx, 20 | const ggml_tensor* src0, 21 | const ggml_tensor* src1, 22 | ggml_tensor* dst, 23 | const char* src0_dd_i, 24 | const float* src1_ddf_i, 25 | const char* src1_ddq_i, 26 | float* dst_dd_i, 27 | const int64_t row_low, 28 | const int64_t row_high, 29 | const int64_t src1_ncols, 30 | const int64_t src1_padded_row_size, 31 | const dpct::queue_ptr& stream); 32 | 33 | #endif // GGML_SYCL_MMQ_HPP 34 | -------------------------------------------------------------------------------- /ggml/src/ggml-sycl/mmvq.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // MIT license 3 | // Copyright (C) 2024 Intel Corporation 4 | // SPDX-License-Identifier: MIT 5 | // 6 | 7 | // 8 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 9 | // See https://llvm.org/LICENSE.txt for license information. 
10 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 11 | // 12 | 13 | #ifndef GGML_SYCL_MMVQ_HPP 14 | #define GGML_SYCL_MMVQ_HPP 15 | 16 | #include "common.hpp" 17 | 18 | 19 | void ggml_sycl_op_mul_mat_vec_q( 20 | ggml_backend_sycl_context & ctx, 21 | const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, 22 | const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i, 23 | float *dst_dd_i, const int64_t row_low, const int64_t row_high, 24 | const int64_t src1_ncols, const int64_t src1_padded_row_size, 25 | const dpct::queue_ptr &stream); 26 | 27 | #endif // GGML_SYCL_MMVQ_HPP 28 | -------------------------------------------------------------------------------- /ggml/src/ggml-sycl/norm.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // MIT license 3 | // Copyright (C) 2024 Intel Corporation 4 | // SPDX-License-Identifier: MIT 5 | // 6 | 7 | // 8 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 9 | // See https://llvm.org/LICENSE.txt for license information. 10 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 11 | // 12 | 13 | #ifndef GGML_SYCL_NORM_HPP 14 | #define GGML_SYCL_NORM_HPP 15 | 16 | #include "common.hpp" 17 | 18 | void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, const ggml_tensor* src1, 19 | ggml_tensor* dst, const float* src0_dd, 20 | const float* src1_dd, float* dst_dd, 21 | const queue_ptr& main_stream); 22 | 23 | void ggml_sycl_op_rms_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, 24 | const ggml_tensor* src1, ggml_tensor* dst, 25 | const float* src0_dd, const float* src1_dd, 26 | float* dst_dd, 27 | const queue_ptr& main_stream); 28 | 29 | void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, 30 | const ggml_tensor* src1, ggml_tensor* dst, 31 | const float* src0_dd, const float* src1_dd, 32 | float* dst_dd, 33 | const queue_ptr& main_stream); 34 | 35 | #endif // GGML_SYCL_NORM_HPP 36 | -------------------------------------------------------------------------------- /ggml/src/ggml-sycl/rope.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // MIT license 3 | // Copyright (C) 2024 Intel Corporation 4 | // SPDX-License-Identifier: MIT 5 | // 6 | 7 | // 8 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 9 | // See https://llvm.org/LICENSE.txt for license information. 10 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 11 | // 12 | 13 | #ifndef GGML_SYCL_ROPE_HPP 14 | #define GGML_SYCL_ROPE_HPP 15 | 16 | #include "common.hpp" 17 | 18 | void ggml_sycl_op_rope( 19 | ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, 20 | const float *src0_dd, const float *src1_dd, float *dst_dd, const queue_ptr &main_stream); 21 | 22 | #endif // GGML_SYCL_ROPE_HPP 23 | -------------------------------------------------------------------------------- /ggml/src/ggml-sycl/softmax.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // MIT license 3 | // Copyright (C) 2024 Intel Corporation 4 | // SPDX-License-Identifier: MIT 5 | // 6 | 7 | // 8 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 9 | // See https://llvm.org/LICENSE.txt for license information. 
10 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 11 | // 12 | 13 | #ifndef GGML_SYCL_SOFTMAX_HPP 14 | #define GGML_SYCL_SOFTMAX_HPP 15 | 16 | #include "common.hpp" 17 | 18 | void ggml_sycl_op_soft_max(ggml_backend_sycl_context &ctx, const ggml_tensor *src0, 19 | const ggml_tensor *src1, ggml_tensor *dst, 20 | const float *src0_dd, const float *src1_dd, 21 | float *dst_dd, 22 | const queue_ptr &main_stream); 23 | 24 | #endif // GGML_SYCL_SOFTMAX_HPP 25 | -------------------------------------------------------------------------------- /ggml/src/ggml-sycl/tsembd.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // MIT license 3 | // Copyright (C) 2024 Intel Corporation 4 | // SPDX-License-Identifier: MIT 5 | // 6 | 7 | // 8 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 9 | // See https://llvm.org/LICENSE.txt for license information. 10 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 11 | // 12 | 13 | #ifndef GGML_SYCL_TSEMBD_HPP 14 | #define GGML_SYCL_TSEMBD_HPP 15 | 16 | #include "common.hpp" 17 | 18 | void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, 19 | const ggml_tensor *src1, ggml_tensor * dst); 20 | 21 | #endif // GGML_SYCL_TSEMBD_HPP 22 | -------------------------------------------------------------------------------- /ggml/src/kompute-shaders/op_addrow.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | layout(local_size_x = 1) in; 6 | 7 | layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; 8 | layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; }; 9 | layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; }; 10 | 11 | layout(push_constant) uniform PushConstants { 12 | uint inAOff; 13 | uint inBOff; 14 | uint outOff; 15 | uint row; 16 | } pcs; 17 | 18 | void main() { 19 | const uint baseIndex = gl_WorkGroupID.x * 4; 20 | 21 | for (uint x = 0; x < 4; x++) { 22 | const uint i = baseIndex + x; 23 | out_[i + pcs.outOff] = inA[i + pcs.inAOff] + inB[(i % pcs.row) + pcs.inBOff]; 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /ggml/src/kompute-shaders/op_diagmask.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | layout(local_size_x = 1) in; 6 | 7 | layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; 8 | layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; 9 | 10 | layout(push_constant) uniform PushConstants { 11 | uint inOff; 12 | uint outOff; 13 | uint n_past; 14 | int ne00; 15 | int ne01; 16 | } pcs; 17 | 18 | void main() { 19 | const uint i02 = gl_WorkGroupID.z; 20 | const uint i01 = gl_WorkGroupID.y; 21 | const uint i00 = gl_WorkGroupID.x; 22 | 23 | const uint index = i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00 + i00; 24 | 25 | if (i00 > pcs.n_past + i01) { 26 | out_[index + pcs.outOff] = uintBitsToFloat(0xFF800000); 27 | } else { 28 | out_[index + pcs.outOff] = in_[index + pcs.inOff]; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /ggml/src/kompute-shaders/op_gelu.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | layout(local_size_x = 1) in; 6 | 7 | layout(binding = 
0) buffer restrict readonly tensorIn { float in_[]; }; 8 | layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; 9 | layout(push_constant) uniform PushConstants { 10 | uint inOff; 11 | uint outOff; 12 | } pcs; 13 | 14 | void main() { 15 | const uint baseIndex = gl_WorkGroupID.x * 8; 16 | 17 | for (uint x = 0; x < 8; x++) { 18 | const uint i = baseIndex + x; 19 | const float y = in_[i + pcs.inOff]; 20 | out_[i + pcs.outOff] = 0.5*y*(1.0 + tanh(clamp(SQRT_2_OVER_PI*y*(1.0 + GELU_COEF_A*y*y), -15.0, 15.0))); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /ggml/src/kompute-shaders/op_getrows.comp: -------------------------------------------------------------------------------- 1 | void main() { 2 | const uint i = gl_WorkGroupID.x; 3 | const int r = inB[i + pcs.inBOff]; 4 | 5 | int z = 0; 6 | for (uint ind = gl_LocalInvocationID.x; ind < pcs.ne00/16; ind += gl_WorkGroupSize.x) { 7 | const uint inIndex = (r * pcs.nb01 + pcs.inAOff) + ind/NL * SIZE_OF_BLOCK; 8 | const mat4 result = dequantize_block(inIndex, ind%NL); 9 | for (uint j = 0; j < 4; ++j) { 10 | for (uint k = 0; k < 4; ++k) { 11 | const uint outIndex = i * pcs.nb1/BYTES_FOR_TYPE + pcs.outOff + z; 12 | out_[outIndex] = result[j][k]; 13 | ++z; 14 | } 15 | } 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /ggml/src/kompute-shaders/op_getrows_f16.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | layout(local_size_x = 1) in; 6 | 7 | layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; }; 8 | layout (binding = 1) readonly buffer tensorInB { int inB[]; }; 9 | layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; 10 | 11 | layout (push_constant) uniform parameter { 12 | uint inAOff; 13 | uint inBOff; 14 | uint outOff; 15 | int ne00; 16 | int nb01; 17 | int nb1; 18 | } pcs; 19 | 20 | void dequantize_row_f16(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) { 21 | for (int j = 0; j < k; j++) { 22 | out_[y + j] = inA[x + j]; 23 | } 24 | } 25 | 26 | void main() { 27 | const uint i = gl_WorkGroupID.x; 28 | const int r = inB[i + pcs.inBOff]; 29 | 30 | dequantize_row_f16(r*pcs.nb01/2/*bytes for float16*/ + pcs.inAOff, i*pcs.nb1/4 + pcs.outOff, pcs.ne00); 31 | } 32 | -------------------------------------------------------------------------------- /ggml/src/kompute-shaders/op_getrows_f32.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | layout(local_size_x = 1) in; 6 | 7 | layout (binding = 0) readonly buffer tensorInA { float inA[]; }; 8 | layout (binding = 1) readonly buffer tensorInB { int inB[]; }; 9 | layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; 10 | 11 | layout (push_constant) uniform parameter { 12 | uint inAOff; 13 | uint inBOff; 14 | uint outOff; 15 | int ne00; 16 | int nb01; 17 | int nb1; 18 | } pcs; 19 | 20 | void dequantize_row_f32(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) { 21 | for (int j = 0; j < k; j++) { 22 | out_[y + j] = inA[x + j]; 23 | } 24 | } 25 | 26 | void main() { 27 | const uint i = gl_WorkGroupID.x; 28 | const int r = inB[i + pcs.inBOff]; 29 | 30 | dequantize_row_f32(r*pcs.nb01/4 + pcs.inAOff, i*pcs.nb1/4 + pcs.outOff, pcs.ne00); 31 | } 32 | -------------------------------------------------------------------------------- 
/ggml/src/kompute-shaders/op_getrows_q4_0.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | #define NL 2 6 | #define BYTES_FOR_TYPE 4 /*bytes for float*/ 7 | #define SIZE_OF_BLOCK sizeof_block_q4_0 8 | 9 | layout(local_size_x = 1) in; 10 | 11 | layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; 12 | layout (binding = 1) readonly buffer tensorInB { int inB[]; }; 13 | layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; 14 | 15 | layout (push_constant) uniform parameter { 16 | uint inAOff; 17 | uint inBOff; 18 | uint outOff; 19 | int ne00; 20 | int nb01; 21 | int nb1; 22 | } pcs; 23 | 24 | block_q4_0 get_unaligned_block_q4_0(uint index) { 25 | block_q4_0 fres; 26 | fres.d = u8BufToFloat16(inA, index); 27 | [[unroll]] for (uint it = 0; it != QK4_0 / 2; it++) { 28 | fres.qs[it] = inA[index+2+it]; 29 | } 30 | return fres; 31 | } 32 | 33 | mat4 dequantize_block(uint index, uint il) { 34 | const block_q4_0 block = get_unaligned_block_q4_0(index); 35 | return dequantize_q4_0(block, il); 36 | } 37 | 38 | #include "op_getrows.comp" 39 | -------------------------------------------------------------------------------- /ggml/src/kompute-shaders/op_getrows_q4_1.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | #define NL 2 6 | #define BYTES_FOR_TYPE 4 /*bytes for float*/ 7 | #define SIZE_OF_BLOCK sizeof_block_q4_1 8 | 9 | layout(local_size_x = 1) in; 10 | 11 | layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; 12 | layout (binding = 1) readonly buffer tensorInB { int inB[]; }; 13 | layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; 14 | 15 | layout (push_constant) uniform parameter { 16 | uint inAOff; 17 | uint inBOff; 18 | uint outOff; 19 | int ne00; 20 | int nb01; 21 | int nb1; 22 | } pcs; 23 | 24 | block_q4_1 get_unaligned_block_q4_1(uint index) { 25 | block_q4_1 fres; 26 | fres.d = u8BufToFloat16(inA, index); 27 | fres.m = u8BufToFloat16(inA, index+2); 28 | [[unroll]] for (uint it = 0; it != QK4_1 / 2; it++) { 29 | fres.qs[it] = inA[index+4+it]; 30 | } 31 | return fres; 32 | } 33 | 34 | mat4 dequantize_block(uint index, uint il) { 35 | const block_q4_1 block = get_unaligned_block_q4_1(index); 36 | return dequantize_q4_1(block, il); 37 | } 38 | 39 | #include "op_getrows.comp" 40 | -------------------------------------------------------------------------------- /ggml/src/kompute-shaders/op_getrows_q6_k.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | #define NL 16 6 | #define BYTES_FOR_TYPE 4 /*bytes for float*/ 7 | #define SIZE_OF_BLOCK sizeof_block_q6_k 8 | 9 | layout(local_size_x = 1) in; 10 | 11 | layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; 12 | layout (binding = 1) readonly buffer tensorInB { int inB[]; }; 13 | layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; 14 | 15 | layout (push_constant) uniform parameter { 16 | uint inAOff; 17 | uint inBOff; 18 | uint outOff; 19 | int ne00; 20 | int nb01; 21 | int nb1; 22 | } pcs; 23 | 24 | block_q6_k get_unaligned_block_q6_k(uint index) { 25 | block_q6_k fres; 26 | [[unroll]] for (uint it = 0; it != QK_K / 2; it++) { 27 | fres.ql[it] = inA[index + it]; 28 | } 29 | [[unroll]] for (uint it = 0; it != QK_K / 4; it++) { 30 | fres.qh[it] = inA[index + QK_K/2 + it]; 31 | } 32 | 
[[unroll]] for (uint it = 0; it != QK_K / 16; it++) { 33 | fres.scales[it] = int8_t(inA[index + QK_K/2 + QK_K/4 + it]); 34 | } 35 | fres.d = u8BufToFloat16(inA, index + QK_K/2 + QK_K/4 + QK_K/16); 36 | return fres; 37 | } 38 | 39 | mat4 dequantize_block(uint index, uint il) { 40 | const block_q6_k block = get_unaligned_block_q6_k(index); 41 | return dequantize_q6_k(block, il); 42 | } 43 | 44 | #include "op_getrows.comp" 45 | -------------------------------------------------------------------------------- /ggml/src/kompute-shaders/op_mul_mat_q4_0.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | #define BLOCKS_IN_QUANT QK4_0 6 | #define SIZE_OF_BLOCK sizeof_block_q4_0 7 | #define N_ROWS 4 8 | 9 | #include "op_mul_mv_q_n_pre.comp" 10 | 11 | // The q4_0 version of this function 12 | float block_q_n_dot_y(uint block_index, uint yb, uint il) { 13 | vec2 acc = vec2(0.0, 0.0); 14 | const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff; 15 | float d = float(u8BufToFloat16(inA, index)); 16 | float sumy = 0.0f; 17 | for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) { 18 | const uint16_t b = u8BufToU16(inA, index + 2 + il + i); 19 | 20 | const float yl0 = inB[yb + i]; 21 | const float yl1 = inB[yb + i + 1]; 22 | const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2]; 23 | const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1]; 24 | 25 | sumy += yl0 + yl1 + yl8 + yl9; 26 | 27 | acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00); 28 | acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000); 29 | } 30 | return d * (sumy * -8.f + acc[0] + acc[1]); 31 | } 32 | 33 | #include "op_mul_mv_q_n.comp" 34 | -------------------------------------------------------------------------------- /ggml/src/kompute-shaders/op_mul_mat_q4_1.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | #define BLOCKS_IN_QUANT QK4_1 6 | #define SIZE_OF_BLOCK sizeof_block_q4_1 7 | #define N_ROWS 4 8 | 9 | #include "op_mul_mv_q_n_pre.comp" 10 | 11 | // The q4_1 version of this function 12 | float block_q_n_dot_y(uint block_index, uint yb, uint il) { 13 | vec2 acc = vec2(0.0, 0.0); 14 | const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff; 15 | float d = float(u8BufToFloat16(inA, index)); 16 | float m = float(u8BufToFloat16(inA, index+2)); 17 | 18 | float sumy = 0.0f; 19 | for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) { 20 | const uint16_t b = u8BufToU16(inA, index + 4 + il + i); 21 | 22 | const float yl0 = inB[yb + i]; 23 | const float yl1 = inB[yb + i + 1]; 24 | const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2]; 25 | const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1]; 26 | 27 | sumy += yl0 + yl1 + yl8 + yl9; 28 | 29 | acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00); 30 | acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000); 31 | } 32 | return d * (acc[0] + acc[1]) + sumy * m; 33 | } 34 | 35 | #include "op_mul_mv_q_n.comp" 36 | -------------------------------------------------------------------------------- /ggml/src/kompute-shaders/op_mul_mv_q_n_pre.comp: -------------------------------------------------------------------------------- 1 | layout(local_size_x_id = 0) in; 2 | layout(local_size_y = 1) in; 3 | layout(local_size_z = 1) in; 4 | 5 | layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; 6 | layout (binding = 1) readonly buffer tensorInB { float inB[]; }; 7 | layout (binding = 2) writeonly 
buffer tensorOut { float out_[]; }; 8 | 9 | layout (push_constant) uniform parameter { 10 | uint inAOff; 11 | uint inBOff; 12 | uint outOff; 13 | int ne00; 14 | int ne01; 15 | int ne02; 16 | int ne10; 17 | int ne12; 18 | int ne0; 19 | int ne1; 20 | uint r2; 21 | uint r3; 22 | } pcs; 23 | -------------------------------------------------------------------------------- /ggml/src/kompute-shaders/op_relu.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | layout(local_size_x = 1) in; 6 | 7 | layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; 8 | layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; 9 | layout(push_constant) uniform PushConstants { 10 | uint inOff; 11 | uint outOff; 12 | } pcs; 13 | 14 | void main() { 15 | const uint baseIndex = gl_WorkGroupID.x * 4; 16 | 17 | for (uint x = 0; x < 4; x++) { 18 | const uint i = baseIndex + x; 19 | out_[i + pcs.outOff] = max(0.0, in_[i + pcs.inOff]); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /ggml/src/kompute-shaders/op_scale.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | layout(local_size_x = 1) in; 6 | 7 | layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; 8 | layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; 9 | 10 | layout(push_constant) uniform PushConstants { 11 | uint inOff; 12 | uint outOff; 13 | float scale; 14 | } pcs; 15 | 16 | void main() { 17 | const uint i = gl_WorkGroupID.x; 18 | out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale; 19 | } 20 | -------------------------------------------------------------------------------- /ggml/src/kompute-shaders/op_scale_8.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | layout(local_size_x = 1) in; 6 | 7 | layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; 8 | layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; 9 | 10 | layout(push_constant) uniform PushConstants { 11 | uint inOff; 12 | uint outOff; 13 | float scale; 14 | } pcs; 15 | 16 | void main() { 17 | const uint baseIndex = gl_WorkGroupID.x * 8; 18 | 19 | for (uint x = 0; x < 8; x++) { 20 | const uint i = baseIndex + x; 21 | out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale; 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /ggml/src/kompute-shaders/op_silu.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | layout(local_size_x = 1) in; 6 | 7 | layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; 8 | layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; 9 | layout(push_constant) uniform PushConstants { 10 | uint inOff; 11 | uint outOff; 12 | } pcs; 13 | 14 | void main() { 15 | const uint baseIndex = gl_WorkGroupID.x * 4; 16 | 17 | for (uint x = 0; x < 4; x++) { 18 | const uint i = baseIndex + x; 19 | const float y = in_[i + pcs.inOff]; 20 | out_[i + pcs.outOff] = y / (1.0 + exp(-y)); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /ggml/src/llamafile/sgemm.h: -------------------------------------------------------------------------------- 1 | 
#pragma once 2 | #include <stdint.h> 3 | #include <stdbool.h> 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | bool llamafile_sgemm(int64_t, int64_t, int64_t, const void *, int64_t, 9 | const void *, int64_t, void *, int64_t, int, int, 10 | int, int, int); 11 | 12 | #ifdef __cplusplus 13 | } 14 | #endif 15 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | find_package (Threads REQUIRED) 2 | 3 | set(TARGET vulkan-shaders-gen) 4 | add_executable(${TARGET} vulkan-shaders-gen.cpp) 5 | install(TARGETS ${TARGET} RUNTIME) 6 | target_compile_features(${TARGET} PRIVATE cxx_std_11) 7 | target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads) 8 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/acc.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_binary_head.comp" 5 | 6 | void main() { 7 | const uint idx = gl_GlobalInvocationID.x; 8 | if (idx >= p.ne) { 9 | return; 10 | } 11 | 12 | const uint offset = p.param3; 13 | const uint src1_i = idx - offset; 14 | const uint oz = src1_i / p.nb02; 15 | const uint oy = (src1_i - (oz * p.nb02)) / p.nb01; 16 | const uint ox = src1_i % p.nb01; 17 | 18 | if (ox < p.ne10 && oy < p.ne11 && oz < p.ne12) { 19 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) + FLOAT_TYPE(data_b[ox + oy * p.ne10 + oz * p.ne10 * p.ne11])); 20 | } else { 21 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)])); 22 | } 23 | } 24 | 25 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/add.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_binary_head.comp" 5 | 6 | void main() { 7 | const uint idx = get_idx(); 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) + FLOAT_TYPE(data_b[src1_idx(idx)])); 14 | } 15 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/clamp.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_unary_head.comp" 5 | 6 | void main() { 7 | const uint idx = get_idx(); 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]); 14 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? 
p.param2 : val)); 15 | } 16 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/copy.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_unary_head.comp" 5 | 6 | void main() { 7 | const uint idx = get_idx(); 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | #ifndef OPTIMIZATION_ERROR_WORKAROUND 14 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx(idx)]); 15 | #else 16 | data_d[p.d_offset + dst_idx(idx)] = data_a[src0_idx(idx)]; 17 | #endif 18 | } 19 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/cos.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_unary_head.comp" 5 | 6 | void main() { 7 | const uint idx = get_idx(); 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]); 14 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(cos(val)); 15 | } 16 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/dequant_f32.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "dequant_head.comp" 4 | 5 | layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; 6 | 7 | layout (binding = 0) readonly buffer A {float data_a[];}; 8 | layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; 9 | 10 | void main() { 11 | const uint i = gl_GlobalInvocationID.x * 16; 12 | 13 | if (i >= p.nel) { 14 | return; 15 | } 16 | 17 | [[unroll]] for (uint l = 0; l < 16; l++) { 18 | data_b[i + l] = D_TYPE(data_a[i + l]); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/dequant_head.comp: -------------------------------------------------------------------------------- 1 | #extension GL_EXT_control_flow_attributes : require 2 | #extension GL_EXT_shader_16bit_storage : require 3 | 4 | layout (push_constant) uniform parameter 5 | { 6 | uint M; 7 | uint K; 8 | uint stride_a; 9 | uint stride_b; 10 | uint nel; 11 | } p; 12 | 13 | #include "types.comp" 14 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/dequant_iq4_nl.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "dequant_head.comp" 4 | 5 | layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; 6 | 7 | layout (binding = 0) readonly buffer A {block_iq4_nl data_a[];}; 8 | layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; 9 | 10 | void main() { 11 | const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64; 12 | 13 | const uint tid = gl_LocalInvocationID.x % 64; 14 | const uint il = tid/32; 15 | const uint ir = tid%32; 16 | const uint ib = 32*i + ir; 17 | if (ib >= p.nel / 32) { 18 | return; 19 | } 20 | 21 | const uint q_idx = 8*il; 22 | const uint b_idx = 1024*i + 32*ir + q_idx; 23 | 24 | const float d = float(data_a[ib].d); 25 | 26 | [[unroll]] for (uint l = 0; l < 8; ++l) { 27 | data_b[b_idx + l + 0] = D_TYPE(d * kvalues_iq4nl[data_a[ib].qs[q_idx + l] & 0xF]); 28 | data_b[b_idx + l + 16] = D_TYPE(d * kvalues_iq4nl[data_a[ib].qs[q_idx + l] >> 4]); 29 | } 30 | } 31 | 
-------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/dequant_q4_0.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "dequant_head.comp" 4 | 5 | layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; 6 | 7 | layout (binding = 0) readonly buffer A {block_q4_0 data_a[];}; 8 | layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; 9 | 10 | void main() { 11 | const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64; 12 | 13 | const uint tid = gl_LocalInvocationID.x % 64; 14 | const uint il = tid/32; 15 | const uint ir = tid%32; 16 | const uint ib = 32*i + ir; 17 | if (ib >= p.nel / 32) { 18 | return; 19 | } 20 | 21 | const uint q_idx = 8*il; 22 | const uint b_idx = 1024*i + 32*ir + q_idx; 23 | 24 | const float d = float(data_a[ib].d); 25 | 26 | [[unroll]] for (uint l = 0; l < 8; ++l) { 27 | data_b[b_idx + l + 0] = D_TYPE(d * ((data_a[ib].qs[q_idx + l] & 0xF) - 8.0f)); 28 | data_b[b_idx + l + 16] = D_TYPE(d * ((data_a[ib].qs[q_idx + l] >> 4) - 8.0f)); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/dequant_q4_1.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "dequant_head.comp" 4 | 5 | layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; 6 | 7 | layout (binding = 0) readonly buffer A {block_q4_1 data_a[];}; 8 | layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; 9 | 10 | void main() { 11 | const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64; 12 | 13 | const uint tid = gl_LocalInvocationID.x % 64; 14 | const uint il = tid/32; 15 | const uint ir = tid%32; 16 | const uint ib = 32*i + ir; 17 | if (ib >= p.nel / 32) { 18 | return; 19 | } 20 | 21 | const uint b_idx = 1024*i + 32*ir + 8*il; 22 | 23 | const float d = float(data_a[ib].d); 24 | const float m = float(data_a[ib].m); 25 | 26 | const uint q_idx = 8*il; 27 | 28 | [[unroll]] for (uint l = 0; l < 8; ++l) { 29 | data_b[b_idx + l + 0] = D_TYPE(d * (data_a[ib].qs[q_idx + l] & 0xF) + m); 30 | data_b[b_idx + l + 16] = D_TYPE(d * (data_a[ib].qs[q_idx + l] >> 4) + m); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/dequant_q5_0.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "dequant_head.comp" 4 | 5 | layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; 6 | 7 | layout (binding = 0) readonly buffer A {block_q5_0 data_a[];}; 8 | layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; 9 | 10 | void main() { 11 | const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64; 12 | 13 | const uint tid = gl_LocalInvocationID.x % 64; 14 | const uint il = tid/32; 15 | const uint ir = tid%32; 16 | const uint ib = 32*i + ir; 17 | if (ib >= p.nel / 32) { 18 | return; 19 | } 20 | 21 | const uint b_idx = 1024*i + 32*ir + 8*il; 22 | 23 | const float d = float(data_a[ib].d); 24 | const uint qh = uint(data_a[ib].qh[1]) << 16 | data_a[ib].qh[0]; 25 | 26 | const uint q_idx = 8*il; 27 | 28 | [[unroll]] for (uint l = 0; l < 8; ++l) { 29 | const uint iqs = q_idx + l; 30 | const uint vui = uint(data_a[ib].qs[iqs]); 31 | data_b[b_idx + l + 0] = D_TYPE(d * (((vui & 0xF) | (((qh >> iqs) << 4) & 0x10)) - 16.0f)); 32 | data_b[b_idx + l + 16] = D_TYPE(d * (((vui >> 4) | ((qh >> 
(iqs + 12)) & 0x10)) - 16.0f)); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/dequant_q5_1.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "dequant_head.comp" 4 | 5 | layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; 6 | 7 | layout (binding = 0) readonly buffer A {block_q5_1 data_a[];}; 8 | layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; 9 | 10 | void main() { 11 | const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64; 12 | 13 | const uint tid = gl_LocalInvocationID.x % 64; 14 | const uint il = tid/32; 15 | const uint ir = tid%32; 16 | const uint ib = 32*i + ir; 17 | if (ib >= p.nel / 32) { 18 | return; 19 | } 20 | 21 | const uint b_idx = 1024*i + 32*ir + 8*il; 22 | 23 | const float d = float(data_a[ib].d); 24 | const float m = float(data_a[ib].m); 25 | const uint qh = data_a[ib].qh; 26 | 27 | const uint q_idx = 8*il; 28 | 29 | [[unroll]] for (uint l = 0; l < 8; ++l) { 30 | const uint iqs = q_idx + l; 31 | const uint vui = uint(data_a[ib].qs[iqs]); 32 | data_b[b_idx + l + 0] = D_TYPE(d * (((vui & 0xF) | (((qh >> iqs) << 4) & 0x10))) + m); 33 | data_b[b_idx + l + 16] = D_TYPE(d * (((vui >> 4) | ((qh >> (iqs + 12)) & 0x10))) + m); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/dequant_q8_0.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "dequant_head.comp" 4 | 5 | layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; 6 | 7 | layout (binding = 0) readonly buffer A {block_q8_0 data_a[];}; 8 | layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; 9 | 10 | void main() { 11 | const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64; 12 | 13 | const uint tid = gl_LocalInvocationID.x % 64; 14 | const uint il = tid/32; 15 | const uint ir = tid%32; 16 | const uint ib = 32*i + ir; 17 | if (ib >= p.nel / 32) { 18 | return; 19 | } 20 | 21 | const uint b_idx = 1024*i + 32*ir + 16*il; 22 | 23 | const float d = float(data_a[ib].d); 24 | 25 | const uint q_idx = 16*il; 26 | 27 | [[unroll]] for (uint l = 0; l < 16; l += 2) { 28 | data_b[b_idx + l ] = D_TYPE(d * data_a[ib].qs[q_idx + l ]); 29 | data_b[b_idx + l + 1] = D_TYPE(d * data_a[ib].qs[q_idx + l + 1]); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/diag_mask_inf.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #extension GL_EXT_shader_16bit_storage : require 4 | #extension GL_EXT_control_flow_attributes : enable 5 | 6 | layout (push_constant) uniform parameter 7 | { 8 | uint ncols; 9 | uint rows_per_channel; 10 | uint n_past; 11 | } p; 12 | 13 | #include "types.comp" 14 | 15 | layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; 16 | 17 | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; 18 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 19 | 20 | void main() { 21 | const uint col = gl_GlobalInvocationID.y; 22 | const uint row = gl_GlobalInvocationID.x; 23 | 24 | if (col >= p.ncols) { 25 | return; 26 | } 27 | 28 | const uint i = row*p.ncols + col; 29 | if (col > p.n_past + row % p.rows_per_channel) { 30 | data_d[i] = D_TYPE(uintBitsToFloat(0xFF800000)); 31 | } else { 32 | data_d[i] = D_TYPE(data_a[i]); 33 | } 34 | 
} 35 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/div.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_binary_head.comp" 5 | 6 | void main() { 7 | const uint idx = get_idx(); 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) / FLOAT_TYPE(data_b[src1_idx(idx)])); 14 | } 15 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/gelu.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "generic_head.comp" 4 | #include "types.comp" 5 | 6 | #extension GL_EXT_control_flow_attributes : enable 7 | 8 | layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; 9 | 10 | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; 11 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 12 | 13 | void main() { 14 | const float GELU_COEF_A = 0.044715f; 15 | const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; 16 | const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; 17 | 18 | if (i >= p.KX) { 19 | return; 20 | } 21 | 22 | const float xi = float(data_a[i]); 23 | const float val = SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi); 24 | data_d[i] = D_TYPE(0.5f*xi*(2.0f - 2.0f / (exp(2 * val) + 1))); 25 | } 26 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/gelu_quick.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "generic_head.comp" 4 | #include "types.comp" 5 | 6 | #extension GL_EXT_control_flow_attributes : enable 7 | 8 | layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; 9 | 10 | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; 11 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 12 | 13 | void main() { 14 | const float GELU_QUICK_COEF = -1.702f; 15 | const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; 16 | 17 | if (i >= p.KX) { 18 | return; 19 | } 20 | 21 | const float x = float(data_a[i]); 22 | data_d[i] = D_TYPE(x * (1.0f / (1.0f + exp(GELU_QUICK_COEF * x)))); 23 | } 24 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/generic_head.comp: -------------------------------------------------------------------------------- 1 | #extension GL_EXT_shader_16bit_storage : require 2 | 3 | layout (push_constant) uniform parameter 4 | { 5 | uint KX; 6 | uint KY; 7 | float param1; 8 | float param2; 9 | } p; 10 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/get_rows.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_binary_head.comp" 5 | 6 | void main() { 7 | const uint i00 = gl_GlobalInvocationID.x; 8 | const uint i10 = gl_GlobalInvocationID.y; 9 | const uint i11 = (gl_GlobalInvocationID.z)/p.ne12; 10 | const uint i12 = (gl_GlobalInvocationID.z)%p.ne12; 11 | 12 | if (i00 >= p.ne00) { 13 | return; 14 | } 15 | 16 | const uint i01 = data_b[i10*p.nb10 + i11*p.nb11 + i12*p.nb12]; 17 | 18 | const uint a_offset = i01*p.nb01 + 
i11*p.nb02 + i12*p.nb03; 19 | const uint d_offset = i10*p.nb21 + i11*p.nb22 + i12*p.nb23; 20 | 21 | #ifndef OPTIMIZATION_ERROR_WORKAROUND 22 | data_d[d_offset + i00] = D_TYPE(data_a[a_offset + i00]); 23 | #else 24 | data_d[d_offset + i00] = data_a[a_offset + i00]; 25 | #endif 26 | } 27 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/get_rows_quant.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_binary_head.comp" 5 | #include "dequant_funcs.comp" 6 | 7 | void main() { 8 | const uint i00 = (gl_GlobalInvocationID.x)*2; 9 | const uint i10 = gl_GlobalInvocationID.y; 10 | const uint i11 = (gl_GlobalInvocationID.z)/p.ne12; 11 | const uint i12 = (gl_GlobalInvocationID.z)%p.ne12; 12 | 13 | if (i00 >= p.ne00) { 14 | return; 15 | } 16 | 17 | const uint i01 = data_b[i10*p.nb10 + i11*p.nb11 + i12*p.nb12]; 18 | 19 | const uint a_offset = i01*p.nb01 + i11*p.nb02 + i12*p.nb03; 20 | const uint d_offset = i10*p.nb21 + i11*p.nb22 + i12*p.nb23; 21 | 22 | const uint ib = a_offset + i00/QUANT_K; // block index 23 | const uint iqs = (i00%QUANT_K)/QUANT_R; // quant index 24 | const uint iybs = i00 - i00%QUANT_K; // dst block start index 25 | const uint y_offset = QUANT_R == 1 ? 1 : QUANT_K/2; 26 | 27 | vec2 v = dequantize(ib, iqs, 0); 28 | 29 | data_d[d_offset + iybs + iqs ] = D_TYPE(v.x); 30 | data_d[d_offset + iybs + iqs + y_offset] = D_TYPE(v.y); 31 | } 32 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/leaky_relu.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "generic_head.comp" 4 | #include "types.comp" 5 | 6 | #extension GL_EXT_control_flow_attributes : enable 7 | 8 | layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; 9 | 10 | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; 11 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 12 | 13 | void main() { 14 | const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; 15 | 16 | if (i >= p.KX) { 17 | return; 18 | } 19 | 20 | const float val = float(data_a[i]); 21 | data_d[i] = D_TYPE(max(val, 0.0f) + min(val, 0.0f) * p.param1); 22 | } 23 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/mul.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_binary_head.comp" 5 | 6 | void main() { 7 | const uint idx = get_idx(); 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(data_b[src1_idx(idx)])); 14 | } 15 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/mul_mat_split_k_reduce.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #extension GL_EXT_control_flow_attributes : enable 4 | 5 | layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; 6 | 7 | layout (binding = 0) readonly buffer A {float data_a[];}; 8 | layout (binding = 1) writeonly buffer D {float data_d[];}; 9 | 10 | layout (push_constant) uniform parameter { 11 | uint ne; 12 | uint k_num; 13 | } p; 14 | 15 | void main() { 16 | const uint idx = 
gl_GlobalInvocationID.x; 17 | 18 | if (idx >= p.ne) { 19 | return; 20 | } 21 | 22 | float result = 0.0f; 23 | 24 | [[unroll]] for (uint i = 0; i < p.k_num; i++) { 25 | result += data_a[i * p.ne + idx]; 26 | } 27 | 28 | data_d[idx] = result; 29 | } 30 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/pad.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_unary_head.comp" 5 | 6 | void main() { 7 | const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | const uint i3 = idx / (p.ne12*p.ne11*p.ne10); 14 | const uint i3_offset = i3 * p.ne12*p.ne11*p.ne10; 15 | const uint i2 = (idx - i3_offset) / (p.ne11*p.ne10); 16 | const uint i2_offset = i2*p.ne11*p.ne10; 17 | const uint i1 = (idx - i3_offset - i2_offset) / p.ne10; 18 | const uint i0 = idx - i3_offset - i2_offset - i1*p.ne10; 19 | 20 | const uint src0_idx = i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0*p.nb00; 21 | const uint dst_idx = i3*p.nb13 + i2*p.nb12 + i1*p.nb11 + i0*p.nb10; 22 | 23 | const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03; 24 | 25 | data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : 0.0f); 26 | } 27 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/relu.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "generic_head.comp" 4 | #include "types.comp" 5 | 6 | #extension GL_EXT_control_flow_attributes : enable 7 | 8 | layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; 9 | 10 | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; 11 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 12 | 13 | void main() { 14 | const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; 15 | 16 | if (i >= p.KX) { 17 | return; 18 | } 19 | 20 | data_d[i] = max(float(data_a[i]), 0); 21 | } 22 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/repeat.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_unary_head.comp" 5 | 6 | uint src0_idx_mod(uint idx) { 7 | const uint i13 = idx / (p.ne12*p.ne11*p.ne10); 8 | const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10; 9 | const uint i12 = (idx - i13_offset) / (p.ne11*p.ne10); 10 | const uint i12_offset = i12*p.ne11*p.ne10; 11 | const uint i11 = (idx - i13_offset - i12_offset) / p.ne10; 12 | const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10; 13 | return (i13 % p.ne03)*p.nb03 + (i12 % p.ne02)*p.nb02 + (i11 % p.ne01)*p.nb01 + (i10 % p.ne00)*p.nb00; 14 | } 15 | 16 | void main() { 17 | const uint idx = get_idx(); 18 | 19 | if (idx >= p.ne) { 20 | return; 21 | } 22 | 23 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx_mod(idx)]); 24 | } 25 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/rope_neox.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "rope_head.comp" 4 | 5 | void main() { 6 | const uint col = gl_GlobalInvocationID.y * 2; 7 | const uint row = gl_GlobalInvocationID.x; 8 | 9 | if (col >= 
p.ncols) { 10 | return; 11 | } 12 | 13 | if (col >= p.n_dims) { 14 | const uint i = row*p.ncols + col; 15 | 16 | data_d[i + 0] = data_a[i + 0]; 17 | data_d[i + 1] = data_a[i + 1]; 18 | 19 | return; 20 | } 21 | 22 | const uint i = row*p.ncols + col/2; 23 | const uint i2 = row/p.p_delta_rows; 24 | 25 | const float theta_base = data_pos[i2] * pow(p.theta_scale, col/2.0f); 26 | 27 | const float freq_factor = p.has_ff != 0 ? data_ff[col/2] : 1.0f; 28 | 29 | float cos_theta, sin_theta; 30 | rope_yarn(theta_base / freq_factor, col, cos_theta, sin_theta); 31 | 32 | const float x0 = float(data_a[i + 0]); 33 | const float x1 = float(data_a[i + p.n_dims/2]); 34 | 35 | data_d[i + 0] = D_TYPE(x0*cos_theta - x1*sin_theta); 36 | data_d[i + p.n_dims/2] = D_TYPE(x0*sin_theta + x1*cos_theta); 37 | } 38 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/rope_norm.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "rope_head.comp" 4 | 5 | void main() { 6 | const uint col = gl_GlobalInvocationID.y * 2; 7 | const uint row = gl_GlobalInvocationID.x; 8 | 9 | if (col >= p.ncols) { 10 | return; 11 | } 12 | 13 | if (col >= p.n_dims) { 14 | const uint i = row*p.ncols + col; 15 | 16 | data_d[i + 0] = data_a[i + 0]; 17 | data_d[i + 1] = data_a[i + 1]; 18 | 19 | return; 20 | } 21 | 22 | const uint i = row*p.ncols + col; 23 | const uint i2 = row/p.p_delta_rows; 24 | 25 | const float theta_base = data_pos[i2] * pow(p.theta_scale, col/2.0f); 26 | 27 | const float freq_factor = p.has_ff != 0 ? data_ff[col/2] : 1.0f; 28 | 29 | float cos_theta, sin_theta; 30 | rope_yarn(theta_base / freq_factor, col, cos_theta, sin_theta); 31 | 32 | const float x0 = float(data_a[i + 0]); 33 | const float x1 = float(data_a[i + 1]); 34 | 35 | data_d[i + 0] = D_TYPE(x0*cos_theta - x1*sin_theta); 36 | data_d[i + 1] = D_TYPE(x0*sin_theta + x1*cos_theta); 37 | } 38 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/scale.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_unary_head.comp" 5 | 6 | void main() { 7 | const uint idx = get_idx(); 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(p.param1)); 14 | } 15 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/silu.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "generic_head.comp" 4 | #include "types.comp" 5 | 6 | #extension GL_EXT_control_flow_attributes : enable 7 | 8 | layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; 9 | 10 | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; 11 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 12 | 13 | void main() { 14 | const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; 15 | 16 | if (i >= p.KX) { 17 | return; 18 | } 19 | 20 | const float xi = float(data_a[i]); 21 | data_d[i] = D_TYPE(xi / (1.0f + exp(-xi))); 22 | } 23 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/sin.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 
4 | #include "generic_unary_head.comp" 5 | 6 | void main() { 7 | const uint idx = get_idx(); 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]); 14 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(sin(val)); 15 | } 16 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/square.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_unary_head.comp" 5 | 6 | void main() { 7 | const uint idx = get_idx(); 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]); 14 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val * val); 15 | } 16 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/sum_rows.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "generic_head.comp" 4 | #include "types.comp" 5 | 6 | #extension GL_EXT_control_flow_attributes : enable 7 | layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; 8 | 9 | layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; 10 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 11 | 12 | layout (constant_id = 0) const uint BLOCK_SIZE = 32; 13 | 14 | shared FLOAT_TYPE tmp[BLOCK_SIZE]; 15 | 16 | void main() { 17 | const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x; 18 | const uint col = gl_LocalInvocationID.x; 19 | 20 | tmp[col] = FLOAT_TYPE(0.0f); 21 | 22 | for (uint i = col; i < p.KX; i += BLOCK_SIZE) { 23 | tmp[col] += FLOAT_TYPE(data_a[row*p.KX + i]); 24 | } 25 | 26 | barrier(); 27 | [[unroll]] for (int s = int(BLOCK_SIZE) / 2; s > 0; s >>= 1) { 28 | if (col < s) { 29 | tmp[col] += tmp[col + s]; 30 | } 31 | barrier(); 32 | } 33 | 34 | if (col == 0) { 35 | data_d[row] = D_TYPE(tmp[0]); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/tanh.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "generic_head.comp" 4 | #include "types.comp" 5 | 6 | #extension GL_EXT_control_flow_attributes : enable 7 | 8 | layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; 9 | 10 | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; 11 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 12 | 13 | void main() { 14 | const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; 15 | 16 | if (i >= p.KX) { 17 | return; 18 | } 19 | 20 | data_d[i] = D_TYPE(tanh(data_a[i])); 21 | } 22 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/timestep_embedding.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #extension GL_EXT_shader_16bit_storage : require 4 | 5 | layout (push_constant) uniform parameter 6 | { 7 | uint nb1; 8 | uint dim; 9 | uint max_period; 10 | } p; 11 | 12 | #include "types.comp" 13 | 14 | #extension GL_EXT_control_flow_attributes : enable 15 | #define BLOCK_SIZE 256 16 | 17 | layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in; 18 | 19 | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; 20 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 
21 | 22 | void main() { 23 | const uint i = gl_WorkGroupID.y; 24 | const uint j = gl_GlobalInvocationID.x; 25 | const uint d_offset = i * p.nb1; 26 | 27 | if (p.dim % 2 != 0 && j == ((p.dim + 1) / 2)) { 28 | data_d[d_offset + p.dim] = 0.f; 29 | } 30 | 31 | const uint half_dim = p.dim / 2; 32 | if (j >= half_dim) { 33 | return; 34 | } 35 | 36 | const float timestep = float(data_a[i]); 37 | const float freq = float(exp(-log(p.max_period) * j / half_dim)); 38 | const float arg = timestep * freq; 39 | data_d[d_offset + j] = D_TYPE(cos(arg)); 40 | data_d[d_offset + j + half_dim] = D_TYPE(sin(arg)); 41 | } 42 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/upscale.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | layout (push_constant) uniform parameter 4 | { 5 | uint ne; uint d_offset; 6 | uint nb00; uint nb01; uint nb02; uint nb03; 7 | uint ne10; uint ne11; uint ne12; uint ne13; 8 | float sf0; float sf1; float sf2; float sf3; 9 | } p; 10 | 11 | #include "types.comp" 12 | 13 | layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; 14 | 15 | layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; 16 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 17 | 18 | void main() { 19 | const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; 20 | 21 | if (idx >= p.ne) { 22 | return; 23 | } 24 | 25 | const uint i10 = idx % p.ne10; 26 | const uint i11 = (idx / p.ne10) % p.ne11; 27 | const uint i12 = (idx / (p.ne10 * p.ne11)) % p.ne12; 28 | const uint i13 = (idx / (p.ne10 * p.ne11 * p.ne12)) % p.ne13; 29 | 30 | const uint i00 = uint(i10 / p.sf0); 31 | const uint i01 = uint(i11 / p.sf1); 32 | const uint i02 = uint(i12 / p.sf2); 33 | const uint i03 = uint(i13 / p.sf3); 34 | 35 | data_d[p.d_offset + idx] = D_TYPE(data_a[i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00]); 36 | } 37 | -------------------------------------------------------------------------------- /gguf-py/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Georgi Gerganov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /gguf-py/examples/writer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | 7 | # Necessary to load the local gguf package 8 | sys.path.insert(0, str(Path(__file__).parent.parent)) 9 | 10 | from gguf import GGUFWriter # noqa: E402 11 | 12 | 13 | # Example usage: 14 | def writer_example() -> None: 15 | # Example usage with a file 16 | gguf_writer = GGUFWriter("example.gguf", "llama") 17 | 18 | gguf_writer.add_block_count(12) 19 | gguf_writer.add_uint32("answer", 42) # Write a 32-bit integer 20 | gguf_writer.add_float32("answer_in_float", 42.0) # Write a 32-bit float 21 | gguf_writer.add_custom_alignment(64) 22 | 23 | tensor1 = np.ones((32,), dtype=np.float32) * 100.0 24 | tensor2 = np.ones((64,), dtype=np.float32) * 101.0 25 | tensor3 = np.ones((96,), dtype=np.float32) * 102.0 26 | 27 | gguf_writer.add_tensor("tensor1", tensor1) 28 | gguf_writer.add_tensor("tensor2", tensor2) 29 | gguf_writer.add_tensor("tensor3", tensor3) 30 | 31 | gguf_writer.write_header_to_file() 32 | gguf_writer.write_kv_data_to_file() 33 | gguf_writer.write_tensors_to_file() 34 | 35 | gguf_writer.close() 36 | 37 | 38 | if __name__ == '__main__': 39 | writer_example() 40 | -------------------------------------------------------------------------------- /gguf-py/gguf/__init__.py: -------------------------------------------------------------------------------- 1 | from .constants import * 2 | from .lazy import * 3 | from .gguf_reader import * 4 | from .gguf_writer import * 5 | from .quants import * 6 | from .tensor_mapping import * 7 | from .vocab import * 8 | from .utility import * 9 | from .metadata import * 10 | -------------------------------------------------------------------------------- /gguf-py/gguf/gguf.py: -------------------------------------------------------------------------------- 1 | # This file left for compatibility. If you want to use the GGUF API from Python 2 | # then don't import gguf/gguf.py directly. If you're looking for examples, see the 3 | # examples/ directory for gguf-py 4 | 5 | import importlib 6 | import sys 7 | from pathlib import Path 8 | 9 | sys.path.insert(0, str(Path(__file__).parent.parent)) 10 | 11 | # Compatibility for people trying to import gguf/gguf.py directly instead of as a package. 
12 | importlib.invalidate_caches() 13 | import gguf # noqa: E402 14 | 15 | importlib.reload(gguf) 16 | -------------------------------------------------------------------------------- /gguf-py/gguf/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/gguf-py/gguf/py.typed -------------------------------------------------------------------------------- /gguf-py/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "gguf" 3 | version = "0.10.0" 4 | description = "Read and write ML models in GGUF for GGML" 5 | authors = ["GGML "] 6 | packages = [ 7 | {include = "gguf"}, 8 | {include = "gguf/py.typed"}, 9 | {include = "scripts"}, 10 | ] 11 | readme = "README.md" 12 | homepage = "https://ggml.ai" 13 | repository = "https://github.com/ggerganov/llama.cpp" 14 | keywords = ["ggml", "gguf", "llama.cpp"] 15 | classifiers = [ 16 | "Programming Language :: Python :: 3", 17 | "License :: OSI Approved :: MIT License", 18 | "Operating System :: OS Independent", 19 | ] 20 | 21 | [tool.poetry.dependencies] 22 | python = ">=3.8" 23 | numpy = ">=1.17" 24 | tqdm = ">=4.27" 25 | pyyaml = ">=5.1" 26 | sentencepiece = ">=0.1.98,<=0.2.0" 27 | 28 | [tool.poetry.dev-dependencies] 29 | pytest = "^5.2" 30 | 31 | [build-system] 32 | requires = ["poetry-core>=1.0.0"] 33 | build-backend = "poetry.core.masonry.api" 34 | 35 | [tool.poetry.scripts] 36 | gguf-convert-endian = "scripts:gguf_convert_endian_entrypoint" 37 | gguf-dump = "scripts:gguf_dump_entrypoint" 38 | gguf-set-metadata = "scripts:gguf_set_metadata_entrypoint" 39 | gguf-new-metadata = "scripts:gguf_new_metadata_entrypoint" 40 | -------------------------------------------------------------------------------- /gguf-py/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | # pyright: reportUnusedImport=false 2 | 3 | from .gguf_convert_endian import main as gguf_convert_endian_entrypoint 4 | from .gguf_dump import main as gguf_dump_entrypoint 5 | from .gguf_set_metadata import main as gguf_set_metadata_entrypoint 6 | from .gguf_new_metadata import main as gguf_new_metadata_entrypoint 7 | -------------------------------------------------------------------------------- /gguf-py/tests/__init__.py: -------------------------------------------------------------------------------- 1 | from .test_metadata import * 2 | -------------------------------------------------------------------------------- /grammars/arithmetic.gbnf: -------------------------------------------------------------------------------- 1 | root ::= (expr "=" ws term "\n")+ 2 | expr ::= term ([-+*/] term)* 3 | term ::= ident | num | "(" ws expr ")" ws 4 | ident ::= [a-z] [a-z0-9_]* ws 5 | num ::= [0-9]+ ws 6 | ws ::= [ \t\n]* 7 | -------------------------------------------------------------------------------- /grammars/chess.gbnf: -------------------------------------------------------------------------------- 1 | # Specifies chess moves as a list in algebraic notation, using PGN conventions 2 | 3 | # Force first move to "1. ", then any 1-2 digit number after, relying on model to follow the pattern 4 | root ::= "1. " move " " move "\n" ([1-9] [0-9]? ". " move " " move "\n")+ 5 | move ::= (pawn | nonpawn | castle) [+#]? 6 | 7 | # piece type, optional file/rank, optional capture, dest file & rank 8 | nonpawn ::= [NBKQR] [a-h]? [1-8]? "x"? 
[a-h] [1-8] 9 | 10 | # optional file & capture, dest file & rank, optional promotion 11 | pawn ::= ([a-h] "x")? [a-h] [1-8] ("=" [NBKQR])? 12 | 13 | castle ::= "O-O" "-O"? 14 | -------------------------------------------------------------------------------- /grammars/japanese.gbnf: -------------------------------------------------------------------------------- 1 | # A probably incorrect grammar for Japanese 2 | root ::= jp-char+ ([ \t\n] jp-char+)* 3 | jp-char ::= hiragana | katakana | punctuation | cjk 4 | hiragana ::= [ぁ-ゟ] 5 | katakana ::= [ァ-ヿ] 6 | punctuation ::= [、-〾] 7 | cjk ::= [一-鿿] 8 | -------------------------------------------------------------------------------- /grammars/json.gbnf: -------------------------------------------------------------------------------- 1 | root ::= object 2 | value ::= object | array | string | number | ("true" | "false" | "null") ws 3 | 4 | object ::= 5 | "{" ws ( 6 | string ":" ws value 7 | ("," ws string ":" ws value)* 8 | )? "}" ws 9 | 10 | array ::= 11 | "[" ws ( 12 | value 13 | ("," ws value)* 14 | )? "]" ws 15 | 16 | string ::= 17 | "\"" ( 18 | [^"\\\x7F\x00-\x1F] | 19 | "\\" (["\\bfnrt] | "u" [0-9a-fA-F]{4}) # escapes 20 | )* "\"" ws 21 | 22 | number ::= ("-"? ([0-9] | [1-9] [0-9]{0,15})) ("." [0-9]+)? ([eE] [-+]? [0-9] [1-9]{0,15})? ws 23 | 24 | # Optional space: by convention, applied in this grammar after literal chars when allowed 25 | ws ::= | " " | "\n" [ \t]{0,20} 26 | -------------------------------------------------------------------------------- /grammars/json_arr.gbnf: -------------------------------------------------------------------------------- 1 | # This is the same as json.gbnf but we restrict whitespaces at the end of the root array 2 | # Useful for generating JSON arrays 3 | 4 | root ::= arr 5 | value ::= object | array | string | number | ("true" | "false" | "null") ws 6 | 7 | arr ::= 8 | "[\n" ws ( 9 | value 10 | (",\n" ws value)* 11 | )? "]" 12 | 13 | object ::= 14 | "{" ws ( 15 | string ":" ws value 16 | ("," ws string ":" ws value)* 17 | )? "}" ws 18 | 19 | array ::= 20 | "[" ws ( 21 | value 22 | ("," ws value)* 23 | )? "]" ws 24 | 25 | string ::= 26 | "\"" ( 27 | [^"\\\x7F\x00-\x1F] | 28 | "\\" (["\\bfnrt] | "u" [0-9a-fA-F]{4}) # escapes 29 | )* "\"" ws 30 | 31 | number ::= ("-"? ([0-9] | [1-9] [0-9]{0,15})) ("." [0-9]+)? ([eE] [-+]? [1-9] [0-9]{0,15})? 
ws 32 | 33 | # Optional space: by convention, applied in this grammar after literal chars when allowed 34 | ws ::= | " " | "\n" [ \t]{0,20} 35 | -------------------------------------------------------------------------------- /grammars/list.gbnf: -------------------------------------------------------------------------------- 1 | root ::= item+ 2 | 3 | # Excludes various line break characters 4 | item ::= "- " [^\r\n\x0b\x0c\x85\u2028\u2029]+ "\n" 5 | -------------------------------------------------------------------------------- /lang-cli-src/config.cpp: -------------------------------------------------------------------------------- 1 | #include "config.h" -------------------------------------------------------------------------------- /lang-cli-src/config.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #ifndef CONFIG_MANAGER_H 5 | #define CONFIG_MANAGER_H 6 | 7 | #define CONFIG_FILE "langcommand_config.json" 8 | #define CMD_NAME "langcommand" 9 | 10 | #endif // CONFIG_MANAGER_H 11 | -------------------------------------------------------------------------------- /lang-cli-src/file_manager.h: -------------------------------------------------------------------------------- 1 | #ifndef DOWNLOADER_H 2 | #define DOWNLOADER_H 3 | 4 | #include <string> 5 | 6 | bool download_file(const std::string& url, const std::string& file_path); 7 | void show_progress_bar(double percentage, double speed); 8 | bool file_exists(const std::string & path); 9 | bool file_is_empty(const std::string & path); 10 | bool file_create(const std::string & path); 11 | bool json_file_create(const std::string & path); 12 | #endif // DOWNLOADER_H 13 | -------------------------------------------------------------------------------- /lang-cli-src/output_parser.h: -------------------------------------------------------------------------------- 1 | #ifndef OUTPUT_PARSER_H 2 | #define OUTPUT_PARSER_H 3 | 4 | #include <string> 5 | #include <vector> 6 | 7 | std::vector<std::string> extract_suggestions(const std::string& input); 8 | 9 | size_t check_early_stop(const std::string& output_buffer); 10 | 11 | std::string colorize_text(const std::string& input); 12 | #endif // OUTPUT_PARSER_H 13 | -------------------------------------------------------------------------------- /lang-cli-src/shell_executor.h: -------------------------------------------------------------------------------- 1 | #ifndef SHELL_EXECUTOR_H 2 | #define SHELL_EXECUTOR_H 3 | 4 | #include 5 | 6 | // Declare the function prototype 7 | std::string exec_command(const std::string& cmd); 8 | void choose_edit_exec(std::vector<std::string>& output_lines); 9 | #endif // SHELL_EXECUTOR_H 10 | -------------------------------------------------------------------------------- /lang-cli-src/str_parser.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef STR_PARSER_H 3 | #define STR_PARSER_H 4 | 5 | #include <string> 6 | #include <vector> 7 | // Function declaration for extracting bash blocks from input string 8 | std::vector<std::string> extract_strs(const std::string& input, const std::string& regex_str); 9 | 10 | std::string extract_str(const std::string& input, const std::string& regex_str); 11 | 12 | 13 | size_t get_nth_delimiters(const std::string& input, const std::string& delimiter, size_t n); 14 | 15 | size_t get_num_delimiters(const std::string& input, const std::string& delimiter); 16 | 17 | std::vector<std::string> split_str(const std::string& str, char delimiter); 18 | 19 | 20 | std::string to_lower_case(const std::string& input); 21 | 22 |
#endif // STR_PARSER_H -------------------------------------------------------------------------------- /media/llama-leader.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/media/llama-leader.jpeg -------------------------------------------------------------------------------- /media/llama0-banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/media/llama0-banner.png -------------------------------------------------------------------------------- /media/llama0-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/media/llama0-logo.png -------------------------------------------------------------------------------- /media/llama1-banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/media/llama1-banner.png -------------------------------------------------------------------------------- /media/llama1-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/media/llama1-logo.png -------------------------------------------------------------------------------- /media/matmul.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/media/matmul.png -------------------------------------------------------------------------------- /models/.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | -------------------------------------------------------------------------------- /models/ggml-vocab-aquila.gguf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/models/ggml-vocab-aquila.gguf -------------------------------------------------------------------------------- /models/ggml-vocab-baichuan.gguf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/models/ggml-vocab-baichuan.gguf -------------------------------------------------------------------------------- /models/ggml-vocab-bert-bge.gguf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/models/ggml-vocab-bert-bge.gguf -------------------------------------------------------------------------------- /models/ggml-vocab-command-r.gguf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/models/ggml-vocab-command-r.gguf -------------------------------------------------------------------------------- /models/ggml-vocab-deepseek-coder.gguf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/models/ggml-vocab-deepseek-coder.gguf -------------------------------------------------------------------------------- /models/ggml-vocab-deepseek-llm.gguf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/models/ggml-vocab-deepseek-llm.gguf -------------------------------------------------------------------------------- /models/ggml-vocab-falcon.gguf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/models/ggml-vocab-falcon.gguf -------------------------------------------------------------------------------- /models/ggml-vocab-gpt-2.gguf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/models/ggml-vocab-gpt-2.gguf -------------------------------------------------------------------------------- /models/ggml-vocab-gpt-neox.gguf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/models/ggml-vocab-gpt-neox.gguf -------------------------------------------------------------------------------- /models/ggml-vocab-llama-bpe.gguf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/models/ggml-vocab-llama-bpe.gguf -------------------------------------------------------------------------------- /models/ggml-vocab-llama-spm.gguf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/models/ggml-vocab-llama-spm.gguf -------------------------------------------------------------------------------- /models/ggml-vocab-mpt.gguf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/models/ggml-vocab-mpt.gguf -------------------------------------------------------------------------------- /models/ggml-vocab-phi-3.gguf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/models/ggml-vocab-phi-3.gguf -------------------------------------------------------------------------------- /models/ggml-vocab-qwen2.gguf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/models/ggml-vocab-qwen2.gguf -------------------------------------------------------------------------------- /models/ggml-vocab-refact.gguf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/models/ggml-vocab-refact.gguf -------------------------------------------------------------------------------- /models/ggml-vocab-starcoder.gguf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/models/ggml-vocab-starcoder.gguf -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | strict = true 3 | allow_untyped_calls = true 4 | allow_untyped_defs = true 5 | allow_incomplete_defs = true 6 | disable_error_code = import-untyped 7 | warn_return_any = false 8 | -------------------------------------------------------------------------------- /output.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/output.gif -------------------------------------------------------------------------------- /pocs/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # dependencies 2 | 3 | find_package(Threads REQUIRED) 4 | 5 | # third-party 6 | 7 | include_directories(${CMAKE_CURRENT_SOURCE_DIR}) 8 | 9 | if (EMSCRIPTEN) 10 | else() 11 | add_subdirectory(vdot) 12 | endif() 13 | -------------------------------------------------------------------------------- /pocs/vdot/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(TARGET llama-vdot) 2 | add_executable(${TARGET} vdot.cpp) 3 | target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) 4 | target_compile_features(${TARGET} PRIVATE cxx_std_11) 5 | 6 | set(TARGET llama-q8dot) 7 | add_executable(${TARGET} q8dot.cpp) 8 | target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) 9 | target_compile_features(${TARGET} PRIVATE cxx_std_11) 10 | -------------------------------------------------------------------------------- /prompts/alpaca.txt: -------------------------------------------------------------------------------- 1 | Below is an instruction that describes a task. Write a response that appropriately completes the request. 2 | -------------------------------------------------------------------------------- /prompts/chat-with-baichuan.txt: -------------------------------------------------------------------------------- 1 | 以下内容为人类用户与与一位智能助手的对话。 2 | 3 | 用户:你好! 4 | 助手: 5 | -------------------------------------------------------------------------------- /prompts/chat-with-bob.txt: -------------------------------------------------------------------------------- 1 | Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision. 2 | 3 | User: Hello, Bob. 4 | Bob: Hello. How may I help you today? 5 | User: Please tell me the largest city in Europe. 6 | Bob: Sure. The largest city in Europe is Moscow, the capital of Russia. 7 | User: -------------------------------------------------------------------------------- /prompts/chat-with-qwen.txt: -------------------------------------------------------------------------------- 1 | You are a helpful assistant. -------------------------------------------------------------------------------- /prompts/chat-with-vicuna-v0.txt: -------------------------------------------------------------------------------- 1 | A chat between a curious human ("[[USER_NAME]]") and an artificial intelligence assistant ("[[AI_NAME]]"). 
The assistant gives helpful, detailed, and polite answers to the human's questions. 2 | 3 | ### [[USER_NAME]]: Hello, [[AI_NAME]]. 4 | ### [[AI_NAME]]: Hello. How may I help you today? 5 | ### [[USER_NAME]]: Please tell me the largest city in Europe. 6 | ### [[AI_NAME]]: Sure. The largest city in Europe is Moscow, the capital of Russia. 7 | ### [[USER_NAME]]: 8 | -------------------------------------------------------------------------------- /prompts/chat-with-vicuna-v1.txt: -------------------------------------------------------------------------------- 1 | A chat between a curious human ("[[USER_NAME]]") and an artificial intelligence assistant ("[[AI_NAME]]"). The assistant gives helpful, detailed, and polite answers to the human's questions. 2 | 3 | [[USER_NAME]]: Hello, [[AI_NAME]]. 4 | [[AI_NAME]]: Hello. How may I help you today? 5 | [[USER_NAME]]: Please tell me the largest city in Europe. 6 | [[AI_NAME]]: Sure. The largest city in Europe is Moscow, the capital of Russia. 7 | [[USER_NAME]]: 8 | -------------------------------------------------------------------------------- /prompts/reason-act.txt: -------------------------------------------------------------------------------- 1 | You run in a loop of Thought, Action, Observation. 2 | At the end of the loop either Answer or restate your Thought and Action. 3 | Use Thought to describe your thoughts about the question you have been asked. 4 | Use Action to run one of these actions available to you: 5 | - calculate[python math expression] 6 | Observation will be the result of running those actions 7 | 8 | 9 | Question: What is 4 * 7 / 3? 10 | Thought: Do I need to use an action? Yes, I use calculate to do math 11 | Action: calculate[4 * 7 / 3] 12 | Observation: 9.3333333333 13 | Thought: Do I need to use an action? No, have the result 14 | Answer: The calculate tool says it is 9.3333333333 15 | Question: What is capital of france? 16 | Thought: Do I need to use an action? No, I know the answer 17 | Answer: Paris is the capital of France 18 | Question: -------------------------------------------------------------------------------- /pyrightconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extraPaths": ["gguf-py"], 3 | "pythonVersion": "3.9", 4 | "pythonPlatform": "All", 5 | "reportUnusedImport": "warning", 6 | "reportDuplicateImport": "error", 7 | "reportDeprecated": "warning", 8 | "reportUnnecessaryTypeIgnoreComment": "information", 9 | "disableBytesTypePromotions": false, // TODO: change once Python 3.12 is the minimum 10 | "executionEnvironments": [ 11 | { 12 | // TODO: make this version override work correctly 13 | "root": "gguf-py", 14 | "pythonVersion": "3.8", 15 | }, 16 | { 17 | // uses match expressions in steps.py 18 | "root": "examples/server/tests", 19 | "pythonVersion": "3.10", 20 | }, 21 | ], 22 | } 23 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # These requirements include all dependencies for all top-level python scripts 2 | # for llama.cpp. Avoid adding packages here directly. 3 | # 4 | # Package versions must stay compatible across all top-level python scripts. 
5 | # 6 | 7 | -r ./requirements/requirements-convert_legacy_llama.txt 8 | 9 | -r ./requirements/requirements-convert_hf_to_gguf.txt 10 | -r ./requirements/requirements-convert_hf_to_gguf_update.txt 11 | -r ./requirements/requirements-convert_llama_ggml_to_gguf.txt 12 | -r ./requirements/requirements-convert_lora_to_gguf.txt 13 | -------------------------------------------------------------------------------- /requirements/requirements-all.txt: -------------------------------------------------------------------------------- 1 | -r ../examples/llava/requirements.txt 2 | -r ../examples/server/bench/requirements.txt 3 | -r ../examples/server/tests/requirements.txt 4 | 5 | -r ./requirements-compare-llama-bench.txt 6 | -r ./requirements-pydantic.txt 7 | -r ./requirements-test-tokenizer-random.txt 8 | 9 | -r ./requirements-convert_hf_to_gguf.txt 10 | -r ./requirements-convert_hf_to_gguf_update.txt 11 | -r ./requirements-convert_legacy_llama.txt 12 | -r ./requirements-convert_llama_ggml_to_gguf.txt 13 | -------------------------------------------------------------------------------- /requirements/requirements-compare-llama-bench.txt: -------------------------------------------------------------------------------- 1 | tabulate~=0.9.0 2 | GitPython~=3.1.43 3 | -------------------------------------------------------------------------------- /requirements/requirements-convert_hf_to_gguf.txt: -------------------------------------------------------------------------------- 1 | -r ./requirements-convert_legacy_llama.txt 2 | --extra-index-url https://download.pytorch.org/whl/cpu 3 | torch~=2.2.1 4 | -------------------------------------------------------------------------------- /requirements/requirements-convert_hf_to_gguf_update.txt: -------------------------------------------------------------------------------- 1 | -r ./requirements-convert_legacy_llama.txt 2 | --extra-index-url https://download.pytorch.org/whl/cpu 3 | torch~=2.2.1 4 | -------------------------------------------------------------------------------- /requirements/requirements-convert_legacy_llama.txt: -------------------------------------------------------------------------------- 1 | numpy~=1.26.4 2 | sentencepiece~=0.2.0 3 | transformers>=4.45.1,<5.0.0 4 | gguf>=0.1.0 5 | protobuf>=4.21.0,<5.0.0 6 | -------------------------------------------------------------------------------- /requirements/requirements-convert_llama_ggml_to_gguf.txt: -------------------------------------------------------------------------------- 1 | -r ./requirements-convert_legacy_llama.txt 2 | -------------------------------------------------------------------------------- /requirements/requirements-convert_lora_to_gguf.txt: -------------------------------------------------------------------------------- 1 | -r ./requirements-convert_hf_to_gguf.txt 2 | --extra-index-url https://download.pytorch.org/whl/cpu 3 | -------------------------------------------------------------------------------- /requirements/requirements-pydantic.txt: -------------------------------------------------------------------------------- 1 | docstring_parser~=0.15 2 | pydantic~=2.6.3 3 | requests 4 | -------------------------------------------------------------------------------- /requirements/requirements-test-tokenizer-random.txt: -------------------------------------------------------------------------------- 1 | cffi~=1.16.0 2 | -------------------------------------------------------------------------------- /scripts/build-info.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | CC=$1 4 | 5 | build_number="0" 6 | build_commit="unknown" 7 | build_compiler="unknown" 8 | build_target="unknown" 9 | 10 | if out=$(git rev-list --count HEAD); then 11 | # git is broken on WSL so we need to strip extra newlines 12 | build_number=$(printf '%s' "$out" | tr -d '\n') 13 | fi 14 | 15 | if out=$(git rev-parse --short HEAD); then 16 | build_commit=$(printf '%s' "$out" | tr -d '\n') 17 | fi 18 | 19 | if out=$($CC --version | head -1); then 20 | build_compiler=$out 21 | fi 22 | 23 | if out=$($CC -dumpmachine); then 24 | build_target=$out 25 | fi 26 | 27 | echo "int LLAMA_BUILD_NUMBER = ${build_number};" 28 | echo "char const *LLAMA_COMMIT = \"${build_commit}\";" 29 | echo "char const *LLAMA_COMPILER = \"${build_compiler}\";" 30 | echo "char const *LLAMA_BUILD_TARGET = \"${build_target}\";" 31 | -------------------------------------------------------------------------------- /scripts/compare-commits.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -lt 2 ]; then 4 | echo "usage: ./scripts/compare-commits.sh <commit1> <commit2> [additional llama-bench arguments]" 5 | exit 1 6 | fi 7 | 8 | set -e 9 | set -x 10 | 11 | # verify at the start that the compare script has all the necessary dependencies installed 12 | ./scripts/compare-llama-bench.py --check 13 | 14 | bench_args="${@:3}" 15 | 16 | rm -f llama-bench.sqlite > /dev/null 17 | 18 | # to test a backend, call the script with the corresponding environment variable (e.g. GGML_CUDA=1 ./scripts/compare-commits.sh ...) 19 | 20 | git checkout $1 > /dev/null 21 | make clean > /dev/null 22 | make -j$(nproc) $make_opts llama-bench > /dev/null 23 | ./llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite 24 | 25 | git checkout $2 > /dev/null 26 | make clean > /dev/null 27 | make -j$(nproc) $make_opts llama-bench > /dev/null 28 | ./llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite 29 | 30 | ./scripts/compare-llama-bench.py -b $1 -c $2 31 | -------------------------------------------------------------------------------- /scripts/gen-authors.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | printf "# date: $(date)\n" > AUTHORS 4 | printf "# this file is auto-generated by scripts/gen-authors.sh\n\n" >> AUTHORS 5 | 6 | git log --format='%an <%ae>' --reverse --date=short master | awk '!seen[$0]++' | sort >> AUTHORS 7 | 8 | # if necessary, update your name here. for example: jdoe -> John Doe 9 | sed -i '' 's/^jdoe/John Doe/g' AUTHORS 10 | -------------------------------------------------------------------------------- /scripts/get-flags.mk: -------------------------------------------------------------------------------- 1 | ifeq '' '$(findstring clang,$(shell $(GF_CC) --version))' 2 | GF_CC_IS_GCC = 1 3 | GF_CC_VER := $(shell { $(GF_CC) -dumpfullversion 2>/dev/null; echo; $(GF_CC) -dumpversion; } | awk -F. '/./ { printf("%02d%02d%02d", $$1, $$2, $$3); exit }') 4 | else 5 | GF_CC_IS_CLANG = 1 6 | ifeq '' '$(findstring Apple,$(shell $(GF_CC) --version))' 7 | GF_CC_IS_LLVM_CLANG = 1 8 | else 9 | GF_CC_IS_APPLE_CLANG = 1 10 | endif 11 | GF_CC_VER := \ 12 | $(shell $(GF_CC) --version | sed -n 's/^.* version \([0-9.]*\).*$$/\1/p' \ 13 | | awk -F.
'{ printf("%02d%02d%02d", $$1, $$2, $$3) }') 14 | endif 15 | 16 | ifeq ($(GF_CC_IS_CLANG), 1) 17 | # clang options 18 | GF_CFLAGS = -Wunreachable-code-break -Wunreachable-code-return 19 | GF_CXXFLAGS = -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi 20 | 21 | ifneq '' '$(and $(GF_CC_IS_LLVM_CLANG),$(filter 1,$(shell expr $(GF_CC_VER) \>= 030800)))' 22 | GF_CFLAGS += -Wdouble-promotion 23 | endif 24 | ifneq '' '$(and $(GF_CC_IS_APPLE_CLANG),$(filter 1,$(shell expr $(GF_CC_VER) \>= 070300)))' 25 | GF_CFLAGS += -Wdouble-promotion 26 | endif 27 | else 28 | # gcc options 29 | GF_CFLAGS = -Wdouble-promotion 30 | GF_CXXFLAGS = -Wno-array-bounds 31 | 32 | ifeq ($(shell expr $(GF_CC_VER) \>= 070100), 1) 33 | GF_CXXFLAGS += -Wno-format-truncation 34 | endif 35 | ifeq ($(shell expr $(GF_CC_VER) \>= 080100), 1) 36 | GF_CXXFLAGS += -Wextra-semi 37 | endif 38 | endif 39 | -------------------------------------------------------------------------------- /scripts/get-hellaswag.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | wget https://raw.githubusercontent.com/klosax/hellaswag_text_data/main/hellaswag_val_full.txt 4 | 5 | echo "Usage:" 6 | echo "" 7 | echo " ./llama-perplexity -m model.gguf -f hellaswag_val_full.txt --hellaswag [--hellaswag-tasks N] [other params]" 8 | echo "" 9 | 10 | exit 0 11 | -------------------------------------------------------------------------------- /scripts/get-wikitext-103.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip 4 | 5 | echo "Usage:" 6 | echo "" 7 | echo " ./llama-perplexity -m model.gguf -f wiki.test.raw [other params]" 8 | echo "" 9 | 10 | exit 0 11 | -------------------------------------------------------------------------------- /scripts/get-wikitext-2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | wget https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip 4 | unzip wikitext-2-raw-v1.zip 5 | 6 | echo "Usage:" 7 | echo "" 8 | echo " ./llama-perplexity -m model.gguf -f wikitext-2-raw/wiki.test.raw [other params]" 9 | echo "" 10 | 11 | exit 0 12 | -------------------------------------------------------------------------------- /scripts/get-winogrande.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | wget https://huggingface.co/datasets/ikawrakow/winogrande-eval-for-llama.cpp/raw/main/winogrande-debiased-eval.csv 4 | 5 | echo "Usage:" 6 | echo "" 7 | echo " ./llama-perplexity -m model.gguf -f winogrande-debiased-eval.csv --winogrande [--winogrande-tasks N] [other params]" 8 | echo "" 9 | 10 | exit 0 11 | -------------------------------------------------------------------------------- /scripts/install-oneapi.bat: -------------------------------------------------------------------------------- 1 | :: MIT license 2 | :: Copyright (C) 2024 Intel Corporation 3 | :: SPDX-License-Identifier: MIT 4 | 5 | 6 | set URL=%1 7 | set COMPONENTS=%2 8 | 9 | curl.exe --output %TEMP%\webimage.exe --url %URL% --retry 5 --retry-delay 5 10 | start /b /wait %TEMP%\webimage.exe -s -x -f webimage_extracted --log extract.log 11 | del %TEMP%\webimage.exe 12 | if "%COMPONENTS%"=="" ( 13 | webimage_extracted\bootstrapper.exe -s --action install --eula=accept -p=NEED_VS2017_INTEGRATION=0 
-p=NEED_VS2019_INTEGRATION=0 -p=NEED_VS2022_INTEGRATION=0 --log-dir=. 14 | ) else ( 15 | webimage_extracted\bootstrapper.exe -s --action install --components=%COMPONENTS% --eula=accept -p=NEED_VS2017_INTEGRATION=0 -p=NEED_VS2019_INTEGRATION=0 -p=NEED_VS2022_INTEGRATION=0 --log-dir=. 16 | ) 17 | set installer_exit_code=%ERRORLEVEL% 18 | rd /s/q "webimage_extracted" 19 | exit /b %installer_exit_code% 20 | -------------------------------------------------------------------------------- /scripts/qnt-all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | qnt=(q8_0 q6_k q5_k q5_1 q5_0 q4_k q4_1 q4_0 q3_k q2_k) 4 | args="" 5 | 6 | if [ -z "$1" ]; then 7 | echo "usage: $0 <model> [qnt] [args]" 8 | echo "default: $0 \"${qnt[@]}\" \"${args}\"" 9 | exit 1 10 | fi 11 | 12 | if [ ! -z "$2" ]; then 13 | qnt=($2) 14 | fi 15 | 16 | if [ ! -z "$3" ]; then 17 | args="$3" 18 | fi 19 | 20 | model="$1" 21 | out="../tmp/results-${model}" 22 | 23 | set -o pipefail 24 | set -e 25 | 26 | mkdir -p ${out} 27 | 28 | for q in ${qnt[@]}; do 29 | time ./bin/llama-quantize ../models/${model}/ggml-model-f16.gguf ../models/${model}/ggml-model-${q}.gguf ${q} 2>&1 ${args} | tee ${out}/qnt-${q}.txt 30 | done 31 | -------------------------------------------------------------------------------- /scripts/run-all-perf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | qnt=(f16 q8_0 q6_k q5_k q5_1 q5_0 q4_k q4_1 q4_0 q3_k q2_k) 4 | args="-ngl 999 -n 64 -p 512" 5 | 6 | if [ -z "$1" ]; then 7 | echo "usage: $0 <model> [qnt] [args]" 8 | echo "default: $0 \"${qnt[@]}\" \"${args}\"" 9 | exit 1 10 | fi 11 | 12 | if [ ! -z "$2" ]; then 13 | qnt=($2) 14 | fi 15 | 16 | if [ ! -z "$3" ]; then 17 | args="$3" 18 | fi 19 | 20 | model="$1" 21 | out="../tmp/results-${model}" 22 | 23 | set -o pipefail 24 | set -e 25 | 26 | mkdir -p ${out} 27 | 28 | mstr="" 29 | 30 | for q in ${qnt[@]}; do 31 | mstr="${mstr} -m ../models/${model}/ggml-model-${q}.gguf" 32 | done 33 | 34 | ./bin/llama-bench ${mstr} ${args} 2> /dev/null 35 | -------------------------------------------------------------------------------- /scripts/run-all-ppl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | qnt=(f16 q8_0 q6_k q5_k q5_1 q5_0 q4_k q4_1 q4_0 q3_k q2_k) 4 | args="-ngl 999 -t 8" 5 | 6 | if [ -z "$1" ]; then 7 | echo "usage: $0 <model> [qnt] [args]" 8 | echo "default: $0 \"${qnt[@]}\" \"${args}\"" 9 | exit 1 10 | fi 11 | 12 | if [ ! -z "$2" ]; then 13 | qnt=($2) 14 | fi 15 | 16 | if [ !
-z "$3" ]; then 17 | args="$3" 18 | fi 19 | 20 | set -o pipefail 21 | set -e 22 | 23 | model="$1" 24 | out="../tmp/results-${model}" 25 | 26 | mkdir -p ${out} 27 | 28 | for q in ${qnt[@]}; do 29 | time ./bin/llama-perplexity -m ../models/${model}/ggml-model-${q}.gguf -f ./wiki.test.raw ${args} 2>&1 | tee ${out}/ppl-${q}.txt 30 | done 31 | -------------------------------------------------------------------------------- /scripts/sync-ggml.last: -------------------------------------------------------------------------------- 1 | b77f48b1efa671e094696b99fbf566aac8c87d74 2 | -------------------------------------------------------------------------------- /scripts/xxd.cmake: -------------------------------------------------------------------------------- 1 | # CMake equivalent of `xxd -i ${INPUT} ${OUTPUT}` 2 | # Usage: cmake -DINPUT=examples/server/public/index.html -DOUTPUT=examples/server/index.html.hpp -P scripts/xxd.cmake 3 | 4 | SET(INPUT "" CACHE STRING "Input File") 5 | SET(OUTPUT "" CACHE STRING "Output File") 6 | 7 | get_filename_component(filename "${INPUT}" NAME) 8 | string(REGEX REPLACE "\\.|-" "_" name "${filename}") 9 | 10 | file(READ "${INPUT}" hex_data HEX) 11 | string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1," hex_sequence "${hex_data}") 12 | 13 | string(LENGTH ${hex_data} hex_len) 14 | math(EXPR len "${hex_len} / 2") 15 | 16 | file(WRITE "${OUTPUT}" "unsigned char ${name}[] = {${hex_sequence}};\nunsigned int ${name}_len = ${len};\n") 17 | -------------------------------------------------------------------------------- /spm-headers/ggml-alloc.h: -------------------------------------------------------------------------------- 1 | ../ggml/include/ggml-alloc.h -------------------------------------------------------------------------------- /spm-headers/ggml-backend.h: -------------------------------------------------------------------------------- 1 | ../ggml/include/ggml-backend.h -------------------------------------------------------------------------------- /spm-headers/ggml-metal.h: -------------------------------------------------------------------------------- 1 | ../ggml/include/ggml-metal.h -------------------------------------------------------------------------------- /spm-headers/ggml.h: -------------------------------------------------------------------------------- 1 | ../ggml/include/ggml.h -------------------------------------------------------------------------------- /spm-headers/llama.h: -------------------------------------------------------------------------------- 1 | ../include/llama.h -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # TODO: should not use this 2 | if (WIN32) 3 | if (BUILD_SHARED_LIBS) 4 | set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) 5 | endif() 6 | endif() 7 | 8 | # 9 | # libraries 10 | # 11 | 12 | # llama 13 | 14 | add_library(llama 15 | ../include/llama.h 16 | llama.cpp 17 | llama-vocab.cpp 18 | llama-grammar.cpp 19 | llama-sampling.cpp 20 | unicode.h 21 | unicode.cpp 22 | unicode-data.cpp 23 | ) 24 | 25 | target_include_directories(llama PUBLIC .
../include) 26 | target_compile_features (llama PUBLIC cxx_std_11) # don't bump 27 | 28 | target_link_libraries(llama PUBLIC ggml) 29 | 30 | if (BUILD_SHARED_LIBS) 31 | set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON) 32 | target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD) 33 | endif() 34 | -------------------------------------------------------------------------------- /src/llama-sampling.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | // TODO: rename llama-sampling.h/.cpp to llama-sampler.h/.cpp ? 4 | 5 | #include "llama-grammar.h" 6 | 7 | #include <vector> 8 | 9 | struct llama_vocab; 10 | struct llama_grammar; 11 | 12 | // sampler chain 13 | 14 | struct llama_sampler_chain { 15 | llama_sampler_chain_params params; 16 | 17 | std::vector<llama_sampler *> samplers; 18 | 19 | // timing 20 | 21 | mutable int64_t t_sample_us; 22 | 23 | mutable int32_t n_sample; 24 | }; 25 | 26 | struct llama_sampler * llama_sampler_init_grammar_impl( 27 | const struct llama_vocab & vocab, 28 | const char * grammar_str, 29 | const char * grammar_root); 30 | -------------------------------------------------------------------------------- /src/unicode-data.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <cstdint> 4 | #include <vector> 5 | #include <unordered_map> 6 | #include <unordered_set> 7 | 8 | struct range_nfd { 9 | uint32_t first; 10 | uint32_t last; 11 | uint32_t nfd; 12 | }; 13 | 14 | static const uint32_t MAX_CODEPOINTS = 0x110000; 15 | 16 | extern const std::initializer_list<std::pair<uint32_t, uint16_t>> unicode_ranges_flags; 17 | extern const std::unordered_set<uint32_t> unicode_set_whitespace; 18 | extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase; 19 | extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase; 20 | extern const std::initializer_list<range_nfd> unicode_ranges_nfd; 21 | -------------------------------------------------------------------------------- /tests/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !*.* 3 | *.o 4 | ggml-common.h 5 | -------------------------------------------------------------------------------- /tests/get-model.cpp: -------------------------------------------------------------------------------- 1 | #include <cstdio> 2 | #include <cstdlib> 3 | #include <cstring> 4 | 5 | #include "get-model.h" 6 | 7 | char * get_model_or_exit(int argc, char *argv[]) { 8 | char * model_path; 9 | if (argc > 1) { 10 | model_path = argv[1]; 11 | 12 | } else { 13 | model_path = getenv("LLAMACPP_TEST_MODELFILE"); 14 | if (!model_path || strlen(model_path) == 0) { 15 | fprintf(stderr, "\033[33mWARNING: No model file provided. Skipping this test.
Set LLAMACPP_TEST_MODELFILE=<valid gguf model path> to silence this warning and run this test.\n\033[0m"); 16 | exit(EXIT_SUCCESS); 17 | } 18 | } 19 | 20 | return model_path; 21 | } 22 | -------------------------------------------------------------------------------- /tests/get-model.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | char * get_model_or_exit(int, char*[]); 3 | -------------------------------------------------------------------------------- /tests/run-json-schema-to-grammar.mjs: -------------------------------------------------------------------------------- 1 | import { readFileSync } from "fs" 2 | import { SchemaConverter } from "../examples/server/public/json-schema-to-grammar.mjs" 3 | 4 | const [, , file] = process.argv 5 | const url = `file://${file}` 6 | let schema = JSON.parse(readFileSync(file, "utf8")); 7 | const converter = new SchemaConverter({}) 8 | schema = await converter.resolveRefs(schema, url) 9 | converter.visit(schema, '') 10 | console.log(converter.formatGrammar()) 11 | -------------------------------------------------------------------------------- /tests/test-autorelease.cpp: -------------------------------------------------------------------------------- 1 | // ref: https://github.com/ggerganov/llama.cpp/issues/4952#issuecomment-1892864763 2 | 3 | #include <cstdio> 4 | #include <string> 5 | #include <thread> 6 | 7 | #include "llama.h" 8 | #include "get-model.h" 9 | 10 | // This creates a new context inside a pthread and then tries to exit cleanly. 11 | int main(int argc, char ** argv) { 12 | auto * model_path = get_model_or_exit(argc, argv); 13 | 14 | std::thread([&model_path]() { 15 | llama_backend_init(); 16 | auto * model = llama_load_model_from_file(model_path, llama_model_default_params()); 17 | auto * ctx = llama_new_context_with_model(model, llama_context_default_params()); 18 | llama_free(ctx); 19 | llama_free_model(model); 20 | llama_backend_free(); 21 | }).join(); 22 | 23 | return 0; 24 | } 25 | -------------------------------------------------------------------------------- /tests/test-c.c: -------------------------------------------------------------------------------- 1 | #include "llama.h" 2 | 3 | #ifdef GGML_USE_KOMPUTE 4 | #include "ggml-kompute.h" 5 | #endif 6 | 7 | int main(void) {} 8 | -------------------------------------------------------------------------------- /tests/test-log.cpp: -------------------------------------------------------------------------------- 1 | #include "log.h" 2 | 3 | #include <cstdlib> 4 | #include <thread> 5 | 6 | int main() { 7 | const int n_thread = 8; 8 | 9 | std::thread threads[n_thread]; 10 | for (int i = 0; i < n_thread; i++) { 11 | threads[i] = std::thread([i]() { 12 | const int n_msg = 1000; 13 | 14 | for (int j = 0; j < n_msg; j++) { 15 | const int log_type = std::rand() % 4; 16 | 17 | switch (log_type) { 18 | case 0: LOG_INF("Thread %d: %d\n", i, j); break; 19 | case 1: LOG_WRN("Thread %d: %d\n", i, j); break; 20 | case 2: LOG_ERR("Thread %d: %d\n", i, j); break; 21 | case 3: LOG_DBG("Thread %d: %d\n", i, j); break; 22 | default: 23 | break; 24 | } 25 | 26 | if (rand () % 10 < 5) { 27 | gpt_log_set_timestamps(gpt_log_main(), rand() % 2); 28 | gpt_log_set_prefix (gpt_log_main(), rand() % 2); 29 | } 30 | } 31 | }); 32 | } 33 | 34 | for (int i = 0; i < n_thread; i++) { 35 | threads[i].join(); 36 | } 37 | 38 | return 0; 39 | } 40 | -------------------------------------------------------------------------------- /tests/test-model-load-cancel.cpp:
-------------------------------------------------------------------------------- 1 | #include "llama.h" 2 | #include "get-model.h" 3 | 4 | #include <cstdlib> 5 | 6 | int main(int argc, char *argv[] ) { 7 | auto * model_path = get_model_or_exit(argc, argv); 8 | auto * file = fopen(model_path, "r"); 9 | if (file == nullptr) { 10 | fprintf(stderr, "no model at '%s' found\n", model_path); 11 | return EXIT_FAILURE; 12 | } 13 | 14 | fprintf(stderr, "using '%s'\n", model_path); 15 | fclose(file); 16 | 17 | llama_backend_init(); 18 | auto params = llama_model_params{}; 19 | params.use_mmap = false; 20 | params.progress_callback = [](float progress, void * ctx){ 21 | (void) ctx; 22 | return progress > 0.50; 23 | }; 24 | auto * model = llama_load_model_from_file(model_path, params); 25 | llama_backend_free(); 26 | return model == nullptr ? EXIT_SUCCESS : EXIT_FAILURE; 27 | } 28 | -------------------------------------------------------------------------------- /tests/test-tokenizer-0.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Usage: 4 | # 5 | # test-tokenizer-0.sh <name> <input> 6 | # 7 | 8 | if [ $# -ne 2 ]; then 9 | printf "Usage: $0 <name> <input>\n" 10 | exit 1 11 | fi 12 | 13 | name=$1 14 | input=$2 15 | 16 | make -j tests/test-tokenizer-0 17 | 18 | printf "Testing %s on %s ...\n" $name $input 19 | 20 | set -e 21 | 22 | printf "Tokenizing using (py) Python AutoTokenizer ...\n" 23 | python3 ./tests/test-tokenizer-0.py ./models/tokenizers/$name --fname-tok $input > /tmp/test-tokenizer-0-$name-py.log 2>&1 24 | 25 | printf "Tokenizing using (cpp) llama.cpp ...\n" 26 | ./tests/test-tokenizer-0 ./models/ggml-vocab-$name.gguf $input > /tmp/test-tokenizer-0-$name-cpp.log 2>&1 27 | 28 | cat /tmp/test-tokenizer-0-$name-py.log | grep "tokenized in" 29 | cat /tmp/test-tokenizer-0-$name-cpp.log | grep "tokenized in" 30 | 31 | set +e 32 | 33 | diff $input.tok $input.tokcpp > /dev/null 2>&1 34 | 35 | if [ $? -eq 0 ]; then 36 | printf "Tokenization is correct!\n" 37 | else 38 | diff $input.tok $input.tokcpp | head -n 32 39 | 40 | printf "Tokenization differs!\n" 41 | fi 42 | --------------------------------------------------------------------------------
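For reference, a minimal sketch of driving the tokenizer check above end to end, using the data helper shown earlier in this listing. The vocab name llama-spm is a placeholder assumption, not something the repository mandates; any name with a matching ./models/ggml-vocab-<name>.gguf and ./models/tokenizers/<name> directory would work, and the input file is the one that scripts/get-wikitext-2.sh downloads and unzips.

# hypothetical usage sketch; "llama-spm" is a placeholder vocab name
./scripts/get-wikitext-2.sh                                    # fetches and unzips wikitext-2-raw/
./tests/test-tokenizer-0.sh llama-spm ./wikitext-2-raw/wiki.test.raw

The script then prints whether the Python AutoTokenizer output ($input.tok) and the llama.cpp output ($input.tokcpp) agree.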