├── .clang-tidy ├── .devops ├── cloud-v-pipeline ├── full-cuda.Dockerfile ├── full-rocm.Dockerfile ├── full.Dockerfile ├── llama-cli-cann.Dockerfile ├── llama-cli-cuda.Dockerfile ├── llama-cli-intel.Dockerfile ├── llama-cli-rocm.Dockerfile ├── llama-cli-vulkan.Dockerfile ├── llama-cli.Dockerfile ├── llama-cpp-cuda.srpm.spec ├── llama-cpp.srpm.spec ├── llama-server-cuda.Dockerfile ├── llama-server-intel.Dockerfile ├── llama-server-rocm.Dockerfile ├── llama-server-vulkan.Dockerfile ├── llama-server.Dockerfile ├── nix │ ├── apps.nix │ ├── devshells.nix │ ├── docker.nix │ ├── jetson-support.nix │ ├── nixpkgs-instances.nix │ ├── package-gguf-py.nix │ ├── package.nix │ ├── python-scripts.nix │ ├── scope.nix │ └── sif.nix └── tools.sh ├── .dockerignore ├── .ecrc ├── .editorconfig ├── .flake8 ├── .github ├── ISSUE_TEMPLATE │ ├── 01-bug-low.yml │ ├── 02-bug-medium.yml │ ├── 03-bug-high.yml │ ├── 04-bug-critical.yml │ ├── 05-enhancement.yml │ ├── 06-research.yml │ ├── 07-refactor.yml │ └── config.yml ├── labeler.yml ├── pull_request_template.md └── workflows │ ├── bench.yml.disabled │ ├── build.yml │ ├── close-issue.yml │ ├── docker.yml │ ├── editorconfig.yml │ ├── gguf-publish.yml │ ├── labeler.yml │ ├── nix-ci-aarch64.yml │ ├── nix-ci.yml │ ├── nix-flake-update.yml │ ├── nix-publish-flake.yml │ ├── python-check-requirements.yml │ ├── python-lint.yml │ ├── python-type-check.yml │ └── server.yml ├── .gitignore ├── .gitmodules ├── .pre-commit-config.yaml ├── AUTHORS ├── CMakeLists.txt ├── CMakePresets.json ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── Package.swift ├── README.md ├── ci ├── README.md └── run.sh ├── cmake ├── arm64-windows-llvm.cmake ├── arm64-windows-msvc.cmake ├── build-info.cmake ├── git-vars.cmake ├── llama-config.cmake.in └── llama.pc.in ├── common ├── CMakeLists.txt ├── arg.cpp ├── arg.h ├── base64.hpp ├── build-info.cpp.in ├── cmake │ └── build-info-gen-cpp.cmake ├── common.cpp ├── common.h ├── console.cpp ├── console.h ├── json-schema-to-grammar.cpp ├── json-schema-to-grammar.h ├── json.hpp ├── log.cpp ├── log.h ├── ngram-cache.cpp ├── ngram-cache.h ├── sampling.cpp ├── sampling.h ├── stb_image.h ├── train.cpp └── train.h ├── convert_hf_to_gguf.py ├── convert_hf_to_gguf_update.py ├── convert_llama_ggml_to_gguf.py ├── convert_lora_to_gguf.py ├── docs ├── android.md ├── backend │ ├── BLIS.md │ ├── CANN.md │ └── SYCL.md ├── build.md ├── development │ ├── HOWTO-add-model.md │ ├── debugging-tests.md │ ├── llama-star │ │ ├── idea-arch.key │ │ └── idea-arch.pdf │ └── token_generation_performance_tips.md ├── docker.md └── install.md ├── flake.lock ├── flake.nix ├── ggml ├── .gitignore ├── CMakeLists.txt ├── cmake │ └── FindSIMD.cmake ├── include │ ├── ggml-alloc.h │ ├── ggml-backend.h │ ├── ggml-blas.h │ ├── ggml-cann.h │ ├── ggml-cuda.h │ ├── ggml-kompute.h │ ├── ggml-metal.h │ ├── ggml-rpc.h │ ├── ggml-sycl.h │ ├── ggml-vulkan.h │ └── ggml.h └── src │ ├── CMakeLists.txt │ ├── ggml-aarch64.c │ ├── ggml-aarch64.h │ ├── ggml-alloc.c │ ├── ggml-backend-impl.h │ ├── ggml-backend.cpp │ ├── ggml-blas.cpp │ ├── ggml-cann.cpp │ ├── ggml-cann │ ├── .clang-format │ ├── Doxyfile │ ├── acl_tensor.cpp │ ├── acl_tensor.h │ ├── aclnn_ops.cpp │ ├── aclnn_ops.h │ ├── common.h │ └── kernels │ │ ├── CMakeLists.txt │ │ ├── ascendc_kernels.h │ │ ├── dup.cpp │ │ ├── get_row_f16.cpp │ │ ├── get_row_f32.cpp │ │ ├── get_row_q4_0.cpp │ │ ├── get_row_q8_0.cpp │ │ ├── quantize_f16_q8_0.cpp │ │ ├── quantize_f32_q8_0.cpp │ │ └── quantize_float_to_q4_0.cpp │ ├── ggml-common.h │ ├── ggml-cpu-impl.h │ ├── 
ggml-cuda.cu │ ├── ggml-cuda │ ├── acc.cu │ ├── acc.cuh │ ├── arange.cu │ ├── arange.cuh │ ├── argmax.cu │ ├── argmax.cuh │ ├── argsort.cu │ ├── argsort.cuh │ ├── binbcast.cu │ ├── binbcast.cuh │ ├── clamp.cu │ ├── clamp.cuh │ ├── common.cuh │ ├── concat.cu │ ├── concat.cuh │ ├── conv-transpose-1d.cu │ ├── conv-transpose-1d.cuh │ ├── convert.cu │ ├── convert.cuh │ ├── count-equal.cu │ ├── count-equal.cuh │ ├── cpy.cu │ ├── cpy.cuh │ ├── cross-entropy-loss.cu │ ├── cross-entropy-loss.cuh │ ├── dequantize.cuh │ ├── diagmask.cu │ ├── diagmask.cuh │ ├── dmmv.cu │ ├── dmmv.cuh │ ├── fattn-common.cuh │ ├── fattn-tile-f16.cu │ ├── fattn-tile-f16.cuh │ ├── fattn-tile-f32.cu │ ├── fattn-tile-f32.cuh │ ├── fattn-vec-f16.cuh │ ├── fattn-vec-f32.cuh │ ├── fattn-wmma-f16.cuh │ ├── fattn.cu │ ├── fattn.cuh │ ├── getrows.cu │ ├── getrows.cuh │ ├── im2col.cu │ ├── im2col.cuh │ ├── mma.cuh │ ├── mmq.cu │ ├── mmq.cuh │ ├── mmvq.cu │ ├── mmvq.cuh │ ├── norm.cu │ ├── norm.cuh │ ├── opt-step-adamw.cu │ ├── opt-step-adamw.cuh │ ├── out-prod.cu │ ├── out-prod.cuh │ ├── pad.cu │ ├── pad.cuh │ ├── pool2d.cu │ ├── pool2d.cuh │ ├── quantize.cu │ ├── quantize.cuh │ ├── rope.cu │ ├── rope.cuh │ ├── rwkv-wkv.cu │ ├── rwkv-wkv.cuh │ ├── scale.cu │ ├── scale.cuh │ ├── softmax.cu │ ├── softmax.cuh │ ├── sum.cu │ ├── sum.cuh │ ├── sumrows.cu │ ├── sumrows.cuh │ ├── template-instances │ │ ├── fattn-vec-f16-instance-hs128-f16-f16.cu │ │ ├── fattn-vec-f16-instance-hs128-f16-q4_0.cu │ │ ├── fattn-vec-f16-instance-hs128-f16-q4_1.cu │ │ ├── fattn-vec-f16-instance-hs128-f16-q5_0.cu │ │ ├── fattn-vec-f16-instance-hs128-f16-q5_1.cu │ │ ├── fattn-vec-f16-instance-hs128-f16-q8_0.cu │ │ ├── fattn-vec-f16-instance-hs128-q4_0-f16.cu │ │ ├── fattn-vec-f16-instance-hs128-q4_0-q4_0.cu │ │ ├── fattn-vec-f16-instance-hs128-q4_0-q4_1.cu │ │ ├── fattn-vec-f16-instance-hs128-q4_0-q5_0.cu │ │ ├── fattn-vec-f16-instance-hs128-q4_0-q5_1.cu │ │ ├── fattn-vec-f16-instance-hs128-q4_0-q8_0.cu │ │ ├── fattn-vec-f16-instance-hs128-q4_1-f16.cu │ │ ├── fattn-vec-f16-instance-hs128-q4_1-q4_0.cu │ │ ├── fattn-vec-f16-instance-hs128-q4_1-q4_1.cu │ │ ├── fattn-vec-f16-instance-hs128-q4_1-q5_0.cu │ │ ├── fattn-vec-f16-instance-hs128-q4_1-q5_1.cu │ │ ├── fattn-vec-f16-instance-hs128-q4_1-q8_0.cu │ │ ├── fattn-vec-f16-instance-hs128-q5_0-f16.cu │ │ ├── fattn-vec-f16-instance-hs128-q5_0-q4_0.cu │ │ ├── fattn-vec-f16-instance-hs128-q5_0-q4_1.cu │ │ ├── fattn-vec-f16-instance-hs128-q5_0-q5_0.cu │ │ ├── fattn-vec-f16-instance-hs128-q5_0-q5_1.cu │ │ ├── fattn-vec-f16-instance-hs128-q5_0-q8_0.cu │ │ ├── fattn-vec-f16-instance-hs128-q5_1-f16.cu │ │ ├── fattn-vec-f16-instance-hs128-q5_1-q4_0.cu │ │ ├── fattn-vec-f16-instance-hs128-q5_1-q4_1.cu │ │ ├── fattn-vec-f16-instance-hs128-q5_1-q5_0.cu │ │ ├── fattn-vec-f16-instance-hs128-q5_1-q5_1.cu │ │ ├── fattn-vec-f16-instance-hs128-q5_1-q8_0.cu │ │ ├── fattn-vec-f16-instance-hs128-q8_0-f16.cu │ │ ├── fattn-vec-f16-instance-hs128-q8_0-q4_0.cu │ │ ├── fattn-vec-f16-instance-hs128-q8_0-q4_1.cu │ │ ├── fattn-vec-f16-instance-hs128-q8_0-q5_0.cu │ │ ├── fattn-vec-f16-instance-hs128-q8_0-q5_1.cu │ │ ├── fattn-vec-f16-instance-hs128-q8_0-q8_0.cu │ │ ├── fattn-vec-f16-instance-hs256-f16-f16.cu │ │ ├── fattn-vec-f16-instance-hs64-f16-f16.cu │ │ ├── fattn-vec-f16-instance-hs64-f16-q4_0.cu │ │ ├── fattn-vec-f16-instance-hs64-f16-q4_1.cu │ │ ├── fattn-vec-f16-instance-hs64-f16-q5_0.cu │ │ ├── fattn-vec-f16-instance-hs64-f16-q5_1.cu │ │ ├── fattn-vec-f16-instance-hs64-f16-q8_0.cu │ │ ├── fattn-vec-f32-instance-hs128-f16-f16.cu │ │ ├── 
fattn-vec-f32-instance-hs128-f16-q4_0.cu │ │ ├── fattn-vec-f32-instance-hs128-f16-q4_1.cu │ │ ├── fattn-vec-f32-instance-hs128-f16-q5_0.cu │ │ ├── fattn-vec-f32-instance-hs128-f16-q5_1.cu │ │ ├── fattn-vec-f32-instance-hs128-f16-q8_0.cu │ │ ├── fattn-vec-f32-instance-hs128-q4_0-f16.cu │ │ ├── fattn-vec-f32-instance-hs128-q4_0-q4_0.cu │ │ ├── fattn-vec-f32-instance-hs128-q4_0-q4_1.cu │ │ ├── fattn-vec-f32-instance-hs128-q4_0-q5_0.cu │ │ ├── fattn-vec-f32-instance-hs128-q4_0-q5_1.cu │ │ ├── fattn-vec-f32-instance-hs128-q4_0-q8_0.cu │ │ ├── fattn-vec-f32-instance-hs128-q4_1-f16.cu │ │ ├── fattn-vec-f32-instance-hs128-q4_1-q4_0.cu │ │ ├── fattn-vec-f32-instance-hs128-q4_1-q4_1.cu │ │ ├── fattn-vec-f32-instance-hs128-q4_1-q5_0.cu │ │ ├── fattn-vec-f32-instance-hs128-q4_1-q5_1.cu │ │ ├── fattn-vec-f32-instance-hs128-q4_1-q8_0.cu │ │ ├── fattn-vec-f32-instance-hs128-q5_0-f16.cu │ │ ├── fattn-vec-f32-instance-hs128-q5_0-q4_0.cu │ │ ├── fattn-vec-f32-instance-hs128-q5_0-q4_1.cu │ │ ├── fattn-vec-f32-instance-hs128-q5_0-q5_0.cu │ │ ├── fattn-vec-f32-instance-hs128-q5_0-q5_1.cu │ │ ├── fattn-vec-f32-instance-hs128-q5_0-q8_0.cu │ │ ├── fattn-vec-f32-instance-hs128-q5_1-f16.cu │ │ ├── fattn-vec-f32-instance-hs128-q5_1-q4_0.cu │ │ ├── fattn-vec-f32-instance-hs128-q5_1-q4_1.cu │ │ ├── fattn-vec-f32-instance-hs128-q5_1-q5_0.cu │ │ ├── fattn-vec-f32-instance-hs128-q5_1-q5_1.cu │ │ ├── fattn-vec-f32-instance-hs128-q5_1-q8_0.cu │ │ ├── fattn-vec-f32-instance-hs128-q8_0-f16.cu │ │ ├── fattn-vec-f32-instance-hs128-q8_0-q4_0.cu │ │ ├── fattn-vec-f32-instance-hs128-q8_0-q4_1.cu │ │ ├── fattn-vec-f32-instance-hs128-q8_0-q5_0.cu │ │ ├── fattn-vec-f32-instance-hs128-q8_0-q5_1.cu │ │ ├── fattn-vec-f32-instance-hs128-q8_0-q8_0.cu │ │ ├── fattn-vec-f32-instance-hs256-f16-f16.cu │ │ ├── fattn-vec-f32-instance-hs64-f16-f16.cu │ │ ├── fattn-vec-f32-instance-hs64-f16-q4_0.cu │ │ ├── fattn-vec-f32-instance-hs64-f16-q4_1.cu │ │ ├── fattn-vec-f32-instance-hs64-f16-q5_0.cu │ │ ├── fattn-vec-f32-instance-hs64-f16-q5_1.cu │ │ ├── fattn-vec-f32-instance-hs64-f16-q8_0.cu │ │ ├── fattn-wmma-f16-instance-kqfloat-cpb16.cu │ │ ├── fattn-wmma-f16-instance-kqfloat-cpb32.cu │ │ ├── fattn-wmma-f16-instance-kqhalf-cpb16.cu │ │ ├── fattn-wmma-f16-instance-kqhalf-cpb32.cu │ │ ├── fattn-wmma-f16-instance-kqhalf-cpb8.cu │ │ ├── generate_cu_files.py │ │ ├── mmq-instance-iq1_s.cu │ │ ├── mmq-instance-iq2_s.cu │ │ ├── mmq-instance-iq2_xs.cu │ │ ├── mmq-instance-iq2_xxs.cu │ │ ├── mmq-instance-iq3_s.cu │ │ ├── mmq-instance-iq3_xxs.cu │ │ ├── mmq-instance-iq4_nl.cu │ │ ├── mmq-instance-iq4_xs.cu │ │ ├── mmq-instance-q2_k.cu │ │ ├── mmq-instance-q3_k.cu │ │ ├── mmq-instance-q4_0.cu │ │ ├── mmq-instance-q4_1.cu │ │ ├── mmq-instance-q4_k.cu │ │ ├── mmq-instance-q5_0.cu │ │ ├── mmq-instance-q5_1.cu │ │ ├── mmq-instance-q5_k.cu │ │ ├── mmq-instance-q6_k.cu │ │ └── mmq-instance-q8_0.cu │ ├── tsembd.cu │ ├── tsembd.cuh │ ├── unary.cu │ ├── unary.cuh │ ├── upscale.cu │ ├── upscale.cuh │ ├── vecdotq.cuh │ └── vendors │ │ ├── cuda.h │ │ ├── hip.h │ │ └── musa.h │ ├── ggml-impl.h │ ├── ggml-kompute.cpp │ ├── ggml-metal.m │ ├── ggml-metal.metal │ ├── ggml-quants.c │ ├── ggml-quants.h │ ├── ggml-rpc.cpp │ ├── ggml-sycl.cpp │ ├── ggml-sycl │ ├── backend.hpp │ ├── common.cpp │ ├── common.hpp │ ├── concat.cpp │ ├── concat.hpp │ ├── conv.cpp │ ├── conv.hpp │ ├── convert.cpp │ ├── convert.hpp │ ├── dequantize.hpp │ ├── dmmv.cpp │ ├── dmmv.hpp │ ├── dpct │ │ └── helper.hpp │ ├── gemm.hpp │ ├── im2col.cpp │ ├── im2col.hpp │ ├── mmq.cpp │ ├── mmq.hpp │ ├── mmvq.cpp │ 
├── mmvq.hpp │ ├── norm.cpp │ ├── norm.hpp │ ├── presets.hpp │ ├── rope.cpp │ ├── rope.hpp │ ├── softmax.cpp │ ├── softmax.hpp │ ├── tsembd.cpp │ ├── tsembd.hpp │ └── vecdotq.hpp │ ├── ggml-vulkan.cpp │ ├── ggml.c │ ├── kompute-shaders │ ├── common.comp │ ├── op_add.comp │ ├── op_addrow.comp │ ├── op_cpy_f16_f16.comp │ ├── op_cpy_f16_f32.comp │ ├── op_cpy_f32_f16.comp │ ├── op_cpy_f32_f32.comp │ ├── op_diagmask.comp │ ├── op_gelu.comp │ ├── op_getrows.comp │ ├── op_getrows_f16.comp │ ├── op_getrows_f32.comp │ ├── op_getrows_q4_0.comp │ ├── op_getrows_q4_1.comp │ ├── op_getrows_q6_k.comp │ ├── op_mul.comp │ ├── op_mul_mat_f16.comp │ ├── op_mul_mat_mat_f32.comp │ ├── op_mul_mat_q4_0.comp │ ├── op_mul_mat_q4_1.comp │ ├── op_mul_mat_q6_k.comp │ ├── op_mul_mat_q8_0.comp │ ├── op_mul_mv_q_n.comp │ ├── op_mul_mv_q_n_pre.comp │ ├── op_norm.comp │ ├── op_relu.comp │ ├── op_rmsnorm.comp │ ├── op_rope_f16.comp │ ├── op_rope_f32.comp │ ├── op_scale.comp │ ├── op_scale_8.comp │ ├── op_silu.comp │ ├── op_softmax.comp │ └── rope_common.comp │ ├── llamafile │ ├── sgemm.cpp │ └── sgemm.h │ └── vulkan-shaders │ ├── CMakeLists.txt │ ├── acc.comp │ ├── add.comp │ ├── argsort.comp │ ├── clamp.comp │ ├── concat.comp │ ├── copy.comp │ ├── cos.comp │ ├── dequant_f32.comp │ ├── dequant_funcs.comp │ ├── dequant_head.comp │ ├── dequant_iq4_nl.comp │ ├── dequant_q2_k.comp │ ├── dequant_q3_k.comp │ ├── dequant_q4_0.comp │ ├── dequant_q4_1.comp │ ├── dequant_q4_k.comp │ ├── dequant_q5_0.comp │ ├── dequant_q5_1.comp │ ├── dequant_q5_k.comp │ ├── dequant_q6_k.comp │ ├── dequant_q8_0.comp │ ├── diag_mask_inf.comp │ ├── div.comp │ ├── gelu.comp │ ├── gelu_quick.comp │ ├── generic_binary_head.comp │ ├── generic_head.comp │ ├── generic_unary_head.comp │ ├── get_rows.comp │ ├── get_rows_quant.comp │ ├── group_norm.comp │ ├── im2col.comp │ ├── leaky_relu.comp │ ├── mul.comp │ ├── mul_mat_split_k_reduce.comp │ ├── mul_mat_vec.comp │ ├── mul_mat_vec_base.comp │ ├── mul_mat_vec_nc.comp │ ├── mul_mat_vec_p021.comp │ ├── mul_mat_vec_q2_k.comp │ ├── mul_mat_vec_q3_k.comp │ ├── mul_mat_vec_q4_k.comp │ ├── mul_mat_vec_q5_k.comp │ ├── mul_mat_vec_q6_k.comp │ ├── mul_mm.comp │ ├── norm.comp │ ├── pad.comp │ ├── relu.comp │ ├── repeat.comp │ ├── rms_norm.comp │ ├── rope_head.comp │ ├── rope_neox.comp │ ├── rope_norm.comp │ ├── scale.comp │ ├── silu.comp │ ├── sin.comp │ ├── soft_max.comp │ ├── square.comp │ ├── sum_rows.comp │ ├── tanh.comp │ ├── timestep_embedding.comp │ ├── types.comp │ ├── upscale.comp │ └── vulkan-shaders-gen.cpp ├── gguf-py ├── LICENSE ├── README.md ├── examples │ ├── reader.py │ └── writer.py ├── gguf │ ├── __init__.py │ ├── constants.py │ ├── gguf.py │ ├── gguf_reader.py │ ├── gguf_writer.py │ ├── lazy.py │ ├── metadata.py │ ├── py.typed │ ├── quants.py │ ├── tensor_mapping.py │ ├── utility.py │ └── vocab.py ├── pyproject.toml ├── scripts │ ├── __init__.py │ ├── gguf_convert_endian.py │ ├── gguf_dump.py │ ├── gguf_hash.py │ ├── gguf_new_metadata.py │ └── gguf_set_metadata.py └── tests │ ├── __init__.py │ ├── test_metadata.py │ └── test_quants.py ├── grammars ├── README.md ├── arithmetic.gbnf ├── c.gbnf ├── chess.gbnf ├── japanese.gbnf ├── json.gbnf ├── json_arr.gbnf └── list.gbnf ├── include └── llama.h ├── lac.cpp ├── lang-cli-src ├── config.cpp ├── config.h ├── console_manager.cpp ├── console_manager.h ├── file_manager.cpp ├── file_manager.h ├── model_manager.cpp ├── model_manager.h ├── output_parser.cpp ├── output_parser.h ├── shell_executor.cpp ├── shell_executor.h ├── str_parser.cpp └── str_parser.h ├── media 
├── llama-leader.jpeg ├── llama0-banner.png ├── llama0-logo.png ├── llama1-banner.png ├── llama1-logo.png ├── matmul.png └── matmul.svg ├── models ├── .editorconfig ├── ggml-vocab-aquila.gguf ├── ggml-vocab-baichuan.gguf ├── ggml-vocab-bert-bge.gguf ├── ggml-vocab-bert-bge.gguf.inp ├── ggml-vocab-bert-bge.gguf.out ├── ggml-vocab-chameleon.gguf.inp ├── ggml-vocab-chameleon.gguf.out ├── ggml-vocab-command-r.gguf ├── ggml-vocab-command-r.gguf.inp ├── ggml-vocab-command-r.gguf.out ├── ggml-vocab-deepseek-coder.gguf ├── ggml-vocab-deepseek-coder.gguf.inp ├── ggml-vocab-deepseek-coder.gguf.out ├── ggml-vocab-deepseek-llm.gguf ├── ggml-vocab-deepseek-llm.gguf.inp ├── ggml-vocab-deepseek-llm.gguf.out ├── ggml-vocab-falcon.gguf ├── ggml-vocab-falcon.gguf.inp ├── ggml-vocab-falcon.gguf.out ├── ggml-vocab-gpt-2.gguf ├── ggml-vocab-gpt-2.gguf.inp ├── ggml-vocab-gpt-2.gguf.out ├── ggml-vocab-gpt-neox.gguf ├── ggml-vocab-llama-bpe.gguf ├── ggml-vocab-llama-bpe.gguf.inp ├── ggml-vocab-llama-bpe.gguf.out ├── ggml-vocab-llama-spm.gguf ├── ggml-vocab-llama-spm.gguf.inp ├── ggml-vocab-llama-spm.gguf.out ├── ggml-vocab-mpt.gguf ├── ggml-vocab-mpt.gguf.inp ├── ggml-vocab-mpt.gguf.out ├── ggml-vocab-phi-3.gguf ├── ggml-vocab-phi-3.gguf.inp ├── ggml-vocab-phi-3.gguf.out ├── ggml-vocab-qwen2.gguf ├── ggml-vocab-qwen2.gguf.inp ├── ggml-vocab-qwen2.gguf.out ├── ggml-vocab-refact.gguf ├── ggml-vocab-refact.gguf.inp ├── ggml-vocab-refact.gguf.out ├── ggml-vocab-starcoder.gguf ├── ggml-vocab-starcoder.gguf.inp └── ggml-vocab-starcoder.gguf.out ├── mypy.ini ├── output.gif ├── pocs ├── CMakeLists.txt └── vdot │ ├── CMakeLists.txt │ ├── q8dot.cpp │ └── vdot.cpp ├── poetry.lock ├── prompts ├── LLM-questions.txt ├── alpaca.txt ├── assistant.txt ├── chat-with-baichuan.txt ├── chat-with-bob.txt ├── chat-with-qwen.txt ├── chat-with-vicuna-v0.txt ├── chat-with-vicuna-v1.txt ├── chat.txt ├── dan-modified.txt ├── dan.txt ├── mnemonics.txt ├── parallel-questions.txt └── reason-act.txt ├── pyproject.toml ├── pyrightconfig.json ├── requirements.txt ├── requirements ├── requirements-all.txt ├── requirements-compare-llama-bench.txt ├── requirements-convert_hf_to_gguf.txt ├── requirements-convert_hf_to_gguf_update.txt ├── requirements-convert_legacy_llama.txt ├── requirements-convert_llama_ggml_to_gguf.txt ├── requirements-convert_lora_to_gguf.txt ├── requirements-pydantic.txt └── requirements-test-tokenizer-random.txt ├── scripts ├── build-info.sh ├── check-requirements.sh ├── ci-run.sh ├── compare-commits.sh ├── compare-llama-bench.py ├── debug-test.sh ├── gen-authors.sh ├── gen-unicode-data.py ├── get-flags.mk ├── get-hellaswag.sh ├── get-pg.sh ├── get-wikitext-103.sh ├── get-wikitext-2.sh ├── get-winogrande.sh ├── hf.sh ├── install-oneapi.bat ├── pod-llama.sh ├── qnt-all.sh ├── run-all-perf.sh ├── run-all-ppl.sh ├── run-with-preset.py ├── server-llm.sh ├── sync-ggml-am.sh ├── sync-ggml.last ├── sync-ggml.sh ├── verify-checksum-models.py └── xxd.cmake ├── spm-headers ├── ggml-alloc.h ├── ggml-backend.h ├── ggml-metal.h ├── ggml.h └── llama.h ├── src ├── CMakeLists.txt ├── llama-grammar.cpp ├── llama-grammar.h ├── llama-impl.h ├── llama-sampling.cpp ├── llama-sampling.h ├── llama-vocab.cpp ├── llama-vocab.h ├── llama.cpp ├── unicode-data.cpp ├── unicode-data.h ├── unicode.cpp └── unicode.h └── tests ├── .gitignore ├── CMakeLists.txt ├── get-model.cpp ├── get-model.h ├── run-json-schema-to-grammar.mjs ├── test-arg-parser.cpp ├── test-autorelease.cpp ├── test-backend-ops.cpp ├── test-barrier.cpp ├── test-c.c ├── test-chat-template.cpp 
├── test-double-float.cpp ├── test-grad0.cpp ├── test-grammar-integration.cpp ├── test-grammar-parser.cpp ├── test-json-schema-to-grammar.cpp ├── test-llama-grammar.cpp ├── test-log.cpp ├── test-lora-conversion-inference.sh ├── test-model-load-cancel.cpp ├── test-opt.cpp ├── test-quantize-fns.cpp ├── test-quantize-perf.cpp ├── test-rope.cpp ├── test-sampling.cpp ├── test-tokenizer-0.cpp ├── test-tokenizer-0.py ├── test-tokenizer-0.sh ├── test-tokenizer-1-bpe.cpp ├── test-tokenizer-1-spm.cpp └── test-tokenizer-random.py /.clang-tidy: -------------------------------------------------------------------------------- 1 | --- 2 | Checks: > 3 | bugprone-*, 4 | -bugprone-easily-swappable-parameters, 5 | -bugprone-implicit-widening-of-multiplication-result, 6 | -bugprone-misplaced-widening-cast, 7 | -bugprone-narrowing-conversions, 8 | readability-*, 9 | -readability-avoid-unconditional-preprocessor-if, 10 | -readability-function-cognitive-complexity, 11 | -readability-identifier-length, 12 | -readability-implicit-bool-conversion, 13 | -readability-magic-numbers, 14 | -readability-uppercase-literal-suffix, 15 | -readability-simplify-boolean-expr, 16 | clang-analyzer-*, 17 | -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling, 18 | performance-*, 19 | portability-*, 20 | misc-*, 21 | -misc-const-correctness, 22 | -misc-non-private-member-variables-in-classes, 23 | -misc-no-recursion, 24 | FormatStyle: none 25 | -------------------------------------------------------------------------------- /.devops/cloud-v-pipeline: -------------------------------------------------------------------------------- 1 | node('x86_runner1'){ // Running on x86 runner containing latest vector qemu, latest vector gcc and all the necessary libraries 2 | stage('Cleanup'){ 3 | cleanWs() // Cleaning previous CI build in workspace 4 | } 5 | stage('checkout repo'){ 6 | retry(5){ // Retry if the cloning fails due to some reason 7 | checkout scm // Clone the repo on Runner 8 | } 9 | } 10 | stage('Compiling llama.cpp'){ 11 | sh'''#!/bin/bash 12 | make RISCV=1 RISCV_CROSS_COMPILE=1 # Compiling llama for RISC-V 13 | ''' 14 | } 15 | stage('Running llama.cpp'){ 16 | sh'''#!/bin/bash 17 | module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc 18 | qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64 19 | cat llama_log.txt # Printing results 20 | ''' 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /.devops/full-cuda.Dockerfile: -------------------------------------------------------------------------------- 1 | ARG UBUNTU_VERSION=22.04 2 | # This needs to generally match the container host's environment. 3 | ARG CUDA_VERSION=12.6.0 4 | # Target the CUDA build image 5 | ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} 6 | 7 | FROM ${BASE_CUDA_DEV_CONTAINER} AS build 8 | 9 | # CUDA architecture to build for (defaults to all supported archs) 10 | ARG CUDA_DOCKER_ARCH=default 11 | 12 | RUN apt-get update && \ 13 | apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1 14 | 15 | COPY requirements.txt requirements.txt 16 | COPY requirements requirements 17 | 18 | RUN pip install --upgrade pip setuptools wheel \ 19 | && pip install -r requirements.txt 20 | 21 | WORKDIR /app 22 | 23 | COPY . . 
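# As a usage sketch (the image tag, arch value, and model path below are illustrative
# assumptions, not part of this file), this image can be built and run roughly like:
#   docker build -t local/llama.cpp:full-cuda --build-arg CUDA_DOCKER_ARCH=86 -f .devops/full-cuda.Dockerfile .
#   docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/model.gguf -p "Hello" -n 64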
24 | 25 | # Use the default CUDA archs if not specified 26 | RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \ 27 | export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \ 28 | fi && \ 29 | cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ 30 | cmake --build build --config Release -j$(nproc) && \ 31 | cp build/bin/* . 32 | 33 | ENTRYPOINT ["/app/.devops/tools.sh"] 34 | -------------------------------------------------------------------------------- /.devops/full-rocm.Dockerfile: -------------------------------------------------------------------------------- 1 | ARG UBUNTU_VERSION=22.04 2 | 3 | # This needs to generally match the container host's environment. 4 | ARG ROCM_VERSION=5.6 5 | 6 | # Target the CUDA build image 7 | ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete 8 | 9 | FROM ${BASE_ROCM_DEV_CONTAINER} AS build 10 | 11 | # Unless otherwise specified, we make a fat build. 12 | # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 13 | # This is mostly tied to rocBLAS supported archs. 14 | ARG ROCM_DOCKER_ARCH="\ 15 | gfx803 \ 16 | gfx900 \ 17 | gfx906 \ 18 | gfx908 \ 19 | gfx90a \ 20 | gfx1010 \ 21 | gfx1030 \ 22 | gfx1100 \ 23 | gfx1101 \ 24 | gfx1102" 25 | 26 | COPY requirements.txt requirements.txt 27 | COPY requirements requirements 28 | 29 | RUN pip install --upgrade pip setuptools wheel \ 30 | && pip install -r requirements.txt 31 | 32 | WORKDIR /app 33 | 34 | COPY . . 35 | 36 | # Set nvcc architecture 37 | ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH} 38 | # Enable ROCm 39 | ENV GGML_HIPBLAS=1 40 | ENV CC=/opt/rocm/llvm/bin/clang 41 | ENV CXX=/opt/rocm/llvm/bin/clang++ 42 | 43 | # Enable cURL 44 | ENV LLAMA_CURL=1 45 | RUN apt-get update && \ 46 | apt-get install -y libcurl4-openssl-dev 47 | 48 | RUN make -j$(nproc) 49 | 50 | ENTRYPOINT ["/app/.devops/tools.sh"] 51 | -------------------------------------------------------------------------------- /.devops/full.Dockerfile: -------------------------------------------------------------------------------- 1 | ARG UBUNTU_VERSION=22.04 2 | 3 | FROM ubuntu:$UBUNTU_VERSION AS build 4 | 5 | RUN apt-get update && \ 6 | apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1 7 | 8 | COPY requirements.txt requirements.txt 9 | COPY requirements requirements 10 | 11 | RUN pip install --upgrade pip setuptools wheel \ 12 | && pip install -r requirements.txt 13 | 14 | WORKDIR /app 15 | 16 | COPY . . 17 | 18 | ENV LLAMA_CURL=1 19 | 20 | 21 | RUN make -j$(nproc) 22 | 23 | ENV LC_ALL=C.utf8 24 | 25 | ENTRYPOINT ["/app/.devops/tools.sh"] 26 | -------------------------------------------------------------------------------- /.devops/llama-cli-cuda.Dockerfile: -------------------------------------------------------------------------------- 1 | ARG UBUNTU_VERSION=22.04 2 | # This needs to generally match the container host's environment. 
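# It can be overridden at build time without editing this file, e.g. (illustrative value):
#   docker build --build-arg CUDA_VERSION=12.4.0 -f .devops/llama-cli-cuda.Dockerfile .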
3 | ARG CUDA_VERSION=12.6.0 4 | # Target the CUDA build image 5 | ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} 6 | # Target the CUDA runtime image 7 | ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} 8 | 9 | FROM ${BASE_CUDA_DEV_CONTAINER} AS build 10 | 11 | # CUDA architecture to build for (defaults to all supported archs) 12 | ARG CUDA_DOCKER_ARCH=default 13 | 14 | RUN apt-get update && \ 15 | apt-get install -y build-essential git cmake 16 | 17 | WORKDIR /app 18 | 19 | COPY . . 20 | 21 | # Use the default CUDA archs if not specified 22 | RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \ 23 | export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \ 24 | fi && \ 25 | cmake -B build -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ 26 | cmake --build build --config Release --target llama-cli -j$(nproc) 27 | 28 | FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime 29 | 30 | RUN apt-get update && \ 31 | apt-get install -y libgomp1 32 | 33 | COPY --from=build /app/build/ggml/src/libggml.so /libggml.so 34 | COPY --from=build /app/build/src/libllama.so /libllama.so 35 | COPY --from=build /app/build/bin/llama-cli /llama-cli 36 | 37 | ENTRYPOINT [ "/llama-cli" ] 38 | -------------------------------------------------------------------------------- /.devops/llama-cli-intel.Dockerfile: -------------------------------------------------------------------------------- 1 | ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04 2 | 3 | FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build 4 | 5 | ARG GGML_SYCL_F16=OFF 6 | RUN apt-get update && \ 7 | apt-get install -y git 8 | 9 | WORKDIR /app 10 | 11 | COPY . . 12 | 13 | RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \ 14 | echo "GGML_SYCL_F16 is set" && \ 15 | export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \ 16 | fi && \ 17 | echo "Building with static libs" && \ 18 | cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \ 19 | ${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \ 20 | cmake --build build --config Release --target llama-cli 21 | 22 | FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime 23 | 24 | COPY --from=build /app/build/bin/llama-cli /llama-cli 25 | 26 | ENV LC_ALL=C.utf8 27 | 28 | ENTRYPOINT [ "/llama-cli" ] 29 | -------------------------------------------------------------------------------- /.devops/llama-cli-rocm.Dockerfile: -------------------------------------------------------------------------------- 1 | ARG UBUNTU_VERSION=22.04 2 | 3 | # This needs to generally match the container host's environment. 4 | ARG ROCM_VERSION=5.6 5 | 6 | # Target the CUDA build image 7 | ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete 8 | 9 | FROM ${BASE_ROCM_DEV_CONTAINER} AS build 10 | 11 | # Unless otherwise specified, we make a fat build. 12 | # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 13 | # This is mostly tied to rocBLAS supported archs. 14 | ARG ROCM_DOCKER_ARCH="\ 15 | gfx803 \ 16 | gfx900 \ 17 | gfx906 \ 18 | gfx908 \ 19 | gfx90a \ 20 | gfx1010 \ 21 | gfx1030 \ 22 | gfx1100 \ 23 | gfx1101 \ 24 | gfx1102" 25 | 26 | COPY requirements.txt requirements.txt 27 | COPY requirements requirements 28 | 29 | RUN pip install --upgrade pip setuptools wheel \ 30 | && pip install -r requirements.txt 31 | 32 | WORKDIR /app 33 | 34 | COPY . . 
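# To target a single GPU instead of the fat arch list above, the build arg can be
# narrowed at build time, e.g. (illustrative value):
#   docker build --build-arg ROCM_DOCKER_ARCH=gfx1030 -f .devops/llama-cli-rocm.Dockerfile .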
35 | 36 | # Set nvcc architecture 37 | ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH} 38 | # Enable ROCm 39 | ENV GGML_HIPBLAS=1 40 | ENV CC=/opt/rocm/llvm/bin/clang 41 | ENV CXX=/opt/rocm/llvm/bin/clang++ 42 | 43 | RUN make -j$(nproc) llama-cli 44 | 45 | ENTRYPOINT [ "/app/llama-cli" ] 46 | -------------------------------------------------------------------------------- /.devops/llama-cli-vulkan.Dockerfile: -------------------------------------------------------------------------------- 1 | ARG UBUNTU_VERSION=jammy 2 | 3 | FROM ubuntu:$UBUNTU_VERSION AS build 4 | 5 | # Install build tools 6 | RUN apt update && apt install -y git build-essential cmake wget libgomp1 7 | 8 | # Install Vulkan SDK 9 | RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \ 10 | wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \ 11 | apt update -y && \ 12 | apt-get install -y vulkan-sdk 13 | 14 | # Build it 15 | WORKDIR /app 16 | COPY . . 17 | RUN cmake -B build -DGGML_VULKAN=1 && \ 18 | cmake --build build --config Release --target llama-cli 19 | 20 | # Clean up 21 | WORKDIR / 22 | RUN cp /app/build/bin/llama-cli /llama-cli && \ 23 | rm -rf /app 24 | 25 | ENV LC_ALL=C.utf8 26 | 27 | ENTRYPOINT [ "/llama-cli" ] 28 | -------------------------------------------------------------------------------- /.devops/llama-cli.Dockerfile: -------------------------------------------------------------------------------- 1 | ARG UBUNTU_VERSION=22.04 2 | 3 | FROM ubuntu:$UBUNTU_VERSION AS build 4 | 5 | RUN apt-get update && \ 6 | apt-get install -y build-essential git 7 | 8 | WORKDIR /app 9 | 10 | COPY . . 11 | 12 | RUN make -j$(nproc) llama-cli 13 | 14 | FROM ubuntu:$UBUNTU_VERSION AS runtime 15 | 16 | RUN apt-get update && \ 17 | apt-get install -y libgomp1 18 | 19 | COPY --from=build /app/llama-cli /llama-cli 20 | 21 | ENV LC_ALL=C.utf8 22 | 23 | ENTRYPOINT [ "/llama-cli" ] 24 | -------------------------------------------------------------------------------- /.devops/llama-server-intel.Dockerfile: -------------------------------------------------------------------------------- 1 | ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04 2 | 3 | FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build 4 | 5 | ARG GGML_SYCL_F16=OFF 6 | RUN apt-get update && \ 7 | apt-get install -y git libcurl4-openssl-dev 8 | 9 | WORKDIR /app 10 | 11 | COPY . . 
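# FP16 SYCL kernels are opt-in via the GGML_SYCL_F16 build arg declared above,
# e.g. (illustrative): docker build --build-arg GGML_SYCL_F16=ON -f .devops/llama-server-intel.Dockerfile .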
12 | 13 | RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \ 14 | echo "GGML_SYCL_F16 is set" && \ 15 | export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \ 16 | fi && \ 17 | echo "Building with dynamic libs" && \ 18 | cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \ 19 | cmake --build build --config Release --target llama-server 20 | 21 | FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime 22 | 23 | RUN apt-get update && \ 24 | apt-get install -y libcurl4-openssl-dev curl 25 | 26 | COPY --from=build /app/build/bin/llama-server /llama-server 27 | 28 | ENV LC_ALL=C.utf8 29 | # Must be set to 0.0.0.0 so it can listen to requests from host machine 30 | ENV LLAMA_ARG_HOST=0.0.0.0 31 | 32 | HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] 33 | 34 | ENTRYPOINT [ "/llama-server" ] 35 | -------------------------------------------------------------------------------- /.devops/llama-server-vulkan.Dockerfile: -------------------------------------------------------------------------------- 1 | ARG UBUNTU_VERSION=jammy 2 | 3 | FROM ubuntu:$UBUNTU_VERSION AS build 4 | 5 | # Install build tools 6 | RUN apt update && apt install -y git build-essential cmake wget 7 | 8 | # Install Vulkan SDK and cURL 9 | RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \ 10 | wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \ 11 | apt update -y && \ 12 | apt-get install -y vulkan-sdk libcurl4-openssl-dev curl 13 | 14 | # Build it 15 | WORKDIR /app 16 | COPY . . 17 | RUN cmake -B build -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \ 18 | cmake --build build --config Release --target llama-server 19 | 20 | # Clean up 21 | WORKDIR / 22 | RUN cp /app/build/bin/llama-server /llama-server && \ 23 | rm -rf /app 24 | 25 | ENV LC_ALL=C.utf8 26 | # Must be set to 0.0.0.0 so it can listen to requests from host machine 27 | ENV LLAMA_ARG_HOST=0.0.0.0 28 | 29 | HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] 30 | 31 | ENTRYPOINT [ "/llama-server" ] 32 | -------------------------------------------------------------------------------- /.devops/llama-server.Dockerfile: -------------------------------------------------------------------------------- 1 | ARG UBUNTU_VERSION=22.04 2 | 3 | FROM ubuntu:$UBUNTU_VERSION AS build 4 | 5 | RUN apt-get update && \ 6 | apt-get install -y build-essential git libcurl4-openssl-dev 7 | 8 | WORKDIR /app 9 | 10 | COPY . . 11 | 12 | ENV LLAMA_CURL=1 13 | 14 | RUN make -j$(nproc) llama-server 15 | 16 | FROM ubuntu:$UBUNTU_VERSION AS runtime 17 | 18 | RUN apt-get update && \ 19 | apt-get install -y libcurl4-openssl-dev libgomp1 curl 20 | 21 | COPY --from=build /app/llama-server /llama-server 22 | 23 | ENV LC_ALL=C.utf8 24 | # Must be set to 0.0.0.0 so it can listen to requests from host machine 25 | ENV LLAMA_ARG_HOST=0.0.0.0 26 | 27 | HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] 28 | 29 | ENTRYPOINT [ "/llama-server" ] 30 | -------------------------------------------------------------------------------- /.devops/nix/apps.nix: -------------------------------------------------------------------------------- 1 | { 2 | perSystem = 3 | { config, lib, ... 
}: 4 | { 5 | apps = 6 | let 7 | inherit (config.packages) default; 8 | binaries = [ 9 | "llama-cli" 10 | "llama-embedding" 11 | "llama-server" 12 | "llama-quantize" 13 | ]; 14 | mkApp = name: { 15 | type = "app"; 16 | program = "${default}/bin/${name}"; 17 | }; 18 | in 19 | lib.genAttrs binaries mkApp; 20 | }; 21 | } 22 | -------------------------------------------------------------------------------- /.devops/nix/docker.nix: -------------------------------------------------------------------------------- 1 | { 2 | lib, 3 | dockerTools, 4 | buildEnv, 5 | llama-cpp, 6 | interactive ? true, 7 | coreutils, 8 | }: 9 | 10 | # A tar that can be fed into `docker load`: 11 | # 12 | # $ nix build .#llamaPackages.docker 13 | # $ docker load < result 14 | 15 | # For details and variations cf. 16 | # - https://nixos.org/manual/nixpkgs/unstable/#ssec-pkgs-dockerTools-buildLayeredImage 17 | # - https://discourse.nixos.org/t/a-faster-dockertools-buildimage-prototype/16922 18 | # - https://nixery.dev/ 19 | 20 | # Approximate (compressed) sizes, at the time of writing, are: 21 | # 22 | # .#llamaPackages.docker: 125M; 23 | # .#llamaPackagesCuda.docker: 537M; 24 | # .#legacyPackages.aarch64-linux.llamaPackagesXavier.docker: 415M. 25 | 26 | dockerTools.buildLayeredImage { 27 | name = llama-cpp.pname; 28 | tag = "latest"; 29 | 30 | contents = 31 | [ llama-cpp ] 32 | ++ lib.optionals interactive [ 33 | coreutils 34 | dockerTools.binSh 35 | dockerTools.caCertificates 36 | ]; 37 | } 38 | -------------------------------------------------------------------------------- /.devops/nix/jetson-support.nix: -------------------------------------------------------------------------------- 1 | { inputs, ... }: 2 | { 3 | perSystem = 4 | { 5 | config, 6 | system, 7 | lib, 8 | pkgsCuda, 9 | ... 
10 | }: 11 | { 12 | legacyPackages = 13 | let 14 | caps.llamaPackagesXavier = "7.2"; 15 | caps.llamaPackagesOrin = "8.7"; 16 | caps.llamaPackagesTX2 = "6.2"; 17 | caps.llamaPackagesNano = "5.3"; 18 | 19 | pkgsFor = 20 | cap: 21 | import inputs.nixpkgs { 22 | inherit system; 23 | config = { 24 | cudaSupport = true; 25 | cudaCapabilities = [ cap ]; 26 | cudaEnableForwardCompat = false; 27 | inherit (pkgsCuda.config) allowUnfreePredicate; 28 | }; 29 | }; 30 | in 31 | builtins.mapAttrs (name: cap: (pkgsFor cap).callPackage ./scope.nix { }) caps; 32 | 33 | packages = lib.optionalAttrs (system == "aarch64-linux") { 34 | jetson-xavier = config.legacyPackages.llamaPackagesXavier.llama-cpp; 35 | jetson-orin = config.legacyPackages.llamaPackagesOrin.llama-cpp; 36 | jetson-nano = config.legacyPackages.llamaPackagesNano.llama-cpp; 37 | }; 38 | }; 39 | } 40 | -------------------------------------------------------------------------------- /.devops/nix/package-gguf-py.nix: -------------------------------------------------------------------------------- 1 | { 2 | lib, 3 | llamaVersion, 4 | numpy, 5 | tqdm, 6 | sentencepiece, 7 | pyyaml, 8 | poetry-core, 9 | buildPythonPackage, 10 | pytestCheckHook, 11 | }: 12 | 13 | buildPythonPackage { 14 | pname = "gguf"; 15 | version = llamaVersion; 16 | pyproject = true; 17 | nativeBuildInputs = [ poetry-core ]; 18 | propagatedBuildInputs = [ 19 | numpy 20 | tqdm 21 | sentencepiece 22 | pyyaml 23 | ]; 24 | src = lib.cleanSource ../../gguf-py; 25 | pythonImportsCheck = [ 26 | "numpy" 27 | "gguf" 28 | ]; 29 | nativeCheckInputs = [ pytestCheckHook ]; 30 | doCheck = true; 31 | meta = with lib; { 32 | description = "Python package for writing binary files in the GGUF format"; 33 | license = licenses.mit; 34 | maintainers = [ maintainers.ditsuke ]; 35 | }; 36 | } 37 | -------------------------------------------------------------------------------- /.devops/nix/scope.nix: -------------------------------------------------------------------------------- 1 | { 2 | lib, 3 | newScope, 4 | python3, 5 | llamaVersion ? "0.0.0", 6 | }: 7 | 8 | let 9 | pythonPackages = python3.pkgs; 10 | buildPythonPackage = pythonPackages.buildPythonPackage; 11 | numpy = pythonPackages.numpy; 12 | tqdm = pythonPackages.tqdm; 13 | sentencepiece = pythonPackages.sentencepiece; 14 | pyyaml = pythonPackages.pyyaml; 15 | poetry-core = pythonPackages.poetry-core; 16 | pytestCheckHook = pythonPackages.pytestCheckHook; 17 | in 18 | 19 | # We're using `makeScope` instead of just writing out an attrset 20 | # because it allows users to apply overlays later using `overrideScope'`. 21 | # Cf. https://noogle.dev/f/lib/makeScope 22 | 23 | lib.makeScope newScope (self: { 24 | inherit llamaVersion; 25 | gguf-py = self.callPackage ./package-gguf-py.nix { 26 | inherit 27 | buildPythonPackage 28 | numpy 29 | tqdm 30 | sentencepiece 31 | poetry-core 32 | pyyaml 33 | pytestCheckHook 34 | ; 35 | }; 36 | python-scripts = self.callPackage ./python-scripts.nix { inherit buildPythonPackage poetry-core; }; 37 | llama-cpp = self.callPackage ./package.nix { }; 38 | docker = self.callPackage ./docker.nix { }; 39 | docker-min = self.callPackage ./docker.nix { interactive = false; }; 40 | sif = self.callPackage ./sif.nix { }; 41 | }) 42 | -------------------------------------------------------------------------------- /.devops/nix/sif.nix: -------------------------------------------------------------------------------- 1 | { 2 | lib, 3 | singularity-tools, 4 | llama-cpp, 5 | bashInteractive, 6 | interactive ? 
false, 7 | }: 8 | 9 | let 10 | optionalInt = cond: x: if cond then x else 0; 11 | in 12 | singularity-tools.buildImage rec { 13 | inherit (llama-cpp) name; 14 | contents = [ llama-cpp ] ++ lib.optionals interactive [ bashInteractive ]; 15 | 16 | # These are excessive (but safe) for most variants. Building singularity 17 | # images requires superuser privileges, so we build them inside a VM in a 18 | # writable image of pre-determined size. 19 | # 20 | # ROCm is currently affected by https://github.com/NixOS/nixpkgs/issues/276846 21 | # 22 | # Expected image sizes: 23 | # - cpu/blas: 150M, 24 | # - cuda, all gencodes: 560M, 25 | diskSize = 4096 + optionalInt llama-cpp.useRocm 16384; 26 | memSize = diskSize; 27 | } 28 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.a 3 | .cache/ 4 | # Do not ignore .git directory, otherwise the reported build number will always be 0 5 | .github/ 6 | .gitignore 7 | .vs/ 8 | .vscode/ 9 | .DS_Store 10 | 11 | build*/ 12 | 13 | models/* 14 | 15 | /llama-cli 16 | /llama-quantize 17 | 18 | arm_neon.h 19 | compile_commands.json 20 | Dockerfile 21 | -------------------------------------------------------------------------------- /.ecrc: -------------------------------------------------------------------------------- 1 | { 2 | "Exclude": ["^\\.gitmodules$", "stb_image\\.h"], 3 | "Disable": { 4 | "IndentSize": true 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # https://EditorConfig.org 2 | 3 | # Top-most EditorConfig file 4 | root = true 5 | 6 | # Unix-style newlines with a newline ending every file, utf-8 charset 7 | [*] 8 | end_of_line = lf 9 | insert_final_newline = true 10 | trim_trailing_whitespace = true 11 | charset = utf-8 12 | indent_style = space 13 | indent_size = 4 14 | 15 | [Makefile] 16 | indent_style = tab 17 | 18 | [scripts/*.mk] 19 | indent_style = tab 20 | 21 | [prompts/*.txt] 22 | insert_final_newline = unset 23 | 24 | [examples/server/public/*] 25 | indent_size = 2 26 | 27 | [examples/llama.swiftui/llama.swiftui.xcodeproj/*] 28 | indent_style = tab 29 | 30 | [examples/cvector-generator/*.txt] 31 | trim_trailing_whitespace = unset 32 | insert_final_newline = unset 33 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 125 3 | ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503 4 | exclude = 5 | # Do not traverse examples 6 | examples, 7 | # Do not include package initializers 8 | __init__.py, 9 | # No need to traverse our git directory 10 | .git, 11 | # There's no value in checking cache directories 12 | __pycache__, 13 | # No need to include the build path 14 | build, 15 | # This contains builds that we don't want to check 16 | dist # This is generated with `python build .` for package releases 17 | # max-complexity = 10 18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/07-refactor.yml: -------------------------------------------------------------------------------- 1 | name: Refactor (Maintainers) 2 | description: Used to track refactoring opportunities 3 | title: "Refactor: " 4 | labels: ["refactor"] 5 | body: 6 | - 
type: markdown 7 | attributes: 8 | value: | 9 | Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered. 10 | Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too. 11 | 12 | - type: textarea 13 | id: background-description 14 | attributes: 15 | label: Background Description 16 | description: Please provide a detailed written description of the pain points you are trying to solve. 17 | placeholder: Detailed description behind your motivation to request refactor 18 | validations: 19 | required: true 20 | 21 | - type: textarea 22 | id: possible-approaches 23 | attributes: 24 | label: Possible Refactor Approaches 25 | description: If you have some idea of possible approaches to solve this problem. You may want to make it a todo list. 26 | placeholder: Your idea of possible refactoring opportunity/approaches 27 | validations: 28 | required: false 29 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: true 2 | contact_links: 3 | - name: Got an idea? 4 | url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas 5 | about: Pop it there. It may then become an enhancement ticket. 6 | - name: Got a question? 7 | url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a 8 | about: Ask a question there! 9 | - name: Want to contribute? 10 | url: https://github.com/ggerganov/llama.cpp/wiki/contribute 11 | about: Head to the contribution guide page of the wiki for areas you can help with 12 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | - [x] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md) 4 | - Self-reported review complexity: 5 | - [ ] Low 6 | - [ ] Medium 7 | - [ ] High 8 | -------------------------------------------------------------------------------- /.github/workflows/close-issue.yml: -------------------------------------------------------------------------------- 1 | name: Close inactive issues 2 | on: 3 | schedule: 4 | - cron: "42 0 * * *" 5 | 6 | # Fine-grant permission 7 | # https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token 8 | permissions: 9 | issues: write 10 | 11 | jobs: 12 | close-issues: 13 | runs-on: ubuntu-latest 14 | permissions: 15 | issues: write 16 | pull-requests: write 17 | steps: 18 | - uses: actions/stale@v5 19 | with: 20 | exempt-issue-labels: "refactor,help wanted,good first issue,research,bug" 21 | days-before-issue-stale: 30 22 | days-before-issue-close: 14 23 | stale-issue-label: "stale" 24 | close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale." 
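# Per the actions/stale documentation, the -1 values below opt pull requests out of
# stale handling entirely, so only issues are marked stale and auto-closed by this workflow.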
25 | days-before-pr-stale: -1 26 | days-before-pr-close: -1 27 | operations-per-run: 10000 28 | repo-token: ${{ secrets.GITHUB_TOKEN }} 29 | -------------------------------------------------------------------------------- /.github/workflows/editorconfig.yml: -------------------------------------------------------------------------------- 1 | name: EditorConfig Checker 2 | 3 | on: 4 | workflow_dispatch: # allows manual triggering 5 | inputs: 6 | create_release: 7 | description: 'Create new release' 8 | required: true 9 | type: boolean 10 | push: 11 | branches: 12 | - master 13 | pull_request: 14 | branches: 15 | - master 16 | 17 | concurrency: 18 | group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} 19 | cancel-in-progress: true 20 | 21 | jobs: 22 | editorconfig: 23 | runs-on: ubuntu-latest 24 | steps: 25 | - uses: actions/checkout@v4 26 | - uses: editorconfig-checker/action-editorconfig-checker@main 27 | - run: editorconfig-checker 28 | -------------------------------------------------------------------------------- /.github/workflows/gguf-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a GGUF release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | # See `gguf-py/README.md` for how to make a release. 5 | 6 | # This workflow uses actions that are not certified by GitHub. 7 | # They are provided by a third-party and are governed by 8 | # separate terms of service, privacy policy, and support 9 | # documentation. 10 | 11 | name: Upload Python Package 12 | 13 | on: 14 | workflow_dispatch: 15 | push: 16 | # Pattern matched against refs/tags 17 | tags: 18 | - 'gguf-v*' # Push events to every version tag 19 | 20 | 21 | jobs: 22 | deploy: 23 | 24 | runs-on: ubuntu-latest 25 | 26 | steps: 27 | - uses: actions/checkout@v4 28 | - name: Set up Python 29 | uses: actions/setup-python@v5 30 | with: 31 | python-version: '3.9.x' 32 | - name: Install dependencies 33 | run: | 34 | cd gguf-py 35 | python -m pip install poetry 36 | poetry install 37 | 38 | - name: Build package 39 | run: cd gguf-py && poetry build 40 | - name: Publish package 41 | uses: pypa/gh-action-pypi-publish@release/v1 42 | with: 43 | password: ${{ secrets.PYPI_API_TOKEN }} 44 | packages-dir: gguf-py/dist 45 | -------------------------------------------------------------------------------- /.github/workflows/labeler.yml: -------------------------------------------------------------------------------- 1 | name: "Pull Request Labeler" 2 | on: 3 | - pull_request_target 4 | 5 | jobs: 6 | labeler: 7 | permissions: 8 | contents: read 9 | pull-requests: write 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | with: 14 | repository: "ggerganov/llama.cpp" 15 | - uses: actions/labeler@v5 16 | with: 17 | configuration-path: '.github/labeler.yml' 18 | -------------------------------------------------------------------------------- /.github/workflows/nix-flake-update.yml: -------------------------------------------------------------------------------- 1 | name: update-flake-lock 2 | on: 3 | workflow_dispatch: 4 | schedule: 5 | - cron: '0 0 * * 0' # runs weekly on Sunday at 00:00 6 | 7 | jobs: 8 | lockfile: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Checkout repository 12 | uses: actions/checkout@v4 13 | - name: Install Nix 14 | uses: 
DeterminateSystems/nix-installer-action@main 15 | - name: Update flake.lock 16 | uses: DeterminateSystems/update-flake-lock@main 17 | with: 18 | pr-title: "nix: update flake.lock" 19 | pr-labels: | 20 | nix 21 | pr-reviewers: philiptaron,SomeoneSerge 22 | token: ${{ secrets.FLAKE_TOKEN }} 23 | -------------------------------------------------------------------------------- /.github/workflows/nix-publish-flake.yml: -------------------------------------------------------------------------------- 1 | # Make the flake discoverable on https://flakestry.dev and https://flakehub.com/flakes 2 | name: "Publish a flake to flakestry & flakehub" 3 | on: 4 | push: 5 | tags: 6 | - "*" 7 | workflow_dispatch: 8 | inputs: 9 | tag: 10 | description: "The existing tag to publish" 11 | type: "string" 12 | required: true 13 | jobs: 14 | flakestry-publish: 15 | runs-on: ubuntu-latest 16 | permissions: 17 | id-token: "write" 18 | contents: "read" 19 | steps: 20 | - uses: flakestry/flakestry-publish@main 21 | with: 22 | version: "${{ inputs.tag || github.ref_name }}" 23 | flakehub-publish: 24 | runs-on: "ubuntu-latest" 25 | permissions: 26 | id-token: "write" 27 | contents: "read" 28 | steps: 29 | - uses: "actions/checkout@v4" 30 | with: 31 | ref: "${{ (inputs.tag != null) && format('refs/tags/{0}', inputs.tag) || '' }}" 32 | - uses: "DeterminateSystems/nix-installer-action@main" 33 | - uses: "DeterminateSystems/flakehub-push@main" 34 | with: 35 | visibility: "public" 36 | tag: "${{ inputs.tag }}" 37 | -------------------------------------------------------------------------------- /.github/workflows/python-check-requirements.yml: -------------------------------------------------------------------------------- 1 | name: Python check requirements.txt 2 | 3 | on: 4 | push: 5 | paths: 6 | - '.github/workflows/python-check-requirements.yml' 7 | - 'scripts/check-requirements.sh' 8 | - 'convert*.py' 9 | - '**/requirements*.txt' 10 | pull_request: 11 | paths: 12 | - '.github/workflows/python-check-requirements.yml' 13 | - 'scripts/check-requirements.sh' 14 | - 'convert*.py' 15 | - '**/requirements*.txt' 16 | 17 | concurrency: 18 | group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} 19 | cancel-in-progress: true 20 | 21 | jobs: 22 | python-check-requirements: 23 | runs-on: ubuntu-latest 24 | name: check-requirements 25 | steps: 26 | - name: Check out source repository 27 | uses: actions/checkout@v4 28 | - name: Set up Python environment 29 | uses: actions/setup-python@v5 30 | with: 31 | python-version: "3.11" 32 | - name: Run check-requirements.sh script 33 | run: bash scripts/check-requirements.sh 34 | -------------------------------------------------------------------------------- /.github/workflows/python-lint.yml: -------------------------------------------------------------------------------- 1 | name: flake8 Lint 2 | 3 | on: [push, pull_request] 4 | 5 | concurrency: 6 | group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} 7 | cancel-in-progress: true 8 | 9 | jobs: 10 | flake8-lint: 11 | runs-on: ubuntu-latest 12 | name: Lint 13 | steps: 14 | - name: Check out source repository 15 | uses: actions/checkout@v4 16 | - name: Set up Python environment 17 | uses: actions/setup-python@v5 18 | with: 19 | python-version: "3.11" 20 | - name: flake8 Lint 21 | uses: py-actions/flake8@v2 22 | with: 23 | plugins: "flake8-no-print" 24 | -------------------------------------------------------------------------------- /.github/workflows/python-type-check.yml: 
-------------------------------------------------------------------------------- 1 | name: Python Type-Check 2 | 3 | on: 4 | push: 5 | paths: 6 | - '.github/workflows/python-type-check.yml' 7 | - 'pyrightconfig.json' 8 | - '**.py' 9 | - '**/requirements*.txt' 10 | pull_request: 11 | paths: 12 | - '.github/workflows/python-type-check.yml' 13 | - 'pyrightconfig.json' 14 | - '**.py' 15 | - '**/requirements*.txt' 16 | 17 | concurrency: 18 | group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} 19 | cancel-in-progress: true 20 | 21 | jobs: 22 | python-type-check: 23 | runs-on: ubuntu-latest 24 | name: pyright type-check 25 | steps: 26 | - name: Check out source repository 27 | uses: actions/checkout@v4 28 | - name: Set up Python environment 29 | uses: actions/setup-python@v5 30 | with: 31 | python-version: "3.11" 32 | - name: Install Python dependencies 33 | # TODO: use a venv 34 | run: pip install -r requirements/requirements-all.txt 35 | - name: Type-check with Pyright 36 | uses: jakebailey/pyright-action@v2 37 | with: 38 | version: 1.1.382 39 | level: warning 40 | warnings: true 41 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "kompute"] 2 | path = ggml/src/kompute 3 | url = https://github.com/nomic-ai/kompute.git 4 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | exclude: prompts/.*.txt 4 | repos: 5 | - repo: https://github.com/pre-commit/pre-commit-hooks 6 | rev: v4.6.0 7 | hooks: 8 | - id: trailing-whitespace 9 | - id: end-of-file-fixer 10 | - id: check-yaml 11 | - id: check-added-large-files 12 | - repo: https://github.com/PyCQA/flake8 13 | rev: 7.0.0 14 | hooks: 15 | - id: flake8 16 | additional_dependencies: [flake8-no-print] 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023-2024 The ggml authors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /ci/README.md: -------------------------------------------------------------------------------- 1 | # CI 2 | 3 | In addition to [Github Actions](https://github.com/ggerganov/llama.cpp/actions) `llama.cpp` uses a custom CI framework: 4 | 5 | https://github.com/ggml-org/ci 6 | 7 | It monitors the `master` branch for new commits and runs the 8 | [ci/run.sh](https://github.com/ggerganov/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us 9 | to execute heavier workloads compared to just using Github Actions. Also with time, the cloud instances will be scaled 10 | to cover various hardware architectures, including GPU and Apple Silicon instances. 11 | 12 | Collaborators can optionally trigger the CI run by adding the `ggml-ci` keyword to their commit message. 13 | Only the branches of this repo are monitored for this keyword. 14 | 15 | It is a good practice, before publishing changes to execute the full CI locally on your machine: 16 | 17 | ```bash 18 | mkdir tmp 19 | 20 | # CPU-only build 21 | bash ./ci/run.sh ./tmp/results ./tmp/mnt 22 | 23 | # with CUDA support 24 | GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt 25 | 26 | # with SYCL support 27 | source /opt/intel/oneapi/setvars.sh 28 | GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt 29 | ``` 30 | -------------------------------------------------------------------------------- /cmake/arm64-windows-llvm.cmake: -------------------------------------------------------------------------------- 1 | set( CMAKE_SYSTEM_NAME Windows ) 2 | set( CMAKE_SYSTEM_PROCESSOR arm64 ) 3 | 4 | set( target arm64-pc-windows-msvc ) 5 | 6 | set( CMAKE_C_COMPILER clang ) 7 | set( CMAKE_CXX_COMPILER clang++ ) 8 | 9 | set( CMAKE_C_COMPILER_TARGET ${target} ) 10 | set( CMAKE_CXX_COMPILER_TARGET ${target} ) 11 | 12 | set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" ) 13 | set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" ) 14 | 15 | set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" ) 16 | set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" ) 17 | -------------------------------------------------------------------------------- /cmake/arm64-windows-msvc.cmake: -------------------------------------------------------------------------------- 1 | set( CMAKE_SYSTEM_NAME Windows ) 2 | set( CMAKE_SYSTEM_PROCESSOR arm64 ) 3 | 4 | set( target arm64-pc-windows-msvc ) 5 | set( CMAKE_C_COMPILER_TARGET ${target} ) 6 | set( CMAKE_CXX_COMPILER_TARGET ${target} ) 7 | -------------------------------------------------------------------------------- /cmake/git-vars.cmake: -------------------------------------------------------------------------------- 1 | find_package(Git) 2 | 3 | # the commit's SHA1 4 | execute_process(COMMAND 5 | "${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8 6 | WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" 7 | OUTPUT_VARIABLE GIT_SHA1 8 | ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) 9 | 10 | # the date of the commit 11 | execute_process(COMMAND 12 | "${GIT_EXECUTABLE}" log -1 --format=%ad --date=local 13 | WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" 14 | OUTPUT_VARIABLE GIT_DATE 15 | ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) 16 | 17 | # the subject of the commit 18 | execute_process(COMMAND 19 | "${GIT_EXECUTABLE}" log -1 --format=%s 20 | WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" 21 | OUTPUT_VARIABLE 
GIT_COMMIT_SUBJECT 22 | ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) 23 | -------------------------------------------------------------------------------- /cmake/llama.pc.in: -------------------------------------------------------------------------------- 1 | prefix=@CMAKE_INSTALL_PREFIX@ 2 | exec_prefix=${prefix} 3 | libdir=${exec_prefix}/lib 4 | includedir=${prefix}/include 5 | 6 | Name: llama 7 | Description: Port of Facebook's LLaMA model in C/C++ 8 | Version: @PROJECT_VERSION@ 9 | Libs: -L${libdir} -lllama 10 | Cflags: -I${includedir} 11 | -------------------------------------------------------------------------------- /common/build-info.cpp.in: -------------------------------------------------------------------------------- 1 | int LLAMA_BUILD_NUMBER = @BUILD_NUMBER@; 2 | char const *LLAMA_COMMIT = "@BUILD_COMMIT@"; 3 | char const *LLAMA_COMPILER = "@BUILD_COMPILER@"; 4 | char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@"; 5 | -------------------------------------------------------------------------------- /common/cmake/build-info-gen-cpp.cmake: -------------------------------------------------------------------------------- 1 | include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake) 2 | 3 | set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in") 4 | set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp") 5 | 6 | # Only write the build info if it changed 7 | if(EXISTS ${OUTPUT_FILE}) 8 | file(READ ${OUTPUT_FILE} CONTENTS) 9 | string(REGEX MATCH "LLAMA_COMMIT = \"([^\"]*)\";" _ ${CONTENTS}) 10 | set(OLD_COMMIT ${CMAKE_MATCH_1}) 11 | string(REGEX MATCH "LLAMA_COMPILER = \"([^\"]*)\";" _ ${CONTENTS}) 12 | set(OLD_COMPILER ${CMAKE_MATCH_1}) 13 | string(REGEX MATCH "LLAMA_BUILD_TARGET = \"([^\"]*)\";" _ ${CONTENTS}) 14 | set(OLD_TARGET ${CMAKE_MATCH_1}) 15 | if ( 16 | NOT OLD_COMMIT STREQUAL BUILD_COMMIT OR 17 | NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR 18 | NOT OLD_TARGET STREQUAL BUILD_TARGET 19 | ) 20 | configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE}) 21 | endif() 22 | else() 23 | configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE}) 24 | endif() 25 | -------------------------------------------------------------------------------- /common/console.h: -------------------------------------------------------------------------------- 1 | // Console functions 2 | 3 | #pragma once 4 | 5 | #include <string> 6 | 7 | namespace console { 8 | enum display_t { 9 | reset = 0, 10 | prompt, 11 | user_input, 12 | error 13 | }; 14 | 15 | void init(bool use_simple_io, bool use_advanced_display); 16 | void cleanup(); 17 | void set_display(display_t display); 18 | bool readline(std::string & line, bool multiline_input); 19 | char32_t getchar32(); 20 | } 21 | -------------------------------------------------------------------------------- /common/json-schema-to-grammar.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "ggml.h" 4 | // Change JSON_ASSERT from assert() to GGML_ASSERT: 5 | #define JSON_ASSERT GGML_ASSERT 6 | #include "json.hpp" 7 | 8 | std::string json_schema_to_grammar(const nlohmann::ordered_json& schema); 9 | -------------------------------------------------------------------------------- /docs/development/llama-star/idea-arch.key: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/docs/development/llama-star/idea-arch.key
-------------------------------------------------------------------------------- /docs/development/llama-star/idea-arch.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/docs/development/llama-star/idea-arch.pdf -------------------------------------------------------------------------------- /docs/install.md: -------------------------------------------------------------------------------- 1 | # Install pre-built version of llama.cpp 2 | 3 | ## Homebrew 4 | 5 | On Mac and Linux, the Homebrew package manager can be used via 6 | 7 | ```sh 8 | brew install llama.cpp 9 | ``` 10 | The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668 11 | 12 | ## Nix 13 | 14 | On Mac and Linux, the Nix package manager can be used via 15 | 16 | ```sh 17 | nix profile install nixpkgs#llama-cpp 18 | ``` 19 | For flake-enabled installs. 20 | 21 | Or 22 | 23 | ```sh 24 | nix-env --file '<nixpkgs>' --install --attr llama-cpp 25 | ``` 26 | 27 | For non-flake-enabled installs. 28 | 29 | This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/llama-cpp/package.nix#L164). 30 | 31 | ## Flox 32 | 33 | On Mac and Linux, Flox can be used to install llama.cpp within a Flox environment via 34 | 35 | ```sh 36 | flox install llama-cpp 37 | ``` 38 | 39 | Flox follows the nixpkgs build of llama.cpp. 40 | -------------------------------------------------------------------------------- /ggml/.gitignore: -------------------------------------------------------------------------------- 1 | src/ggml-vulkan-shaders.hpp 2 | src/ggml-vulkan-shaders.cpp 3 | -------------------------------------------------------------------------------- /ggml/include/ggml-blas.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "ggml.h" 4 | #include "ggml-backend.h" 5 | 6 | 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | 11 | // backend API 12 | GGML_API ggml_backend_t ggml_backend_blas_init(void); 13 | 14 | GGML_API bool ggml_backend_is_blas(ggml_backend_t backend); 15 | 16 | // number of threads used for conversion to float 17 | // for openblas and blis, this will also set the number of threads used for blas operations 18 | GGML_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads); 19 | 20 | 21 | #ifdef __cplusplus 22 | } 23 | #endif 24 | -------------------------------------------------------------------------------- /ggml/include/ggml-kompute.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "ggml.h" 4 | #include "ggml-backend.h" 5 | 6 | #include <stdbool.h> 7 | #include <stddef.h> 8 | #include <stdint.h> 9 | 10 | #ifdef __cplusplus 11 | extern "C" { 12 | #endif 13 | 14 | struct ggml_vk_device { 15 | int index; 16 | int type; // same as VkPhysicalDeviceType 17 | size_t heapSize; 18 | const char * name; 19 | const char * vendor; 20 | int subgroupSize; 21 | uint64_t bufferAlignment; 22 | uint64_t maxAlloc; 23 | }; 24 | 25 | struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count); 26 | bool ggml_vk_get_device(struct ggml_vk_device * device, size_t memoryRequired, const char * name); 27 | bool ggml_vk_has_vulkan(void); 28 | bool ggml_vk_has_device(void); 29 | struct ggml_vk_device ggml_vk_current_device(void); 30
| 31 | // 32 | // backend API 33 | // 34 | 35 | // forward declaration 36 | typedef struct ggml_backend * ggml_backend_t; 37 | 38 | GGML_API ggml_backend_t ggml_backend_kompute_init(int device); 39 | 40 | GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend); 41 | 42 | GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device); 43 | 44 | #ifdef __cplusplus 45 | } 46 | #endif 47 | -------------------------------------------------------------------------------- /ggml/include/ggml-rpc.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "ggml.h" 4 | #include "ggml-backend.h" 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | 10 | #define GGML_RPC_MAX_SERVERS 16 11 | 12 | // backend API 13 | GGML_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint); 14 | GGML_API bool ggml_backend_is_rpc(ggml_backend_t backend); 15 | 16 | GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint); 17 | 18 | GGML_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total); 19 | 20 | GGML_API void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem); 21 | 22 | #ifdef __cplusplus 23 | } 24 | #endif 25 | -------------------------------------------------------------------------------- /ggml/include/ggml-vulkan.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "ggml.h" 4 | #include "ggml-backend.h" 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | 10 | #define GGML_VK_NAME "Vulkan" 11 | #define GGML_VK_MAX_DEVICES 16 12 | 13 | GGML_API void ggml_vk_instance_init(void); 14 | 15 | // backend API 16 | GGML_API ggml_backend_t ggml_backend_vk_init(size_t dev_num); 17 | 18 | GGML_API bool ggml_backend_is_vk(ggml_backend_t backend); 19 | GGML_API int ggml_backend_vk_get_device_count(void); 20 | GGML_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size); 21 | GGML_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total); 22 | 23 | GGML_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num); 24 | // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU 25 | GGML_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void); 26 | 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | -------------------------------------------------------------------------------- /ggml/src/ggml-cann/kernels/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if (NOT SOC_TYPE) 2 | set (SOC_TYPE "Ascend910B3") 3 | endif() 4 | 5 | file(GLOB SRC_FILES 6 | get_row_f32.cpp 7 | get_row_f16.cpp 8 | get_row_q4_0.cpp 9 | get_row_q8_0.cpp 10 | quantize_f32_q8_0.cpp 11 | quantize_f16_q8_0.cpp 12 | quantize_float_to_q4_0.cpp 13 | dup.cpp 14 | ) 15 | 16 | string(TOLOWER ${SOC_TYPE} SOC_VERSION) 17 | set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR}) 18 | set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim") 19 | 20 | if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) 21 | set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) 22 | elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake) 23 | set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake) 24 | else() 25 | 
message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the compiler package is installed.") 26 | endif() 27 | include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) 28 | 29 | ascendc_library(ascendc_kernels STATIC 30 | ${SRC_FILES} 31 | ) 32 | 33 | # ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP) 34 | -------------------------------------------------------------------------------- /ggml/src/ggml-cann/kernels/ascendc_kernels.h: -------------------------------------------------------------------------------- 1 | #ifndef ASCENDC_KERNELS_H 2 | #define ASCENDC_KERNELS_H 3 | 4 | #include "aclrtlaunch_ascendc_get_row_f32.h" 5 | #include "aclrtlaunch_ascendc_get_row_f16.h" 6 | #include "aclrtlaunch_ascendc_get_row_q8_0.h" 7 | #include "aclrtlaunch_ascendc_get_row_q4_0.h" 8 | 9 | #include "aclrtlaunch_ascendc_quantize_f32_q8_0.h" 10 | #include "aclrtlaunch_ascendc_quantize_f16_q8_0.h" 11 | #include "aclrtlaunch_ascendc_quantize_f16_to_q4_0.h" 12 | #include "aclrtlaunch_ascendc_quantize_f32_to_q4_0.h" 13 | 14 | #include "aclrtlaunch_ascendc_dup_by_rows_fp16.h" 15 | #include "aclrtlaunch_ascendc_dup_by_rows_fp32.h" 16 | #include "aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16.h" 17 | #include "aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32.h" 18 | 19 | #endif // ASCENDC_KERNELS_H 20 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/acc.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_ACC_BLOCK_SIZE 256 4 | 5 | void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/arange.cu: -------------------------------------------------------------------------------- 1 | #include "arange.cuh" 2 | 3 | static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) { 4 | // blockIDx.x: idx of ne0 / BLOCK_SIZE 5 | int nidx = threadIdx.x + blockIdx.x * blockDim.x; 6 | if (nidx >= ne0) { 7 | return; 8 | } 9 | dst[nidx] = start + step * nidx; 10 | } 11 | 12 | static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) { 13 | int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE; 14 | arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start, step); 15 | } 16 | 17 | void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { 18 | float * dst_d = (float *)dst->data; 19 | cudaStream_t stream = ctx.stream(); 20 | 21 | GGML_ASSERT(dst->type == GGML_TYPE_F32); 22 | 23 | float start; 24 | float stop; 25 | float step; 26 | memcpy(&start, (float *)dst->op_params + 0, sizeof(float)); 27 | memcpy(&stop, (float *)dst->op_params + 1, sizeof(float)); 28 | memcpy(&step, (float *)dst->op_params + 2, sizeof(float)); 29 | 30 | int64_t steps = (int64_t)ceil((stop - start) / step); 31 | GGML_ASSERT(ggml_nelements(dst) == steps); 32 | 33 | arange_f32_cuda(dst_d, dst->ne[0], start, step, stream); 34 | } 35 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/arange.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_ARANGE_BLOCK_SIZE 256 4 | 5 | void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | --------------------------------------------------------------------------------
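The small per-op files above and below (acc.cuh, arange.cu/arange.cuh, clamp, scale, and so on) all follow the same convention: a fixed block-size macro, a `ggml_cuda_op_*` entry point taking the backend context and the destination tensor, scalar parameters read back out of `dst->op_params`, and a 1-D grid sized by ceiling division so every element is covered. As a minimal sketch of that convention only (this is not a file from the repository, and `fill_f32` is a made-up op used purely for illustration):

```cuda
// Illustrative sketch, not part of the repository: a hypothetical "fill" op
// written in the same style as arange.cu / clamp.cu above.
#include <cuda_runtime.h>

#define CUDA_FILL_BLOCK_SIZE 256  // hypothetical block-size macro

static __global__ void fill_f32(float * dst, const float value, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    dst[i] = value;
}

static void fill_f32_cuda(float * dst, const float value, const int k, cudaStream_t stream) {
    // ceiling division so the final partial block still covers the tail elements
    const int num_blocks = (k + CUDA_FILL_BLOCK_SIZE - 1) / CUDA_FILL_BLOCK_SIZE;
    fill_f32<<<num_blocks, CUDA_FILL_BLOCK_SIZE, 0, stream>>>(dst, value, k);
}
```

--------------------------------------------------------------------------------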
/ggml/src/ggml-cuda/argmax.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 4 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/argsort.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 4 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/binbcast.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 4 | void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 5 | void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 7 | void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 8 | 9 | void ggml_cuda_op_repeat_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 10 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/clamp.cu: -------------------------------------------------------------------------------- 1 | #include "clamp.cuh" 2 | 3 | static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) { 4 | const int i = blockDim.x*blockIdx.x + threadIdx.x; 5 | 6 | if (i >= k) { 7 | return; 8 | } 9 | 10 | dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]); 11 | } 12 | 13 | static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) { 14 | const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE; 15 | clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k); 16 | } 17 | 18 | 19 | void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { 20 | const ggml_tensor * src0 = dst->src[0]; 21 | const float * src0_d = (const float *)src0->data; 22 | float * dst_d = (float *)dst->data; 23 | cudaStream_t stream = ctx.stream(); 24 | 25 | GGML_ASSERT(src0->type == GGML_TYPE_F32); 26 | GGML_ASSERT( dst->type == GGML_TYPE_F32); 27 | 28 | float min; 29 | float max; 30 | memcpy(&min, dst->op_params, sizeof(float)); 31 | memcpy(&max, (float *) dst->op_params + 1, sizeof(float)); 32 | 33 | clamp_f32_cuda(src0_d, dst_d, min, max, ggml_nelements(src0), stream); 34 | } 35 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/clamp.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_CLAMP_BLOCK_SIZE 256 4 | 5 | void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/concat.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_CONCAT_BLOCK_SIZE 256 4 | 5 | void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/conv-transpose-1d.cuh:
-------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE 256 4 | 5 | void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/convert.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_DEQUANTIZE_BLOCK_SIZE 256 4 | 5 | template<typename T> 6 | using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream); 7 | 8 | typedef to_t_cuda_t<float> to_fp32_cuda_t; 9 | typedef to_t_cuda_t<half> to_fp16_cuda_t; 10 | 11 | to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type); 12 | 13 | to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type); 14 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/count-equal.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_COUNT_EQUAL_CHUNK_SIZE 128 4 | 5 | void ggml_cuda_count_equal(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/cpy.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_CPY_BLOCK_SIZE 32 4 | 5 | void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1); 6 | 7 | void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 8 | 9 | void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1); 10 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/cross-entropy-loss.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE 256 4 | 5 | void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | 7 | void ggml_cuda_cross_entropy_loss_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 8 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/diagmask.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32 4 | 5 | void ggml_cuda_op_diag_mask_inf(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/dmmv.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | // dmmv = dequantize_mul_mat_vec 4 | 5 | // TODO: remove this?
6 | #ifndef GGML_CUDA_DMMV_X 7 | #define GGML_CUDA_DMMV_X 32 8 | #endif 9 | 10 | #ifndef GGML_CUDA_MMV_Y 11 | #define GGML_CUDA_MMV_Y 1 12 | #endif 13 | 14 | void ggml_cuda_op_dequantize_mul_mat_vec( 15 | ggml_backend_cuda_context & ctx, 16 | const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, 17 | const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, 18 | const int64_t src1_padded_row_size, cudaStream_t stream); 19 | 20 | bool ggml_cuda_dmmv_type_supported(ggml_type src0_type); 21 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/fattn-tile-f16.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 4 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/fattn-tile-f32.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 4 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/fattn.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 4 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/getrows.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_GET_ROWS_BLOCK_SIZE 256 4 | 5 | void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/im2col.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_IM2COL_BLOCK_SIZE 256 4 | 5 | void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/mmvq.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels. 
4 | 5 | void ggml_cuda_op_mul_mat_vec_q( 6 | ggml_backend_cuda_context & ctx, 7 | const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, 8 | const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, 9 | const int64_t src1_padded_row_size, cudaStream_t stream); 10 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/norm.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 4 | 5 | void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | 7 | void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 8 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/opt-step-adamw.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_OPT_STEP_ADAMW_BLOCK_SIZE 256 4 | 5 | void ggml_cuda_opt_step_adamw(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/out-prod.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 4 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/pad.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_PAD_BLOCK_SIZE 256 4 | 5 | void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/pool2d.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_POOL2D_BLOCK_SIZE 256 4 | 5 | void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/quantize.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common.cuh" 4 | #include "mmq.cuh" 5 | 6 | #include <cstdint> 7 | 8 | #define CUDA_QUANTIZE_BLOCK_SIZE 256 9 | #define CUDA_QUANTIZE_BLOCK_SIZE_MMQ 128 10 | 11 | static_assert(MATRIX_ROW_PADDING % CUDA_QUANTIZE_BLOCK_SIZE == 0, "Risk of out-of-bounds access."); 12 | static_assert(MATRIX_ROW_PADDING % (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ) == 0, "Risk of out-of-bounds access."); 13 | 14 | typedef void (*quantize_cuda_t)( 15 | const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded, 16 | const ggml_type type_x, cudaStream_t stream); 17 | 18 | void quantize_row_q8_1_cuda( 19 | const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded, 20 | const ggml_type type_x, cudaStream_t stream); 21 | 22 | void quantize_mmq_q8_1_cuda( 23 | const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded, 24 | const ggml_type type_x, cudaStream_t stream); 25 |
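The `quantize_cuda_t` function-pointer typedef above lets the caller select between the plain q8_1 quantizer and the MMQ-specific layout through a single indirection; both declared functions share its signature. A minimal sketch of such a selection, assuming a hypothetical `use_mmq` flag (the real dispatch lives elsewhere in the CUDA backend and is not shown here):

```cuda
// Illustrative sketch, not part of the repository: picking a quantization
// kernel through the quantize_cuda_t type declared in quantize.cuh above.
#include "quantize.cuh"

static quantize_cuda_t pick_quantize_kernel(bool use_mmq) {
    // Both functions match the quantize_cuda_t signature, so either one can be
    // stored in the same pointer and invoked identically by the caller.
    return use_mmq ? quantize_mmq_q8_1_cuda : quantize_row_q8_1_cuda;
}
```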
-------------------------------------------------------------------------------- /ggml/src/ggml-cuda/rope.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_ROPE_BLOCK_SIZE 256 4 | 5 | void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/rwkv-wkv.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_WKV_BLOCK_SIZE 64 4 | 5 | void ggml_cuda_op_rwkv_wkv(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/scale.cu: -------------------------------------------------------------------------------- 1 | #include "scale.cuh" 2 | 3 | static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) { 4 | const int i = blockDim.x*blockIdx.x + threadIdx.x; 5 | 6 | if (i >= k) { 7 | return; 8 | } 9 | 10 | dst[i] = scale * x[i]; 11 | } 12 | 13 | static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) { 14 | const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE; 15 | scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k); 16 | } 17 | 18 | void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { 19 | const ggml_tensor * src0 = dst->src[0]; 20 | const float * src0_d = (const float *)src0->data; 21 | float * dst_d = (float *)dst->data; 22 | cudaStream_t stream = ctx.stream(); 23 | 24 | GGML_ASSERT(src0->type == GGML_TYPE_F32); 25 | GGML_ASSERT( dst->type == GGML_TYPE_F32); 26 | 27 | float scale; 28 | memcpy(&scale, dst->op_params, sizeof(float)); 29 | 30 | scale_f32_cuda(src0_d, dst_d, scale, ggml_nelements(src0), stream); 31 | } 32 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/scale.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_SCALE_BLOCK_SIZE 256 4 | 5 | void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/softmax.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_SOFT_MAX_BLOCK_SIZE 1024 4 | 5 | void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/sum.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream); 4 | 5 | void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/sumrows.cu: -------------------------------------------------------------------------------- 1 | #include "sumrows.cuh" 2 | 3 | static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) { 4 | const int row = blockIdx.x; 5 | const int col = threadIdx.x; 6 | 7 | float sum = 0.0f; 8 | for (int i = col; i < ncols; i += blockDim.x) { 9 | sum += x[row *
ncols + i]; 10 | } 11 | 12 | sum = warp_reduce_sum(sum); 13 | 14 | if (col == 0) { 15 | dst[row] = sum; 16 | } 17 | } 18 | 19 | void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) { 20 | const dim3 block_dims(WARP_SIZE, 1, 1); 21 | const dim3 block_nums(nrows, 1, 1); 22 | k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols); 23 | } 24 | 25 | void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { 26 | const ggml_tensor * src0 = dst->src[0]; 27 | const float * src0_d = (const float *)src0->data; 28 | float * dst_d = (float *)dst->data; 29 | cudaStream_t stream = ctx.stream(); 30 | 31 | GGML_ASSERT(src0->type == GGML_TYPE_F32); 32 | GGML_ASSERT( dst->type == GGML_TYPE_F32); 33 | GGML_ASSERT(ggml_is_contiguous(src0)); 34 | 35 | const int64_t ncols = src0->ne[0]; 36 | const int64_t nrows = ggml_nrows(src0); 37 | 38 | sum_rows_f32_cuda(src0_d, dst_d, ncols, nrows, stream); 39 | } 40 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/sumrows.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream); 4 | 5 | void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually.
2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 
2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 
2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 
2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 
2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 
2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f16.cuh" 4 | 5 | DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 
2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 
2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 
2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 
2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 
2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 
2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-wmma-f16.cuh" 4 | 5 | DECL_FATTN_WMMA_F16_CASE(64, 16, float); 6 | DECL_FATTN_WMMA_F16_CASE(80, 16, float); 7 | DECL_FATTN_WMMA_F16_CASE(96, 16, float); 8 | DECL_FATTN_WMMA_F16_CASE(112, 16, float); 9 | DECL_FATTN_WMMA_F16_CASE(128, 16, float); 10 | DECL_FATTN_WMMA_F16_CASE(256, 16, float); 11 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-wmma-f16.cuh" 4 | 5 | DECL_FATTN_WMMA_F16_CASE(64, 32, float); 6 | DECL_FATTN_WMMA_F16_CASE(80, 32, float); 7 | DECL_FATTN_WMMA_F16_CASE(96, 32, float); 8 | DECL_FATTN_WMMA_F16_CASE(112, 32, float); 9 | DECL_FATTN_WMMA_F16_CASE(128, 32, float); 10 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-wmma-f16.cuh" 4 | 5 | DECL_FATTN_WMMA_F16_CASE(64, 16, half); 6 | DECL_FATTN_WMMA_F16_CASE(80, 16, half); 7 | DECL_FATTN_WMMA_F16_CASE(96, 16, half); 8 | DECL_FATTN_WMMA_F16_CASE(112, 16, half); 9 | DECL_FATTN_WMMA_F16_CASE(128, 16, half); 10 | DECL_FATTN_WMMA_F16_CASE(256, 16, half); 11 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-wmma-f16.cuh" 4 | 5 | DECL_FATTN_WMMA_F16_CASE(64, 32, half); 6 | DECL_FATTN_WMMA_F16_CASE(80, 32, half); 7 | DECL_FATTN_WMMA_F16_CASE(96, 32, half); 8 | DECL_FATTN_WMMA_F16_CASE(112, 32, half); 9 | DECL_FATTN_WMMA_F16_CASE(128, 32, half); 10 | DECL_FATTN_WMMA_F16_CASE(256, 32, half); 11 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-wmma-f16.cuh" 4 | 5 | DECL_FATTN_WMMA_F16_CASE(64, 8, half); 6 | DECL_FATTN_WMMA_F16_CASE(96, 8, half); 7 | DECL_FATTN_WMMA_F16_CASE(128, 8, half); 8 | DECL_FATTN_WMMA_F16_CASE(256, 8, half); 9 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 
2 | 3 | #include "../mmq.cuh" 4 | 5 | DECL_MMQ_CASE(GGML_TYPE_IQ1_S); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../mmq.cuh" 4 | 5 | DECL_MMQ_CASE(GGML_TYPE_IQ2_S); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../mmq.cuh" 4 | 5 | DECL_MMQ_CASE(GGML_TYPE_IQ2_XS); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../mmq.cuh" 4 | 5 | DECL_MMQ_CASE(GGML_TYPE_IQ2_XXS); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../mmq.cuh" 4 | 5 | DECL_MMQ_CASE(GGML_TYPE_IQ3_S); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../mmq.cuh" 4 | 5 | DECL_MMQ_CASE(GGML_TYPE_IQ3_XXS); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../mmq.cuh" 4 | 5 | DECL_MMQ_CASE(GGML_TYPE_IQ4_NL); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../mmq.cuh" 4 | 5 | DECL_MMQ_CASE(GGML_TYPE_IQ4_XS); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../mmq.cuh" 4 | 5 | DECL_MMQ_CASE(GGML_TYPE_Q2_K); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 
2 | 3 | #include "../mmq.cuh" 4 | 5 | DECL_MMQ_CASE(GGML_TYPE_Q3_K); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../mmq.cuh" 4 | 5 | DECL_MMQ_CASE(GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../mmq.cuh" 4 | 5 | DECL_MMQ_CASE(GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../mmq.cuh" 4 | 5 | DECL_MMQ_CASE(GGML_TYPE_Q4_K); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../mmq.cuh" 4 | 5 | DECL_MMQ_CASE(GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../mmq.cuh" 4 | 5 | DECL_MMQ_CASE(GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../mmq.cuh" 4 | 5 | DECL_MMQ_CASE(GGML_TYPE_Q5_K); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../mmq.cuh" 4 | 5 | DECL_MMQ_CASE(GGML_TYPE_Q6_K); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 
2 | 3 | #include "../mmq.cuh" 4 | 5 | DECL_MMQ_CASE(GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/tsembd.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE 256 4 | 5 | void ggml_cuda_op_timestep_embedding(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/upscale.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_UPSCALE_BLOCK_SIZE 256 4 | 5 | void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | -------------------------------------------------------------------------------- /ggml/src/ggml-cuda/vendors/cuda.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <cuda_runtime.h> 4 | #include <cuda.h> 5 | #include <cublas_v2.h> 6 | #include <cuda_fp16.h> 7 | 8 | #if CUDART_VERSION < 11020 9 | #define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED 10 | #define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH 11 | #define CUBLAS_COMPUTE_16F CUDA_R_16F 12 | #define CUBLAS_COMPUTE_32F CUDA_R_32F 13 | #define cublasComputeType_t cudaDataType_t 14 | #endif // CUDART_VERSION < 11020 15 | -------------------------------------------------------------------------------- /ggml/src/ggml-sycl/backend.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // MIT license 3 | // Copyright (C) 2024 Intel Corporation 4 | // SPDX-License-Identifier: MIT 5 | // 6 | 7 | // 8 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 9 | // See https://llvm.org/LICENSE.txt for license information. 10 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 11 | // 12 | 13 | #ifndef GGML_SYCL_BACKEND_HPP 14 | #define GGML_SYCL_BACKEND_HPP 15 | 16 | #include "concat.hpp" 17 | #include "common.hpp" 18 | #include "conv.hpp" 19 | #include "convert.hpp" 20 | #include "dequantize.hpp" 21 | #include "dmmv.hpp" 22 | #include "mmq.hpp" 23 | #include "mmvq.hpp" 24 | #include "rope.hpp" 25 | #include "norm.hpp" 26 | #include "softmax.hpp" 27 | #include "tsembd.hpp" 28 | #include "im2col.hpp" 29 | 30 | #endif // GGML_SYCL_BACKEND_HPP 31 | -------------------------------------------------------------------------------- /ggml/src/ggml-sycl/concat.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // MIT license 3 | // Copyright (C) 2024 Intel Corporation 4 | // SPDX-License-Identifier: MIT 5 | // 6 | 7 | // 8 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 9 | // See https://llvm.org/LICENSE.txt for license information. 
10 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 11 | // 12 | 13 | #ifndef GGML_SYCL_CONCAT_HPP 14 | #define GGML_SYCL_CONCAT_HPP 15 | 16 | #include "common.hpp" 17 | 18 | void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, 19 | const ggml_tensor *src1, ggml_tensor *dst); 20 | 21 | #endif // GGML_SYCL_CONCAT_HPP 22 | -------------------------------------------------------------------------------- /ggml/src/ggml-sycl/conv.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // MIT license 3 | // Copyright (C) 2024 Intel Corporation 4 | // SPDX-License-Identifier: MIT 5 | // 6 | 7 | // 8 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 9 | // See https://llvm.org/LICENSE.txt for license information. 10 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 11 | // 12 | 13 | #ifndef GGML_SYCL_CONV_HPP 14 | #define GGML_SYCL_CONV_HPP 15 | 16 | #include "common.hpp" 17 | 18 | void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, 19 | const ggml_tensor *src1, ggml_tensor *dst); 20 | 21 | #endif // GGML_SYCL_CONV_HPP 22 | -------------------------------------------------------------------------------- /ggml/src/ggml-sycl/convert.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // MIT license 3 | // Copyright (C) 2024 Intel Corporation 4 | // SPDX-License-Identifier: MIT 5 | // 6 | 7 | // 8 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 9 | // See https://llvm.org/LICENSE.txt for license information. 10 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 11 | // 12 | 13 | #ifndef GGML_SYCL_CONVERT_HPP 14 | #define GGML_SYCL_CONVERT_HPP 15 | 16 | #include "common.hpp" 17 | 18 | template <typename T> 19 | using to_t_sycl_t = void (*)(const void *__restrict__ x, T *__restrict__ y, 20 | int64_t k, dpct::queue_ptr stream); 21 | typedef to_t_sycl_t<float> to_fp32_sycl_t; 22 | typedef to_t_sycl_t<sycl::half> to_fp16_sycl_t; 23 | 24 | to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type); 25 | to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type); 26 | 27 | #endif // GGML_SYCL_CONVERT_HPP 28 | -------------------------------------------------------------------------------- /ggml/src/ggml-sycl/dmmv.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // MIT license 3 | // Copyright (C) 2024 Intel Corporation 4 | // SPDX-License-Identifier: MIT 5 | // 6 | 7 | // 8 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 9 | // See https://llvm.org/LICENSE.txt for license information. 
10 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 11 | // 12 | 13 | #ifndef GGML_SYCL_DMMV_HPP 14 | #define GGML_SYCL_DMMV_HPP 15 | 16 | #include "common.hpp" 17 | 18 | 19 | void ggml_sycl_op_dequantize_mul_mat_vec( 20 | ggml_backend_sycl_context & ctx, 21 | const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, 22 | const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i, 23 | float *dst_dd_i, const int64_t row_low, const int64_t row_high, 24 | const int64_t src1_ncols, const int64_t src1_padded_row_size, 25 | const dpct::queue_ptr &stream); 26 | 27 | #endif // GGML_SYCL_DMMV_HPP 28 | -------------------------------------------------------------------------------- /ggml/src/ggml-sycl/im2col.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // MIT license 3 | // Copyright (C) 2024 Intel Corporation 4 | // SPDX-License-Identifier: MIT 5 | // 6 | 7 | // 8 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 9 | // See https://llvm.org/LICENSE.txt for license information. 10 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 11 | // 12 | 13 | #ifndef GGML_SYCL_IM2COL_HPP 14 | #define GGML_SYCL_IM2COL_HPP 15 | 16 | #include "common.hpp" 17 | 18 | void ggml_sycl_op_im2col( 19 | ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, 20 | ggml_tensor *dst, const float *src0_dd, const float *src1_dd, float *dst_dd, 21 | const queue_ptr &main_stream); 22 | 23 | #endif // GGML_SYCL_IM2COL_HPP 24 | -------------------------------------------------------------------------------- /ggml/src/ggml-sycl/mmq.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // MIT license 3 | // Copyright (C) 2024 Intel Corporation 4 | // SPDX-License-Identifier: MIT 5 | // 6 | 7 | // 8 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 9 | // See https://llvm.org/LICENSE.txt for license information. 10 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 11 | // 12 | 13 | #ifndef GGML_SYCL_MMQ_HPP 14 | #define GGML_SYCL_MMQ_HPP 15 | 16 | #include "common.hpp" 17 | 18 | void ggml_sycl_op_mul_mat_q( 19 | ggml_backend_sycl_context & ctx, 20 | const ggml_tensor* src0, 21 | const ggml_tensor* src1, 22 | ggml_tensor* dst, 23 | const char* src0_dd_i, 24 | const float* src1_ddf_i, 25 | const char* src1_ddq_i, 26 | float* dst_dd_i, 27 | const int64_t row_low, 28 | const int64_t row_high, 29 | const int64_t src1_ncols, 30 | const int64_t src1_padded_row_size, 31 | const dpct::queue_ptr& stream); 32 | 33 | #endif // GGML_SYCL_MMQ_HPP 34 | -------------------------------------------------------------------------------- /ggml/src/ggml-sycl/mmvq.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // MIT license 3 | // Copyright (C) 2024 Intel Corporation 4 | // SPDX-License-Identifier: MIT 5 | // 6 | 7 | // 8 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 9 | // See https://llvm.org/LICENSE.txt for license information. 
10 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 11 | // 12 | 13 | #ifndef GGML_SYCL_MMVQ_HPP 14 | #define GGML_SYCL_MMVQ_HPP 15 | 16 | #include "common.hpp" 17 | 18 | 19 | void ggml_sycl_op_mul_mat_vec_q( 20 | ggml_backend_sycl_context & ctx, 21 | const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, 22 | const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i, 23 | float *dst_dd_i, const int64_t row_low, const int64_t row_high, 24 | const int64_t src1_ncols, const int64_t src1_padded_row_size, 25 | const dpct::queue_ptr &stream); 26 | 27 | #endif // GGML_SYCL_MMVQ_HPP 28 | -------------------------------------------------------------------------------- /ggml/src/ggml-sycl/norm.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // MIT license 3 | // Copyright (C) 2024 Intel Corporation 4 | // SPDX-License-Identifier: MIT 5 | // 6 | 7 | // 8 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 9 | // See https://llvm.org/LICENSE.txt for license information. 10 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 11 | // 12 | 13 | #ifndef GGML_SYCL_NORM_HPP 14 | #define GGML_SYCL_NORM_HPP 15 | 16 | #include "common.hpp" 17 | 18 | void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, const ggml_tensor* src1, 19 | ggml_tensor* dst, const float* src0_dd, 20 | const float* src1_dd, float* dst_dd, 21 | const queue_ptr& main_stream); 22 | 23 | void ggml_sycl_op_rms_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, 24 | const ggml_tensor* src1, ggml_tensor* dst, 25 | const float* src0_dd, const float* src1_dd, 26 | float* dst_dd, 27 | const queue_ptr& main_stream); 28 | 29 | void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, 30 | const ggml_tensor* src1, ggml_tensor* dst, 31 | const float* src0_dd, const float* src1_dd, 32 | float* dst_dd, 33 | const queue_ptr& main_stream); 34 | 35 | #endif // GGML_SYCL_NORM_HPP 36 | -------------------------------------------------------------------------------- /ggml/src/ggml-sycl/rope.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // MIT license 3 | // Copyright (C) 2024 Intel Corporation 4 | // SPDX-License-Identifier: MIT 5 | // 6 | 7 | // 8 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 9 | // See https://llvm.org/LICENSE.txt for license information. 10 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 11 | // 12 | 13 | #ifndef GGML_SYCL_ROPE_HPP 14 | #define GGML_SYCL_ROPE_HPP 15 | 16 | #include "common.hpp" 17 | 18 | void ggml_sycl_op_rope( 19 | ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, 20 | const float *src0_dd, const float *src1_dd, float *dst_dd, const queue_ptr &main_stream); 21 | 22 | #endif // GGML_SYCL_ROPE_HPP 23 | -------------------------------------------------------------------------------- /ggml/src/ggml-sycl/softmax.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // MIT license 3 | // Copyright (C) 2024 Intel Corporation 4 | // SPDX-License-Identifier: MIT 5 | // 6 | 7 | // 8 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 9 | // See https://llvm.org/LICENSE.txt for license information. 
10 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 11 | // 12 | 13 | #ifndef GGML_SYCL_SOFTMAX_HPP 14 | #define GGML_SYCL_SOFTMAX_HPP 15 | 16 | #include "common.hpp" 17 | 18 | void ggml_sycl_op_soft_max(ggml_backend_sycl_context &ctx, const ggml_tensor *src0, 19 | const ggml_tensor *src1, ggml_tensor *dst, 20 | const float *src0_dd, const float *src1_dd, 21 | float *dst_dd, 22 | const queue_ptr &main_stream); 23 | 24 | #endif // GGML_SYCL_SOFTMAX_HPP 25 | -------------------------------------------------------------------------------- /ggml/src/ggml-sycl/tsembd.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // MIT license 3 | // Copyright (C) 2024 Intel Corporation 4 | // SPDX-License-Identifier: MIT 5 | // 6 | 7 | // 8 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 9 | // See https://llvm.org/LICENSE.txt for license information. 10 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 11 | // 12 | 13 | #ifndef GGML_SYCL_TSEMBD_HPP 14 | #define GGML_SYCL_TSEMBD_HPP 15 | 16 | #include "common.hpp" 17 | 18 | void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, 19 | const ggml_tensor *src1, ggml_tensor * dst); 20 | 21 | #endif // GGML_SYCL_TSEMBD_HPP 22 | -------------------------------------------------------------------------------- /ggml/src/kompute-shaders/op_addrow.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | layout(local_size_x = 1) in; 6 | 7 | layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; 8 | layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; }; 9 | layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; }; 10 | 11 | layout(push_constant) uniform PushConstants { 12 | uint inAOff; 13 | uint inBOff; 14 | uint outOff; 15 | uint row; 16 | } pcs; 17 | 18 | void main() { 19 | const uint baseIndex = gl_WorkGroupID.x * 4; 20 | 21 | for (uint x = 0; x < 4; x++) { 22 | const uint i = baseIndex + x; 23 | out_[i + pcs.outOff] = inA[i + pcs.inAOff] + inB[(i % pcs.row) + pcs.inBOff]; 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /ggml/src/kompute-shaders/op_diagmask.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | layout(local_size_x = 1) in; 6 | 7 | layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; 8 | layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; 9 | 10 | layout(push_constant) uniform PushConstants { 11 | uint inOff; 12 | uint outOff; 13 | uint n_past; 14 | int ne00; 15 | int ne01; 16 | } pcs; 17 | 18 | void main() { 19 | const uint i02 = gl_WorkGroupID.z; 20 | const uint i01 = gl_WorkGroupID.y; 21 | const uint i00 = gl_WorkGroupID.x; 22 | 23 | const uint index = i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00 + i00; 24 | 25 | if (i00 > pcs.n_past + i01) { 26 | out_[index + pcs.outOff] = uintBitsToFloat(0xFF800000); 27 | } else { 28 | out_[index + pcs.outOff] = in_[index + pcs.inOff]; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /ggml/src/kompute-shaders/op_gelu.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | layout(local_size_x = 1) in; 6 | 7 | layout(binding = 
0) buffer restrict readonly tensorIn { float in_[]; }; 8 | layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; 9 | layout(push_constant) uniform PushConstants { 10 | uint inOff; 11 | uint outOff; 12 | } pcs; 13 | 14 | void main() { 15 | const uint baseIndex = gl_WorkGroupID.x * 8; 16 | 17 | for (uint x = 0; x < 8; x++) { 18 | const uint i = baseIndex + x; 19 | const float y = in_[i + pcs.inOff]; 20 | out_[i + pcs.outOff] = 0.5*y*(1.0 + tanh(clamp(SQRT_2_OVER_PI*y*(1.0 + GELU_COEF_A*y*y), -15.0, 15.0))); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /ggml/src/kompute-shaders/op_getrows.comp: -------------------------------------------------------------------------------- 1 | void main() { 2 | const uint i = gl_WorkGroupID.x; 3 | const int r = inB[i + pcs.inBOff]; 4 | 5 | int z = 0; 6 | for (uint ind = gl_LocalInvocationID.x; ind < pcs.ne00/16; ind += gl_WorkGroupSize.x) { 7 | const uint inIndex = (r * pcs.nb01 + pcs.inAOff) + ind/NL * SIZE_OF_BLOCK; 8 | const mat4 result = dequantize_block(inIndex, ind%NL); 9 | for (uint j = 0; j < 4; ++j) { 10 | for (uint k = 0; k < 4; ++k) { 11 | const uint outIndex = i * pcs.nb1/BYTES_FOR_TYPE + pcs.outOff + z; 12 | out_[outIndex] = result[j][k]; 13 | ++z; 14 | } 15 | } 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /ggml/src/kompute-shaders/op_getrows_f16.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | layout(local_size_x = 1) in; 6 | 7 | layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; }; 8 | layout (binding = 1) readonly buffer tensorInB { int inB[]; }; 9 | layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; 10 | 11 | layout (push_constant) uniform parameter { 12 | uint inAOff; 13 | uint inBOff; 14 | uint outOff; 15 | int ne00; 16 | int nb01; 17 | int nb1; 18 | } pcs; 19 | 20 | void dequantize_row_f16(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) { 21 | for (int j = 0; j < k; j++) { 22 | out_[y + j] = inA[x + j]; 23 | } 24 | } 25 | 26 | void main() { 27 | const uint i = gl_WorkGroupID.x; 28 | const int r = inB[i + pcs.inBOff]; 29 | 30 | dequantize_row_f16(r*pcs.nb01/2/*bytes for float16*/ + pcs.inAOff, i*pcs.nb1/4 + pcs.outOff, pcs.ne00); 31 | } 32 | -------------------------------------------------------------------------------- /ggml/src/kompute-shaders/op_getrows_f32.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | layout(local_size_x = 1) in; 6 | 7 | layout (binding = 0) readonly buffer tensorInA { float inA[]; }; 8 | layout (binding = 1) readonly buffer tensorInB { int inB[]; }; 9 | layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; 10 | 11 | layout (push_constant) uniform parameter { 12 | uint inAOff; 13 | uint inBOff; 14 | uint outOff; 15 | int ne00; 16 | int nb01; 17 | int nb1; 18 | } pcs; 19 | 20 | void dequantize_row_f32(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) { 21 | for (int j = 0; j < k; j++) { 22 | out_[y + j] = inA[x + j]; 23 | } 24 | } 25 | 26 | void main() { 27 | const uint i = gl_WorkGroupID.x; 28 | const int r = inB[i + pcs.inBOff]; 29 | 30 | dequantize_row_f32(r*pcs.nb01/4 + pcs.inAOff, i*pcs.nb1/4 + pcs.outOff, pcs.ne00); 31 | } 32 | -------------------------------------------------------------------------------- 
/ggml/src/kompute-shaders/op_getrows_q4_0.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | #define NL 2 6 | #define BYTES_FOR_TYPE 4 /*bytes for float*/ 7 | #define SIZE_OF_BLOCK sizeof_block_q4_0 8 | 9 | layout(local_size_x = 1) in; 10 | 11 | layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; 12 | layout (binding = 1) readonly buffer tensorInB { int inB[]; }; 13 | layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; 14 | 15 | layout (push_constant) uniform parameter { 16 | uint inAOff; 17 | uint inBOff; 18 | uint outOff; 19 | int ne00; 20 | int nb01; 21 | int nb1; 22 | } pcs; 23 | 24 | block_q4_0 get_unaligned_block_q4_0(uint index) { 25 | block_q4_0 fres; 26 | fres.d = u8BufToFloat16(inA, index); 27 | [[unroll]] for (uint it = 0; it != QK4_0 / 2; it++) { 28 | fres.qs[it] = inA[index+2+it]; 29 | } 30 | return fres; 31 | } 32 | 33 | mat4 dequantize_block(uint index, uint il) { 34 | const block_q4_0 block = get_unaligned_block_q4_0(index); 35 | return dequantize_q4_0(block, il); 36 | } 37 | 38 | #include "op_getrows.comp" 39 | -------------------------------------------------------------------------------- /ggml/src/kompute-shaders/op_getrows_q4_1.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | #define NL 2 6 | #define BYTES_FOR_TYPE 4 /*bytes for float*/ 7 | #define SIZE_OF_BLOCK sizeof_block_q4_1 8 | 9 | layout(local_size_x = 1) in; 10 | 11 | layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; 12 | layout (binding = 1) readonly buffer tensorInB { int inB[]; }; 13 | layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; 14 | 15 | layout (push_constant) uniform parameter { 16 | uint inAOff; 17 | uint inBOff; 18 | uint outOff; 19 | int ne00; 20 | int nb01; 21 | int nb1; 22 | } pcs; 23 | 24 | block_q4_1 get_unaligned_block_q4_1(uint index) { 25 | block_q4_1 fres; 26 | fres.d = u8BufToFloat16(inA, index); 27 | fres.m = u8BufToFloat16(inA, index+2); 28 | [[unroll]] for (uint it = 0; it != QK4_1 / 2; it++) { 29 | fres.qs[it] = inA[index+4+it]; 30 | } 31 | return fres; 32 | } 33 | 34 | mat4 dequantize_block(uint index, uint il) { 35 | const block_q4_1 block = get_unaligned_block_q4_1(index); 36 | return dequantize_q4_1(block, il); 37 | } 38 | 39 | #include "op_getrows.comp" 40 | -------------------------------------------------------------------------------- /ggml/src/kompute-shaders/op_getrows_q6_k.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | #define NL 16 6 | #define BYTES_FOR_TYPE 4 /*bytes for float*/ 7 | #define SIZE_OF_BLOCK sizeof_block_q6_k 8 | 9 | layout(local_size_x = 1) in; 10 | 11 | layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; 12 | layout (binding = 1) readonly buffer tensorInB { int inB[]; }; 13 | layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; 14 | 15 | layout (push_constant) uniform parameter { 16 | uint inAOff; 17 | uint inBOff; 18 | uint outOff; 19 | int ne00; 20 | int nb01; 21 | int nb1; 22 | } pcs; 23 | 24 | block_q6_k get_unaligned_block_q6_k(uint index) { 25 | block_q6_k fres; 26 | [[unroll]] for (uint it = 0; it != QK_K / 2; it++) { 27 | fres.ql[it] = inA[index + it]; 28 | } 29 | [[unroll]] for (uint it = 0; it != QK_K / 4; it++) { 30 | fres.qh[it] = inA[index + QK_K/2 + it]; 31 | } 32 | 
[[unroll]] for (uint it = 0; it != QK_K / 16; it++) { 33 | fres.scales[it] = int8_t(inA[index + QK_K/2 + QK_K/4 + it]); 34 | } 35 | fres.d = u8BufToFloat16(inA, index + QK_K/2 + QK_K/4 + QK_K/16); 36 | return fres; 37 | } 38 | 39 | mat4 dequantize_block(uint index, uint il) { 40 | const block_q6_k block = get_unaligned_block_q6_k(index); 41 | return dequantize_q6_k(block, il); 42 | } 43 | 44 | #include "op_getrows.comp" 45 | -------------------------------------------------------------------------------- /ggml/src/kompute-shaders/op_mul_mat_q4_0.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | #define BLOCKS_IN_QUANT QK4_0 6 | #define SIZE_OF_BLOCK sizeof_block_q4_0 7 | #define N_ROWS 4 8 | 9 | #include "op_mul_mv_q_n_pre.comp" 10 | 11 | // The q4_0 version of this function 12 | float block_q_n_dot_y(uint block_index, uint yb, uint il) { 13 | vec2 acc = vec2(0.0, 0.0); 14 | const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff; 15 | float d = float(u8BufToFloat16(inA, index)); 16 | float sumy = 0.0f; 17 | for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) { 18 | const uint16_t b = u8BufToU16(inA, index + 2 + il + i); 19 | 20 | const float yl0 = inB[yb + i]; 21 | const float yl1 = inB[yb + i + 1]; 22 | const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2]; 23 | const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1]; 24 | 25 | sumy += yl0 + yl1 + yl8 + yl9; 26 | 27 | acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00); 28 | acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000); 29 | } 30 | return d * (sumy * -8.f + acc[0] + acc[1]); 31 | } 32 | 33 | #include "op_mul_mv_q_n.comp" 34 | -------------------------------------------------------------------------------- /ggml/src/kompute-shaders/op_mul_mat_q4_1.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | #define BLOCKS_IN_QUANT QK4_1 6 | #define SIZE_OF_BLOCK sizeof_block_q4_1 7 | #define N_ROWS 4 8 | 9 | #include "op_mul_mv_q_n_pre.comp" 10 | 11 | // The q4_1 version of this function 12 | float block_q_n_dot_y(uint block_index, uint yb, uint il) { 13 | vec2 acc = vec2(0.0, 0.0); 14 | const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff; 15 | float d = float(u8BufToFloat16(inA, index)); 16 | float m = float(u8BufToFloat16(inA, index+2)); 17 | 18 | float sumy = 0.0f; 19 | for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) { 20 | const uint16_t b = u8BufToU16(inA, index + 4 + il + i); 21 | 22 | const float yl0 = inB[yb + i]; 23 | const float yl1 = inB[yb + i + 1]; 24 | const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2]; 25 | const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1]; 26 | 27 | sumy += yl0 + yl1 + yl8 + yl9; 28 | 29 | acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00); 30 | acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000); 31 | } 32 | return d * (acc[0] + acc[1]) + sumy * m; 33 | } 34 | 35 | #include "op_mul_mv_q_n.comp" 36 | -------------------------------------------------------------------------------- /ggml/src/kompute-shaders/op_mul_mv_q_n_pre.comp: -------------------------------------------------------------------------------- 1 | layout(local_size_x_id = 0) in; 2 | layout(local_size_y = 1) in; 3 | layout(local_size_z = 1) in; 4 | 5 | layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; 6 | layout (binding = 1) readonly buffer tensorInB { float inB[]; }; 7 | layout (binding = 2) writeonly 
buffer tensorOut { float out_[]; }; 8 | 9 | layout (push_constant) uniform parameter { 10 | uint inAOff; 11 | uint inBOff; 12 | uint outOff; 13 | int ne00; 14 | int ne01; 15 | int ne02; 16 | int ne10; 17 | int ne12; 18 | int ne0; 19 | int ne1; 20 | uint r2; 21 | uint r3; 22 | } pcs; 23 | -------------------------------------------------------------------------------- /ggml/src/kompute-shaders/op_relu.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | layout(local_size_x = 1) in; 6 | 7 | layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; 8 | layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; 9 | layout(push_constant) uniform PushConstants { 10 | uint inOff; 11 | uint outOff; 12 | } pcs; 13 | 14 | void main() { 15 | const uint baseIndex = gl_WorkGroupID.x * 4; 16 | 17 | for (uint x = 0; x < 4; x++) { 18 | const uint i = baseIndex + x; 19 | out_[i + pcs.outOff] = max(0.0, in_[i + pcs.inOff]); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /ggml/src/kompute-shaders/op_scale.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | layout(local_size_x = 1) in; 6 | 7 | layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; 8 | layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; 9 | 10 | layout(push_constant) uniform PushConstants { 11 | uint inOff; 12 | uint outOff; 13 | float scale; 14 | } pcs; 15 | 16 | void main() { 17 | const uint i = gl_WorkGroupID.x; 18 | out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale; 19 | } 20 | -------------------------------------------------------------------------------- /ggml/src/kompute-shaders/op_scale_8.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | layout(local_size_x = 1) in; 6 | 7 | layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; 8 | layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; 9 | 10 | layout(push_constant) uniform PushConstants { 11 | uint inOff; 12 | uint outOff; 13 | float scale; 14 | } pcs; 15 | 16 | void main() { 17 | const uint baseIndex = gl_WorkGroupID.x * 8; 18 | 19 | for (uint x = 0; x < 8; x++) { 20 | const uint i = baseIndex + x; 21 | out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale; 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /ggml/src/kompute-shaders/op_silu.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | layout(local_size_x = 1) in; 6 | 7 | layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; 8 | layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; 9 | layout(push_constant) uniform PushConstants { 10 | uint inOff; 11 | uint outOff; 12 | } pcs; 13 | 14 | void main() { 15 | const uint baseIndex = gl_WorkGroupID.x * 4; 16 | 17 | for (uint x = 0; x < 4; x++) { 18 | const uint i = baseIndex + x; 19 | const float y = in_[i + pcs.inOff]; 20 | out_[i + pcs.outOff] = y / (1.0 + exp(-y)); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /ggml/src/llamafile/sgemm.h: -------------------------------------------------------------------------------- 1 | 
#pragma once 2 | #include <stdint.h> 3 | #include <stdbool.h> 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | bool llamafile_sgemm(int64_t, int64_t, int64_t, const void *, int64_t, 9 | const void *, int64_t, void *, int64_t, int, int, 10 | int, int, int); 11 | 12 | #ifdef __cplusplus 13 | } 14 | #endif 15 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | find_package (Threads REQUIRED) 2 | 3 | set(TARGET vulkan-shaders-gen) 4 | add_executable(${TARGET} vulkan-shaders-gen.cpp) 5 | install(TARGETS ${TARGET} RUNTIME) 6 | target_compile_features(${TARGET} PRIVATE cxx_std_11) 7 | target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads) 8 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/acc.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_binary_head.comp" 5 | 6 | void main() { 7 | const uint idx = gl_GlobalInvocationID.x; 8 | if (idx >= p.ne) { 9 | return; 10 | } 11 | 12 | const uint offset = p.param3; 13 | const uint src1_i = idx - offset; 14 | const uint oz = src1_i / p.nb02; 15 | const uint oy = (src1_i - (oz * p.nb02)) / p.nb01; 16 | const uint ox = src1_i % p.nb01; 17 | 18 | if (ox < p.ne10 && oy < p.ne11 && oz < p.ne12) { 19 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) + FLOAT_TYPE(data_b[ox + oy * p.ne10 + oz * p.ne10 * p.ne11])); 20 | } else { 21 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)])); 22 | } 23 | } 24 | 25 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/add.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_binary_head.comp" 5 | 6 | void main() { 7 | const uint idx = get_idx(); 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) + FLOAT_TYPE(data_b[src1_idx(idx)])); 14 | } 15 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/clamp.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_unary_head.comp" 5 | 6 | void main() { 7 | const uint idx = get_idx(); 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]); 14 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? 
p.param2 : val)); 15 | } 16 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/copy.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_unary_head.comp" 5 | 6 | void main() { 7 | const uint idx = get_idx(); 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | #ifndef OPTIMIZATION_ERROR_WORKAROUND 14 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx(idx)]); 15 | #else 16 | data_d[p.d_offset + dst_idx(idx)] = data_a[src0_idx(idx)]; 17 | #endif 18 | } 19 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/cos.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_unary_head.comp" 5 | 6 | void main() { 7 | const uint idx = get_idx(); 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]); 14 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(cos(val)); 15 | } 16 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/dequant_f32.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "dequant_head.comp" 4 | 5 | layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; 6 | 7 | layout (binding = 0) readonly buffer A {float data_a[];}; 8 | layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; 9 | 10 | void main() { 11 | const uint i = gl_GlobalInvocationID.x * 16; 12 | 13 | if (i >= p.nel) { 14 | return; 15 | } 16 | 17 | [[unroll]] for (uint l = 0; l < 16; l++) { 18 | data_b[i + l] = D_TYPE(data_a[i + l]); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/dequant_head.comp: -------------------------------------------------------------------------------- 1 | #extension GL_EXT_control_flow_attributes : require 2 | #extension GL_EXT_shader_16bit_storage : require 3 | 4 | layout (push_constant) uniform parameter 5 | { 6 | uint M; 7 | uint K; 8 | uint stride_a; 9 | uint stride_b; 10 | uint nel; 11 | } p; 12 | 13 | #include "types.comp" 14 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/dequant_iq4_nl.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "dequant_head.comp" 4 | 5 | layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; 6 | 7 | layout (binding = 0) readonly buffer A {block_iq4_nl data_a[];}; 8 | layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; 9 | 10 | void main() { 11 | const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64; 12 | 13 | const uint tid = gl_LocalInvocationID.x % 64; 14 | const uint il = tid/32; 15 | const uint ir = tid%32; 16 | const uint ib = 32*i + ir; 17 | if (ib >= p.nel / 32) { 18 | return; 19 | } 20 | 21 | const uint q_idx = 8*il; 22 | const uint b_idx = 1024*i + 32*ir + q_idx; 23 | 24 | const float d = float(data_a[ib].d); 25 | 26 | [[unroll]] for (uint l = 0; l < 8; ++l) { 27 | data_b[b_idx + l + 0] = D_TYPE(d * kvalues_iq4nl[data_a[ib].qs[q_idx + l] & 0xF]); 28 | data_b[b_idx + l + 16] = D_TYPE(d * kvalues_iq4nl[data_a[ib].qs[q_idx + l] >> 4]); 29 | } 30 | } 31 | 
-------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/dequant_q4_0.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "dequant_head.comp" 4 | 5 | layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; 6 | 7 | layout (binding = 0) readonly buffer A {block_q4_0 data_a[];}; 8 | layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; 9 | 10 | void main() { 11 | const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64; 12 | 13 | const uint tid = gl_LocalInvocationID.x % 64; 14 | const uint il = tid/32; 15 | const uint ir = tid%32; 16 | const uint ib = 32*i + ir; 17 | if (ib >= p.nel / 32) { 18 | return; 19 | } 20 | 21 | const uint q_idx = 8*il; 22 | const uint b_idx = 1024*i + 32*ir + q_idx; 23 | 24 | const float d = float(data_a[ib].d); 25 | 26 | [[unroll]] for (uint l = 0; l < 8; ++l) { 27 | data_b[b_idx + l + 0] = D_TYPE(d * ((data_a[ib].qs[q_idx + l] & 0xF) - 8.0f)); 28 | data_b[b_idx + l + 16] = D_TYPE(d * ((data_a[ib].qs[q_idx + l] >> 4) - 8.0f)); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/dequant_q4_1.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "dequant_head.comp" 4 | 5 | layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; 6 | 7 | layout (binding = 0) readonly buffer A {block_q4_1 data_a[];}; 8 | layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; 9 | 10 | void main() { 11 | const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64; 12 | 13 | const uint tid = gl_LocalInvocationID.x % 64; 14 | const uint il = tid/32; 15 | const uint ir = tid%32; 16 | const uint ib = 32*i + ir; 17 | if (ib >= p.nel / 32) { 18 | return; 19 | } 20 | 21 | const uint b_idx = 1024*i + 32*ir + 8*il; 22 | 23 | const float d = float(data_a[ib].d); 24 | const float m = float(data_a[ib].m); 25 | 26 | const uint q_idx = 8*il; 27 | 28 | [[unroll]] for (uint l = 0; l < 8; ++l) { 29 | data_b[b_idx + l + 0] = D_TYPE(d * (data_a[ib].qs[q_idx + l] & 0xF) + m); 30 | data_b[b_idx + l + 16] = D_TYPE(d * (data_a[ib].qs[q_idx + l] >> 4) + m); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/dequant_q5_0.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "dequant_head.comp" 4 | 5 | layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; 6 | 7 | layout (binding = 0) readonly buffer A {block_q5_0 data_a[];}; 8 | layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; 9 | 10 | void main() { 11 | const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64; 12 | 13 | const uint tid = gl_LocalInvocationID.x % 64; 14 | const uint il = tid/32; 15 | const uint ir = tid%32; 16 | const uint ib = 32*i + ir; 17 | if (ib >= p.nel / 32) { 18 | return; 19 | } 20 | 21 | const uint b_idx = 1024*i + 32*ir + 8*il; 22 | 23 | const float d = float(data_a[ib].d); 24 | const uint qh = uint(data_a[ib].qh[1]) << 16 | data_a[ib].qh[0]; 25 | 26 | const uint q_idx = 8*il; 27 | 28 | [[unroll]] for (uint l = 0; l < 8; ++l) { 29 | const uint iqs = q_idx + l; 30 | const uint vui = uint(data_a[ib].qs[iqs]); 31 | data_b[b_idx + l + 0] = D_TYPE(d * (((vui & 0xF) | (((qh >> iqs) << 4) & 0x10)) - 16.0f)); 32 | data_b[b_idx + l + 16] = D_TYPE(d * (((vui >> 4) | ((qh >> 
(iqs + 12)) & 0x10)) - 16.0f)); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/dequant_q5_1.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "dequant_head.comp" 4 | 5 | layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; 6 | 7 | layout (binding = 0) readonly buffer A {block_q5_1 data_a[];}; 8 | layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; 9 | 10 | void main() { 11 | const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64; 12 | 13 | const uint tid = gl_LocalInvocationID.x % 64; 14 | const uint il = tid/32; 15 | const uint ir = tid%32; 16 | const uint ib = 32*i + ir; 17 | if (ib >= p.nel / 32) { 18 | return; 19 | } 20 | 21 | const uint b_idx = 1024*i + 32*ir + 8*il; 22 | 23 | const float d = float(data_a[ib].d); 24 | const float m = float(data_a[ib].m); 25 | const uint qh = data_a[ib].qh; 26 | 27 | const uint q_idx = 8*il; 28 | 29 | [[unroll]] for (uint l = 0; l < 8; ++l) { 30 | const uint iqs = q_idx + l; 31 | const uint vui = uint(data_a[ib].qs[iqs]); 32 | data_b[b_idx + l + 0] = D_TYPE(d * (((vui & 0xF) | (((qh >> iqs) << 4) & 0x10))) + m); 33 | data_b[b_idx + l + 16] = D_TYPE(d * (((vui >> 4) | ((qh >> (iqs + 12)) & 0x10))) + m); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/dequant_q8_0.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "dequant_head.comp" 4 | 5 | layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; 6 | 7 | layout (binding = 0) readonly buffer A {block_q8_0 data_a[];}; 8 | layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; 9 | 10 | void main() { 11 | const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64; 12 | 13 | const uint tid = gl_LocalInvocationID.x % 64; 14 | const uint il = tid/32; 15 | const uint ir = tid%32; 16 | const uint ib = 32*i + ir; 17 | if (ib >= p.nel / 32) { 18 | return; 19 | } 20 | 21 | const uint b_idx = 1024*i + 32*ir + 16*il; 22 | 23 | const float d = float(data_a[ib].d); 24 | 25 | const uint q_idx = 16*il; 26 | 27 | [[unroll]] for (uint l = 0; l < 16; l += 2) { 28 | data_b[b_idx + l ] = D_TYPE(d * data_a[ib].qs[q_idx + l ]); 29 | data_b[b_idx + l + 1] = D_TYPE(d * data_a[ib].qs[q_idx + l + 1]); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/diag_mask_inf.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #extension GL_EXT_shader_16bit_storage : require 4 | #extension GL_EXT_control_flow_attributes : enable 5 | 6 | layout (push_constant) uniform parameter 7 | { 8 | uint ncols; 9 | uint rows_per_channel; 10 | uint n_past; 11 | } p; 12 | 13 | #include "types.comp" 14 | 15 | layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; 16 | 17 | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; 18 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 19 | 20 | void main() { 21 | const uint col = gl_GlobalInvocationID.y; 22 | const uint row = gl_GlobalInvocationID.x; 23 | 24 | if (col >= p.ncols) { 25 | return; 26 | } 27 | 28 | const uint i = row*p.ncols + col; 29 | if (col > p.n_past + row % p.rows_per_channel) { 30 | data_d[i] = D_TYPE(uintBitsToFloat(0xFF800000)); 31 | } else { 32 | data_d[i] = D_TYPE(data_a[i]); 33 | } 34 | 
} 35 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/div.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_binary_head.comp" 5 | 6 | void main() { 7 | const uint idx = get_idx(); 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) / FLOAT_TYPE(data_b[src1_idx(idx)])); 14 | } 15 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/gelu.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "generic_head.comp" 4 | #include "types.comp" 5 | 6 | #extension GL_EXT_control_flow_attributes : enable 7 | 8 | layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; 9 | 10 | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; 11 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 12 | 13 | void main() { 14 | const float GELU_COEF_A = 0.044715f; 15 | const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; 16 | const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; 17 | 18 | if (i >= p.KX) { 19 | return; 20 | } 21 | 22 | const float xi = float(data_a[i]); 23 | const float val = SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi); 24 | data_d[i] = D_TYPE(0.5f*xi*(2.0f - 2.0f / (exp(2 * val) + 1))); 25 | } 26 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/gelu_quick.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "generic_head.comp" 4 | #include "types.comp" 5 | 6 | #extension GL_EXT_control_flow_attributes : enable 7 | 8 | layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; 9 | 10 | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; 11 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 12 | 13 | void main() { 14 | const float GELU_QUICK_COEF = -1.702f; 15 | const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; 16 | 17 | if (i >= p.KX) { 18 | return; 19 | } 20 | 21 | const float x = float(data_a[i]); 22 | data_d[i] = D_TYPE(x * (1.0f / (1.0f + exp(GELU_QUICK_COEF * x)))); 23 | } 24 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/generic_head.comp: -------------------------------------------------------------------------------- 1 | #extension GL_EXT_shader_16bit_storage : require 2 | 3 | layout (push_constant) uniform parameter 4 | { 5 | uint KX; 6 | uint KY; 7 | float param1; 8 | float param2; 9 | } p; 10 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/get_rows.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_binary_head.comp" 5 | 6 | void main() { 7 | const uint i00 = gl_GlobalInvocationID.x; 8 | const uint i10 = gl_GlobalInvocationID.y; 9 | const uint i11 = (gl_GlobalInvocationID.z)/p.ne12; 10 | const uint i12 = (gl_GlobalInvocationID.z)%p.ne12; 11 | 12 | if (i00 >= p.ne00) { 13 | return; 14 | } 15 | 16 | const uint i01 = data_b[i10*p.nb10 + i11*p.nb11 + i12*p.nb12]; 17 | 18 | const uint a_offset = i01*p.nb01 + 
i11*p.nb02 + i12*p.nb03; 19 | const uint d_offset = i10*p.nb21 + i11*p.nb22 + i12*p.nb23; 20 | 21 | #ifndef OPTIMIZATION_ERROR_WORKAROUND 22 | data_d[d_offset + i00] = D_TYPE(data_a[a_offset + i00]); 23 | #else 24 | data_d[d_offset + i00] = data_a[a_offset + i00]; 25 | #endif 26 | } 27 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/get_rows_quant.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_binary_head.comp" 5 | #include "dequant_funcs.comp" 6 | 7 | void main() { 8 | const uint i00 = (gl_GlobalInvocationID.x)*2; 9 | const uint i10 = gl_GlobalInvocationID.y; 10 | const uint i11 = (gl_GlobalInvocationID.z)/p.ne12; 11 | const uint i12 = (gl_GlobalInvocationID.z)%p.ne12; 12 | 13 | if (i00 >= p.ne00) { 14 | return; 15 | } 16 | 17 | const uint i01 = data_b[i10*p.nb10 + i11*p.nb11 + i12*p.nb12]; 18 | 19 | const uint a_offset = i01*p.nb01 + i11*p.nb02 + i12*p.nb03; 20 | const uint d_offset = i10*p.nb21 + i11*p.nb22 + i12*p.nb23; 21 | 22 | const uint ib = a_offset + i00/QUANT_K; // block index 23 | const uint iqs = (i00%QUANT_K)/QUANT_R; // quant index 24 | const uint iybs = i00 - i00%QUANT_K; // dst block start index 25 | const uint y_offset = QUANT_R == 1 ? 1 : QUANT_K/2; 26 | 27 | vec2 v = dequantize(ib, iqs, 0); 28 | 29 | data_d[d_offset + iybs + iqs ] = D_TYPE(v.x); 30 | data_d[d_offset + iybs + iqs + y_offset] = D_TYPE(v.y); 31 | } 32 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/leaky_relu.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "generic_head.comp" 4 | #include "types.comp" 5 | 6 | #extension GL_EXT_control_flow_attributes : enable 7 | 8 | layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; 9 | 10 | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; 11 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 12 | 13 | void main() { 14 | const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; 15 | 16 | if (i >= p.KX) { 17 | return; 18 | } 19 | 20 | const float val = float(data_a[i]); 21 | data_d[i] = D_TYPE(max(val, 0.0f) + min(val, 0.0f) * p.param1); 22 | } 23 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/mul.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_binary_head.comp" 5 | 6 | void main() { 7 | const uint idx = get_idx(); 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(data_b[src1_idx(idx)])); 14 | } 15 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/mul_mat_split_k_reduce.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #extension GL_EXT_control_flow_attributes : enable 4 | 5 | layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; 6 | 7 | layout (binding = 0) readonly buffer A {float data_a[];}; 8 | layout (binding = 1) writeonly buffer D {float data_d[];}; 9 | 10 | layout (push_constant) uniform parameter { 11 | uint ne; 12 | uint k_num; 13 | } p; 14 | 15 | void main() { 16 | const uint idx = 
gl_GlobalInvocationID.x; 17 | 18 | if (idx >= p.ne) { 19 | return; 20 | } 21 | 22 | float result = 0.0f; 23 | 24 | [[unroll]] for (uint i = 0; i < p.k_num; i++) { 25 | result += data_a[i * p.ne + idx]; 26 | } 27 | 28 | data_d[idx] = result; 29 | } 30 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/pad.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_unary_head.comp" 5 | 6 | void main() { 7 | const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | const uint i3 = idx / (p.ne12*p.ne11*p.ne10); 14 | const uint i3_offset = i3 * p.ne12*p.ne11*p.ne10; 15 | const uint i2 = (idx - i3_offset) / (p.ne11*p.ne10); 16 | const uint i2_offset = i2*p.ne11*p.ne10; 17 | const uint i1 = (idx - i3_offset - i2_offset) / p.ne10; 18 | const uint i0 = idx - i3_offset - i2_offset - i1*p.ne10; 19 | 20 | const uint src0_idx = i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0*p.nb00; 21 | const uint dst_idx = i3*p.nb13 + i2*p.nb12 + i1*p.nb11 + i0*p.nb10; 22 | 23 | const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03; 24 | 25 | data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : 0.0f); 26 | } 27 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/relu.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "generic_head.comp" 4 | #include "types.comp" 5 | 6 | #extension GL_EXT_control_flow_attributes : enable 7 | 8 | layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; 9 | 10 | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; 11 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 12 | 13 | void main() { 14 | const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; 15 | 16 | if (i >= p.KX) { 17 | return; 18 | } 19 | 20 | data_d[i] = max(float(data_a[i]), 0); 21 | } 22 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/repeat.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_unary_head.comp" 5 | 6 | uint src0_idx_mod(uint idx) { 7 | const uint i13 = idx / (p.ne12*p.ne11*p.ne10); 8 | const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10; 9 | const uint i12 = (idx - i13_offset) / (p.ne11*p.ne10); 10 | const uint i12_offset = i12*p.ne11*p.ne10; 11 | const uint i11 = (idx - i13_offset - i12_offset) / p.ne10; 12 | const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10; 13 | return (i13 % p.ne03)*p.nb03 + (i12 % p.ne02)*p.nb02 + (i11 % p.ne01)*p.nb01 + (i10 % p.ne00)*p.nb00; 14 | } 15 | 16 | void main() { 17 | const uint idx = get_idx(); 18 | 19 | if (idx >= p.ne) { 20 | return; 21 | } 22 | 23 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx_mod(idx)]); 24 | } 25 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/rope_neox.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "rope_head.comp" 4 | 5 | void main() { 6 | const uint col = gl_GlobalInvocationID.y * 2; 7 | const uint row = gl_GlobalInvocationID.x; 8 | 9 | if (col >= 
p.ncols) { 10 | return; 11 | } 12 | 13 | if (col >= p.n_dims) { 14 | const uint i = row*p.ncols + col; 15 | 16 | data_d[i + 0] = data_a[i + 0]; 17 | data_d[i + 1] = data_a[i + 1]; 18 | 19 | return; 20 | } 21 | 22 | const uint i = row*p.ncols + col/2; 23 | const uint i2 = row/p.p_delta_rows; 24 | 25 | const float theta_base = data_pos[i2] * pow(p.theta_scale, col/2.0f); 26 | 27 | const float freq_factor = p.has_ff != 0 ? data_ff[col/2] : 1.0f; 28 | 29 | float cos_theta, sin_theta; 30 | rope_yarn(theta_base / freq_factor, col, cos_theta, sin_theta); 31 | 32 | const float x0 = float(data_a[i + 0]); 33 | const float x1 = float(data_a[i + p.n_dims/2]); 34 | 35 | data_d[i + 0] = D_TYPE(x0*cos_theta - x1*sin_theta); 36 | data_d[i + p.n_dims/2] = D_TYPE(x0*sin_theta + x1*cos_theta); 37 | } 38 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/rope_norm.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "rope_head.comp" 4 | 5 | void main() { 6 | const uint col = gl_GlobalInvocationID.y * 2; 7 | const uint row = gl_GlobalInvocationID.x; 8 | 9 | if (col >= p.ncols) { 10 | return; 11 | } 12 | 13 | if (col >= p.n_dims) { 14 | const uint i = row*p.ncols + col; 15 | 16 | data_d[i + 0] = data_a[i + 0]; 17 | data_d[i + 1] = data_a[i + 1]; 18 | 19 | return; 20 | } 21 | 22 | const uint i = row*p.ncols + col; 23 | const uint i2 = row/p.p_delta_rows; 24 | 25 | const float theta_base = data_pos[i2] * pow(p.theta_scale, col/2.0f); 26 | 27 | const float freq_factor = p.has_ff != 0 ? data_ff[col/2] : 1.0f; 28 | 29 | float cos_theta, sin_theta; 30 | rope_yarn(theta_base / freq_factor, col, cos_theta, sin_theta); 31 | 32 | const float x0 = float(data_a[i + 0]); 33 | const float x1 = float(data_a[i + 1]); 34 | 35 | data_d[i + 0] = D_TYPE(x0*cos_theta - x1*sin_theta); 36 | data_d[i + 1] = D_TYPE(x0*sin_theta + x1*cos_theta); 37 | } 38 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/scale.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_unary_head.comp" 5 | 6 | void main() { 7 | const uint idx = get_idx(); 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(p.param1)); 14 | } 15 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/silu.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "generic_head.comp" 4 | #include "types.comp" 5 | 6 | #extension GL_EXT_control_flow_attributes : enable 7 | 8 | layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; 9 | 10 | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; 11 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 12 | 13 | void main() { 14 | const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; 15 | 16 | if (i >= p.KX) { 17 | return; 18 | } 19 | 20 | const float xi = float(data_a[i]); 21 | data_d[i] = D_TYPE(xi / (1.0f + exp(-xi))); 22 | } 23 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/sin.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 
4 | #include "generic_unary_head.comp" 5 | 6 | void main() { 7 | const uint idx = get_idx(); 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]); 14 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(sin(val)); 15 | } 16 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/square.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_unary_head.comp" 5 | 6 | void main() { 7 | const uint idx = get_idx(); 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]); 14 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val * val); 15 | } 16 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/sum_rows.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "generic_head.comp" 4 | #include "types.comp" 5 | 6 | #extension GL_EXT_control_flow_attributes : enable 7 | layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; 8 | 9 | layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; 10 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 11 | 12 | layout (constant_id = 0) const uint BLOCK_SIZE = 32; 13 | 14 | shared FLOAT_TYPE tmp[BLOCK_SIZE]; 15 | 16 | void main() { 17 | const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x; 18 | const uint col = gl_LocalInvocationID.x; 19 | 20 | tmp[col] = FLOAT_TYPE(0.0f); 21 | 22 | for (uint i = col; i < p.KX; i += BLOCK_SIZE) { 23 | tmp[col] += FLOAT_TYPE(data_a[row*p.KX + i]); 24 | } 25 | 26 | barrier(); 27 | [[unroll]] for (int s = int(BLOCK_SIZE) / 2; s > 0; s >>= 1) { 28 | if (col < s) { 29 | tmp[col] += tmp[col + s]; 30 | } 31 | barrier(); 32 | } 33 | 34 | if (col == 0) { 35 | data_d[row] = D_TYPE(tmp[0]); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/tanh.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "generic_head.comp" 4 | #include "types.comp" 5 | 6 | #extension GL_EXT_control_flow_attributes : enable 7 | 8 | layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; 9 | 10 | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; 11 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 12 | 13 | void main() { 14 | const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; 15 | 16 | if (i >= p.KX) { 17 | return; 18 | } 19 | 20 | data_d[i] = D_TYPE(tanh(data_a[i])); 21 | } 22 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/timestep_embedding.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #extension GL_EXT_shader_16bit_storage : require 4 | 5 | layout (push_constant) uniform parameter 6 | { 7 | uint nb1; 8 | uint dim; 9 | uint max_period; 10 | } p; 11 | 12 | #include "types.comp" 13 | 14 | #extension GL_EXT_control_flow_attributes : enable 15 | #define BLOCK_SIZE 256 16 | 17 | layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in; 18 | 19 | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; 20 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 
21 | 22 | void main() { 23 | const uint i = gl_WorkGroupID.y; 24 | const uint j = gl_GlobalInvocationID.x; 25 | const uint d_offset = i * p.nb1; 26 | 27 | if (p.dim % 2 != 0 && j == ((p.dim + 1) / 2)) { 28 | data_d[d_offset + p.dim] = 0.f; 29 | } 30 | 31 | const uint half_dim = p.dim / 2; 32 | if (j >= half_dim) { 33 | return; 34 | } 35 | 36 | const float timestep = float(data_a[i]); 37 | const float freq = float(exp(-log(p.max_period) * j / half_dim)); 38 | const float arg = timestep * freq; 39 | data_d[d_offset + j] = D_TYPE(cos(arg)); 40 | data_d[d_offset + j + half_dim] = D_TYPE(sin(arg)); 41 | } 42 | -------------------------------------------------------------------------------- /ggml/src/vulkan-shaders/upscale.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | layout (push_constant) uniform parameter 4 | { 5 | uint ne; uint d_offset; 6 | uint nb00; uint nb01; uint nb02; uint nb03; 7 | uint ne10; uint ne11; uint ne12; uint ne13; 8 | float sf0; float sf1; float sf2; float sf3; 9 | } p; 10 | 11 | #include "types.comp" 12 | 13 | layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; 14 | 15 | layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; 16 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 17 | 18 | void main() { 19 | const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; 20 | 21 | if (idx >= p.ne) { 22 | return; 23 | } 24 | 25 | const uint i10 = idx % p.ne10; 26 | const uint i11 = (idx / p.ne10) % p.ne11; 27 | const uint i12 = (idx / (p.ne10 * p.ne11)) % p.ne12; 28 | const uint i13 = (idx / (p.ne10 * p.ne11 * p.ne12)) % p.ne13; 29 | 30 | const uint i00 = uint(i10 / p.sf0); 31 | const uint i01 = uint(i11 / p.sf1); 32 | const uint i02 = uint(i12 / p.sf2); 33 | const uint i03 = uint(i13 / p.sf3); 34 | 35 | data_d[p.d_offset + idx] = D_TYPE(data_a[i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00]); 36 | } 37 | -------------------------------------------------------------------------------- /gguf-py/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Georgi Gerganov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /gguf-py/examples/writer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | 7 | # Necessary to load the local gguf package 8 | sys.path.insert(0, str(Path(__file__).parent.parent)) 9 | 10 | from gguf import GGUFWriter # noqa: E402 11 | 12 | 13 | # Example usage: 14 | def writer_example() -> None: 15 | # Example usage with a file 16 | gguf_writer = GGUFWriter("example.gguf", "llama") 17 | 18 | gguf_writer.add_block_count(12) 19 | gguf_writer.add_uint32("answer", 42) # Write a 32-bit integer 20 | gguf_writer.add_float32("answer_in_float", 42.0) # Write a 32-bit float 21 | gguf_writer.add_custom_alignment(64) 22 | 23 | tensor1 = np.ones((32,), dtype=np.float32) * 100.0 24 | tensor2 = np.ones((64,), dtype=np.float32) * 101.0 25 | tensor3 = np.ones((96,), dtype=np.float32) * 102.0 26 | 27 | gguf_writer.add_tensor("tensor1", tensor1) 28 | gguf_writer.add_tensor("tensor2", tensor2) 29 | gguf_writer.add_tensor("tensor3", tensor3) 30 | 31 | gguf_writer.write_header_to_file() 32 | gguf_writer.write_kv_data_to_file() 33 | gguf_writer.write_tensors_to_file() 34 | 35 | gguf_writer.close() 36 | 37 | 38 | if __name__ == '__main__': 39 | writer_example() 40 | -------------------------------------------------------------------------------- /gguf-py/gguf/__init__.py: -------------------------------------------------------------------------------- 1 | from .constants import * 2 | from .lazy import * 3 | from .gguf_reader import * 4 | from .gguf_writer import * 5 | from .quants import * 6 | from .tensor_mapping import * 7 | from .vocab import * 8 | from .utility import * 9 | from .metadata import * 10 | -------------------------------------------------------------------------------- /gguf-py/gguf/gguf.py: -------------------------------------------------------------------------------- 1 | # This file left for compatibility. If you want to use the GGUF API from Python 2 | # then don't import gguf/gguf.py directly. If you're looking for examples, see the 3 | # examples/ directory for gguf-py 4 | 5 | import importlib 6 | import sys 7 | from pathlib import Path 8 | 9 | sys.path.insert(0, str(Path(__file__).parent.parent)) 10 | 11 | # Compatibility for people trying to import gguf/gguf.py directly instead of as a package. 
12 | importlib.invalidate_caches() 13 | import gguf # noqa: E402 14 | 15 | importlib.reload(gguf) 16 | -------------------------------------------------------------------------------- /gguf-py/gguf/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/gguf-py/gguf/py.typed -------------------------------------------------------------------------------- /gguf-py/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "gguf" 3 | version = "0.10.0" 4 | description = "Read and write ML models in GGUF for GGML" 5 | authors = ["GGML "] 6 | packages = [ 7 | {include = "gguf"}, 8 | {include = "gguf/py.typed"}, 9 | {include = "scripts"}, 10 | ] 11 | readme = "README.md" 12 | homepage = "https://ggml.ai" 13 | repository = "https://github.com/ggerganov/llama.cpp" 14 | keywords = ["ggml", "gguf", "llama.cpp"] 15 | classifiers = [ 16 | "Programming Language :: Python :: 3", 17 | "License :: OSI Approved :: MIT License", 18 | "Operating System :: OS Independent", 19 | ] 20 | 21 | [tool.poetry.dependencies] 22 | python = ">=3.8" 23 | numpy = ">=1.17" 24 | tqdm = ">=4.27" 25 | pyyaml = ">=5.1" 26 | sentencepiece = ">=0.1.98,<=0.2.0" 27 | 28 | [tool.poetry.dev-dependencies] 29 | pytest = "^5.2" 30 | 31 | [build-system] 32 | requires = ["poetry-core>=1.0.0"] 33 | build-backend = "poetry.core.masonry.api" 34 | 35 | [tool.poetry.scripts] 36 | gguf-convert-endian = "scripts:gguf_convert_endian_entrypoint" 37 | gguf-dump = "scripts:gguf_dump_entrypoint" 38 | gguf-set-metadata = "scripts:gguf_set_metadata_entrypoint" 39 | gguf-new-metadata = "scripts:gguf_new_metadata_entrypoint" 40 | -------------------------------------------------------------------------------- /gguf-py/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | # pyright: reportUnusedImport=false 2 | 3 | from .gguf_convert_endian import main as gguf_convert_endian_entrypoint 4 | from .gguf_dump import main as gguf_dump_entrypoint 5 | from .gguf_set_metadata import main as gguf_set_metadata_entrypoint 6 | from .gguf_new_metadata import main as gguf_new_metadata_entrypoint 7 | -------------------------------------------------------------------------------- /gguf-py/tests/__init__.py: -------------------------------------------------------------------------------- 1 | from .test_metadata import * 2 | -------------------------------------------------------------------------------- /grammars/arithmetic.gbnf: -------------------------------------------------------------------------------- 1 | root ::= (expr "=" ws term "\n")+ 2 | expr ::= term ([-+*/] term)* 3 | term ::= ident | num | "(" ws expr ")" ws 4 | ident ::= [a-z] [a-z0-9_]* ws 5 | num ::= [0-9]+ ws 6 | ws ::= [ \t\n]* 7 | -------------------------------------------------------------------------------- /grammars/chess.gbnf: -------------------------------------------------------------------------------- 1 | # Specifies chess moves as a list in algebraic notation, using PGN conventions 2 | 3 | # Force first move to "1. ", then any 1-2 digit number after, relying on model to follow the pattern 4 | root ::= "1. " move " " move "\n" ([1-9] [0-9]? ". " move " " move "\n")+ 5 | move ::= (pawn | nonpawn | castle) [+#]? 6 | 7 | # piece type, optional file/rank, optional capture, dest file & rank 8 | nonpawn ::= [NBKQR] [a-h]? [1-8]? "x"? 
[a-h] [1-8] 9 | 10 | # optional file & capture, dest file & rank, optional promotion 11 | pawn ::= ([a-h] "x")? [a-h] [1-8] ("=" [NBKQR])? 12 | 13 | castle ::= "O-O" "-O"? 14 | -------------------------------------------------------------------------------- /grammars/japanese.gbnf: -------------------------------------------------------------------------------- 1 | # A probably incorrect grammar for Japanese 2 | root ::= jp-char+ ([ \t\n] jp-char+)* 3 | jp-char ::= hiragana | katakana | punctuation | cjk 4 | hiragana ::= [ぁ-ゟ] 5 | katakana ::= [ァ-ヿ] 6 | punctuation ::= [、-〾] 7 | cjk ::= [一-鿿] 8 | -------------------------------------------------------------------------------- /grammars/json.gbnf: -------------------------------------------------------------------------------- 1 | root ::= object 2 | value ::= object | array | string | number | ("true" | "false" | "null") ws 3 | 4 | object ::= 5 | "{" ws ( 6 | string ":" ws value 7 | ("," ws string ":" ws value)* 8 | )? "}" ws 9 | 10 | array ::= 11 | "[" ws ( 12 | value 13 | ("," ws value)* 14 | )? "]" ws 15 | 16 | string ::= 17 | "\"" ( 18 | [^"\\\x7F\x00-\x1F] | 19 | "\\" (["\\bfnrt] | "u" [0-9a-fA-F]{4}) # escapes 20 | )* "\"" ws 21 | 22 | number ::= ("-"? ([0-9] | [1-9] [0-9]{0,15})) ("." [0-9]+)? ([eE] [-+]? [0-9] [1-9]{0,15})? ws 23 | 24 | # Optional space: by convention, applied in this grammar after literal chars when allowed 25 | ws ::= | " " | "\n" [ \t]{0,20} 26 | -------------------------------------------------------------------------------- /grammars/json_arr.gbnf: -------------------------------------------------------------------------------- 1 | # This is the same as json.gbnf but we restrict whitespaces at the end of the root array 2 | # Useful for generating JSON arrays 3 | 4 | root ::= arr 5 | value ::= object | array | string | number | ("true" | "false" | "null") ws 6 | 7 | arr ::= 8 | "[\n" ws ( 9 | value 10 | (",\n" ws value)* 11 | )? "]" 12 | 13 | object ::= 14 | "{" ws ( 15 | string ":" ws value 16 | ("," ws string ":" ws value)* 17 | )? "}" ws 18 | 19 | array ::= 20 | "[" ws ( 21 | value 22 | ("," ws value)* 23 | )? "]" ws 24 | 25 | string ::= 26 | "\"" ( 27 | [^"\\\x7F\x00-\x1F] | 28 | "\\" (["\\bfnrt] | "u" [0-9a-fA-F]{4}) # escapes 29 | )* "\"" ws 30 | 31 | number ::= ("-"? ([0-9] | [1-9] [0-9]{0,15})) ("." [0-9]+)? ([eE] [-+]? [1-9] [0-9]{0,15})? 
ws 32 | 33 | # Optional space: by convention, applied in this grammar after literal chars when allowed 34 | ws ::= | " " | "\n" [ \t]{0,20} 35 | -------------------------------------------------------------------------------- /grammars/list.gbnf: -------------------------------------------------------------------------------- 1 | root ::= item+ 2 | 3 | # Excludes various line break characters 4 | item ::= "- " [^\r\n\x0b\x0c\x85\u2028\u2029]+ "\n" 5 | -------------------------------------------------------------------------------- /lang-cli-src/config.cpp: -------------------------------------------------------------------------------- 1 | #include "config.h" -------------------------------------------------------------------------------- /lang-cli-src/config.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #ifndef CONFIG_MANAGER_H 5 | #define CONFIG_MANAGER_H 6 | 7 | #define CONFIG_FILE "langcommand_config.json" 8 | #define CMD_NAME "langcommand" 9 | 10 | #endif // CONFIG_MANAGER_H 11 | -------------------------------------------------------------------------------- /lang-cli-src/file_manager.h: -------------------------------------------------------------------------------- 1 | #ifndef DOWNLOADER_H 2 | #define DOWNLOADER_H 3 | 4 | #include <string> 5 | 6 | bool download_file(const std::string& url, const std::string& file_path); 7 | void show_progress_bar(double percentage, double speed); 8 | bool file_exists(const std::string & path); 9 | bool file_is_empty(const std::string & path); 10 | bool file_create(const std::string & path); 11 | bool json_file_create(const std::string & path); 12 | #endif // DOWNLOADER_H 13 | -------------------------------------------------------------------------------- /lang-cli-src/output_parser.h: -------------------------------------------------------------------------------- 1 | #ifndef OUTPUT_PARSER_H 2 | #define OUTPUT_PARSER_H 3 | 4 | #include <string> 5 | #include <vector> 6 | 7 | std::vector<std::string> extract_suggestions(const std::string& input); 8 | 9 | size_t check_early_stop(const std::string& output_buffer); 10 | 11 | std::string colorize_text(const std::string& input); 12 | #endif // OUTPUT_PARSER_H 13 | -------------------------------------------------------------------------------- /lang-cli-src/shell_executor.h: -------------------------------------------------------------------------------- 1 | #ifndef SHELL_EXECUTOR_H 2 | #define SHELL_EXECUTOR_H 3 | 4 | #include 5 | 6 | // Declare the function prototype 7 | std::string exec_command(const std::string& cmd); 8 | void choose_edit_exec(std::vector<std::string>& output_lines); 9 | #endif // SHELL_EXECUTOR_H 10 | -------------------------------------------------------------------------------- /lang-cli-src/str_parser.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef STR_PARSER_H 3 | #define STR_PARSER_H 4 | 5 | #include <string> 6 | #include <vector> 7 | // Function declaration for extracting bash blocks from input string 8 | std::vector<std::string> extract_strs(const std::string& input, const std::string& regex_str); 9 | 10 | std::string extract_str(const std::string& input, const std::string& regex_str); 11 | 12 | 13 | size_t get_nth_delimiters(const std::string& input, const std::string& delimiter, size_t n); 14 | 15 | size_t get_num_delimiters(const std::string& input, const std::string& delimiter); 16 | 17 | std::vector<std::string> split_str(const std::string& str, char delimiter); 18 | 19 | 20 | std::string to_lower_case(const std::string& input); 21 | 22 |
#endif // STR_PARSER_H -------------------------------------------------------------------------------- /media/llama-leader.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/media/llama-leader.jpeg -------------------------------------------------------------------------------- /media/llama0-banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/media/llama0-banner.png -------------------------------------------------------------------------------- /media/llama0-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/media/llama0-logo.png -------------------------------------------------------------------------------- /media/llama1-banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/media/llama1-banner.png -------------------------------------------------------------------------------- /media/llama1-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/media/llama1-logo.png -------------------------------------------------------------------------------- /media/matmul.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/media/matmul.png -------------------------------------------------------------------------------- /models/.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | -------------------------------------------------------------------------------- /models/ggml-vocab-aquila.gguf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/models/ggml-vocab-aquila.gguf -------------------------------------------------------------------------------- /models/ggml-vocab-baichuan.gguf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/models/ggml-vocab-baichuan.gguf -------------------------------------------------------------------------------- /models/ggml-vocab-bert-bge.gguf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/models/ggml-vocab-bert-bge.gguf -------------------------------------------------------------------------------- /models/ggml-vocab-command-r.gguf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/models/ggml-vocab-command-r.gguf -------------------------------------------------------------------------------- /models/ggml-vocab-deepseek-coder.gguf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/models/ggml-vocab-deepseek-coder.gguf -------------------------------------------------------------------------------- /models/ggml-vocab-deepseek-llm.gguf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/models/ggml-vocab-deepseek-llm.gguf -------------------------------------------------------------------------------- /models/ggml-vocab-falcon.gguf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/models/ggml-vocab-falcon.gguf -------------------------------------------------------------------------------- /models/ggml-vocab-gpt-2.gguf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/models/ggml-vocab-gpt-2.gguf -------------------------------------------------------------------------------- /models/ggml-vocab-gpt-neox.gguf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/models/ggml-vocab-gpt-neox.gguf -------------------------------------------------------------------------------- /models/ggml-vocab-llama-bpe.gguf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/models/ggml-vocab-llama-bpe.gguf -------------------------------------------------------------------------------- /models/ggml-vocab-llama-spm.gguf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/models/ggml-vocab-llama-spm.gguf -------------------------------------------------------------------------------- /models/ggml-vocab-mpt.gguf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/models/ggml-vocab-mpt.gguf -------------------------------------------------------------------------------- /models/ggml-vocab-phi-3.gguf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/models/ggml-vocab-phi-3.gguf -------------------------------------------------------------------------------- /models/ggml-vocab-qwen2.gguf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/models/ggml-vocab-qwen2.gguf -------------------------------------------------------------------------------- /models/ggml-vocab-refact.gguf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/models/ggml-vocab-refact.gguf -------------------------------------------------------------------------------- /models/ggml-vocab-starcoder.gguf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/models/ggml-vocab-starcoder.gguf -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | strict = true 3 | allow_untyped_calls = true 4 | allow_untyped_defs = true 5 | allow_incomplete_defs = true 6 | disable_error_code = import-untyped 7 | warn_return_any = false 8 | -------------------------------------------------------------------------------- /output.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoriyue/LangCommand/f209d811584bea33fe2a1efa65ca4c8e81f497a3/output.gif -------------------------------------------------------------------------------- /pocs/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # dependencies 2 | 3 | find_package(Threads REQUIRED) 4 | 5 | # third-party 6 | 7 | include_directories(${CMAKE_CURRENT_SOURCE_DIR}) 8 | 9 | if (EMSCRIPTEN) 10 | else() 11 | add_subdirectory(vdot) 12 | endif() 13 | -------------------------------------------------------------------------------- /pocs/vdot/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(TARGET llama-vdot) 2 | add_executable(${TARGET} vdot.cpp) 3 | target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) 4 | target_compile_features(${TARGET} PRIVATE cxx_std_11) 5 | 6 | set(TARGET llama-q8dot) 7 | add_executable(${TARGET} q8dot.cpp) 8 | target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) 9 | target_compile_features(${TARGET} PRIVATE cxx_std_11) 10 | -------------------------------------------------------------------------------- /prompts/alpaca.txt: -------------------------------------------------------------------------------- 1 | Below is an instruction that describes a task. Write a response that appropriately completes the request. 2 | -------------------------------------------------------------------------------- /prompts/chat-with-baichuan.txt: -------------------------------------------------------------------------------- 1 | 以下内容为人类用户与与一位智能助手的对话。 2 | 3 | 用户:你好! 4 | 助手: 5 | -------------------------------------------------------------------------------- /prompts/chat-with-bob.txt: -------------------------------------------------------------------------------- 1 | Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision. 2 | 3 | User: Hello, Bob. 4 | Bob: Hello. How may I help you today? 5 | User: Please tell me the largest city in Europe. 6 | Bob: Sure. The largest city in Europe is Moscow, the capital of Russia. 7 | User: -------------------------------------------------------------------------------- /prompts/chat-with-qwen.txt: -------------------------------------------------------------------------------- 1 | You are a helpful assistant. -------------------------------------------------------------------------------- /prompts/chat-with-vicuna-v0.txt: -------------------------------------------------------------------------------- 1 | A chat between a curious human ("[[USER_NAME]]") and an artificial intelligence assistant ("[[AI_NAME]]"). 
The assistant gives helpful, detailed, and polite answers to the human's questions. 2 | 3 | ### [[USER_NAME]]: Hello, [[AI_NAME]]. 4 | ### [[AI_NAME]]: Hello. How may I help you today? 5 | ### [[USER_NAME]]: Please tell me the largest city in Europe. 6 | ### [[AI_NAME]]: Sure. The largest city in Europe is Moscow, the capital of Russia. 7 | ### [[USER_NAME]]: 8 | -------------------------------------------------------------------------------- /prompts/chat-with-vicuna-v1.txt: -------------------------------------------------------------------------------- 1 | A chat between a curious human ("[[USER_NAME]]") and an artificial intelligence assistant ("[[AI_NAME]]"). The assistant gives helpful, detailed, and polite answers to the human's questions. 2 | 3 | [[USER_NAME]]: Hello, [[AI_NAME]]. 4 | [[AI_NAME]]: Hello. How may I help you today? 5 | [[USER_NAME]]: Please tell me the largest city in Europe. 6 | [[AI_NAME]]: Sure. The largest city in Europe is Moscow, the capital of Russia. 7 | [[USER_NAME]]: 8 | -------------------------------------------------------------------------------- /prompts/reason-act.txt: -------------------------------------------------------------------------------- 1 | You run in a loop of Thought, Action, Observation. 2 | At the end of the loop either Answer or restate your Thought and Action. 3 | Use Thought to describe your thoughts about the question you have been asked. 4 | Use Action to run one of these actions available to you: 5 | - calculate[python math expression] 6 | Observation will be the result of running those actions 7 | 8 | 9 | Question: What is 4 * 7 / 3? 10 | Thought: Do I need to use an action? Yes, I use calculate to do math 11 | Action: calculate[4 * 7 / 3] 12 | Observation: 9.3333333333 13 | Thought: Do I need to use an action? No, have the result 14 | Answer: The calculate tool says it is 9.3333333333 15 | Question: What is capital of france? 16 | Thought: Do I need to use an action? No, I know the answer 17 | Answer: Paris is the capital of France 18 | Question: -------------------------------------------------------------------------------- /pyrightconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extraPaths": ["gguf-py"], 3 | "pythonVersion": "3.9", 4 | "pythonPlatform": "All", 5 | "reportUnusedImport": "warning", 6 | "reportDuplicateImport": "error", 7 | "reportDeprecated": "warning", 8 | "reportUnnecessaryTypeIgnoreComment": "information", 9 | "disableBytesTypePromotions": false, // TODO: change once Python 3.12 is the minimum 10 | "executionEnvironments": [ 11 | { 12 | // TODO: make this version override work correctly 13 | "root": "gguf-py", 14 | "pythonVersion": "3.8", 15 | }, 16 | { 17 | // uses match expressions in steps.py 18 | "root": "examples/server/tests", 19 | "pythonVersion": "3.10", 20 | }, 21 | ], 22 | } 23 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # These requirements include all dependencies for all top-level python scripts 2 | # for llama.cpp. Avoid adding packages here directly. 3 | # 4 | # Package versions must stay compatible across all top-level python scripts. 
5 | # 6 | 7 | -r ./requirements/requirements-convert_legacy_llama.txt 8 | 9 | -r ./requirements/requirements-convert_hf_to_gguf.txt 10 | -r ./requirements/requirements-convert_hf_to_gguf_update.txt 11 | -r ./requirements/requirements-convert_llama_ggml_to_gguf.txt 12 | -r ./requirements/requirements-convert_lora_to_gguf.txt 13 | -------------------------------------------------------------------------------- /requirements/requirements-all.txt: -------------------------------------------------------------------------------- 1 | -r ../examples/llava/requirements.txt 2 | -r ../examples/server/bench/requirements.txt 3 | -r ../examples/server/tests/requirements.txt 4 | 5 | -r ./requirements-compare-llama-bench.txt 6 | -r ./requirements-pydantic.txt 7 | -r ./requirements-test-tokenizer-random.txt 8 | 9 | -r ./requirements-convert_hf_to_gguf.txt 10 | -r ./requirements-convert_hf_to_gguf_update.txt 11 | -r ./requirements-convert_legacy_llama.txt 12 | -r ./requirements-convert_llama_ggml_to_gguf.txt 13 | -------------------------------------------------------------------------------- /requirements/requirements-compare-llama-bench.txt: -------------------------------------------------------------------------------- 1 | tabulate~=0.9.0 2 | GitPython~=3.1.43 3 | -------------------------------------------------------------------------------- /requirements/requirements-convert_hf_to_gguf.txt: -------------------------------------------------------------------------------- 1 | -r ./requirements-convert_legacy_llama.txt 2 | --extra-index-url https://download.pytorch.org/whl/cpu 3 | torch~=2.2.1 4 | -------------------------------------------------------------------------------- /requirements/requirements-convert_hf_to_gguf_update.txt: -------------------------------------------------------------------------------- 1 | -r ./requirements-convert_legacy_llama.txt 2 | --extra-index-url https://download.pytorch.org/whl/cpu 3 | torch~=2.2.1 4 | -------------------------------------------------------------------------------- /requirements/requirements-convert_legacy_llama.txt: -------------------------------------------------------------------------------- 1 | numpy~=1.26.4 2 | sentencepiece~=0.2.0 3 | transformers>=4.45.1,<5.0.0 4 | gguf>=0.1.0 5 | protobuf>=4.21.0,<5.0.0 6 | -------------------------------------------------------------------------------- /requirements/requirements-convert_llama_ggml_to_gguf.txt: -------------------------------------------------------------------------------- 1 | -r ./requirements-convert_legacy_llama.txt 2 | -------------------------------------------------------------------------------- /requirements/requirements-convert_lora_to_gguf.txt: -------------------------------------------------------------------------------- 1 | -r ./requirements-convert_hf_to_gguf.txt 2 | --extra-index-url https://download.pytorch.org/whl/cpu 3 | -------------------------------------------------------------------------------- /requirements/requirements-pydantic.txt: -------------------------------------------------------------------------------- 1 | docstring_parser~=0.15 2 | pydantic~=2.6.3 3 | requests 4 | -------------------------------------------------------------------------------- /requirements/requirements-test-tokenizer-random.txt: -------------------------------------------------------------------------------- 1 | cffi~=1.16.0 2 | -------------------------------------------------------------------------------- /scripts/build-info.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | CC=$1 4 | 5 | build_number="0" 6 | build_commit="unknown" 7 | build_compiler="unknown" 8 | build_target="unknown" 9 | 10 | if out=$(git rev-list --count HEAD); then 11 | # git is broken on WSL so we need to strip extra newlines 12 | build_number=$(printf '%s' "$out" | tr -d '\n') 13 | fi 14 | 15 | if out=$(git rev-parse --short HEAD); then 16 | build_commit=$(printf '%s' "$out" | tr -d '\n') 17 | fi 18 | 19 | if out=$($CC --version | head -1); then 20 | build_compiler=$out 21 | fi 22 | 23 | if out=$($CC -dumpmachine); then 24 | build_target=$out 25 | fi 26 | 27 | echo "int LLAMA_BUILD_NUMBER = ${build_number};" 28 | echo "char const *LLAMA_COMMIT = \"${build_commit}\";" 29 | echo "char const *LLAMA_COMPILER = \"${build_compiler}\";" 30 | echo "char const *LLAMA_BUILD_TARGET = \"${build_target}\";" 31 | -------------------------------------------------------------------------------- /scripts/compare-commits.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -lt 2 ]; then 4 | echo "usage: ./scripts/compare-commits.sh <commit1> <commit2> [additional llama-bench arguments]" 5 | exit 1 6 | fi 7 | 8 | set -e 9 | set -x 10 | 11 | # verify at the start that the compare script has all the necessary dependencies installed 12 | ./scripts/compare-llama-bench.py --check 13 | 14 | bench_args="${@:3}" 15 | 16 | rm -f llama-bench.sqlite > /dev/null 17 | 18 | # to test a backend, call the script with the corresponding environment variable (e.g. GGML_CUDA=1 ./scripts/compare-commits.sh ...) 19 | 20 | git checkout $1 > /dev/null 21 | make clean > /dev/null 22 | make -j$(nproc) $make_opts llama-bench > /dev/null 23 | ./llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite 24 | 25 | git checkout $2 > /dev/null 26 | make clean > /dev/null 27 | make -j$(nproc) $make_opts llama-bench > /dev/null 28 | ./llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite 29 | 30 | ./scripts/compare-llama-bench.py -b $1 -c $2 31 | -------------------------------------------------------------------------------- /scripts/gen-authors.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | printf "# date: $(date)\n" > AUTHORS 4 | printf "# this file is auto-generated by scripts/gen-authors.sh\n\n" >> AUTHORS 5 | 6 | git log --format='%an <%ae>' --reverse --date=short master | awk '!seen[$0]++' | sort >> AUTHORS 7 | 8 | # if necessary, update your name here. for example: jdoe -> John Doe 9 | sed -i '' 's/^jdoe/John Doe/g' AUTHORS 10 | -------------------------------------------------------------------------------- /scripts/get-flags.mk: -------------------------------------------------------------------------------- 1 | ifeq '' '$(findstring clang,$(shell $(GF_CC) --version))' 2 | GF_CC_IS_GCC = 1 3 | GF_CC_VER := $(shell { $(GF_CC) -dumpfullversion 2>/dev/null; echo; $(GF_CC) -dumpversion; } | awk -F. '/./ { printf("%02d%02d%02d", $$1, $$2, $$3); exit }') 4 | else 5 | GF_CC_IS_CLANG = 1 6 | ifeq '' '$(findstring Apple,$(shell $(GF_CC) --version))' 7 | GF_CC_IS_LLVM_CLANG = 1 8 | else 9 | GF_CC_IS_APPLE_CLANG = 1 10 | endif 11 | GF_CC_VER := \ 12 | $(shell $(GF_CC) --version | sed -n 's/^.* version \([0-9.]*\).*$$/\1/p' \ 13 | | awk -F.
'{ printf("%02d%02d%02d", $$1, $$2, $$3) }') 14 | endif 15 | 16 | ifeq ($(GF_CC_IS_CLANG), 1) 17 | # clang options 18 | GF_CFLAGS = -Wunreachable-code-break -Wunreachable-code-return 19 | GF_CXXFLAGS = -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi 20 | 21 | ifneq '' '$(and $(GF_CC_IS_LLVM_CLANG),$(filter 1,$(shell expr $(GF_CC_VER) \>= 030800)))' 22 | GF_CFLAGS += -Wdouble-promotion 23 | endif 24 | ifneq '' '$(and $(GF_CC_IS_APPLE_CLANG),$(filter 1,$(shell expr $(GF_CC_VER) \>= 070300)))' 25 | GF_CFLAGS += -Wdouble-promotion 26 | endif 27 | else 28 | # gcc options 29 | GF_CFLAGS = -Wdouble-promotion 30 | GF_CXXFLAGS = -Wno-array-bounds 31 | 32 | ifeq ($(shell expr $(GF_CC_VER) \>= 070100), 1) 33 | GF_CXXFLAGS += -Wno-format-truncation 34 | endif 35 | ifeq ($(shell expr $(GF_CC_VER) \>= 080100), 1) 36 | GF_CXXFLAGS += -Wextra-semi 37 | endif 38 | endif 39 | -------------------------------------------------------------------------------- /scripts/get-hellaswag.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | wget https://raw.githubusercontent.com/klosax/hellaswag_text_data/main/hellaswag_val_full.txt 4 | 5 | echo "Usage:" 6 | echo "" 7 | echo " ./llama-perplexity -m model.gguf -f hellaswag_val_full.txt --hellaswag [--hellaswag-tasks N] [other params]" 8 | echo "" 9 | 10 | exit 0 11 | -------------------------------------------------------------------------------- /scripts/get-wikitext-103.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip 4 | 5 | echo "Usage:" 6 | echo "" 7 | echo " ./llama-perplexity -m model.gguf -f wiki.test.raw [other params]" 8 | echo "" 9 | 10 | exit 0 11 | -------------------------------------------------------------------------------- /scripts/get-wikitext-2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | wget https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip 4 | unzip wikitext-2-raw-v1.zip 5 | 6 | echo "Usage:" 7 | echo "" 8 | echo " ./llama-perplexity -m model.gguf -f wikitext-2-raw/wiki.test.raw [other params]" 9 | echo "" 10 | 11 | exit 0 12 | -------------------------------------------------------------------------------- /scripts/get-winogrande.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | wget https://huggingface.co/datasets/ikawrakow/winogrande-eval-for-llama.cpp/raw/main/winogrande-debiased-eval.csv 4 | 5 | echo "Usage:" 6 | echo "" 7 | echo " ./llama-perplexity -m model.gguf -f winogrande-debiased-eval.csv --winogrande [--winogrande-tasks N] [other params]" 8 | echo "" 9 | 10 | exit 0 11 | -------------------------------------------------------------------------------- /scripts/install-oneapi.bat: -------------------------------------------------------------------------------- 1 | :: MIT license 2 | :: Copyright (C) 2024 Intel Corporation 3 | :: SPDX-License-Identifier: MIT 4 | 5 | 6 | set URL=%1 7 | set COMPONENTS=%2 8 | 9 | curl.exe --output %TEMP%\webimage.exe --url %URL% --retry 5 --retry-delay 5 10 | start /b /wait %TEMP%\webimage.exe -s -x -f webimage_extracted --log extract.log 11 | del %TEMP%\webimage.exe 12 | if "%COMPONENTS%"=="" ( 13 | webimage_extracted\bootstrapper.exe -s --action install --eula=accept -p=NEED_VS2017_INTEGRATION=0 
-p=NEED_VS2019_INTEGRATION=0 -p=NEED_VS2022_INTEGRATION=0 --log-dir=. 14 | ) else ( 15 | webimage_extracted\bootstrapper.exe -s --action install --components=%COMPONENTS% --eula=accept -p=NEED_VS2017_INTEGRATION=0 -p=NEED_VS2019_INTEGRATION=0 -p=NEED_VS2022_INTEGRATION=0 --log-dir=. 16 | ) 17 | set installer_exit_code=%ERRORLEVEL% 18 | rd /s/q "webimage_extracted" 19 | exit /b %installer_exit_code% 20 | -------------------------------------------------------------------------------- /scripts/qnt-all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | qnt=(q8_0 q6_k q5_k q5_1 q5_0 q4_k q4_1 q4_0 q3_k q2_k) 4 | args="" 5 | 6 | if [ -z "$1" ]; then 7 | echo "usage: $0 <model> [qnt] [args]" 8 | echo "default: $0 \"${qnt[@]}\" \"${args}\"" 9 | exit 1 10 | fi 11 | 12 | if [ ! -z "$2" ]; then 13 | qnt=($2) 14 | fi 15 | 16 | if [ ! -z "$3" ]; then 17 | args="$3" 18 | fi 19 | 20 | model="$1" 21 | out="../tmp/results-${model}" 22 | 23 | set -o pipefail 24 | set -e 25 | 26 | mkdir -p ${out} 27 | 28 | for q in ${qnt[@]}; do 29 | time ./bin/llama-quantize ../models/${model}/ggml-model-f16.gguf ../models/${model}/ggml-model-${q}.gguf ${q} 2>&1 ${args} | tee ${out}/qnt-${q}.txt 30 | done 31 | -------------------------------------------------------------------------------- /scripts/run-all-perf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | qnt=(f16 q8_0 q6_k q5_k q5_1 q5_0 q4_k q4_1 q4_0 q3_k q2_k) 4 | args="-ngl 999 -n 64 -p 512" 5 | 6 | if [ -z "$1" ]; then 7 | echo "usage: $0 <model> [qnt] [args]" 8 | echo "default: $0 \"${qnt[@]}\" \"${args}\"" 9 | exit 1 10 | fi 11 | 12 | if [ ! -z "$2" ]; then 13 | qnt=($2) 14 | fi 15 | 16 | if [ ! -z "$3" ]; then 17 | args="$3" 18 | fi 19 | 20 | model="$1" 21 | out="../tmp/results-${model}" 22 | 23 | set -o pipefail 24 | set -e 25 | 26 | mkdir -p ${out} 27 | 28 | mstr="" 29 | 30 | for q in ${qnt[@]}; do 31 | mstr="${mstr} -m ../models/${model}/ggml-model-${q}.gguf" 32 | done 33 | 34 | ./bin/llama-bench ${mstr} ${args} 2> /dev/null 35 | -------------------------------------------------------------------------------- /scripts/run-all-ppl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | qnt=(f16 q8_0 q6_k q5_k q5_1 q5_0 q4_k q4_1 q4_0 q3_k q2_k) 4 | args="-ngl 999 -t 8" 5 | 6 | if [ -z "$1" ]; then 7 | echo "usage: $0 <model> [qnt] [args]" 8 | echo "default: $0 \"${qnt[@]}\" \"${args}\"" 9 | exit 1 10 | fi 11 | 12 | if [ ! -z "$2" ]; then 13 | qnt=($2) 14 | fi 15 | 16 | if [ !
-z "$3" ]; then 17 | args="$3" 18 | fi 19 | 20 | set -o pipefail 21 | set -e 22 | 23 | model="$1" 24 | out="../tmp/results-${model}" 25 | 26 | mkdir -p ${out} 27 | 28 | for q in ${qnt[@]}; do 29 | time ./bin/llama-perplexity -m ../models/${model}/ggml-model-${q}.gguf -f ./wiki.test.raw ${args} 2>&1 | tee ${out}/ppl-${q}.txt 30 | done 31 | -------------------------------------------------------------------------------- /scripts/sync-ggml.last: -------------------------------------------------------------------------------- 1 | b77f48b1efa671e094696b99fbf566aac8c87d74 2 | -------------------------------------------------------------------------------- /scripts/xxd.cmake: -------------------------------------------------------------------------------- 1 | # CMake equivalent of `xxd -i ${INPUT} ${OUTPUT}` 2 | # Usage: cmake -DINPUT=examples/server/public/index.html -DOUTPUT=examples/server/index.html.hpp -P scripts/xxd.cmake 3 | 4 | SET(INPUT "" CACHE STRING "Input File") 5 | SET(OUTPUT "" CACHE STRING "Output File") 6 | 7 | get_filename_component(filename "${INPUT}" NAME) 8 | string(REGEX REPLACE "\\.|-" "_" name "${filename}") 9 | 10 | file(READ "${INPUT}" hex_data HEX) 11 | string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1," hex_sequence "${hex_data}") 12 | 13 | string(LENGTH ${hex_data} hex_len) 14 | math(EXPR len "${hex_len} / 2") 15 | 16 | file(WRITE "${OUTPUT}" "unsigned char ${name}[] = {${hex_sequence}};\nunsigned int ${name}_len = ${len};\n") 17 | -------------------------------------------------------------------------------- /spm-headers/ggml-alloc.h: -------------------------------------------------------------------------------- 1 | ../ggml/include/ggml-alloc.h -------------------------------------------------------------------------------- /spm-headers/ggml-backend.h: -------------------------------------------------------------------------------- 1 | ../ggml/include/ggml-backend.h -------------------------------------------------------------------------------- /spm-headers/ggml-metal.h: -------------------------------------------------------------------------------- 1 | ../ggml/include/ggml-metal.h -------------------------------------------------------------------------------- /spm-headers/ggml.h: -------------------------------------------------------------------------------- 1 | ../ggml/include/ggml.h -------------------------------------------------------------------------------- /spm-headers/llama.h: -------------------------------------------------------------------------------- 1 | ../include/llama.h -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # TODO: should not use this 2 | if (WIN32) 3 | if (BUILD_SHARED_LIBS) 4 | set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) 5 | endif() 6 | endif() 7 | 8 | # 9 | # libraries 10 | # 11 | 12 | # llama 13 | 14 | add_library(llama 15 | ../include/llama.h 16 | llama.cpp 17 | llama-vocab.cpp 18 | llama-grammar.cpp 19 | llama-sampling.cpp 20 | unicode.h 21 | unicode.cpp 22 | unicode-data.cpp 23 | ) 24 | 25 | target_include_directories(llama PUBLIC .
../include) 26 | target_compile_features (llama PUBLIC cxx_std_11) # don't bump 27 | 28 | target_link_libraries(llama PUBLIC ggml) 29 | 30 | if (BUILD_SHARED_LIBS) 31 | set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON) 32 | target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD) 33 | endif() 34 | -------------------------------------------------------------------------------- /src/llama-sampling.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | // TODO: rename llama-sampling.h/.cpp to llama-sampler.h/.cpp ? 4 | 5 | #include "llama-grammar.h" 6 | 7 | #include <vector> 8 | 9 | struct llama_vocab; 10 | struct llama_grammar; 11 | 12 | // sampler chain 13 | 14 | struct llama_sampler_chain { 15 | llama_sampler_chain_params params; 16 | 17 | std::vector<llama_sampler *> samplers; 18 | 19 | // timing 20 | 21 | mutable int64_t t_sample_us; 22 | 23 | mutable int32_t n_sample; 24 | }; 25 | 26 | struct llama_sampler * llama_sampler_init_grammar_impl( 27 | const struct llama_vocab & vocab, 28 | const char * grammar_str, 29 | const char * grammar_root); 30 | -------------------------------------------------------------------------------- /src/unicode-data.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <cstdint> 4 | #include <vector> 5 | #include <unordered_map> 6 | #include <unordered_set> 7 | 8 | struct range_nfd { 9 | uint32_t first; 10 | uint32_t last; 11 | uint32_t nfd; 12 | }; 13 | 14 | static const uint32_t MAX_CODEPOINTS = 0x110000; 15 | 16 | extern const std::initializer_list<std::pair<uint32_t, uint16_t>> unicode_ranges_flags; 17 | extern const std::unordered_set<uint32_t> unicode_set_whitespace; 18 | extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase; 19 | extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase; 20 | extern const std::initializer_list<range_nfd> unicode_ranges_nfd; 21 | -------------------------------------------------------------------------------- /tests/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !*.* 3 | *.o 4 | ggml-common.h 5 | -------------------------------------------------------------------------------- /tests/get-model.cpp: -------------------------------------------------------------------------------- 1 | #include <cstdio> 2 | #include <cstdlib> 3 | #include <cstring> 4 | 5 | #include "get-model.h" 6 | 7 | char * get_model_or_exit(int argc, char *argv[]) { 8 | char * model_path; 9 | if (argc > 1) { 10 | model_path = argv[1]; 11 | 12 | } else { 13 | model_path = getenv("LLAMACPP_TEST_MODELFILE"); 14 | if (!model_path || strlen(model_path) == 0) { 15 | fprintf(stderr, "\033[33mWARNING: No model file provided. Skipping this test.
Set LLAMACPP_TEST_MODELFILE=<valid gguf model path> to silence this warning and run this test.\n\033[0m"); 16 | exit(EXIT_SUCCESS); 17 | } 18 | } 19 | 20 | return model_path; 21 | } 22 | -------------------------------------------------------------------------------- /tests/get-model.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | char * get_model_or_exit(int, char*[]); 3 | -------------------------------------------------------------------------------- /tests/run-json-schema-to-grammar.mjs: -------------------------------------------------------------------------------- 1 | import { readFileSync } from "fs" 2 | import { SchemaConverter } from "../examples/server/public/json-schema-to-grammar.mjs" 3 | 4 | const [, , file] = process.argv 5 | const url = `file://${file}` 6 | let schema = JSON.parse(readFileSync(file, "utf8")); 7 | const converter = new SchemaConverter({}) 8 | schema = await converter.resolveRefs(schema, url) 9 | converter.visit(schema, '') 10 | console.log(converter.formatGrammar()) 11 | -------------------------------------------------------------------------------- /tests/test-autorelease.cpp: -------------------------------------------------------------------------------- 1 | // ref: https://github.com/ggerganov/llama.cpp/issues/4952#issuecomment-1892864763 2 | 3 | #include <cstdio> 4 | #include <string> 5 | #include <thread> 6 | 7 | #include "llama.h" 8 | #include "get-model.h" 9 | 10 | // This creates a new context inside a pthread and then tries to exit cleanly. 11 | int main(int argc, char ** argv) { 12 | auto * model_path = get_model_or_exit(argc, argv); 13 | 14 | std::thread([&model_path]() { 15 | llama_backend_init(); 16 | auto * model = llama_load_model_from_file(model_path, llama_model_default_params()); 17 | auto * ctx = llama_new_context_with_model(model, llama_context_default_params()); 18 | llama_free(ctx); 19 | llama_free_model(model); 20 | llama_backend_free(); 21 | }).join(); 22 | 23 | return 0; 24 | } 25 | -------------------------------------------------------------------------------- /tests/test-c.c: -------------------------------------------------------------------------------- 1 | #include "llama.h" 2 | 3 | #ifdef GGML_USE_KOMPUTE 4 | #include "ggml-kompute.h" 5 | #endif 6 | 7 | int main(void) {} 8 | -------------------------------------------------------------------------------- /tests/test-log.cpp: -------------------------------------------------------------------------------- 1 | #include "log.h" 2 | 3 | #include <cstdlib> 4 | #include <thread> 5 | 6 | int main() { 7 | const int n_thread = 8; 8 | 9 | std::thread threads[n_thread]; 10 | for (int i = 0; i < n_thread; i++) { 11 | threads[i] = std::thread([i]() { 12 | const int n_msg = 1000; 13 | 14 | for (int j = 0; j < n_msg; j++) { 15 | const int log_type = std::rand() % 4; 16 | 17 | switch (log_type) { 18 | case 0: LOG_INF("Thread %d: %d\n", i, j); break; 19 | case 1: LOG_WRN("Thread %d: %d\n", i, j); break; 20 | case 2: LOG_ERR("Thread %d: %d\n", i, j); break; 21 | case 3: LOG_DBG("Thread %d: %d\n", i, j); break; 22 | default: 23 | break; 24 | } 25 | 26 | if (rand () % 10 < 5) { 27 | gpt_log_set_timestamps(gpt_log_main(), rand() % 2); 28 | gpt_log_set_prefix (gpt_log_main(), rand() % 2); 29 | } 30 | } 31 | }); 32 | } 33 | 34 | for (int i = 0; i < n_thread; i++) { 35 | threads[i].join(); 36 | } 37 | 38 | return 0; 39 | } 40 | -------------------------------------------------------------------------------- /tests/test-model-load-cancel.cpp:
-------------------------------------------------------------------------------- 1 | #include "llama.h" 2 | #include "get-model.h" 3 | 4 | #include <cstdlib> 5 | 6 | int main(int argc, char *argv[] ) { 7 | auto * model_path = get_model_or_exit(argc, argv); 8 | auto * file = fopen(model_path, "r"); 9 | if (file == nullptr) { 10 | fprintf(stderr, "no model at '%s' found\n", model_path); 11 | return EXIT_FAILURE; 12 | } 13 | 14 | fprintf(stderr, "using '%s'\n", model_path); 15 | fclose(file); 16 | 17 | llama_backend_init(); 18 | auto params = llama_model_params{}; 19 | params.use_mmap = false; 20 | params.progress_callback = [](float progress, void * ctx){ 21 | (void) ctx; 22 | return progress > 0.50; 23 | }; 24 | auto * model = llama_load_model_from_file(model_path, params); 25 | llama_backend_free(); 26 | return model == nullptr ? EXIT_SUCCESS : EXIT_FAILURE; 27 | } 28 | -------------------------------------------------------------------------------- /tests/test-tokenizer-0.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Usage: 4 | # 5 | # test-tokenizer-0.sh <name> <input> 6 | # 7 | 8 | if [ $# -ne 2 ]; then 9 | printf "Usage: $0 <name> <input>\n" 10 | exit 1 11 | fi 12 | 13 | name=$1 14 | input=$2 15 | 16 | make -j tests/test-tokenizer-0 17 | 18 | printf "Testing %s on %s ...\n" $name $input 19 | 20 | set -e 21 | 22 | printf "Tokenizing using (py) Python AutoTokenizer ...\n" 23 | python3 ./tests/test-tokenizer-0.py ./models/tokenizers/$name --fname-tok $input > /tmp/test-tokenizer-0-$name-py.log 2>&1 24 | 25 | printf "Tokenizing using (cpp) llama.cpp ...\n" 26 | ./tests/test-tokenizer-0 ./models/ggml-vocab-$name.gguf $input > /tmp/test-tokenizer-0-$name-cpp.log 2>&1 27 | 28 | cat /tmp/test-tokenizer-0-$name-py.log | grep "tokenized in" 29 | cat /tmp/test-tokenizer-0-$name-cpp.log | grep "tokenized in" 30 | 31 | set +e 32 | 33 | diff $input.tok $input.tokcpp > /dev/null 2>&1 34 | 35 | if [ $? -eq 0 ]; then 36 | printf "Tokenization is correct!\n" 37 | else 38 | diff $input.tok $input.tokcpp | head -n 32 39 | 40 | printf "Tokenization differs!\n" 41 | fi 42 | --------------------------------------------------------------------------------
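For reference, a minimal sketch of driving the tokenizer check above end to end, using the data helper shown earlier in this listing. The vocab name llama-spm is a placeholder assumption, not something the repository mandates; any name with a matching ./models/ggml-vocab-<name>.gguf and ./models/tokenizers/<name> directory would work, and the input file is the one that scripts/get-wikitext-2.sh downloads and unzips.

# hypothetical usage sketch; "llama-spm" is a placeholder vocab name
./scripts/get-wikitext-2.sh                                    # fetches and unzips wikitext-2-raw/
./tests/test-tokenizer-0.sh llama-spm ./wikitext-2-raw/wiki.test.raw

The script then prints whether the Python AutoTokenizer output ($input.tok) and the llama.cpp output ($input.tokcpp) agree.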