├── libs
├── .clang-format
├── ggml
│   ├── .gitignore
│   ├── src
│   │   ├── ggml-cuda
│   │   │   ├── argsort.cuh
│   │   │   ├── fattn.cuh
│   │   │   ├── fattn-tile-f16.cuh
│   │   │   ├── fattn-tile-f32.cuh
│   │   │   ├── acc.cuh
│   │   │   ├── pad.cuh
│   │   │   ├── clamp.cuh
│   │   │   ├── rope.cuh
│   │   │   ├── scale.cuh
│   │   │   ├── arange.cuh
│   │   │   ├── concat.cuh
│   │   │   ├── im2col.cuh
│   │   │   ├── pool2d.cuh
│   │   │   ├── getrows.cuh
│   │   │   ├── softmax.cuh
│   │   │   ├── upscale.cuh
│   │   │   ├── diagmask.cuh
│   │   │   ├── tsembd.cuh
│   │   │   ├── conv-transpose-1d.cuh
│   │   │   ├── template-instances
│   │   │   │   ├── mmq-instance-iq1_s.cu
│   │   │   │   ├── mmq-instance-iq2_s.cu
│   │   │   │   ├── mmq-instance-iq3_s.cu
│   │   │   │   ├── mmq-instance-q2_k.cu
│   │   │   │   ├── mmq-instance-q3_k.cu
│   │   │   │   ├── mmq-instance-q4_0.cu
│   │   │   │   ├── mmq-instance-q4_1.cu
│   │   │   │   ├── mmq-instance-q4_k.cu
│   │   │   │   ├── mmq-instance-q5_0.cu
│   │   │   │   ├── mmq-instance-q5_1.cu
│   │   │   │   ├── mmq-instance-q5_k.cu
│   │   │   │   ├── mmq-instance-q6_k.cu
│   │   │   │   ├── mmq-instance-q8_0.cu
│   │   │   │   ├── mmq-instance-iq2_xs.cu
│   │   │   │   ├── mmq-instance-iq2_xxs.cu
│   │   │   │   ├── mmq-instance-iq3_xxs.cu
│   │   │   │   ├── mmq-instance-iq4_nl.cu
│   │   │   │   ├── mmq-instance-iq4_xs.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs64-f16-f16.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs64-f16-f16.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-f16-f16.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-f16-q4_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-f16-q4_1.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-f16-q5_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-f16-q5_1.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-f16-q8_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q4_0-f16.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q4_0-q4_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q4_0-q4_1.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q4_0-q5_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q4_0-q5_1.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q4_0-q8_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q4_1-f16.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q4_1-q4_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q4_1-q4_1.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q4_1-q5_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q4_1-q5_1.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q4_1-q8_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q5_0-f16.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q5_0-q4_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q5_0-q4_1.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q5_0-q5_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q5_0-q5_1.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q5_0-q8_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q5_1-f16.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q5_1-q4_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q5_1-q4_1.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q5_1-q5_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q5_1-q5_1.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q5_1-q8_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q8_0-f16.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q8_0-q4_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q8_0-q4_1.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q8_0-q5_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q8_0-q5_1.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q8_0-q8_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs256-f16-f16.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs64-f16-q4_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs64-f16-q4_1.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs64-f16-q5_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs64-f16-q5_1.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs64-f16-q8_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-f16-f16.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-f16-q4_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-f16-q4_1.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-f16-q5_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-f16-q5_1.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-f16-q8_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q4_0-f16.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q4_0-q4_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q4_0-q4_1.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q4_0-q5_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q4_0-q5_1.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q4_0-q8_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q4_1-f16.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q4_1-q4_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q4_1-q4_1.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q4_1-q5_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q4_1-q5_1.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q4_1-q8_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q5_0-f16.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q5_0-q4_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q5_0-q4_1.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q5_0-q5_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q5_0-q5_1.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q5_0-q8_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q5_1-f16.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q5_1-q4_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q5_1-q4_1.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q5_1-q5_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q5_1-q5_1.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q5_1-q8_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q8_0-f16.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q8_0-q4_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q8_0-q4_1.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q8_0-q5_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q8_0-q5_1.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q8_0-q8_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs256-f16-f16.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs64-f16-q4_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs64-f16-q4_1.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs64-f16-q5_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs64-f16-q5_1.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs64-f16-q8_0.cu
│   │   │   │   ├── fattn-wmma-f16-instance-kqhalf-cpb8.cu
│   │   │   │   ├── fattn-wmma-f16-instance-kqfloat-cpb32.cu
│   │   │   │   ├── fattn-wmma-f16-instance-kqhalf-cpb16.cu
│   │   │   │   ├── fattn-wmma-f16-instance-kqhalf-cpb32.cu
│   │   │   │   └── fattn-wmma-f16-instance-kqfloat-cpb16.cu
│   │   │   ├── cross-entropy-loss.cuh
│   │   │   ├── sum.cuh
│   │   │   ├── sumrows.cuh
│   │   │   ├── norm.cuh
│   │   │   ├── cpy.cuh
│   │   │   ├── binbcast.cuh
│   │   │   ├── convert.cuh
│   │   │   ├── mmvq.cuh
│   │   │   ├── dmmv.cuh
│   │   │   ├── quantize.cuh
│   │   │   ├── scale.cu
│   │   │   ├── vendors
│   │   │   │   └── cuda.h
│   │   │   ├── clamp.cu
│   │   │   ├── arange.cu
│   │   │   └── sumrows.cu
│   │   ├── vulkan-shaders
│   │   │   ├── generic_head.comp
│   │   │   ├── CMakeLists.txt
│   │   │   ├── dequant_head.comp
│   │   │   ├── scale.comp
│   │   │   ├── add.comp
│   │   │   ├── div.comp
│   │   │   ├── mul.comp
│   │   │   ├── cos.comp
│   │   │   ├── sin.comp
│   │   │   ├── square.comp
│   │   │   ├── clamp.comp
│   │   │   ├── copy.comp
│   │   │   ├── dequant_f32.comp
│   │   │   ├── tanh.comp
│   │   │   ├── relu.comp
│   │   │   ├── silu.comp
│   │   │   ├── leaky_relu.comp
│   │   │   ├── gelu_quick.comp
│   │   │   ├── mul_mat_split_k_reduce.comp
│   │   │   ├── acc.comp
│   │   │   ├── get_rows.comp
│   │   │   ├── repeat.comp
│   │   │   ├── gelu.comp
│   │   │   ├── diag_mask_inf.comp
│   │   │   ├── dequant_q8_0.comp
│   │   │   ├── pad.comp
│   │   │   ├── dequant_q4_0.comp
│   │   │   ├── dequant_iq4_nl.comp
│   │   │   ├── dequant_q4_1.comp
│   │   │   ├── rope_norm.comp
│   │   │   ├── rope_neox.comp
│   │   │   ├── sum_rows.comp
│   │   │   ├── get_rows_quant.comp
│   │   │   ├── dequant_q5_0.comp
│   │   │   ├── dequant_q5_1.comp
│   │   │   ├── timestep_embedding.comp
│   │   │   ├── upscale.comp
│   │   │   ├── rms_norm.comp
│   │   │   ├── norm.comp
│   │   │   ├── concat.comp
│   │   │   ├── dequant_q6_k.comp
│   │   │   └── dequant_q2_k.comp
│   │   ├── kompute-shaders
│   │   │   ├── op_scale.comp
│   │   │   ├── op_relu.comp
│   │   │   ├── op_mul_mv_q_n_pre.comp
│   │   │   ├── op_scale_8.comp
│   │   │   ├── op_silu.comp
│   │   │   ├── op_getrows.comp
│   │   │   ├── op_gelu.comp
│   │   │   ├── op_addrow.comp
│   │   │   ├── op_diagmask.comp
│   │   │   ├── op_getrows_f32.comp
│   │   │   ├── op_getrows_f16.comp
│   │   │   ├── op_getrows_q4_0.comp
│   │   │   ├── op_getrows_q4_1.comp
│   │   │   ├── op_mul_mat_q4_0.comp
│   │   │   ├── op_mul_mat_q4_1.comp
│   │   │   ├── op_getrows_q6_k.comp
│   │   │   ├── op_mul_mat_mat_f32.comp
│   │   │   ├── op_mul.comp
│   │   │   └── op_rmsnorm.comp
│   │   ├── llamafile
│   │   │   └── sgemm.h
│   │   ├── ggml-cann
│   │   │   └── kernels
│   │   │   │   ├── CMakeLists.txt
│   │   │   │   └── ascendc_kernels.h
│   │   └── ggml-sycl
│   │   │   ├── conv.hpp
│   │   │   ├── tsembd.hpp
│   │   │   ├── concat.hpp
│   │   │   ├── rope.hpp
│   │   │   ├── im2col.hpp
│   │   │   ├── softmax.hpp
│   │   │   ├── backend.hpp
│   │   │   ├── convert.hpp
│   │   │   ├── mmvq.hpp
│   │   │   ├── dmmv.hpp
│   │   │   └── mmq.hpp
│   ├── README.md
│   └── include
│   │   ├── ggml-blas.h
│   │   └── ggml-rpc.h
├── qnn_headers
│   └── CMakeLists.txt
├── stb_headers
│   └── CMakeLists.txt
├── llama_tokenizer
│   ├── CMakeLists.txt
│   └── unicode-data.h
└── CMakeLists.txt
├── tools
├── convert_hf_to_gguf
│   ├── gguf-py
│   │   ├── gguf
│   │   │   ├── py.typed
│   │   │   ├── __init__.py
│   │   │   └── gguf.py
│   │   ├── tests
│   │   │   └── __init__.py
│   │   ├── scripts
│   │   │   └── __init__.py
│   │   ├── LICENSE
│   │   ├── pyproject.toml
│   │   └── examples
│   │   │   └── writer.py
│   ├── llama-quantize-x86_64-linux-clang
│   └── requirements.txt
├── CMakeLists.txt
├── parameter_search
│   └── token_tree
│   │   ├── requirements.txt
│   │   ├── .gitignore
│   │   ├── search.py
│   │   ├── README.md
│   │   └── analyze.py
├── mmlu
│   ├── .gitignore
│   └── README.md
├── qnn_converter
│   ├── .gitignore
│   ├── prompt
│   │   ├── system_prompt_qwen.txt
│   │   └── system_prompt_llama.txt
│   ├── requirements.txt
│   ├── soc_config.py
│   └── graph_params.py
├── simple_qnn_test
│   ├── .gitignore
│   └── README.md
├── extract_embd_from_vl
│   └── README.md
├── gguf_config_to_json
│   └── CMakeLists.txt
├── gen_flame_graph.sh
├── end_to_end
│   └── powerserve.sh
└── cos_sim.py
├── requirements.txt
├── assets
├── system_prompts
│   ├── qwen2.txt
│   └── llama3.txt
└── prompts
│   ├── comparison_qwen2.txt
│   ├── strawberry_qwen2.txt
│   ├── comparison_llama.txt
│   ├── gsm8k1.txt
│   ├── gsm8k1_qwen.txt
│   ├── gsm8k1_llama.txt
│   ├── gsm8k2.txt
│   ├── short_prompt.txt
│   ├── gsm8k2_qwen.txt
│   └── long_prompt.txt
├── src
├── executor
│   ├── CMakeLists.txt
│   └── executor.hpp
├── tokenizer
│   └── CMakeLists.txt
├── model
│   ├── llama
│   │   └── CMakeLists.txt
│   ├── qwen2
│   │   └── CMakeLists.txt
│   ├── module
│   │   ├── CMakeLists.txt
│   │   ├── ffn.hpp
│   │   ├── norm_attention.hpp
│   │   ├── attention.hpp
│   │   ├── attention_mask.cpp
│   │   ├── ffn.cpp
│   │   └── attention_mask.hpp
│   ├── CMakeLists.txt
│   ├── internvl
│   │   └── CMakeLists.txt
│   └── model_loader.hpp
├── speculative
│   ├── CMakeLists.txt
│   └── speculative_config.hpp
├── graph
│   ├── CMakeLists.txt
│   ├── node.cpp
│   └── op_type.hpp
├── storage
│   └── CMakeLists.txt
├── sampler
│   ├── CMakeLists.txt
│   └── sampler_chain.hpp
├── backend
│   ├── ggml
│   │   └── CMakeLists.txt
│   ├── CMakeLists.txt
│   ├── qnn
│   │   └── CMakeLists.txt
│   └── backend.hpp
├── core
│   ├── CMakeLists.txt
│   ├── buffer.hpp
│   ├── spin_barrier.h
│   ├── defines.hpp
│   ├── getenv.hpp
│   ├── typedefs.hpp
│   ├── spin_barrier.hpp
│   ├── timer.hpp
│   └── android_logger.hpp
└── CMakeLists.txt
├── app
├── run
│   ├── CMakeLists.txt
│   └── README.md
├── perplexity
│   └── CMakeLists.txt
├── common
│   └── CMakeLists.txt
├── CMakeLists.txt
└── server
│   ├── CMakeLists.txt
│   └── server.cpp
├── pyproject.toml
├── .gitignore
├── tests
│   ├── CMakeLists.txt
│   └── quant_mul_mat.cpp
├── docs
│   └── perfetto_tracing.md
├── .clang-format
└── .gitmodules

--------------------------------------------------------------------------------
/libs/.clang-format:
--------------------------------------------------------------------------------
DisableFormat: true
--------------------------------------------------------------------------------
/tools/convert_hf_to_gguf/gguf-py/gguf/py.typed:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/tools/CMakeLists.txt:
--------------------------------------------------------------------------------
add_subdirectory(gguf_config_to_json)
--------------------------------------------------------------------------------
/tools/parameter_search/token_tree/requirements.txt:
--------------------------------------------------------------------------------
tqdm
--------------------------------------------------------------------------------
/tools/parameter_search/token_tree/.gitignore:
--------------------------------------------------------------------------------
*.jsonl
*.bak
--------------------------------------------------------------------------------
/tools/convert_hf_to_gguf/gguf-py/tests/__init__.py:
--------------------------------------------------------------------------------
from .test_metadata import *
--------------------------------------------------------------------------------
/tools/mmlu/.gitignore:
--------------------------------------------------------------------------------
/*.json
/prompt/*.txt
dev/
test/
val/
--------------------------------------------------------------------------------
/tools/qnn_converter/.gitignore:
--------------------------------------------------------------------------------
output/
smallthinker_*/
tmp*/
debug/
--------------------------------------------------------------------------------
/libs/ggml/.gitignore:
--------------------------------------------------------------------------------
src/ggml-vulkan-shaders.hpp
src/ggml-vulkan-shaders.cpp
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
-r ./tools/convert_hf_to_gguf/requirements.txt

isort
black
--------------------------------------------------------------------------------
/assets/system_prompts/qwen2.txt:
--------------------------------------------------------------------------------
<|im_start|>system
You are a helpful assistant.<|im_end|>
--------------------------------------------------------------------------------
/src/executor/CMakeLists.txt:
--------------------------------------------------------------------------------
target_sources(powerserve PRIVATE
    "executor.cpp"
)
--------------------------------------------------------------------------------
/src/tokenizer/CMakeLists.txt:
--------------------------------------------------------------------------------
target_sources(powerserve PRIVATE
    "tokenizer.cpp"
)
--------------------------------------------------------------------------------
/src/model/llama/CMakeLists.txt:
--------------------------------------------------------------------------------
target_sources(powerserve PRIVATE
    "llama_model.cpp"
)
--------------------------------------------------------------------------------
/src/model/qwen2/CMakeLists.txt:
--------------------------------------------------------------------------------
target_sources(powerserve PRIVATE
    "qwen2_model.cpp"
)
--------------------------------------------------------------------------------
/src/speculative/CMakeLists.txt:
--------------------------------------------------------------------------------
target_sources(powerserve PRIVATE
    "token_tree.cpp"
)
--------------------------------------------------------------------------------
/src/graph/CMakeLists.txt:
--------------------------------------------------------------------------------
target_sources(powerserve PRIVATE
    "graph.cpp"
    "node.cpp"
)
--------------------------------------------------------------------------------
/tools/qnn_converter/prompt/system_prompt_qwen.txt:
--------------------------------------------------------------------------------
<|im_start|>system
You are a helpful assistant.<|im_end|>
--------------------------------------------------------------------------------
/tools/simple_qnn_test/.gitignore:
--------------------------------------------------------------------------------
./model_libs
./output

*.bin
*.onnx
*.cpp
*.json
*.so
--------------------------------------------------------------------------------
/assets/system_prompts/llama3.txt:
--------------------------------------------------------------------------------
<|start_header_id|>system<|end_header_id|>
You are a helpful assistant.<|eot_id|>
--------------------------------------------------------------------------------
/assets/prompts/comparison_qwen2.txt:
--------------------------------------------------------------------------------
<|im_start|>user
9.11 and 9.8, which is larger?<|im_end|>
<|im_start|>assistant
--------------------------------------------------------------------------------
/assets/prompts/strawberry_qwen2.txt:
--------------------------------------------------------------------------------
<|im_start|>user
How many "r"s in "strawberry"?<|im_end|>
<|im_start|>assistant
--------------------------------------------------------------------------------
/src/storage/CMakeLists.txt:
--------------------------------------------------------------------------------
file(GLOB_RECURSE storage_source *.cpp)
target_sources(powerserve PRIVATE
    ${storage_source}
)
--------------------------------------------------------------------------------
/tools/qnn_converter/prompt/system_prompt_llama.txt:
--------------------------------------------------------------------------------
<|start_header_id|>system<|end_header_id|>
You are a helpful assistant.<|eot_id|>
--------------------------------------------------------------------------------
/src/sampler/CMakeLists.txt:
--------------------------------------------------------------------------------
target_sources(powerserve PRIVATE
    "prob_array.cpp"
    "sampler_chain.cpp"
    "sampler.cpp"
)
--------------------------------------------------------------------------------
/app/run/CMakeLists.txt:
--------------------------------------------------------------------------------
add_executable(run run.cpp)
target_link_libraries(run PRIVATE powerserve app.common)

powerserve_add_binary(run)
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/argsort.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/fattn.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/src/backend/ggml/CMakeLists.txt:
--------------------------------------------------------------------------------
target_sources(powerserve PRIVATE
    "ggml.cpp"
    "ggml_kv_cache.cpp"
    "ggml_wrapper.cpp"
)
--------------------------------------------------------------------------------
/src/model/module/CMakeLists.txt:
--------------------------------------------------------------------------------
target_sources(powerserve PRIVATE
    "attention_mask.cpp"
    "ffn.cpp"
    "norm_attention.cpp"
)
--------------------------------------------------------------------------------
/libs/qnn_headers/CMakeLists.txt:
--------------------------------------------------------------------------------
add_library(qnn_headers INTERFACE
    "qnn_type_macros.hpp"
)
target_include_directories(qnn_headers INTERFACE .)
--------------------------------------------------------------------------------
/assets/prompts/comparison_llama.txt:
--------------------------------------------------------------------------------
<|start_header_id|>user<|end_header_id|>9.11 and 9.8, which is larger?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/fattn-tile-f16.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/fattn-tile-f32.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/acc.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

#define CUDA_ACC_BLOCK_SIZE 256

void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/pad.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

#define CUDA_PAD_BLOCK_SIZE 256

void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/tools/convert_hf_to_gguf/llama-quantize-x86_64-linux-clang:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/powerserve-project/PowerServe/HEAD/tools/convert_hf_to_gguf/llama-quantize-x86_64-linux-clang
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/clamp.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

#define CUDA_CLAMP_BLOCK_SIZE 256

void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/rope.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

#define CUDA_ROPE_BLOCK_SIZE 256

void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/scale.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

#define CUDA_SCALE_BLOCK_SIZE 256

void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/tools/convert_hf_to_gguf/requirements.txt:
--------------------------------------------------------------------------------
numpy>=1.26.4
sentencepiece>=0.2.0
transformers>=4.40.1,<5.0.0
gguf>=0.1.0
protobuf>=4.21.0,<5.0.0
torch>=2.2.1,<2.5.0
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/arange.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

#define CUDA_ARANGE_BLOCK_SIZE 256

void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/concat.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

#define CUDA_CONCAT_BLOCK_SIZE 256

void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/im2col.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

#define CUDA_IM2COL_BLOCK_SIZE 256

void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/pool2d.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

#define CUDA_POOL2D_BLOCK_SIZE 256

void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/getrows.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

#define CUDA_GET_ROWS_BLOCK_SIZE 256

void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/softmax.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

#define CUDA_SOFT_MAX_BLOCK_SIZE 1024

void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/upscale.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

#define CUDA_UPSCALE_BLOCK_SIZE 256

void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/libs/stb_headers/CMakeLists.txt:
--------------------------------------------------------------------------------
add_library(stb_headers INTERFACE
    "stb/stb_image.h"
    "stb/stb_image_resize2.h"
)
target_include_directories(stb_headers INTERFACE ./stb)
--------------------------------------------------------------------------------
/tools/qnn_converter/requirements.txt:
--------------------------------------------------------------------------------
onnx==1.16.1
onnxruntime==1.18.0
onnxsim==0.4.36
sentencepiece>=0.2.0
transformers>=4.40.1,<5.0.0
torch>=1.13.1,<2.5.0
protobuf==3.20.2
--------------------------------------------------------------------------------
/tools/extract_embd_from_vl/README.md:
--------------------------------------------------------------------------------
# export embedding only
```
python ./tools/extract_embd_from_vl/main.py --model-path /data1/models/Qwen2-VL-2B-Instruct/ --out-path ./tmp.gguf
```
--------------------------------------------------------------------------------
/app/perplexity/CMakeLists.txt:
--------------------------------------------------------------------------------
add_executable(perplexity-test "main.cpp")
target_link_libraries(perplexity-test PRIVATE powerserve ggml app.common)

powerserve_add_artifact(perplexity-test)
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/diagmask.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32

void ggml_cuda_op_diag_mask_inf(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/src/backend/CMakeLists.txt:
--------------------------------------------------------------------------------
target_sources(powerserve PRIVATE
    "platform.cpp"
)

add_subdirectory(ggml)

if (POWERSERVE_WITH_QNN)
    add_subdirectory(qnn)
endif()
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/tsembd.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

#define CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE 256

void ggml_cuda_op_timestep_embedding(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/src/model/CMakeLists.txt:
--------------------------------------------------------------------------------
add_subdirectory(llama)
add_subdirectory(internvl)
add_subdirectory(qwen2)
add_subdirectory(module)

target_sources(powerserve PRIVATE
    "model_loader.cpp"
)
--------------------------------------------------------------------------------
/tools/parameter_search/token_tree/search.py:
--------------------------------------------------------------------------------
import time

from common import *
from tqdm import tqdm


for params in tqdm(untested_params):
    run(params)
    time.sleep(5)
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/conv-transpose-1d.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

#define CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE 256

void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_IQ1_S);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_IQ2_S);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_IQ3_S);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_Q2_K);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_Q3_K);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_Q4_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_Q4_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_Q4_K);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_Q5_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_Q5_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_Q5_K);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_Q6_K);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_Q8_0);
--------------------------------------------------------------------------------
/src/model/internvl/CMakeLists.txt:
--------------------------------------------------------------------------------
target_sources(powerserve PRIVATE
    "internvl_model.cpp"
)
target_link_libraries(powerserve PRIVATE stb_headers)
target_link_libraries(powerserve PRIVATE xtensor xtl)
--------------------------------------------------------------------------------
/tools/gguf_config_to_json/CMakeLists.txt:
--------------------------------------------------------------------------------
add_executable(config-generator "main.cpp")
target_link_libraries(config-generator PRIVATE powerserve ggml CLI11::CLI11)

powerserve_add_artifact(config-generator)
--------------------------------------------------------------------------------
/app/common/CMakeLists.txt:
--------------------------------------------------------------------------------
add_library(app.common STATIC cmdline.cpp)
target_link_libraries(app.common PRIVATE powerserve CLI11::CLI11)

target_include_directories(app.common PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/cross-entropy-loss.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

#define CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE 256

void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_IQ2_XS);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_IQ2_XXS);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_IQ3_XXS);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_IQ4_NL);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_IQ4_XS);
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[tool.black]
line-length = 120
include = 'powerserve'
extend-exclude = '/.*/'
preview = true
enable-unstable-feature = ['hug_parens_with_braces_and_square_brackets', 'docstring_check_for_newline']
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
build*/
.cache/
.vscode/
__pycache__/
.venv/
models/
proj/

*.svg
*.data
*.folded
*.perf
Pipfile
tools/llama_tf_to_qnn_bin
tools/process_image
*.log
--------------------------------------------------------------------------------
/libs/ggml/src/vulkan-shaders/generic_head.comp:
--------------------------------------------------------------------------------
#extension GL_EXT_shader_16bit_storage : require

layout (push_constant) uniform parameter
{
    uint KX;
    uint KY;
    float param1;
    float param2;
} p;
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/sum.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream);

void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/libs/ggml/README.md:
--------------------------------------------------------------------------------
## Copyright and License

The original ggml project is © 2023-2024 by the ggml authors and is released under the MIT License. You can find a copy of the original license [here](https://github.com/ggerganov/ggml/blob/master/LICENSE).
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/sumrows.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream);

void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_F16);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f32.cuh"

DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_F16);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q8_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f32.cuh"

DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f32.cuh"

DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f32.cuh"

DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f32.cuh"

DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f32.cuh"

DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f32.cuh"

DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f32.cuh"

DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f32.cuh"

DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f32.cuh"

DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f32.cuh"

DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f32.cuh"

DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 
2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 
2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 
2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 
2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /assets/prompts/gsm8k1.txt: -------------------------------------------------------------------------------- 1 | Mark has a garden with flowers. He planted plants of three different colors in it. Ten of them are yellow, and there are 80% more of those in purple. There are only 25% as many green flowers as there are yellow and purple flowers. How many flowers does Mark have in his garden? -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(quant_mul_mat "quant_mul_mat.cpp") 2 | target_link_libraries(quant_mul_mat PRIVATE powerserve ggml) 3 | 4 | if (POWERSERVE_WITH_QNN) 5 | add_executable(qnn_test qnn_test.cpp) 6 | target_link_libraries(qnn_test PRIVATE powerserve CLI11::CLI11) 7 | endif() 8 | -------------------------------------------------------------------------------- /tools/convert_hf_to_gguf/gguf-py/gguf/__init__.py: -------------------------------------------------------------------------------- 1 | from .constants import * 2 | from .gguf_reader import * 3 | from .gguf_writer import * 4 | from .lazy import * 5 | from .metadata import * 6 | from .quants import * 7 | from .tensor_mapping import * 8 | from .utility import * 9 | from .vocab import * 10 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/norm.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 4 | 5 | void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | 7 | void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 8 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | find_package (Threads REQUIRED) 2 | 3 | set(TARGET vulkan-shaders-gen) 4 | add_executable(${TARGET} vulkan-shaders-gen.cpp) 5 | install(TARGETS ${TARGET} RUNTIME) 6 | target_compile_features(${TARGET} PRIVATE cxx_std_11) 7 | target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads) 8 | -------------------------------------------------------------------------------- /src/core/CMakeLists.txt: 
-------------------------------------------------------------------------------- 1 | target_sources(powerserve PRIVATE 2 | "config.cpp" 3 | "perf.cpp" 4 | "perfetto_trace.cpp" 5 | "thread_pool.cpp" 6 | "timer.cpp" 7 | "spin_barrier.cpp" 8 | ) 9 | 10 | if (POWERSERVE_WITH_PERFETTO) 11 | target_compile_definitions(powerserve PUBLIC POWERSERVE_WITH_PERFETTO) 12 | endif() 13 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/dequant_head.comp: -------------------------------------------------------------------------------- 1 | #extension GL_EXT_control_flow_attributes : require 2 | #extension GL_EXT_shader_16bit_storage : require 3 | 4 | layout (push_constant) uniform parameter 5 | { 6 | uint M; 7 | uint K; 8 | uint stride_a; 9 | uint stride_b; 10 | uint nel; 11 | } p; 12 | 13 | #include "types.comp" 14 | -------------------------------------------------------------------------------- /assets/prompts/gsm8k1_qwen.txt: -------------------------------------------------------------------------------- 1 | <|im_start|>user 2 | Mark has a garden with flowers. He planted plants of three different colors in it. Ten of them are yellow, and there are 80% more of those in purple. There are only 25% as many green flowers as there are yellow and purple flowers. How many flowers does Mark have in his garden?<|im_end|> 3 | <|im_start|>assistant 4 | -------------------------------------------------------------------------------- /tools/convert_hf_to_gguf/gguf-py/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | # pyright: reportUnusedImport=false 2 | 3 | from .gguf_convert_endian import main as gguf_convert_endian_entrypoint 4 | from .gguf_dump import main as gguf_dump_entrypoint 5 | from .gguf_new_metadata import main as gguf_new_metadata_entrypoint 6 | from .gguf_set_metadata import main as gguf_set_metadata_entrypoint 7 | -------------------------------------------------------------------------------- /tools/qnn_converter/soc_config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | @dataclass 5 | class SoCConfig: 6 | htp_version: int 7 | soc_id: int 8 | 9 | 10 | soc_map = { 11 | "8650": SoCConfig(htp_version=75, soc_id=57), 12 | "8750": SoCConfig(htp_version=79, soc_id=69), 13 | "8295": SoCConfig(htp_version=68, soc_id=39), 14 | } 15 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/cpy.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_CPY_BLOCK_SIZE 32 4 | 5 | void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1); 6 | 7 | void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 8 | 9 | void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1); 10 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/scale.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_unary_head.comp" 5 | 6 | void main() { 7 | const uint idx = get_idx(); 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(p.param1)); 14 | } 15 | 
-------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-wmma-f16.cuh" 4 | 5 | DECL_FATTN_WMMA_F16_CASE(64, 8, half); 6 | DECL_FATTN_WMMA_F16_CASE(96, 8, half); 7 | DECL_FATTN_WMMA_F16_CASE(128, 8, half); 8 | DECL_FATTN_WMMA_F16_CASE(256, 8, half); 9 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/add.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_binary_head.comp" 5 | 6 | void main() { 7 | const uint idx = get_idx(); 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) + FLOAT_TYPE(data_b[src1_idx(idx)])); 14 | } 15 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/div.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_binary_head.comp" 5 | 6 | void main() { 7 | const uint idx = get_idx(); 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) / FLOAT_TYPE(data_b[src1_idx(idx)])); 14 | } 15 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/mul.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_binary_head.comp" 5 | 6 | void main() { 7 | const uint idx = get_idx(); 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(data_b[src1_idx(idx)])); 14 | } 15 | -------------------------------------------------------------------------------- /assets/prompts/gsm8k1_llama.txt: -------------------------------------------------------------------------------- 1 | <|start_header_id|>user<|end_header_id|>Mark has a garden with flowers. He planted plants of three different colors in it. Ten of them are yellow, and there are 80% more of those in purple. There are only 25% as many green flowers as there are yellow and purple flowers. 
How many flowers does Mark have in his garden?<|eot_id|><|start_header_id|>assistant<|end_header_id|> -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/cos.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_unary_head.comp" 5 | 6 | void main() { 7 | const uint idx = get_idx(); 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]); 14 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(cos(val)); 15 | } 16 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/sin.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_unary_head.comp" 5 | 6 | void main() { 7 | const uint idx = get_idx(); 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]); 14 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(sin(val)); 15 | } 16 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/square.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_unary_head.comp" 5 | 6 | void main() { 7 | const uint idx = get_idx(); 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]); 14 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val * val); 15 | } 16 | -------------------------------------------------------------------------------- /assets/prompts/gsm8k2.txt: -------------------------------------------------------------------------------- 1 | Alexis is applying for a new job and bought a new set of business clothes to wear to the interview. She went to a department store with a budget of $200 and spent $30 on a button-up shirt, $46 on suit pants, $38 on a suit coat, $11 on socks, and $18 on a belt. She also purchased a pair of shoes, but lost the receipt for them. She has $16 left from her budget. How much did Alexis pay for the shoes? -------------------------------------------------------------------------------- /assets/prompts/short_prompt.txt: -------------------------------------------------------------------------------- 1 | Mia is planning a camping trip in the Canadian Rockies and has a budget of $800 for equipment. She buys a tent for $120, which is 15% off the original price. She then purchases a sleeping bag for $80, which is 20% off. If she also needs to buy a backpack and a portable stove, and the total cost of these two items is $180, what percentage of her budget will she have left after all the purchases? -------------------------------------------------------------------------------- /libs/llama_tokenizer/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(llama_tokenizer STATIC 2 | "llama-vocab.cpp" 3 | "unicode-data.cpp" 4 | "unicode.cpp" 5 | ) 6 | target_link_libraries(llama_tokenizer PRIVATE ggml) 7 | target_include_directories(llama_tokenizer PUBLIC .) 
8 | 9 | if (CMAKE_C_COMPILER_ID MATCHES "Clang") 10 | target_compile_options(llama_tokenizer PRIVATE -Wno-deprecated-declarations) 11 | endif() 12 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-wmma-f16.cuh" 4 | 5 | DECL_FATTN_WMMA_F16_CASE(64, 32, float); 6 | DECL_FATTN_WMMA_F16_CASE(80, 32, float); 7 | DECL_FATTN_WMMA_F16_CASE(96, 32, float); 8 | DECL_FATTN_WMMA_F16_CASE(112, 32, float); 9 | DECL_FATTN_WMMA_F16_CASE(128, 32, float); 10 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/clamp.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_unary_head.comp" 5 | 6 | void main() { 7 | const uint idx = get_idx(); 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]); 14 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val)); 15 | } 16 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/binbcast.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 4 | void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 5 | void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 7 | void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 8 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/copy.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_unary_head.comp" 5 | 6 | void main() { 7 | const uint idx = get_idx(); 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | #ifndef OPTIMIZATION_ERROR_WORKAROUND 14 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx(idx)]); 15 | #else 16 | data_d[p.d_offset + dst_idx(idx)] = data_a[src0_idx(idx)]; 17 | #endif 18 | } 19 | -------------------------------------------------------------------------------- /assets/prompts/gsm8k2_qwen.txt: -------------------------------------------------------------------------------- 1 | <|im_start|>user 2 | Alexis is applying for a new job and bought a new set of business clothes to wear to the interview. She went to a department store with a budget of $200 and spent $30 on a button-up shirt, $46 on suit pants, $38 on a suit coat, $11 on socks, and $18 on a belt. She also purchased a pair of shoes, but lost the receipt for them. She has $16 left from her budget. 
How much did Alexis pay for the shoes?<|im_end|> 3 | <|im_start|>assistant 4 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/convert.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_DEQUANTIZE_BLOCK_SIZE 256 4 | 5 | template<typename T> 6 | using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream); 7 | 8 | typedef to_t_cuda_t<float> to_fp32_cuda_t; 9 | typedef to_t_cuda_t<half> to_fp16_cuda_t; 10 | 11 | to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type); 12 | 13 | to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type); 14 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-wmma-f16.cuh" 4 | 5 | DECL_FATTN_WMMA_F16_CASE(64, 16, half); 6 | DECL_FATTN_WMMA_F16_CASE(80, 16, half); 7 | DECL_FATTN_WMMA_F16_CASE(96, 16, half); 8 | DECL_FATTN_WMMA_F16_CASE(112, 16, half); 9 | DECL_FATTN_WMMA_F16_CASE(128, 16, half); 10 | DECL_FATTN_WMMA_F16_CASE(256, 16, half); 11 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-wmma-f16.cuh" 4 | 5 | DECL_FATTN_WMMA_F16_CASE(64, 32, half); 6 | DECL_FATTN_WMMA_F16_CASE(80, 32, half); 7 | DECL_FATTN_WMMA_F16_CASE(96, 32, half); 8 | DECL_FATTN_WMMA_F16_CASE(112, 32, half); 9 | DECL_FATTN_WMMA_F16_CASE(128, 32, half); 10 | DECL_FATTN_WMMA_F16_CASE(256, 32, half); 11 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-wmma-f16.cuh" 4 | 5 | DECL_FATTN_WMMA_F16_CASE(64, 16, float); 6 | DECL_FATTN_WMMA_F16_CASE(80, 16, float); 7 | DECL_FATTN_WMMA_F16_CASE(96, 16, float); 8 | DECL_FATTN_WMMA_F16_CASE(112, 16, float); 9 | DECL_FATTN_WMMA_F16_CASE(128, 16, float); 10 | DECL_FATTN_WMMA_F16_CASE(256, 16, float); 11 | -------------------------------------------------------------------------------- /tools/parameter_search/token_tree/README.md: -------------------------------------------------------------------------------- 1 | First, export the model and build the project. 2 | 3 | Hyperparameters are set by editing `search_grid` in `common.py`; the run command is changed in the `run` function. 4 | 5 | Push the scripts to the phone: 6 | 7 | ```bash 8 | rsync -avzP tools/parameter_search/token_tree/{common,search}.py 8gen4:~/ 9 | ``` 10 | 11 | Run the script on the phone: 12 | 13 | ```bash 14 | python search.py 15 | ``` 16 | 17 | Then download the database to your local machine and analyze it with `analyze.py`: 18 | 19 | ```bash 20 | cd tools/parameter_search/token_tree 21 | rsync -avzP 8gen4:~/database.jsonl . 22 | python analyze.py 23 | ``` 24 | 
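A minimal sketch of what `common.py` might provide, for orientation only. The actual file is not included in this dump, so every name and value below is an assumption, inferred from how `analyze.py` (shown later in this dump) uses `database` and `format_params`:

```python
# Hypothetical sketch of common.py -- NOT the actual tool code.
# analyze.py expects a `database` that maps each parameter combination to a
# stat dict (with n_iterations, n_draft_times, n_generated_tokens) or None,
# plus a `format_params` helper; the grid keys and values here are made up.
import json
from itertools import product
from pathlib import Path

# Assumed hyperparameter grid that search.py would sweep.
search_grid = {
    "tree_depth": [4, 6, 8],
    "top_k": [8, 16],
}

def iter_grid(grid: dict):
    # Yield every combination as a hashable tuple of (key, value) pairs.
    keys = sorted(grid)
    for values in product(*(grid[k] for k in keys)):
        yield tuple(zip(keys, values))

def format_params(params) -> str:
    # Render a combination such as (("top_k", 8), ("tree_depth", 4)) readably.
    return " ".join(f"{key}={value}" for key, value in params)

def _load_database(path: Path = Path("database.jsonl")) -> dict:
    # One JSON record per line: {"params": {...}, "stat": {...}} (stat may be null).
    database = {params: None for params in iter_grid(search_grid)}
    if path.exists():
        with path.open() as f:
            for line in f:
                record = json.loads(line)
                database[tuple(sorted(record["params"].items()))] = record["stat"]
    return database

database = _load_database()
```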
-------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/mmvq.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels. 4 | 5 | void ggml_cuda_op_mul_mat_vec_q( 6 | ggml_backend_cuda_context & ctx, 7 | const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, 8 | const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, 9 | const int64_t src1_padded_row_size, cudaStream_t stream); 10 | -------------------------------------------------------------------------------- /libs/ggml/src/kompute-shaders/op_scale.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | layout(local_size_x = 1) in; 6 | 7 | layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; 8 | layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; 9 | 10 | layout(push_constant) uniform PushConstants { 11 | uint inOff; 12 | uint outOff; 13 | float scale; 14 | } pcs; 15 | 16 | void main() { 17 | const uint i = gl_WorkGroupID.x; 18 | out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale; 19 | } 20 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/dequant_f32.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "dequant_head.comp" 4 | 5 | layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; 6 | 7 | layout (binding = 0) readonly buffer A {float data_a[];}; 8 | layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; 9 | 10 | void main() { 11 | const uint i = gl_GlobalInvocationID.x * 16; 12 | 13 | if (i >= p.nel) { 14 | return; 15 | } 16 | 17 | [[unroll]] for (uint l = 0; l < 16; l++) { 18 | data_b[i + l] = D_TYPE(data_a[i + l]); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /app/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(POWERSERVE_BINARY_DIR ${PROJECT_BINARY_DIR}/bin) 2 | message(STATUS "PowerServe binary dir: ${POWERSERVE_BINARY_DIR}") 3 | 4 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${POWERSERVE_BINARY_DIR}) 5 | set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${POWERSERVE_BINARY_DIR}) 6 | set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${POWERSERVE_BINARY_DIR}) 7 | 8 | function(powerserve_add_binary target_name) 9 | powerserve_add_artifact(${target_name}) 10 | endfunction() 11 | 12 | add_subdirectory(common) 13 | 14 | add_subdirectory(run) 15 | add_subdirectory(perplexity) 16 | add_subdirectory(server) 17 | -------------------------------------------------------------------------------- /tools/convert_hf_to_gguf/gguf-py/gguf/gguf.py: -------------------------------------------------------------------------------- 1 | # This file is left for compatibility. If you want to use the GGUF API from Python 2 | # then don't import gguf/gguf.py directly. 
If you're looking for examples, see the 3 | # examples/ directory for gguf-py 4 | 5 | import importlib 6 | import sys 7 | from pathlib import Path 8 | 9 | 10 | sys.path.insert(0, str(Path(__file__).parent.parent)) 11 | 12 | # Compatibility for people trying to import gguf/gguf.py directly instead of as a package. 13 | importlib.invalidate_caches() 14 | import gguf # noqa: E402 15 | 16 | 17 | importlib.reload(gguf) 18 | -------------------------------------------------------------------------------- /src/backend/qnn/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(QNN_SDK $ENV{QNN_SDK_ROOT}) 2 | 3 | if ("${QNN_SDK}" STREQUAL "") 4 | message(FATAL_ERROR "Environment variable \"QNN_SDK_ROOT\" is not defined.") 5 | endif() 6 | 7 | message("QNN_SDK_ROOT=${QNN_SDK}") 8 | 9 | target_sources(powerserve PRIVATE 10 | "causal_models.cpp" 11 | "graph_interface.cpp" 12 | "qnn_backend.cpp" 13 | "qnn.cpp" 14 | ) 15 | target_include_directories(powerserve PUBLIC ${QNN_SDK}/include/QNN) 16 | target_compile_definitions(powerserve PUBLIC POWERSERVE_WITH_QNN) 17 | target_link_libraries(powerserve PUBLIC qnn_headers) 18 | -------------------------------------------------------------------------------- /libs/ggml/src/kompute-shaders/op_relu.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | layout(local_size_x = 1) in; 6 | 7 | layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; 8 | layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; 9 | layout(push_constant) uniform PushConstants { 10 | uint inOff; 11 | uint outOff; 12 | } pcs; 13 | 14 | void main() { 15 | const uint baseIndex = gl_WorkGroupID.x * 4; 16 | 17 | for (uint x = 0; x < 4; x++) { 18 | const uint i = baseIndex + x; 19 | out_[i + pcs.outOff] = max(0.0, in_[i + pcs.inOff]); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/tanh.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "generic_head.comp" 4 | #include "types.comp" 5 | 6 | #extension GL_EXT_control_flow_attributes : enable 7 | 8 | layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; 9 | 10 | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; 11 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 12 | 13 | void main() { 14 | const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; 15 | 16 | if (i >= p.KX) { 17 | return; 18 | } 19 | 20 | data_d[i] = D_TYPE(tanh(data_a[i])); 21 | } 22 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/relu.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "generic_head.comp" 4 | #include "types.comp" 5 | 6 | #extension GL_EXT_control_flow_attributes : enable 7 | 8 | layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; 9 | 10 | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; 11 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 12 | 13 | void main() { 14 | const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; 15 | 16 | if (i >= p.KX) { 17 | return; 18 | } 19 | 20 | data_d[i] = max(float(data_a[i]), 0); 21 | } 22 | 
-------------------------------------------------------------------------------- /libs/ggml/src/kompute-shaders/op_mul_mv_q_n_pre.comp: -------------------------------------------------------------------------------- 1 | layout(local_size_x_id = 0) in; 2 | layout(local_size_y = 1) in; 3 | layout(local_size_z = 1) in; 4 | 5 | layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; 6 | layout (binding = 1) readonly buffer tensorInB { float inB[]; }; 7 | layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; 8 | 9 | layout (push_constant) uniform parameter { 10 | uint inAOff; 11 | uint inBOff; 12 | uint outOff; 13 | int ne00; 14 | int ne01; 15 | int ne02; 16 | int ne10; 17 | int ne12; 18 | int ne0; 19 | int ne1; 20 | uint r2; 21 | uint r3; 22 | } pcs; 23 | -------------------------------------------------------------------------------- /libs/ggml/src/kompute-shaders/op_scale_8.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | layout(local_size_x = 1) in; 6 | 7 | layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; 8 | layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; 9 | 10 | layout(push_constant) uniform PushConstants { 11 | uint inOff; 12 | uint outOff; 13 | float scale; 14 | } pcs; 15 | 16 | void main() { 17 | const uint baseIndex = gl_WorkGroupID.x * 8; 18 | 19 | for (uint x = 0; x < 8; x++) { 20 | const uint i = baseIndex + x; 21 | out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale; 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /libs/ggml/src/kompute-shaders/op_silu.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | layout(local_size_x = 1) in; 6 | 7 | layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; 8 | layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; 9 | layout(push_constant) uniform PushConstants { 10 | uint inOff; 11 | uint outOff; 12 | } pcs; 13 | 14 | void main() { 15 | const uint baseIndex = gl_WorkGroupID.x * 4; 16 | 17 | for (uint x = 0; x < 4; x++) { 18 | const uint i = baseIndex + x; 19 | const float y = in_[i + pcs.inOff]; 20 | out_[i + pcs.outOff] = y / (1.0 + exp(-y)); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/silu.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "generic_head.comp" 4 | #include "types.comp" 5 | 6 | #extension GL_EXT_control_flow_attributes : enable 7 | 8 | layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; 9 | 10 | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; 11 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 12 | 13 | void main() { 14 | const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; 15 | 16 | if (i >= p.KX) { 17 | return; 18 | } 19 | 20 | const float xi = float(data_a[i]); 21 | data_d[i] = D_TYPE(xi / (1.0f + exp(-xi))); 22 | } 23 | -------------------------------------------------------------------------------- /libs/ggml/src/kompute-shaders/op_getrows.comp: -------------------------------------------------------------------------------- 1 | void main() { 2 | const uint i = gl_WorkGroupID.x; 3 | const int r = inB[i + pcs.inBOff]; 4 | 5 | int z 
= 0; 6 | for (uint ind = gl_LocalInvocationID.x; ind < pcs.ne00/16; ind += gl_WorkGroupSize.x) { 7 | const uint inIndex = (r * pcs.nb01 + pcs.inAOff) + ind/NL * SIZE_OF_BLOCK; 8 | const mat4 result = dequantize_block(inIndex, ind%NL); 9 | for (uint j = 0; j < 4; ++j) { 10 | for (uint k = 0; k < 4; ++k) { 11 | const uint outIndex = i * pcs.nb1/BYTES_FOR_TYPE + pcs.outOff + z; 12 | out_[outIndex] = result[j][k]; 13 | ++z; 14 | } 15 | } 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/leaky_relu.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "generic_head.comp" 4 | #include "types.comp" 5 | 6 | #extension GL_EXT_control_flow_attributes : enable 7 | 8 | layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; 9 | 10 | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; 11 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 12 | 13 | void main() { 14 | const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; 15 | 16 | if (i >= p.KX) { 17 | return; 18 | } 19 | 20 | const float val = float(data_a[i]); 21 | data_d[i] = D_TYPE(max(val, 0.0f) + min(val, 0.0f) * p.param1); 22 | } 23 | -------------------------------------------------------------------------------- /tools/gen_flame_graph.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # bash ./tools/gen_flame_graph.sh ./build/bin/run --file-path ../models/Meta-Llama-3.1-8B/llama3-8b_Q4_0.gguf --vocab-path ../models/Meta-Llama-3.1-8B/llama3.1_8b_vocab.gguf --prompt "Tell me a story:" --steps 16 3 | set -x 4 | 5 | TOOLS_DIR="/home/zwb/SS/FlameGraph" 6 | CUR_DIR=$(pwd) 7 | 8 | cmd=("$@") 9 | 10 | sudo perf record -F 99 -a -g -- "${cmd[@]}" 11 | sudo perf script -i perf.data > $CUR_DIR/out.perf 12 | sudo $TOOLS_DIR/stackcollapse-perf.pl $CUR_DIR/out.perf > $CUR_DIR/out.folded 13 | sudo $TOOLS_DIR/flamegraph.pl $CUR_DIR/out.folded > $CUR_DIR/out.svg 14 | sudo chmod a+rw $CUR_DIR/out.svg 15 | 16 | sudo rm -f $CUR_DIR/perf.data $CUR_DIR/out.perf $CUR_DIR/out.folded 17 | -------------------------------------------------------------------------------- /tools/simple_qnn_test/README.md: -------------------------------------------------------------------------------- 1 | # Simple QNN Test 2 | 3 | ## Convert Model 4 | 5 | Convert the model, compile it, and generate the binary: 6 | ```shell 7 | python3 simple_convert_qnn.py --model-name simple_model --model-num 3 8 | ``` 9 | The output will be located in the `output` directory. 10 | 11 | ## Execute Model 12 | 13 | Push the model to the phone: 14 | 15 | ```shell 16 | adb push output/simple_model.bin <device_dir> 17 | ``` 18 | 19 | ```shell 20 | <build_dir>/tests/qnn_tests --qnn-path /data/local/tmp/simple_model \ 21 | --model-name <model_name> \ 22 | --graph-num <graph_num> \ 23 | --repeat <repeat_times> 24 | ``` 25 | -------------------------------------------------------------------------------- /libs/ggml/src/kompute-shaders/op_gelu.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | layout(local_size_x = 1) in; 6 | 7 | layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; 8 | layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; 9 | layout(push_constant) uniform PushConstants { 10 | uint inOff; 11 | uint outOff; 12 | } pcs; 13 | 14 | void main() { 15 | const uint baseIndex = 
gl_WorkGroupID.x * 8; 16 | 17 | for (uint x = 0; x < 8; x++) { 18 | const uint i = baseIndex + x; 19 | const float y = in_[i + pcs.inOff]; 20 | out_[i + pcs.outOff] = 0.5*y*(1.0 + tanh(clamp(SQRT_2_OVER_PI*y*(1.0 + GELU_COEF_A*y*y), -15.0, 15.0))); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/dmmv.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | // dmmv = dequantize_mul_mat_vec 4 | 5 | // TODO: remove this? 6 | #ifndef GGML_CUDA_DMMV_X 7 | #define GGML_CUDA_DMMV_X 32 8 | #endif 9 | 10 | #ifndef GGML_CUDA_MMV_Y 11 | #define GGML_CUDA_MMV_Y 1 12 | #endif 13 | 14 | void ggml_cuda_op_dequantize_mul_mat_vec( 15 | ggml_backend_cuda_context & ctx, 16 | const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, 17 | const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, 18 | const int64_t src1_padded_row_size, cudaStream_t stream); 19 | 20 | bool ggml_cuda_dmmv_type_supported(ggml_type src0_type); 21 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/gelu_quick.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "generic_head.comp" 4 | #include "types.comp" 5 | 6 | #extension GL_EXT_control_flow_attributes : enable 7 | 8 | layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; 9 | 10 | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; 11 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 12 | 13 | void main() { 14 | const float GELU_QUICK_COEF = -1.702f; 15 | const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; 16 | 17 | if (i >= p.KX) { 18 | return; 19 | } 20 | 21 | const float x = float(data_a[i]); 22 | data_d[i] = D_TYPE(x * (1.0f / (1.0f + exp(GELU_QUICK_COEF * x)))); 23 | } 24 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/mul_mat_split_k_reduce.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #extension GL_EXT_control_flow_attributes : enable 4 | 5 | layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; 6 | 7 | layout (binding = 0) readonly buffer A {float data_a[];}; 8 | layout (binding = 1) writeonly buffer D {float data_d[];}; 9 | 10 | layout (push_constant) uniform parameter { 11 | uint ne; 12 | uint k_num; 13 | } p; 14 | 15 | void main() { 16 | const uint idx = gl_GlobalInvocationID.x; 17 | 18 | if (idx >= p.ne) { 19 | return; 20 | } 21 | 22 | float result = 0.0f; 23 | 24 | [[unroll]] for (uint i = 0; i < p.k_num; i++) { 25 | result += data_a[i * p.ne + idx]; 26 | } 27 | 28 | data_d[idx] = result; 29 | } 30 | -------------------------------------------------------------------------------- /libs/ggml/src/kompute-shaders/op_addrow.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | layout(local_size_x = 1) in; 6 | 7 | layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; 8 | layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; }; 9 | layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; }; 10 | 11 | layout(push_constant) 
uniform PushConstants { 12 | uint inAOff; 13 | uint inBOff; 14 | uint outOff; 15 | uint row; 16 | } pcs; 17 | 18 | void main() { 19 | const uint baseIndex = gl_WorkGroupID.x * 4; 20 | 21 | for (uint x = 0; x < 4; x++) { 22 | const uint i = baseIndex + x; 23 | out_[i + pcs.outOff] = inA[i + pcs.inAOff] + inB[(i % pcs.row) + pcs.inBOff]; 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /tools/parameter_search/token_tree/analyze.py: -------------------------------------------------------------------------------- 1 | from common import * 2 | 3 | 4 | def get_score(stat: dict) -> float: 5 | draft_model_latency_ms = 8.421 6 | target_model_latency_ms = 62.867 7 | 8 | n_iterations = stat["n_iterations"] 9 | n_draft_times = stat["n_draft_times"] 10 | n_generated_tokens = stat["n_generated_tokens"] 11 | 12 | latency_ms = n_iterations * target_model_latency_ms + (n_iterations + n_draft_times) * draft_model_latency_ms 13 | tokens_per_second = n_generated_tokens * 1000 / latency_ms 14 | 15 | return tokens_per_second 16 | 17 | 18 | leaderboard = sorted((get_score(stat), params, stat) for params, stat in database.items() if stat is not None) 19 | 20 | for score, params, stat in leaderboard: 21 | print(f"{score:.3f} '{format_params(params)}' {stat}") 22 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/acc.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_binary_head.comp" 5 | 6 | void main() { 7 | const uint idx = gl_GlobalInvocationID.x; 8 | if (idx >= p.ne) { 9 | return; 10 | } 11 | 12 | const uint offset = p.param3; 13 | const uint src1_i = idx - offset; 14 | const uint oz = src1_i / p.nb02; 15 | const uint oy = (src1_i - (oz * p.nb02)) / p.nb01; 16 | const uint ox = src1_i % p.nb01; 17 | 18 | if (ox < p.ne10 && oy < p.ne11 && oz < p.ne12) { 19 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) + FLOAT_TYPE(data_b[ox + oy * p.ne10 + oz * p.ne10 * p.ne11])); 20 | } else { 21 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)])); 22 | } 23 | } 24 | 25 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/get_rows.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_binary_head.comp" 5 | 6 | void main() { 7 | const uint i00 = gl_GlobalInvocationID.x; 8 | const uint i10 = gl_GlobalInvocationID.y; 9 | const uint i11 = (gl_GlobalInvocationID.z)/p.ne12; 10 | const uint i12 = (gl_GlobalInvocationID.z)%p.ne12; 11 | 12 | if (i00 >= p.ne00) { 13 | return; 14 | } 15 | 16 | const uint i01 = data_b[i10*p.nb10 + i11*p.nb11 + i12*p.nb12]; 17 | 18 | const uint a_offset = i01*p.nb01 + i11*p.nb02 + i12*p.nb03; 19 | const uint d_offset = i10*p.nb21 + i11*p.nb22 + i12*p.nb23; 20 | 21 | #ifndef OPTIMIZATION_ERROR_WORKAROUND 22 | data_d[d_offset + i00] = D_TYPE(data_a[a_offset + i00]); 23 | #else 24 | data_d[d_offset + i00] = data_a[a_offset + i00]; 25 | #endif 26 | } 27 | -------------------------------------------------------------------------------- /src/model/model_loader.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may 
not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include "model/model.hpp" 16 | 17 | namespace powerserve { 18 | 19 | auto load_model(const Path &model_dir) -> std::shared_ptr<Model>; 20 | 21 | } // namespace powerserve 22 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/repeat.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_unary_head.comp" 5 | 6 | uint src0_idx_mod(uint idx) { 7 | const uint i13 = idx / (p.ne12*p.ne11*p.ne10); 8 | const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10; 9 | const uint i12 = (idx - i13_offset) / (p.ne11*p.ne10); 10 | const uint i12_offset = i12*p.ne11*p.ne10; 11 | const uint i11 = (idx - i13_offset - i12_offset) / p.ne10; 12 | const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10; 13 | return (i13 % p.ne03)*p.nb03 + (i12 % p.ne02)*p.nb02 + (i11 % p.ne01)*p.nb01 + (i10 % p.ne00)*p.nb00; 14 | } 15 | 16 | void main() { 17 | const uint idx = get_idx(); 18 | 19 | if (idx >= p.ne) { 20 | return; 21 | } 22 | 23 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx_mod(idx)]); 24 | } 25 | -------------------------------------------------------------------------------- /tools/mmlu/README.md: -------------------------------------------------------------------------------- 1 | # MMLU Test 2 | - Download the test set: https://github.com/bobozi-cmd/mmlu/releases/download/publish/mmlu_dataset.zip 3 | 4 | - Unpack it into the same directory as the script `mmlu_test.py`: 5 | ``` 6 | > tree -L 1 ./ 7 | ./ 8 | ├── dev 9 | ├── mmlu_test.py 10 | ├── README.md 11 | ├── test 12 | └── val 13 | ``` 14 | 15 | - Start the server; assuming the IP address is 192.168.1.39, the model directory is build_llama_proj, and the port is 8080, the test command is: 16 | 17 | ```bash 18 | python ./mmlu_test.py --host 192.168.1.39 --port 8080 -s 1 --model build_llama_proj 19 | ``` 20 | 21 | - MMLU covers 57 subjects in total; `-s 30` selects the first 30 questions of each subject. If the `-s` parameter is omitted, the entire MMLU dataset is evaluated. 22 | - Running on pure CPU is slower, so consider a quick sanity check with `-s 1` first. 23 | -------------------------------------------------------------------------------- /src/backend/backend.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // WIP: direct pass funcs 16 | #pragma once 17 | 18 | namespace powerserve { 19 | 20 | struct Backend { 21 | virtual ~Backend() = default; 22 | }; 23 | 24 | } // namespace powerserve 25 | -------------------------------------------------------------------------------- /libs/ggml/src/kompute-shaders/op_diagmask.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | layout(local_size_x = 1) in; 6 | 7 | layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; 8 | layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; 9 | 10 | layout(push_constant) uniform PushConstants { 11 | uint inOff; 12 | uint outOff; 13 | uint n_past; 14 | int ne00; 15 | int ne01; 16 | } pcs; 17 | 18 | void main() { 19 | const uint i02 = gl_WorkGroupID.z; 20 | const uint i01 = gl_WorkGroupID.y; 21 | const uint i00 = gl_WorkGroupID.x; 22 | 23 | const uint index = i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00 + i00; 24 | 25 | if (i00 > pcs.n_past + i01) { 26 | out_[index + pcs.outOff] = uintBitsToFloat(0xFF800000); 27 | } else { 28 | out_[index + pcs.outOff] = in_[index + pcs.inOff]; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/gelu.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "generic_head.comp" 4 | #include "types.comp" 5 | 6 | #extension GL_EXT_control_flow_attributes : enable 7 | 8 | layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; 9 | 10 | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; 11 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 12 | 13 | void main() { 14 | const float GELU_COEF_A = 0.044715f; 15 | const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; 16 | const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; 17 | 18 | if (i >= p.KX) { 19 | return; 20 | } 21 | 22 | const float xi = float(data_a[i]); 23 | const float val = SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi); 24 | data_d[i] = D_TYPE(0.5f*xi*(2.0f - 2.0f / (exp(2 * val) + 1))); 25 | } 26 | -------------------------------------------------------------------------------- /assets/prompts/long_prompt.txt: -------------------------------------------------------------------------------- 1 | Imagine you are an AI assistant developed by Moonshot AI, named Kimi. You are designed to interact with users in both English and Chinese, providing helpful and accurate responses while adhering to safety and compliance standards. Your capabilities include processing long texts, handling various file formats, and accessing the internet for information. You are also programmed to refuse any requests that involve terrorism, racial discrimination, explicit content, or politically sensitive topics. 2 | Today, you are tasked with engaging in a conversation with a user who is interested in learning about the history of artificial intelligence. They have asked you to provide a brief overview of the key milestones in AI development, from its inception to the present day. Your response should be informative, concise, and engaging, ensuring that the user gains a clear understanding of AI's evolution. 
-------------------------------------------------------------------------------- /app/server/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(server.simple STATIC simple_server.cpp) 2 | target_link_libraries(server.simple PUBLIC 3 | powerserve 4 | httplib::httplib 5 | nlohmann_json::nlohmann_json 6 | concurrentqueue 7 | ) 8 | if (POWERSERVE_SERVER_MULTIMODEL) 9 | target_compile_definitions(server.simple PUBLIC POWERSERVE_SERVER_MULTIMODEL) 10 | endif () 11 | 12 | add_library(server.local STATIC local_server.cpp) 13 | target_link_libraries(server.local PUBLIC 14 | powerserve 15 | httplib::httplib 16 | nlohmann_json::nlohmann_json 17 | concurrentqueue 18 | ) 19 | if (POWERSERVE_SERVER_MULTIMODEL) 20 | target_compile_definitions(server.local PUBLIC POWERSERVE_SERVER_MULTIMODEL) 21 | endif () 22 | 23 | add_executable(server server.cpp) 24 | target_link_libraries(server PRIVATE 25 | powerserve 26 | app.common 27 | server.simple 28 | ) 29 | 30 | powerserve_add_binary(server) 31 | -------------------------------------------------------------------------------- /src/core/buffer.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #pragma once 16 | 17 | #include <memory> 18 | 19 | namespace powerserve { 20 | 21 | struct BaseBuffer { 22 | public: 23 | virtual ~BaseBuffer() = default; 24 | }; 25 | 26 | using BufferPtr = std::shared_ptr<BaseBuffer>; 27 | 28 | } // namespace powerserve 29 | -------------------------------------------------------------------------------- /libs/ggml/src/kompute-shaders/op_getrows_f32.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | layout(local_size_x = 1) in; 6 | 7 | layout (binding = 0) readonly buffer tensorInA { float inA[]; }; 8 | layout (binding = 1) readonly buffer tensorInB { int inB[]; }; 9 | layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; 10 | 11 | layout (push_constant) uniform parameter { 12 | uint inAOff; 13 | uint inBOff; 14 | uint outOff; 15 | int ne00; 16 | int nb01; 17 | int nb1; 18 | } pcs; 19 | 20 | void dequantize_row_f32(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) { 21 | for (int j = 0; j < k; j++) { 22 | out_[y + j] = inA[x + j]; 23 | } 24 | } 25 | 26 | void main() { 27 | const uint i = gl_WorkGroupID.x; 28 | const int r = inB[i + pcs.inBOff]; 29 | 30 | dequantize_row_f32(r*pcs.nb01/4 + pcs.inAOff, i*pcs.nb1/4 + pcs.outOff, pcs.ne00); 31 | } 32 | -------------------------------------------------------------------------------- /libs/ggml/src/kompute-shaders/op_getrows_f16.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | layout(local_size_x = 1) in; 6 | 7 | layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; }; 8 | layout (binding = 1) readonly buffer tensorInB { int inB[]; }; 9 | layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; 10 | 11 | layout (push_constant) uniform parameter { 12 | uint inAOff; 13 | uint inBOff; 14 | uint outOff; 15 | int ne00; 16 | int nb01; 17 | int nb1; 18 | } pcs; 19 | 20 | void dequantize_row_f16(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) { 21 | for (int j = 0; j < k; j++) { 22 | out_[y + j] = inA[x + j]; 23 | } 24 | } 25 | 26 | void main() { 27 | const uint i = gl_WorkGroupID.x; 28 | const int r = inB[i + pcs.inBOff]; 29 | 30 | dequantize_row_f16(r*pcs.nb01/2/*bytes for float16*/ + pcs.inAOff, i*pcs.nb1/4 + pcs.outOff, pcs.ne00); 31 | } 32 | -------------------------------------------------------------------------------- /docs/perfetto_tracing.md: -------------------------------------------------------------------------------- 1 | First, set `POWERSERVE_WITH_PERFETTO` to ON when compiling the code. 2 | 3 | Then explicitly start and stop tracing in your code, for example: 4 | 5 | ```c++ 6 | powerserve::PerfettoTrace::instance().start_tracing(32 * 1024); // Buffer size in KiB 7 | powerserve::TreeSpeculative spec(main_model, draft_model); 8 | spec.generate(tokenizer, sampler, prompt, n_predicts); 9 | spec.print_stat(); 10 | powerserve::PerfettoTrace::instance().stop_tracing("./perfetto.data"); // Will save to the perfetto.data file 11 | ``` 12 | 13 | Use `PerfettoTrace::begin` and `PerfettoTrace::end` to mark the start and end of a traced region, for example: 14 | 15 | ```c++ 16 | PerfettoTrace::begin("draft_model_forward"); 17 | auto logits = draft_model->forward({node.token}, {node.position}, CausalAttentionMask(1)); 18 | PerfettoTrace::end(); 19 | ``` 20 | 21 | `PerfettoTrace::counter` can be used to record a numeric value over time, which the Perfetto UI renders as a line graph. 22 | 23 | Finally, open the trace file at https://ui.perfetto.dev.
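For example, a per-iteration counter could be emitted like this (a minimal sketch: the exact signature of `PerfettoTrace::counter` is not shown above, so the `(name, value)` form and the surrounding decoding loop are assumptions for illustration):

```c++
// Hypothetical sketch: record the number of accepted draft tokens per
// decoding iteration as a counter track. counter(name, value) is assumed
// by analogy with PerfettoTrace::begin/end; spec.done() and spec.step()
// are invented stand-ins for a step-wise decoding loop.
while (!spec.done()) {
    const size_t n_accepted = spec.step();
    powerserve::PerfettoTrace::counter("n_accepted_draft_tokens", n_accepted);
}
```

The counter track then appears in the timeline alongside the begin/end slices.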
24 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: LLVM 2 | 3 | AccessModifierOffset: -4 4 | AlignAfterOpenBracket: BlockIndent 5 | AlignConsecutiveAssignments: true 6 | AlignConsecutiveBitFields: true 7 | AlignConsecutiveMacros: true 8 | AllowShortBlocksOnASingleLine: Empty 9 | AllowShortEnumsOnASingleLine: false 10 | AllowShortFunctionsOnASingleLine: Empty 11 | AllowShortIfStatementsOnASingleLine: false 12 | AlwaysBreakAfterReturnType: None 13 | AlwaysBreakTemplateDeclarations: Yes 14 | BinPackArguments: false 15 | BinPackParameters: false 16 | BreakAfterAttributes: Always 17 | BreakBeforeBraces: Attach 18 | BreakConstructorInitializers: AfterColon 19 | ColumnLimit: 120 20 | ContinuationIndentWidth: 4 21 | IncludeBlocks: Regroup 22 | IndentWidth: 4 23 | InsertNewlineAtEOF: true 24 | PackConstructorInitializers: CurrentLine 25 | PenaltyReturnTypeOnItsOwnLine: 100000 26 | ReflowComments: true 27 | SeparateDefinitionBlocks: Always 28 | ShortNamespaceLines: 0 29 | TabWidth: 4 30 | UseTab: Never 31 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/diag_mask_inf.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #extension GL_EXT_shader_16bit_storage : require 4 | #extension GL_EXT_control_flow_attributes : enable 5 | 6 | layout (push_constant) uniform parameter 7 | { 8 | uint ncols; 9 | uint rows_per_channel; 10 | uint n_past; 11 | } p; 12 | 13 | #include "types.comp" 14 | 15 | layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; 16 | 17 | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; 18 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 19 | 20 | void main() { 21 | const uint col = gl_GlobalInvocationID.y; 22 | const uint row = gl_GlobalInvocationID.x; 23 | 24 | if (col >= p.ncols) { 25 | return; 26 | } 27 | 28 | const uint i = row*p.ncols + col; 29 | if (col > p.n_past + row % p.rows_per_channel) { 30 | data_d[i] = D_TYPE(uintBitsToFloat(0xFF800000)); 31 | } else { 32 | data_d[i] = D_TYPE(data_a[i]); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/dequant_q8_0.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "dequant_head.comp" 4 | 5 | layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; 6 | 7 | layout (binding = 0) readonly buffer A {block_q8_0 data_a[];}; 8 | layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; 9 | 10 | void main() { 11 | const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64; 12 | 13 | const uint tid = gl_LocalInvocationID.x % 64; 14 | const uint il = tid/32; 15 | const uint ir = tid%32; 16 | const uint ib = 32*i + ir; 17 | if (ib >= p.nel / 32) { 18 | return; 19 | } 20 | 21 | const uint b_idx = 1024*i + 32*ir + 16*il; 22 | 23 | const float d = float(data_a[ib].d); 24 | 25 | const uint q_idx = 16*il; 26 | 27 | [[unroll]] for (uint l = 0; l < 16; l += 2) { 28 | data_b[b_idx + l ] = D_TYPE(d * data_a[ib].qs[q_idx + l ]); 29 | data_b[b_idx + l + 1] = D_TYPE(d * data_a[ib].qs[q_idx + l + 1]); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/pad.comp: 
-------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_unary_head.comp" 5 | 6 | void main() { 7 | const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | const uint i3 = idx / (p.ne12*p.ne11*p.ne10); 14 | const uint i3_offset = i3 * p.ne12*p.ne11*p.ne10; 15 | const uint i2 = (idx - i3_offset) / (p.ne11*p.ne10); 16 | const uint i2_offset = i2*p.ne11*p.ne10; 17 | const uint i1 = (idx - i3_offset - i2_offset) / p.ne10; 18 | const uint i0 = idx - i3_offset - i2_offset - i1*p.ne10; 19 | 20 | const uint src0_idx = i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0*p.nb00; 21 | const uint dst_idx = i3*p.nb13 + i2*p.nb12 + i1*p.nb11 + i0*p.nb10; 22 | 23 | const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03; 24 | 25 | data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : 0.0f); 26 | } 27 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/dequant_q4_0.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "dequant_head.comp" 4 | 5 | layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; 6 | 7 | layout (binding = 0) readonly buffer A {block_q4_0 data_a[];}; 8 | layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; 9 | 10 | void main() { 11 | const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64; 12 | 13 | const uint tid = gl_LocalInvocationID.x % 64; 14 | const uint il = tid/32; 15 | const uint ir = tid%32; 16 | const uint ib = 32*i + ir; 17 | if (ib >= p.nel / 32) { 18 | return; 19 | } 20 | 21 | const uint q_idx = 8*il; 22 | const uint b_idx = 1024*i + 32*ir + q_idx; 23 | 24 | const float d = float(data_a[ib].d); 25 | 26 | [[unroll]] for (uint l = 0; l < 8; ++l) { 27 | data_b[b_idx + l + 0] = D_TYPE(d * ((data_a[ib].qs[q_idx + l] & 0xF) - 8.0f)); 28 | data_b[b_idx + l + 16] = D_TYPE(d * ((data_a[ib].qs[q_idx + l] >> 4) - 8.0f)); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/dequant_iq4_nl.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "dequant_head.comp" 4 | 5 | layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; 6 | 7 | layout (binding = 0) readonly buffer A {block_iq4_nl data_a[];}; 8 | layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; 9 | 10 | void main() { 11 | const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64; 12 | 13 | const uint tid = gl_LocalInvocationID.x % 64; 14 | const uint il = tid/32; 15 | const uint ir = tid%32; 16 | const uint ib = 32*i + ir; 17 | if (ib >= p.nel / 32) { 18 | return; 19 | } 20 | 21 | const uint q_idx = 8*il; 22 | const uint b_idx = 1024*i + 32*ir + q_idx; 23 | 24 | const float d = float(data_a[ib].d); 25 | 26 | [[unroll]] for (uint l = 0; l < 8; ++l) { 27 | data_b[b_idx + l + 0] = D_TYPE(d * kvalues_iq4nl[data_a[ib].qs[q_idx + l] & 0xF]); 28 | data_b[b_idx + l + 16] = D_TYPE(d * kvalues_iq4nl[data_a[ib].qs[q_idx + l] >> 4]); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/core/spin_barrier.h: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // 
Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #pragma once 16 | 17 | #if defined(__cplusplus) 18 | extern "C" { 19 | #endif 20 | 21 | #include <stddef.h> 22 | 23 | struct spin_barrier { 24 | size_t opaque[2]; 25 | }; 26 | 27 | void spin_barrier_init(struct spin_barrier *opaque, size_t width); 28 | void spin_barrier_wait(struct spin_barrier *opaque); 29 | 30 | #if defined(__cplusplus) 31 | } 32 | #endif 33 | -------------------------------------------------------------------------------- /libs/ggml/src/llamafile/sgemm.h: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #pragma once 16 | #include <stdint.h> 17 | #include <stdbool.h> 18 | #ifdef __cplusplus 19 | extern "C" { 20 | #endif 21 | 22 | bool llamafile_sgemm(int64_t, int64_t, int64_t, const void *, int64_t, 23 | const void *, int64_t, void *, int64_t, int, int, 24 | int, int, int); 25 | 26 | #ifdef __cplusplus 27 | } 28 | #endif 29 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/dequant_q4_1.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "dequant_head.comp" 4 | 5 | layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; 6 | 7 | layout (binding = 0) readonly buffer A {block_q4_1 data_a[];}; 8 | layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; 9 | 10 | void main() { 11 | const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64; 12 | 13 | const uint tid = gl_LocalInvocationID.x % 64; 14 | const uint il = tid/32; 15 | const uint ir = tid%32; 16 | const uint ib = 32*i + ir; 17 | if (ib >= p.nel / 32) { 18 | return; 19 | } 20 | 21 | const uint b_idx = 1024*i + 32*ir + 8*il; 22 | 23 | const float d = float(data_a[ib].d); 24 | const float m = float(data_a[ib].m); 25 | 26 | const uint q_idx = 8*il; 27 | 28 | [[unroll]] for (uint l = 0; l < 8; ++l) { 29 | data_b[b_idx + l + 0] = D_TYPE(d * (data_a[ib].qs[q_idx + l] & 0xF) + m); 30 | data_b[b_idx + l + 16] = D_TYPE(d * (data_a[ib].qs[q_idx + l] >> 4) + m); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/graph/node.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include "graph/node.hpp" 16 | 17 | namespace powerserve { 18 | 19 | auto Node::tensor() -> Tensor * { 20 | return dynamic_cast<Tensor *>(this); 21 | } 22 | 23 | auto Node::tensor_view() -> TensorViewNode * { 24 | return dynamic_cast<TensorViewNode *>(this); 25 | } 26 | 27 | auto Node::op() -> OpNode * { 28 | return dynamic_cast<OpNode *>(this); 29 | } 30 | 31 | } // namespace powerserve 32 | -------------------------------------------------------------------------------- /app/server/server.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #include "cmdline.hpp" 16 | #include "simple_server.hpp" 17 | 18 | #include 19 | 20 | int main(int argc, char *argv[]) { 21 | // 0. load config 22 | const powerserve::CommandLineArgument args = powerserve::parse_command_line("PowerServe CLI", argc, argv); 23 | 24 | simple_server_handler(args.work_folder, args.qnn_lib_folder, args.host, args.port); 25 | 26 | return 0; 27 | } 28 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/rope_norm.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "rope_head.comp" 4 | 5 | void main() { 6 | const uint col = gl_GlobalInvocationID.y * 2; 7 | const uint row = gl_GlobalInvocationID.x; 8 | 9 | if (col >= p.ncols) { 10 | return; 11 | } 12 | 13 | if (col >= p.n_dims) { 14 | const uint i = row*p.ncols + col; 15 | 16 | data_d[i + 0] = data_a[i + 0]; 17 | data_d[i + 1] = data_a[i + 1]; 18 | 19 | return; 20 | } 21 | 22 | const uint i = row*p.ncols + col; 23 | const uint i2 = row/p.p_delta_rows; 24 | 25 | const float theta_base = data_pos[i2] * pow(p.theta_scale, col/2.0f); 26 | 27 | const float freq_factor = p.has_ff != 0 ? data_ff[col/2] : 1.0f; 28 | 29 | float cos_theta, sin_theta; 30 | rope_yarn(theta_base / freq_factor, col, cos_theta, sin_theta); 31 | 32 | const float x0 = float(data_a[i + 0]); 33 | const float x1 = float(data_a[i + 1]); 34 | 35 | data_d[i + 0] = D_TYPE(x0*cos_theta - x1*sin_theta); 36 | data_d[i + 1] = D_TYPE(x0*sin_theta + x1*cos_theta); 37 | } 38 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/quantize.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common.cuh" 4 | #include "mmq.cuh" 5 | 6 | #include <cstdint> 7 | 8 | #define CUDA_QUANTIZE_BLOCK_SIZE 256 9 | #define CUDA_QUANTIZE_BLOCK_SIZE_MMQ 128 10 | 11 | static_assert(MATRIX_ROW_PADDING % CUDA_QUANTIZE_BLOCK_SIZE == 0, "Risk of out-of-bounds access."); 12 | static_assert(MATRIX_ROW_PADDING % (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ) == 0, "Risk of out-of-bounds access."); 13 | 14 | typedef void (*quantize_cuda_t)( 15 | const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded, 16 | const ggml_type type_x, cudaStream_t stream); 17 | 18 | void quantize_row_q8_1_cuda( 19 | const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded, 20 | const ggml_type type_x, cudaStream_t stream); 21 | 22 | void quantize_mmq_q8_1_cuda( 23 | const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded, 24 | const ggml_type type_x, cudaStream_t stream); 25 | -------------------------------------------------------------------------------- /src/core/defines.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #pragma once 16 | 17 | #define POWERSERVE_BUILTIN_EXPECT(expr, value) __builtin_expect((expr), (value)) 18 | #define POWERSERVE_LIKELY(expr) POWERSERVE_BUILTIN_EXPECT((expr), 1) 19 | #define POWERSERVE_UNLIKELY(expr) POWERSERVE_BUILTIN_EXPECT((expr), 0) 20 | 21 | #if !defined(ALWAYS_INLINE) 22 | #define ALWAYS_INLINE __attribute__((always_inline)) 23 | #endif 24 | 25 | #define POWERSERVE_UNUSED(x) ((void)(x)) 26 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/rope_neox.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "rope_head.comp" 4 | 5 | void main() { 6 | const uint col = gl_GlobalInvocationID.y * 2; 7 | const uint row = gl_GlobalInvocationID.x; 8 | 9 | if (col >= p.ncols) { 10 | return; 11 | } 12 | 13 | if (col >= p.n_dims) { 14 | const uint i = row*p.ncols + col; 15 | 16 | data_d[i + 0] = data_a[i + 0]; 17 | data_d[i + 1] = data_a[i + 1]; 18 | 19 | return; 20 | } 21 | 22 | const uint i = row*p.ncols + col/2; 23 | const uint i2 = row/p.p_delta_rows; 24 | 25 | const float theta_base = data_pos[i2] * pow(p.theta_scale, col/2.0f); 26 | 27 | const float freq_factor = p.has_ff != 0 ? data_ff[col/2] : 1.0f; 28 | 29 | float cos_theta, sin_theta; 30 | rope_yarn(theta_base / freq_factor, col, cos_theta, sin_theta); 31 | 32 | const float x0 = float(data_a[i + 0]); 33 | const float x1 = float(data_a[i + p.n_dims/2]); 34 | 35 | data_d[i + 0] = D_TYPE(x0*cos_theta - x1*sin_theta); 36 | data_d[i + p.n_dims/2] = D_TYPE(x0*sin_theta + x1*cos_theta); 37 | } 38 | -------------------------------------------------------------------------------- /libs/ggml/src/kompute-shaders/op_getrows_q4_0.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | #define NL 2 6 | #define BYTES_FOR_TYPE 4 /*bytes for float*/ 7 | #define SIZE_OF_BLOCK sizeof_block_q4_0 8 | 9 | layout(local_size_x = 1) in; 10 | 11 | layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; 12 | layout (binding = 1) readonly buffer tensorInB { int inB[]; }; 13 | layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; 14 | 15 | layout (push_constant) uniform parameter { 16 | uint inAOff; 17 | uint inBOff; 18 | uint outOff; 19 | int ne00; 20 | int nb01; 21 | int nb1; 22 | } pcs; 23 | 24 | block_q4_0 get_unaligned_block_q4_0(uint index) { 25 | block_q4_0 fres; 26 | fres.d = u8BufToFloat16(inA, index); 27 | [[unroll]] for (uint it = 0; it != QK4_0 / 2; it++) { 28 | fres.qs[it] = inA[index+2+it]; 29 | } 30 | return fres; 31 | } 32 | 33 | mat4 dequantize_block(uint index, uint il) { 34 | const block_q4_0 block = get_unaligned_block_q4_0(index); 35 | return dequantize_q4_0(block, il); 36 | } 37 | 38 | #include "op_getrows.comp" 39 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/sum_rows.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "generic_head.comp" 4 | #include "types.comp" 5 | 6 | #extension GL_EXT_control_flow_attributes : enable 7 | layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; 8 | 9 | layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; 10 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 11 | 12 | 
layout (constant_id = 0) const uint BLOCK_SIZE = 32; 13 | 14 | shared FLOAT_TYPE tmp[BLOCK_SIZE]; 15 | 16 | void main() { 17 | const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x; 18 | const uint col = gl_LocalInvocationID.x; 19 | 20 | tmp[col] = FLOAT_TYPE(0.0f); 21 | 22 | for (uint i = col; i < p.KX; i += BLOCK_SIZE) { 23 | tmp[col] += FLOAT_TYPE(data_a[row*p.KX + i]); 24 | } 25 | 26 | barrier(); 27 | [[unroll]] for (int s = int(BLOCK_SIZE) / 2; s > 0; s >>= 1) { 28 | if (col < s) { 29 | tmp[col] += tmp[col + s]; 30 | } 31 | barrier(); 32 | } 33 | 34 | if (col == 0) { 35 | data_d[row] = D_TYPE(tmp[0]); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /tools/end_to_end/powerserve.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Function to check if a command exists 4 | check_command() { 5 | if ! command -v "$1" &> /dev/null; then 6 | echo -e "\033[31m$1 could not be found. Please install it.\033[0m" 7 | exit 1 8 | fi 9 | } 10 | 11 | # Check necessary commands 12 | check_command "docker" 13 | check_command "adb" 14 | check_command "python3" 15 | check_command "pip" 16 | 17 | # echo -e "\033[36mChecking if the needed PowerServe docker image was updated...\033[0m" 18 | # do not show stdout or stderr 19 | sudo docker pull santoxin/mobile-build:v1.1 &> /dev/null 20 | 21 | # Check that the current directory is the PowerServe repository root 22 | if [ ! -d "tools" ]; then 23 | echo -e "\033[31mPlease run this script from the root directory of PowerServe.\033[0m" 24 | exit 1 25 | fi 26 | 27 | # Install the required Python packages quietly 28 | pip install requests huggingface_hub --quiet 29 | 30 | # Forward all arguments to the Python entry point 31 | # example: ./tools/powerserve.sh run llama-3.2-1b -> then call python ./tools/end_to_end/powerserve.py run llama-3.2-1b 32 | python3 ./tools/end_to_end/powerserve.py "$@" 33 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/get_rows_quant.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_binary_head.comp" 5 | #include "dequant_funcs.comp" 6 | 7 | void main() { 8 | const uint i00 = (gl_GlobalInvocationID.x)*2; 9 | const uint i10 = gl_GlobalInvocationID.y; 10 | const uint i11 = (gl_GlobalInvocationID.z)/p.ne12; 11 | const uint i12 = (gl_GlobalInvocationID.z)%p.ne12; 12 | 13 | if (i00 >= p.ne00) { 14 | return; 15 | } 16 | 17 | const uint i01 = data_b[i10*p.nb10 + i11*p.nb11 + i12*p.nb12]; 18 | 19 | const uint a_offset = i01*p.nb01 + i11*p.nb02 + i12*p.nb03; 20 | const uint d_offset = i10*p.nb21 + i11*p.nb22 + i12*p.nb23; 21 | 22 | const uint ib = a_offset + i00/QUANT_K; // block index 23 | const uint iqs = (i00%QUANT_K)/QUANT_R; // quant index 24 | const uint iybs = i00 - i00%QUANT_K; // dst block start index 25 | const uint y_offset = QUANT_R == 1 ?
1 : QUANT_K/2; 26 | 27 | vec2 v = dequantize(ib, iqs, 0); 28 | 29 | data_d[d_offset + iybs + iqs ] = D_TYPE(v.x); 30 | data_d[d_offset + iybs + iqs + y_offset] = D_TYPE(v.y); 31 | } 32 | -------------------------------------------------------------------------------- /libs/ggml/src/kompute-shaders/op_getrows_q4_1.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | #define NL 2 6 | #define BYTES_FOR_TYPE 4 /*bytes for float*/ 7 | #define SIZE_OF_BLOCK sizeof_block_q4_1 8 | 9 | layout(local_size_x = 1) in; 10 | 11 | layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; 12 | layout (binding = 1) readonly buffer tensorInB { int inB[]; }; 13 | layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; 14 | 15 | layout (push_constant) uniform parameter { 16 | uint inAOff; 17 | uint inBOff; 18 | uint outOff; 19 | int ne00; 20 | int nb01; 21 | int nb1; 22 | } pcs; 23 | 24 | block_q4_1 get_unaligned_block_q4_1(uint index) { 25 | block_q4_1 fres; 26 | fres.d = u8BufToFloat16(inA, index); 27 | fres.m = u8BufToFloat16(inA, index+2); 28 | [[unroll]] for (uint it = 0; it != QK4_1 / 2; it++) { 29 | fres.qs[it] = inA[index+4+it]; 30 | } 31 | return fres; 32 | } 33 | 34 | mat4 dequantize_block(uint index, uint il) { 35 | const block_q4_1 block = get_unaligned_block_q4_1(index); 36 | return dequantize_q4_1(block, il); 37 | } 38 | 39 | #include "op_getrows.comp" 40 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/scale.cu: -------------------------------------------------------------------------------- 1 | #include "scale.cuh" 2 | 3 | static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) { 4 | const int i = blockDim.x*blockIdx.x + threadIdx.x; 5 | 6 | if (i >= k) { 7 | return; 8 | } 9 | 10 | dst[i] = scale * x[i]; 11 | } 12 | 13 | static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) { 14 | const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE; 15 | scale_f32<<>>(x, dst, scale, k); 16 | } 17 | 18 | void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { 19 | const ggml_tensor * src0 = dst->src[0]; 20 | const float * src0_d = (const float *)src0->data; 21 | float * dst_d = (float *)dst->data; 22 | cudaStream_t stream = ctx.stream(); 23 | 24 | GGML_ASSERT(src0->type == GGML_TYPE_F32); 25 | GGML_ASSERT( dst->type == GGML_TYPE_F32); 26 | 27 | float scale; 28 | memcpy(&scale, dst->op_params, sizeof(float)); 29 | 30 | scale_f32_cuda(src0_d, dst_d, scale, ggml_nelements(src0), stream); 31 | } 32 | -------------------------------------------------------------------------------- /tools/convert_hf_to_gguf/gguf-py/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Georgi Gerganov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | 
copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /tools/cos_sim.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | try: 5 | import numpy as np 6 | 7 | def cos_sim(a, b): 8 | return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)) 9 | 10 | except ImportError: 11 | # Fall back to pure Python when NumPy is not available. 12 | def dot_product(a, b): 13 | return sum(x * y for x, y in zip(a, b)) 14 | 15 | def norm(a): 16 | return sum(x * x for x in a) ** 0.5 17 | 18 | def cos_sim(a, b): 19 | return dot_product(a, b) / (norm(a) * norm(b)) 20 | 21 | 22 | if __name__ == "__main__": 23 | 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("--f1", type=str) 26 | parser.add_argument("--f2", type=str) 27 | args = parser.parse_args() 28 | 29 | file1 = args.f1 30 | file2 = args.f2 31 | 32 | with open(file1, "r") as f1, open(file2, "r") as f2: 33 | lines1 = f1.readlines() 34 | lines2 = f2.readlines() 35 | 36 | a = [float(line.replace("\n", "")) for line in lines1 if line.replace("\n", "").strip()] 37 | b = [float(line.replace("\n", "")) for line in lines2 if line.replace("\n", "").strip()] 38 | 39 | assert len(a) == len(b), "the two files must contain the same number of values!" 40 | print(cos_sim(a, b)) 41 | -------------------------------------------------------------------------------- /libs/ggml/src/kompute-shaders/op_mul_mat_q4_0.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | #define BLOCKS_IN_QUANT QK4_0 6 | #define SIZE_OF_BLOCK sizeof_block_q4_0 7 | #define N_ROWS 4 8 | 9 | #include "op_mul_mv_q_n_pre.comp" 10 | 11 | // The q4_0 version of this function 12 | float block_q_n_dot_y(uint block_index, uint yb, uint il) { 13 | vec2 acc = vec2(0.0, 0.0); 14 | const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff; 15 | float d = float(u8BufToFloat16(inA, index)); 16 | float sumy = 0.0f; 17 | for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) { 18 | const uint16_t b = u8BufToU16(inA, index + 2 + il + i); 19 | 20 | const float yl0 = inB[yb + i]; 21 | const float yl1 = inB[yb + i + 1]; 22 | const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2]; 23 | const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1]; 24 | 25 | sumy += yl0 + yl1 + yl8 + yl9; 26 | 27 | acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00); 28 | acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000); 29 | } 30 | return d * (sumy * -8.f + acc[0] + acc[1]); 31 | } 32 | 33 | #include "op_mul_mv_q_n.comp" 34 | -------------------------------------------------------------------------------- /src/model/module/ffn.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #pragma once 16 | 17 | #include "graph/graph.hpp" 18 | #include "model/common/weights.hpp" 19 | 20 | namespace powerserve { 21 | 22 | struct FFN { 23 | private: 24 | const ModelConfig::LLMConfig &m_config; 25 | std::shared_ptr m_weights; 26 | 27 | public: 28 | FFN(const ModelConfig::LLMConfig &config, std::shared_ptr weights) : m_config(config), m_weights(weights) {} 29 | 30 | public: 31 | TensorNode *build(Graph &g, TensorNode *attn_o, int64_t L); 32 | }; 33 | 34 | } // namespace powerserve 35 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/vendors/cuda.h: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #pragma once 16 | 17 | #include <cuda_runtime.h> 18 | #include <cuda.h> 19 | #include <cublas_v2.h> 20 | #include <cuda_fp16.h> 21 | 22 | #if CUDART_VERSION < 11020 23 | #define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED 24 | #define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH 25 | #define CUBLAS_COMPUTE_16F CUDA_R_16F 26 | #define CUBLAS_COMPUTE_32F CUDA_R_32F 27 | #define cublasComputeType_t cudaDataType_t 28 | #endif // CUDART_VERSION < 11020 29 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/dequant_q5_0.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "dequant_head.comp" 4 | 5 | layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; 6 | 7 | layout (binding = 0) readonly buffer A {block_q5_0 data_a[];}; 8 | layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; 9 | 10 | void main() { 11 | const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64; 12 | 13 | const uint tid = gl_LocalInvocationID.x % 64; 14 | const uint il = tid/32; 15 | const uint ir = tid%32; 16 | const uint ib = 32*i + ir; 17 | if (ib >= p.nel / 32) { 18 | return; 19 | } 20 | 21 | const uint b_idx = 1024*i + 32*ir + 8*il; 22 | 23 | const float d = float(data_a[ib].d); 24 | const uint qh = uint(data_a[ib].qh[1]) << 16 | data_a[ib].qh[0]; 25 | 26 | const uint q_idx = 8*il; 27 | 28 | [[unroll]] for (uint l = 0; l < 8; ++l) { 29 | const uint iqs = q_idx + l; 30 | const uint vui = uint(data_a[ib].qs[iqs]); 31 | data_b[b_idx + l + 0] = D_TYPE(d * (((vui & 0xF) | (((qh >> iqs) << 4) & 0x10)) - 16.0f)); 32 | data_b[b_idx + l + 16] = D_TYPE(d * (((vui >> 4) | ((qh >> (iqs + 12)) & 0x10)) - 16.0f)); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cann/kernels/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if (NOT SOC_TYPE) 2 | set (SOC_TYPE "Ascend910B3") 3 | endif() 4 | 5 | file(GLOB SRC_FILES 6 | get_row_f32.cpp 7 | get_row_f16.cpp 8 | get_row_q4_0.cpp 9 | get_row_q8_0.cpp 10 | quantize_f32_q8_0.cpp 11 | quantize_f16_q8_0.cpp 12 | quantize_float_to_q4_0.cpp 13 | dup.cpp 14 | ) 15 | 16 | string(TOLOWER ${SOC_TYPE} SOC_VERSION) 17 | set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR}) 18 | set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim") 19 | 20 | if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) 21 | set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) 22 | elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake) 23 | set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake) 24 | else() 25 | message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the compiler package is installed.") 26 | endif() 27 | include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) 28 | 29 | ascendc_library(ascendc_kernels STATIC 30 | ${SRC_FILES} 31 | ) 32 | 33 | # ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP) 34 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/dequant_q5_1.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "dequant_head.comp" 4 | 5 | layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; 6 | 7 | layout (binding = 0) readonly buffer A
{block_q5_1 data_a[];}; 8 | layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; 9 | 10 | void main() { 11 | const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64; 12 | 13 | const uint tid = gl_LocalInvocationID.x % 64; 14 | const uint il = tid/32; 15 | const uint ir = tid%32; 16 | const uint ib = 32*i + ir; 17 | if (ib >= p.nel / 32) { 18 | return; 19 | } 20 | 21 | const uint b_idx = 1024*i + 32*ir + 8*il; 22 | 23 | const float d = float(data_a[ib].d); 24 | const float m = float(data_a[ib].m); 25 | const uint qh = data_a[ib].qh; 26 | 27 | const uint q_idx = 8*il; 28 | 29 | [[unroll]] for (uint l = 0; l < 8; ++l) { 30 | const uint iqs = q_idx + l; 31 | const uint vui = uint(data_a[ib].qs[iqs]); 32 | data_b[b_idx + l + 0] = D_TYPE(d * (((vui & 0xF) | (((qh >> iqs) << 4) & 0x10))) + m); 33 | data_b[b_idx + l + 16] = D_TYPE(d * (((vui >> 4) | ((qh >> (iqs + 12)) & 0x10))) + m); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "libs/json"] 2 | path = libs/json 3 | url = https://github.com/nlohmann/json.git 4 | [submodule "libs/fmt"] 5 | path = libs/fmt 6 | url = https://github.com/fmtlib/fmt.git 7 | [submodule "libs/cli11"] 8 | path = libs/cli11 9 | url = https://github.com/CLIUtils/CLI11.git 10 | [submodule "libs/libuv"] 11 | path = libs/libuv 12 | url = https://github.com/libuv/libuv.git 13 | [submodule "libs/xtensor"] 14 | path = libs/xtensor 15 | url = https://github.com/xtensor-stack/xtensor.git 16 | [submodule "libs/xtl"] 17 | path = libs/xtl 18 | url = https://github.com/xtensor-stack/xtl.git 19 | [submodule "tools/mmmu_test/mmmu"] 20 | path = tools/mmmu_test/mmmu 21 | url = https://github.com/MMMU-Benchmark/MMMU.git 22 | [submodule "libs/stb_headers/stb"] 23 | path = libs/stb_headers/stb 24 | url = https://github.com/nothings/stb.git 25 | [submodule "libs/cpp-httplib"] 26 | path = libs/cpp-httplib 27 | url = https://github.com/yhirose/cpp-httplib.git 28 | [submodule "libs/concurrentqueue"] 29 | path = libs/concurrentqueue 30 | url = https://github.com/cameron314/concurrentqueue 31 | [submodule "libs/perfetto"] 32 | path = libs/perfetto 33 | url = https://android.googlesource.com/platform/external/perfetto 34 | -------------------------------------------------------------------------------- /src/graph/op_type.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #pragma once 16 | 17 | namespace powerserve { 18 | 19 | enum class OpType { 20 | NONE = 0, 21 | 22 | ADD, 23 | MAT_MUL, 24 | RMS_NORM, 25 | SILU_HADAMARD, 26 | ROPE, 27 | SOFTMAX, 28 | COPY, 29 | 30 | #if defined(POWERSERVE_WITH_QNN) 31 | QNN_FORWARD, 32 | QNN_FORWARD_VL, 33 | #endif 34 | 35 | PRINT, 36 | GET_EMBEDDING, 37 | ADD_CACHE, 38 | PERMUTE, 39 | CONT, 40 | VIEW, 41 | SOFTMAX_EXT, 42 | GET_MASK, 43 | TRANSPOSE, 44 | INSERT_IMG_EMBEDDIGN, 45 | }; 46 | 47 | } // namespace powerserve 48 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/timestep_embedding.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #extension GL_EXT_shader_16bit_storage : require 4 | 5 | layout (push_constant) uniform parameter 6 | { 7 | uint nb1; 8 | uint dim; 9 | uint max_period; 10 | } p; 11 | 12 | #include "types.comp" 13 | 14 | #extension GL_EXT_control_flow_attributes : enable 15 | #define BLOCK_SIZE 256 16 | 17 | layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in; 18 | 19 | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; 20 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 21 | 22 | void main() { 23 | const uint i = gl_WorkGroupID.y; 24 | const uint j = gl_GlobalInvocationID.x; 25 | const uint d_offset = i * p.nb1; 26 | 27 | if (p.dim % 2 != 0 && j == ((p.dim + 1) / 2)) { 28 | data_d[d_offset + p.dim] = 0.f; 29 | } 30 | 31 | const uint half_dim = p.dim / 2; 32 | if (j >= half_dim) { 33 | return; 34 | } 35 | 36 | const float timestep = float(data_a[i]); 37 | const float freq = float(exp(-log(p.max_period) * j / half_dim)); 38 | const float arg = timestep * freq; 39 | data_d[d_offset + j] = D_TYPE(cos(arg)); 40 | data_d[d_offset + j + half_dim] = D_TYPE(sin(arg)); 41 | } 42 | -------------------------------------------------------------------------------- /libs/ggml/src/kompute-shaders/op_mul_mat_q4_1.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | #define BLOCKS_IN_QUANT QK4_1 6 | #define SIZE_OF_BLOCK sizeof_block_q4_1 7 | #define N_ROWS 4 8 | 9 | #include "op_mul_mv_q_n_pre.comp" 10 | 11 | // The q4_1 version of this function 12 | float block_q_n_dot_y(uint block_index, uint yb, uint il) { 13 | vec2 acc = vec2(0.0, 0.0); 14 | const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff; 15 | float d = float(u8BufToFloat16(inA, index)); 16 | float m = float(u8BufToFloat16(inA, index+2)); 17 | 18 | float sumy = 0.0f; 19 | for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) { 20 | const uint16_t b = u8BufToU16(inA, index + 4 + il + i); 21 | 22 | const float yl0 = inB[yb + i]; 23 | const float yl1 = inB[yb + i + 1]; 24 | const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2]; 25 | const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1]; 26 | 27 | sumy += yl0 + yl1 + yl8 + yl9; 28 | 29 | acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00); 30 | acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000); 31 | } 32 | return d * (acc[0] + acc[1]) + sumy * m; 33 | } 34 | 35 | #include "op_mul_mv_q_n.comp" 36 | -------------------------------------------------------------------------------- /tools/convert_hf_to_gguf/gguf-py/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "gguf" 3 | version = "0.10.0" 4 | description = "Read and write ML models in GGUF for GGML" 5 | 
authors = ["GGML <ggml@ggml.ai>"] 6 | packages = [ 7 | {include = "gguf"}, 8 | {include = "gguf/py.typed"}, 9 | {include = "scripts"}, 10 | ] 11 | readme = "README.md" 12 | homepage = "https://ggml.ai" 13 | repository = "https://github.com/ggerganov/llama.cpp" 14 | keywords = ["ggml", "gguf", "llama.cpp"] 15 | classifiers = [ 16 | "Programming Language :: Python :: 3", 17 | "License :: OSI Approved :: MIT License", 18 | "Operating System :: OS Independent", 19 | ] 20 | 21 | [tool.poetry.dependencies] 22 | python = ">=3.8" 23 | numpy = ">=1.17" 24 | tqdm = ">=4.27" 25 | pyyaml = ">=5.1" 26 | sentencepiece = ">=0.1.98,<=0.2.0" 27 | 28 | [tool.poetry.dev-dependencies] 29 | pytest = "^5.2" 30 | 31 | [build-system] 32 | requires = ["poetry-core>=1.0.0"] 33 | build-backend = "poetry.core.masonry.api" 34 | 35 | [tool.poetry.scripts] 36 | gguf-convert-endian = "scripts:gguf_convert_endian_entrypoint" 37 | gguf-dump = "scripts:gguf_dump_entrypoint" 38 | gguf-set-metadata = "scripts:gguf_set_metadata_entrypoint" 39 | gguf-new-metadata = "scripts:gguf_new_metadata_entrypoint" 40 | -------------------------------------------------------------------------------- /src/speculative/speculative_config.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #pragma once 16 | 17 | #include "core/typedefs.hpp" 18 | 19 | namespace powerserve { 20 | 21 | struct SpeculativeConfig { 22 | size_t draft_batch_size = 12; 23 | 24 | struct { 25 | size_t top_k = 15; 26 | float temperature = 1.5f; 27 | float p_base = 0.9f; 28 | } draft_sampler; 29 | 30 | struct { 31 | size_t max_fan_out = 3; 32 | float min_prob = 0.2f; 33 | bool early_stop = true; 34 | bool debug = false; 35 | } token_tree; 36 | }; 37 | 38 | } // namespace powerserve 39 | -------------------------------------------------------------------------------- /src/core/getenv.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #pragma once 16 | 17 | #include <string> 18 | 19 | namespace powerserve { 20 | 21 | template <typename T> 22 | T getenv(const std::string &name, const T &default_value) { 23 | auto env = ::getenv(name.c_str()); 24 | if (env) { 25 | if constexpr (std::is_integral_v<T>) { 26 | return atoll(env); 27 | } else if constexpr (std::is_floating_point_v<T>) { 28 | return atof(env); 29 | } else { 30 | return std::string(env); 31 | } 32 | } else { 33 | return default_value; 34 | } 35 | } 36 | 37 | } // namespace powerserve 38 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/upscale.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | layout (push_constant) uniform parameter 4 | { 5 | uint ne; uint d_offset; 6 | uint nb00; uint nb01; uint nb02; uint nb03; 7 | uint ne10; uint ne11; uint ne12; uint ne13; 8 | float sf0; float sf1; float sf2; float sf3; 9 | } p; 10 | 11 | #include "types.comp" 12 | 13 | layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; 14 | 15 | layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; 16 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 17 | 18 | void main() { 19 | const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; 20 | 21 | if (idx >= p.ne) { 22 | return; 23 | } 24 | 25 | const uint i10 = idx % p.ne10; 26 | const uint i11 = (idx / p.ne10) % p.ne11; 27 | const uint i12 = (idx / (p.ne10 * p.ne11)) % p.ne12; 28 | const uint i13 = (idx / (p.ne10 * p.ne11 * p.ne12)) % p.ne13; 29 | 30 | const uint i00 = uint(i10 / p.sf0); 31 | const uint i01 = uint(i11 / p.sf1); 32 | const uint i02 = uint(i12 / p.sf2); 33 | const uint i03 = uint(i13 / p.sf3); 34 | 35 | data_d[p.d_offset + idx] = D_TYPE(data_a[i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00]); 36 | } 37 | -------------------------------------------------------------------------------- /tools/convert_hf_to_gguf/gguf-py/examples/writer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | 7 | 8 | # Necessary to load the local gguf package 9 | sys.path.insert(0, str(Path(__file__).parent.parent)) 10 | 11 | from gguf import GGUFWriter # noqa: E402 12 | 13 | 14 | # Example usage: 15 | def writer_example() -> None: 16 | # Example usage with a file 17 | gguf_writer = GGUFWriter("example.gguf", "llama") 18 | 19 | gguf_writer.add_block_count(12) 20 | gguf_writer.add_uint32("answer", 42) # Write a 32-bit integer 21 | gguf_writer.add_float32("answer_in_float", 42.0) # Write a 32-bit float 22 | gguf_writer.add_custom_alignment(64) 23 | 24 | tensor1 = np.ones((32,), dtype=np.float32) * 100.0 25 | tensor2 = np.ones((64,), dtype=np.float32) * 101.0 26 | tensor3 = np.ones((96,), dtype=np.float32) * 102.0 27 | 28 | gguf_writer.add_tensor("tensor1", tensor1) 29 | gguf_writer.add_tensor("tensor2", tensor2) 30 | gguf_writer.add_tensor("tensor3", tensor3) 31 | 32 | gguf_writer.write_header_to_file() 33 | gguf_writer.write_kv_data_to_file() 34 | gguf_writer.write_tensors_to_file() 35 | 36 | gguf_writer.close() 37 | 38 | 39 | if __name__ == "__main__": 40 | writer_example() 41 | -------------------------------------------------------------------------------- /libs/ggml/include/ggml-blas.h: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 |
3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #pragma once 16 | 17 | #include "ggml.h" 18 | #include "ggml-backend.h" 19 | 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | // backend API 26 | GGML_API GGML_CALL ggml_backend_t ggml_backend_blas_init(void); 27 | 28 | GGML_API GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend); 29 | 30 | // number of threads used for conversion to float 31 | // for openblas and blis, this will also set the number of threads used for blas operations 32 | GGML_API GGML_CALL void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads); 33 | 34 | 35 | #ifdef __cplusplus 36 | } 37 | #endif 38 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-sycl/conv.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // 16 | // MIT license 17 | // Copyright (C) 2024 Intel Corporation 18 | // SPDX-License-Identifier: MIT 19 | // 20 | 21 | // 22 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 23 | // See https://llvm.org/LICENSE.txt for license information. 24 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 25 | // 26 | 27 | #ifndef GGML_SYCL_CONV_HPP 28 | #define GGML_SYCL_CONV_HPP 29 | 30 | #include "common.hpp" 31 | 32 | void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, 33 | const ggml_tensor *src1, ggml_tensor *dst); 34 | 35 | #endif // GGML_SYCL_CONV_HPP 36 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/clamp.cu: -------------------------------------------------------------------------------- 1 | #include "clamp.cuh" 2 | 3 | static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) { 4 | const int i = blockDim.x*blockIdx.x + threadIdx.x; 5 | 6 | if (i >= k) { 7 | return; 8 | } 9 | 10 | dst[i] = x[i] < min ? min : (x[i] > max ? 
max : x[i]); 11 | } 12 | 13 | static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) { 14 | const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE; 15 | clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k); 16 | } 17 | 18 | 19 | void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { 20 | const ggml_tensor * src0 = dst->src[0]; 21 | const float * src0_d = (const float *)src0->data; 22 | float * dst_d = (float *)dst->data; 23 | cudaStream_t stream = ctx.stream(); 24 | 25 | GGML_ASSERT(src0->type == GGML_TYPE_F32); 26 | GGML_ASSERT( dst->type == GGML_TYPE_F32); 27 | 28 | float min; 29 | float max; 30 | memcpy(&min, dst->op_params, sizeof(float)); 31 | memcpy(&max, (float *) dst->op_params + 1, sizeof(float)); 32 | 33 | clamp_f32_cuda(src0_d, dst_d, min, max, ggml_nelements(src0), stream); 34 | } 35 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-sycl/tsembd.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // 16 | // MIT license 17 | // Copyright (C) 2024 Intel Corporation 18 | // SPDX-License-Identifier: MIT 19 | // 20 | 21 | // 22 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 23 | // See https://llvm.org/LICENSE.txt for license information.
24 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 25 | // 26 | 27 | #ifndef GGML_SYCL_TSEMBD_HPP 28 | #define GGML_SYCL_TSEMBD_HPP 29 | 30 | #include "common.hpp" 31 | 32 | void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, 33 | const ggml_tensor *src1, ggml_tensor * dst); 34 | 35 | #endif // GGML_SYCL_TSEMBD_HPP 36 | -------------------------------------------------------------------------------- /tools/qnn_converter/graph_params.py: -------------------------------------------------------------------------------- 1 | class GraphParams: 2 | batch_size: int 3 | cache_size: int 4 | context_size: int 5 | 6 | 7 | class Batch1_Params(GraphParams): 8 | batch_size = 1 9 | cache_size = 1920 10 | context_size = 2048 11 | 12 | 13 | class Batch4_Params(GraphParams): 14 | batch_size = 4 15 | cache_size = 1920 16 | context_size = 2048 17 | 18 | 19 | class Batch8_Params(GraphParams): 20 | batch_size = 8 21 | cache_size = 1920 22 | context_size = 2048 23 | 24 | 25 | class Batch12_Params(GraphParams): 26 | batch_size = 12 27 | cache_size = 1920 28 | context_size = 2048 29 | 30 | 31 | class Batch16_Params(GraphParams): 32 | batch_size = 16 33 | cache_size = 1920 34 | context_size = 2048 35 | 36 | 37 | class Batch32_Params(GraphParams): 38 | batch_size = 32 39 | cache_size = 1920 40 | context_size = 2048 41 | 42 | 43 | class Batch128_Params(GraphParams): 44 | batch_size = 128 45 | cache_size = 1920 46 | context_size = 2048 47 | 48 | 49 | graph_map: dict[str, type[GraphParams]] = { 50 | "batch_1": Batch1_Params, 51 | "batch_4": Batch4_Params, 52 | "batch_8": Batch8_Params, 53 | "batch_12": Batch12_Params, 54 | "batch_16": Batch16_Params, 55 | "batch_32": Batch32_Params, 56 | "batch_128": Batch128_Params, 57 | } 58 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-sycl/concat.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // 16 | // MIT license 17 | // Copyright (C) 2024 Intel Corporation 18 | // SPDX-License-Identifier: MIT 19 | // 20 | 21 | // 22 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 23 | // See https://llvm.org/LICENSE.txt for license information.
24 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 25 | // 26 | 27 | #ifndef GGML_SYCL_CONCAT_HPP 28 | #define GGML_SYCL_CONCAT_HPP 29 | 30 | #include "common.hpp" 31 | 32 | void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, 33 | const ggml_tensor *src1, ggml_tensor *dst); 34 | 35 | #endif // GGML_SYCL_CONCAT_HPP 36 | -------------------------------------------------------------------------------- /libs/llama_tokenizer/unicode-data.h: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #pragma once 16 | 17 | #include <cstdint> 18 | #include <vector> 19 | #include <unordered_map> 20 | #include <unordered_set> 21 | 22 | struct range_nfd { 23 | uint32_t first; 24 | uint32_t last; 25 | uint32_t nfd; 26 | }; 27 | 28 | static const uint32_t MAX_CODEPOINTS = 0x110000; 29 | 30 | extern const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags; 31 | extern const std::unordered_set<uint32_t> unicode_set_whitespace; 32 | extern const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase; 33 | extern const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase; 34 | extern const std::vector<range_nfd> unicode_ranges_nfd; 35 | -------------------------------------------------------------------------------- /src/core/typedefs.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #pragma once 16 | 17 | #include "llama-vocab.h" 18 | 19 | #include <array> 20 | #include <filesystem> 21 | 22 | namespace powerserve { 23 | 24 | using Path = std::filesystem::path; 25 | using Token = llama_vocab::id; 26 | 27 | static constexpr size_t max_n_dims = 4; 28 | using Shape = std::array<size_t, max_n_dims>; 29 | using Stride = std::array<size_t, max_n_dims>; 30 | 31 | struct Noncopyable { 32 | Noncopyable(const Noncopyable &) = delete; 33 | auto operator=(const Noncopyable &) = delete; 34 | 35 | protected: 36 | Noncopyable() = default; 37 | ~Noncopyable() = default; 38 | }; 39 | 40 | } // namespace powerserve 41 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/arange.cu: -------------------------------------------------------------------------------- 1 | #include "arange.cuh" 2 | 3 | static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) { 4 | // blockIdx.x: idx of ne0 / BLOCK_SIZE 5 | int nidx = threadIdx.x + blockIdx.x * blockDim.x; 6 | if (nidx >= ne0) { 7 | return; 8 | } 9 | dst[nidx] = start + step * nidx; 10 | } 11 | 12 | static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) { 13 | int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE; 14 | arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start, step); 15 | } 16 | 17 | void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { 18 | float * dst_d = (float *)dst->data; 19 | cudaStream_t stream = ctx.stream(); 20 | 21 | GGML_ASSERT(dst->type == GGML_TYPE_F32); 22 | 23 | float start; 24 | float stop; 25 | float step; 26 | memcpy(&start, (float *)dst->op_params + 0, sizeof(float)); 27 | memcpy(&stop, (float *)dst->op_params + 1, sizeof(float)); 28 | memcpy(&step, (float *)dst->op_params + 2, sizeof(float)); 29 | 30 | int64_t steps = (int64_t)ceil((stop - start) / step); 31 | GGML_ASSERT(ggml_nelements(dst) == steps); 32 | 33 | arange_f32_cuda(dst_d, dst->ne[0], start, step, stream); 34 | } 35 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/sumrows.cu: -------------------------------------------------------------------------------- 1 | #include "sumrows.cuh" 2 | 3 | static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) { 4 | const int row = blockIdx.x; 5 | const int col = threadIdx.x; 6 | 7 | float sum = 0.0f; 8 | for (int i = col; i < ncols; i += blockDim.x) { 9 | sum += x[row * ncols + i]; 10 | } 11 | 12 | sum = warp_reduce_sum(sum); 13 | 14 | if (col == 0) { 15 | dst[row] = sum; 16 | } 17 | } 18 | 19 | void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) { 20 | const dim3 block_dims(WARP_SIZE, 1, 1); 21 | const dim3 block_nums(nrows, 1, 1); 22 | k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols); 23 | } 24 | 25 | void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { 26 | const ggml_tensor * src0 = dst->src[0]; 27 | const float * src0_d = (const float *)src0->data; 28 | float * dst_d = (float *)dst->data; 29 | cudaStream_t stream = ctx.stream(); 30 | 31 | GGML_ASSERT(src0->type == GGML_TYPE_F32); 32 | GGML_ASSERT( dst->type == GGML_TYPE_F32); 33 | GGML_ASSERT(ggml_is_contiguous(src0)); 34 | 35 | const int64_t ncols = src0->ne[0]; 36 | const int64_t nrows = ggml_nrows(src0); 37 | 38 | sum_rows_f32_cuda(src0_d, dst_d, ncols, nrows, stream); 39 | } 40 | --------------------------------------------------------------------------------
/libs/ggml/src/ggml-sycl/rope.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // 16 | // MIT license 17 | // Copyright (C) 2024 Intel Corporation 18 | // SPDX-License-Identifier: MIT 19 | // 20 | 21 | // 22 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 23 | // See https://llvm.org/LICENSE.txt for license information. 24 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 25 | // 26 | 27 | #ifndef GGML_SYCL_ROPE_HPP 28 | #define GGML_SYCL_ROPE_HPP 29 | 30 | #include "common.hpp" 31 | 32 | void ggml_sycl_op_rope( 33 | ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, 34 | const float *src0_dd, const float *src1_dd, float *dst_dd, const queue_ptr &main_stream); 35 | 36 | #endif // GGML_SYCL_ROPE_HPP 37 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(powerserve STATIC) 2 | target_include_directories(powerserve PUBLIC .) 3 | 4 | target_link_libraries(powerserve PUBLIC 5 | fmt 6 | ggml 7 | llama_tokenizer 8 | nlohmann_json::nlohmann_json 9 | ) 10 | 11 | if (NOT OHOS) 12 | message(STATUS "Introducing libuv (uv_a)") 13 | target_link_libraries(powerserve PUBLIC uv_a) 14 | endif() 15 | 16 | if (POWERSERVE_WITH_PERFETTO) 17 | target_link_libraries(powerserve PRIVATE perfetto) 18 | endif() 19 | 20 | if (POWERSERVE_ANDROID_LOG) 21 | add_compile_definitions(POWERSERVE_ANDROID_LOG) 22 | target_link_libraries(powerserve PUBLIC log) 23 | endif () 24 | 25 | if (NOT MSVC) 26 | target_compile_options(powerserve PRIVATE 27 | -Wall 28 | -Wextra 29 | 30 | -Wno-unused-function 31 | ) 32 | 33 | if (POWERSERVE_ENABLE_WERROR) 34 | target_compile_options(powerserve PRIVATE -Werror) 35 | endif() 36 | endif() 37 | 38 | if(CMAKE_CXX_COMPILER_ID MATCHES "GNU") 39 | target_compile_options(powerserve PRIVATE -Wno-unknown-pragmas) 40 | endif() 41 | 42 | add_subdirectory(storage) 43 | add_subdirectory(backend) 44 | add_subdirectory(core) 45 | add_subdirectory(executor) 46 | add_subdirectory(graph) 47 | add_subdirectory(model) 48 | add_subdirectory(sampler) 49 | if (POWERSERVE_WITH_QNN) 50 | add_subdirectory(speculative) 51 | endif() 52 | add_subdirectory(tokenizer) 53 | -------------------------------------------------------------------------------- /libs/ggml/src/kompute-shaders/op_getrows_q6_k.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | #define NL 16 6 | #define BYTES_FOR_TYPE 4 /*bytes for float*/ 7 | #define SIZE_OF_BLOCK sizeof_block_q6_k 8 | 9 | layout(local_size_x = 1) in; 10 | 11 | layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; 12 | layout (binding = 1) readonly buffer tensorInB
{ int inB[]; }; 13 | layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; 14 | 15 | layout (push_constant) uniform parameter { 16 | uint inAOff; 17 | uint inBOff; 18 | uint outOff; 19 | int ne00; 20 | int nb01; 21 | int nb1; 22 | } pcs; 23 | 24 | block_q6_k get_unaligned_block_q6_k(uint index) { 25 | block_q6_k fres; 26 | [[unroll]] for (uint it = 0; it != QK_K / 2; it++) { 27 | fres.ql[it] = inA[index + it]; 28 | } 29 | [[unroll]] for (uint it = 0; it != QK_K / 4; it++) { 30 | fres.qh[it] = inA[index + QK_K/2 + it]; 31 | } 32 | [[unroll]] for (uint it = 0; it != QK_K / 16; it++) { 33 | fres.scales[it] = int8_t(inA[index + QK_K/2 + QK_K/4 + it]); 34 | } 35 | fres.d = u8BufToFloat16(inA, index + QK_K/2 + QK_K/4 + QK_K/16); 36 | return fres; 37 | } 38 | 39 | mat4 dequantize_block(uint index, uint il) { 40 | const block_q6_k block = get_unaligned_block_q6_k(index); 41 | return dequantize_q6_k(block, il); 42 | } 43 | 44 | #include "op_getrows.comp" 45 | -------------------------------------------------------------------------------- /src/core/spin_barrier.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #pragma once 16 | 17 | #include "core/spin_barrier.h" 18 | 19 | #include <atomic> 20 | 21 | #define CHECK_STRUCT_SIZE_AND_ALIGNMENT(a, b) \ 22 | static_assert(sizeof(a) == sizeof(b), "Sizes of " #a " and " #b " must equal"); \ 23 | static_assert(alignof(a) == alignof(b), "Alignments of " #a " and " #b " must equal") 24 | 25 | namespace powerserve { 26 | 27 | struct SpinBarrier { 28 | void init(size_t new_width); 29 | void wait(); 30 | 31 | private: 32 | size_t width = 0; 33 | std::atomic<size_t> count; 34 | }; 35 | 36 | CHECK_STRUCT_SIZE_AND_ALIGNMENT(SpinBarrier, spin_barrier); 37 | 38 | } // namespace powerserve 39 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-sycl/im2col.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // 16 | // MIT license 17 | // Copyright (C) 2024 Intel Corporation 18 | // SPDX-License-Identifier: MIT 19 | // 20 | 21 | // 22 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
23 | // See https://llvm.org/LICENSE.txt for license information. 24 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 25 | // 26 | 27 | #ifndef GGML_SYCL_IM2COL_HPP 28 | #define GGML_SYCL_IM2COL_HPP 29 | 30 | #include "common.hpp" 31 | 32 | void ggml_sycl_op_im2col( 33 | ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, 34 | ggml_tensor *dst, const float *src0_dd, const float *src1_dd, float *dst_dd, 35 | const queue_ptr &main_stream); 36 | 37 | #endif // GGML_SYCL_IM2COL_HPP 38 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-sycl/softmax.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // 16 | // MIT license 17 | // Copyright (C) 2024 Intel Corporation 18 | // SPDX-License-Identifier: MIT 19 | // 20 | 21 | // 22 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 23 | // See https://llvm.org/LICENSE.txt for license information. 24 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 25 | // 26 | 27 | #ifndef GGML_SYCL_SOFTMAX_HPP 28 | #define GGML_SYCL_SOFTMAX_HPP 29 | 30 | #include "common.hpp" 31 | 32 | void ggml_sycl_op_soft_max(ggml_backend_sycl_context &ctx, const ggml_tensor *src0, 33 | const ggml_tensor *src1, ggml_tensor *dst, 34 | const float *src0_dd, const float *src1_dd, 35 | float *dst_dd, 36 | const queue_ptr &main_stream); 37 | 38 | #endif // GGML_SYCL_SOFTMAX_HPP 39 | -------------------------------------------------------------------------------- /libs/ggml/include/ggml-rpc.h: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #pragma once 16 | 17 | #include "ggml.h" 18 | #include "ggml-backend.h" 19 | 20 | #ifdef __cplusplus 21 | extern "C" { 22 | #endif 23 | 24 | #define GGML_RPC_MAX_SERVERS 16 25 | 26 | // backend API 27 | GGML_API GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint); 28 | GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend); 29 | 30 | GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint); 31 | 32 | GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total); 33 | 34 | GGML_API GGML_CALL void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem); 35 | 36 | #ifdef __cplusplus 37 | } 38 | #endif 39 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cann/kernels/ascendc_kernels.h: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef ASCENDC_KERNELS_H 16 | #define ASCENDC_KERNELS_H 17 | 18 | #include "aclrtlaunch_ascendc_get_row_f32.h" 19 | #include "aclrtlaunch_ascendc_get_row_f16.h" 20 | #include "aclrtlaunch_ascendc_get_row_q8_0.h" 21 | #include "aclrtlaunch_ascendc_get_row_q4_0.h" 22 | 23 | #include "aclrtlaunch_ascendc_quantize_f32_q8_0.h" 24 | #include "aclrtlaunch_ascendc_quantize_f16_q8_0.h" 25 | #include "aclrtlaunch_ascendc_quantize_f16_to_q4_0.h" 26 | #include "aclrtlaunch_ascendc_quantize_f32_to_q4_0.h" 27 | 28 | #include "aclrtlaunch_ascendc_dup_by_rows_fp16.h" 29 | #include "aclrtlaunch_ascendc_dup_by_rows_fp32.h" 30 | #include "aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16.h" 31 | #include "aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32.h" 32 | 33 | #endif // ASCENDC_KERNELS_H 34 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/rms_norm.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "generic_head.comp" 4 | #include "types.comp" 5 | 6 | #extension GL_EXT_control_flow_attributes : enable 7 | #define BLOCK_SIZE 512 8 | 9 | layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in; 10 | 11 | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; 12 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 13 | 14 | shared FLOAT_TYPE sum[BLOCK_SIZE]; 15 | 16 | void main() { 17 | const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x; 18 | const uint tid = gl_LocalInvocationID.x; 19 | 20 | sum[tid] = FLOAT_TYPE(0.0f); // partial sum for thread in warp 21 | 22 | [[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) { 23 | const FLOAT_TYPE xi = FLOAT_TYPE(data_a[row*p.KX + col]); 24 | sum[tid] += xi * xi; 25 | } 26 | 27 | // sum up partial sums and write back result 28 | 
barrier(); 29 | [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) { 30 | if (tid < s) { 31 | sum[tid] += sum[tid + s]; 32 | } 33 | barrier(); 34 | } 35 | 36 | const FLOAT_TYPE mean = sum[0] / FLOAT_TYPE(p.KX); 37 | const FLOAT_TYPE scale = inversesqrt(mean + FLOAT_TYPE(p.param1)); 38 | 39 | [[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) { 40 | data_d[row*p.KX + col] = D_TYPE(scale * FLOAT_TYPE(data_a[row*p.KX + col])); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-sycl/backend.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // 16 | // MIT license 17 | // Copyright (C) 2024 Intel Corporation 18 | // SPDX-License-Identifier: MIT 19 | // 20 | 21 | // 22 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 23 | // See https://llvm.org/LICENSE.txt for license information. 24 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 25 | // 26 | 27 | #ifndef GGML_SYCL_BACKEND_HPP 28 | #define GGML_SYCL_BACKEND_HPP 29 | 30 | #include "concat.hpp" 31 | #include "common.hpp" 32 | #include "conv.hpp" 33 | #include "convert.hpp" 34 | #include "dequantize.hpp" 35 | #include "dmmv.hpp" 36 | #include "mmq.hpp" 37 | #include "mmvq.hpp" 38 | #include "rope.hpp" 39 | #include "norm.hpp" 40 | #include "softmax.hpp" 41 | #include "tsembd.hpp" 42 | #include "im2col.hpp" 43 | 44 | #endif // GGML_SYCL_BACKEND_HPP 45 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/norm.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "generic_head.comp" 4 | #include "types.comp" 5 | 6 | #extension GL_EXT_control_flow_attributes : enable 7 | #define BLOCK_SIZE 512 8 | 9 | layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in; 10 | 11 | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; 12 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 13 | 14 | shared vec2 sum[BLOCK_SIZE]; 15 | 16 | void main() { 17 | const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x; 18 | const uint tid = gl_LocalInvocationID.x; 19 | 20 | sum[tid] = vec2(0.0f, 0.0f); 21 | 22 | [[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) { 23 | const float xi = float(data_a[row*p.KX + col]); 24 | sum[tid].x += xi; 25 | sum[tid].y += xi * xi; 26 | } 27 | 28 | // sum up partial sums and write back result 29 | barrier(); 30 | [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) { 31 | if (tid < s) { 32 | sum[tid] += sum[tid + s]; 33 | } 34 | barrier(); 35 | } 36 | 37 | const float mean = sum[0].x / p.KX; 38 | const float var = sum[0].y / p.KX - mean * mean; 39 | const float inv_std = inversesqrt(var + p.param1); 40 | 41 | [[unroll]] for 
(uint col = tid; col < p.KX; col += BLOCK_SIZE) { 42 | data_d[row*p.KX + col] = D_TYPE((float(data_a[row*p.KX + col]) - mean) * inv_std); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /tests/quant_mul_mat.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include "ggml.h" 16 | 17 | int main() { 18 | 19 | struct ggml_init_params params = { 20 | .mem_size = 16 * 1024 * 1024, 21 | .mem_buffer = NULL, 22 | }; 23 | 24 | // memory allocation happens here 25 | struct ggml_context *ctx = ggml_init(params); 26 | struct ggml_tensor *a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q8_0, 64, 2); 27 | // ggml_set_param(ctx, a); // a is an input variable 28 | struct ggml_tensor *w = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128); 29 | 30 | struct ggml_tensor *a2 = ggml_mul_mat(ctx, w, a); 31 | 32 | struct ggml_cgraph *gf = ggml_new_graph(ctx); 33 | ggml_build_forward_expand(gf, a2); 34 | // ggml_set_2d(); 35 | ggml_graph_compute_with_ctx(ctx, gf, 1); 36 | return 0; 37 | 38 | // ggml_quantize_chunk( 39 | // ); 40 | } 41 | -------------------------------------------------------------------------------- /libs/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # CLI11 2 | add_subdirectory(cli11) 3 | 4 | # fmt 5 | add_subdirectory(fmt) 6 | 7 | # ggml 8 | set(GGML_OPENMP OFF) 9 | add_subdirectory(ggml) 10 | 11 | if (${CMAKE_C_COMPILER_ID} MATCHES "Clang") 12 | target_compile_options(ggml PRIVATE -Wno-double-promotion -Wno-unreachable-code-break) 13 | endif() 14 | 15 | # json 16 | add_subdirectory(json) 17 | 18 | if (NOT OHOS) 19 | # libuv 20 | set(LIBUV_BUILD_SHARED OFF CACHE BOOL "Build shared lib") 21 | add_subdirectory(libuv) 22 | 23 | if (CMAKE_C_COMPILER_ID MATCHES "Clang") 24 | target_compile_options(uv_a PRIVATE -Wno-missing-prototypes -Wno-unreachable-code-return -Wno-shadow) 25 | elseif (CMAKE_C_COMPILER_ID MATCHES "GNU") 26 | target_compile_options(uv_a PRIVATE -Wno-cast-qual -Wno-missing-prototypes -Wno-shadow) 27 | endif() 28 | endif() 29 | 30 | # llama_tokenizer 31 | add_subdirectory(llama_tokenizer) 32 | 33 | # QNN headers 34 | add_subdirectory(qnn_headers) 35 | 36 | # xtl(Required by xtensor) 37 | add_subdirectory(xtl) 38 | 39 | # xtensor 40 | add_subdirectory(xtensor) 41 | 42 | add_subdirectory(stb_headers) 43 | 44 | # http lib 45 | add_subdirectory(cpp-httplib) 46 | 47 | # concurrent queue 48 | add_subdirectory(concurrentqueue) 49 | 50 | # Perfetto 51 | if (POWERSERVE_WITH_PERFETTO) 52 | add_library(perfetto STATIC perfetto/sdk/perfetto.cc) 53 | target_include_directories(perfetto PUBLIC perfetto/sdk) 54 | if (ANDROID) 55 | target_link_libraries(perfetto PRIVATE -llog) 56 | endif() 57 | endif() 58 | -------------------------------------------------------------------------------- 
/libs/ggml/src/vulkan-shaders/concat.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_binary_head.comp" 5 | 6 | void main() { 7 | const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; 8 | const int dim = p.param3; 9 | 10 | if (idx >= p.ne) { 11 | return; 12 | } 13 | 14 | const uint i3 = idx / (p.ne22*p.ne21*p.ne20); 15 | const uint i3_offset = i3 * p.ne22*p.ne21*p.ne20; 16 | const uint i2 = (idx - i3_offset) / (p.ne21*p.ne20); 17 | const uint i2_offset = i2*p.ne21*p.ne20; 18 | const uint i1 = (idx - i3_offset - i2_offset) / p.ne20; 19 | const uint i0 = idx - i3_offset - i2_offset - i1*p.ne20; 20 | 21 | uint o[4] = {0, 0, 0, 0}; 22 | o[dim] = dim == 0 ? p.ne00 : (dim == 1 ? p.ne01 : (dim == 2 ? p.ne02 : p.ne03)); 23 | 24 | const uint src0_idx = i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0*p.nb00; 25 | const uint src1_idx = (i3 - o[3])*p.nb13 + (i2 - o[2])*p.nb12 + (i1 - o[1])*p.nb11 + (i0 - o[0])*p.nb10; 26 | const uint dst_idx = i3*p.nb23 + i2*p.nb22 + i1*p.nb21 + i0*p.nb20; 27 | 28 | const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03; 29 | 30 | #ifndef OPTIMIZATION_ERROR_WORKAROUND 31 | data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : data_b[src1_idx]); 32 | #else 33 | if (is_src0) { 34 | data_d[p.d_offset + dst_idx] = data_a[src0_idx]; 35 | } else { 36 | data_d[p.d_offset + dst_idx] = data_b[src1_idx]; 37 | } 38 | #endif 39 | } 40 | -------------------------------------------------------------------------------- /src/core/timer.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #pragma once 16 | 17 | #include <chrono> 18 | 19 | namespace powerserve { 20 | 21 | auto timestamp_ns() -> int64_t; 22 | auto timestamp_us() -> int64_t; 23 | auto timestamp_ms() -> int64_t; 24 | 25 | struct Timer { 26 | Timer(); 27 | 28 | // Return elapsed time since last tick. 29 | auto elapsed_time_ns() const -> int64_t; 30 | auto elapsed_time_us() const -> int64_t; 31 | auto elapsed_time_ms() const -> int64_t; 32 | 33 | // Return elapsed time since last tick and set new tick.
34 | auto tick_ns() -> int64_t; 35 | auto tick_us() -> int64_t; 36 | auto tick_ms() -> int64_t; 37 | 38 | void reset(); 39 | 40 | private: 41 | using Clock = std::chrono::steady_clock; 42 | 43 | Clock::time_point last_time_point; 44 | 45 | auto tick_impl(Clock::time_point *out_time_point) const -> int64_t; 46 | }; 47 | 48 | } // namespace powerserve 49 | -------------------------------------------------------------------------------- /src/model/module/norm_attention.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #pragma once 16 | 17 | #include "core/config.hpp" 18 | #include "graph/graph.hpp" 19 | #include "graph/node.hpp" 20 | #include "model/common/weights.hpp" 21 | #include "model/module/attention.hpp" 22 | 23 | #include <memory> 24 | 25 | namespace powerserve { 26 | 27 | struct NormAttention : Attention { 28 | 29 | public: 30 | NormAttention(const ModelConfig::LLMConfig &config, std::shared_ptr weights) : Attention(config, weights) {} 31 | 32 | ~NormAttention() = default; 33 | 34 | public: 35 | TensorNode *build( 36 | Graph &g, 37 | TensorNode *x, 38 | int64_t L, 39 | const TensorNode *k_cache, 40 | const TensorNode *v_cache, 41 | const std::vector &pos, 42 | const CausalAttentionMask &mask, 43 | bool is_need_bias = false 44 | ) override; 45 | }; 46 | 47 | } // namespace powerserve 48 | -------------------------------------------------------------------------------- /libs/ggml/src/kompute-shaders/op_mul_mat_mat_f32.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | #extension GL_KHR_shader_subgroup_arithmetic : require 6 | #extension GL_EXT_debug_printf : enable 7 | 8 | // device subgroup size 9 | layout (local_size_x_id = 0) in; 10 | 11 | layout(binding = 0) readonly buffer tensorInA { float inA[]; }; 12 | layout(binding = 1) readonly buffer tensorInB { float inB[]; }; 13 | layout(binding = 2) writeonly buffer tensorOut { float out_[]; }; 14 | 15 | layout(push_constant) uniform parameter { 16 | uint inAOff; 17 | uint inBOff; 18 | uint outOff; 19 | int ne00; 20 | int ne01; 21 | int ne02; 22 | int ne11; 23 | int ne12; 24 | uint nb01; 25 | uint nb02; 26 | uint nb11; 27 | uint nb12; 28 | uint nb1; 29 | uint nb2; 30 | } 31 | pcs; 32 | 33 | 34 | void main() { 35 | uvec3 gid = gl_WorkGroupID; 36 | 37 | uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z; 38 | uint bc_ba = pcs.ne02 > pcs.ne12 ?
gid.z / (pcs.ne02 / pcs.ne12) : gid.z; 39 | 40 | const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) / 4 + pcs.inAOff; // Based from inA 41 | const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB 42 | float sum = 0.0f; 43 | for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) { 44 | sum += float(inA[x+i]) * float(inB[y+i]); 45 | } 46 | 47 | const float all_sum = subgroupAdd(sum); 48 | if (subgroupElect()) { 49 | out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = all_sum; 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-sycl/convert.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // 16 | // MIT license 17 | // Copyright (C) 2024 Intel Corporation 18 | // SPDX-License-Identifier: MIT 19 | // 20 | 21 | // 22 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 23 | // See https://llvm.org/LICENSE.txt for license information. 24 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 25 | // 26 | 27 | #ifndef GGML_SYCL_CONVERT_HPP 28 | #define GGML_SYCL_CONVERT_HPP 29 | 30 | #include "common.hpp" 31 | 32 | template <typename T> 33 | using to_t_sycl_t = void (*)(const void *__restrict__ x, T *__restrict__ y, 34 | int64_t k, dpct::queue_ptr stream); 35 | typedef to_t_sycl_t<float> to_fp32_sycl_t; 36 | typedef to_t_sycl_t<sycl::half> to_fp16_sycl_t; 37 | 38 | to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type); 39 | to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type); 40 | 41 | #endif // GGML_SYCL_CONVERT_HPP 42 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-sycl/mmvq.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // 16 | // MIT license 17 | // Copyright (C) 2024 Intel Corporation 18 | // SPDX-License-Identifier: MIT 19 | // 20 | 21 | // 22 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 23 | // See https://llvm.org/LICENSE.txt for license information.
24 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 25 | // 26 | 27 | #ifndef GGML_SYCL_MMVQ_HPP 28 | #define GGML_SYCL_MMVQ_HPP 29 | 30 | #include "common.hpp" 31 | 32 | 33 | void ggml_sycl_op_mul_mat_vec_q( 34 | ggml_backend_sycl_context & ctx, 35 | const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, 36 | const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i, 37 | float *dst_dd_i, const int64_t row_low, const int64_t row_high, 38 | const int64_t src1_ncols, const int64_t src1_padded_row_size, 39 | const dpct::queue_ptr &stream); 40 | 41 | #endif // GGML_SYCL_MMVQ_HPP 42 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/dequant_q6_k.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "dequant_head.comp" 4 | 5 | layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in; 6 | 7 | layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; 8 | layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; 9 | 10 | void main() { 11 | [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) { 12 | const uint i = gl_WorkGroupID.x * 256 + wgy; 13 | if (i >= p.M * p.K / QUANT_K) { 14 | return; 15 | } 16 | const uint tid = gl_LocalInvocationID.x; 17 | const uint ip = tid / 32; 18 | const uint il = tid - 32 * ip; 19 | const uint is = 8 * ip + il / 16; 20 | 21 | const uint y_idx = i * QUANT_K + 128 * ip + il; 22 | 23 | const uint ql_idx = 64 * ip + il; 24 | const uint8_t qh = data_a[i].qh[32 * ip + il]; 25 | 26 | const FLOAT_TYPE d = FLOAT_TYPE(data_a[i].d); 27 | 28 | data_b[y_idx + 0] = D_TYPE(d * FLOAT_TYPE(data_a[i].scales[is + 0] * (int8_t((data_a[i].ql[ql_idx + 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32))); 29 | data_b[y_idx + 32] = D_TYPE(d * FLOAT_TYPE(data_a[i].scales[is + 2] * (int8_t((data_a[i].ql[ql_idx + 32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32))); 30 | data_b[y_idx + 64] = D_TYPE(d * FLOAT_TYPE(data_a[i].scales[is + 4] * (int8_t((data_a[i].ql[ql_idx + 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32))); 31 | data_b[y_idx + 96] = D_TYPE(d * FLOAT_TYPE(data_a[i].scales[is + 6] * (int8_t((data_a[i].ql[ql_idx + 32] >> 4) | (((qh >> 6) & 3) << 4)) - 32))); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/executor/executor.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #pragma once 16 | 17 | #include "backend/platform.hpp" 18 | #include "graph/graph.hpp" 19 | 20 | namespace powerserve { 21 | 22 | struct Executor { 23 | public: 24 | Platform &m_platform; 25 | Graph &m_graph; 26 | 27 | public: 28 | Executor(Platform &platform, Graph &graph) : m_platform(platform), m_graph(graph) {} 29 | 30 | public: 31 | void allocate_buffers(); 32 | void run(); 33 | void plan(); 34 | 35 | private: 36 | template 37 | void create_cpu_buffer(std::shared_ptr tensor) { 38 | if (tensor->type == NodeType::TENSOR_VIEW) { 39 | tensor->m_data = 40 | CPUBuffer::create_buffer_view(tensor->tensor_view()->parent->get(), tensor->m_shape); 41 | } else { 42 | tensor->m_data = CPUBuffer::create_buffer(tensor->m_shape); 43 | } 44 | } 45 | }; 46 | 47 | } // namespace powerserve 48 | -------------------------------------------------------------------------------- /src/model/module/attention.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #pragma once 16 | 17 | #include "core/config.hpp" 18 | #include "graph/node.hpp" 19 | #include "model/common/weights.hpp" 20 | 21 | namespace powerserve { 22 | 23 | struct Attention { 24 | 25 | public: 26 | const ModelConfig::LLMConfig &m_config; 27 | std::shared_ptr m_weights; 28 | 29 | public: 30 | Attention(const ModelConfig::LLMConfig &config, const std::shared_ptr &weights) : 31 | m_config(config), 32 | m_weights(weights) {} 33 | 34 | virtual ~Attention() = default; 35 | 36 | public: 37 | virtual TensorNode *build( 38 | Graph &g, 39 | TensorNode *x, 40 | int64_t L, 41 | const TensorNode *k_cache, 42 | const TensorNode *v_cache, 43 | const std::vector &pos, 44 | const CausalAttentionMask &mask, 45 | bool is_need_bias = false 46 | ) = 0; 47 | }; 48 | 49 | } // namespace powerserve 50 | -------------------------------------------------------------------------------- /src/model/module/attention_mask.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include "attention_mask.hpp" 16 | 17 | namespace powerserve { 18 | 19 | AttentionMask::AttentionMask(size_t size) : size(size) {} 20 | 21 | AttentionMaskView::AttentionMaskView(const AttentionMask &mask, size_t offset, size_t size) : 22 | mask(mask), 23 | offset(offset), 24 | size(size) {} 25 | 26 | bool AttentionMaskView::not_masked(size_t i, size_t j) const { 27 | return mask.not_masked(offset + i, offset + j); 28 | } 29 | 30 | CausalAttentionMask::CausalAttentionMask(size_t size) : AttentionMask(size) {} 31 | 32 | CausalAttentionMask::CausalAttentionMask(size_t size, const std::vector<std::vector<bool>> &batch_mask) : 33 | AttentionMask(size), 34 | mask(batch_mask) {} 35 | 36 | bool CausalAttentionMask::not_masked(size_t i, size_t j) const { 37 | if (!mask.empty()) { 38 | return mask[i][j]; 39 | } 40 | return i >= j; 41 | } 42 | 43 | } // namespace powerserve 44 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-sycl/dmmv.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // 16 | // MIT license 17 | // Copyright (C) 2024 Intel Corporation 18 | // SPDX-License-Identifier: MIT 19 | // 20 | 21 | // 22 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 23 | // See https://llvm.org/LICENSE.txt for license information.
24 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 25 | // 26 | 27 | #ifndef GGML_SYCL_DMMV_HPP 28 | #define GGML_SYCL_DMMV_HPP 29 | 30 | #include "common.hpp" 31 | 32 | 33 | void ggml_sycl_op_dequantize_mul_mat_vec( 34 | ggml_backend_sycl_context & ctx, 35 | const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, 36 | const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i, 37 | float *dst_dd_i, const int64_t row_low, const int64_t row_high, 38 | const int64_t src1_ncols, const int64_t src1_padded_row_size, 39 | const dpct::queue_ptr &stream); 40 | 41 | #endif // GGML_SYCL_DMMV_HPP 42 | -------------------------------------------------------------------------------- /libs/ggml/src/kompute-shaders/op_mul.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | layout(local_size_x = 1024) in; 6 | 7 | layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; 8 | layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; }; 9 | layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; }; 10 | 11 | layout(push_constant) uniform PushConstants { 12 | uint inAOff; 13 | uint inBOff; 14 | uint outOff; 15 | int ne00; 16 | int nb00; 17 | int nb01; 18 | int nb02; 19 | int nb03; 20 | int ne10; 21 | int ne11; 22 | int ne12; 23 | int ne13; 24 | int nb10; 25 | int nb11; 26 | int nb12; 27 | int nb13; 28 | int ne0; 29 | int nb0; 30 | int nb1; 31 | int nb2; 32 | int nb3; 33 | } pcs; 34 | 35 | void main() { 36 | const uint i03 = gl_WorkGroupID.z; 37 | const uint i02 = gl_WorkGroupID.y; 38 | const uint i01 = gl_WorkGroupID.x; 39 | 40 | const uint i13 = i03 % pcs.ne13; 41 | const uint i12 = i02 % pcs.ne12; 42 | const uint i11 = i01 % pcs.ne11; 43 | 44 | uint src0_off = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01) / 4); 45 | uint src1_off = uint((i13*pcs.nb13 + i12*pcs.nb12 + i11*pcs.nb11) / 4); 46 | uint dst_off = uint((i03*pcs.nb3 + i02*pcs.nb2 + i01*pcs.nb1) / 4); 47 | 48 | for (uint i0 = gl_LocalInvocationID.x; i0 < pcs.ne0; i0 += gl_WorkGroupSize.x) { 49 | const uint i10 = i0 % pcs.ne10; 50 | out_[pcs.outOff + dst_off + i0] = inA[pcs.inAOff + src0_off + i0] * inB[pcs.inBOff + src1_off + i10]; 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/model/module/ffn.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
--------------------------------------------------------------------------------
/src/model/module/ffn.cpp:
--------------------------------------------------------------------------------
// Copyright 2024-2025 PowerServe Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "model/module/ffn.hpp"

#include "graph/graph.hpp"
#include "graph/node.hpp"

namespace powerserve {

TensorNode *FFN::build(Graph &g, TensorNode *attn_o, int64_t L) {
    auto ffn_norm_w = g.add_tensor(m_weights->lw[L].ffn_norm);
    auto ffn_norm_o = g.rms_norm(attn_o, ffn_norm_w, m_config.norm_eps);

    auto gate_w = g.add_tensor(m_weights->lw[L].ffn_gate);
    auto gate_o = g.mat_mul(gate_w, ffn_norm_o);

    auto up_w = g.add_tensor(m_weights->lw[L].ffn_up);
    auto up_o = g.mat_mul(up_w, ffn_norm_o);

    // {hidden_dim, bs, 1, 1}
    auto silu = g.silu_hadamard(gate_o, up_o);

    auto down_w = g.add_tensor(m_weights->lw[L].ffn_down);
    auto down_o = g.mat_mul(down_w, silu);

    // {embed_dim, bs, 1, 1}
    auto res_conn = g.add(attn_o, down_o);

    return res_conn;
}

} // namespace powerserve
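`silu_hadamard` is the fused SwiGLU activation: elementwise, the FFN output before the residual is down * (silu(gate * x) ⊙ (up * x)), with silu(v) = v / (1 + e^{-v}). A scalar C++ reference for just the fused step, as a sketch rather than PowerServe's actual kernel:

```cpp
#include <cmath>
#include <cstddef>

// silu_hadamard: out[i] = silu(gate[i]) * up[i], with silu(v) = v / (1 + exp(-v)).
void silu_hadamard_ref(const float *gate, const float *up, float *out, size_t n) {
    for (size_t i = 0; i < n; i++) {
        const float g = gate[i];
        const float silu = g / (1.0f + std::exp(-g));
        out[i] = silu * up[i];
    }
}
```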
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-sycl/mmq.hpp:
--------------------------------------------------------------------------------
// Copyright 2024-2025 PowerServe Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//
// MIT license
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: MIT
//

//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//

#ifndef GGML_SYCL_MMQ_HPP
#define GGML_SYCL_MMQ_HPP

#include "common.hpp"

void ggml_sycl_op_mul_mat_q(
    ggml_backend_sycl_context & ctx,
    const ggml_tensor* src0,
    const ggml_tensor* src1,
    ggml_tensor* dst,
    const char* src0_dd_i,
    const float* src1_ddf_i,
    const char* src1_ddq_i,
    float* dst_dd_i,
    const int64_t row_low,
    const int64_t row_high,
    const int64_t src1_ncols,
    const int64_t src1_padded_row_size,
    const dpct::queue_ptr& stream);

#endif // GGML_SYCL_MMQ_HPP
--------------------------------------------------------------------------------
/libs/ggml/src/vulkan-shaders/dequant_q2_k.comp:
--------------------------------------------------------------------------------
#version 450

#include "dequant_head.comp"

layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;

layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};

void main() {
    [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
        const uint i = gl_WorkGroupID.x * 256 + wgy;
        if (i >= p.M * p.K / QUANT_K) {
            return;
        }

        const uint tid = gl_LocalInvocationID.x;
        const uint ip = tid / 32;
        const uint il = tid - 32 * ip;
        const uint is = 8 * ip + il / 16;

        const uint y_idx = i * QUANT_K + 128 * ip + il;

        const uint ql_idx = 32 * ip + il;
        const uint8_t qs = data_a[i].qs[32 * ip + il];

        FLOAT_TYPE dall = FLOAT_TYPE(data_a[i].d.x);
        FLOAT_TYPE dmin = FLOAT_TYPE(data_a[i].d.y);
        data_b[y_idx + 0]  = D_TYPE(dall * FLOAT_TYPE((data_a[i].scales[is+0] & 0xF) * ((qs >> 0) & 3)) - dmin * FLOAT_TYPE(data_a[i].scales[is+0] >> 4));
        data_b[y_idx + 32] = D_TYPE(dall * FLOAT_TYPE((data_a[i].scales[is+2] & 0xF) * ((qs >> 2) & 3)) - dmin * FLOAT_TYPE(data_a[i].scales[is+2] >> 4));
        data_b[y_idx + 64] = D_TYPE(dall * FLOAT_TYPE((data_a[i].scales[is+4] & 0xF) * ((qs >> 4) & 3)) - dmin * FLOAT_TYPE(data_a[i].scales[is+4] >> 4));
        data_b[y_idx + 96] = D_TYPE(dall * FLOAT_TYPE((data_a[i].scales[is+6] & 0xF) * ((qs >> 6) & 3)) - dmin * FLOAT_TYPE(data_a[i].scales[is+6] >> 4));
    }
}
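In the Q2_K dequantization above, each `scales` byte packs a 4-bit scale (low nibble, applied to `dall`) and a 4-bit minimum (high nibble, applied to `dmin`), and each `qs` byte packs four 2-bit quants whose four fields land 32 elements apart, which is why the shader writes to `y_idx + 0/32/64/96`. The per-element formula as a scalar C++ sketch, with the block layout simplified for illustration:

```cpp
#include <cstdint>

// Dequantize one 2-bit value. `shift` selects the 2-bit field within the qs
// byte (0, 2, 4, or 6); `sc` is the packed scale/min byte for that group;
// `dall` and `dmin` are the block's two scale factors widened to float.
float dequant_q2_k_one(uint8_t qs, uint8_t sc, int shift, float dall, float dmin) {
    const int q = (qs >> shift) & 3;      // 2-bit quant
    const float scale = float(sc & 0xF);  // low nibble: scale
    const float min_  = float(sc >> 4);   // high nibble: minimum
    return dall * scale * float(q) - dmin * min_;
}
```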
--------------------------------------------------------------------------------
/libs/ggml/src/kompute-shaders/op_rmsnorm.comp:
--------------------------------------------------------------------------------
#version 450

#include "common.comp"

layout(local_size_x = 512) in;

layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
layout(binding = 1) buffer restrict tensorOut { float out_[]; };

layout(push_constant) uniform PushConstants {
    uint inOff;
    uint outOff;
    uint ne00;
    uint nb01;
    float eps;
} pcs;

shared float sum[gl_WorkGroupSize.x];

void main() {
    const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_

    // parallel sum
    sum[gl_LocalInvocationID.x] = 0.0;
    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
        sum[gl_LocalInvocationID.x] += in_[x+i00] * in_[x+i00];
    }

    // reduce
    barrier();
    memoryBarrierShared();
    [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
        if (gl_LocalInvocationID.x < i) {
            sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
        }
        barrier();
        memoryBarrierShared();
    }

    // broadcast
    if (gl_LocalInvocationID.x == 0) {
        sum[0] /= float(pcs.ne00);
    }
    barrier();
    memoryBarrierShared();

    const float scale = 1.0f/sqrt(sum[0] + pcs.eps);

    const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
        out_[y+i00] = in_[x+i00] * scale;
    }
}
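The shader computes standard RMSNorm: each row is divided by the root of its mean square plus epsilon. The same computation as a scalar C++ reference (like the kernel above, this omits any learned weight multiply):

```cpp
#include <cmath>
#include <cstddef>

// RMSNorm over one row: out[i] = in[i] / sqrt(mean(in^2) + eps).
void rms_norm_ref(const float *in, float *out, size_t n, float eps) {
    float sum = 0.0f;
    for (size_t i = 0; i < n; i++) {
        sum += in[i] * in[i];
    }
    const float scale = 1.0f / std::sqrt(sum / float(n) + eps);
    for (size_t i = 0; i < n; i++) {
        out[i] = in[i] * scale;
    }
}
```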
--------------------------------------------------------------------------------
/app/run/README.md:
--------------------------------------------------------------------------------
# CLI

## Quick Start

To get started right away, run the following command, making sure to use the correct path for the model you have downloaded:
- [hugging-face](https://huggingface.co/PowerServe/Llama-3.1-8B-PowerServe-QNN/tree/main)

```bash
# assemble a runnable project from the compiled binaries and the model (Linux)
./powerserve create --exe-path ./build/out -m ./Llama-3.1-8B-PowerServe-QNN/ -o proj
```

## Common Options

In this section, we cover the most commonly used options for running the `run` program with models:

- `--work-folder [-d] DIRECTORY`: The directory containing GGUF or QNN models.
- `--n-predicts [-n] N`: Set the number of tokens to predict when generating text. Adjusting this value influences the length of the generated text.
- `--no-qnn`: Set this flag to disable the QNN backend (if compiled with `POWERSERVE_WITH_QNN=ON`).

```bash
./proj/bin/powerserve-run --work-folder proj --prompt "Once upon a time"
```

## Input Prompts

The `run` program provides several ways to interact with the models using input prompts:

- `--prompt [-p] PROMPT`: Provide a prompt directly as a command-line option.
- `--prompt-file [-f] FNAME`: Provide a file containing a prompt or multiple prompts.

## Additional Options

These options provide extra functionality and customization when running the LLaMA models:

- `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
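Putting the options above together, a typical invocation reads the prompt from a file and caps the generation length; `prompt.txt` here is a placeholder for any local file:

```bash
./proj/bin/powerserve-run --work-folder proj --prompt-file prompt.txt -n 128
```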
--------------------------------------------------------------------------------
/src/sampler/sampler_chain.hpp:
--------------------------------------------------------------------------------
// Copyright 2024-2025 PowerServe Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "core/config.hpp"
#include "sampler.hpp"
#include "tokenizer/tokenizer.hpp"

#include <memory>
#include <vector>

namespace powerserve {

struct SamplerChain final : Sampler {
    virtual ~SamplerChain() override = default;

    SamplerChain() = default;

    SamplerChain(const HyperParams::SamplerConfig &config, const Tokenizer &tokenizer) {
        build_from_config(config, tokenizer);
    }

    template <typename S, typename... Args>
    void append(Args &&...args) {
        m_samplers.emplace_back(std::make_unique<S>(std::forward<Args>(args)...));
    }

    void build_from_config(const HyperParams::SamplerConfig &config, const Tokenizer &tokenizer);

    void apply(ProbArray &probs) override;
    void accept(Token token) override;

private:
    std::vector<std::unique_ptr<Sampler>> m_samplers;
};

} // namespace powerserve
--------------------------------------------------------------------------------
/src/core/android_logger.hpp:
--------------------------------------------------------------------------------
// Copyright 2024-2025 PowerServe Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <android/log.h>
#include <fmt/format.h>

namespace powerserve {

#define POWERSERVE_LOG_DEBUG(...) \
    __android_log_write(ANDROID_LOG_DEBUG, "PowerServe", fmt::format("" __VA_ARGS__).c_str())

#define POWERSERVE_LOG_INFO(...) \
    __android_log_write(ANDROID_LOG_INFO, "PowerServe", fmt::format("" __VA_ARGS__).c_str())

#define POWERSERVE_LOG_WARN(...) \
    __android_log_write(ANDROID_LOG_WARN, "PowerServe", fmt::format("" __VA_ARGS__).c_str())

#define POWERSERVE_LOG_ERROR(...) \
    __android_log_write(ANDROID_LOG_ERROR, "PowerServe", fmt::format("" __VA_ARGS__).c_str())

} // namespace powerserve
--------------------------------------------------------------------------------
/src/model/module/attention_mask.hpp:
--------------------------------------------------------------------------------
// Copyright 2024-2025 PowerServe Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <cstddef>
#include <vector>

namespace powerserve {

struct AttentionMask {
    size_t size = 0;

    AttentionMask(size_t size);

    virtual ~AttentionMask() = default;

    virtual bool not_masked(size_t i, size_t j) const = 0;
};

struct AttentionMaskView {
    const AttentionMask &mask;
    size_t offset = 0;

    size_t size = 0;

    AttentionMaskView(const AttentionMask &mask, size_t offset, size_t size);

    bool not_masked(size_t i, size_t j) const;
};

struct CausalAttentionMask : AttentionMask {
    std::vector<std::vector<bool>> mask;

    CausalAttentionMask(size_t size);
    CausalAttentionMask(size_t size, const std::vector<std::vector<bool>> &mask);
    virtual ~CausalAttentionMask() override = default;
    virtual bool not_masked(size_t i, size_t j) const override;
};

} // namespace powerserve
--------------------------------------------------------------------------------
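A hypothetical sketch of how `SamplerChain::append` from `sampler_chain.hpp` above composes sampling stages. The `TemperatureSampler` and `TopKSampler` types are illustrative stand-ins rather than names from this repository, and the include path is assumed relative to `src/`; only the `append`/`apply` interface comes from the header itself:

```cpp
#include "sampler/sampler_chain.hpp"

using namespace powerserve;

void build_chain_example(ProbArray &probs) {
    SamplerChain chain;
    // Each append() constructs a sampler in place and appends it to
    // m_samplers; apply() then runs every stage over the probability
    // array in order.
    chain.append<TemperatureSampler>(0.8f); // hypothetical sampler type
    chain.append<TopKSampler>(40);          // hypothetical sampler type
    chain.apply(probs);
}
```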