├── libs
├── .clang-format
├── ggml
│   ├── .gitignore
│   ├── src
│   │   ├── ggml-cuda
│   │   │   ├── argsort.cuh
│   │   │   ├── fattn.cuh
│   │   │   ├── fattn-tile-f16.cuh
│   │   │   ├── fattn-tile-f32.cuh
│   │   │   ├── acc.cuh
│   │   │   ├── pad.cuh
│   │   │   ├── clamp.cuh
│   │   │   ├── rope.cuh
│   │   │   ├── scale.cuh
│   │   │   ├── arange.cuh
│   │   │   ├── concat.cuh
│   │   │   ├── im2col.cuh
│   │   │   ├── pool2d.cuh
│   │   │   ├── getrows.cuh
│   │   │   ├── softmax.cuh
│   │   │   ├── upscale.cuh
│   │   │   ├── diagmask.cuh
│   │   │   ├── tsembd.cuh
│   │   │   ├── conv-transpose-1d.cuh
│   │   │   ├── template-instances
│   │   │   │   ├── mmq-instance-iq1_s.cu
│   │   │   │   ├── mmq-instance-iq2_s.cu
│   │   │   │   ├── mmq-instance-iq3_s.cu
│   │   │   │   ├── mmq-instance-q2_k.cu
│   │   │   │   ├── mmq-instance-q3_k.cu
│   │   │   │   ├── mmq-instance-q4_0.cu
│   │   │   │   ├── mmq-instance-q4_1.cu
│   │   │   │   ├── mmq-instance-q4_k.cu
│   │   │   │   ├── mmq-instance-q5_0.cu
│   │   │   │   ├── mmq-instance-q5_1.cu
│   │   │   │   ├── mmq-instance-q5_k.cu
│   │   │   │   ├── mmq-instance-q6_k.cu
│   │   │   │   ├── mmq-instance-q8_0.cu
│   │   │   │   ├── mmq-instance-iq2_xs.cu
│   │   │   │   ├── mmq-instance-iq2_xxs.cu
│   │   │   │   ├── mmq-instance-iq3_xxs.cu
│   │   │   │   ├── mmq-instance-iq4_nl.cu
│   │   │   │   ├── mmq-instance-iq4_xs.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs64-f16-f16.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs64-f16-f16.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-f16-f16.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-f16-q4_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-f16-q4_1.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-f16-q5_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-f16-q5_1.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-f16-q8_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q4_0-f16.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q4_0-q4_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q4_0-q4_1.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q4_0-q5_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q4_0-q5_1.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q4_0-q8_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q4_1-f16.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q4_1-q4_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q4_1-q4_1.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q4_1-q5_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q4_1-q5_1.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q4_1-q8_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q5_0-f16.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q5_0-q4_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q5_0-q4_1.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q5_0-q5_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q5_0-q5_1.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q5_0-q8_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q5_1-f16.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q5_1-q4_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q5_1-q4_1.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q5_1-q5_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q5_1-q5_1.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q5_1-q8_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q8_0-f16.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q8_0-q4_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q8_0-q4_1.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q8_0-q5_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q8_0-q5_1.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs128-q8_0-q8_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs256-f16-f16.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs64-f16-q4_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs64-f16-q4_1.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs64-f16-q5_0.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs64-f16-q5_1.cu
│   │   │   │   ├── fattn-vec-f16-instance-hs64-f16-q8_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-f16-f16.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-f16-q4_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-f16-q4_1.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-f16-q5_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-f16-q5_1.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-f16-q8_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q4_0-f16.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q4_0-q4_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q4_0-q4_1.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q4_0-q5_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q4_0-q5_1.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q4_0-q8_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q4_1-f16.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q4_1-q4_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q4_1-q4_1.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q4_1-q5_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q4_1-q5_1.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q4_1-q8_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q5_0-f16.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q5_0-q4_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q5_0-q4_1.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q5_0-q5_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q5_0-q5_1.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q5_0-q8_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q5_1-f16.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q5_1-q4_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q5_1-q4_1.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q5_1-q5_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q5_1-q5_1.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q5_1-q8_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q8_0-f16.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q8_0-q4_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q8_0-q4_1.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q8_0-q5_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q8_0-q5_1.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs128-q8_0-q8_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs256-f16-f16.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs64-f16-q4_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs64-f16-q4_1.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs64-f16-q5_0.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs64-f16-q5_1.cu
│   │   │   │   ├── fattn-vec-f32-instance-hs64-f16-q8_0.cu
│   │   │   │   ├── fattn-wmma-f16-instance-kqhalf-cpb8.cu
│   │   │   │   ├── fattn-wmma-f16-instance-kqfloat-cpb32.cu
│   │   │   │   ├── fattn-wmma-f16-instance-kqhalf-cpb16.cu
│   │   │   │   ├── fattn-wmma-f16-instance-kqhalf-cpb32.cu
│   │   │   │   └── fattn-wmma-f16-instance-kqfloat-cpb16.cu
│   │   │   ├── cross-entropy-loss.cuh
│   │   │   ├── sum.cuh
│   │   │   ├── sumrows.cuh
│   │   │   ├── norm.cuh
│   │   │   ├── cpy.cuh
│   │   │   ├── binbcast.cuh
│   │   │   ├── convert.cuh
│   │   │   ├── mmvq.cuh
│   │   │   ├── dmmv.cuh
│   │   │   ├── quantize.cuh
│   │   │   ├── scale.cu
│   │   │   ├── vendors
│   │   │   │   └── cuda.h
│   │   │   ├── clamp.cu
│   │   │   ├── arange.cu
│   │   │   └── sumrows.cu
│   │   ├── vulkan-shaders
│   │   │   ├── generic_head.comp
│   │   │   ├── CMakeLists.txt
│   │   │   ├── dequant_head.comp
│   │   │   ├── scale.comp
│   │   │   ├── add.comp
│   │   │   ├── div.comp
│   │   │   ├── mul.comp
│   │   │   ├── cos.comp
│   │   │   ├── sin.comp
│   │   │   ├── square.comp
│   │   │   ├── clamp.comp
│   │   │   ├── copy.comp
│   │   │   ├── dequant_f32.comp
│   │   │   ├── tanh.comp
│   │   │   ├── relu.comp
│   │   │   ├── silu.comp
│   │   │   ├── leaky_relu.comp
│   │   │   ├── gelu_quick.comp
│   │   │   ├── mul_mat_split_k_reduce.comp
│   │   │   ├── acc.comp
│   │   │   ├── get_rows.comp
│   │   │   ├── repeat.comp
│   │   │   ├── gelu.comp
│   │   │   ├── diag_mask_inf.comp
│   │   │   ├── dequant_q8_0.comp
│   │   │   ├── pad.comp
│   │   │   ├── dequant_q4_0.comp
│   │   │   ├── dequant_iq4_nl.comp
│   │   │   ├── dequant_q4_1.comp
│   │   │   ├── rope_norm.comp
│   │   │   ├── rope_neox.comp
│   │   │   ├── sum_rows.comp
│   │   │   ├── get_rows_quant.comp
│   │   │   ├── dequant_q5_0.comp
│   │   │   ├── dequant_q5_1.comp
│   │   │   ├── timestep_embedding.comp
│   │   │   ├── upscale.comp
│   │   │   ├── rms_norm.comp
│   │   │   ├── norm.comp
│   │   │   ├── concat.comp
│   │   │   ├── dequant_q6_k.comp
│   │   │   └── dequant_q2_k.comp
│   │   ├── kompute-shaders
│   │   │   ├── op_scale.comp
│   │   │   ├── op_relu.comp
│   │   │   ├── op_mul_mv_q_n_pre.comp
│   │   │   ├── op_scale_8.comp
│   │   │   ├── op_silu.comp
│   │   │   ├── op_getrows.comp
│   │   │   ├── op_gelu.comp
│   │   │   ├── op_addrow.comp
│   │   │   ├── op_diagmask.comp
│   │   │   ├── op_getrows_f32.comp
│   │   │   ├── op_getrows_f16.comp
│   │   │   ├── op_getrows_q4_0.comp
│   │   │   ├── op_getrows_q4_1.comp
│   │   │   ├── op_mul_mat_q4_0.comp
│   │   │   ├── op_mul_mat_q4_1.comp
│   │   │   ├── op_getrows_q6_k.comp
│   │   │   ├── op_mul_mat_mat_f32.comp
│   │   │   ├── op_mul.comp
│   │   │   └── op_rmsnorm.comp
│   │   ├── llamafile
│   │   │   └── sgemm.h
│   │   ├── ggml-cann
│   │   │   └── kernels
│   │   │   │   ├── CMakeLists.txt
│   │   │   │   └── ascendc_kernels.h
│   │   └── ggml-sycl
│   │   │   ├── conv.hpp
│   │   │   ├── tsembd.hpp
│   │   │   ├── concat.hpp
│   │   │   ├── rope.hpp
│   │   │   ├── im2col.hpp
│   │   │   ├── softmax.hpp
│   │   │   ├── backend.hpp
│   │   │   ├── convert.hpp
│   │   │   ├── mmvq.hpp
│   │   │   ├── dmmv.hpp
│   │   │   └── mmq.hpp
│   ├── README.md
│   └── include
│   │   ├── ggml-blas.h
│   │   └── ggml-rpc.h
├── qnn_headers
│   └── CMakeLists.txt
├── stb_headers
│   └── CMakeLists.txt
├── llama_tokenizer
│   ├── CMakeLists.txt
│   └── unicode-data.h
└── CMakeLists.txt
├── tools
├── convert_hf_to_gguf
│   ├── gguf-py
│   │   ├── gguf
│   │   │   ├── py.typed
│   │   │   ├── __init__.py
│   │   │   └── gguf.py
│   │   ├── tests
│   │   │   └── __init__.py
│   │   ├── scripts
│   │   │   └── __init__.py
│   │   ├── LICENSE
│   │   ├── pyproject.toml
│   │   └── examples
│   │   │   └── writer.py
│   ├── llama-quantize-x86_64-linux-clang
│   └── requirements.txt
├── CMakeLists.txt
├── parameter_search
│   └── token_tree
│   │   ├── requirements.txt
│   │   ├── .gitignore
│   │   ├── search.py
│   │   ├── README.md
│   │   └── analyze.py
├── mmlu
│   ├── .gitignore
│   └── README.md
├── qnn_converter
│   ├── .gitignore
│   ├── prompt
│   │   ├── system_prompt_qwen.txt
│   │   └── system_prompt_llama.txt
│   ├── requirements.txt
│   ├── soc_config.py
│   └── graph_params.py
├── simple_qnn_test
│   ├── .gitignore
│   └── README.md
├── extract_embd_from_vl
│   └── README.md
├── gguf_config_to_json
│   └── CMakeLists.txt
├── gen_flame_graph.sh
├── end_to_end
│   └── powerserve.sh
└── cos_sim.py
├── requirements.txt
├── assets
├── system_prompts
│   ├── qwen2.txt
│   └── llama3.txt
└── prompts
│   ├── comparison_qwen2.txt
│   ├── strawberry_qwen2.txt
│   ├── comparison_llama.txt
│   ├── gsm8k1.txt
│   ├── gsm8k1_qwen.txt
│   ├── gsm8k1_llama.txt
│   ├── gsm8k2.txt
│   ├── short_prompt.txt
│   ├── gsm8k2_qwen.txt
│   └── long_prompt.txt
├── src
├── executor
│   ├── CMakeLists.txt
│   └── executor.hpp
├── tokenizer
│   └── CMakeLists.txt
├── model
│   ├── llama
│   │   └── CMakeLists.txt
│   ├── qwen2
│   │   └── CMakeLists.txt
│   ├── module
│   │   ├── CMakeLists.txt
│   │   ├── ffn.hpp
│   │   ├── norm_attention.hpp
│   │   ├── attention.hpp
│   │   ├── attention_mask.cpp
│   │   ├── ffn.cpp
│   │   └── attention_mask.hpp
│   ├── CMakeLists.txt
│   ├── internvl
│   │   └── CMakeLists.txt
│   └── model_loader.hpp
├── speculative
│   ├── CMakeLists.txt
│   └── speculative_config.hpp
├── graph
│   ├── CMakeLists.txt
│   ├── node.cpp
│   └── op_type.hpp
├── storage
│   └── CMakeLists.txt
├── sampler
│   ├── CMakeLists.txt
│   └── sampler_chain.hpp
├── backend
│   ├── ggml
│   │   └── CMakeLists.txt
│   ├── CMakeLists.txt
│   ├── qnn
│   │   └── CMakeLists.txt
│   └── backend.hpp
├── core
│   ├── CMakeLists.txt
│   ├── buffer.hpp
│   ├── spin_barrier.h
│   ├── defines.hpp
│   ├── getenv.hpp
│   ├── typedefs.hpp
│   ├── spin_barrier.hpp
│   ├── timer.hpp
│   └── android_logger.hpp
└── CMakeLists.txt
├── app
├── run
│   ├── CMakeLists.txt
│   └── README.md
├── perplexity
│   └── CMakeLists.txt
├── common
│   └── CMakeLists.txt
├── CMakeLists.txt
└── server
│   ├── CMakeLists.txt
│   └── server.cpp
├── pyproject.toml
├── .gitignore
├── tests
│   ├── CMakeLists.txt
│   └── quant_mul_mat.cpp
├── docs
│   └── perfetto_tracing.md
├── .clang-format
└── .gitmodules

--------------------------------------------------------------------------------
/libs/.clang-format:
--------------------------------------------------------------------------------
DisableFormat: true
--------------------------------------------------------------------------------
/tools/convert_hf_to_gguf/gguf-py/gguf/py.typed:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/tools/CMakeLists.txt:
--------------------------------------------------------------------------------
add_subdirectory(gguf_config_to_json)
--------------------------------------------------------------------------------
/tools/parameter_search/token_tree/requirements.txt:
--------------------------------------------------------------------------------
tqdm
--------------------------------------------------------------------------------
/tools/parameter_search/token_tree/.gitignore:
--------------------------------------------------------------------------------
*.jsonl
*.bak
--------------------------------------------------------------------------------
/tools/convert_hf_to_gguf/gguf-py/tests/__init__.py:
--------------------------------------------------------------------------------
from .test_metadata import *
--------------------------------------------------------------------------------
/tools/mmlu/.gitignore:
--------------------------------------------------------------------------------
/*.json
/prompt/*.txt
dev/
test/
val/
--------------------------------------------------------------------------------
/tools/qnn_converter/.gitignore:
--------------------------------------------------------------------------------
output/
smallthinker_*/
tmp*/
debug/
--------------------------------------------------------------------------------
/libs/ggml/.gitignore:
--------------------------------------------------------------------------------
src/ggml-vulkan-shaders.hpp
src/ggml-vulkan-shaders.cpp
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
-r ./tools/convert_hf_to_gguf/requirements.txt

isort
black
--------------------------------------------------------------------------------
/assets/system_prompts/qwen2.txt:
--------------------------------------------------------------------------------
<|im_start|>system
You are a helpful assistant.<|im_end|>
--------------------------------------------------------------------------------
/src/executor/CMakeLists.txt:
--------------------------------------------------------------------------------
target_sources(powerserve PRIVATE
    "executor.cpp"
)
--------------------------------------------------------------------------------
/src/tokenizer/CMakeLists.txt:
--------------------------------------------------------------------------------
target_sources(powerserve PRIVATE
    "tokenizer.cpp"
)
--------------------------------------------------------------------------------
/src/model/llama/CMakeLists.txt:
--------------------------------------------------------------------------------
target_sources(powerserve PRIVATE
    "llama_model.cpp"
)
--------------------------------------------------------------------------------
/src/model/qwen2/CMakeLists.txt:
--------------------------------------------------------------------------------
target_sources(powerserve PRIVATE
    "qwen2_model.cpp"
)
--------------------------------------------------------------------------------
/src/speculative/CMakeLists.txt:
--------------------------------------------------------------------------------
target_sources(powerserve PRIVATE
    "token_tree.cpp"
)
--------------------------------------------------------------------------------
/src/graph/CMakeLists.txt:
--------------------------------------------------------------------------------
target_sources(powerserve PRIVATE
    "graph.cpp"
    "node.cpp"
)
--------------------------------------------------------------------------------
/tools/qnn_converter/prompt/system_prompt_qwen.txt:
--------------------------------------------------------------------------------
<|im_start|>system
You are a helpful assistant.<|im_end|>
--------------------------------------------------------------------------------
/tools/simple_qnn_test/.gitignore:
--------------------------------------------------------------------------------
./model_libs
./output

*.bin
*.onnx
*.cpp
*.json
*.so
--------------------------------------------------------------------------------
/assets/system_prompts/llama3.txt:
--------------------------------------------------------------------------------
<|start_header_id|>system<|end_header_id|>
You are a helpful assistant.<|eot_id|>
--------------------------------------------------------------------------------
/assets/prompts/comparison_qwen2.txt:
--------------------------------------------------------------------------------
<|im_start|>user
9.11 and 9.8, which is larger?<|im_end|>
<|im_start|>assistant
--------------------------------------------------------------------------------
/assets/prompts/strawberry_qwen2.txt:
--------------------------------------------------------------------------------
<|im_start|>user
How many "r"s in "strawberry"?<|im_end|>
<|im_start|>assistant
--------------------------------------------------------------------------------
/src/storage/CMakeLists.txt:
--------------------------------------------------------------------------------
file(GLOB_RECURSE storage_source *.cpp)
target_sources(powerserve PRIVATE
    ${storage_source}
)
--------------------------------------------------------------------------------
/tools/qnn_converter/prompt/system_prompt_llama.txt:
--------------------------------------------------------------------------------
<|start_header_id|>system<|end_header_id|>
You are a helpful assistant.<|eot_id|>
--------------------------------------------------------------------------------
/src/sampler/CMakeLists.txt:
--------------------------------------------------------------------------------
target_sources(powerserve PRIVATE
    "prob_array.cpp"
    "sampler_chain.cpp"
    "sampler.cpp"
)
--------------------------------------------------------------------------------
/app/run/CMakeLists.txt:
--------------------------------------------------------------------------------
add_executable(run run.cpp)
target_link_libraries(run PRIVATE powerserve app.common)

powerserve_add_binary(run)
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/argsort.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/fattn.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/src/backend/ggml/CMakeLists.txt:
--------------------------------------------------------------------------------
target_sources(powerserve PRIVATE
    "ggml.cpp"
    "ggml_kv_cache.cpp"
    "ggml_wrapper.cpp"
)
--------------------------------------------------------------------------------
/src/model/module/CMakeLists.txt:
--------------------------------------------------------------------------------
target_sources(powerserve PRIVATE
    "attention_mask.cpp"
    "ffn.cpp"
    "norm_attention.cpp"
)
--------------------------------------------------------------------------------
/libs/qnn_headers/CMakeLists.txt:
--------------------------------------------------------------------------------
add_library(qnn_headers INTERFACE
    "qnn_type_macros.hpp"
)
target_include_directories(qnn_headers INTERFACE .)
--------------------------------------------------------------------------------
/assets/prompts/comparison_llama.txt:
--------------------------------------------------------------------------------
<|start_header_id|>user<|end_header_id|>9.11 and 9.8, which is larger?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/fattn-tile-f16.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/fattn-tile-f32.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/acc.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

#define CUDA_ACC_BLOCK_SIZE 256

void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/pad.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

#define CUDA_PAD_BLOCK_SIZE 256

void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/tools/convert_hf_to_gguf/llama-quantize-x86_64-linux-clang:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/powerserve-project/PowerServe/HEAD/tools/convert_hf_to_gguf/llama-quantize-x86_64-linux-clang
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/clamp.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

#define CUDA_CLAMP_BLOCK_SIZE 256

void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/rope.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

#define CUDA_ROPE_BLOCK_SIZE 256

void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/scale.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

#define CUDA_SCALE_BLOCK_SIZE 256

void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/tools/convert_hf_to_gguf/requirements.txt:
--------------------------------------------------------------------------------
numpy>=1.26.4
sentencepiece>=0.2.0
transformers>=4.40.1,<5.0.0
gguf>=0.1.0
protobuf>=4.21.0,<5.0.0
torch>=2.2.1,<2.5.0
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/arange.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

#define CUDA_ARANGE_BLOCK_SIZE 256

void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/concat.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

#define CUDA_CONCAT_BLOCK_SIZE 256

void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/im2col.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

#define CUDA_IM2COL_BLOCK_SIZE 256

void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/pool2d.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

#define CUDA_POOL2D_BLOCK_SIZE 256

void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/getrows.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

#define CUDA_GET_ROWS_BLOCK_SIZE 256

void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/softmax.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

#define CUDA_SOFT_MAX_BLOCK_SIZE 1024

void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/upscale.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

#define CUDA_UPSCALE_BLOCK_SIZE 256

void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/libs/stb_headers/CMakeLists.txt:
--------------------------------------------------------------------------------
add_library(stb_headers INTERFACE
    "stb/stb_image.h"
    "stb/stb_image_resize2.h"
)
target_include_directories(stb_headers INTERFACE ./stb)
--------------------------------------------------------------------------------
/tools/qnn_converter/requirements.txt:
--------------------------------------------------------------------------------
onnx==1.16.1
onnxruntime==1.18.0
onnxsim==0.4.36
sentencepiece>=0.2.0
transformers>=4.40.1,<5.0.0
torch>=1.13.1,<2.5.0
protobuf==3.20.2
--------------------------------------------------------------------------------
/tools/extract_embd_from_vl/README.md:
--------------------------------------------------------------------------------
# export embedding only
```
python ./tools/extract_embd_from_vl/main.py --model-path /data1/models/Qwen2-VL-2B-Instruct/ --out-path ./tmp.gguf
```
--------------------------------------------------------------------------------
/app/perplexity/CMakeLists.txt:
--------------------------------------------------------------------------------
add_executable(perplexity-test "main.cpp")
target_link_libraries(perplexity-test PRIVATE powerserve ggml app.common)

powerserve_add_artifact(perplexity-test)
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/diagmask.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32

void ggml_cuda_op_diag_mask_inf(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/src/backend/CMakeLists.txt:
--------------------------------------------------------------------------------
target_sources(powerserve PRIVATE
    "platform.cpp"
)

add_subdirectory(ggml)

if (POWERSERVE_WITH_QNN)
    add_subdirectory(qnn)
endif()
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/tsembd.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

#define CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE 256

void ggml_cuda_op_timestep_embedding(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/src/model/CMakeLists.txt:
--------------------------------------------------------------------------------
add_subdirectory(llama)
add_subdirectory(internvl)
add_subdirectory(qwen2)
add_subdirectory(module)

target_sources(powerserve PRIVATE
    "model_loader.cpp"
)
--------------------------------------------------------------------------------
/tools/parameter_search/token_tree/search.py:
--------------------------------------------------------------------------------
import time

from common import *
from tqdm import tqdm


for params in tqdm(untested_params):
    run(params)
    time.sleep(5)
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/conv-transpose-1d.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

#define CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE 256

void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_IQ1_S);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_IQ2_S);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_IQ3_S);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_Q2_K);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_Q3_K);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_Q4_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_Q4_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_Q4_K);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_Q5_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_Q5_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_Q5_K);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_Q6_K);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_Q8_0);
--------------------------------------------------------------------------------
/src/model/internvl/CMakeLists.txt:
--------------------------------------------------------------------------------
target_sources(powerserve PRIVATE
    "internvl_model.cpp"
)
target_link_libraries(powerserve PRIVATE stb_headers)
target_link_libraries(powerserve PRIVATE xtensor xtl)
--------------------------------------------------------------------------------
/tools/gguf_config_to_json/CMakeLists.txt:
--------------------------------------------------------------------------------
add_executable(config-generator "main.cpp")
target_link_libraries(config-generator PRIVATE powerserve ggml CLI11::CLI11)

powerserve_add_artifact(config-generator)
--------------------------------------------------------------------------------
/app/common/CMakeLists.txt:
--------------------------------------------------------------------------------
add_library(app.common STATIC cmdline.cpp)
target_link_libraries(app.common PRIVATE powerserve CLI11::CLI11)

target_include_directories(app.common PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/cross-entropy-loss.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

#define CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE 256

void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_IQ2_XS);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_IQ2_XXS);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_IQ3_XXS);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_IQ4_NL);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../mmq.cuh"

DECL_MMQ_CASE(GGML_TYPE_IQ4_XS);
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[tool.black]
line-length = 120
include = 'powerserve'
extend-exclude = '/.*/'
preview = true
enable-unstable-feature = ['hug_parens_with_braces_and_square_brackets', 'docstring_check_for_newline']
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
build*/
.cache/
.vscode/
__pycache__/
.venv/
models/
proj/

*.svg
*.data
*.folded
*.perf
Pipfile
tools/llama_tf_to_qnn_bin
tools/process_image
*.log
--------------------------------------------------------------------------------
/libs/ggml/src/vulkan-shaders/generic_head.comp:
--------------------------------------------------------------------------------
#extension GL_EXT_shader_16bit_storage : require

layout (push_constant) uniform parameter
{
    uint KX;
    uint KY;
    float param1;
    float param2;
} p;
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/sum.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream);

void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/libs/ggml/README.md:
--------------------------------------------------------------------------------
## Copyright and License

The original ggml project is © 2023-2024 by the ggml authors and is released under the MIT License. You can find a copy of the original license [here](https://github.com/ggerganov/ggml/blob/master/LICENSE).
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/sumrows.cuh:
--------------------------------------------------------------------------------
#include "common.cuh"

void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream);

void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_F16);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f32.cuh"

DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_F16);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f16.cuh"

DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q8_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f32.cuh"

DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f32.cuh"

DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f32.cuh"

DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f32.cuh"

DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f32.cuh"

DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f32.cuh"

DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f32.cuh"

DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f32.cuh"

DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f32.cuh"

DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f32.cuh"

DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-vec-f32.cuh"

DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1);
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu:
--------------------------------------------------------------------------------
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 
2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 
2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 
2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_0); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_1); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 
2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_0); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_1); 6 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-vec-f32.cuh" 4 | 5 | DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q8_0); 6 | -------------------------------------------------------------------------------- /assets/prompts/gsm8k1.txt: -------------------------------------------------------------------------------- 1 | Mark has a garden with flowers. He planted plants of three different colors in it. Ten of them are yellow, and there are 80% more of those in purple. There are only 25% as many green flowers as there are yellow and purple flowers. How many flowers does Mark have in his garden? -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(quant_mul_mat "quant_mul_mat.cpp") 2 | target_link_libraries(quant_mul_mat PRIVATE powerserve ggml) 3 | 4 | if (POWERSERVE_WITH_QNN) 5 | add_executable(qnn_test qnn_test.cpp) 6 | target_link_libraries(qnn_test PRIVATE powerserve CLI11::CLI11) 7 | endif() 8 | -------------------------------------------------------------------------------- /tools/convert_hf_to_gguf/gguf-py/gguf/__init__.py: -------------------------------------------------------------------------------- 1 | from .constants import * 2 | from .gguf_reader import * 3 | from .gguf_writer import * 4 | from .lazy import * 5 | from .metadata import * 6 | from .quants import * 7 | from .tensor_mapping import * 8 | from .utility import * 9 | from .vocab import * 10 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/norm.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 4 | 5 | void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | 7 | void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 8 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | find_package (Threads REQUIRED) 2 | 3 | set(TARGET vulkan-shaders-gen) 4 | add_executable(${TARGET} vulkan-shaders-gen.cpp) 5 | install(TARGETS ${TARGET} RUNTIME) 6 | target_compile_features(${TARGET} PRIVATE cxx_std_11) 7 | target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads) 8 | -------------------------------------------------------------------------------- /src/core/CMakeLists.txt: 
-------------------------------------------------------------------------------- 1 | target_sources(powerserve PRIVATE 2 | "config.cpp" 3 | "perf.cpp" 4 | "perfetto_trace.cpp" 5 | "thread_pool.cpp" 6 | "timer.cpp" 7 | "spin_barrier.cpp" 8 | ) 9 | 10 | if (POWERSERVE_WITH_PERFETTO) 11 | target_compile_definitions(powerserve PUBLIC POWERSERVE_WITH_PERFETTO) 12 | endif() 13 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/dequant_head.comp: -------------------------------------------------------------------------------- 1 | #extension GL_EXT_control_flow_attributes : require 2 | #extension GL_EXT_shader_16bit_storage : require 3 | 4 | layout (push_constant) uniform parameter 5 | { 6 | uint M; 7 | uint K; 8 | uint stride_a; 9 | uint stride_b; 10 | uint nel; 11 | } p; 12 | 13 | #include "types.comp" 14 | -------------------------------------------------------------------------------- /assets/prompts/gsm8k1_qwen.txt: -------------------------------------------------------------------------------- 1 | <|im_start|>user 2 | Mark has a garden with flowers. He planted plants of three different colors in it. Ten of them are yellow, and there are 80% more of those in purple. There are only 25% as many green flowers as there are yellow and purple flowers. How many flowers does Mark have in his garden?<|im_end|> 3 | <|im_start|>assistant 4 | -------------------------------------------------------------------------------- /tools/convert_hf_to_gguf/gguf-py/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | # pyright: reportUnusedImport=false 2 | 3 | from .gguf_convert_endian import main as gguf_convert_endian_entrypoint 4 | from .gguf_dump import main as gguf_dump_entrypoint 5 | from .gguf_new_metadata import main as gguf_new_metadata_entrypoint 6 | from .gguf_set_metadata import main as gguf_set_metadata_entrypoint 7 | -------------------------------------------------------------------------------- /tools/qnn_converter/soc_config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | @dataclass 5 | class SoCConfig: 6 | htp_version: int 7 | soc_id: int 8 | 9 | 10 | soc_map = { 11 | "8650": SoCConfig(htp_version=75, soc_id=57), 12 | "8750": SoCConfig(htp_version=79, soc_id=69), 13 | "8295": SoCConfig(htp_version=68, soc_id=39), 14 | } 15 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/cpy.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_CPY_BLOCK_SIZE 32 4 | 5 | void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1); 6 | 7 | void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 8 | 9 | void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1); 10 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/scale.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_unary_head.comp" 5 | 6 | void main() { 7 | const uint idx = get_idx(); 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(p.param1)); 14 | } 15 | 
-------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-wmma-f16.cuh" 4 | 5 | DECL_FATTN_WMMA_F16_CASE(64, 8, half); 6 | DECL_FATTN_WMMA_F16_CASE(96, 8, half); 7 | DECL_FATTN_WMMA_F16_CASE(128, 8, half); 8 | DECL_FATTN_WMMA_F16_CASE(256, 8, half); 9 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/add.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_binary_head.comp" 5 | 6 | void main() { 7 | const uint idx = get_idx(); 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) + FLOAT_TYPE(data_b[src1_idx(idx)])); 14 | } 15 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/div.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_binary_head.comp" 5 | 6 | void main() { 7 | const uint idx = get_idx(); 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) / FLOAT_TYPE(data_b[src1_idx(idx)])); 14 | } 15 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/mul.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_binary_head.comp" 5 | 6 | void main() { 7 | const uint idx = get_idx(); 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(data_b[src1_idx(idx)])); 14 | } 15 | -------------------------------------------------------------------------------- /assets/prompts/gsm8k1_llama.txt: -------------------------------------------------------------------------------- 1 | <|start_header_id|>user<|end_header_id|>Mark has a garden with flowers. He planted plants of three different colors in it. Ten of them are yellow, and there are 80% more of those in purple. There are only 25% as many green flowers as there are yellow and purple flowers. 
How many flowers does Mark have in his garden?<|eot_id|><|start_header_id|>assistant<|end_header_id|> -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/cos.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_unary_head.comp" 5 | 6 | void main() { 7 | const uint idx = get_idx(); 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]); 14 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(cos(val)); 15 | } 16 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/sin.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_unary_head.comp" 5 | 6 | void main() { 7 | const uint idx = get_idx(); 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]); 14 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(sin(val)); 15 | } 16 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/square.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_unary_head.comp" 5 | 6 | void main() { 7 | const uint idx = get_idx(); 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]); 14 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val * val); 15 | } 16 | -------------------------------------------------------------------------------- /assets/prompts/gsm8k2.txt: -------------------------------------------------------------------------------- 1 | Alexis is applying for a new job and bought a new set of business clothes to wear to the interview. She went to a department store with a budget of $200 and spent $30 on a button-up shirt, $46 on suit pants, $38 on a suit coat, $11 on socks, and $18 on a belt. She also purchased a pair of shoes, but lost the receipt for them. She has $16 left from her budget. How much did Alexis pay for the shoes? -------------------------------------------------------------------------------- /assets/prompts/short_prompt.txt: -------------------------------------------------------------------------------- 1 | Mia is planning a camping trip in the Canadian Rockies and has a budget of $800 for equipment. She buys a tent for $120, which is 15% off the original price. She then purchases a sleeping bag for $80, which is 20% off. If she also needs to buy a backpack and a portable stove, and the total cost of these two items is $180, what percentage of her budget will she have left after all the purchases? -------------------------------------------------------------------------------- /libs/llama_tokenizer/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(llama_tokenizer STATIC 2 | "llama-vocab.cpp" 3 | "unicode-data.cpp" 4 | "unicode.cpp" 5 | ) 6 | target_link_libraries(llama_tokenizer PRIVATE ggml) 7 | target_include_directories(llama_tokenizer PUBLIC .) 
8 | 9 | if (CMAKE_C_COMPILER_ID MATCHES "Clang") 10 | target_compile_options(llama_tokenizer PRIVATE -Wno-deprecated-declarations) 11 | endif() 12 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-wmma-f16.cuh" 4 | 5 | DECL_FATTN_WMMA_F16_CASE(64, 32, float); 6 | DECL_FATTN_WMMA_F16_CASE(80, 32, float); 7 | DECL_FATTN_WMMA_F16_CASE(96, 32, float); 8 | DECL_FATTN_WMMA_F16_CASE(112, 32, float); 9 | DECL_FATTN_WMMA_F16_CASE(128, 32, float); 10 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/clamp.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_unary_head.comp" 5 | 6 | void main() { 7 | const uint idx = get_idx(); 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]); 14 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val)); 15 | } 16 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/binbcast.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 4 | void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 5 | void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 6 | void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 7 | void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst); 8 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/copy.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_unary_head.comp" 5 | 6 | void main() { 7 | const uint idx = get_idx(); 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | #ifndef OPTIMIZATION_ERROR_WORKAROUND 14 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx(idx)]); 15 | #else 16 | data_d[p.d_offset + dst_idx(idx)] = data_a[src0_idx(idx)]; 17 | #endif 18 | } 19 | -------------------------------------------------------------------------------- /assets/prompts/gsm8k2_qwen.txt: -------------------------------------------------------------------------------- 1 | <|im_start|>user 2 | Alexis is applying for a new job and bought a new set of business clothes to wear to the interview. She went to a department store with a budget of $200 and spent $30 on a button-up shirt, $46 on suit pants, $38 on a suit coat, $11 on socks, and $18 on a belt. She also purchased a pair of shoes, but lost the receipt for them. She has $16 left from her budget. 
How much did Alexis pay for the shoes?<|im_end|> 3 | <|im_start|>assistant 4 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/convert.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define CUDA_DEQUANTIZE_BLOCK_SIZE 256 4 | 5 | template<typename T> 6 | using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream); 7 | 8 | typedef to_t_cuda_t<float> to_fp32_cuda_t; 9 | typedef to_t_cuda_t<half> to_fp16_cuda_t; 10 | 11 | to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type); 12 | 13 | to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type); 14 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-wmma-f16.cuh" 4 | 5 | DECL_FATTN_WMMA_F16_CASE(64, 16, half); 6 | DECL_FATTN_WMMA_F16_CASE(80, 16, half); 7 | DECL_FATTN_WMMA_F16_CASE(96, 16, half); 8 | DECL_FATTN_WMMA_F16_CASE(112, 16, half); 9 | DECL_FATTN_WMMA_F16_CASE(128, 16, half); 10 | DECL_FATTN_WMMA_F16_CASE(256, 16, half); 11 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-wmma-f16.cuh" 4 | 5 | DECL_FATTN_WMMA_F16_CASE(64, 32, half); 6 | DECL_FATTN_WMMA_F16_CASE(80, 32, half); 7 | DECL_FATTN_WMMA_F16_CASE(96, 32, half); 8 | DECL_FATTN_WMMA_F16_CASE(112, 32, half); 9 | DECL_FATTN_WMMA_F16_CASE(128, 32, half); 10 | DECL_FATTN_WMMA_F16_CASE(256, 32, half); 11 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu: -------------------------------------------------------------------------------- 1 | // This file has been autogenerated by generate_cu_files.py, do not edit manually. 2 | 3 | #include "../fattn-wmma-f16.cuh" 4 | 5 | DECL_FATTN_WMMA_F16_CASE(64, 16, float); 6 | DECL_FATTN_WMMA_F16_CASE(80, 16, float); 7 | DECL_FATTN_WMMA_F16_CASE(96, 16, float); 8 | DECL_FATTN_WMMA_F16_CASE(112, 16, float); 9 | DECL_FATTN_WMMA_F16_CASE(128, 16, float); 10 | DECL_FATTN_WMMA_F16_CASE(256, 16, float); 11 | -------------------------------------------------------------------------------- /tools/parameter_search/token_tree/README.md: -------------------------------------------------------------------------------- 1 | First, export the model and build the project. 2 | 3 | Hyperparameters are set by editing `search_grid` in `common.py`; the run command is changed in the `run` function. 4 | 5 | Push the scripts to the phone: 6 | 7 | ```bash 8 | rsync -avzP tools/parameter_search/token_tree/{common,search}.py 8gen4:~/ 9 | ``` 10 | 11 | Run the script on the phone: 12 | 13 | ```bash 14 | python search.py 15 | ``` 16 | 17 | Then download the database to your local machine and analyze it with `analyze.py`: 18 | 19 | ```bash 20 | cd tools/parameter_search/token_tree 21 | rsync -avzP 8gen4:~/database.jsonl . 22 | python analyze.py 23 | ``` 24 | 
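A minimal sketch of what `common.py` might provide, for orientation only. The actual file is not included in this dump, so every name and value below is an assumption, inferred from how `analyze.py` (shown later in this dump) uses `database` and `format_params`:

```python
# Hypothetical sketch of common.py -- NOT the actual tool code.
# analyze.py expects a `database` that maps each parameter combination to a
# stat dict (with n_iterations, n_draft_times, n_generated_tokens) or None,
# plus a `format_params` helper; the grid keys and values here are made up.
import json
from itertools import product
from pathlib import Path

# Assumed hyperparameter grid that search.py would sweep.
search_grid = {
    "tree_depth": [4, 6, 8],
    "top_k": [8, 16],
}

def iter_grid(grid: dict):
    # Yield every combination as a hashable tuple of (key, value) pairs.
    keys = sorted(grid)
    for values in product(*(grid[k] for k in keys)):
        yield tuple(zip(keys, values))

def format_params(params) -> str:
    # Render a combination such as (("top_k", 8), ("tree_depth", 4)) readably.
    return " ".join(f"{key}={value}" for key, value in params)

def _load_database(path: Path = Path("database.jsonl")) -> dict:
    # One JSON record per line: {"params": {...}, "stat": {...}} (stat may be null).
    database = {params: None for params in iter_grid(search_grid)}
    if path.exists():
        with path.open() as f:
            for line in f:
                record = json.loads(line)
                database[tuple(sorted(record["params"].items()))] = record["stat"]
    return database

database = _load_database()
```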
-------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/mmvq.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | #define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels. 4 | 5 | void ggml_cuda_op_mul_mat_vec_q( 6 | ggml_backend_cuda_context & ctx, 7 | const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, 8 | const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, 9 | const int64_t src1_padded_row_size, cudaStream_t stream); 10 | -------------------------------------------------------------------------------- /libs/ggml/src/kompute-shaders/op_scale.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | layout(local_size_x = 1) in; 6 | 7 | layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; 8 | layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; 9 | 10 | layout(push_constant) uniform PushConstants { 11 | uint inOff; 12 | uint outOff; 13 | float scale; 14 | } pcs; 15 | 16 | void main() { 17 | const uint i = gl_WorkGroupID.x; 18 | out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale; 19 | } 20 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/dequant_f32.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "dequant_head.comp" 4 | 5 | layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; 6 | 7 | layout (binding = 0) readonly buffer A {float data_a[];}; 8 | layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; 9 | 10 | void main() { 11 | const uint i = gl_GlobalInvocationID.x * 16; 12 | 13 | if (i >= p.nel) { 14 | return; 15 | } 16 | 17 | [[unroll]] for (uint l = 0; l < 16; l++) { 18 | data_b[i + l] = D_TYPE(data_a[i + l]); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /app/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(POWERSERVE_BINARY_DIR ${PROJECT_BINARY_DIR}/bin) 2 | message(STATUS "PowerServe binary dir: ${POWERSERVE_BINARY_DIR}") 3 | 4 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${POWERSERVE_BINARY_DIR}) 5 | set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${POWERSERVE_BINARY_DIR}) 6 | set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${POWERSERVE_BINARY_DIR}) 7 | 8 | function(powerserve_add_binary target_name) 9 | powerserve_add_artifact(${target_name}) 10 | endfunction() 11 | 12 | add_subdirectory(common) 13 | 14 | add_subdirectory(run) 15 | add_subdirectory(perplexity) 16 | add_subdirectory(server) 17 | -------------------------------------------------------------------------------- /tools/convert_hf_to_gguf/gguf-py/gguf/gguf.py: -------------------------------------------------------------------------------- 1 | # This file is left for compatibility. If you want to use the GGUF API from Python 2 | # then don't import gguf/gguf.py directly. 
If you're looking for examples, see the 3 | # examples/ directory for gguf-py 4 | 5 | import importlib 6 | import sys 7 | from pathlib import Path 8 | 9 | 10 | sys.path.insert(0, str(Path(__file__).parent.parent)) 11 | 12 | # Compatibility for people trying to import gguf/gguf.py directly instead of as a package. 13 | importlib.invalidate_caches() 14 | import gguf # noqa: E402 15 | 16 | 17 | importlib.reload(gguf) 18 | -------------------------------------------------------------------------------- /src/backend/qnn/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(QNN_SDK $ENV{QNN_SDK_ROOT}) 2 | 3 | if ("${QNN_SDK}" STREQUAL "") 4 | message(FATAL_ERROR "Environment variable \"QNN_SDK_ROOT\" is not defined.") 5 | endif() 6 | 7 | message("QNN_SDK_ROOT=${QNN_SDK}") 8 | 9 | target_sources(powerserve PRIVATE 10 | "causal_models.cpp" 11 | "graph_interface.cpp" 12 | "qnn_backend.cpp" 13 | "qnn.cpp" 14 | ) 15 | target_include_directories(powerserve PUBLIC ${QNN_SDK}/include/QNN) 16 | target_compile_definitions(powerserve PUBLIC POWERSERVE_WITH_QNN) 17 | target_link_libraries(powerserve PUBLIC qnn_headers) 18 | -------------------------------------------------------------------------------- /libs/ggml/src/kompute-shaders/op_relu.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | layout(local_size_x = 1) in; 6 | 7 | layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; 8 | layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; 9 | layout(push_constant) uniform PushConstants { 10 | uint inOff; 11 | uint outOff; 12 | } pcs; 13 | 14 | void main() { 15 | const uint baseIndex = gl_WorkGroupID.x * 4; 16 | 17 | for (uint x = 0; x < 4; x++) { 18 | const uint i = baseIndex + x; 19 | out_[i + pcs.outOff] = max(0.0, in_[i + pcs.inOff]); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/tanh.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "generic_head.comp" 4 | #include "types.comp" 5 | 6 | #extension GL_EXT_control_flow_attributes : enable 7 | 8 | layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; 9 | 10 | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; 11 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 12 | 13 | void main() { 14 | const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; 15 | 16 | if (i >= p.KX) { 17 | return; 18 | } 19 | 20 | data_d[i] = D_TYPE(tanh(data_a[i])); 21 | } 22 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/relu.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "generic_head.comp" 4 | #include "types.comp" 5 | 6 | #extension GL_EXT_control_flow_attributes : enable 7 | 8 | layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; 9 | 10 | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; 11 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 12 | 13 | void main() { 14 | const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; 15 | 16 | if (i >= p.KX) { 17 | return; 18 | } 19 | 20 | data_d[i] = max(float(data_a[i]), 0); 21 | } 22 | 
-------------------------------------------------------------------------------- /libs/ggml/src/kompute-shaders/op_mul_mv_q_n_pre.comp: -------------------------------------------------------------------------------- 1 | layout(local_size_x_id = 0) in; 2 | layout(local_size_y = 1) in; 3 | layout(local_size_z = 1) in; 4 | 5 | layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; 6 | layout (binding = 1) readonly buffer tensorInB { float inB[]; }; 7 | layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; 8 | 9 | layout (push_constant) uniform parameter { 10 | uint inAOff; 11 | uint inBOff; 12 | uint outOff; 13 | int ne00; 14 | int ne01; 15 | int ne02; 16 | int ne10; 17 | int ne12; 18 | int ne0; 19 | int ne1; 20 | uint r2; 21 | uint r3; 22 | } pcs; 23 | -------------------------------------------------------------------------------- /libs/ggml/src/kompute-shaders/op_scale_8.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | layout(local_size_x = 1) in; 6 | 7 | layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; 8 | layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; 9 | 10 | layout(push_constant) uniform PushConstants { 11 | uint inOff; 12 | uint outOff; 13 | float scale; 14 | } pcs; 15 | 16 | void main() { 17 | const uint baseIndex = gl_WorkGroupID.x * 8; 18 | 19 | for (uint x = 0; x < 8; x++) { 20 | const uint i = baseIndex + x; 21 | out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale; 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /libs/ggml/src/kompute-shaders/op_silu.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | layout(local_size_x = 1) in; 6 | 7 | layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; 8 | layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; 9 | layout(push_constant) uniform PushConstants { 10 | uint inOff; 11 | uint outOff; 12 | } pcs; 13 | 14 | void main() { 15 | const uint baseIndex = gl_WorkGroupID.x * 4; 16 | 17 | for (uint x = 0; x < 4; x++) { 18 | const uint i = baseIndex + x; 19 | const float y = in_[i + pcs.inOff]; 20 | out_[i + pcs.outOff] = y / (1.0 + exp(-y)); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/silu.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "generic_head.comp" 4 | #include "types.comp" 5 | 6 | #extension GL_EXT_control_flow_attributes : enable 7 | 8 | layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; 9 | 10 | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; 11 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 12 | 13 | void main() { 14 | const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; 15 | 16 | if (i >= p.KX) { 17 | return; 18 | } 19 | 20 | const float xi = float(data_a[i]); 21 | data_d[i] = D_TYPE(xi / (1.0f + exp(-xi))); 22 | } 23 | -------------------------------------------------------------------------------- /libs/ggml/src/kompute-shaders/op_getrows.comp: -------------------------------------------------------------------------------- 1 | void main() { 2 | const uint i = gl_WorkGroupID.x; 3 | const int r = inB[i + pcs.inBOff]; 4 | 5 | int z 
= 0; 6 | for (uint ind = gl_LocalInvocationID.x; ind < pcs.ne00/16; ind += gl_WorkGroupSize.x) { 7 | const uint inIndex = (r * pcs.nb01 + pcs.inAOff) + ind/NL * SIZE_OF_BLOCK; 8 | const mat4 result = dequantize_block(inIndex, ind%NL); 9 | for (uint j = 0; j < 4; ++j) { 10 | for (uint k = 0; k < 4; ++k) { 11 | const uint outIndex = i * pcs.nb1/BYTES_FOR_TYPE + pcs.outOff + z; 12 | out_[outIndex] = result[j][k]; 13 | ++z; 14 | } 15 | } 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/leaky_relu.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "generic_head.comp" 4 | #include "types.comp" 5 | 6 | #extension GL_EXT_control_flow_attributes : enable 7 | 8 | layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; 9 | 10 | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; 11 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 12 | 13 | void main() { 14 | const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; 15 | 16 | if (i >= p.KX) { 17 | return; 18 | } 19 | 20 | const float val = float(data_a[i]); 21 | data_d[i] = D_TYPE(max(val, 0.0f) + min(val, 0.0f) * p.param1); 22 | } 23 | -------------------------------------------------------------------------------- /tools/gen_flame_graph.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # bash ./tools/gen_flame_graph.sh ./build/bin/run --file-path ../models/Meta-Llama-3.1-8B/llama3-8b_Q4_0.gguf --vocab-path ../models/Meta-Llama-3.1-8B/llama3.1_8b_vocab.gguf --prompt "Tell me a story:" --steps 16 3 | set -x 4 | 5 | TOOLS_DIR="/home/zwb/SS/FlameGraph" 6 | CUR_DIR=$(pwd) 7 | 8 | cmd=("$@") 9 | 10 | sudo perf record -F 99 -a -g -- "${cmd[@]}" 11 | sudo perf script -i perf.data > $CUR_DIR/out.perf 12 | sudo $TOOLS_DIR/stackcollapse-perf.pl $CUR_DIR/out.perf > $CUR_DIR/out.folded 13 | sudo $TOOLS_DIR/flamegraph.pl $CUR_DIR/out.folded > $CUR_DIR/out.svg 14 | sudo chmod a+rw $CUR_DIR/out.svg 15 | 16 | sudo rm -f $CUR_DIR/perf.data $CUR_DIR/out.perf $CUR_DIR/out.folded 17 | -------------------------------------------------------------------------------- /tools/simple_qnn_test/README.md: -------------------------------------------------------------------------------- 1 | # Simple QNN Test 2 | 3 | ## Convert Model 4 | 5 | Convert the model, compile it, and generate the binary: 6 | ```shell 7 | python3 simple_convert_qnn.py --model-name simple_model --model-num 3 8 | ``` 9 | The output will be located in the `output` directory. 10 | 11 | ## Execute Model 12 | 13 | Push the model to the phone: 14 | 15 | ```shell 16 | adb push output/simple_model.bin <device_dir> 17 | ``` 18 | 19 | ```shell 20 | <build_dir>/tests/qnn_tests --qnn-path /data/local/tmp/simple_model \ 21 | --model-name <model_name> \ 22 | --graph-num <graph_num> \ 23 | --repeat <repeat_times> 24 | ``` 25 | -------------------------------------------------------------------------------- /libs/ggml/src/kompute-shaders/op_gelu.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | layout(local_size_x = 1) in; 6 | 7 | layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; 8 | layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; 9 | layout(push_constant) uniform PushConstants { 10 | uint inOff; 11 | uint outOff; 12 | } pcs; 13 | 14 | void main() { 15 | const uint baseIndex = 
gl_WorkGroupID.x * 8; 16 | 17 | for (uint x = 0; x < 8; x++) { 18 | const uint i = baseIndex + x; 19 | const float y = in_[i + pcs.inOff]; 20 | out_[i + pcs.outOff] = 0.5*y*(1.0 + tanh(clamp(SQRT_2_OVER_PI*y*(1.0 + GELU_COEF_A*y*y), -15.0, 15.0))); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/dmmv.cuh: -------------------------------------------------------------------------------- 1 | #include "common.cuh" 2 | 3 | // dmmv = dequantize_mul_mat_vec 4 | 5 | // TODO: remove this? 6 | #ifndef GGML_CUDA_DMMV_X 7 | #define GGML_CUDA_DMMV_X 32 8 | #endif 9 | 10 | #ifndef GGML_CUDA_MMV_Y 11 | #define GGML_CUDA_MMV_Y 1 12 | #endif 13 | 14 | void ggml_cuda_op_dequantize_mul_mat_vec( 15 | ggml_backend_cuda_context & ctx, 16 | const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, 17 | const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, 18 | const int64_t src1_padded_row_size, cudaStream_t stream); 19 | 20 | bool ggml_cuda_dmmv_type_supported(ggml_type src0_type); 21 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/gelu_quick.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "generic_head.comp" 4 | #include "types.comp" 5 | 6 | #extension GL_EXT_control_flow_attributes : enable 7 | 8 | layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; 9 | 10 | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; 11 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 12 | 13 | void main() { 14 | const float GELU_QUICK_COEF = -1.702f; 15 | const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; 16 | 17 | if (i >= p.KX) { 18 | return; 19 | } 20 | 21 | const float x = float(data_a[i]); 22 | data_d[i] = D_TYPE(x * (1.0f / (1.0f + exp(GELU_QUICK_COEF * x)))); 23 | } 24 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/mul_mat_split_k_reduce.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #extension GL_EXT_control_flow_attributes : enable 4 | 5 | layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; 6 | 7 | layout (binding = 0) readonly buffer A {float data_a[];}; 8 | layout (binding = 1) writeonly buffer D {float data_d[];}; 9 | 10 | layout (push_constant) uniform parameter { 11 | uint ne; 12 | uint k_num; 13 | } p; 14 | 15 | void main() { 16 | const uint idx = gl_GlobalInvocationID.x; 17 | 18 | if (idx >= p.ne) { 19 | return; 20 | } 21 | 22 | float result = 0.0f; 23 | 24 | [[unroll]] for (uint i = 0; i < p.k_num; i++) { 25 | result += data_a[i * p.ne + idx]; 26 | } 27 | 28 | data_d[idx] = result; 29 | } 30 | -------------------------------------------------------------------------------- /libs/ggml/src/kompute-shaders/op_addrow.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | layout(local_size_x = 1) in; 6 | 7 | layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; 8 | layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; }; 9 | layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; }; 10 | 11 | layout(push_constant) 
uniform PushConstants { 12 | uint inAOff; 13 | uint inBOff; 14 | uint outOff; 15 | uint row; 16 | } pcs; 17 | 18 | void main() { 19 | const uint baseIndex = gl_WorkGroupID.x * 4; 20 | 21 | for (uint x = 0; x < 4; x++) { 22 | const uint i = baseIndex + x; 23 | out_[i + pcs.outOff] = inA[i + pcs.inAOff] + inB[(i % pcs.row) + pcs.inBOff]; 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /tools/parameter_search/token_tree/analyze.py: -------------------------------------------------------------------------------- 1 | from common import * 2 | 3 | 4 | def get_score(stat: dict) -> float: 5 | draft_model_latency_ms = 8.421 6 | target_model_latency_ms = 62.867 7 | 8 | n_iterations = stat["n_iterations"] 9 | n_draft_times = stat["n_draft_times"] 10 | n_generated_tokens = stat["n_generated_tokens"] 11 | 12 | latency_ms = n_iterations * target_model_latency_ms + (n_iterations + n_draft_times) * draft_model_latency_ms 13 | tokens_per_second = n_generated_tokens * 1000 / latency_ms 14 | 15 | return tokens_per_second 16 | 17 | 18 | leaderboard = sorted((get_score(stat), params, stat) for params, stat in database.items() if stat is not None) 19 | 20 | for score, params, stat in leaderboard: 21 | print(f"{score:.3f} '{format_params(params)}' {stat}") 22 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/acc.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_binary_head.comp" 5 | 6 | void main() { 7 | const uint idx = gl_GlobalInvocationID.x; 8 | if (idx >= p.ne) { 9 | return; 10 | } 11 | 12 | const uint offset = p.param3; 13 | const uint src1_i = idx - offset; 14 | const uint oz = src1_i / p.nb02; 15 | const uint oy = (src1_i - (oz * p.nb02)) / p.nb01; 16 | const uint ox = src1_i % p.nb01; 17 | 18 | if (ox < p.ne10 && oy < p.ne11 && oz < p.ne12) { 19 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) + FLOAT_TYPE(data_b[ox + oy * p.ne10 + oz * p.ne10 * p.ne11])); 20 | } else { 21 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)])); 22 | } 23 | } 24 | 25 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/get_rows.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_binary_head.comp" 5 | 6 | void main() { 7 | const uint i00 = gl_GlobalInvocationID.x; 8 | const uint i10 = gl_GlobalInvocationID.y; 9 | const uint i11 = (gl_GlobalInvocationID.z)/p.ne12; 10 | const uint i12 = (gl_GlobalInvocationID.z)%p.ne12; 11 | 12 | if (i00 >= p.ne00) { 13 | return; 14 | } 15 | 16 | const uint i01 = data_b[i10*p.nb10 + i11*p.nb11 + i12*p.nb12]; 17 | 18 | const uint a_offset = i01*p.nb01 + i11*p.nb02 + i12*p.nb03; 19 | const uint d_offset = i10*p.nb21 + i11*p.nb22 + i12*p.nb23; 20 | 21 | #ifndef OPTIMIZATION_ERROR_WORKAROUND 22 | data_d[d_offset + i00] = D_TYPE(data_a[a_offset + i00]); 23 | #else 24 | data_d[d_offset + i00] = data_a[a_offset + i00]; 25 | #endif 26 | } 27 | -------------------------------------------------------------------------------- /src/model/model_loader.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may 
not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include "model/model.hpp" 16 | 17 | namespace powerserve { 18 | 19 | auto load_model(const Path &model_dir) -> std::shared_ptr<Model>; 20 | 21 | } // namespace powerserve 22 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/repeat.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_unary_head.comp" 5 | 6 | uint src0_idx_mod(uint idx) { 7 | const uint i13 = idx / (p.ne12*p.ne11*p.ne10); 8 | const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10; 9 | const uint i12 = (idx - i13_offset) / (p.ne11*p.ne10); 10 | const uint i12_offset = i12*p.ne11*p.ne10; 11 | const uint i11 = (idx - i13_offset - i12_offset) / p.ne10; 12 | const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10; 13 | return (i13 % p.ne03)*p.nb03 + (i12 % p.ne02)*p.nb02 + (i11 % p.ne01)*p.nb01 + (i10 % p.ne00)*p.nb00; 14 | } 15 | 16 | void main() { 17 | const uint idx = get_idx(); 18 | 19 | if (idx >= p.ne) { 20 | return; 21 | } 22 | 23 | data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx_mod(idx)]); 24 | } 25 | -------------------------------------------------------------------------------- /tools/mmlu/README.md: -------------------------------------------------------------------------------- 1 | # MMLU Test 2 | - Download the test set: https://github.com/bobozi-cmd/mmlu/releases/download/publish/mmlu_dataset.zip 3 | 4 | - Unpack it into the same directory as the script `mmlu_test.py`: 5 | ``` 6 | > tree -L 1 ./ 7 | ./ 8 | ├── dev 9 | ├── mmlu_test.py 10 | ├── README.md 11 | ├── test 12 | └── val 13 | ``` 14 | 15 | - Start the server; assuming the IP address is 192.168.1.39, the model directory is build_llama_proj, and the port is 8080, the test command is: 16 | 17 | ```bash 18 | python ./mmlu_test.py --host 192.168.1.39 --port 8080 -s 1 --model build_llama_proj 19 | ``` 20 | 21 | - MMLU covers 57 subjects in total; `-s 30` selects the first 30 questions of each subject. If the `-s` parameter is omitted, the entire MMLU dataset is evaluated. 22 | - Running on pure CPU is slower, so consider a quick sanity check with `-s 1` first. 23 | -------------------------------------------------------------------------------- /src/backend/backend.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // WIP: direct pass funcs 16 | #pragma once 17 | 18 | namespace powerserve { 19 | 20 | struct Backend { 21 | virtual ~Backend() = default; 22 | }; 23 | 24 | } // namespace powerserve 25 | -------------------------------------------------------------------------------- /libs/ggml/src/kompute-shaders/op_diagmask.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | layout(local_size_x = 1) in; 6 | 7 | layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; 8 | layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; 9 | 10 | layout(push_constant) uniform PushConstants { 11 | uint inOff; 12 | uint outOff; 13 | uint n_past; 14 | int ne00; 15 | int ne01; 16 | } pcs; 17 | 18 | void main() { 19 | const uint i02 = gl_WorkGroupID.z; 20 | const uint i01 = gl_WorkGroupID.y; 21 | const uint i00 = gl_WorkGroupID.x; 22 | 23 | const uint index = i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00 + i00; 24 | 25 | if (i00 > pcs.n_past + i01) { 26 | out_[index + pcs.outOff] = uintBitsToFloat(0xFF800000); 27 | } else { 28 | out_[index + pcs.outOff] = in_[index + pcs.inOff]; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/gelu.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "generic_head.comp" 4 | #include "types.comp" 5 | 6 | #extension GL_EXT_control_flow_attributes : enable 7 | 8 | layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; 9 | 10 | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; 11 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 12 | 13 | void main() { 14 | const float GELU_COEF_A = 0.044715f; 15 | const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; 16 | const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; 17 | 18 | if (i >= p.KX) { 19 | return; 20 | } 21 | 22 | const float xi = float(data_a[i]); 23 | const float val = SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi); 24 | data_d[i] = D_TYPE(0.5f*xi*(2.0f - 2.0f / (exp(2 * val) + 1))); 25 | } 26 | -------------------------------------------------------------------------------- /assets/prompts/long_prompt.txt: -------------------------------------------------------------------------------- 1 | Imagine you are an AI assistant developed by Moonshot AI, named Kimi. You are designed to interact with users in both English and Chinese, providing helpful and accurate responses while adhering to safety and compliance standards. Your capabilities include processing long texts, handling various file formats, and accessing the internet for information. You are also programmed to refuse any requests that involve terrorism, racial discrimination, explicit content, or politically sensitive topics. 2 | Today, you are tasked with engaging in a conversation with a user who is interested in learning about the history of artificial intelligence. They have asked you to provide a brief overview of the key milestones in AI development, from its inception to the present day. Your response should be informative, concise, and engaging, ensuring that the user gains a clear understanding of AI's evolution. 
-------------------------------------------------------------------------------- /app/server/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(server.simple STATIC simple_server.cpp) 2 | target_link_libraries(server.simple PUBLIC 3 | powerserve 4 | httplib::httplib 5 | nlohmann_json::nlohmann_json 6 | concurrentqueue 7 | ) 8 | if (POWERSERVE_SERVER_MULTIMODEL) 9 | target_compile_definitions(server.simple PUBLIC POWERSERVE_SERVER_MULTIMODEL) 10 | endif () 11 | 12 | add_library(server.local STATIC local_server.cpp) 13 | target_link_libraries(server.local PUBLIC 14 | powerserve 15 | httplib::httplib 16 | nlohmann_json::nlohmann_json 17 | concurrentqueue 18 | ) 19 | if (POWERSERVE_SERVER_MULTIMODEL) 20 | target_compile_definitions(server.local PUBLIC POWERSERVE_SERVER_MULTIMODEL) 21 | endif () 22 | 23 | add_executable(server server.cpp) 24 | target_link_libraries(server PRIVATE 25 | powerserve 26 | app.common 27 | server.simple 28 | ) 29 | 30 | powerserve_add_binary(server) 31 | -------------------------------------------------------------------------------- /src/core/buffer.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #pragma once 16 | 17 | #include <memory> 18 | 19 | namespace powerserve { 20 | 21 | struct BaseBuffer { 22 | public: 23 | virtual ~BaseBuffer() = default; 24 | }; 25 | 26 | using BufferPtr = std::shared_ptr<BaseBuffer>; 27 | 28 | } // namespace powerserve 29 | -------------------------------------------------------------------------------- /libs/ggml/src/kompute-shaders/op_getrows_f32.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | layout(local_size_x = 1) in; 6 | 7 | layout (binding = 0) readonly buffer tensorInA { float inA[]; }; 8 | layout (binding = 1) readonly buffer tensorInB { int inB[]; }; 9 | layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; 10 | 11 | layout (push_constant) uniform parameter { 12 | uint inAOff; 13 | uint inBOff; 14 | uint outOff; 15 | int ne00; 16 | int nb01; 17 | int nb1; 18 | } pcs; 19 | 20 | void dequantize_row_f32(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) { 21 | for (int j = 0; j < k; j++) { 22 | out_[y + j] = inA[x + j]; 23 | } 24 | } 25 | 26 | void main() { 27 | const uint i = gl_WorkGroupID.x; 28 | const int r = inB[i + pcs.inBOff]; 29 | 30 | dequantize_row_f32(r*pcs.nb01/4 + pcs.inAOff, i*pcs.nb1/4 + pcs.outOff, pcs.ne00); 31 | } 32 | -------------------------------------------------------------------------------- /libs/ggml/src/kompute-shaders/op_getrows_f16.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | layout(local_size_x = 1) in; 6 | 7 | layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; }; 8 | layout (binding = 1) readonly buffer tensorInB { int inB[]; }; 9 | layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; 10 | 11 | layout (push_constant) uniform parameter { 12 | uint inAOff; 13 | uint inBOff; 14 | uint outOff; 15 | int ne00; 16 | int nb01; 17 | int nb1; 18 | } pcs; 19 | 20 | void dequantize_row_f16(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) { 21 | for (int j = 0; j < k; j++) { 22 | out_[y + j] = inA[x + j]; 23 | } 24 | } 25 | 26 | void main() { 27 | const uint i = gl_WorkGroupID.x; 28 | const int r = inB[i + pcs.inBOff]; 29 | 30 | dequantize_row_f16(r*pcs.nb01/2/*bytes for float16*/ + pcs.inAOff, i*pcs.nb1/4 + pcs.outOff, pcs.ne00); 31 | } 32 | -------------------------------------------------------------------------------- /docs/perfetto_tracing.md: -------------------------------------------------------------------------------- 1 | First, set `POWERSERVE_WITH_PERFETTO` to ON when compiling the code. 2 | 3 | Then explicitly start and stop tracing in your code, for example: 4 | 5 | ```c++ 6 | powerserve::PerfettoTrace::instance().start_tracing(32 * 1024); // Buffer size in KiB 7 | powerserve::TreeSpeculative spec(main_model, draft_model); 8 | spec.generate(tokenizer, sampler, prompt, n_predicts); 9 | spec.print_stat(); 10 | powerserve::PerfettoTrace::instance().stop_tracing("./perfetto.data"); // Will save to the perfetto.data file 11 | ``` 12 | 13 | Use `PerfettoTrace::begin` and `PerfettoTrace::end` to mark the start and end of a traced region, for example: 14 | 15 | ```c++ 16 | PerfettoTrace::begin("draft_model_forward"); 17 | auto logits = draft_model->forward({node.token}, {node.position}, CausalAttentionMask(1)); 18 | PerfettoTrace::end(); 19 | ``` 20 | 21 | `PerfettoTrace::counter` can be used to record a numeric value over time, which the Perfetto UI renders as a line graph. 22 | 23 | Finally, open the trace file at https://ui.perfetto.dev.
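For example, a per-iteration counter could be emitted like this (a minimal sketch: the exact signature of `PerfettoTrace::counter` is not shown above, so the `(name, value)` form and the surrounding decoding loop are assumptions for illustration):

```c++
// Hypothetical sketch: record the number of accepted draft tokens per
// decoding iteration as a counter track. counter(name, value) is assumed
// by analogy with PerfettoTrace::begin/end; spec.done() and spec.step()
// are invented stand-ins for a step-wise decoding loop.
while (!spec.done()) {
    const size_t n_accepted = spec.step();
    powerserve::PerfettoTrace::counter("n_accepted_draft_tokens", n_accepted);
}
```

The counter track then appears in the timeline alongside the begin/end slices.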
24 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: LLVM 2 | 3 | AccessModifierOffset: -4 4 | AlignAfterOpenBracket: BlockIndent 5 | AlignConsecutiveAssignments: true 6 | AlignConsecutiveBitFields: true 7 | AlignConsecutiveMacros: true 8 | AllowShortBlocksOnASingleLine: Empty 9 | AllowShortEnumsOnASingleLine: false 10 | AllowShortFunctionsOnASingleLine: Empty 11 | AllowShortIfStatementsOnASingleLine: false 12 | AlwaysBreakAfterReturnType: None 13 | AlwaysBreakTemplateDeclarations: Yes 14 | BinPackArguments: false 15 | BinPackParameters: false 16 | BreakAfterAttributes: Always 17 | BreakBeforeBraces: Attach 18 | BreakConstructorInitializers: AfterColon 19 | ColumnLimit: 120 20 | ContinuationIndentWidth: 4 21 | IncludeBlocks: Regroup 22 | IndentWidth: 4 23 | InsertNewlineAtEOF: true 24 | PackConstructorInitializers: CurrentLine 25 | PenaltyReturnTypeOnItsOwnLine: 100000 26 | ReflowComments: true 27 | SeparateDefinitionBlocks: Always 28 | ShortNamespaceLines: 0 29 | TabWidth: 4 30 | UseTab: Never 31 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/diag_mask_inf.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #extension GL_EXT_shader_16bit_storage : require 4 | #extension GL_EXT_control_flow_attributes : enable 5 | 6 | layout (push_constant) uniform parameter 7 | { 8 | uint ncols; 9 | uint rows_per_channel; 10 | uint n_past; 11 | } p; 12 | 13 | #include "types.comp" 14 | 15 | layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; 16 | 17 | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; 18 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 19 | 20 | void main() { 21 | const uint col = gl_GlobalInvocationID.y; 22 | const uint row = gl_GlobalInvocationID.x; 23 | 24 | if (col >= p.ncols) { 25 | return; 26 | } 27 | 28 | const uint i = row*p.ncols + col; 29 | if (col > p.n_past + row % p.rows_per_channel) { 30 | data_d[i] = D_TYPE(uintBitsToFloat(0xFF800000)); 31 | } else { 32 | data_d[i] = D_TYPE(data_a[i]); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/dequant_q8_0.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "dequant_head.comp" 4 | 5 | layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; 6 | 7 | layout (binding = 0) readonly buffer A {block_q8_0 data_a[];}; 8 | layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; 9 | 10 | void main() { 11 | const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64; 12 | 13 | const uint tid = gl_LocalInvocationID.x % 64; 14 | const uint il = tid/32; 15 | const uint ir = tid%32; 16 | const uint ib = 32*i + ir; 17 | if (ib >= p.nel / 32) { 18 | return; 19 | } 20 | 21 | const uint b_idx = 1024*i + 32*ir + 16*il; 22 | 23 | const float d = float(data_a[ib].d); 24 | 25 | const uint q_idx = 16*il; 26 | 27 | [[unroll]] for (uint l = 0; l < 16; l += 2) { 28 | data_b[b_idx + l ] = D_TYPE(d * data_a[ib].qs[q_idx + l ]); 29 | data_b[b_idx + l + 1] = D_TYPE(d * data_a[ib].qs[q_idx + l + 1]); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/pad.comp: 
-------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_unary_head.comp" 5 | 6 | void main() { 7 | const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; 8 | 9 | if (idx >= p.ne) { 10 | return; 11 | } 12 | 13 | const uint i3 = idx / (p.ne12*p.ne11*p.ne10); 14 | const uint i3_offset = i3 * p.ne12*p.ne11*p.ne10; 15 | const uint i2 = (idx - i3_offset) / (p.ne11*p.ne10); 16 | const uint i2_offset = i2*p.ne11*p.ne10; 17 | const uint i1 = (idx - i3_offset - i2_offset) / p.ne10; 18 | const uint i0 = idx - i3_offset - i2_offset - i1*p.ne10; 19 | 20 | const uint src0_idx = i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0*p.nb00; 21 | const uint dst_idx = i3*p.nb13 + i2*p.nb12 + i1*p.nb11 + i0*p.nb10; 22 | 23 | const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03; 24 | 25 | data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : 0.0f); 26 | } 27 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/dequant_q4_0.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "dequant_head.comp" 4 | 5 | layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; 6 | 7 | layout (binding = 0) readonly buffer A {block_q4_0 data_a[];}; 8 | layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; 9 | 10 | void main() { 11 | const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64; 12 | 13 | const uint tid = gl_LocalInvocationID.x % 64; 14 | const uint il = tid/32; 15 | const uint ir = tid%32; 16 | const uint ib = 32*i + ir; 17 | if (ib >= p.nel / 32) { 18 | return; 19 | } 20 | 21 | const uint q_idx = 8*il; 22 | const uint b_idx = 1024*i + 32*ir + q_idx; 23 | 24 | const float d = float(data_a[ib].d); 25 | 26 | [[unroll]] for (uint l = 0; l < 8; ++l) { 27 | data_b[b_idx + l + 0] = D_TYPE(d * ((data_a[ib].qs[q_idx + l] & 0xF) - 8.0f)); 28 | data_b[b_idx + l + 16] = D_TYPE(d * ((data_a[ib].qs[q_idx + l] >> 4) - 8.0f)); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/dequant_iq4_nl.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "dequant_head.comp" 4 | 5 | layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; 6 | 7 | layout (binding = 0) readonly buffer A {block_iq4_nl data_a[];}; 8 | layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; 9 | 10 | void main() { 11 | const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64; 12 | 13 | const uint tid = gl_LocalInvocationID.x % 64; 14 | const uint il = tid/32; 15 | const uint ir = tid%32; 16 | const uint ib = 32*i + ir; 17 | if (ib >= p.nel / 32) { 18 | return; 19 | } 20 | 21 | const uint q_idx = 8*il; 22 | const uint b_idx = 1024*i + 32*ir + q_idx; 23 | 24 | const float d = float(data_a[ib].d); 25 | 26 | [[unroll]] for (uint l = 0; l < 8; ++l) { 27 | data_b[b_idx + l + 0] = D_TYPE(d * kvalues_iq4nl[data_a[ib].qs[q_idx + l] & 0xF]); 28 | data_b[b_idx + l + 16] = D_TYPE(d * kvalues_iq4nl[data_a[ib].qs[q_idx + l] >> 4]); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/core/spin_barrier.h: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // 
Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #pragma once 16 | 17 | #if defined(__cplusplus) 18 | extern "C" { 19 | #endif 20 | 21 | #include <stddef.h> 22 | 23 | struct spin_barrier { 24 | size_t opaque[2]; 25 | }; 26 | 27 | void spin_barrier_init(struct spin_barrier *opaque, size_t width); 28 | void spin_barrier_wait(struct spin_barrier *opaque); 29 | 30 | #if defined(__cplusplus) 31 | } 32 | #endif 33 | -------------------------------------------------------------------------------- /libs/ggml/src/llamafile/sgemm.h: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #pragma once 16 | #include <stdint.h> 17 | #include <stdbool.h> 18 | #ifdef __cplusplus 19 | extern "C" { 20 | #endif 21 | 22 | bool llamafile_sgemm(int64_t, int64_t, int64_t, const void *, int64_t, 23 | const void *, int64_t, void *, int64_t, int, int, 24 | int, int, int); 25 | 26 | #ifdef __cplusplus 27 | } 28 | #endif 29 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/dequant_q4_1.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "dequant_head.comp" 4 | 5 | layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; 6 | 7 | layout (binding = 0) readonly buffer A {block_q4_1 data_a[];}; 8 | layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; 9 | 10 | void main() { 11 | const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64; 12 | 13 | const uint tid = gl_LocalInvocationID.x % 64; 14 | const uint il = tid/32; 15 | const uint ir = tid%32; 16 | const uint ib = 32*i + ir; 17 | if (ib >= p.nel / 32) { 18 | return; 19 | } 20 | 21 | const uint b_idx = 1024*i + 32*ir + 8*il; 22 | 23 | const float d = float(data_a[ib].d); 24 | const float m = float(data_a[ib].m); 25 | 26 | const uint q_idx = 8*il; 27 | 28 | [[unroll]] for (uint l = 0; l < 8; ++l) { 29 | data_b[b_idx + l + 0] = D_TYPE(d * (data_a[ib].qs[q_idx + l] & 0xF) + m); 30 | data_b[b_idx + l + 16] = D_TYPE(d * (data_a[ib].qs[q_idx + l] >> 4) + m); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/graph/node.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include "graph/node.hpp" 16 | 17 | namespace powerserve { 18 | 19 | auto Node::tensor() -> Tensor * { 20 | return dynamic_cast<Tensor *>(this); 21 | } 22 | 23 | auto Node::tensor_view() -> TensorViewNode * { 24 | return dynamic_cast<TensorViewNode *>(this); 25 | } 26 | 27 | auto Node::op() -> OpNode * { 28 | return dynamic_cast<OpNode *>(this); 29 | } 30 | 31 | } // namespace powerserve 32 | -------------------------------------------------------------------------------- /app/server/server.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #include "cmdline.hpp" 16 | #include "simple_server.hpp" 17 | 18 | #include 19 | 20 | int main(int argc, char *argv[]) { 21 | // 0. load config 22 | const powerserve::CommandLineArgument args = powerserve::parse_command_line("PowerServe CLI", argc, argv); 23 | 24 | simple_server_handler(args.work_folder, args.qnn_lib_folder, args.host, args.port); 25 | 26 | return 0; 27 | } 28 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/rope_norm.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "rope_head.comp" 4 | 5 | void main() { 6 | const uint col = gl_GlobalInvocationID.y * 2; 7 | const uint row = gl_GlobalInvocationID.x; 8 | 9 | if (col >= p.ncols) { 10 | return; 11 | } 12 | 13 | if (col >= p.n_dims) { 14 | const uint i = row*p.ncols + col; 15 | 16 | data_d[i + 0] = data_a[i + 0]; 17 | data_d[i + 1] = data_a[i + 1]; 18 | 19 | return; 20 | } 21 | 22 | const uint i = row*p.ncols + col; 23 | const uint i2 = row/p.p_delta_rows; 24 | 25 | const float theta_base = data_pos[i2] * pow(p.theta_scale, col/2.0f); 26 | 27 | const float freq_factor = p.has_ff != 0 ? data_ff[col/2] : 1.0f; 28 | 29 | float cos_theta, sin_theta; 30 | rope_yarn(theta_base / freq_factor, col, cos_theta, sin_theta); 31 | 32 | const float x0 = float(data_a[i + 0]); 33 | const float x1 = float(data_a[i + 1]); 34 | 35 | data_d[i + 0] = D_TYPE(x0*cos_theta - x1*sin_theta); 36 | data_d[i + 1] = D_TYPE(x0*sin_theta + x1*cos_theta); 37 | } 38 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/quantize.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common.cuh" 4 | #include "mmq.cuh" 5 | 6 | #include <cstdint> 7 | 8 | #define CUDA_QUANTIZE_BLOCK_SIZE 256 9 | #define CUDA_QUANTIZE_BLOCK_SIZE_MMQ 128 10 | 11 | static_assert(MATRIX_ROW_PADDING % CUDA_QUANTIZE_BLOCK_SIZE == 0, "Risk of out-of-bounds access."); 12 | static_assert(MATRIX_ROW_PADDING % (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ) == 0, "Risk of out-of-bounds access."); 13 | 14 | typedef void (*quantize_cuda_t)( 15 | const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded, 16 | const ggml_type type_x, cudaStream_t stream); 17 | 18 | void quantize_row_q8_1_cuda( 19 | const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded, 20 | const ggml_type type_x, cudaStream_t stream); 21 | 22 | void quantize_mmq_q8_1_cuda( 23 | const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded, 24 | const ggml_type type_x, cudaStream_t stream); 25 | -------------------------------------------------------------------------------- /src/core/defines.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #pragma once 16 | 17 | #define POWERSERVE_BUILTIN_EXPECT(expr, value) __builtin_expect((expr), (value)) 18 | #define POWERSERVE_LIKELY(expr) POWERSERVE_BUILTIN_EXPECT((expr), 1) 19 | #define POWERSERVE_UNLIKELY(expr) POWERSERVE_BUILTIN_EXPECT((expr), 0) 20 | 21 | #if !defined(ALWAYS_INLINE) 22 | #define ALWAYS_INLINE __attribute__((always_inline)) 23 | #endif 24 | 25 | #define POWERSERVE_UNUSED(x) ((void)(x)) 26 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/rope_neox.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "rope_head.comp" 4 | 5 | void main() { 6 | const uint col = gl_GlobalInvocationID.y * 2; 7 | const uint row = gl_GlobalInvocationID.x; 8 | 9 | if (col >= p.ncols) { 10 | return; 11 | } 12 | 13 | if (col >= p.n_dims) { 14 | const uint i = row*p.ncols + col; 15 | 16 | data_d[i + 0] = data_a[i + 0]; 17 | data_d[i + 1] = data_a[i + 1]; 18 | 19 | return; 20 | } 21 | 22 | const uint i = row*p.ncols + col/2; 23 | const uint i2 = row/p.p_delta_rows; 24 | 25 | const float theta_base = data_pos[i2] * pow(p.theta_scale, col/2.0f); 26 | 27 | const float freq_factor = p.has_ff != 0 ? data_ff[col/2] : 1.0f; 28 | 29 | float cos_theta, sin_theta; 30 | rope_yarn(theta_base / freq_factor, col, cos_theta, sin_theta); 31 | 32 | const float x0 = float(data_a[i + 0]); 33 | const float x1 = float(data_a[i + p.n_dims/2]); 34 | 35 | data_d[i + 0] = D_TYPE(x0*cos_theta - x1*sin_theta); 36 | data_d[i + p.n_dims/2] = D_TYPE(x0*sin_theta + x1*cos_theta); 37 | } 38 | -------------------------------------------------------------------------------- /libs/ggml/src/kompute-shaders/op_getrows_q4_0.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | #define NL 2 6 | #define BYTES_FOR_TYPE 4 /*bytes for float*/ 7 | #define SIZE_OF_BLOCK sizeof_block_q4_0 8 | 9 | layout(local_size_x = 1) in; 10 | 11 | layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; 12 | layout (binding = 1) readonly buffer tensorInB { int inB[]; }; 13 | layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; 14 | 15 | layout (push_constant) uniform parameter { 16 | uint inAOff; 17 | uint inBOff; 18 | uint outOff; 19 | int ne00; 20 | int nb01; 21 | int nb1; 22 | } pcs; 23 | 24 | block_q4_0 get_unaligned_block_q4_0(uint index) { 25 | block_q4_0 fres; 26 | fres.d = u8BufToFloat16(inA, index); 27 | [[unroll]] for (uint it = 0; it != QK4_0 / 2; it++) { 28 | fres.qs[it] = inA[index+2+it]; 29 | } 30 | return fres; 31 | } 32 | 33 | mat4 dequantize_block(uint index, uint il) { 34 | const block_q4_0 block = get_unaligned_block_q4_0(index); 35 | return dequantize_q4_0(block, il); 36 | } 37 | 38 | #include "op_getrows.comp" 39 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/sum_rows.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "generic_head.comp" 4 | #include "types.comp" 5 | 6 | #extension GL_EXT_control_flow_attributes : enable 7 | layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; 8 | 9 | layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; 10 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 11 | 12 | 
layout (constant_id = 0) const uint BLOCK_SIZE = 32; 13 | 14 | shared FLOAT_TYPE tmp[BLOCK_SIZE]; 15 | 16 | void main() { 17 | const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x; 18 | const uint col = gl_LocalInvocationID.x; 19 | 20 | tmp[col] = FLOAT_TYPE(0.0f); 21 | 22 | for (uint i = col; i < p.KX; i += BLOCK_SIZE) { 23 | tmp[col] += FLOAT_TYPE(data_a[row*p.KX + i]); 24 | } 25 | 26 | barrier(); 27 | [[unroll]] for (int s = int(BLOCK_SIZE) / 2; s > 0; s >>= 1) { 28 | if (col < s) { 29 | tmp[col] += tmp[col + s]; 30 | } 31 | barrier(); 32 | } 33 | 34 | if (col == 0) { 35 | data_d[row] = D_TYPE(tmp[0]); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /tools/end_to_end/powerserve.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Function to check if a command exists 4 | check_command() { 5 | if ! command -v "$1" &> /dev/null; then 6 | echo -e "\033[31m$1 could not be found. Please install it.\033[0m" 7 | exit 1 8 | fi 9 | } 10 | 11 | # Check necessary commands 12 | check_command "docker" 13 | check_command "adb" 14 | check_command "python3" 15 | check_command "pip" 16 | 17 | # echo -e "\033[36mChecking if the needed PowerServe docker image was updated...\033[0m" 18 | # do not show stdout or stderr 19 | sudo docker pull santoxin/mobile-build:v1.1 &> /dev/null 20 | 21 | # Check that the current directory is the PowerServe repository root 22 | if [ ! -d "tools" ]; then 23 | echo -e "\033[31mPlease run this script from the root directory of PowerServe.\033[0m" 24 | exit 1 25 | fi 26 | 27 | # Install the required Python packages quietly 28 | pip install requests huggingface_hub --quiet 29 | 30 | # Forward all arguments to the Python entry point 31 | # example: ./tools/powerserve.sh run llama-3.2-1b -> then call python ./tools/end_to_end/powerserve.py run llama-3.2-1b 32 | python3 ./tools/end_to_end/powerserve.py "$@" 33 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/get_rows_quant.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_binary_head.comp" 5 | #include "dequant_funcs.comp" 6 | 7 | void main() { 8 | const uint i00 = (gl_GlobalInvocationID.x)*2; 9 | const uint i10 = gl_GlobalInvocationID.y; 10 | const uint i11 = (gl_GlobalInvocationID.z)/p.ne12; 11 | const uint i12 = (gl_GlobalInvocationID.z)%p.ne12; 12 | 13 | if (i00 >= p.ne00) { 14 | return; 15 | } 16 | 17 | const uint i01 = data_b[i10*p.nb10 + i11*p.nb11 + i12*p.nb12]; 18 | 19 | const uint a_offset = i01*p.nb01 + i11*p.nb02 + i12*p.nb03; 20 | const uint d_offset = i10*p.nb21 + i11*p.nb22 + i12*p.nb23; 21 | 22 | const uint ib = a_offset + i00/QUANT_K; // block index 23 | const uint iqs = (i00%QUANT_K)/QUANT_R; // quant index 24 | const uint iybs = i00 - i00%QUANT_K; // dst block start index 25 | const uint y_offset = QUANT_R == 1 ?
1 : QUANT_K/2; 26 | 27 | vec2 v = dequantize(ib, iqs, 0); 28 | 29 | data_d[d_offset + iybs + iqs ] = D_TYPE(v.x); 30 | data_d[d_offset + iybs + iqs + y_offset] = D_TYPE(v.y); 31 | } 32 | -------------------------------------------------------------------------------- /libs/ggml/src/kompute-shaders/op_getrows_q4_1.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | #define NL 2 6 | #define BYTES_FOR_TYPE 4 /*bytes for float*/ 7 | #define SIZE_OF_BLOCK sizeof_block_q4_1 8 | 9 | layout(local_size_x = 1) in; 10 | 11 | layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; 12 | layout (binding = 1) readonly buffer tensorInB { int inB[]; }; 13 | layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; 14 | 15 | layout (push_constant) uniform parameter { 16 | uint inAOff; 17 | uint inBOff; 18 | uint outOff; 19 | int ne00; 20 | int nb01; 21 | int nb1; 22 | } pcs; 23 | 24 | block_q4_1 get_unaligned_block_q4_1(uint index) { 25 | block_q4_1 fres; 26 | fres.d = u8BufToFloat16(inA, index); 27 | fres.m = u8BufToFloat16(inA, index+2); 28 | [[unroll]] for (uint it = 0; it != QK4_1 / 2; it++) { 29 | fres.qs[it] = inA[index+4+it]; 30 | } 31 | return fres; 32 | } 33 | 34 | mat4 dequantize_block(uint index, uint il) { 35 | const block_q4_1 block = get_unaligned_block_q4_1(index); 36 | return dequantize_q4_1(block, il); 37 | } 38 | 39 | #include "op_getrows.comp" 40 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/scale.cu: -------------------------------------------------------------------------------- 1 | #include "scale.cuh" 2 | 3 | static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) { 4 | const int i = blockDim.x*blockIdx.x + threadIdx.x; 5 | 6 | if (i >= k) { 7 | return; 8 | } 9 | 10 | dst[i] = scale * x[i]; 11 | } 12 | 13 | static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) { 14 | const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE; 15 | scale_f32<<>>(x, dst, scale, k); 16 | } 17 | 18 | void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { 19 | const ggml_tensor * src0 = dst->src[0]; 20 | const float * src0_d = (const float *)src0->data; 21 | float * dst_d = (float *)dst->data; 22 | cudaStream_t stream = ctx.stream(); 23 | 24 | GGML_ASSERT(src0->type == GGML_TYPE_F32); 25 | GGML_ASSERT( dst->type == GGML_TYPE_F32); 26 | 27 | float scale; 28 | memcpy(&scale, dst->op_params, sizeof(float)); 29 | 30 | scale_f32_cuda(src0_d, dst_d, scale, ggml_nelements(src0), stream); 31 | } 32 | -------------------------------------------------------------------------------- /tools/convert_hf_to_gguf/gguf-py/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Georgi Gerganov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | 
copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /tools/cos_sim.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | try: 5 | import numpy as np 6 | 7 | def cos_sim(a, b): 8 | return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)) 9 | 10 | except ImportError: 11 | # Fall back to pure Python when NumPy is not available. 12 | def dot_product(a, b): 13 | return sum(x * y for x, y in zip(a, b)) 14 | 15 | def norm(a): 16 | return sum(x * x for x in a) ** 0.5 17 | 18 | def cos_sim(a, b): 19 | return dot_product(a, b) / (norm(a) * norm(b)) 20 | 21 | 22 | if __name__ == "__main__": 23 | 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("--f1", type=str) 26 | parser.add_argument("--f2", type=str) 27 | args = parser.parse_args() 28 | 29 | file1 = args.f1 30 | file2 = args.f2 31 | 32 | with open(file1, "r") as f1, open(file2, "r") as f2: 33 | lines1 = f1.readlines() 34 | lines2 = f2.readlines() 35 | 36 | a = [float(line.replace("\n", "")) for line in lines1 if line.replace("\n", "").strip()] 37 | b = [float(line.replace("\n", "")) for line in lines2 if line.replace("\n", "").strip()] 38 | 39 | assert len(a) == len(b), "the two files must contain the same number of values!" 40 | print(cos_sim(a, b)) 41 | -------------------------------------------------------------------------------- /libs/ggml/src/kompute-shaders/op_mul_mat_q4_0.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | #define BLOCKS_IN_QUANT QK4_0 6 | #define SIZE_OF_BLOCK sizeof_block_q4_0 7 | #define N_ROWS 4 8 | 9 | #include "op_mul_mv_q_n_pre.comp" 10 | 11 | // The q4_0 version of this function 12 | float block_q_n_dot_y(uint block_index, uint yb, uint il) { 13 | vec2 acc = vec2(0.0, 0.0); 14 | const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff; 15 | float d = float(u8BufToFloat16(inA, index)); 16 | float sumy = 0.0f; 17 | for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) { 18 | const uint16_t b = u8BufToU16(inA, index + 2 + il + i); 19 | 20 | const float yl0 = inB[yb + i]; 21 | const float yl1 = inB[yb + i + 1]; 22 | const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2]; 23 | const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1]; 24 | 25 | sumy += yl0 + yl1 + yl8 + yl9; 26 | 27 | acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00); 28 | acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000); 29 | } 30 | return d * (sumy * -8.f + acc[0] + acc[1]); 31 | } 32 | 33 | #include "op_mul_mv_q_n.comp" 34 | -------------------------------------------------------------------------------- /src/model/module/ffn.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #pragma once 16 | 17 | #include "graph/graph.hpp" 18 | #include "model/common/weights.hpp" 19 | 20 | namespace powerserve { 21 | 22 | struct FFN { 23 | private: 24 | const ModelConfig::LLMConfig &m_config; 25 | std::shared_ptr m_weights; 26 | 27 | public: 28 | FFN(const ModelConfig::LLMConfig &config, std::shared_ptr weights) : m_config(config), m_weights(weights) {} 29 | 30 | public: 31 | TensorNode *build(Graph &g, TensorNode *attn_o, int64_t L); 32 | }; 33 | 34 | } // namespace powerserve 35 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/vendors/cuda.h: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #pragma once 16 | 17 | #include <cuda_runtime.h> 18 | #include <cuda.h> 19 | #include <cublas_v2.h> 20 | #include <cuda_fp16.h> 21 | 22 | #if CUDART_VERSION < 11020 23 | #define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED 24 | #define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH 25 | #define CUBLAS_COMPUTE_16F CUDA_R_16F 26 | #define CUBLAS_COMPUTE_32F CUDA_R_32F 27 | #define cublasComputeType_t cudaDataType_t 28 | #endif // CUDART_VERSION < 11020 29 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/dequant_q5_0.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "dequant_head.comp" 4 | 5 | layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; 6 | 7 | layout (binding = 0) readonly buffer A {block_q5_0 data_a[];}; 8 | layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; 9 | 10 | void main() { 11 | const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64; 12 | 13 | const uint tid = gl_LocalInvocationID.x % 64; 14 | const uint il = tid/32; 15 | const uint ir = tid%32; 16 | const uint ib = 32*i + ir; 17 | if (ib >= p.nel / 32) { 18 | return; 19 | } 20 | 21 | const uint b_idx = 1024*i + 32*ir + 8*il; 22 | 23 | const float d = float(data_a[ib].d); 24 | const uint qh = uint(data_a[ib].qh[1]) << 16 | data_a[ib].qh[0]; 25 | 26 | const uint q_idx = 8*il; 27 | 28 | [[unroll]] for (uint l = 0; l < 8; ++l) { 29 | const uint iqs = q_idx + l; 30 | const uint vui = uint(data_a[ib].qs[iqs]); 31 | data_b[b_idx + l + 0] = D_TYPE(d * (((vui & 0xF) | (((qh >> iqs) << 4) & 0x10)) - 16.0f)); 32 | data_b[b_idx + l + 16] = D_TYPE(d * (((vui >> 4) | ((qh >> (iqs + 12)) & 0x10)) - 16.0f)); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cann/kernels/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if (NOT SOC_TYPE) 2 | set (SOC_TYPE "Ascend910B3") 3 | endif() 4 | 5 | file(GLOB SRC_FILES 6 | get_row_f32.cpp 7 | get_row_f16.cpp 8 | get_row_q4_0.cpp 9 | get_row_q8_0.cpp 10 | quantize_f32_q8_0.cpp 11 | quantize_f16_q8_0.cpp 12 | quantize_float_to_q4_0.cpp 13 | dup.cpp 14 | ) 15 | 16 | string(TOLOWER ${SOC_TYPE} SOC_VERSION) 17 | set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR}) 18 | set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim") 19 | 20 | if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) 21 | set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) 22 | elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake) 23 | set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake) 24 | else() 25 | message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the compiler package is installed.") 26 | endif() 27 | include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) 28 | 29 | ascendc_library(ascendc_kernels STATIC 30 | ${SRC_FILES} 31 | ) 32 | 33 | # ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP) 34 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/dequant_q5_1.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "dequant_head.comp" 4 | 5 | layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; 6 | 7 | layout (binding = 0) readonly buffer A
{block_q5_1 data_a[];}; 8 | layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; 9 | 10 | void main() { 11 | const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64; 12 | 13 | const uint tid = gl_LocalInvocationID.x % 64; 14 | const uint il = tid/32; 15 | const uint ir = tid%32; 16 | const uint ib = 32*i + ir; 17 | if (ib >= p.nel / 32) { 18 | return; 19 | } 20 | 21 | const uint b_idx = 1024*i + 32*ir + 8*il; 22 | 23 | const float d = float(data_a[ib].d); 24 | const float m = float(data_a[ib].m); 25 | const uint qh = data_a[ib].qh; 26 | 27 | const uint q_idx = 8*il; 28 | 29 | [[unroll]] for (uint l = 0; l < 8; ++l) { 30 | const uint iqs = q_idx + l; 31 | const uint vui = uint(data_a[ib].qs[iqs]); 32 | data_b[b_idx + l + 0] = D_TYPE(d * (((vui & 0xF) | (((qh >> iqs) << 4) & 0x10))) + m); 33 | data_b[b_idx + l + 16] = D_TYPE(d * (((vui >> 4) | ((qh >> (iqs + 12)) & 0x10))) + m); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "libs/json"] 2 | path = libs/json 3 | url = https://github.com/nlohmann/json.git 4 | [submodule "libs/fmt"] 5 | path = libs/fmt 6 | url = https://github.com/fmtlib/fmt.git 7 | [submodule "libs/cli11"] 8 | path = libs/cli11 9 | url = https://github.com/CLIUtils/CLI11.git 10 | [submodule "libs/libuv"] 11 | path = libs/libuv 12 | url = https://github.com/libuv/libuv.git 13 | [submodule "libs/xtensor"] 14 | path = libs/xtensor 15 | url = https://github.com/xtensor-stack/xtensor.git 16 | [submodule "libs/xtl"] 17 | path = libs/xtl 18 | url = https://github.com/xtensor-stack/xtl.git 19 | [submodule "tools/mmmu_test/mmmu"] 20 | path = tools/mmmu_test/mmmu 21 | url = https://github.com/MMMU-Benchmark/MMMU.git 22 | [submodule "libs/stb_headers/stb"] 23 | path = libs/stb_headers/stb 24 | url = https://github.com/nothings/stb.git 25 | [submodule "libs/cpp-httplib"] 26 | path = libs/cpp-httplib 27 | url = https://github.com/yhirose/cpp-httplib.git 28 | [submodule "libs/concurrentqueue"] 29 | path = libs/concurrentqueue 30 | url = https://github.com/cameron314/concurrentqueue 31 | [submodule "libs/perfetto"] 32 | path = libs/perfetto 33 | url = https://android.googlesource.com/platform/external/perfetto 34 | -------------------------------------------------------------------------------- /src/graph/op_type.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #pragma once 16 | 17 | namespace powerserve { 18 | 19 | enum class OpType { 20 | NONE = 0, 21 | 22 | ADD, 23 | MAT_MUL, 24 | RMS_NORM, 25 | SILU_HADAMARD, 26 | ROPE, 27 | SOFTMAX, 28 | COPY, 29 | 30 | #if defined(POWERSERVE_WITH_QNN) 31 | QNN_FORWARD, 32 | QNN_FORWARD_VL, 33 | #endif 34 | 35 | PRINT, 36 | GET_EMBEDDING, 37 | ADD_CACHE, 38 | PERMUTE, 39 | CONT, 40 | VIEW, 41 | SOFTMAX_EXT, 42 | GET_MASK, 43 | TRANSPOSE, 44 | INSERT_IMG_EMBEDDIGN, 45 | }; 46 | 47 | } // namespace powerserve 48 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/timestep_embedding.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #extension GL_EXT_shader_16bit_storage : require 4 | 5 | layout (push_constant) uniform parameter 6 | { 7 | uint nb1; 8 | uint dim; 9 | uint max_period; 10 | } p; 11 | 12 | #include "types.comp" 13 | 14 | #extension GL_EXT_control_flow_attributes : enable 15 | #define BLOCK_SIZE 256 16 | 17 | layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in; 18 | 19 | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; 20 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 21 | 22 | void main() { 23 | const uint i = gl_WorkGroupID.y; 24 | const uint j = gl_GlobalInvocationID.x; 25 | const uint d_offset = i * p.nb1; 26 | 27 | if (p.dim % 2 != 0 && j == ((p.dim + 1) / 2)) { 28 | data_d[d_offset + p.dim] = 0.f; 29 | } 30 | 31 | const uint half_dim = p.dim / 2; 32 | if (j >= half_dim) { 33 | return; 34 | } 35 | 36 | const float timestep = float(data_a[i]); 37 | const float freq = float(exp(-log(p.max_period) * j / half_dim)); 38 | const float arg = timestep * freq; 39 | data_d[d_offset + j] = D_TYPE(cos(arg)); 40 | data_d[d_offset + j + half_dim] = D_TYPE(sin(arg)); 41 | } 42 | -------------------------------------------------------------------------------- /libs/ggml/src/kompute-shaders/op_mul_mat_q4_1.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | #define BLOCKS_IN_QUANT QK4_1 6 | #define SIZE_OF_BLOCK sizeof_block_q4_1 7 | #define N_ROWS 4 8 | 9 | #include "op_mul_mv_q_n_pre.comp" 10 | 11 | // The q4_1 version of this function 12 | float block_q_n_dot_y(uint block_index, uint yb, uint il) { 13 | vec2 acc = vec2(0.0, 0.0); 14 | const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff; 15 | float d = float(u8BufToFloat16(inA, index)); 16 | float m = float(u8BufToFloat16(inA, index+2)); 17 | 18 | float sumy = 0.0f; 19 | for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) { 20 | const uint16_t b = u8BufToU16(inA, index + 4 + il + i); 21 | 22 | const float yl0 = inB[yb + i]; 23 | const float yl1 = inB[yb + i + 1]; 24 | const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2]; 25 | const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1]; 26 | 27 | sumy += yl0 + yl1 + yl8 + yl9; 28 | 29 | acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00); 30 | acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000); 31 | } 32 | return d * (acc[0] + acc[1]) + sumy * m; 33 | } 34 | 35 | #include "op_mul_mv_q_n.comp" 36 | -------------------------------------------------------------------------------- /tools/convert_hf_to_gguf/gguf-py/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "gguf" 3 | version = "0.10.0" 4 | description = "Read and write ML models in GGUF for GGML" 5 | 
authors = ["GGML <ggml@ggml.ai>"] 6 | packages = [ 7 | {include = "gguf"}, 8 | {include = "gguf/py.typed"}, 9 | {include = "scripts"}, 10 | ] 11 | readme = "README.md" 12 | homepage = "https://ggml.ai" 13 | repository = "https://github.com/ggerganov/llama.cpp" 14 | keywords = ["ggml", "gguf", "llama.cpp"] 15 | classifiers = [ 16 | "Programming Language :: Python :: 3", 17 | "License :: OSI Approved :: MIT License", 18 | "Operating System :: OS Independent", 19 | ] 20 | 21 | [tool.poetry.dependencies] 22 | python = ">=3.8" 23 | numpy = ">=1.17" 24 | tqdm = ">=4.27" 25 | pyyaml = ">=5.1" 26 | sentencepiece = ">=0.1.98,<=0.2.0" 27 | 28 | [tool.poetry.dev-dependencies] 29 | pytest = "^5.2" 30 | 31 | [build-system] 32 | requires = ["poetry-core>=1.0.0"] 33 | build-backend = "poetry.core.masonry.api" 34 | 35 | [tool.poetry.scripts] 36 | gguf-convert-endian = "scripts:gguf_convert_endian_entrypoint" 37 | gguf-dump = "scripts:gguf_dump_entrypoint" 38 | gguf-set-metadata = "scripts:gguf_set_metadata_entrypoint" 39 | gguf-new-metadata = "scripts:gguf_new_metadata_entrypoint" 40 | -------------------------------------------------------------------------------- /src/speculative/speculative_config.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #pragma once 16 | 17 | #include "core/typedefs.hpp" 18 | 19 | namespace powerserve { 20 | 21 | struct SpeculativeConfig { 22 | size_t draft_batch_size = 12; 23 | 24 | struct { 25 | size_t top_k = 15; 26 | float temperature = 1.5f; 27 | float p_base = 0.9f; 28 | } draft_sampler; 29 | 30 | struct { 31 | size_t max_fan_out = 3; 32 | float min_prob = 0.2f; 33 | bool early_stop = true; 34 | bool debug = false; 35 | } token_tree; 36 | }; 37 | 38 | } // namespace powerserve 39 | -------------------------------------------------------------------------------- /src/core/getenv.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #pragma once 16 | 17 | #include <string> 18 | 19 | namespace powerserve { 20 | 21 | template <typename T> 22 | T getenv(const std::string &name, const T &default_value) { 23 | auto env = ::getenv(name.c_str()); 24 | if (env) { 25 | if constexpr (std::is_integral_v<T>) { 26 | return atoll(env); 27 | } else if constexpr (std::is_floating_point_v<T>) { 28 | return atof(env); 29 | } else { 30 | return std::string(env); 31 | } 32 | } else { 33 | return default_value; 34 | } 35 | } 36 | 37 | } // namespace powerserve 38 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/upscale.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | layout (push_constant) uniform parameter 4 | { 5 | uint ne; uint d_offset; 6 | uint nb00; uint nb01; uint nb02; uint nb03; 7 | uint ne10; uint ne11; uint ne12; uint ne13; 8 | float sf0; float sf1; float sf2; float sf3; 9 | } p; 10 | 11 | #include "types.comp" 12 | 13 | layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; 14 | 15 | layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; 16 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 17 | 18 | void main() { 19 | const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; 20 | 21 | if (idx >= p.ne) { 22 | return; 23 | } 24 | 25 | const uint i10 = idx % p.ne10; 26 | const uint i11 = (idx / p.ne10) % p.ne11; 27 | const uint i12 = (idx / (p.ne10 * p.ne11)) % p.ne12; 28 | const uint i13 = (idx / (p.ne10 * p.ne11 * p.ne12)) % p.ne13; 29 | 30 | const uint i00 = uint(i10 / p.sf0); 31 | const uint i01 = uint(i11 / p.sf1); 32 | const uint i02 = uint(i12 / p.sf2); 33 | const uint i03 = uint(i13 / p.sf3); 34 | 35 | data_d[p.d_offset + idx] = D_TYPE(data_a[i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00]); 36 | } 37 | -------------------------------------------------------------------------------- /tools/convert_hf_to_gguf/gguf-py/examples/writer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | 7 | 8 | # Necessary to load the local gguf package 9 | sys.path.insert(0, str(Path(__file__).parent.parent)) 10 | 11 | from gguf import GGUFWriter # noqa: E402 12 | 13 | 14 | # Example usage: 15 | def writer_example() -> None: 16 | # Example usage with a file 17 | gguf_writer = GGUFWriter("example.gguf", "llama") 18 | 19 | gguf_writer.add_block_count(12) 20 | gguf_writer.add_uint32("answer", 42) # Write a 32-bit integer 21 | gguf_writer.add_float32("answer_in_float", 42.0) # Write a 32-bit float 22 | gguf_writer.add_custom_alignment(64) 23 | 24 | tensor1 = np.ones((32,), dtype=np.float32) * 100.0 25 | tensor2 = np.ones((64,), dtype=np.float32) * 101.0 26 | tensor3 = np.ones((96,), dtype=np.float32) * 102.0 27 | 28 | gguf_writer.add_tensor("tensor1", tensor1) 29 | gguf_writer.add_tensor("tensor2", tensor2) 30 | gguf_writer.add_tensor("tensor3", tensor3) 31 | 32 | gguf_writer.write_header_to_file() 33 | gguf_writer.write_kv_data_to_file() 34 | gguf_writer.write_tensors_to_file() 35 | 36 | gguf_writer.close() 37 | 38 | 39 | if __name__ == "__main__": 40 | writer_example() 41 | -------------------------------------------------------------------------------- /libs/ggml/include/ggml-blas.h: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 |
3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #pragma once 16 | 17 | #include "ggml.h" 18 | #include "ggml-backend.h" 19 | 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | // backend API 26 | GGML_API GGML_CALL ggml_backend_t ggml_backend_blas_init(void); 27 | 28 | GGML_API GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend); 29 | 30 | // number of threads used for conversion to float 31 | // for openblas and blis, this will also set the number of threads used for blas operations 32 | GGML_API GGML_CALL void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads); 33 | 34 | 35 | #ifdef __cplusplus 36 | } 37 | #endif 38 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-sycl/conv.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // 16 | // MIT license 17 | // Copyright (C) 2024 Intel Corporation 18 | // SPDX-License-Identifier: MIT 19 | // 20 | 21 | // 22 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 23 | // See https://llvm.org/LICENSE.txt for license information. 24 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 25 | // 26 | 27 | #ifndef GGML_SYCL_CONV_HPP 28 | #define GGML_SYCL_CONV_HPP 29 | 30 | #include "common.hpp" 31 | 32 | void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, 33 | const ggml_tensor *src1, ggml_tensor *dst); 34 | 35 | #endif // GGML_SYCL_CONV_HPP 36 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/clamp.cu: -------------------------------------------------------------------------------- 1 | #include "clamp.cuh" 2 | 3 | static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) { 4 | const int i = blockDim.x*blockIdx.x + threadIdx.x; 5 | 6 | if (i >= k) { 7 | return; 8 | } 9 | 10 | dst[i] = x[i] < min ? min : (x[i] > max ? 
max : x[i]); 11 | } 12 | 13 | static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) { 14 | const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE; 15 | clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k); 16 | } 17 | 18 | 19 | void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { 20 | const ggml_tensor * src0 = dst->src[0]; 21 | const float * src0_d = (const float *)src0->data; 22 | float * dst_d = (float *)dst->data; 23 | cudaStream_t stream = ctx.stream(); 24 | 25 | GGML_ASSERT(src0->type == GGML_TYPE_F32); 26 | GGML_ASSERT( dst->type == GGML_TYPE_F32); 27 | 28 | float min; 29 | float max; 30 | memcpy(&min, dst->op_params, sizeof(float)); 31 | memcpy(&max, (float *) dst->op_params + 1, sizeof(float)); 32 | 33 | clamp_f32_cuda(src0_d, dst_d, min, max, ggml_nelements(src0), stream); 34 | } 35 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-sycl/tsembd.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // 16 | // MIT license 17 | // Copyright (C) 2024 Intel Corporation 18 | // SPDX-License-Identifier: MIT 19 | // 20 | 21 | // 22 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 23 | // See https://llvm.org/LICENSE.txt for license information.
24 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 25 | // 26 | 27 | #ifndef GGML_SYCL_TSEMBD_HPP 28 | #define GGML_SYCL_TSEMBD_HPP 29 | 30 | #include "common.hpp" 31 | 32 | void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, 33 | const ggml_tensor *src1, ggml_tensor * dst); 34 | 35 | #endif // GGML_SYCL_TSEMBD_HPP 36 | -------------------------------------------------------------------------------- /tools/qnn_converter/graph_params.py: -------------------------------------------------------------------------------- 1 | class GraphParams: 2 | batch_size: int 3 | cache_size: int 4 | context_size: int 5 | 6 | 7 | class Batch1_Params(GraphParams): 8 | batch_size = 1 9 | cache_size = 1920 10 | context_size = 2048 11 | 12 | 13 | class Batch4_Params(GraphParams): 14 | batch_size = 4 15 | cache_size = 1920 16 | context_size = 2048 17 | 18 | 19 | class Batch8_Params(GraphParams): 20 | batch_size = 8 21 | cache_size = 1920 22 | context_size = 2048 23 | 24 | 25 | class Batch12_Params(GraphParams): 26 | batch_size = 12 27 | cache_size = 1920 28 | context_size = 2048 29 | 30 | 31 | class Batch16_Params(GraphParams): 32 | batch_size = 16 33 | cache_size = 1920 34 | context_size = 2048 35 | 36 | 37 | class Batch32_Params(GraphParams): 38 | batch_size = 32 39 | cache_size = 1920 40 | context_size = 2048 41 | 42 | 43 | class Batch128_Params(GraphParams): 44 | batch_size = 128 45 | cache_size = 1920 46 | context_size = 2048 47 | 48 | 49 | graph_map: dict[str, type[GraphParams]] = { 50 | "batch_1": Batch1_Params, 51 | "batch_4": Batch4_Params, 52 | "batch_8": Batch8_Params, 53 | "batch_12": Batch12_Params, 54 | "batch_16": Batch16_Params, 55 | "batch_32": Batch32_Params, 56 | "batch_128": Batch128_Params, 57 | } 58 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-sycl/concat.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // 16 | // MIT license 17 | // Copyright (C) 2024 Intel Corporation 18 | // SPDX-License-Identifier: MIT 19 | // 20 | 21 | // 22 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 23 | // See https://llvm.org/LICENSE.txt for license information.
24 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 25 | // 26 | 27 | #ifndef GGML_SYCL_CONCAT_HPP 28 | #define GGML_SYCL_CONCAT_HPP 29 | 30 | #include "common.hpp" 31 | 32 | void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, 33 | const ggml_tensor *src1, ggml_tensor *dst); 34 | 35 | #endif // GGML_SYCL_CONCAT_HPP 36 | -------------------------------------------------------------------------------- /libs/llama_tokenizer/unicode-data.h: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #pragma once 16 | 17 | #include <cstdint> 18 | #include <vector> 19 | #include <unordered_map> 20 | #include <unordered_set> 21 | 22 | struct range_nfd { 23 | uint32_t first; 24 | uint32_t last; 25 | uint32_t nfd; 26 | }; 27 | 28 | static const uint32_t MAX_CODEPOINTS = 0x110000; 29 | 30 | extern const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags; 31 | extern const std::unordered_set<uint32_t> unicode_set_whitespace; 32 | extern const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase; 33 | extern const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase; 34 | extern const std::vector<range_nfd> unicode_ranges_nfd; 35 | -------------------------------------------------------------------------------- /src/core/typedefs.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #pragma once 16 | 17 | #include "llama-vocab.h" 18 | 19 | #include <array> 20 | #include <filesystem> 21 | 22 | namespace powerserve { 23 | 24 | using Path = std::filesystem::path; 25 | using Token = llama_vocab::id; 26 | 27 | static constexpr size_t max_n_dims = 4; 28 | using Shape = std::array<size_t, max_n_dims>; 29 | using Stride = std::array<size_t, max_n_dims>; 30 | 31 | struct Noncopyable { 32 | Noncopyable(const Noncopyable &) = delete; 33 | auto operator=(const Noncopyable &) = delete; 34 | 35 | protected: 36 | Noncopyable() = default; 37 | ~Noncopyable() = default; 38 | }; 39 | 40 | } // namespace powerserve 41 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/arange.cu: -------------------------------------------------------------------------------- 1 | #include "arange.cuh" 2 | 3 | static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) { 4 | // blockIdx.x: idx of ne0 / BLOCK_SIZE 5 | int nidx = threadIdx.x + blockIdx.x * blockDim.x; 6 | if (nidx >= ne0) { 7 | return; 8 | } 9 | dst[nidx] = start + step * nidx; 10 | } 11 | 12 | static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) { 13 | int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE; 14 | arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start, step); 15 | } 16 | 17 | void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { 18 | float * dst_d = (float *)dst->data; 19 | cudaStream_t stream = ctx.stream(); 20 | 21 | GGML_ASSERT(dst->type == GGML_TYPE_F32); 22 | 23 | float start; 24 | float stop; 25 | float step; 26 | memcpy(&start, (float *)dst->op_params + 0, sizeof(float)); 27 | memcpy(&stop, (float *)dst->op_params + 1, sizeof(float)); 28 | memcpy(&step, (float *)dst->op_params + 2, sizeof(float)); 29 | 30 | int64_t steps = (int64_t)ceil((stop - start) / step); 31 | GGML_ASSERT(ggml_nelements(dst) == steps); 32 | 33 | arange_f32_cuda(dst_d, dst->ne[0], start, step, stream); 34 | } 35 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cuda/sumrows.cu: -------------------------------------------------------------------------------- 1 | #include "sumrows.cuh" 2 | 3 | static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) { 4 | const int row = blockIdx.x; 5 | const int col = threadIdx.x; 6 | 7 | float sum = 0.0f; 8 | for (int i = col; i < ncols; i += blockDim.x) { 9 | sum += x[row * ncols + i]; 10 | } 11 | 12 | sum = warp_reduce_sum(sum); 13 | 14 | if (col == 0) { 15 | dst[row] = sum; 16 | } 17 | } 18 | 19 | void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) { 20 | const dim3 block_dims(WARP_SIZE, 1, 1); 21 | const dim3 block_nums(nrows, 1, 1); 22 | k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols); 23 | } 24 | 25 | void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { 26 | const ggml_tensor * src0 = dst->src[0]; 27 | const float * src0_d = (const float *)src0->data; 28 | float * dst_d = (float *)dst->data; 29 | cudaStream_t stream = ctx.stream(); 30 | 31 | GGML_ASSERT(src0->type == GGML_TYPE_F32); 32 | GGML_ASSERT( dst->type == GGML_TYPE_F32); 33 | GGML_ASSERT(ggml_is_contiguous(src0)); 34 | 35 | const int64_t ncols = src0->ne[0]; 36 | const int64_t nrows = ggml_nrows(src0); 37 | 38 | sum_rows_f32_cuda(src0_d, dst_d, ncols, nrows, stream); 39 | } 40 | --------------------------------------------------------------------------------
/libs/ggml/src/ggml-sycl/rope.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // 16 | // MIT license 17 | // Copyright (C) 2024 Intel Corporation 18 | // SPDX-License-Identifier: MIT 19 | // 20 | 21 | // 22 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 23 | // See https://llvm.org/LICENSE.txt for license information. 24 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 25 | // 26 | 27 | #ifndef GGML_SYCL_ROPE_HPP 28 | #define GGML_SYCL_ROPE_HPP 29 | 30 | #include "common.hpp" 31 | 32 | void ggml_sycl_op_rope( 33 | ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, 34 | const float *src0_dd, const float *src1_dd, float *dst_dd, const queue_ptr &main_stream); 35 | 36 | #endif // GGML_SYCL_ROPE_HPP 37 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(powerserve STATIC) 2 | target_include_directories(powerserve PUBLIC .) 3 | 4 | target_link_libraries(powerserve PUBLIC 5 | fmt 6 | ggml 7 | llama_tokenizer 8 | nlohmann_json::nlohmann_json 9 | ) 10 | 11 | if (NOT OHOS) 12 | message(STATUS "Introducing libuv (uv_a)") 13 | target_link_libraries(powerserve PUBLIC uv_a) 14 | endif() 15 | 16 | if (POWERSERVE_WITH_PERFETTO) 17 | target_link_libraries(powerserve PRIVATE perfetto) 18 | endif() 19 | 20 | if (POWERSERVE_ANDROID_LOG) 21 | add_compile_definitions(POWERSERVE_ANDROID_LOG) 22 | target_link_libraries(powerserve PUBLIC log) 23 | endif () 24 | 25 | if (NOT MSVC) 26 | target_compile_options(powerserve PRIVATE 27 | -Wall 28 | -Wextra 29 | 30 | -Wno-unused-function 31 | ) 32 | 33 | if (POWERSERVE_ENABLE_WERROR) 34 | target_compile_options(powerserve PRIVATE -Werror) 35 | endif() 36 | endif() 37 | 38 | if(CMAKE_CXX_COMPILER_ID MATCHES "GNU") 39 | target_compile_options(powerserve PRIVATE -Wno-unknown-pragmas) 40 | endif() 41 | 42 | add_subdirectory(storage) 43 | add_subdirectory(backend) 44 | add_subdirectory(core) 45 | add_subdirectory(executor) 46 | add_subdirectory(graph) 47 | add_subdirectory(model) 48 | add_subdirectory(sampler) 49 | if (POWERSERVE_WITH_QNN) 50 | add_subdirectory(speculative) 51 | endif() 52 | add_subdirectory(tokenizer) 53 | -------------------------------------------------------------------------------- /libs/ggml/src/kompute-shaders/op_getrows_q6_k.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | #define NL 16 6 | #define BYTES_FOR_TYPE 4 /*bytes for float*/ 7 | #define SIZE_OF_BLOCK sizeof_block_q6_k 8 | 9 | layout(local_size_x = 1) in; 10 | 11 | layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; 12 | layout (binding = 1) readonly buffer tensorInB
{ int inB[]; }; 13 | layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; 14 | 15 | layout (push_constant) uniform parameter { 16 | uint inAOff; 17 | uint inBOff; 18 | uint outOff; 19 | int ne00; 20 | int nb01; 21 | int nb1; 22 | } pcs; 23 | 24 | block_q6_k get_unaligned_block_q6_k(uint index) { 25 | block_q6_k fres; 26 | [[unroll]] for (uint it = 0; it != QK_K / 2; it++) { 27 | fres.ql[it] = inA[index + it]; 28 | } 29 | [[unroll]] for (uint it = 0; it != QK_K / 4; it++) { 30 | fres.qh[it] = inA[index + QK_K/2 + it]; 31 | } 32 | [[unroll]] for (uint it = 0; it != QK_K / 16; it++) { 33 | fres.scales[it] = int8_t(inA[index + QK_K/2 + QK_K/4 + it]); 34 | } 35 | fres.d = u8BufToFloat16(inA, index + QK_K/2 + QK_K/4 + QK_K/16); 36 | return fres; 37 | } 38 | 39 | mat4 dequantize_block(uint index, uint il) { 40 | const block_q6_k block = get_unaligned_block_q6_k(index); 41 | return dequantize_q6_k(block, il); 42 | } 43 | 44 | #include "op_getrows.comp" 45 | -------------------------------------------------------------------------------- /src/core/spin_barrier.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #pragma once 16 | 17 | #include "core/spin_barrier.h" 18 | 19 | #include <atomic> 20 | 21 | #define CHECK_STRUCT_SIZE_AND_ALIGNMENT(a, b) \ 22 | static_assert(sizeof(a) == sizeof(b), "Sizes of " #a " and " #b " must equal"); \ 23 | static_assert(alignof(a) == alignof(b), "Alignments of " #a " and " #b " must equal") 24 | 25 | namespace powerserve { 26 | 27 | struct SpinBarrier { 28 | void init(size_t new_width); 29 | void wait(); 30 | 31 | private: 32 | size_t width = 0; 33 | std::atomic<size_t> count; 34 | }; 35 | 36 | CHECK_STRUCT_SIZE_AND_ALIGNMENT(SpinBarrier, spin_barrier); 37 | 38 | } // namespace powerserve 39 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-sycl/im2col.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // 16 | // MIT license 17 | // Copyright (C) 2024 Intel Corporation 18 | // SPDX-License-Identifier: MIT 19 | // 20 | 21 | // 22 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
23 | // See https://llvm.org/LICENSE.txt for license information. 24 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 25 | // 26 | 27 | #ifndef GGML_SYCL_IM2COL_HPP 28 | #define GGML_SYCL_IM2COL_HPP 29 | 30 | #include "common.hpp" 31 | 32 | void ggml_sycl_op_im2col( 33 | ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, 34 | ggml_tensor *dst, const float *src0_dd, const float *src1_dd, float *dst_dd, 35 | const queue_ptr &main_stream); 36 | 37 | #endif // GGML_SYCL_IM2COL_HPP 38 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-sycl/softmax.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // 16 | // MIT license 17 | // Copyright (C) 2024 Intel Corporation 18 | // SPDX-License-Identifier: MIT 19 | // 20 | 21 | // 22 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 23 | // See https://llvm.org/LICENSE.txt for license information. 24 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 25 | // 26 | 27 | #ifndef GGML_SYCL_SOFTMAX_HPP 28 | #define GGML_SYCL_SOFTMAX_HPP 29 | 30 | #include "common.hpp" 31 | 32 | void ggml_sycl_op_soft_max(ggml_backend_sycl_context &ctx, const ggml_tensor *src0, 33 | const ggml_tensor *src1, ggml_tensor *dst, 34 | const float *src0_dd, const float *src1_dd, 35 | float *dst_dd, 36 | const queue_ptr &main_stream); 37 | 38 | #endif // GGML_SYCL_SOFTMAX_HPP 39 | -------------------------------------------------------------------------------- /libs/ggml/include/ggml-rpc.h: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #pragma once 16 | 17 | #include "ggml.h" 18 | #include "ggml-backend.h" 19 | 20 | #ifdef __cplusplus 21 | extern "C" { 22 | #endif 23 | 24 | #define GGML_RPC_MAX_SERVERS 16 25 | 26 | // backend API 27 | GGML_API GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint); 28 | GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend); 29 | 30 | GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint); 31 | 32 | GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total); 33 | 34 | GGML_API GGML_CALL void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem); 35 | 36 | #ifdef __cplusplus 37 | } 38 | #endif 39 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-cann/kernels/ascendc_kernels.h: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef ASCENDC_KERNELS_H 16 | #define ASCENDC_KERNELS_H 17 | 18 | #include "aclrtlaunch_ascendc_get_row_f32.h" 19 | #include "aclrtlaunch_ascendc_get_row_f16.h" 20 | #include "aclrtlaunch_ascendc_get_row_q8_0.h" 21 | #include "aclrtlaunch_ascendc_get_row_q4_0.h" 22 | 23 | #include "aclrtlaunch_ascendc_quantize_f32_q8_0.h" 24 | #include "aclrtlaunch_ascendc_quantize_f16_q8_0.h" 25 | #include "aclrtlaunch_ascendc_quantize_f16_to_q4_0.h" 26 | #include "aclrtlaunch_ascendc_quantize_f32_to_q4_0.h" 27 | 28 | #include "aclrtlaunch_ascendc_dup_by_rows_fp16.h" 29 | #include "aclrtlaunch_ascendc_dup_by_rows_fp32.h" 30 | #include "aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16.h" 31 | #include "aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32.h" 32 | 33 | #endif // ASCENDC_KERNELS_H 34 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/rms_norm.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "generic_head.comp" 4 | #include "types.comp" 5 | 6 | #extension GL_EXT_control_flow_attributes : enable 7 | #define BLOCK_SIZE 512 8 | 9 | layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in; 10 | 11 | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; 12 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 13 | 14 | shared FLOAT_TYPE sum[BLOCK_SIZE]; 15 | 16 | void main() { 17 | const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x; 18 | const uint tid = gl_LocalInvocationID.x; 19 | 20 | sum[tid] = FLOAT_TYPE(0.0f); // partial sum for thread in warp 21 | 22 | [[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) { 23 | const FLOAT_TYPE xi = FLOAT_TYPE(data_a[row*p.KX + col]); 24 | sum[tid] += xi * xi; 25 | } 26 | 27 | // sum up partial sums and write back result 28 | 
barrier(); 29 | [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) { 30 | if (tid < s) { 31 | sum[tid] += sum[tid + s]; 32 | } 33 | barrier(); 34 | } 35 | 36 | const FLOAT_TYPE mean = sum[0] / FLOAT_TYPE(p.KX); 37 | const FLOAT_TYPE scale = inversesqrt(mean + FLOAT_TYPE(p.param1)); 38 | 39 | [[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) { 40 | data_d[row*p.KX + col] = D_TYPE(scale * FLOAT_TYPE(data_a[row*p.KX + col])); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-sycl/backend.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // 16 | // MIT license 17 | // Copyright (C) 2024 Intel Corporation 18 | // SPDX-License-Identifier: MIT 19 | // 20 | 21 | // 22 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 23 | // See https://llvm.org/LICENSE.txt for license information. 24 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 25 | // 26 | 27 | #ifndef GGML_SYCL_BACKEND_HPP 28 | #define GGML_SYCL_BACKEND_HPP 29 | 30 | #include "concat.hpp" 31 | #include "common.hpp" 32 | #include "conv.hpp" 33 | #include "convert.hpp" 34 | #include "dequantize.hpp" 35 | #include "dmmv.hpp" 36 | #include "mmq.hpp" 37 | #include "mmvq.hpp" 38 | #include "rope.hpp" 39 | #include "norm.hpp" 40 | #include "softmax.hpp" 41 | #include "tsembd.hpp" 42 | #include "im2col.hpp" 43 | 44 | #endif // GGML_SYCL_BACKEND_HPP 45 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/norm.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "generic_head.comp" 4 | #include "types.comp" 5 | 6 | #extension GL_EXT_control_flow_attributes : enable 7 | #define BLOCK_SIZE 512 8 | 9 | layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in; 10 | 11 | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; 12 | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; 13 | 14 | shared vec2 sum[BLOCK_SIZE]; 15 | 16 | void main() { 17 | const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x; 18 | const uint tid = gl_LocalInvocationID.x; 19 | 20 | sum[tid] = vec2(0.0f, 0.0f); 21 | 22 | [[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) { 23 | const float xi = float(data_a[row*p.KX + col]); 24 | sum[tid].x += xi; 25 | sum[tid].y += xi * xi; 26 | } 27 | 28 | // sum up partial sums and write back result 29 | barrier(); 30 | [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) { 31 | if (tid < s) { 32 | sum[tid] += sum[tid + s]; 33 | } 34 | barrier(); 35 | } 36 | 37 | const float mean = sum[0].x / p.KX; 38 | const float var = sum[0].y / p.KX - mean * mean; 39 | const float inv_std = inversesqrt(var + p.param1); 40 | 41 | [[unroll]] for 
(uint col = tid; col < p.KX; col += BLOCK_SIZE) { 42 | data_d[row*p.KX + col] = D_TYPE((float(data_a[row*p.KX + col]) - mean) * inv_std); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /tests/quant_mul_mat.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include "ggml.h" 16 | 17 | int main() { 18 | 19 | struct ggml_init_params params = { 20 | .mem_size = 16 * 1024 * 1024, 21 | .mem_buffer = NULL, 22 | }; 23 | 24 | // memory allocation happens here 25 | struct ggml_context *ctx = ggml_init(params); 26 | struct ggml_tensor *a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q8_0, 64, 2); 27 | // ggml_set_param(ctx, a); // a is an input variable 28 | struct ggml_tensor *w = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128); 29 | 30 | struct ggml_tensor *a2 = ggml_mul_mat(ctx, w, a); 31 | 32 | struct ggml_cgraph *gf = ggml_new_graph(ctx); 33 | ggml_build_forward_expand(gf, a2); 34 | // ggml_set_2d(); 35 | ggml_graph_compute_with_ctx(ctx, gf, 1); 36 | return 0; 37 | 38 | // ggml_quantize_chunk( 39 | // ); 40 | } 41 | -------------------------------------------------------------------------------- /libs/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # CLI11 2 | add_subdirectory(cli11) 3 | 4 | # fmt 5 | add_subdirectory(fmt) 6 | 7 | # ggml 8 | set(GGML_OPENMP OFF) 9 | add_subdirectory(ggml) 10 | 11 | if (${CMAKE_C_COMPILER_ID} MATCHES "Clang") 12 | target_compile_options(ggml PRIVATE -Wno-double-promotion -Wno-unreachable-code-break) 13 | endif() 14 | 15 | # json 16 | add_subdirectory(json) 17 | 18 | if (NOT OHOS) 19 | # libuv 20 | set(LIBUV_BUILD_SHARED OFF CACHE BOOL "Build shared lib") 21 | add_subdirectory(libuv) 22 | 23 | if (CMAKE_C_COMPILER_ID MATCHES "Clang") 24 | target_compile_options(uv_a PRIVATE -Wno-missing-prototypes -Wno-unreachable-code-return -Wno-shadow) 25 | elseif (CMAKE_C_COMPILER_ID MATCHES "GNU") 26 | target_compile_options(uv_a PRIVATE -Wno-cast-qual -Wno-missing-prototypes -Wno-shadow) 27 | endif() 28 | endif() 29 | 30 | # llama_tokenizer 31 | add_subdirectory(llama_tokenizer) 32 | 33 | # QNN headers 34 | add_subdirectory(qnn_headers) 35 | 36 | # xtl(Required by xtensor) 37 | add_subdirectory(xtl) 38 | 39 | # xtensor 40 | add_subdirectory(xtensor) 41 | 42 | add_subdirectory(stb_headers) 43 | 44 | # http lib 45 | add_subdirectory(cpp-httplib) 46 | 47 | # concurrent queue 48 | add_subdirectory(concurrentqueue) 49 | 50 | # Perfetto 51 | if (POWERSERVE_WITH_PERFETTO) 52 | add_library(perfetto STATIC perfetto/sdk/perfetto.cc) 53 | target_include_directories(perfetto PUBLIC perfetto/sdk) 54 | if (ANDROID) 55 | target_link_libraries(perfetto PRIVATE -llog) 56 | endif() 57 | endif() 58 | -------------------------------------------------------------------------------- 
/libs/ggml/src/vulkan-shaders/concat.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "types.comp" 4 | #include "generic_binary_head.comp" 5 | 6 | void main() { 7 | const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; 8 | const int dim = p.param3; 9 | 10 | if (idx >= p.ne) { 11 | return; 12 | } 13 | 14 | const uint i3 = idx / (p.ne22*p.ne21*p.ne20); 15 | const uint i3_offset = i3 * p.ne22*p.ne21*p.ne20; 16 | const uint i2 = (idx - i3_offset) / (p.ne21*p.ne20); 17 | const uint i2_offset = i2*p.ne21*p.ne20; 18 | const uint i1 = (idx - i3_offset - i2_offset) / p.ne20; 19 | const uint i0 = idx - i3_offset - i2_offset - i1*p.ne20; 20 | 21 | uint o[4] = {0, 0, 0, 0}; 22 | o[dim] = dim == 0 ? p.ne00 : (dim == 1 ? p.ne01 : (dim == 2 ? p.ne02 : p.ne03)); 23 | 24 | const uint src0_idx = i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0*p.nb00; 25 | const uint src1_idx = (i3 - o[3])*p.nb13 + (i2 - o[2])*p.nb12 + (i1 - o[1])*p.nb11 + (i0 - o[0])*p.nb10; 26 | const uint dst_idx = i3*p.nb23 + i2*p.nb22 + i1*p.nb21 + i0*p.nb20; 27 | 28 | const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03; 29 | 30 | #ifndef OPTIMIZATION_ERROR_WORKAROUND 31 | data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : data_b[src1_idx]); 32 | #else 33 | if (is_src0) { 34 | data_d[p.d_offset + dst_idx] = data_a[src0_idx]; 35 | } else { 36 | data_d[p.d_offset + dst_idx] = data_b[src1_idx]; 37 | } 38 | #endif 39 | } 40 | -------------------------------------------------------------------------------- /src/core/timer.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #pragma once 16 | 17 | #include <chrono> 18 | 19 | namespace powerserve { 20 | 21 | auto timestamp_ns() -> int64_t; 22 | auto timestamp_us() -> int64_t; 23 | auto timestamp_ms() -> int64_t; 24 | 25 | struct Timer { 26 | Timer(); 27 | 28 | // Return elapsed time since last tick. 29 | auto elapsed_time_ns() const -> int64_t; 30 | auto elapsed_time_us() const -> int64_t; 31 | auto elapsed_time_ms() const -> int64_t; 32 | 33 | // Return elapsed time since last tick and set new tick.
34 | auto tick_ns() -> int64_t; 35 | auto tick_us() -> int64_t; 36 | auto tick_ms() -> int64_t; 37 | 38 | void reset(); 39 | 40 | private: 41 | using Clock = std::chrono::steady_clock; 42 | 43 | Clock::time_point last_time_point; 44 | 45 | auto tick_impl(Clock::time_point *out_time_point) const -> int64_t; 46 | }; 47 | 48 | } // namespace powerserve 49 | -------------------------------------------------------------------------------- /src/model/module/norm_attention.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #pragma once 16 | 17 | #include "core/config.hpp" 18 | #include "graph/graph.hpp" 19 | #include "graph/node.hpp" 20 | #include "model/common/weights.hpp" 21 | #include "model/module/attention.hpp" 22 | 23 | #include <memory> 24 | 25 | namespace powerserve { 26 | 27 | struct NormAttention : Attention { 28 | 29 | public: 30 | NormAttention(const ModelConfig::LLMConfig &config, std::shared_ptr weights) : Attention(config, weights) {} 31 | 32 | ~NormAttention() = default; 33 | 34 | public: 35 | TensorNode *build( 36 | Graph &g, 37 | TensorNode *x, 38 | int64_t L, 39 | const TensorNode *k_cache, 40 | const TensorNode *v_cache, 41 | const std::vector &pos, 42 | const CausalAttentionMask &mask, 43 | bool is_need_bias = false 44 | ) override; 45 | }; 46 | 47 | } // namespace powerserve 48 | -------------------------------------------------------------------------------- /libs/ggml/src/kompute-shaders/op_mul_mat_mat_f32.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | #extension GL_KHR_shader_subgroup_arithmetic : require 6 | #extension GL_EXT_debug_printf : enable 7 | 8 | // device subgroup size 9 | layout (local_size_x_id = 0) in; 10 | 11 | layout(binding = 0) readonly buffer tensorInA { float inA[]; }; 12 | layout(binding = 1) readonly buffer tensorInB { float inB[]; }; 13 | layout(binding = 2) writeonly buffer tensorOut { float out_[]; }; 14 | 15 | layout(push_constant) uniform parameter { 16 | uint inAOff; 17 | uint inBOff; 18 | uint outOff; 19 | int ne00; 20 | int ne01; 21 | int ne02; 22 | int ne11; 23 | int ne12; 24 | uint nb01; 25 | uint nb02; 26 | uint nb11; 27 | uint nb12; 28 | uint nb1; 29 | uint nb2; 30 | } 31 | pcs; 32 | 33 | 34 | void main() { 35 | uvec3 gid = gl_WorkGroupID; 36 | 37 | uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z; 38 | uint bc_ba = pcs.ne02 > pcs.ne12 ?
gid.z / (pcs.ne02 / pcs.ne12) : gid.z; 39 | 40 | const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) / 4 + pcs.inAOff; // Based from inA 41 | const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB 42 | float sum = 0.0f; 43 | for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) { 44 | sum += float(inA[x+i]) * float(inB[y+i]); 45 | } 46 | 47 | const float all_sum = subgroupAdd(sum); 48 | if (subgroupElect()) { 49 | out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = all_sum; 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-sycl/convert.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // 16 | // MIT license 17 | // Copyright (C) 2024 Intel Corporation 18 | // SPDX-License-Identifier: MIT 19 | // 20 | 21 | // 22 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 23 | // See https://llvm.org/LICENSE.txt for license information. 24 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 25 | // 26 | 27 | #ifndef GGML_SYCL_CONVERT_HPP 28 | #define GGML_SYCL_CONVERT_HPP 29 | 30 | #include "common.hpp" 31 | 32 | template <typename T> 33 | using to_t_sycl_t = void (*)(const void *__restrict__ x, T *__restrict__ y, 34 | int64_t k, dpct::queue_ptr stream); 35 | typedef to_t_sycl_t<float> to_fp32_sycl_t; 36 | typedef to_t_sycl_t<sycl::half> to_fp16_sycl_t; 37 | 38 | to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type); 39 | to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type); 40 | 41 | #endif // GGML_SYCL_CONVERT_HPP 42 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-sycl/mmvq.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // 16 | // MIT license 17 | // Copyright (C) 2024 Intel Corporation 18 | // SPDX-License-Identifier: MIT 19 | // 20 | 21 | // 22 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 23 | // See https://llvm.org/LICENSE.txt for license information.
24 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 25 | // 26 | 27 | #ifndef GGML_SYCL_MMVQ_HPP 28 | #define GGML_SYCL_MMVQ_HPP 29 | 30 | #include "common.hpp" 31 | 32 | 33 | void ggml_sycl_op_mul_mat_vec_q( 34 | ggml_backend_sycl_context & ctx, 35 | const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, 36 | const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i, 37 | float *dst_dd_i, const int64_t row_low, const int64_t row_high, 38 | const int64_t src1_ncols, const int64_t src1_padded_row_size, 39 | const dpct::queue_ptr &stream); 40 | 41 | #endif // GGML_SYCL_MMVQ_HPP 42 | -------------------------------------------------------------------------------- /libs/ggml/src/vulkan-shaders/dequant_q6_k.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "dequant_head.comp" 4 | 5 | layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in; 6 | 7 | layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; 8 | layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; 9 | 10 | void main() { 11 | [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) { 12 | const uint i = gl_WorkGroupID.x * 256 + wgy; 13 | if (i >= p.M * p.K / QUANT_K) { 14 | return; 15 | } 16 | const uint tid = gl_LocalInvocationID.x; 17 | const uint ip = tid / 32; 18 | const uint il = tid - 32 * ip; 19 | const uint is = 8 * ip + il / 16; 20 | 21 | const uint y_idx = i * QUANT_K + 128 * ip + il; 22 | 23 | const uint ql_idx = 64 * ip + il; 24 | const uint8_t qh = data_a[i].qh[32 * ip + il]; 25 | 26 | const FLOAT_TYPE d = FLOAT_TYPE(data_a[i].d); 27 | 28 | data_b[y_idx + 0] = D_TYPE(d * FLOAT_TYPE(data_a[i].scales[is + 0] * (int8_t((data_a[i].ql[ql_idx + 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32))); 29 | data_b[y_idx + 32] = D_TYPE(d * FLOAT_TYPE(data_a[i].scales[is + 2] * (int8_t((data_a[i].ql[ql_idx + 32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32))); 30 | data_b[y_idx + 64] = D_TYPE(d * FLOAT_TYPE(data_a[i].scales[is + 4] * (int8_t((data_a[i].ql[ql_idx + 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32))); 31 | data_b[y_idx + 96] = D_TYPE(d * FLOAT_TYPE(data_a[i].scales[is + 6] * (int8_t((data_a[i].ql[ql_idx + 32] >> 4) | (((qh >> 6) & 3) << 4)) - 32))); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/executor/executor.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #pragma once 16 | 17 | #include "backend/platform.hpp" 18 | #include "graph/graph.hpp" 19 | 20 | namespace powerserve { 21 | 22 | struct Executor { 23 | public: 24 | Platform &m_platform; 25 | Graph &m_graph; 26 | 27 | public: 28 | Executor(Platform &platform, Graph &graph) : m_platform(platform), m_graph(graph) {} 29 | 30 | public: 31 | void allocate_buffers(); 32 | void run(); 33 | void plan(); 34 | 35 | private: 36 | template 37 | void create_cpu_buffer(std::shared_ptr tensor) { 38 | if (tensor->type == NodeType::TENSOR_VIEW) { 39 | tensor->m_data = 40 | CPUBuffer::create_buffer_view(tensor->tensor_view()->parent->get(), tensor->m_shape); 41 | } else { 42 | tensor->m_data = CPUBuffer::create_buffer(tensor->m_shape); 43 | } 44 | } 45 | }; 46 | 47 | } // namespace powerserve 48 | -------------------------------------------------------------------------------- /src/model/module/attention.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #pragma once 16 | 17 | #include "core/config.hpp" 18 | #include "graph/node.hpp" 19 | #include "model/common/weights.hpp" 20 | 21 | namespace powerserve { 22 | 23 | struct Attention { 24 | 25 | public: 26 | const ModelConfig::LLMConfig &m_config; 27 | std::shared_ptr m_weights; 28 | 29 | public: 30 | Attention(const ModelConfig::LLMConfig &config, const std::shared_ptr &weights) : 31 | m_config(config), 32 | m_weights(weights) {} 33 | 34 | virtual ~Attention() = default; 35 | 36 | public: 37 | virtual TensorNode *build( 38 | Graph &g, 39 | TensorNode *x, 40 | int64_t L, 41 | const TensorNode *k_cache, 42 | const TensorNode *v_cache, 43 | const std::vector &pos, 44 | const CausalAttentionMask &mask, 45 | bool is_need_bias = false 46 | ) = 0; 47 | }; 48 | 49 | } // namespace powerserve 50 | -------------------------------------------------------------------------------- /src/model/module/attention_mask.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include "attention_mask.hpp" 16 | 17 | namespace powerserve { 18 | 19 | AttentionMask::AttentionMask(size_t size) : size(size) {} 20 | 21 | AttentionMaskView::AttentionMaskView(const AttentionMask &mask, size_t offset, size_t size) : 22 | mask(mask), 23 | offset(offset), 24 | size(size) {} 25 | 26 | bool AttentionMaskView::not_masked(size_t i, size_t j) const { 27 | return mask.not_masked(offset + i, offset + j); 28 | } 29 | 30 | CausalAttentionMask::CausalAttentionMask(size_t size) : AttentionMask(size) {} 31 | 32 | CausalAttentionMask::CausalAttentionMask(size_t size, const std::vector<std::vector<bool>> &batch_mask) : 33 | AttentionMask(size), 34 | mask(batch_mask) {} 35 | 36 | bool CausalAttentionMask::not_masked(size_t i, size_t j) const { 37 | if (!mask.empty()) { 38 | return mask[i][j]; 39 | } 40 | return i >= j; 41 | } 42 | 43 | } // namespace powerserve 44 | -------------------------------------------------------------------------------- /libs/ggml/src/ggml-sycl/dmmv.hpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // 16 | // MIT license 17 | // Copyright (C) 2024 Intel Corporation 18 | // SPDX-License-Identifier: MIT 19 | // 20 | 21 | // 22 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 23 | // See https://llvm.org/LICENSE.txt for license information.
24 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 25 | // 26 | 27 | #ifndef GGML_SYCL_DMMV_HPP 28 | #define GGML_SYCL_DMMV_HPP 29 | 30 | #include "common.hpp" 31 | 32 | 33 | void ggml_sycl_op_dequantize_mul_mat_vec( 34 | ggml_backend_sycl_context & ctx, 35 | const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, 36 | const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i, 37 | float *dst_dd_i, const int64_t row_low, const int64_t row_high, 38 | const int64_t src1_ncols, const int64_t src1_padded_row_size, 39 | const dpct::queue_ptr &stream); 40 | 41 | #endif // GGML_SYCL_DMMV_HPP 42 | -------------------------------------------------------------------------------- /libs/ggml/src/kompute-shaders/op_mul.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | #include "common.comp" 4 | 5 | layout(local_size_x = 1024) in; 6 | 7 | layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; 8 | layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; }; 9 | layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; }; 10 | 11 | layout(push_constant) uniform PushConstants { 12 | uint inAOff; 13 | uint inBOff; 14 | uint outOff; 15 | int ne00; 16 | int nb00; 17 | int nb01; 18 | int nb02; 19 | int nb03; 20 | int ne10; 21 | int ne11; 22 | int ne12; 23 | int ne13; 24 | int nb10; 25 | int nb11; 26 | int nb12; 27 | int nb13; 28 | int ne0; 29 | int nb0; 30 | int nb1; 31 | int nb2; 32 | int nb3; 33 | } pcs; 34 | 35 | void main() { 36 | const uint i03 = gl_WorkGroupID.z; 37 | const uint i02 = gl_WorkGroupID.y; 38 | const uint i01 = gl_WorkGroupID.x; 39 | 40 | const uint i13 = i03 % pcs.ne13; 41 | const uint i12 = i02 % pcs.ne12; 42 | const uint i11 = i01 % pcs.ne11; 43 | 44 | uint src0_off = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01) / 4); 45 | uint src1_off = uint((i13*pcs.nb13 + i12*pcs.nb12 + i11*pcs.nb11) / 4); 46 | uint dst_off = uint((i03*pcs.nb3 + i02*pcs.nb2 + i01*pcs.nb1) / 4); 47 | 48 | for (uint i0 = gl_LocalInvocationID.x; i0 < pcs.ne0; i0 += gl_WorkGroupSize.x) { 49 | const uint i10 = i0 % pcs.ne10; 50 | out_[pcs.outOff + dst_off + i0] = inA[pcs.inAOff + src0_off + i0] * inB[pcs.inBOff + src1_off + i10]; 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/model/module/ffn.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2024-2025 PowerServe Authors 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
--------------------------------------------------------------------------------
/src/model/module/ffn.cpp:
--------------------------------------------------------------------------------
// Copyright 2024-2025 PowerServe Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "model/module/ffn.hpp"

#include "graph/graph.hpp"
#include "graph/node.hpp"

namespace powerserve {

TensorNode *FFN::build(Graph &g, TensorNode *attn_o, int64_t L) {
    auto ffn_norm_w = g.add_tensor(m_weights->lw[L].ffn_norm);
    auto ffn_norm_o = g.rms_norm(attn_o, ffn_norm_w, m_config.norm_eps);

    auto gate_w = g.add_tensor(m_weights->lw[L].ffn_gate);
    auto gate_o = g.mat_mul(gate_w, ffn_norm_o);

    auto up_w = g.add_tensor(m_weights->lw[L].ffn_up);
    auto up_o = g.mat_mul(up_w, ffn_norm_o);

    // {hidden_dim, bs, 1, 1}
    auto silu = g.silu_hadamard(gate_o, up_o);

    auto down_w = g.add_tensor(m_weights->lw[L].ffn_down);
    auto down_o = g.mat_mul(down_w, silu);

    // {embed_dim, bs, 1, 1}
    auto res_conn = g.add(attn_o, down_o);

    return res_conn;
}

} // namespace powerserve
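`silu_hadamard` is the fused SwiGLU activation: elementwise, the FFN output before the residual is down * (silu(gate * x) ⊙ (up * x)), with silu(v) = v / (1 + e^{-v}). A scalar C++ reference for just the fused step, as a sketch rather than PowerServe's actual kernel:

```cpp
#include <cmath>
#include <cstddef>

// silu_hadamard: out[i] = silu(gate[i]) * up[i], with silu(v) = v / (1 + exp(-v)).
void silu_hadamard_ref(const float *gate, const float *up, float *out, size_t n) {
    for (size_t i = 0; i < n; i++) {
        const float g = gate[i];
        const float silu = g / (1.0f + std::exp(-g));
        out[i] = silu * up[i];
    }
}
```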
--------------------------------------------------------------------------------
/libs/ggml/src/ggml-sycl/mmq.hpp:
--------------------------------------------------------------------------------
// Copyright 2024-2025 PowerServe Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//
// MIT license
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: MIT
//

//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//

#ifndef GGML_SYCL_MMQ_HPP
#define GGML_SYCL_MMQ_HPP

#include "common.hpp"

void ggml_sycl_op_mul_mat_q(
    ggml_backend_sycl_context & ctx,
    const ggml_tensor* src0,
    const ggml_tensor* src1,
    ggml_tensor* dst,
    const char* src0_dd_i,
    const float* src1_ddf_i,
    const char* src1_ddq_i,
    float* dst_dd_i,
    const int64_t row_low,
    const int64_t row_high,
    const int64_t src1_ncols,
    const int64_t src1_padded_row_size,
    const dpct::queue_ptr& stream);

#endif // GGML_SYCL_MMQ_HPP
--------------------------------------------------------------------------------
/libs/ggml/src/vulkan-shaders/dequant_q2_k.comp:
--------------------------------------------------------------------------------
#version 450

#include "dequant_head.comp"

layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;

layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};

void main() {
    [[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
        const uint i = gl_WorkGroupID.x * 256 + wgy;
        if (i >= p.M * p.K / QUANT_K) {
            return;
        }

        const uint tid = gl_LocalInvocationID.x;
        const uint ip = tid / 32;
        const uint il = tid - 32 * ip;
        const uint is = 8 * ip + il / 16;

        const uint y_idx = i * QUANT_K + 128 * ip + il;

        const uint ql_idx = 32 * ip + il;
        const uint8_t qs = data_a[i].qs[32 * ip + il];

        FLOAT_TYPE dall = FLOAT_TYPE(data_a[i].d.x);
        FLOAT_TYPE dmin = FLOAT_TYPE(data_a[i].d.y);
        data_b[y_idx + 0]  = D_TYPE(dall * FLOAT_TYPE((data_a[i].scales[is+0] & 0xF) * ((qs >> 0) & 3)) - dmin * FLOAT_TYPE(data_a[i].scales[is+0] >> 4));
        data_b[y_idx + 32] = D_TYPE(dall * FLOAT_TYPE((data_a[i].scales[is+2] & 0xF) * ((qs >> 2) & 3)) - dmin * FLOAT_TYPE(data_a[i].scales[is+2] >> 4));
        data_b[y_idx + 64] = D_TYPE(dall * FLOAT_TYPE((data_a[i].scales[is+4] & 0xF) * ((qs >> 4) & 3)) - dmin * FLOAT_TYPE(data_a[i].scales[is+4] >> 4));
        data_b[y_idx + 96] = D_TYPE(dall * FLOAT_TYPE((data_a[i].scales[is+6] & 0xF) * ((qs >> 6) & 3)) - dmin * FLOAT_TYPE(data_a[i].scales[is+6] >> 4));
    }
}
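In the Q2_K dequantization above, each `scales` byte packs a 4-bit scale (low nibble, applied to `dall`) and a 4-bit minimum (high nibble, applied to `dmin`), and each `qs` byte packs four 2-bit quants whose four fields land 32 elements apart, which is why the shader writes to `y_idx + 0/32/64/96`. The per-element formula as a scalar C++ sketch, with the block layout simplified for illustration:

```cpp
#include <cstdint>

// Dequantize one 2-bit value. `shift` selects the 2-bit field within the qs
// byte (0, 2, 4, or 6); `sc` is the packed scale/min byte for that group;
// `dall` and `dmin` are the block's two scale factors widened to float.
float dequant_q2_k_one(uint8_t qs, uint8_t sc, int shift, float dall, float dmin) {
    const int q = (qs >> shift) & 3;      // 2-bit quant
    const float scale = float(sc & 0xF);  // low nibble: scale
    const float min_  = float(sc >> 4);   // high nibble: minimum
    return dall * scale * float(q) - dmin * min_;
}
```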
--------------------------------------------------------------------------------
/libs/ggml/src/kompute-shaders/op_rmsnorm.comp:
--------------------------------------------------------------------------------
#version 450

#include "common.comp"

layout(local_size_x = 512) in;

layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
layout(binding = 1) buffer restrict tensorOut { float out_[]; };

layout(push_constant) uniform PushConstants {
    uint inOff;
    uint outOff;
    uint ne00;
    uint nb01;
    float eps;
} pcs;

shared float sum[gl_WorkGroupSize.x];

void main() {
    const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_

    // parallel sum
    sum[gl_LocalInvocationID.x] = 0.0;
    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
        sum[gl_LocalInvocationID.x] += in_[x+i00] * in_[x+i00];
    }

    // reduce
    barrier();
    memoryBarrierShared();
    [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
        if (gl_LocalInvocationID.x < i) {
            sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
        }
        barrier();
        memoryBarrierShared();
    }

    // broadcast
    if (gl_LocalInvocationID.x == 0) {
        sum[0] /= float(pcs.ne00);
    }
    barrier();
    memoryBarrierShared();

    const float scale = 1.0f/sqrt(sum[0] + pcs.eps);

    const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
        out_[y+i00] = in_[x+i00] * scale;
    }
}
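The shader computes standard RMSNorm: each row is divided by the root of its mean square plus epsilon. The same computation as a scalar C++ reference (like the kernel above, this omits any learned weight multiply):

```cpp
#include <cmath>
#include <cstddef>

// RMSNorm over one row: out[i] = in[i] / sqrt(mean(in^2) + eps).
void rms_norm_ref(const float *in, float *out, size_t n, float eps) {
    float sum = 0.0f;
    for (size_t i = 0; i < n; i++) {
        sum += in[i] * in[i];
    }
    const float scale = 1.0f / std::sqrt(sum / float(n) + eps);
    for (size_t i = 0; i < n; i++) {
        out[i] = in[i] * scale;
    }
}
```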
--------------------------------------------------------------------------------
/app/run/README.md:
--------------------------------------------------------------------------------
# CLI

## Quick Start

To get started right away, run the following command, making sure to use the correct path for the model you have downloaded:
- [hugging-face](https://huggingface.co/PowerServe/Llama-3.1-8B-PowerServe-QNN/tree/main)

```bash
# assemble a runnable project from the compiled binaries and the model (Linux)
./powerserve create --exe-path ./build/out -m ./Llama-3.1-8B-PowerServe-QNN/ -o proj
```

## Common Options

In this section, we cover the most commonly used options for running the `run` program with models:

- `--work-folder [-d] DIRECTORY`: The directory containing GGUF or QNN models.
- `--n-predicts [-n] N`: Set the number of tokens to predict when generating text. Adjusting this value influences the length of the generated text.
- `--no-qnn`: Set this flag to disable the QNN backend (if compiled with `POWERSERVE_WITH_QNN=ON`).

```bash
./proj/bin/powerserve-run --work-folder proj --prompt "Once upon a time"
```

## Input Prompts

The `run` program provides several ways to interact with the models using input prompts:

- `--prompt [-p] PROMPT`: Provide a prompt directly as a command-line option.
- `--prompt-file [-f] FNAME`: Provide a file containing a prompt or multiple prompts.

## Additional Options

These options provide extra functionality and customization when running the LLaMA models:

- `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
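Putting the options above together, a typical invocation reads the prompt from a file and caps the generation length; `prompt.txt` here is a placeholder for any local file:

```bash
./proj/bin/powerserve-run --work-folder proj --prompt-file prompt.txt -n 128
```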
--------------------------------------------------------------------------------
/src/sampler/sampler_chain.hpp:
--------------------------------------------------------------------------------
// Copyright 2024-2025 PowerServe Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "core/config.hpp"
#include "sampler.hpp"
#include "tokenizer/tokenizer.hpp"

#include <memory>
#include <vector>

namespace powerserve {

struct SamplerChain final : Sampler {
    virtual ~SamplerChain() override = default;

    SamplerChain() = default;

    SamplerChain(const HyperParams::SamplerConfig &config, const Tokenizer &tokenizer) {
        build_from_config(config, tokenizer);
    }

    template <typename S, typename... Args>
    void append(Args &&...args) {
        m_samplers.emplace_back(std::make_unique<S>(std::forward<Args>(args)...));
    }

    void build_from_config(const HyperParams::SamplerConfig &config, const Tokenizer &tokenizer);

    void apply(ProbArray &probs) override;
    void accept(Token token) override;

private:
    std::vector<std::unique_ptr<Sampler>> m_samplers;
};

} // namespace powerserve
--------------------------------------------------------------------------------
/src/core/android_logger.hpp:
--------------------------------------------------------------------------------
// Copyright 2024-2025 PowerServe Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <android/log.h>
#include <fmt/format.h>

namespace powerserve {

#define POWERSERVE_LOG_DEBUG(...) \
    __android_log_write(ANDROID_LOG_DEBUG, "PowerServe", fmt::format("" __VA_ARGS__).c_str())

#define POWERSERVE_LOG_INFO(...) \
    __android_log_write(ANDROID_LOG_INFO, "PowerServe", fmt::format("" __VA_ARGS__).c_str())

#define POWERSERVE_LOG_WARN(...) \
    __android_log_write(ANDROID_LOG_WARN, "PowerServe", fmt::format("" __VA_ARGS__).c_str())

#define POWERSERVE_LOG_ERROR(...) \
    __android_log_write(ANDROID_LOG_ERROR, "PowerServe", fmt::format("" __VA_ARGS__).c_str())

} // namespace powerserve
--------------------------------------------------------------------------------
/src/model/module/attention_mask.hpp:
--------------------------------------------------------------------------------
// Copyright 2024-2025 PowerServe Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <cstddef>
#include <vector>

namespace powerserve {

struct AttentionMask {
    size_t size = 0;

    AttentionMask(size_t size);

    virtual ~AttentionMask() = default;

    virtual bool not_masked(size_t i, size_t j) const = 0;
};

struct AttentionMaskView {
    const AttentionMask &mask;
    size_t offset = 0;

    size_t size = 0;

    AttentionMaskView(const AttentionMask &mask, size_t offset, size_t size);

    bool not_masked(size_t i, size_t j) const;
};

struct CausalAttentionMask : AttentionMask {
    std::vector<std::vector<bool>> mask;

    CausalAttentionMask(size_t size);
    CausalAttentionMask(size_t size, const std::vector<std::vector<bool>> &mask);
    virtual ~CausalAttentionMask() override = default;
    virtual bool not_masked(size_t i, size_t j) const override;
};

} // namespace powerserve
--------------------------------------------------------------------------------
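A hypothetical sketch of how `SamplerChain::append` from `sampler_chain.hpp` above composes sampling stages. The `TemperatureSampler` and `TopKSampler` types are illustrative stand-ins rather than names from this repository, and the include path is assumed relative to `src/`; only the `append`/`apply` interface comes from the header itself:

```cpp
#include "sampler/sampler_chain.hpp"

using namespace powerserve;

void build_chain_example(ProbArray &probs) {
    SamplerChain chain;
    // Each append() constructs a sampler in place and appends it to
    // m_samplers; apply() then runs every stage over the probability
    // array in order.
    chain.append<TemperatureSampler>(0.8f); // hypothetical sampler type
    chain.append<TopKSampler>(40);          // hypothetical sampler type
    chain.apply(probs);
}
```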