├── .devcontainer
    ├── Dockerfile
    └── devcontainer.json
├── .flake8
├── .github
    ├── ISSUE_TEMPLATE
    │   ├── -bug-.yaml
    │   ├── -bug2-.yaml
    │   ├── -feature-.yaml
    │   └── -feature2-.yaml
    └── workflows
    │   ├── book-ci.yml
    │   ├── deploy.yml
    │   ├── docker-image.yml
    │   ├── install.yml
    │   ├── package_wheel_release.yml
    │   ├── package_wheel_test.yml
    │   └── score.yml
├── .gitignore
├── .gitmodules
├── .pylintrc
├── Dockerfile
├── Dockerfile.xpu
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.md
├── README_ZH.md
├── SECURITY.md
├── WeChatGroup.png
├── book.toml
├── csrc
    ├── balance_serve
    │   ├── CMakeLists.txt
    │   ├── kvc2
    │   │   ├── .clang-format
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   ├── config
    │   │   │   ├── model_configs.json
    │   │   │   └── quant_configs.json
    │   │   ├── export_envs_before_run.sh
    │   │   ├── install_deps.sh
    │   │   ├── mkfs.sh
    │   │   ├── src
    │   │   │   ├── CMakeLists.txt
    │   │   │   ├── async_store.cpp
    │   │   │   ├── async_store.hh
    │   │   │   ├── bind.cpp
    │   │   │   ├── cache_entry.cpp
    │   │   │   ├── cache_entry.hh
    │   │   │   ├── common.h
    │   │   │   ├── cuda_stream_manager.cpp
    │   │   │   ├── cuda_stream_manager.hh
    │   │   │   ├── defs.h
    │   │   │   ├── gpu_cache.cpp
    │   │   │   ├── gpu_cache.hh
    │   │   │   ├── hasher.hpp
    │   │   │   ├── io_helper.hpp
    │   │   │   ├── kvc2.h
    │   │   │   ├── kvc2_utils.py
    │   │   │   ├── metrics.cpp
    │   │   │   ├── metrics.h
    │   │   │   ├── model_config.h
    │   │   │   ├── page_aligned_memory_pool.cpp
    │   │   │   ├── page_aligned_memory_pool.h
    │   │   │   ├── prefix.cpp
    │   │   │   └── utils
    │   │   │   │   ├── all.hpp
    │   │   │   │   ├── arithmetic.hpp
    │   │   │   │   ├── easy_format.hpp
    │   │   │   │   ├── lock_free_queue.hpp
    │   │   │   │   ├── mpsc.hpp
    │   │   │   │   ├── mutex_extend.hpp
    │   │   │   │   ├── periodic_task.hpp
    │   │   │   │   ├── spin_lock.hpp
    │   │   │   │   └── timer.hpp
    │   │   ├── test
    │   │   │   ├── CMakeLists.txt
    │   │   │   ├── hashmap_test.cpp
    │   │   │   ├── kvc2_export_header_test.cpp
    │   │   │   ├── kvc2_export_load_test.cpp
    │   │   │   ├── kvc2_test_utils.cpp
    │   │   │   ├── kvc2test
    │   │   │   │   ├── CMakeLists.txt
    │   │   │   │   ├── append-tokens.cpp
    │   │   │   │   ├── check-flush-back.cpp
    │   │   │   │   ├── common.hpp
    │   │   │   │   ├── flush-back.cpp
    │   │   │   │   ├── lookup-alt-gpu.cpp
    │   │   │   │   ├── lookup-alt.cpp
    │   │   │   │   ├── lookup-gpu-async.cpp
    │   │   │   │   ├── lookup-gpu-mt-without-vcache.cpp
    │   │   │   │   ├── lookup-gpu-mt.cpp
    │   │   │   │   ├── lookup-gpu.cpp
    │   │   │   │   ├── lookup-mt.cpp
    │   │   │   │   ├── lookup-without-vcache.cpp
    │   │   │   │   ├── lookup.cpp
    │   │   │   │   └── raw_insert_read.cpp
    │   │   │   ├── kvcache_disk_insert_read_test.cpp
    │   │   │   ├── kvcache_mem_eviction_test.cpp
    │   │   │   ├── kvcache_mem_insert_read_test.cpp
    │   │   │   ├── kvcache_save_load_test.cpp
    │   │   │   ├── kvcache_test_utils.cpp
    │   │   │   ├── page_pool_test.cpp
    │   │   │   ├── prefix_test.cpp
    │   │   │   ├── pytest_load.py
    │   │   │   ├── pytest_mem_prefix_test.py
    │   │   │   ├── pytest_mem_read.py
    │   │   │   ├── pytest_raw_insert_and_read.py
    │   │   │   ├── test_align.py
    │   │   │   ├── test_cuda_stream.cpp
    │   │   │   ├── test_cuda_stream_manager.cpp
    │   │   │   ├── test_lock_free_queue.cpp
    │   │   │   ├── test_periodic_task.cpp
    │   │   │   ├── test_queue_perf.cpp
    │   │   │   ├── test_std_list.cpp
    │   │   │   └── xxHash_test.cpp
    │   │   └── unit_test.sh
    │   └── sched
    │   │   ├── CMakeLists.txt
    │   │   ├── bind.cpp
    │   │   ├── metrics.cpp
    │   │   ├── metrics.h
    │   │   ├── model_config.h
    │   │   ├── scheduler.cpp
    │   │   ├── scheduler.h
    │   │   └── utils
    │   │       ├── all.hpp
    │   │       ├── arithmetic.hpp
    │   │       ├── atomic_ptr_with_flags.hpp
    │   │       ├── csv.hpp
    │   │       ├── easy_format.hpp
    │   │       ├── mpsc.hpp
    │   │       ├── readable_number.hpp
    │   │       ├── statistics.hpp
    │   │       └── timer.hpp
    ├── custom_marlin
    │   ├── __init__.py
    │   ├── binding.cpp
    │   ├── gptq_marlin
    │   │   ├── gptq_marlin.cu
    │   │   ├── gptq_marlin.cuh
    │   │   ├── gptq_marlin_dtypes.cuh
    │   │   ├── gptq_marlin_repack.cu
    │   │   └── ops.h
    │   ├── setup.py
    │   ├── test_cuda_graph.py
    │   └── utils
    │   │   ├── __init__.py
    │   │   ├── format24.py
    │   │   ├── marlin_24_perms.py
    │   │   ├── marlin_perms.py
    │   │   ├── marlin_utils.py
    │   │   └── quant_utils.py
    └── ktransformers_ext
    │   ├── CMakeLists.txt
    │   ├── bench
    │       ├── bench_attention.py
    │       ├── bench_attention_torch.py
    │       ├── bench_linear.py
    │       ├── bench_linear_torch.py
    │       ├── bench_mlp.py
    │       ├── bench_mlp_torch.py
    │       ├── bench_moe.py
    │       ├── bench_moe_amx.py
    │       └── bench_moe_torch.py
    │   ├── cmake
    │       └── FindSIMD.cmake
    │   ├── cpu_backend
    │       ├── backend.cpp
    │       ├── backend.h
    │       ├── cpuinfer.h
    │       ├── shared_mem_buffer.cpp
    │       ├── shared_mem_buffer.h
    │       ├── task_queue.cpp
    │       ├── task_queue.h
    │       └── vendors
    │       │   ├── README.md
    │       │   ├── cuda.h
    │       │   ├── hip.h
    │       │   ├── musa.h
    │       │   └── vendor.h
    │   ├── cuda
    │       ├── binding.cpp
    │       ├── custom_gguf
    │       │   ├── dequant.cu
    │       │   └── ops.h
    │       ├── gptq_marlin
    │       │   ├── gptq_marlin.cu
    │       │   ├── gptq_marlin.cuh
    │       │   ├── gptq_marlin_dtypes.cuh
    │       │   └── ops.h
    │       ├── setup.py
    │       └── test_dequant.py
    │   ├── examples
    │       ├── test_attention.py
    │       ├── test_linear.py
    │       ├── test_mlp.py
    │       └── test_moe.py
    │   ├── ext_bindings.cpp
    │   ├── operators
    │       ├── amx
    │       │   ├── la
    │       │   │   ├── amx.hpp
    │       │   │   └── utils.hpp
    │       │   └── moe.hpp
    │       ├── kvcache
    │       │   ├── kvcache.h
    │       │   ├── kvcache_attn.cpp
    │       │   ├── kvcache_load_dump.cpp
    │       │   ├── kvcache_read_write.cpp
    │       │   └── kvcache_utils.cpp
    │       └── llamafile
    │       │   ├── conversion.h
    │       │   ├── linear.cpp
    │       │   ├── linear.h
    │       │   ├── mlp.cpp
    │       │   ├── mlp.h
    │       │   ├── moe.cpp
    │       │   └── moe.h
    │   └── vendors
    │       ├── cuda.h
    │       ├── hip.h
    │       ├── musa.h
    │       └── vendor.h
├── doc
    ├── README.md
    ├── SUMMARY.md
    ├── assets
    │   ├── BigCodeBench.png
    │   ├── DeepSeek-on-KTransformers.png
    │   ├── Framework_effect.png
    │   ├── InfLLM_equation.jpg
    │   ├── InfLLM_framework.png
    │   ├── InjectStruction.png
    │   ├── KTransformers.png
    │   ├── KTransformers_long_context_v1.png
    │   ├── KTransformers_long_context_v2.png
    │   ├── Quest_framework.png
    │   ├── SnapKV_framework.png
    │   ├── SparQ_attention.png
    │   ├── amx.png
    │   ├── amx_avx.png
    │   ├── amx_intro.png
    │   ├── cpuinfer.png
    │   ├── deepseekv2_structure.png
    │   ├── internlm_memory.png
    │   ├── long_context_generate.png
    │   ├── long_context_prefill.png
    │   ├── model_structure_guild.png
    │   ├── multi_gpu.png
    │   ├── needle_128K.png
    │   ├── needle_1M.png
    │   ├── onednn_1.png
    │   └── website.png
    ├── basic
    │   ├── note1.md
    │   └── note2.md
    ├── en
    │   ├── AMX.md
    │   ├── DeepseekR1_V3_tutorial.md
    │   ├── Docker.md
    │   ├── Docker_xpu.md
    │   ├── FAQ.md
    │   ├── ROCm.md
    │   ├── V3-success.md
    │   ├── api
    │   │   └── server
    │   │   │   ├── api.md
    │   │   │   ├── run-tabby.png
    │   │   │   ├── server-arch.png
    │   │   │   ├── server.md
    │   │   │   ├── tabby.md
    │   │   │   ├── visit-api-tags.png
    │   │   │   └── website.md
    │   ├── balance-serve.md
    │   ├── benchmark.md
    │   ├── deepseek-v2-injection.md
    │   ├── fp8_kernel.md
    │   ├── injection_tutorial.md
    │   ├── install.md
    │   ├── llama4.md
    │   ├── long_context_introduction.md
    │   ├── long_context_tutorial.md
    │   ├── makefile_usage.md
    │   ├── multi-gpu-tutorial.md
    │   ├── operators
    │   │   ├── Combined_MoE_time_per_layer.png
    │   │   ├── Linear_projection_time.png
    │   │   └── llamafile.md
    │   └── xpu.md
    └── zh
    │   ├── DeepseekR1_V3_tutorial_zh.md
    │   └── api
    │       └── server
    │           ├── api.md
    │           ├── run-tabby.png
    │           ├── server-arch.png
    │           ├── server.md
    │           ├── tabby.md
    │           ├── visit-api-tags.png
    │           └── website.md
├── install-with-cache.sh
├── install.bat
├── install.sh
├── ktransformers
    ├── __init__.py
    ├── configs
    │   ├── config.yaml
    │   └── log_config.ini
    ├── ktransformers_ext
    │   ├── operators
    │   │   └── custom_marlin
    │   │   │   └── quantize
    │   │   │       └── utils
    │   │   │           ├── __init__.py
    │   │   │           ├── format_24.py
    │   │   │           ├── marlin_24_perms.py
    │   │   │           ├── marlin_perms.py
    │   │   │           ├── marlin_utils.py
    │   │   │           └── quant_utils.py
    │   └── triton
    │   │   └── fp8gemm.py
    ├── local_chat.py
    ├── local_chat_test.py
    ├── models
    │   ├── __init__.py
    │   ├── configuration_deepseek.py
    │   ├── configuration_deepseek_v3.py
    │   ├── configuration_llama.py
    │   ├── configuration_qwen2_moe.py
    │   ├── configuration_qwen3_moe.py
    │   ├── custom_cache.py
    │   ├── custom_modeling_deepseek_v2.py
    │   ├── custom_modeling_deepseek_v3.py
    │   ├── custom_modeling_qwen2_moe.py
    │   ├── custom_modeling_qwen3_moe.py
    │   ├── modeling_deepseek.py
    │   ├── modeling_deepseek_v3.py
    │   ├── modeling_llama.py
    │   ├── modeling_mixtral.py
    │   ├── modeling_qwen2_moe.py
    │   └── modeling_qwen3_moe.py
    ├── operators
    │   ├── RoPE.py
    │   ├── __init__.py
    │   ├── attention.py
    │   ├── balance_serve_attention.py
    │   ├── base_operator.py
    │   ├── cpuinfer.py
    │   ├── dynamic_attention.py
    │   ├── experts.py
    │   ├── flashinfer_batch_prefill_wrapper.py
    │   ├── flashinfer_wrapper.py
    │   ├── gate.py
    │   ├── layernorm.py
    │   ├── linear.py
    │   ├── mlp.py
    │   ├── models.py
    │   ├── triton_attention.py
    │   └── triton_attention_prefill.py
    ├── optimize
    │   ├── optimize.py
    │   └── optimize_rules
    │   │   ├── DeepSeek-V2-Chat-multi-gpu-4.yaml
    │   │   ├── DeepSeek-V2-Chat-multi-gpu.yaml
    │   │   ├── DeepSeek-V2-Chat.yaml
    │   │   ├── DeepSeek-V2-Lite-Chat-gpu-cpu.yaml
    │   │   ├── DeepSeek-V2-Lite-Chat-multi-gpu.yaml
    │   │   ├── DeepSeek-V2-Lite-Chat.yaml
    │   │   ├── DeepSeek-V3-Chat-amx.yaml
    │   │   ├── DeepSeek-V3-Chat-fp8-linear-ggml-experts-serve-amx.yaml
    │   │   ├── DeepSeek-V3-Chat-fp8-linear-ggml-experts-serve.yaml
    │   │   ├── DeepSeek-V3-Chat-fp8-linear-ggml-experts.yaml
    │   │   ├── DeepSeek-V3-Chat-multi-gpu-4.yaml
    │   │   ├── DeepSeek-V3-Chat-multi-gpu-8.yaml
    │   │   ├── DeepSeek-V3-Chat-multi-gpu-fp8-linear-ggml-experts.yaml
    │   │   ├── DeepSeek-V3-Chat-multi-gpu-marlin.yaml
    │   │   ├── DeepSeek-V3-Chat-multi-gpu.yaml
    │   │   ├── DeepSeek-V3-Chat-serve.yaml
    │   │   ├── DeepSeek-V3-Chat.yaml
    │   │   ├── Internlm2_5-7b-Chat-1m.yaml
    │   │   ├── Mixtral.yaml
    │   │   ├── Moonlight-16B-A3B-serve.yaml
    │   │   ├── Moonlight-16B-A3B.yaml
    │   │   ├── Qwen2-57B-A14B-Instruct-multi-gpu.yaml
    │   │   ├── Qwen2-57B-A14B-Instruct.yaml
    │   │   ├── Qwen2-serve-amx.yaml
    │   │   ├── Qwen2-serve.yaml
    │   │   ├── Qwen3Moe-serve-amx.yaml
    │   │   ├── Qwen3Moe-serve.yaml
    │   │   ├── rocm
    │   │       └── DeepSeek-V3-Chat.yaml
    │   │   └── xpu
    │   │       ├── DeepSeek-V2-Chat.yaml
    │   │       ├── DeepSeek-V3-Chat.yaml
    │   │       └── Qwen3Moe-Chat.yaml
    ├── server
    │   ├── __init__.py
    │   ├── api
    │   │   ├── __init__.py
    │   │   ├── ollama
    │   │   │   ├── __init__.py
    │   │   │   └── completions.py
    │   │   ├── openai
    │   │   │   ├── __init__.py
    │   │   │   ├── assistants
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── assistants.py
    │   │   │   │   ├── messages.py
    │   │   │   │   ├── runs.py
    │   │   │   │   └── threads.py
    │   │   │   ├── endpoints
    │   │   │   │   ├── __init__.py
    │   │   │   │   └── chat.py
    │   │   │   └── legacy
    │   │   │   │   ├── __init__.py
    │   │   │   │   └── completions.py
    │   │   └── web
    │   │   │   ├── __init__.py
    │   │   │   └── system.py
    │   ├── args.py
    │   ├── backend
    │   │   ├── __init__.py
    │   │   ├── args.py
    │   │   ├── base.py
    │   │   ├── context_manager.py
    │   │   └── interfaces
    │   │   │   ├── __init__.py
    │   │   │   ├── balance_serve.py
    │   │   │   ├── exllamav2.py
    │   │   │   ├── ktransformers.py
    │   │   │   └── transformers.py
    │   ├── balance_serve
    │   │   ├── inference
    │   │   │   ├── __init__.py
    │   │   │   ├── config.py
    │   │   │   ├── distributed
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── communication_op.py
    │   │   │   │   ├── cuda_wrapper.py
    │   │   │   │   ├── custom_all_reduce.py
    │   │   │   │   ├── custom_all_reduce_utils.py
    │   │   │   │   ├── parallel_state.py
    │   │   │   │   ├── pynccl.py
    │   │   │   │   ├── pynccl_wrapper.py
    │   │   │   │   └── utils.py
    │   │   │   ├── forward_batch.py
    │   │   │   ├── model_runner.py
    │   │   │   ├── query_manager.py
    │   │   │   └── sampling
    │   │   │   │   ├── penaltylib
    │   │   │   │       ├── __init__.py
    │   │   │   │       ├── orchestrator.py
    │   │   │   │       └── penalizers
    │   │   │   │       │   ├── frequency_penalty.py
    │   │   │   │       │   ├── min_new_tokens.py
    │   │   │   │       │   ├── presence_penalty.py
    │   │   │   │       │   └── repetition_penalty.py
    │   │   │   │   └── sampler.py
    │   │   ├── sched_rpc.py
    │   │   └── settings.py
    │   ├── config
    │   │   ├── config.py
    │   │   ├── log.py
    │   │   └── singleton.py
    │   ├── crud
    │   │   ├── __init__.py
    │   │   └── assistants
    │   │   │   ├── __init__.py
    │   │   │   ├── assistants.py
    │   │   │   ├── messages.py
    │   │   │   ├── runs.py
    │   │   │   └── threads.py
    │   ├── exceptions.py
    │   ├── main.py
    │   ├── models
    │   │   ├── __init__.py
    │   │   └── assistants
    │   │   │   ├── __init__.py
    │   │   │   ├── assistants.py
    │   │   │   ├── messages.py
    │   │   │   ├── run_steps.py
    │   │   │   ├── runs.py
    │   │   │   └── threads.py
    │   ├── requirements.txt
    │   ├── schemas
    │   │   ├── __init__.py
    │   │   ├── assistants
    │   │   │   ├── __init__.py
    │   │   │   ├── assistants.py
    │   │   │   ├── messages.py
    │   │   │   ├── runs.py
    │   │   │   ├── streaming.py
    │   │   │   ├── threads.py
    │   │   │   └── tool.py
    │   │   ├── base.py
    │   │   ├── conversation.py
    │   │   ├── endpoints
    │   │   │   └── chat.py
    │   │   └── legacy
    │   │   │   ├── __init__.py
    │   │   │   └── completions.py
    │   └── utils
    │   │   ├── __init__.py
    │   │   ├── create_interface.py
    │   │   ├── multi_timer.py
    │   │   └── sql_utils.py
    ├── tests
    │   ├── .gitignore
    │   ├── AIME_2024
    │   │   ├── eval_api.py
    │   │   ├── evaluation.py
    │   │   └── prompts.py
    │   ├── dequant_gpu.py
    │   ├── dequant_gpu_t.py
    │   ├── function_call_test.py
    │   ├── humaneval
    │   │   ├── eval_api.py
    │   │   ├── evaluation.py
    │   │   └── prompts.py
    │   ├── mmlu_pro_test.py
    │   ├── mmlu_test.py
    │   ├── mmlu_test_multi.py
    │   ├── score.py
    │   ├── test_client.py
    │   ├── test_pytorch_q8.py
    │   ├── test_speed.py
    │   └── triton_fp8gemm_test.py
    ├── util
    │   ├── cuda_graph_runner.py
    │   ├── custom_gguf.py
    │   ├── custom_loader.py
    │   ├── modeling_rope_utils.py
    │   ├── textstream.py
    │   ├── utils.py
    │   ├── vendors.py
    │   └── weight_loader.py
    └── website
    │   ├── .browserslistrc
    │   ├── .eslintrc.js
    │   ├── .gitignore
    │   ├── README.md
    │   ├── config.d.ts
    │   ├── jest.config.js
    │   ├── package-lock.json
    │   ├── package.json
    │   ├── public
    │       ├── balck.ico
    │       ├── config.js
    │       ├── css
    │       │   └── reset.css
    │       ├── images
    │       │   ├── assistant-avatar.png
    │       │   ├── avatar.png
    │       │   ├── bgbg.png
    │       │   ├── logo.ico
    │       │   ├── logo.png
    │       │   ├── three.png
    │       │   └── user-filling.png
    │       └── index.html
    │   ├── src
    │       ├── App.vue
    │       ├── api
    │       │   ├── api-client.ts
    │       │   ├── assistant.ts
    │       │   ├── message.ts
    │       │   ├── run.ts
    │       │   └── thread.ts
    │       ├── assets
    │       │   ├── css
    │       │   │   └── mixins.styl
    │       │   └── iconfont
    │       │   │   ├── demo.css
    │       │   │   ├── demo_index.html
    │       │   │   ├── iconfont.css
    │       │   │   ├── iconfont.js
    │       │   │   ├── iconfont.json
    │       │   │   ├── iconfont.svg
    │       │   │   ├── iconfont.ttf
    │       │   │   ├── iconfont.woff
    │       │   │   └── iconfont.woff2
    │       ├── components
    │       │   └── chat
    │       │   │   └── index.vue
    │       ├── conf
    │       │   └── config.ts
    │       ├── locals
    │       │   ├── en.js
    │       │   ├── index.js
    │       │   └── zh.js
    │       ├── main.ts
    │       ├── router
    │       │   └── index.ts
    │       ├── shims-vue.d.ts
    │       ├── store
    │       │   └── index.ts
    │       ├── utils
    │       │   ├── copy.ts
    │       │   └── types.ts
    │       └── views
    │       │   └── home.vue
    │   ├── tests
    │       └── unit
    │       │   └── example.spec.ts
    │   ├── tsconfig.json
    │   └── vue.config.js
├── merge_tensors
    └── merge_safetensor_gguf.py
├── pyproject.toml
├── requirements-local_chat.txt
├── setup.py
└── third_party
    ├── llamafile
        ├── README.md
        ├── bench.h
        ├── flags.cpp
        ├── flags.h
        ├── iqk_mul_mat.inc
        ├── iqk_mul_mat_amd_avx2.cpp
        ├── iqk_mul_mat_amd_zen4.cpp
        ├── iqk_mul_mat_arm82.cpp
        ├── macros.h
        ├── micros.h
        ├── numba.h
        ├── sgemm.cpp
        ├── sgemm.h
        ├── tinyblas_cpu.h
        ├── tinyblas_cpu_mixmul.inc
        ├── tinyblas_cpu_mixmul_amd_avx.cpp
        ├── tinyblas_cpu_mixmul_amd_avx2.cpp
        ├── tinyblas_cpu_mixmul_amd_avx512f.cpp
        ├── tinyblas_cpu_mixmul_amd_avxvnni.cpp
        ├── tinyblas_cpu_mixmul_amd_fma.cpp
        ├── tinyblas_cpu_mixmul_amd_zen4.cpp
        ├── tinyblas_cpu_mixmul_arm80.cpp
        ├── tinyblas_cpu_mixmul_arm82.cpp
        ├── tinyblas_cpu_sgemm.inc
        ├── tinyblas_cpu_sgemm_amd_avx.cpp
        ├── tinyblas_cpu_sgemm_amd_avx2.cpp
        ├── tinyblas_cpu_sgemm_amd_avx512f.cpp
        ├── tinyblas_cpu_sgemm_amd_avxvnni.cpp
        ├── tinyblas_cpu_sgemm_amd_fma.cpp
        ├── tinyblas_cpu_sgemm_amd_zen4.cpp
        ├── tinyblas_cpu_sgemm_arm80.cpp
        ├── tinyblas_cpu_sgemm_arm82.cpp
        └── tinyblas_cpu_unsupported.cpp
    └── nlohmann
        ├── json.hpp
        └── json_fwd.hpp


/.devcontainer/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Dev-container image: PyTorch 2.5.1 / CUDA 12.1 / cuDNN 9 devel base plus native build tools.
 2 | FROM pytorch/pytorch:2.5.1-cuda12.1-cudnn9-devel AS compile_server
 3 | WORKDIR /workspace
 4 | ENV CUDA_HOME=/usr/local/cuda
 5 | # Single layer: install the system toolchain, clear the apt lists, install the Python
 6 | # build helpers and flash-attn, then copy the system libstdc++ into conda's lib directory.
 7 | RUN <<EOF
 8 | apt-get update -y && apt-get install -y --no-install-recommends \
 9 |     git \
10 |     wget \
11 |     vim \
12 |     gcc \
13 |     g++ \
14 |     cmake &&
15 | rm -rf /var/lib/apt/lists/* &&
16 | pip install --upgrade pip &&
17 | pip install ninja pyproject numpy cpufeature &&
18 | pip install flash-attn &&
19 | cp /usr/lib/x86_64-linux-gnu/libstdc++.so.6 /opt/conda/lib/
20 | EOF
21 | # Start an interactive bash shell by default
22 | CMD ["/bin/bash"]
23 | 


--------------------------------------------------------------------------------
/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "Ktrans Dev Container",
 3 |     "privileged": true,
 4 |     "build": {
 5 |         "dockerfile": "Dockerfile",
 6 |         "context": "..",
 7 |         "args": { // forward the host's proxy variables to the image build
 8 |             "http_proxy": "${env:http_proxy}",
 9 |             "https_proxy": "${env:https_proxy}"
10 |         }
11 |     },
12 |     "runArgs": [
13 |         "--network=host",
14 |         "--gpus",
15 |         "all"
16 |         // "--gpu all"
17 |     ],
18 |     "workspaceFolder": "/workspace",
19 |     "workspaceMount": "source=${localWorkspaceFolder},target=/workspace,type=bind,consistency=cached",
20 |     "mounts": [
21 |         "source=/mnt/data,target=/mnt/incontainer,type=bind,consistency=cached" // host /mnt/data appears at /mnt/incontainer
22 |     ],
23 |     "customizations": {
24 |         "vscode": {
25 |             "extensions": [
26 |             ],
27 |             "settings": {
28 |                 "terminal.integrated.defaultProfile.linux": "bash", // replaces the removed terminal.integrated.shell.linux setting
29 |                 "cmake.configureOnOpen": true,
30 |                 "cmake.generator": "Ninja"
31 |             }
32 |         }
33 |     }
34 | }


--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 120
3 | extend-select = B950
4 | extend-ignore = E203,E501,E701, B001,B006,B007,B008,B009,B010,B011,B016,B028,B031,B950,E265,E266,E401,E402,E711,E712,E713,E721,E722,E731,F401,F403,F405,F541,F811,F821,F841,W391
5 | # NOTE(review): B950 is listed in both extend-select and extend-ignore above, and E501 is
6 | # ignored too, so line-length checking may be effectively disabled. Confirm which of the
7 | # two B950 entries is intended and drop the other.


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/-bug-.yaml:
--------------------------------------------------------------------------------
 1 | name: 🐞 Bug report
 2 | description: Create a report to help us reproduce and fix the bug
 3 | title: "[Bug] "
 4 | labels: ['Bug']
 5 | 
 6 | body:
 7 | - type: checkboxes
 8 |   attributes:
 9 |     label: Checklist
10 |     options:
11 |     - label: 1. I have searched related issues but cannot get the expected help.
12 |     - label: 2. The bug has not been fixed in the latest version.
13 |     - label: 3. Please note that if the bug-related issue you submitted lacks corresponding environment info and a minimal reproducible demo, it will be challenging for us to reproduce and resolve the issue, reducing the likelihood of receiving feedback.
14 |     - label: 4. If the issue you raised is not a bug but a question, please raise a discussion at https://github.com/kvcache-ai/ktransformers/discussions. Otherwise, it will be closed.
15 |     - label: 5. To help the community, I will use Chinese/English or attach a Chinese/English translation if using another language. Non-Chinese/English content without translation may be closed.
16 | 
17 | - type: textarea
18 |   attributes:
19 |     label: Describe the bug
20 |     description: A clear and concise description of what the bug is.
21 |   validations:
22 |     required: true
23 | - type: textarea
24 |   attributes:
25 |     label: Reproduction
26 |     description: |
27 |       What command or script did you run? Which **model** are you using?
28 |     placeholder: |
29 |       A placeholder for the command.
30 |   validations:
31 |     required: true
32 | - type: textarea
33 |   attributes:
34 |     label: Environment
35 |     description: |
36 |       Please provide necessary environment information here (e.g. OS/GPU/CPU). Otherwise the issue will be closed.
37 |     placeholder: Environment here.
38 |   validations:
39 |     required: true


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/-bug2-.yaml:
--------------------------------------------------------------------------------
 1 | name: 🐞 BUG报告
 2 | description: 创建报告以帮助我们复现并修复BUG
 3 | title: "[Bug] "
 4 | labels: ['Bug']
 5 | 
 6 | body:
 7 | - type: checkboxes
 8 |   attributes:
 9 |     label: 检查清单
10 |     options:
11 |     - label: 1. 我已经搜索过相关问题，但未能获得预期的帮助
12 |     - label: 2. 该问题在最新版本中尚未修复
13 |     - label: 3. 请注意，如果您提交的BUG相关 issue 缺少对应环境信息和最小可复现示例，我们将难以复现和定位问题，降低获得反馈的可能性
14 |     - label: 4. 如果您提出的不是bug而是问题，请在讨论区发起讨论 https://github.com/kvcache-ai/ktransformers/discussions。否则该 issue 将被关闭
15 |     - label: 5. 为方便社区交流，我将使用中文/英文或附上中文/英文翻译（如使用其他语言）。未附带翻译的非中文/英语内容可能会被关闭
16 | 
17 | - type: textarea
18 |   attributes:
19 |     label: 问题描述
20 |     description: 清晰简洁地描述BUG是什么
21 |   validations:
22 |     required: true
23 | - type: textarea
24 |   attributes:
25 |     label: 复现步骤
26 |     description: |
27 |       你运行了什么命令或脚本？使用的是哪个**模型**？
28 |     placeholder: |
29 |       在此处填写命令
30 |   validations:
31 |     required: true
32 | - type: textarea
33 |   attributes:
34 |     label: 环境信息
35 |     description: |
36 |       请提供必要的环境信息（如操作系统/GPU/CPU），否则该 issue 将被关闭
37 |     placeholder: 在此处填写环境信息
38 |   validations:
39 |     required: true


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/-feature-.yaml:
--------------------------------------------------------------------------------
 1 | name: 🚀 Feature request
 2 | description: Suggest an idea for this project
 3 | title: "[Feature] "
 4 | 
 5 | body:
 6 | - type: checkboxes
 7 |   attributes:
 8 |     label: Checklist
 9 |     options:
10 |     - label: 1. If the issue you raised is not a feature but a question, please raise a discussion at https://github.com/kvcache-ai/ktransformers/discussions. Otherwise, it will be closed.
11 |     - label: 2. To help the community, I will use Chinese/English or attach a Chinese/English translation if using another language. Non-English/Chinese content without translation may be closed.
12 | - type: textarea
13 |   attributes:
14 |     label: Motivation
15 |     description: |
16 |       A clear and concise description of the motivation of the feature.
17 |   validations:
18 |     required: true
19 | - type: textarea
20 |   attributes:
21 |     label: Related resources
22 |     description: |
23 |       If there is an official code release or third-party implementations, please also provide the information here, which would be very helpful.


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/-feature2-.yaml:
--------------------------------------------------------------------------------
 1 | name: 🚀 新功能请求
 2 | description: 为项目提出新功能建议
 3 | title: "[Feature] "
 4 | 
 5 | body:
 6 | - type: checkboxes
 7 |   attributes:
 8 |     label: 检查清单
 9 |     options:
10 |     - label: 1. 如果您提出的不是新功能而是问题，请在讨论区发起讨论 https://github.com/kvcache-ai/ktransformers/discussions。否则该 issue 将被关闭
11 |     - label: 2. 为方便社区交流，我将使用中文/英文或附上英文/中文翻译（如使用其他语言）。未附带翻译的非英文/中文内容可能会被关闭
12 | - type: textarea
13 |   attributes:
14 |     label: 需求背景
15 |     description: |
16 |       清晰简洁地描述该功能的背景需求
17 |   validations:
18 |     required: true
19 | - type: textarea
20 |   attributes:
21 |     label: 相关资源
22 |     description: |
23 |       如果有官方代码实现或第三方实现，请在此提供相关信息，这将非常有帮助


--------------------------------------------------------------------------------
/.github/workflows/book-ci.yml:
--------------------------------------------------------------------------------
 1 | name: Book-CI
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - main
 7 |       # - server_support
 8 | 
 9 |   pull_request:
10 |     branches:
11 |       - main
12 |       # - server_support
13 | jobs:
14 |   test:
15 |     name: test
16 |     runs-on: ${{ matrix.os }}
17 |     strategy:
18 |       matrix:
19 |         os: [ubuntu-latest, macos-latest, windows-latest]
20 |     steps:
21 |       - uses: actions/checkout@v4
22 |       - name: Install Rust
23 |         run: |
24 |           rustup set profile minimal
25 |           rustup toolchain install stable
26 |           rustup default stable
27 |       - name: Setup mdBook
28 |         uses: peaceiris/actions-mdbook@v2
29 |         with:
30 |           mdbook-version: "latest"
31 |       # - name: Run tests
32 |       #   run: mdbook test


--------------------------------------------------------------------------------
/.github/workflows/deploy.yml:
--------------------------------------------------------------------------------
 1 | name: Deploy
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - main
 7 |       # - server_support
 8 | 
 9 |   pull_request:
10 |     branches:
11 |       - main
12 |       # - server_support
13 | 
14 | defaults:
15 |   run:
16 |     shell: bash
17 | 
18 | permissions:
19 |   contents: write
20 | 
21 | jobs:
22 |   deploy:
23 |     runs-on: ${{ matrix.os }}
24 |     strategy:
25 |       matrix:
26 |         os: [ubuntu-latest, macos-latest, windows-latest]
27 |     steps:
28 |       - uses: actions/checkout@v4
29 |       - name: Install Rust
30 |         run: |
31 |           rustup set profile minimal
32 |           rustup toolchain install stable
33 |           rustup default stable
34 |       - name: Setup mdBook
35 |         uses: peaceiris/actions-mdbook@v2
36 |         with:
37 |           mdbook-version: "latest"
38 |       - run: mdbook build
39 |       # - name: Copy Assets
40 |       #   run: |
41 |       #     chmod +x ci/copy-assets.sh
42 |       #     ci/copy-assets.sh ${{ matrix.os }}
43 |       - name: Deploy
44 |         uses: peaceiris/actions-gh-pages@v3
45 |         # Publish only from main (server_support could be OR-ed in), and only from one matrix job so the three OS builds do not race on the publish branch
46 |         if: ${{ github.ref == 'refs/heads/main' && matrix.os == 'ubuntu-latest' }}
47 |         with:
48 |           github_token: ${{ secrets.GITHUB_TOKEN }}
49 |           publish_dir: ./book


--------------------------------------------------------------------------------
/.github/workflows/score.yml:
--------------------------------------------------------------------------------
 1 | name: Human Eval Score
 2 | run-name: Human Eval Score
 3 | on: workflow_dispatch
 4 | jobs:
 5 |   Human-Eval-Score:
 6 |     runs-on: self-hosted
 7 |     steps:
 8 |       - run: echo "🎉 The job was automatically triggered by a ${{ github.event_name }} event."
 9 |       - run: echo "🔎 The name of your branch is ${{ github.ref }} and your repository is ${{ github.repository }}."
10 |       - name: Check out repository code
11 |         uses: actions/checkout@v4
12 |       - run: echo "💡 The ${{ github.repository }} repository has been cloned to the runner."
13 |       - name: Human Eval Run
14 |         run: |
15 |           set -e
16 |           source /home/qujing3/anaconda3/etc/profile.d/conda.sh
17 |           conda activate ktransformers-dev
18 |           export PATH=/usr/local/cuda-12.4/bin:$PATH
19 |           export LD_LIBRARY_PATH=/usr/local/cuda-12.4/lib64:$LD_LIBRARY_PATH
20 |           export CUDA_HOME=/usr/local/cuda-12.4
21 |           cd ${{ github.workspace }}
22 |           python ktransformers/tests/score.py
23 | 
24 |       - run: echo "This job's status is ${{ job.status }}."
25 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | __pycache__
 2 | build
 3 | .vscode
 4 | *.so
 5 | *.cache
 6 | server.db
 7 | logs
 8 | node_modules
 9 | *.nsys-rep
10 | .vs/
11 | *pycache*
12 | *build/
13 | */third_party/*
14 | .DS_Store
15 | compile_commands.json
16 | *.egg-info*
17 | *dist/
18 | ktransformers/server/local_store/
19 | ktransformers/server_test1.db
20 | *.patch
21 | img/
22 | tmp*.txt
23 | test.txt
24 | book
25 | ktransformers/tests/chat_txt.txt
26 | mmlu_result*
27 | ktransformers/ktransformers_ext/cuda_musa/
28 | test_prompt.txt
29 | csrc/demo


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
 1 | [submodule "third_party/llama.cpp"]
 2 | 	path = third_party/llama.cpp
 3 | 	url = https://github.com/ggerganov/llama.cpp.git
 4 | [submodule "third_party/pybind11"]
 5 | 	path = third_party/pybind11
 6 | 	url = https://github.com/pybind/pybind11.git
 7 | [submodule "third_party/spdlog"]
 8 | 	path = third_party/spdlog
 9 | 	url = https://github.com/gabime/spdlog.git
10 | [submodule "third_party/custom_flashinfer"]
11 | 	path = third_party/custom_flashinfer
12 | 	url = https://github.com/kvcache-ai/custom_flashinfer.git
13 | 	branch = fix-precision-mla-merge-main
14 | [submodule "third_party/xxHash"]
15 | 	path = third_party/xxHash
16 | 	url = https://github.com/Cyan4973/xxHash.git
17 | [submodule "third_party/prometheus-cpp"]
18 | 	path = third_party/prometheus-cpp
19 | 	url = https://github.com/jupp0r/prometheus-cpp
20 | 


--------------------------------------------------------------------------------
/.pylintrc:
--------------------------------------------------------------------------------
1 | [MASTER]
2 | extension-pkg-whitelist=pydantic
3 | max-line-length=120
4 | 
5 | [MESSAGES CONTROL]
6 | disable=missing-function-docstring


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM pytorch/pytorch:2.5.1-cuda12.1-cudnn9-devel AS compile_server
 2 | 
 3 | 
 4 | ARG CPU_INSTRUCT=NATIVE
 5 | 
 6 | # Set the working directory and the CUDA path
 7 | WORKDIR /workspace
 8 | ENV CUDA_HOME=/usr/local/cuda
 9 | 
10 | 
11 | 
12 | # Install system dependencies. Updating the index, installing and removing the
13 | # apt lists happen in one RUN so a cached, stale index layer is never reused
14 | # and the lists are not stored in an earlier image layer.
15 | RUN apt-get update -y && \
16 |     apt-get install -y --no-install-recommends \
17 |     libtbb-dev \
18 |     libssl-dev \
19 |     libcurl4-openssl-dev \
20 |     libaio1 \
21 |     libaio-dev \
22 |     libfmt-dev \
23 |     libgflags-dev \
24 |     zlib1g-dev \
25 |     patchelf \
26 |     git \
27 |     wget \
28 |     vim \
29 |     gcc \
30 |     g++ \
31 |     cmake && \
32 |     rm -rf /var/lib/apt/lists/*
33 | 
34 | # Fetch the source code
35 | RUN git clone https://github.com/kvcache-ai/ktransformers.git
36 | 
37 | # Enter the project directory
38 | WORKDIR /workspace/ktransformers
39 | # Initialize the submodules
40 | RUN git submodule update --init --recursive
41 | 
42 | # Upgrade pip
43 | RUN pip install --upgrade pip
44 | 
45 | # Install build dependencies
46 | RUN pip install ninja pyproject numpy cpufeature aiohttp zmq openai
47 | 
48 | # Install flash-attn (installing it early avoids some later build-dependency errors)
49 | RUN pip install flash-attn
50 | 
51 | # Install ktransformers itself (includes compilation)
52 | RUN CPU_INSTRUCT=${CPU_INSTRUCT} \
53 |     USE_BALANCE_SERVE=1 \
54 |     KTRANSFORMERS_FORCE_BUILD=TRUE \
55 |     TORCH_CUDA_ARCH_LIST="8.0;8.6;8.7;8.9;9.0+PTX" \
56 |     pip install . --no-build-isolation --verbose
57 | 
58 | RUN pip install third_party/custom_flashinfer/
59 | # Purge the pip cache
60 | RUN pip cache purge
61 | 
62 | # Copy the C++ runtime library
63 | RUN cp /usr/lib/x86_64-linux-gnu/libstdc++.so.6 /opt/conda/lib/
64 | 
65 | # Keep the container running (for debugging)
66 | ENTRYPOINT ["tail", "-f", "/dev/null"]


--------------------------------------------------------------------------------
/Dockerfile.xpu:
--------------------------------------------------------------------------------
 1 | # Base image
 2 | FROM intel/oneapi-basekit:2025.0.1-0-devel-ubuntu22.04
 3 | 
 4 | ARG http_proxy
 5 | ARG https_proxy
 6 | 
 7 | ENV DEBIAN_FRONTEND=noninteractive
 8 | ENV CONDA_DIR=/opt/conda
 9 | 
10 | # Install dependencies
11 | RUN apt-get update && apt-get install -y \
12 |     wget \
13 |     curl \
14 |     bash \
15 |     git \
16 |     vim \
17 |     ca-certificates \
18 |     binutils \
19 |     cmake \
20 |     g++ \
21 |     && rm -rf /var/lib/apt/lists/*
22 | 
23 | # Install Miniforge
24 | RUN wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh -O /tmp/miniforge.sh && \
25 |     bash /tmp/miniforge.sh -b -p $CONDA_DIR && \
26 |     rm /tmp/miniforge.sh && \
27 |     $CONDA_DIR/bin/conda clean -afy
28 | 
29 | # Add conda to PATH
30 | ENV PATH=$CONDA_DIR/bin:$PATH
31 | 
32 | RUN bash -c "\
33 |     source /opt/conda/etc/profile.d/conda.sh && \
34 |     conda create --name ktransformers python=3.11 -y && \
35 |     conda activate ktransformers && \
36 |     conda env list && \
37 |     conda install -c conda-forge libstdcxx-ng -y && \
38 |     strings \$(find /opt/conda/envs/ktransformers/lib -name 'libstdc++.so.6') | grep GLIBCXX | grep 3.4.32 \
39 | "
40 | 
41 | RUN bash -c "\
42 |     source /opt/conda/etc/profile.d/conda.sh && \
43 |     conda activate ktransformers && \
44 |     pip install ipex-llm[xpu_2.6]==2.3.0b20250518 --extra-index-url https://download.pytorch.org/whl/xpu && \
45 |     pip uninstall -y torch torchvision torchaudio && \
46 |     pip install torch==2.7+xpu torchvision torchaudio --index-url https://download.pytorch.org/whl/test/xpu && \
47 |     pip uninstall -y intel-opencl-rt dpcpp-cpp-rt && \
48 |     pip list \
49 | "
50 | 
51 | # Clone and set up ktransformers repo
52 | RUN bash -c "\
53 |     source $CONDA_DIR/etc/profile.d/conda.sh && \
54 |     conda activate ktransformers && \
55 |     git clone https://github.com/kvcache-ai/ktransformers.git && \
56 |     cd ktransformers && \
57 |     git submodule update --init && \
58 |     sed -i 's/torch\.xpu\.is_available()/True/g' setup.py && \
59 |     bash install.sh --dev xpu \
60 | "
61 | 
62 | # Init conda and prepare bashrc
63 | RUN conda init bash && \
64 |     echo "source $CONDA_DIR/etc/profile.d/conda.sh" >> ~/.bashrc && \
65 |     echo "conda activate ktransformers" >> ~/.bashrc
66 | 
67 | WORKDIR /ktransformers/
68 | CMD ["bash"]
69 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
 1 | graft third_party
 2 | graft ktransformers
 3 | graft local_chat.py
 4 | graft csrc
 5 | include LICENSE README.md
 6 | prune ktransformers/website
 7 | prune ktransformers/logs
 8 | prune ktransformers.egg-info
 9 | prune third_party/llama.cpp/models
10 | graft ktransformers/website/dist
11 | global-exclude __pycache__
12 | include KTransformersOps.*.so
13 | include cpuinfer_ext.*.so
14 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # Every target here is a command, not a file to be built.
 2 | .PHONY: flake_find format dev_install clean install_numa install_no_numa
 3 | 
 4 | # Print the distinct flake8 error codes reported for the package, comma-separated.
 5 | flake_find:
 6 | 	cd ktransformers && flake8 | grep -Eo '[A-Z][0-9]{3}' | sort | uniq| paste -sd ',' - 
 7 | format:
 8 | 	@cd ktransformers && black .
 9 | 	@black setup.py
10 | # Clear the build directories (via clean), then do an editable install.
11 | dev_install: clean
12 | 	echo "Installing python dependencies from requirements.txt"
13 | 	pip install -r requirements-local_chat.txt
14 | 
15 | 	echo "Installing ktransformers"
16 | 	KTRANSFORMERS_FORCE_BUILD=TRUE pip install -e . -v --no-build-isolation
17 | 	echo "Installation completed successfully"
18 | clean:
19 | 	rm -rf build
20 | 	rm -rf *.egg-info
21 | 	rm -rf ktransformers/ktransformers_ext/build
22 | 	rm -rf ktransformers/ktransformers_ext/cuda/build
23 | 	rm -rf ktransformers/ktransformers_ext/cuda/dist
24 | 	rm -rf ktransformers/ktransformers_ext/cuda/*.egg-info
25 | # Recursive invocations go through $(MAKE) so command-line flags are inherited.
26 | install_numa:
27 | 	USE_NUMA=1 $(MAKE) dev_install
28 | install_no_numa:
29 | 	env -u USE_NUMA $(MAKE) dev_install


--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
 1 | # Security Policy
 2 | 
 3 | ## Supported Versions
 4 | 
 5 | Use this section to tell people about which versions of your project are
 6 | currently being supported with security updates.
 7 | 
 8 | | Version | Supported          |
 9 | | ------- | ------------------ |
10 | | 5.1.x   | :white_check_mark: |
11 | | 5.0.x   | :x:                |
12 | | 4.0.x   | :white_check_mark: |
13 | | < 4.0   | :x:                |
14 | 
15 | ## Reporting a Vulnerability
16 | 
17 | Use this section to tell people how to report a vulnerability.
18 | 
19 | Tell them where to go, how often they can expect to get an update on a
20 | reported vulnerability, what to expect if the vulnerability is accepted or
21 | declined, etc.
22 | 


--------------------------------------------------------------------------------
/WeChatGroup.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/WeChatGroup.png


--------------------------------------------------------------------------------
/book.toml:
--------------------------------------------------------------------------------
 1 | [book]
 2 | authors = ["kvcache-ai"]
 3 | language = "zh-CN"
 4 | title = "Ktransformers"
 5 | src = "doc"
 6 | 
 7 | [output.html]
 8 | git-repository-url = "https://github.com/kvcache-ai/ktransformers"
 9 | edit-url-template = "https://github.com/kvcache-ai/ktransformers/edit/main/{path}"
10 | 
11 | [output.html.playground]
12 | editable = true
13 | copy-js = true
14 | # line-numbers = true
15 | 
16 | [output.html.fold]
17 | enable = true
18 | level = 0


--------------------------------------------------------------------------------
/csrc/balance_serve/kvc2/.clang-format:
--------------------------------------------------------------------------------
 1 | Language:        Cpp
 2 | # Base formatting style: LLVM, Google, Chromium, Mozilla, WebKit, etc., or a custom one
 3 | BasedOnStyle:  Google
 4 | 
 5 | # Indentation settings
 6 | IndentWidth:        2
 7 | TabWidth:           2
 8 | UseTab:             Never
 9 | 
10 | # Line-break settings
11 | BreakBeforeBraces: Attach
12 | AllowShortIfStatementsOnASingleLine: false
13 | AllowShortFunctionsOnASingleLine: Inline
14 | AllowShortLoopsOnASingleLine: false
15 | 
16 | # Classes and structs
17 | DerivePointerAlignment: false
18 | PointerAlignment: Left
19 | 
20 | # Sorting and grouping of includes
21 | IncludeBlocks:   Preserve
22 | SortIncludes:    true
23 | 
24 | # Maximum line width
25 | ColumnLimit:     120
26 | 


--------------------------------------------------------------------------------
/csrc/balance_serve/kvc2/README.md:
--------------------------------------------------------------------------------
 1 | # KVC2
 2 | 
 3 | # Build
 4 | 运行以下命令编译kvc2，注意可能需要 sudo 权限安装一些依赖
 5 | ```shell
 6 | git clone https://github.com/kvcache-ai/kvc2
 7 | cd kvc2
 8 | ./install_deps.sh
 9 | mkdir build
10 | cd build
11 | cmake ..
12 | make -j && make install
13 | ```
14 | 编译完成后会生成`build/output`，包含`kvc2_ext.cpython-312-x86_64-linux-gnu.so`和`kvc2_utils.py`方便调用。
15 | 
16 | <!-- # Test
17 | 运行以下命令测试kvc2，需要指定一个 disk path 作为测试目录。
18 | ```shell
19 | ./unit_test.sh ${DISK_PATH}
20 | ```
21 | 或者运行 python 的测试文件
22 | ```shell
23 | python test/pytest_mem_read.py 
24 | ``` -->
25 | 
26 | # Troubleshooting
27 | 在 Python 环境运行时，可能需要在 conda 中安装相关的依赖。
28 | ```shell
29 | conda install -c conda-forge gcc_linux-64 gxx_linux-64
30 | ```
31 | 
32 | 也可以尝试设置一下环境变量，然后再运行。
33 | ```shell
34 | export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
35 | export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libffi.so.7 
36 | ```
37 | 
38 | 
39 | 


--------------------------------------------------------------------------------
/csrc/balance_serve/kvc2/config/model_configs.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "DeepSeek-Coder-V2-Instruct": {
 3 |         "hidden_size": 5120,
 4 |         "intermediate_size": 12288,
 5 |         "max_position_embeddings": 163840,
 6 |         "model_type": "deepseek_v2",
 7 |         "num_attention_heads": 128,
 8 |         "num_hidden_layers": 60,
 9 |         "num_key_value_heads": 128,
10 |         "vocab_size": 102400
11 |     },
12 |     "LLaMA-2-7B-32K": {
13 |         "hidden_size": 4096,
14 |         "intermediate_size": 11008,
15 |         "max_position_embeddings": 32768,
16 |         "model_type": "llama",
17 |         "num_attention_heads": 32,
18 |         "num_hidden_layers": 32,
19 |         "num_key_value_heads": 32,
20 |         "vocab_size": 32000
21 |     },
22 |     "Qwen2.5-7B-Instruct": {
23 |         "hidden_size": 3584,
24 |         "intermediate_size": 18944,
25 |         "max_position_embeddings": 32768,
26 |         "model_type": "qwen2",
27 |         "num_attention_heads": 28,
28 |         "num_hidden_layers": 28,
29 |         "num_key_value_heads": 4,
30 |         "vocab_size": 152064
31 |     },
32 |     "qwen2-72b-instruct": {
33 |         "hidden_size": 8192,
34 |         "intermediate_size": 29568,
35 |         "max_position_embeddings": 32768,
36 |         "model_type": "qwen2",
37 |         "num_attention_heads": 64,
38 |         "num_hidden_layers": 80,
39 |         "num_key_value_heads": 8,
40 |         "vocab_size": 152064
41 |     }
42 | }


--------------------------------------------------------------------------------
/csrc/balance_serve/kvc2/config/quant_configs.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "BF16": {
 3 |         "block_element_count": 1,
 4 |         "block_element_size": 2,
 5 |         "bytes_per_element": 2.0,
 6 |         "can_be_used_as_vector": true,
 7 |         "has_min": false,
 8 |         "has_scale": false,
 9 |         "name": "BF16",
10 |         "reference": "",
11 |         "type_of_dot_vector": "BF16"
12 |     },
13 |     "FP16": {
14 |         "block_element_count": 1,
15 |         "block_element_size": 2,
16 |         "bytes_per_element": 2.0,
17 |         "can_be_used_as_vector": true,
18 |         "has_min": false,
19 |         "has_scale": false,
20 |         "name": "FP16",
21 |         "reference": "",
22 |         "type_of_dot_vector": "FP16"
23 |     },
24 |     "FP32": {
25 |         "block_element_count": 1,
26 |         "block_element_size": 4,
27 |         "bytes_per_element": 4.0,
28 |         "can_be_used_as_vector": true,
29 |         "has_min": false,
30 |         "has_scale": false,
31 |         "name": "FP32",
32 |         "reference": "",
33 |         "type_of_dot_vector": "FP32"
34 |     },
35 |     "Q4_0": {
36 |         "block_element_count": 32,
37 |         "block_element_size": 18,
38 |         "bytes_per_element": 0.5625,
39 |         "can_be_used_as_vector": false,
40 |         "has_min": false,
41 |         "has_scale": true,
42 |         "name": "Q4_0",
43 |         "reference": "https://huggingface.co/docs/hub/gguf",
44 |         "type_of_dot_vector": "Q8_0"
45 |     },
46 |     "Q8_0": {
47 |         "block_element_count": 32,
48 |         "block_element_size": 34,
49 |         "bytes_per_element": 1.0625,
50 |         "can_be_used_as_vector": true,
51 |         "has_min": false,
52 |         "has_scale": true,
53 |         "name": "Q8_0",
54 |         "reference": "https://huggingface.co/docs/hub/gguf",
55 |         "type_of_dot_vector": "Q8_0"
56 |     }
57 | }


--------------------------------------------------------------------------------
/csrc/balance_serve/kvc2/export_envs_before_run.sh:
--------------------------------------------------------------------------------
1 | export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
2 | export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libffi.so.7 
3 | 


--------------------------------------------------------------------------------
/csrc/balance_serve/kvc2/install_deps.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Install the system packages and the xxHash submodule used by this directory.
 3 | # Needs sudo for apt and for `make install`.
 4 | set -euo pipefail
 5 | 
 6 | # Work from the script's own directory. dirname also handles `bash install_deps.sh`,
 7 | # where $0 has no slash and "${0%/*}" would expand to the file name itself.
 8 | cd "$(dirname -- "$0")"
 9 | git submodule update --init --recursive
10 | 
11 | sudo apt-get update
12 | # -y keeps the install non-interactive
13 | sudo apt-get install -y libtbb-dev libcurl4-openssl-dev libaio-dev
14 | 
15 | # Build and install xxHash from the checked-out submodule
16 | cd third_party/xxHash/
17 | make -j
18 | sudo make install
19 | cd ../..
20 | 
21 | 


--------------------------------------------------------------------------------
/csrc/balance_serve/kvc2/mkfs.sh:
--------------------------------------------------------------------------------
1 | sudo umount /mnt/xwy 
2 | sudo mkfs.xfs /dev/nvme0n1 -f
3 | sudo mount /dev/nvme0n1 /mnt/xwy
4 | sudo chown -R xwy /mnt/xwy/


--------------------------------------------------------------------------------
/csrc/balance_serve/kvc2/src/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | include_directories(${THIRD_PARTY_DIR}/asyncio/include)
 2 | 
 3 | add_library(kvc2_metrics STATIC metrics.cpp)
 4 | target_link_libraries(kvc2_metrics PUBLIC prometheus-cpp::pull)
 5 | 
 6 | add_library(page_aligned_memory_pool page_aligned_memory_pool.cpp)
 7 | target_include_directories(page_aligned_memory_pool PRIVATE ${THIRD_PARTY_DIR}/spdlog/include)
 8 | 
 9 | function(add_third_party_includes TARGET_NAME)
10 |     target_include_directories(${TARGET_NAME} PRIVATE
11 |         ${THIRD_PARTY_BUILD_DIR}/prometheus-cpp/core/include
12 |         ${THIRD_PARTY_BUILD_DIR}/prometheus-cpp/pull/include
13 |         ${THIRD_PARTY_DIR}/prometheus-cpp/core/include
14 |         ${THIRD_PARTY_DIR}/prometheus-cpp/pull/include
15 |         ${THIRD_PARTY_DIR}/spdlog/include
16 |     )
17 | endfunction()
18 | 
19 | 
20 | add_library(cache_entry cache_entry.cpp)
21 | add_third_party_includes(cache_entry)
22 | target_link_libraries(cache_entry PUBLIC gpu_cache)
23 | 
24 | add_library(gpu_cache gpu_cache.cpp)
25 | add_third_party_includes(gpu_cache)
26 | target_link_libraries(gpu_cache PUBLIC xxHash::xxhash ${TORCH_LIBRARIES} cuda_stream_manager)
27 | 
28 | add_library(kvc2 prefix.cpp)
29 | target_include_directories(kvc2 PRIVATE ${THIRD_PARTY_DIR}/nlohmann/single_include)
30 | add_third_party_includes(kvc2)
31 | target_link_libraries(kvc2 PUBLIC TBB::tbb xxHash::xxhash cache_entry cuda_stream_manager page_aligned_memory_pool ${TORCH_LIBRARIES} prometheus-cpp::pull kvc2_metrics)
32 | 
33 | message(STATUS "CMAKE_SOURCE_DIR: " ${CMAKE_SOURCE_DIR})
34 | add_library(async_store async_store.cpp)
35 | target_include_directories(async_store PRIVATE ${THIRD_PARTY_DIR}/nlohmann/single_include)
36 | target_include_directories(async_store PRIVATE ${THIRD_PARTY_DIR}/spdlog/include)
37 | target_link_libraries(async_store PUBLIC pthread)
38 | 
39 | 
40 | 
41 | add_library(cuda_stream_manager cuda_stream_manager.cpp)
42 | target_include_directories(cuda_stream_manager PUBLIC ${THIRD_PARTY_DIR}/nlohmann/single_include)
43 | target_include_directories(cuda_stream_manager PUBLIC ${THIRD_PARTY_DIR}/spdlog/include)
44 | target_include_directories(cuda_stream_manager  PUBLIC ${CUDAToolkit_INCLUDE_DIRS})
45 | target_link_libraries(cuda_stream_manager PUBLIC CUDA::cudart)
46 | 


--------------------------------------------------------------------------------
/csrc/balance_serve/kvc2/src/async_store.hh:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <cstddef>
 3 | #include <filesystem>
 4 | #include <memory>  // std::shared_ptr
 5 | #include <string>  // std::string
 6 | #include <thread>  // std::thread
 7 | 
 8 | #define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
 9 | #define FMT_HEADER_ONLY
10 | #include "spdlog/spdlog.h"
11 | 
12 | #include "io_helper.hpp"
13 | 
14 | namespace async_store {
15 | 
16 | // Opaque store handle; only used through pointers in this header.
17 | struct ArrayStore;
18 | 
19 | // Free functions operating on an ArrayStore handle.
20 | // NOTE(review): exact semantics live in async_store.cpp - confirm there.
21 | ArrayStore* create_or_open_store(size_t element_size, size_t size, std::filesystem::path data_path);
22 | void close_store(ArrayStore* store);
23 | size_t capacity(ArrayStore* store);
24 | void extend(ArrayStore* store, size_t to);
25 | 
26 | 
27 | 
28 | // One I/O request against a store. Every member has a default so that a
29 | // default-constructed request never carries indeterminate values.
30 | struct IORequest {
31 |   ArrayStore* store = nullptr;
32 |   bool write = false;
33 |   void* data = nullptr;
34 |   size_t index = 0;
35 | 
36 |   // for sync
37 |   bool need_promise = false;
38 |   BatchPromise* promise = nullptr;
39 | };
40 | 
41 | std::string request_to_string(IORequest* req);
42 | 
43 | struct IODealerImpl;
44 | struct IODealer {
45 |   IODealerImpl* io_impl;
46 | 
47 |   IODealer(bool use_io_uring = false, int IO_DEPTH = 128);
48 |   ~IODealer();
49 |   IODealer(const IODealer&) = delete;
50 |   IODealer& operator=(const IODealer&) = delete;
51 |   // NOTE(review): the defaulted moves copy io_impl without clearing the source;
52 |   // if ~IODealer() releases io_impl, a moved-from object would release it again - confirm.
53 |   IODealer(IODealer&&) = default;
54 |   IODealer& operator=(IODealer&&) = default;
55 | 
56 |   void enqueue(std::shared_ptr<IORequest> req);
57 |   std::thread start_io_thread();
58 |   void stop();
59 | };
60 | 
61 | }  // namespace async_store
62 | 


--------------------------------------------------------------------------------
/csrc/balance_serve/kvc2/src/common.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/csrc/balance_serve/kvc2/src/common.h


--------------------------------------------------------------------------------
/csrc/balance_serve/kvc2/src/cuda_stream_manager.hh:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * @Author: Xie Weiyu ervinxie@qq.com
 3 |  * @Date: 2024-11-19 09:24:47
 4 |  * @LastEditors: Xie Weiyu ervinxie@qq.com
 5 |  * @LastEditTime: 2024-11-20 02:55:49
 6 |  * @FilePath: /kvc2/src/cuda_stream_manager.hh
 7 |  * @Description: 这是默认设置,请设置`customMade`, 打开koroFileHeader查看配置 进行设置: https://github.com/OBKoro1/koro1FileHeader/wiki/%E9%85%8D%E7%BD%AE
 8 |  */
 9 | #pragma once
10 | 
11 | #include <cuda_runtime.h>
12 | #include <atomic>
13 | #include <functional>
14 | #include <memory>
15 | #include <thread>
16 | #include <vector>
17 | #include "utils/mpsc.hpp"
18 | 
19 | class CudaStreamManager {
20 |  public:
21 |   // Constructor: takes the list of device IDs to use and the number of streams per device
22 |   CudaStreamManager(const std::vector<size_t>& device_ids, int num_streams_per_device);
23 |   ~CudaStreamManager();
24 | 
25 |   // Request structure
26 |   struct Request {
27 |     bool should_exit = false;
28 |     int device_id;
29 |     std::vector<void*> host_mem_addresses;
30 |     std::vector<void*> device_mem_addresses;
31 |     std::vector<size_t> sizes;
32 |     cudaMemcpyKind direction;
33 |     std::function<void()> callback;
34 |   };
35 | 
36 |   void submitRequest(std::shared_ptr<Request> request);
37 | 
38 |  private:
39 |   // Per-device information
40 |   struct DeviceInfo {
41 |     int device_id;
42 |     std::thread worker_thread;
43 |     std::vector<cudaStream_t> streams;
44 |     int next_stream_index;
45 |     MPSCQueueConsumerLock<std::shared_ptr<Request>> request_queue;
46 |     std::atomic_bool stop_flag;
47 |   };
48 | 
49 |   // Mapping from device ID to DeviceInfo
50 |   std::vector<std::unique_ptr<DeviceInfo>> devices_;
51 | 
52 |   // Private methods
53 |   void deviceWorker(DeviceInfo& device_info);
54 | };
55 | 


--------------------------------------------------------------------------------
/csrc/balance_serve/kvc2/src/defs.h:
--------------------------------------------------------------------------------
 1 | #ifndef __DEFS_H_
 2 | #define __DEFS_H_
 3 | 
 4 | #include <cstddef>     // size_t
 5 | #include <cstdint>
 6 | #include <filesystem>  // std::filesystem::path, used by CacheInfo::path
 7 | #include <optional>
 8 | #include <vector>
 9 | #include "model_config.h"
10 | 
11 | namespace kvc2 {
12 | using kvc2_ptr = void*;
13 | // using data_block_ptr = std::intptr_t;
14 | using data_block_ptr = void*;
15 | using layer_data = std::vector<data_block_ptr>;
16 | using kvc2_handle = void*;
17 | 
18 | using Token = uint32_t;
19 | using Tokens = std::vector<Token>;
20 | using TokenPtr = std::intptr_t;
21 | using TokenLength = size_t;
22 | using BlockLength = size_t;
23 | 
24 | // Describes one cache by model name, key/value flag and quantization type.
25 | // NOTE(review): member functions are defined elsewhere - confirm their semantics there.
26 | struct CacheInfo {
27 |   ModelName model_name;
28 |   bool is_key_cache;  // presumably false means a value cache - TODO confirm
29 |   QuantType quant_type;
30 | 
31 |   size_t hidden_layer_count();
32 |   std::filesystem::path path(std::optional<size_t> which_layer = std::nullopt);
33 |   bool operator==(const CacheInfo& other) const;
34 |   size_t element_size(size_t block_length);
35 |   size_t hash_value() const;
36 | };
37 | 
38 | }  // namespace kvc2
39 | #endif
40 | 


--------------------------------------------------------------------------------
/csrc/balance_serve/kvc2/src/hasher.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef __HASHER_HPP_
 2 | #define __HASHER_HPP_
 3 | 
 4 | #include "defs.h"
 5 | #include "xxhash.h"
 6 | 
 7 | namespace kvc2 {
 8 | 
 9 | const uint64_t hash_seed = 4123512;
10 | const uint64_t check_hash_seed = 1025753;
11 | 
12 | using TokensHash = XXH64_hash_t;
13 | struct TokensHasher {  // streaming XXH64 hasher; owns `state`, so it is neither copyable nor movable
14 |   XXH64_state_t* state;
15 |   TokensHasher() {
16 |     state = XXH64_createState();
17 |     reset();
18 |   }
19 |   ~TokensHasher() { XXH64_freeState(state); }
20 |   // The deleted copy operations take const references, the canonical copy signatures.
21 |   TokensHasher(const TokensHasher& other) = delete;
22 |   TokensHasher& operator=(const TokensHasher& other) = delete;
23 |   TokensHasher(TokensHasher&& other) = delete;
24 |   TokensHasher& operator=(TokensHasher&& other) = delete;
25 |   TokensHash get() const { return XXH64_digest(state); }  // digest of the current state
26 |   void reset(size_t seed = hash_seed) { XXH64_reset(state, seed); }
27 |   TokensHash update(const Token* data, TokenLength length) {  // feed `length` tokens, return the digest
28 |     XXH64_update(state, data, length * sizeof(Token));
29 |     return get();
30 |   }
31 |   // Same as update(), but `size` is a byte count over untyped memory.
32 |   TokensHash update_raw(const void* data, size_t size) {
33 |     XXH64_update(state, data, size);
34 |     return get();
35 |   }
36 |   // One-shot hash of `length` tokens with hash_seed; does not touch any hasher state.
37 |   static TokensHash hash(const Token* data, TokenLength length) { return XXH64(data, length * sizeof(Token), hash_seed); }
38 | };
39 | }  // namespace kvc2
40 | #endif


--------------------------------------------------------------------------------
/csrc/balance_serve/kvc2/src/kvc2_utils.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import ctypes
 3 | 
 4 | def aligned_tensor(size, alignment=4096):
 5 |     num_bytes = size 
 6 |     mem = ctypes.c_void_p()
 7 |     error_code = ctypes.CDLL(None).posix_memalign(
 8 |         ctypes.byref(mem), ctypes.c_size_t(alignment), ctypes.c_size_t(num_bytes)
 9 |     )
10 | 
11 |     if error_code != 0:
12 |         raise MemoryError(f"posix_memalign failed with error code {error_code}")
13 | 
14 |     array_type = (ctypes.c_int8 * size) 
15 |     raw_array = array_type.from_address(mem.value)
16 | 
17 |     tensor = torch.frombuffer(raw_array, dtype=torch.int8)
18 | 
19 |     if tensor.data_ptr() % alignment != 0:
20 |         raise ValueError(f"Tensor data_ptr {tensor.data_ptr()} is not aligned to {alignment} bytes")
21 | 
22 |     return tensor, mem
23 | 
24 | def alloc_aligned_cache(layer_count,block_count,element_size):
25 |     cache = []
26 |     cache_mem = []
27 |     for i in range(layer_count):
28 |         layer_data = []
29 |         layer_mem = []
30 |         for j in range(block_count):
31 |             tensor, mem_ptr = aligned_tensor(element_size, alignment=4096)
32 |             layer_data.append(tensor)
33 |             layer_mem.append(mem_ptr)
34 |         cache.append(layer_data)
35 |         cache_mem.append(layer_mem)
36 |     return cache,cache_mem
37 | 
38 | def dealloc_aligned_cache(cache_mem):
39 |     for layer_mem in cache_mem:
40 |         for mem_ptr in layer_mem:
41 |             ctypes.CDLL(None).free(mem_ptr)
42 | 
43 | def get_tensor_ptr(tensors):
44 |     tensor_ptr = []
45 |     for layer in tensors:
46 |         layer_ptr = []
47 |         for data in layer:
48 |             layer_ptr.append(data.data_ptr())
49 |         tensor_ptr.append(layer_ptr)
50 |     return tensor_ptr
51 | 
52 | def get_tensor_from_data_ptr(matched_data,element_size):
53 |     re = []
54 |     for layer in matched_data:
55 |         re_layer = []
56 |         for data_ptr in layer:
57 |             array_type = (ctypes.c_int8 * element_size) 
58 |             raw_array = array_type.from_address(data_ptr)
59 |             tensor = torch.frombuffer(raw_array, dtype=torch.int8)
60 |             re_layer.append(tensor)
61 |         re.append(re_layer)
62 |     return re
63 | if __name__ == "__main__":
64 |     pass


--------------------------------------------------------------------------------
/csrc/balance_serve/kvc2/src/metrics.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <atomic>
 4 | #include <chrono>
 5 | #include <memory>
 6 | #include <string>
 7 | #include <thread>
 8 | #include <vector>
 9 | #include "prometheus/counter.h"
10 | #include "prometheus/exposer.h"
11 | #include "prometheus/gauge.h"
12 | #include "prometheus/histogram.h"
13 | #include "prometheus/registry.h"
14 | 
15 | #include "utils/timer.hpp"
16 | 
17 | namespace kvc2 {
18 | 
19 | // Metric name prefix macro
20 | #define METRIC_PREFIX "kvc2"
21 | 
22 | struct MetricsConfig {
23 |   std::string endpoint;  // Listening endpoint, e.g. "0.0.0.0:8080"
24 | };
25 | 
26 | class Metrics {
27 |  public:
28 |   // Constructor takes a MetricsConfig
29 |   Metrics(const MetricsConfig& config);
30 |   ~Metrics();
31 | 
32 |   // Copying and assignment are disabled
33 |   Metrics(const Metrics&) = delete;
34 |   Metrics& operator=(const Metrics&) = delete;
35 | 
36 |   // Metric pointers
37 |   prometheus::Counter* prefix_nodes;
38 |   prometheus::Counter* prefix_block_count;
39 | 
40 |   prometheus::Histogram* raw_insert_time_ms;
41 |   prometheus::Histogram* lookup_time_ms;
42 |   prometheus::Histogram* lookup_prefixmatch_length;
43 |   prometheus::Histogram* matched_length_percentage;
44 | 
45 |   prometheus::Gauge* disk_usage;
46 | 
47 |   prometheus::Gauge* memory_pool_size(const std::string& type);
48 |   prometheus::Gauge* memory_pool_node_count(const std::string& type);
49 | 
50 |   prometheus::Gauge* lru_entry_count(const std::string& type);
51 |   prometheus::Gauge* gpu_page_count(std::string type);
52 | 
53 |   prometheus::Histogram* append_tokens_time_ms;
54 |   prometheus::Histogram* gpu_flush_back_time_ms;
55 |   prometheus::Histogram* cpu_flush_back_time_ms;
56 | 
57 |  private:
58 |   std::shared_ptr<prometheus::Registry> registry_;
59 |   prometheus::Exposer exposer_;
60 | 
61 |   prometheus::Family<prometheus::Gauge>* memory_pool_size_family_;
62 |   prometheus::Family<prometheus::Gauge>* memory_pool_node_count_family_;
63 |   prometheus::Family<prometheus::Gauge>* lru_entry_count_family_;
64 |   prometheus::Family<prometheus::Gauge>* gpu_page_count_family_;
65 | };
66 | 
67 | class TimeObserver {
68 |  public:
69 |   TimeObserver(prometheus::Histogram* h);
70 |   ~TimeObserver();
71 | 
72 |  private:
73 |   Timer timer_;
74 |   prometheus::Histogram* histogram_;
75 | };
76 | 
77 | }  // namespace kvc2


--------------------------------------------------------------------------------
/csrc/balance_serve/kvc2/src/page_aligned_memory_pool.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <assert.h>
 4 | #include <algorithm>  // std::sort
 5 | #include <atomic>
 6 | #include <bitset>
 7 | #include <cstddef>  // size_t
 8 | #include <cstdint>  // int8_t
 9 | #include <mutex>    // std::mutex
10 | #include <string>   // std::string
11 | #include <vector>
12 | 
13 | constexpr size_t PageSize = 4096;
14 | 
15 | /// Declaration of PageAlignedMemoryPool
16 | struct PageAlignedMemoryPool {
17 |  private:
18 |   constexpr static size_t Blocks = 16;
19 | 
20 |   void* data = nullptr;
21 | 
22 |   size_t total_size = 0, total_pages = 0;
23 | 
24 |   std::atomic_size_t now_block = 0;
25 |   std::atomic_size_t allocated = 0;  // allocated_size
26 |   std::atomic_size_t alloc_count = 0;
27 |   std::atomic_size_t free_count = 0;
28 | 
29 |   // Per-block state: each of the `Blocks` entries has its own mutex, first-page
30 |   // pointer, page count and bitmap.
31 |   std::mutex lock[Blocks];
32 |   size_t page_per_block = 0;
33 |   void* first_page[Blocks];
34 |   size_t count_page[Blocks];
35 |   std::vector<int8_t> bitmap[Blocks];
36 |   void* alloc_in_block(size_t block_index, size_t alloc_size);
37 | 
38 |  public:
39 |   /// Constructor and destructor
40 |   explicit PageAlignedMemoryPool(size_t size_in_bytes);
41 |   ~PageAlignedMemoryPool();
42 | 
43 |   /// Copy and move are disabled
44 |   PageAlignedMemoryPool(PageAlignedMemoryPool&& other) = delete;
45 |   PageAlignedMemoryPool& operator=(PageAlignedMemoryPool&& other) = delete;
46 |   PageAlignedMemoryPool(const PageAlignedMemoryPool& other) = delete;
47 |   PageAlignedMemoryPool& operator=(const PageAlignedMemoryPool& other) = delete;
48 | 
49 |   /// Member functions
50 |   size_t page_count();
51 |   size_t page_padded_size(size_t size);
52 | 
53 |   void* alloc(size_t size);
54 |   std::vector<void*> alloc_multiple(size_t size, size_t count);
55 |   void free(void* data, size_t size);
56 |   void defragment();
57 |   std::string debug();
58 | };
59 | 


--------------------------------------------------------------------------------
/csrc/balance_serve/kvc2/src/utils/all.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "easy_format.hpp"
3 | #include "timer.hpp"


--------------------------------------------------------------------------------
/csrc/balance_serve/kvc2/src/utils/arithmetic.hpp:
--------------------------------------------------------------------------------
 1 | #include <memory>
 2 | #include <type_traits>
 3 | 
 4 | template <typename T, typename U>
 5 | T div_up(T x, U by) {
 6 |   static_assert(std::is_integral_v<T>);
 7 |   static_assert(std::is_integral_v<U>);
 8 |   return (x + by - 1) / by;
 9 | }
10 | 
/// Returns `t` moved forward by `n` bytes (a byte offset, not an element
/// offset), keeping the pointer type T*.
template <typename T>
T* offset_by_bytes(T* t, size_t n) {
  const size_t base_address = reinterpret_cast<size_t>(t);
  const size_t shifted_address = base_address + n;
  return reinterpret_cast<T*>(shifted_address);
}
15 | 


--------------------------------------------------------------------------------
/csrc/balance_serve/kvc2/src/utils/easy_format.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef __EASY_FORMAT_HPP_
 2 | #define __EASY_FORMAT_HPP_
 3 | #include <array>
 4 | #include <iomanip>
 5 | #include <sstream>
 6 | #include <string>
 7 | 
 8 | #include <vector>
 9 | 
// Streams every element of `v` with operator<<, separated by ", ".
// An empty vector yields the literal "[]"; note that non-empty output is NOT
// wrapped in brackets.
template <typename T>
inline std::string format_vector(const std::vector<T>& v) {
  if (v.empty()) {
    return "[]";
  }
  std::ostringstream joined;
  const char* separator = "";
  for (const auto& item : v) {
    joined << separator << item;
    separator = ", ";  // comma-separated from the second element on
  }
  return joined.str();
}
22 | 
// Suffix for each power of 1000, indexed by how many times the value was
// divided by 1000.
inline std::array<std::string, 7> units = {"", "K", "M", "G", "T", "P", "E"};

// Formats `size` with two decimals followed by a decimal (power-of-1000)
// suffix from `units`, e.g. 1500 -> "1.50K". Scaling stops at the last suffix.
inline std::string readable_number(size_t size) {
  double scaled = size;
  size_t magnitude = 0;
  for (; scaled >= 1000 && magnitude < units.size() - 1; ++magnitude) {
    scaled /= 1000;
  }
  std::ostringstream out;
  out << std::fixed << std::setprecision(2) << scaled << units[magnitude];
  return out.str();
}
37 | #endif


--------------------------------------------------------------------------------
/csrc/balance_serve/kvc2/src/utils/lock_free_queue.hpp:
--------------------------------------------------------------------------------
 1 | #include <atomic>
 2 | #include <future>
 3 | #include <iostream>
 4 | #include <memory>
 5 | #include <thread>
 6 | #include <vector>
 7 | 
 8 | template <typename T>
 9 | class MPSCQueue {
10 |   struct Node {
11 |     std::shared_ptr<T> data;
12 |     std::atomic<Node*> next;
13 | 
14 |     Node() : next(nullptr) {}
15 |     Node(std::shared_ptr<T> data_) : data(std::move(data_)), next(nullptr) {}
16 |   };
17 | 
18 |   std::atomic<Node*> head;
19 |   Node* tail;
20 | 
21 |  public:
22 |   std::atomic_size_t enqueue_count = 0;
23 |   size_t dequeue_count = 0;
24 |   MPSCQueue() {
25 |     Node* dummy = new Node();
26 |     head.store(dummy, std::memory_order_relaxed);
27 |     tail = dummy;
28 |   }
29 | 
30 |   ~MPSCQueue() {
31 |     // 清理剩余的节点
32 |     Node* node = tail;
33 |     while (node) {
34 |       Node* next = node->next.load(std::memory_order_relaxed);
35 |       delete node;
36 |       node = next;
37 |     }
38 |   }
39 | 
40 |   // 生产者调用
41 |   void enqueue(std::shared_ptr<T> data) {
42 |     enqueue_count.fetch_add(1);
43 |     Node* node = new Node(std::move(data));
44 |     Node* prev_head = head.exchange(node, std::memory_order_acq_rel);
45 |     prev_head->next.store(node, std::memory_order_release);
46 |   }
47 | 
48 |   // 消费者调用
49 |   std::shared_ptr<T> dequeue() {
50 |     Node* next = tail->next.load(std::memory_order_acquire);
51 |     if (next) {
52 |       std::shared_ptr<T> res = std::move(next->data);
53 |       delete tail;
54 |       tail = next;
55 |       dequeue_count += 1;
56 |       return res;
57 |     }
58 |     return nullptr;
59 |   }
60 | };


--------------------------------------------------------------------------------
/csrc/balance_serve/kvc2/src/utils/mutex_extend.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef __MUTEX_EXTEND_HPP_
 2 | #define __MUTEX_EXTEND_HPP_
 3 | 
#include <atomic>
#include <chrono>
#include <iostream>
#include <mutex>
#include <stdexcept>
#include <thread>
 8 | 
 9 | class non_recursive_mutex {
10 |  public:
11 |   non_recursive_mutex() = default;
12 | 
13 |   // 使用 try_lock 实现非递归锁
14 |   bool try_lock() {
15 |     std::thread::id this_id = std::this_thread::get_id();
16 | 
17 |     // 检查当前线程是否已经持有该锁
18 |     if (owner.load(std::memory_order_acquire) == this_id) {
19 |       return false;  // 如果是当前线程，返回失败
20 |     }
21 | 
22 |     // 尝试加锁
23 |     if (mtx.try_lock()) {
24 |       owner.store(this_id, std::memory_order_release);  // 设置锁的拥有者
25 |       return true;
26 |     }
27 | 
28 |     return false;
29 |   }
30 | 
31 |   // lock 会阻塞，直到获得锁
32 |   void lock() {
33 |     std::thread::id this_id = std::this_thread::get_id();
34 | 
35 |     while (true) {
36 |       // 检查当前线程是否已经持有该锁
37 |       if (owner.load(std::memory_order_acquire) == this_id) {
38 |         throw std::runtime_error("Thread is trying to lock a mutex it already holds");
39 |       }
40 | 
41 |       // 尝试加锁
42 |       if (mtx.try_lock()) {
43 |         owner.store(this_id, std::memory_order_release);  // 设置锁的拥有者
44 |         return;
45 |       }
46 | 
47 |       // 如果锁未获得，则稍微等待，防止忙等
48 |       std::this_thread::yield();
49 |     }
50 |   }
51 | 
52 |   // 解锁
53 |   void unlock() {
54 |     std::thread::id this_id = std::this_thread::get_id();
55 | 
56 |     // 确保只有持有锁的线程可以解锁
57 |     if (owner.load(std::memory_order_acquire) == this_id) {
58 |       owner.store(std::thread::id(), std::memory_order_release);  // 清除锁的拥有者
59 |       mtx.unlock();
60 |     } else {
61 |       throw std::runtime_error("Thread attempting to unlock a mutex it doesn't own");
62 |     }
63 |   }
64 | 
65 |  private:
66 |   std::mutex mtx;                      // 实际的互斥量
67 |   std::atomic<std::thread::id> owner;  // 原子变量，记录当前锁的拥有者
68 | };
69 | 
70 | #endif
71 | 


--------------------------------------------------------------------------------
/csrc/balance_serve/kvc2/src/utils/spin_lock.hpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * @Author: Xie Weiyu ervinxie@qq.com
 3 |  * @Date: 2024-11-21 06:35:47
 4 |  * @LastEditors: Xie Weiyu ervinxie@qq.com
 5 |  * @LastEditTime: 2024-11-21 06:35:50
 6 |  * @FilePath: /kvc2/src/utils/spin_lock.hpp
 7 |  * @Description: 这是默认设置,请设置`customMade`, 打开koroFileHeader查看配置 进行设置:
 8 |  * https://github.com/OBKoro1/koro1FileHeader/wiki/%E9%85%8D%E7%BD%AE
 9 |  */
10 | 
11 | #include <atomic>
12 | #include <chrono>
13 | #include <thread>
14 | 
// Lock built on std::atomic_flag. A failed acquisition attempt sleeps before
// retrying; the sleep starts at 1 microsecond and doubles up to a cap of 1024.
class SpinLock {
 public:
  SpinLock() { flag.clear(); }

  void lock() {
    constexpr int kMaxDelayUs = 1024;  // upper bound for the back-off, in microseconds
    for (int wait_us = 1; flag.test_and_set(std::memory_order_acquire);) {
      std::this_thread::sleep_for(std::chrono::microseconds(wait_us));
      const int doubled = wait_us * 2;
      wait_us = (doubled > kMaxDelayUs) ? kMaxDelayUs : doubled;
    }
  }

  void unlock() { flag.clear(std::memory_order_release); }

 private:
  std::atomic_flag flag = ATOMIC_FLAG_INIT;
};
37 | 


--------------------------------------------------------------------------------
/csrc/balance_serve/kvc2/test/hashmap_test.cpp:
--------------------------------------------------------------------------------
 1 | #include <tbb/concurrent_hash_map.h>
 2 | #include <iostream>
 3 | 
 4 | int main() {
 5 |   tbb::concurrent_hash_map<int, int> map;
 6 |   map.insert({1, 2});
 7 |   decltype(map)::accessor a;
 8 |   std::cout << map.find(a, 1) << std::endl;
 9 | 
10 |   return 0;
11 | }
12 | 


--------------------------------------------------------------------------------
/csrc/balance_serve/kvc2/test/kvc2test/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | 
 2 | set(CMAKE_CXX_FLAGS "-Og -march=native -Wall -Wextra -g -fopenmp")
 3 | 
 4 | function(add_kvc2_test source_file)
 5 |     get_filename_component(target_name ${source_file} NAME_WE) # 获取不带扩展名的文件名作为目标名
 6 |     add_executable(${target_name} ${source_file})
 7 |     # target_compile_options(${target_name} PRIVATE -fopenmp  -fno-strict-aliasing)
 8 |     target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../src)
 9 |     target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/nlohmann/single_include)
10 |     target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/spdlog/include)
11 |     target_link_libraries(${target_name} PRIVATE kvc2 async_store)
12 | endfunction()
13 | 
# Register one executable per test source, in the original order.
foreach(kvc2_test_source IN ITEMS
        raw_insert_read.cpp
        lookup.cpp
        lookup-alt.cpp
        lookup-alt-gpu.cpp
        lookup-mt.cpp
        lookup-gpu.cpp
        lookup-gpu-mt.cpp
        lookup-gpu-async.cpp
        append-tokens.cpp
        flush-back.cpp
        check-flush-back.cpp
        lookup-without-vcache.cpp
        lookup-gpu-mt-without-vcache.cpp)
    add_kvc2_test(${kvc2_test_source})
endforeach()
27 | 


--------------------------------------------------------------------------------
/csrc/balance_serve/kvc2/test/kvc2test/check-flush-back.cpp:
--------------------------------------------------------------------------------
 1 | #include <future>
 2 | #include "common.hpp"
 3 | 
 4 | int main(int argc, char* argv[]) {
 5 |   init(argc, argv);
 6 |   spdlog::set_level(spdlog::level::debug);
 7 |   config.gpu_cache_config->total_kvcache_pages = 12;
 8 |   auto kvc2 = kvc2::create_kvc2(config);
 9 |   kvc2->load();
10 |   // #pragma omp parallel for
11 |   for (size_t ti = 0; ti < 2; ti++) {
12 |     SPDLOG_WARN("Test {}", ti);
13 |     auto [kcache, vcache] = kvc2->get_kvcache();
14 |     std::mt19937 gen(ti + 123);
15 |     size_t total_page = 10;
16 |     TokenLength total_length = total_page * config.num_token_per_page;
17 |     auto tokens = random_ids(total_length, gen);
18 |     auto k1 = random_kvcache(total_page, gen);
19 |     auto v1 = random_kvcache(total_page, gen);
20 | 
21 |     {
22 |       std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
23 |       kvc2->lookup_to_gpu_async(test_model_name, test_quant_type, tokens.data(), total_length, total_length,
24 |                                 [&p](std::shared_ptr<DoubleCacheHandleInterface> h) { p.set_value(h); });
25 |       auto fut = p.get_future();
26 |       fut.wait();
27 |       auto h = fut.get();
28 |       assert(h->matched_length() == total_length);
29 |       size_t matched_block = h->matched_length() / config.num_token_per_page;
30 |       auto block_idx = h->get_gpu_block_idx();
31 |       cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, matched_block);
32 |     }
33 |   }
34 |   SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
35 |   return 0;
36 | }
37 | 


--------------------------------------------------------------------------------
/csrc/balance_serve/kvc2/test/kvc2test/lookup-gpu-async.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @Description  :
 3 |  * @Author       : Xie Weiyu
 4 |  * @Date         : 2024-11-22 09:52:48
 5 |  * @Version      : 1.0.0
 6 |  * @LastEditors  : Xie Weiyu
 7 |  * @LastEditTime : 2024-11-25 07:51:09
 8 |  * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 9 |  **/
10 | 
11 | #include <future>
12 | #include "common.hpp"
13 | 
14 | int main(int argc, char* argv[]) {
15 |   init(argc, argv);
16 |   spdlog::set_level(spdlog::level::debug);
17 |   auto kvc2 = kvc2::create_kvc2(config);
18 | 
19 |   std::mt19937 gen(123);
20 |   auto ids1 = random_ids(10 * config.num_token_per_page, gen);
21 |   auto k1 = random_kvcache(10, gen);
22 |   auto v1 = random_kvcache(10, gen);
23 | 
24 |   kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, v1);
25 | 
26 | // complete same
27 | #pragma omp parallel for
28 |   for (size_t ti = 0; ti < 3; ti++) {
29 |     std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
30 |     kvc2->lookup_to_gpu_async(test_model_name, test_quant_type, ids1.data(), ids1.size(),
31 |                               ids1.size() + 2 * config.num_token_per_page,
32 |                               [&p](std::shared_ptr<DoubleCacheHandleInterface> h) { p.set_value(h); });
33 |     auto fut = p.get_future();
34 |     fut.wait();
35 |     auto h = fut.get();
36 |     auto k = h->handle_data(true);
37 |     auto v = h->handle_data(false);
38 |     cmp_handle_data(k1, k, 10);
39 |     cmp_handle_data(v1, v, 10);
40 | 
41 |     auto block_idx = h->get_gpu_block_idx();
42 |     auto [kcache, vcache] = kvc2->get_kvcache();
43 | 
44 |     cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, 10);
45 |   }
46 | 
47 |   SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
48 |   return 0;
49 | }
50 | 


--------------------------------------------------------------------------------
/csrc/balance_serve/kvc2/test/kvc2test/lookup-gpu-mt-without-vcache.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @Description  :
 3 |  * @Author       : Xie Weiyu
 4 |  * @Date         : 2024-11-22 09:52:48
 5 |  * @Version      : 1.0.0
 6 |  * @LastEditors  : Xie Weiyu
 7 |  * @LastEditTime : 2024-11-25 07:51:09
 8 |  * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 9 |  **/
10 | 
11 | #include "common.hpp"
12 | 
13 | int main(int argc, char* argv[]) {
14 |   qw25_7B_gpu_config.v_cache_on = false;
15 |   config.gpu_cache_config = qw25_7B_gpu_config;
16 |   config.v_cache_on = false;
17 | 
18 |   init(argc, argv);
19 |   spdlog::set_level(spdlog::level::debug);
20 |   auto kvc2 = kvc2::create_kvc2(config);
21 | 
22 |   std::mt19937 gen(123);
23 |   auto ids1 = random_ids(10 * config.num_token_per_page, gen);
24 |   auto k1 = random_kvcache(10, gen);
25 | 
26 |   kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, {});
27 | 
28 | // complete same
29 | #pragma omp parallel for
30 |   for (size_t ti = 0; ti < 3; ti++) {
31 |     auto h = kvc2->lookup_to_gpu(test_model_name, test_quant_type, ids1.data(), ids1.size(),
32 |                                  ids1.size() + 2 * config.num_token_per_page);
33 |     auto k = h->handle_data(true);
34 |     cmp_handle_data(k1, k, 10);
35 | 
36 |     auto block_idx = h->get_gpu_block_idx();
37 |     auto [kcache, vcache] = kvc2->get_kvcache();
38 | 
39 |     auto k_from_gpu = empty_kvcache(15);
40 | 
41 |     size_t gpu_count = config.gpu_cache_config->gpu_devices_id.size();
42 |     size_t element_size_per_gpu = test_cache_info.element_size(config.num_token_per_page) / gpu_count;
43 |     for (size_t i = 0; i < k_from_gpu.size(); i++) {
44 |       for (size_t j = 0; j < block_idx.size(); j++) {
45 |         size_t b_idx = block_idx[j];
46 |         for (size_t gpu_idx = 0; gpu_idx < gpu_count; gpu_idx++) {
47 |           {
48 |             auto kt = kcache[gpu_idx][i][b_idx].to(torch::kCPU);
49 |             void* src = kt.data_ptr();
50 |             void* dst = offset_by_bytes(k_from_gpu[i][j], gpu_idx * element_size_per_gpu);
51 |             memcpy(dst, src, element_size_per_gpu);
52 |           }
53 |         }
54 |       }
55 |     }
56 |     cmp_handle_data(k1, k_from_gpu, 10);
57 |   }
58 | 
59 |   SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
60 |   return 0;
61 | }
62 | 


--------------------------------------------------------------------------------
/csrc/balance_serve/kvc2/test/kvcache_mem_eviction_test.cpp:
--------------------------------------------------------------------------------
 1 | #include "kvcache_test_utils.cpp"
 2 | 
 3 | int main(int argc, char* argv[]) {
 4 |   parse_and_check(argc, argv);
 5 |   spdlog::set_level(spdlog::level::debug);
 6 |   std::mt19937 gen(123);
 7 | 
 8 |   KVC2 kvc2(FLAGS_disk_cache_path);
 9 |   auto io = kvc2.io_dealer->start_io_thread();
10 | 
11 |   SPDLOG_WARN("Insert 10 x 10 KVCache");
12 |   std::vector<KVCacheHandle> handles(10);
13 |   for (int i = 0; i < 10; i++) {
14 |     handles[i] = random_kvcache(qwen_cache_info, 10, gen);
15 |     auto& h1 = handles[i];
16 |     h1.ids = random_ids(10 * BlockLength, gen);
17 |     kvc2.raw_insert(h1);
18 |   }
19 | 
20 |   SPDLOG_WARN("Cache Eviction Test");
21 |   {
22 |     for (int i = 0; i < 10; i++) {
23 |       auto& h = handles[i];
24 |       SPDLOG_WARN("Lookup {}", i);
25 |       auto x = kvc2.lookup(qwen_cache_info, h.ids.data(), h.ids.size());
26 |       cmp_handle_data(h, *x);
27 |     }
28 |     SPDLOG_WARN("Simple Eviction OK");
29 |   }
30 | 
31 |   {
32 |     std::vector<std::shared_ptr<KVCacheHandle>> lookup_handles;
33 |     for (int i = 0; i < 10; i++) {
34 |       auto& h = handles[i];
35 |       SPDLOG_WARN("Lookup {}", i);
36 |       auto x = kvc2.lookup(qwen_cache_info, h.ids.data(), h.ids.size());
37 |       if (i >= 5) {
38 |         assert(x == nullptr);
39 |         continue;
40 |       }
41 |       lookup_handles.push_back(x);
42 |       cmp_handle_data(h, *x);
43 |     }
44 |     SPDLOG_WARN("Cannot Eviction OK");
45 |   }
46 | 
47 |   kvc2.io_dealer->stop();
48 |   io.join();
49 | 
50 |   SPDLOG_WARN("{} Test Passed", __FILE__);
51 |   return 0;
52 | }


--------------------------------------------------------------------------------
/csrc/balance_serve/kvc2/test/kvcache_test_utils.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/csrc/balance_serve/kvc2/test/kvcache_test_utils.cpp


--------------------------------------------------------------------------------
/csrc/balance_serve/kvc2/test/page_pool_test.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | #include <unistd.h>
 3 | #include <iostream>
 4 | #include <random>
 5 | #include <thread>
 6 | #include <vector>
 7 | #include "page_aligned_memory_pool.cpp"
 8 | 
 9 | #define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
10 | #define FMT_HEADER_ONLY
11 | #include "spdlog/spdlog.h"
12 | 
13 | // 每个线程执行的任务
14 | void thread_task(PageAlignedMemoryPool& pool) {
15 |   std::mt19937 gen(123);
16 |   std::vector<std::pair<void*, size_t>> allocated;
17 |   size_t cnt = 40000;
18 |   for (size_t i = 0; i < cnt; ++i) {
19 |     // 随机分配一个大小
20 |     size_t size = (gen() % 100 + 1) * 4096 * 4;
21 |     void* ptr = pool.alloc(size);
22 |     // SPDLOG_DEBUG(pool.debug());
23 |     if (ptr) {
24 |       pool.free(ptr, size);
25 |       //   allocated.push_back({ptr, size});
26 |     }
27 |     // sleep((int)(gen() % 1000) / 1000.0);
28 |   }
29 |   // free all memory
30 |   for (auto& p : allocated) {
31 |     pool.free(p.first, p.second);
32 |   }
33 | }
34 | 
35 | int main(int argc, char* argv[]) {
36 |   spdlog::set_level(spdlog::level::debug);
37 | 
38 |   // 创建一个内存池
39 |   PageAlignedMemoryPool pool(40ll * 1024 * 1024 * 1024);  // 40 G
40 | 
41 |   // 创建线程
42 |   const int num_threads = 32;
43 |   std::vector<std::thread> threads;
44 |   for (int i = 0; i < num_threads; ++i) {
45 |     threads.emplace_back(thread_task, std::ref(pool));
46 |   }
47 | 
48 |   // 等待所有线程完成
49 |   for (auto& t : threads) {
50 |     t.join();
51 |   }
52 | 
53 |   // 输出调试信息
54 |   std::cout << pool.debug() << std::endl;
55 | 
56 |   return 0;
57 | }


--------------------------------------------------------------------------------
/csrc/balance_serve/kvc2/test/prefix_test.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/csrc/balance_serve/kvc2/test/prefix_test.cpp


--------------------------------------------------------------------------------
/csrc/balance_serve/kvc2/test/pytest_load.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | sys.path.append('./build')
 3 | sys.path.append('./src')
 4 | import torch
 5 | import kvc2_ext
 6 | from kvc2_utils import get_tensor_from_data_ptr
 7 | 
 8 | # Create a kvc2 instance
 9 | path = "/mnt/data/kvc2"
10 | kvc2_instance = kvc2_ext.create_kvc2(path,int(10e9)) # 10 G memory pool
11 | kvc2_ext.load(kvc2_instance)
12 | 
13 | # Start IO thread
14 | print("Start IO thread")
15 | kvc2_ext.start_io_thread(kvc2_instance)
16 | print("IO thread started")
17 | 
18 | # Create CacheInfoInput
19 | test_info = kvc2_ext.CacheInfoInput()
20 | test_info.model_type = kvc2_ext.ModelType.MT_DeepseekV2
21 | test_info.cache_type = kvc2_ext.CacheType.CT_KeyCache
22 | test_info.quant_type = kvc2_ext.QuantType.QT_F32
23 | 
24 | print("Element size: ", test_info.element_size())
25 | 
26 | # Generate random test IDs (length = 2560)
27 | torch.manual_seed(123)
28 | length = 2560
29 | test_id = torch.randint(0, 65536, (length,), dtype=torch.uint16).contiguous()
30 | block_count = (length+255) // 256
31 | # print("Test ID: ", test_id)
32 | 
33 | # Generate test data based on element size and hidden layer count
34 | element_size = test_info.element_size()
35 | hidden_layer_count = test_info.hidden_layer_count()
36 | 
def read_cmp_and_release(kvc2_instance,cache_info,ids,length):
    """Look up `length` ids in the cache, read the matched data and release the handle.

    Args:
        kvc2_instance: instance returned by ``kvc2_ext.create_kvc2``.
        cache_info: ``kvc2_ext.CacheInfoInput`` describing the cache.
        ids: address of the id buffer (the caller passes ``tensor.data_ptr()``).
        length: number of ids to look up.

    Exits the process with status 1 when the lookup returns a null handle, so
    the failure is visible to whatever launched the script. The handle is
    released even if reading the matched data raises.
    """
    handle = kvc2_ext.lookup(kvc2_instance, cache_info, ids, length)
    if kvc2_ext.is_nullptr(handle):
        print("Handle is nullptr.")
        # sys.exit instead of the site-provided exit(): always available, and
        # a non-zero status marks the run as failed.
        sys.exit(1)
    try:
        matched_length = kvc2_ext.matched_length(handle)
        matched_data = kvc2_ext.handle_data(handle)
        print('Matched length: ', matched_length)
        if matched_length >0:
            print(f'First layer address {[hex(x) for x in matched_data[0]]}')
        # Only checks that the matched data can be read; the result is unused.
        get_tensor_from_data_ptr(matched_data,element_size)

        print("Just read check ok.")
    finally:
        kvc2_ext.release(handle)
51 | 
52 | 
# Look up prefixes of 128, 256, ... up to `length` ids.
for l in range(128, length + 1, 128):
    read_cmp_and_release(kvc2_instance, test_info, test_id.data_ptr(), l)

kvc2_ext.destroy_kvc2(kvc2_instance)


print("Test completed successfully.")
62 | 


--------------------------------------------------------------------------------
/csrc/balance_serve/kvc2/test/test_align.py:
--------------------------------------------------------------------------------
 1 | import ctypes
 2 | import torch
 3 | 
 4 | def aligned_tensor(size, alignment=4096):
 5 |     num_bytes = size 
 6 |     mem = ctypes.c_void_p()
 7 |     error_code = ctypes.CDLL(None).posix_memalign(
 8 |         ctypes.byref(mem), ctypes.c_size_t(alignment), ctypes.c_size_t(num_bytes)
 9 |     )
10 | 
11 |     if error_code != 0:
12 |         raise MemoryError(f"posix_memalign failed with error code {error_code}")
13 | 
14 |     array_type = (ctypes.c_int8 * size) 
15 |     raw_array = array_type.from_address(mem.value)
16 | 
17 |     tensor = torch.frombuffer(raw_array, dtype=torch.int8)
18 | 
19 |     if tensor.data_ptr() % alignment != 0:
20 |         raise ValueError(f"Tensor data_ptr {tensor.data_ptr()} is not aligned to {alignment} bytes")
21 | 
22 |     return tensor, mem
23 | 
24 | 
# Allocate a buffer whose size is not a multiple of the page size, report its
# alignment, then hand the memory back to libc.
size = 5124380
tensor, mem_ptr = aligned_tensor(size, alignment=4096)

print(f"Tensor: {tensor}, size: {tensor.size()}, dataptr: {tensor.data_ptr()}")
print(f"Tensor memory alignment: {tensor.data_ptr() % 4096 == 0}")
print(f"Allocated memory address: {mem_ptr.value}")

libc = ctypes.CDLL(None)
libc.free(mem_ptr)
33 | 


--------------------------------------------------------------------------------
/csrc/balance_serve/kvc2/test/test_lock_free_queue.cpp:
--------------------------------------------------------------------------------
 1 | #include <chrono>
 2 | #include <iostream>
 3 | #include <thread>
 4 | #include <vector>
 5 | #include "utils/lock_free_queue.hpp"
 6 | 
 7 | struct Item {
 8 |   int value;
 9 |   std::promise<void> promise;
10 | };
11 | 
12 | int main() {
13 |   MPSCQueue<Item> queue;
14 | 
15 |   std::vector<std::thread> producers;
16 |   const int num_producers = 4;
17 |   const int items_per_producer = 5;
18 | 
19 |   // 启动生产者线程
20 |   for (int i = 0; i < num_producers; ++i) {
21 |     producers.emplace_back([&queue, i]() {
22 |       for (int j = 0; j < items_per_producer; ++j) {
23 |         auto item = std::make_shared<Item>();
24 |         item->value = i * items_per_producer + j;
25 |         std::future<void> future = item->promise.get_future();
26 |         queue.enqueue(item);
27 |         future.wait();  // 等待消费者处理完成
28 |       }
29 |     });
30 |   }
31 | 
32 |   // 启动消费者线程
33 |   std::thread consumer([&queue, num_producers, items_per_producer]() {
34 |     int total_items = num_producers * items_per_producer;
35 |     int processed = 0;
36 |     while (processed < total_items) {
37 |       std::shared_ptr<Item> item = queue.dequeue();
38 |       if (item) {
39 |         std::cout << "Consumed item with value: " << item->value << std::endl;
40 |         item->promise.set_value();  // 通知生产者
41 |         ++processed;
42 |       } else {
43 |         // 如果队列为空，可以选择休眠或让出线程
44 |         std::this_thread::yield();
45 |       }
46 |     }
47 |   });
48 | 
49 |   // 等待所有线程完成
50 |   for (auto& producer : producers) {
51 |     producer.join();
52 |   }
53 |   consumer.join();
54 | 
55 |   return 0;
56 | }


--------------------------------------------------------------------------------
/csrc/balance_serve/kvc2/test/test_queue_perf.cpp:
--------------------------------------------------------------------------------
 1 | #include <mutex>
 2 | #include <queue>
 3 | #include "utils/lock_free_queue.hpp"
 4 | 
 5 | #define STDQ
 6 | 
 7 | int main() {
 8 |   const int num_producers = 48;
 9 |   const int num_items = 1e6;
10 | 
11 | #ifdef STDQ
12 |   std::mutex lock;
13 |   std::queue<int> queue;
14 | #else
15 |   MPSCQueue<int> queue;
16 | #endif
17 | 
18 |   auto start_time = std::chrono::high_resolution_clock::now();
19 | 
20 |   // Launch multiple producer threads
21 |   std::vector<std::thread> producers;
22 |   for (int i = 0; i < num_producers; ++i) {
23 |     producers.emplace_back([&queue, i
24 | #ifdef STDQ
25 |                             ,
26 |                             &lock
27 | #endif
28 |     ]() {
29 |       for (int j = 0; j < num_items; ++j) {
30 | #ifdef STDQ
31 |         std::lock_guard<std::mutex> guard(lock);
32 |         queue.push(i * num_items + j);
33 | #else
34 |         queue.enqueue(std::make_shared<int>(i * num_items + j));
35 | #endif
36 |       }
37 |     });
38 |   }
39 | 
40 |   // Consumer thread
41 |   std::thread consumer([&queue, num_producers
42 | #ifdef STDQ
43 |                         ,
44 |                         &lock
45 | #endif
46 |   ]() {
47 |     int count = 0;
48 |     while (count < num_producers * num_items) {
49 | #ifdef STDQ
50 |       std::lock_guard<std::mutex> guard(lock);
51 |       if (!queue.empty()) {
52 |         queue.pop();
53 |         count++;
54 |       }
55 | #else
56 |       if (auto item = queue.dequeue()) {
57 |         count++;
58 |       }
59 | #endif
60 |     }
61 |   });
62 | 
63 |   // Wait for all producers to finish
64 |   for (auto& producer : producers) {
65 |     producer.join();
66 |   }
67 | 
68 |   // Wait for the consumer to finish
69 |   consumer.join();
70 | 
71 |   auto end_time = std::chrono::high_resolution_clock::now();
72 |   auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time).count();
73 | 
74 | #ifdef STDQ
75 |   std::cout << "std::queue with mutex ";
76 | #else
77 |   std::cout << "lock free queue ";
78 | #endif
79 | 
80 |   std::cout << "Processed " << num_producers * num_items / 1e6 << "M items in " << duration << " milliseconds "
81 |             << num_producers * num_items / 1e3 / duration << " MOps." << std::endl;
82 | 
83 |   return 0;
84 | }


--------------------------------------------------------------------------------
/csrc/balance_serve/kvc2/test/test_std_list.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <iterator>
 3 | #include <vector>
 4 | 
 5 | int main() {
 6 |   std::vector<int> v = {0, 1, 2, 3, 4, 5};
 7 | 
 8 |   using RevIt = std::reverse_iterator<std::vector<int>::iterator>;
 9 | 
10 |   const auto it = v.begin() + 3;
11 |   RevIt r_it{it};
12 | 
13 |   std::cout << "*it == " << *it << '\n'
14 |             << "*r_it == " << *r_it << '\n'
15 |             << "*r_it.base() == " << *r_it.base() << '\n'
16 |             << "*(r_it.base()-1) == " << *(r_it.base() - 1) << '\n';
17 | 
18 |   RevIt r_end{v.begin()};
19 |   RevIt r_begin{v.end()};
20 | 
21 |   for (auto it = r_end.base(); it != r_begin.base(); ++it)
22 |     std::cout << *it << ' ';
23 |   std::cout << '\n';
24 | 
25 |   for (auto it = r_begin; it != r_end; ++it)
26 |     std::cout << *it << ' ';
27 |   std::cout << '\n';
28 | 
29 |   for (auto it = r_begin; it != r_end; ++it) {
30 |     if (*it == 3) {
31 |       v.erase(std::next(it).base());
32 |     }
33 |   }
34 | 
35 |   for (auto it : v)
36 |     std::cout << it << ' ';
37 |   std::cout << '\n';
38 | }


--------------------------------------------------------------------------------
/csrc/balance_serve/kvc2/test/xxHash_test.cpp:
--------------------------------------------------------------------------------
 1 | #include "xxhash.h"
 2 | #include <iostream>
 3 | 
 4 | int main() {
 5 |   std::string t = "hello world";
 6 |   XXH64_hash_t hash = XXH64(t.data(), t.size(), 123);
 7 |   std::cout << hash << std::endl;
 8 |   {
 9 |     /* create a hash state */
10 |     XXH64_state_t* const state = XXH64_createState();
11 |     if (state == NULL)
12 |       abort();
13 | 
14 |     if (XXH64_reset(state, 123) == XXH_ERROR)
15 |       abort();
16 | 
17 |     if (XXH64_update(state, t.data(), 5) == XXH_ERROR)
18 |       abort();
19 | 
20 |     if (XXH64_update(state, t.data() + 5, t.size() - 5) == XXH_ERROR)
21 |       abort();
22 |     /* Produce the final hash value */
23 |     XXH64_hash_t const hash = XXH64_digest(state);
24 | 
25 |     /* State could be re-used; but in this example, it is simply freed  */
26 |     XXH64_freeState(state);
27 |     std::cout << hash << std::endl;
28 |   }
29 | 
30 |   return 0;
31 | }
32 | 


--------------------------------------------------------------------------------
/csrc/balance_serve/kvc2/unit_test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # 检查是否提供了 disk_cache_path 参数
 4 | if [ -z "$1" ]; then
 5 |     echo "Usage: $0 <disk_cache_path>"
 6 |     exit 1
 7 | fi
 8 | 
 9 | # 将 disk_cache_path 参数赋值给变量
10 | disk_cache_path=$1
11 | 
12 | # 定义测试命令数组，并使用变量替换 disk_cache_path
13 | tests=(
14 |     "./build/test/kvc2_export_header_test --disk_cache_path=$disk_cache_path"
15 |     "./build/test/kvcache_disk_insert_read_test --disk_cache_path=$disk_cache_path"
16 |     "./build/test/kvcache_mem_eviction_test --disk_cache_path=$disk_cache_path"
17 |     "./build/test/kvcache_mem_insert_read_test --disk_cache_path=$disk_cache_path"
18 |     "./build/test/kvcache_save_load_test --disk_cache_path=$disk_cache_path"
19 | )
20 | 
21 | 
22 | # 遍历每个测试命令
23 | for test in "${tests[@]}"; do
24 |     echo "Running: $test"
25 |     # 运行测试并捕获输出
26 |     output=$($test)
27 |     
28 |     # 检查测试输出中是否包含 "Test Passed"
29 |     if echo "$output" | grep -q "Test Passed"; then
30 |         echo "  Test Passed"
31 |     else
32 |         echo "  Test Failed"
33 |     fi
34 | 
35 |     sleep 1
36 | done


--------------------------------------------------------------------------------
/csrc/balance_serve/sched/CMakeLists.txt:
--------------------------------------------------------------------------------
# Build rules for the scheduler: a metrics library, the scheduler library and
# the pybind11 extension module that exposes it.

# Replaces (does not append to) the C++ flags for this directory with a
# debug-oriented set; the optimised alternative is kept commented out below.
set(CMAKE_CXX_FLAGS "-Og -march=native -Wall -Wextra -g -fPIC")
# set(CMAKE_CXX_FLAGS "-O3 -march=native -Wall -Wextra -fPIC")
# Forwards the _GLIBCXX_USE_CXX11_ABI variable as a compile definition.
# NOTE(review): the variable is not set in this file; presumably the parent
# project defines it to match the PyTorch build - confirm.
add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI})

# Directory holding the scheduler's helper headers.
set(UTILS_DIR ${CMAKE_CURRENT_SOURCE_DIR}/utils)

# Metrics library, linked against the prometheus-cpp pull target.
add_library(sched_metrics metrics.cpp)
target_include_directories(sched_metrics PRIVATE ${UTILS_DIR})
target_link_libraries(sched_metrics PUBLIC prometheus-cpp::pull)


# Scheduler library; SPDLOG_DIR, FMT_DIR and KVC2_INCLUDE_DIR are expected to
# be defined outside this file.
add_library(sched scheduler.cpp)
target_include_directories(sched PRIVATE ${SPDLOG_DIR}/include ${FMT_DIR}/include ${UTILS_DIR} ${KVC2_INCLUDE_DIR})
target_link_libraries(sched PUBLIC pthread ${TORCH_LIBRARIES} kvc2 async_store sched_metrics)

# Python extension module built from bind.cpp and linked against the scheduler.
pybind11_add_module(sched_ext bind.cpp)
target_link_libraries(sched_ext PUBLIC sched ${TORCH_LIBRARIES} ${TORCH_PYTHON_LIBRARY})
18 | 
19 | 
20 | 
21 | 


--------------------------------------------------------------------------------
/csrc/balance_serve/sched/utils/all.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "readable_number.hpp"
3 | #include "timer.hpp"


--------------------------------------------------------------------------------
/csrc/balance_serve/sched/utils/arithmetic.hpp:
--------------------------------------------------------------------------------
1 | #include <type_traits>
2 | 
// Ceiling division for integral operands: returns x / by rounded towards
// positive infinity, converted to T.
//
// Unlike the textbook `(x + by - 1) / by`, this form never adds to `x`, so it
// cannot overflow for values near the type's maximum, and it is also correct
// for negative operands. `by` must be non-zero.
template <typename T, typename U> T div_up(T x, U by) {
  static_assert(std::is_integral_v<T>);
  static_assert(std::is_integral_v<U>);
  // Work in the type the language itself uses for `x / by`.
  using Wide = decltype(x / by);
  const Wide quotient = x / by;
  const Wide remainder = x % by;
  // Integer division truncates towards zero, which is already the ceiling
  // when the exact quotient is negative. Only a positive, inexact quotient
  // (non-zero remainder with the same sign as the divisor) needs a bump.
  const bool round_up = remainder != 0 && ((remainder > 0) == (by > 0));
  return static_cast<T>(round_up ? quotient + 1 : quotient);
}


--------------------------------------------------------------------------------
/csrc/balance_serve/sched/utils/atomic_ptr_with_flags.hpp:
--------------------------------------------------------------------------------
 1 | #include <atomic>
 2 | 
 3 | template <typename T> struct AtomicPtrWithFlag {
 4 |   constexpr static uint64_t mask = 1ull << 63;
 5 |   std::atomic_uint64_t ptr = 0;
 6 | 
 7 |   std::pair<T *, bool>
 8 |   load(std::memory_order order = std::memory_order_seq_cst) {
 9 |     uint64_t val = ptr.load(order);
10 |     return {reinterpret_cast<T *>(val & (~mask)), val & mask};
11 |   }
12 | 
13 |   void store(T *p, bool flag,
14 |              std::memory_order order = std::memory_order_seq_cst) {
15 |     ptr.store(reinterpret_cast<uint64_t>(p) | (flag ? mask : 0), order);
16 |   }
17 | 
18 |   std::pair<T *, bool>
19 |   exchange(T *p, bool flag,
20 |            std::memory_order order = std::memory_order_seq_cst) {
21 |     uint64_t val =
22 |         ptr.exchange(reinterpret_cast<uint64_t>(p) | (flag ? mask : 0), order);
23 |     return {reinterpret_cast<T *>(val & (~mask)), val & mask};
24 |   }
25 | 
26 |   std::pair<T *, bool>
27 |   touch_load(std::memory_order order = std::memory_order_seq_cst) {
28 |     uint64_t val = ptr.fetch_and(~mask, order);
29 |     return {reinterpret_cast<T *>(val & (~mask)), val & mask};
30 |   }
31 | 
32 |   bool check_flag(std::memory_order order = std::memory_order_seq_cst) {
33 |     return ptr.load(order) & mask;
34 |   }
35 | };
36 | 


--------------------------------------------------------------------------------
/csrc/balance_serve/sched/utils/easy_format.hpp:
--------------------------------------------------------------------------------
 1 | #include <sstream>
 2 | #include <string>
 3 | #include <vector>
 4 | 
 5 | template <typename T> std::string format_vector(const std::vector<T> &v) {
 6 |   std::ostringstream oss;
 7 |   if (v.empty())
 8 |     return "[]";
 9 |   for (size_t i = 0; i < v.size(); ++i) {
10 |     oss << v[i];
11 |     if (i < v.size() - 1)
12 |       oss << ", "; // 逗号分隔
13 |   }
14 |   return oss.str();
15 | }
16 | 


--------------------------------------------------------------------------------
/csrc/balance_serve/sched/utils/readable_number.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <array>
 3 | #include <iomanip>
 4 | #include <sstream>
 5 | #include <string>
 6 | 
 7 | inline std::array<std::string, 7> units = {"", "K", "M", "G", "T", "P", "E"};
 8 | 
 9 | inline std::string readable_number(size_t size) {
10 |   size_t unit_index = 0;
11 |   double readable_size = size;
12 |   while (readable_size >= 1000 && unit_index < units.size() - 1) {
13 |     readable_size /= 1000;
14 |     unit_index++;
15 |   }
16 |   std::ostringstream ss;
17 |   ss << std::fixed << std::setprecision(2) << readable_size;
18 |   std::string str = ss.str();
19 |   return str + "" + units[unit_index];
20 | }


--------------------------------------------------------------------------------
/csrc/custom_marlin/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/csrc/custom_marlin/__init__.py


--------------------------------------------------------------------------------
/csrc/custom_marlin/binding.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @Description  :
 3 |  * @Author       : Azure-Tang
 4 |  * @Date         : 2024-07-25 13:38:30
 5 |  * @Version      : 1.0.0
 6 |  * @LastEditors  : kkk1nak0
 7 |  * @LastEditTime : 2024-08-12 03:05:04
 8 |  * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 9 |  **/
10 | 
11 | #include "gptq_marlin/ops.h"
12 | // Python bindings
13 | #include <pybind11/pybind11.h>
14 | #include <pybind11/stl.h>
15 | #include <torch/extension.h>
16 | #include <torch/library.h>
17 | #include <torch/torch.h>
18 | // namespace py = pybind11;
19 | 
// Defines the Python extension module `vLLMMarlin` and registers the two
// Marlin entry points declared in gptq_marlin/ops.h.
// NOTE(review): `py::arg` is used although the `namespace py = pybind11;`
// alias above is commented out; presumably one of the torch headers provides
// the alias - confirm.
PYBIND11_MODULE(vLLMMarlin, m) {

    // Disabled bindings for the GGUF dequantize_* functions, kept for reference.
    /*m.def("dequantize_q8_0", &dequantize_q8_0, "Function to dequantize q8_0
    data.", py::arg("data"), py::arg("blk_size"), py::arg("device"));
    m.def("dequantize_q6_k", &dequantize_q6_k, "Function to dequantize q6_k
    data.", py::arg("data"), py::arg("blk_size"), py::arg("device"));
    m.def("dequantize_q5_k", &dequantize_q5_k, "Function to dequantize q5_k
    data.", py::arg("data"), py::arg("blk_size"), py::arg("device"));
    m.def("dequantize_q4_k",  &dequantize_q4_k, "Function to dequantize q4_k
    data.", py::arg("data"), py::arg("blk_size"), py::arg("device"));
    m.def("dequantize_q3_k",  &dequantize_q3_k, "Function to dequantize q3_k
    data.", py::arg("data"), py::arg("blk_size"), py::arg("device"));
    m.def("dequantize_q2_k",  &dequantize_q2_k, "Function to dequantize q2_k
    data.", py::arg("data"), py::arg("blk_size"), py::arg("device"));
    m.def("dequantize_iq4_xs",  &dequantize_iq4_xs, "Function to dequantize
    iq4_xs data.", py::arg("data"), py::arg("blk_size"), py::arg("device"));*/
    // GEMM binding: every parameter gets a keyword name, listed in the same
    // order as the C++ declaration.
    m.def("gptq_marlin_gemm", &gptq_marlin_gemm,
          "Function to perform GEMM using Marlin quantization.", py::arg("a"),
          py::arg("b_q_weight"), py::arg("b_scales"), py::arg("g_idx"),
          py::arg("perm"), py::arg("workspace"), py::arg("num_bits"), py::arg("size_m_tensor"),
          py::arg("size_m"), py::arg("size_n"), py::arg("size_k"),
          py::arg("sms"), py::arg("is_k_full"));
    // Repack binding: registered without argument names, so it is
    // positional-only from Python.
    m.def("gptq_marlin_repack", &gptq_marlin_repack,
            "gptq_marlin repack from GPTQ");
}


--------------------------------------------------------------------------------
/csrc/custom_marlin/gptq_marlin/ops.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @Description  :
 3 |  * @Author       : Azure
 4 |  * @Date         : 2024-07-22 09:27:55
 5 |  * @Version      : 1.0.0
 6 |  * @LastEditors  : Azure
 7 |  * @LastEditTime : 2024-07-26 08:35:00
 8 |  * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 9 |  **/
10 | #pragma once
11 | 
12 | #include <torch/extension.h>
13 | #include <torch/library.h>
14 | #include <torch/torch.h>
15 | 
// Marlin GPTQ GEMM entry point (declaration only; the definition is not in
// this header).
// NOTE(review): presumably multiplies activations `a` (size_m x size_k) by the
// quantized weight `b_q_weight` using `b_scales`, with `g_idx`/`perm`
// describing activation reordering and `workspace` used as scratch space;
// `size_m_tensor` and `sms` have no documentation here - confirm all of this
// against the kernel source.
torch::Tensor gptq_marlin_gemm(torch::Tensor &a, torch::Tensor &b_q_weight,
                               torch::Tensor &b_scales, torch::Tensor &g_idx,
                               torch::Tensor &perm, torch::Tensor &workspace,
                               int64_t num_bits, torch::Tensor size_m_tensor, int64_t size_m, int64_t size_n,
                               int64_t size_k, int sms, bool is_k_full);

// Repacks a GPTQ weight tensor for the Marlin kernel (declaration only).
// NOTE(review): the expected layout of `b_q_weight` and the meaning of `perm`
// are not visible here - confirm against the kernel source.
torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor&perm,
                                 int64_t size_k, int64_t size_n,
                                 int64_t num_bits);


--------------------------------------------------------------------------------
/csrc/custom_marlin/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, Extension
 2 | from torch.utils import cpp_extension
 3 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension
 4 | setup(
 5 |     name='vLLMMarlin',
 6 |     ext_modules=[
 7 |         CUDAExtension(
 8 |             'vLLMMarlin', [
 9 |                 #'custom_gguf/dequant.cu',
10 |                 'binding.cpp',
11 |                 'gptq_marlin/gptq_marlin.cu',
12 |                 'gptq_marlin/gptq_marlin_repack.cu',
13 |             ],
14 |             extra_compile_args={
15 |                 'cxx': ['-O3'],
16 |                 'nvcc': [
17 |                     '-O3',
18 |                     '--use_fast_math',
19 |                     '-Xcompiler', '-fPIC',
20 |                 ]
21 |             },
22 |         )
23 |     ],
24 |     cmdclass={'build_ext': BuildExtension}
25 | )


--------------------------------------------------------------------------------
/csrc/custom_marlin/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/csrc/custom_marlin/utils/__init__.py


--------------------------------------------------------------------------------
/csrc/ktransformers_ext/cpu_backend/backend.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @Description  :
 3 |  * @Author       : chenht2022
 4 |  * @Date         : 2024-07-22 02:03:05
 5 |  * @Version      : 1.0.0
 6 |  * @LastEditors  : chenht2022
 7 |  * @LastEditTime : 2024-07-25 10:33:38
 8 |  * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 9 |  **/
10 | #ifndef CPUINFER_BACKEND_H
11 | #define CPUINFER_BACKEND_H
12 | 
13 | #include <atomic>
14 | #include <condition_variable>
15 | #include <cstdio>
16 | #include <functional>
17 | #include <mutex>
18 | #include <thread>
19 | #include <vector>
20 | 
// State of a worker thread, stored in ThreadState::status.
// NOTE(review): meanings below are inferred from the names; the code that
// reads and writes them is not in this header - confirm.
enum ThreadStatus {
    WORKING,  // presumably: the thread is processing tasks
    WAITING,  // presumably: the thread is idle and waiting for a job
    EXIT,     // presumably: the thread has been asked to terminate
};
26 | 
// Per-thread bookkeeping held by Backend (one entry per thread).
// The atomics are heap-allocated behind unique_ptr, which keeps ThreadState
// movable even though std::atomic itself is not.
// NOTE(review): std::unique_ptr is declared in <memory>, which this header
// does not include directly.
struct ThreadState {
    std::unique_ptr<std::atomic<ThreadStatus>> status;
    // NOTE(review): presumably the next task index this thread will take,
    // with `end` as the exclusive upper bound of its range - confirm.
    std::unique_ptr<std::atomic<int>> curr;
    int end;
};
32 | 
// Thread pool interface used to run a job split into integer-indexed tasks.
// Only declarations are visible here; behaviour notes marked NOTE(review) are
// assumptions to confirm against the implementation.
class Backend {
  public:
    // NOTE(review): the int is presumably the maximum number of threads.
    Backend(int);
    ~Backend();
    int get_thread_num();
    // Runs a job. NOTE(review): the int is presumably the task count and the
    // three callbacks presumably correspond to init_func_, compute_func_ and
    // finalize_func_ below, in that order - confirm.
    void do_work_stealing_job(int, std::function<void(int)>,
                              std::function<void(int)>,
                              std::function<void(int)>);
    #ifdef USE_NUMA
    // Per-thread NUMA node value; only present in NUMA-enabled builds.
    static thread_local int numa_node;
    #endif
    // Per-thread id value.
    static thread_local int thread_local_id;

  private:
    int thread_num_;
    int max_thread_num_;
    std::vector<ThreadState> thread_state_; // [thread_num]
    std::function<void(int)> init_func_;
    std::function<void(int)> compute_func_;
    std::function<void(int)> finalize_func_;
    std::vector<std::thread> workers_;

    void process_tasks(int);
    void worker_thread(int);
};
58 | #endif


--------------------------------------------------------------------------------
/csrc/ktransformers_ext/cpu_backend/shared_mem_buffer.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @Description  :
 3 |  * @Author       : chenht2022
 4 |  * @Date         : 2024-08-05 04:49:08
 5 |  * @Version      : 1.0.0
 6 |  * @LastEditors  : chenht2022 
 7 |  * @LastEditTime : 2024-08-05 09:21:29
 8 |  * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 9 |  **/
#include "shared_mem_buffer.h"
#include <cstdio>
#include <cstdlib>
#include <new>
12 | 
13 | SharedMemBuffer::SharedMemBuffer() {
14 |     buffer_ = nullptr;
15 |     size_ = 0;
16 | }
17 | 
18 | SharedMemBuffer::~SharedMemBuffer() {
19 |     if (buffer_) {
20 |         free(buffer_);
21 |     }
22 | }
23 | 
24 | void SharedMemBuffer::alloc(void* object, std::vector<std::pair<void**, uint64_t>> requests) {
25 |     uint64_t size = 0;
26 |     for (auto& request : requests) {
27 |         size += request.second;
28 |     }
29 |     if (size > size_) {
30 |         if (buffer_) {
31 |             free(buffer_);
32 |         }
33 |         buffer_ = std::aligned_alloc(64, size);
34 | 
35 |         size_ = size;
36 |         for (auto& obj_requests : hist_requests_) {
37 |             for (auto& requests : obj_requests.second) {
38 |                 arrange(requests);
39 |             }
40 |         }
41 |     }
42 |     arrange(requests);
43 |     hist_requests_[object].push_back(requests);
44 | }
45 | 
// Forgets every request list recorded for `object`, so its destination
// pointers are no longer rewritten when the buffer is reallocated.
// The buffer itself is neither shrunk nor freed here.
void SharedMemBuffer::dealloc(void* object) {
    hist_requests_.erase(object);
}
49 | 
50 | void SharedMemBuffer::arrange(std::vector<std::pair<void**, uint64_t>> requests) {
51 |     uint64_t offset = 0;
52 |     for (auto& request : requests) {
53 |         *(request.first) = (uint8_t*)buffer_ + offset;
54 |         offset += request.second;
55 |     }
56 | }


--------------------------------------------------------------------------------
/csrc/ktransformers_ext/cpu_backend/shared_mem_buffer.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @Description  :
 3 |  * @Author       : chenht2022
 4 |  * @Date         : 2024-08-05 04:49:08
 5 |  * @Version      : 1.0.0
 6 |  * @LastEditors  : chenht2022 
 7 |  * @LastEditTime : 2024-08-05 06:36:41
 8 |  * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 9 |  **/
10 | 
11 |  #ifndef CPUINFER_SHAREDMEMBUFFER_H
12 |  #define CPUINFER_SHAREDMEMBUFFER_H
13 |  
14 |  #include <cstdint>
15 |  #include <cstdlib>
16 |  #include <map>
17 |  #include <vector>
18 |  
 // A single growable memory region that callers carve into sub-ranges.
 // Callers pass (destination pointer, byte count) requests; the class writes
 // addresses inside its buffer into the destinations and remembers the
 // requests per owner object.
 class SharedMemBuffer {
    public:
     SharedMemBuffer();
     ~SharedMemBuffer();
 
     // Registers `requests` for `object` and fills in the destination pointers.
     void alloc(void* object, std::vector<std::pair<void**, uint64_t>> requests);
     // Drops the requests recorded for `object`.
     void dealloc(void* object);
 
    private:
     void* buffer_;   // backing storage, null until first needed
     uint64_t size_;  // capacity of buffer_ in bytes
     // owner object -> request lists registered through alloc()
     std::map<void*, std::vector<std::vector<std::pair<void**, uint64_t>>>> hist_requests_;
 
     // Writes consecutive buffer addresses into the destinations of `requests`.
     void arrange(std::vector<std::pair<void**, uint64_t>> requests);
 };
34 |  
 // Buffer instance used by code that includes this header.
 // NOTE(review): `static` at namespace scope in a header gives every
 // translation unit its own separate instance; confirm that per-file buffers
 // (rather than one process-wide buffer) are intended.
 static SharedMemBuffer shared_mem_buffer;
36 |  
37 |  #endif


--------------------------------------------------------------------------------
/csrc/ktransformers_ext/cpu_backend/task_queue.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @Description :
 3 |  * @Author    : chenht2022
 4 |  * @Date     : 2024-07-17 12:25:51
 5 |  * @Version   : 1.0.0
 6 |  * @LastEditors : chenht2022
 7 |  * @LastEditTime : 2024-10-09 11:08:10
 8 |  * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 9 |  **/
10 | #include "task_queue.h"
11 | 
12 | TaskQueue::TaskQueue() {
13 |     worker = std::thread(&TaskQueue::processTasks, this);
14 |     sync_flag.store(true, std::memory_order_seq_cst);
15 |     exit_flag.store(false, std::memory_order_seq_cst);
16 | }
17 | 
18 | TaskQueue::~TaskQueue() {
19 |     {
20 |         mutex.lock();
21 |         exit_flag.store(true, std::memory_order_seq_cst);
22 |         mutex.unlock();
23 |     }
24 |     cv.notify_all();
25 |     if (worker.joinable()) {
26 |         worker.join();
27 |     }
28 | }
29 | 
30 | void TaskQueue::enqueue(std::function<void()> task) {
31 |     {
32 |         mutex.lock();
33 |         tasks.push(task);
34 |         sync_flag.store(false, std::memory_order_seq_cst);
35 |         mutex.unlock();
36 |     }
37 |     cv.notify_one();
38 | }
39 | 
40 | void TaskQueue::sync() {
41 |     while (!sync_flag.load(std::memory_order_seq_cst))
42 |         ;
43 | }
44 | 
45 | void TaskQueue::processTasks() {
46 |     while (true) {
47 |         std::function<void()> task;
48 |         {
49 |             mutex.lock();
50 |             cv.wait(mutex, [this]() { return !tasks.empty() || exit_flag.load(std::memory_order_seq_cst); });
51 |             if (exit_flag.load(std::memory_order_seq_cst) && tasks.empty()) {
52 |                 return;
53 |             }
54 |             task = tasks.front();
55 |             tasks.pop();
56 |             mutex.unlock();
57 |         }
58 |         task();
59 |         {
60 |             mutex.lock();
61 |             if (tasks.empty()) {
62 |                 sync_flag.store(true, std::memory_order_seq_cst);
63 |             }
64 |             mutex.unlock();
65 |         }
66 |     }
67 | }


--------------------------------------------------------------------------------
/csrc/ktransformers_ext/cpu_backend/vendors/README.md:
--------------------------------------------------------------------------------
1 | ## TODO
2 | 
3 | This directory can be removed after updating the version of `llama.cpp`.


--------------------------------------------------------------------------------
/csrc/ktransformers_ext/cpu_backend/vendors/cuda.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <cuda_runtime.h>
 4 | #include <cuda.h>
 5 | #include <cublas_v2.h>
 6 | #include <cuda_bf16.h>
 7 | #include <cuda_fp16.h>
 8 | 
// Compatibility aliases for CUDA runtimes older than 11.2
// (CUDART_VERSION < 11020): names used by the code are mapped onto
// identifiers from the older API.
#if CUDART_VERSION < 11020
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
// The cuBLAS compute-type names are mapped onto cudaDataType_t values.
#define CUBLAS_COMPUTE_16F CUDA_R_16F
#define CUBLAS_COMPUTE_32F CUDA_R_32F
#define cublasComputeType_t cudaDataType_t
#endif // CUDART_VERSION < 11020
16 | 


--------------------------------------------------------------------------------
/csrc/ktransformers_ext/cpu_backend/vendors/vendor.h:
--------------------------------------------------------------------------------
 1 | #ifndef CPUINFER_VENDOR_VENDOR_H
 2 | #define CPUINFER_VENDOR_VENDOR_H
 3 | 
// Pull in the header for the GPU vendor selected at build time; nothing is
// included when none of the macros is set.
// NOTE(review): USE_CUDA is tested with #ifdef (defined at all), while
// USE_HIP and USE_MUSA are tested by value with #elif, so they must be
// defined to a non-zero value - an empty definition is a preprocessor error.
#ifdef USE_CUDA
#include "cuda.h"
#elif USE_HIP
// Defined before including the HIP wrapper header.
#define __HIP_PLATFORM_AMD__
#include "hip.h"
#elif USE_MUSA
#include "musa.h"
#endif
12 | 
13 | #endif  // CPUINFER_VENDOR_VENDOR_H


--------------------------------------------------------------------------------
/csrc/ktransformers_ext/cuda/custom_gguf/ops.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @Description  :
 3 |  * @Author       : Azure-Tang
 4 |  * @Date         : 2024-07-22 09:27:55
 5 |  * @Version      : 1.0.0
 6 |  * @LastEditors  : kkk1nak0
 7 |  * @LastEditTime : 2024-08-12 03:48:46
 8 |  * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 9 | **/
10 | #pragma once
11 | 
12 | #include <torch/library.h>
13 | #include <torch/extension.h>
14 | #include <torch/torch.h>
15 | 
// Dequantization entry points, one per GGUF quantization format
// (declarations only; the definitions are not in this header).
// All seven share the same signature and return a torch::Tensor.
// NOTE(review): presumably `data` points to `num_bytes` of quantized blocks,
// `blk_size` is the byte size of one block, `ele_per_blk` the number of
// elements a block decodes to, and the result is created on `device` with
// dtype `target_dtype` - confirm against the CUDA implementation.
torch::Tensor dequantize_q8_0(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);
torch::Tensor dequantize_q6_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);
torch::Tensor dequantize_q5_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);
torch::Tensor dequantize_q4_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);
torch::Tensor dequantize_q3_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);
torch::Tensor dequantize_q2_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);
torch::Tensor dequantize_iq4_xs(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype);


--------------------------------------------------------------------------------
/csrc/ktransformers_ext/cuda/gptq_marlin/ops.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @Description  :  
 3 |  * @Author       : Azure
 4 |  * @Date         : 2024-07-22 09:27:55
 5 |  * @Version      : 1.0.0
 6 |  * @LastEditors  : Azure 
 7 |  * @LastEditTime : 2024-07-26 08:35:00
 8 |  * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
 9 | **/
10 | #pragma once
11 | 
12 | #include <torch/library.h>
13 | #include <torch/extension.h>
14 | #include <torch/torch.h>
15 | 
// Marlin GPTQ GEMM entry point (declaration only; the definition is not in
// this header).
// NOTE(review): presumably multiplies activations `a` (size_m x size_k) by the
// quantized weight `b_q_weight` using `b_scales`, with `g_idx`/`perm`
// describing activation reordering and `workspace` used as scratch space -
// confirm against the kernel source.
torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
                               torch::Tensor& b_scales, torch::Tensor& g_idx,
                               torch::Tensor& perm, torch::Tensor& workspace,
                               int64_t num_bits, int64_t size_m, int64_t size_n,
                               int64_t size_k, bool is_k_full);

// The repack entry point is not declared in this header (kept commented out).
// torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
//                                  int64_t size_k, int64_t size_n,
//                                  int64_t num_bits);


--------------------------------------------------------------------------------
/csrc/ktransformers_ext/cuda/setup.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from setuptools import setup, Extension
 3 | from torch.utils import cpp_extension
 4 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension
 5 | setup(
 6 |     name='KTransformersOps',
 7 |     ext_modules=[
 8 |         CUDAExtension(
 9 |             'KTransformersOps', [
10 |                 'custom_gguf/dequant.cu',
11 |                 'binding.cpp',
12 |                 'gptq_marlin/gptq_marlin.cu',
13 |                 # 'gptq_marlin_repack.cu',
14 |             ],
15 |             extra_compile_args={
16 |                 'cxx': ['-O3'],
17 |                 'nvcc': [
18 |                     '-O3',
19 |                     '--use_fast_math',
20 |                     '-Xcompiler', '-fPIC',
21 |                 ]
22 |             },
23 |         )
24 |     ],
25 |     cmdclass={'build_ext': BuildExtension}
26 | )


--------------------------------------------------------------------------------
/csrc/ktransformers_ext/cuda/test_dequant.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | sys.path.insert(0,"/home/zbx/ktransformers")
 4 | from ktransformers.util.custom_loader import GGUFLoader
 5 | import torch
 6 | 
 7 | gguf_loader_1 = GGUFLoader("/mnt/data/model/DeepseekV3-q4km-gguf")
 8 | gguf_loader_2 = GGUFLoader("/mnt/data/chenht/model/gguf_for_ktransformers/DeepSeek-V3-bf16/")
 9 | 
10 | torch.set_default_dtype(torch.bfloat16)
11 | 
12 | tensor_1 = gguf_loader_1.load_gguf_tensor("blk.0.attn_kv_a_mqa.weight", "cuda")
13 | tensor_2 = gguf_loader_2.load_gguf_tensor("blk.0.attn_kv_a_mqa.weight", "cuda")
14 | 
15 | print(tensor_1[0, -64:])
16 | print(tensor_2[0, -64:])


--------------------------------------------------------------------------------
/csrc/ktransformers_ext/examples/test_linear.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | '''
 4 | Description  :  
 5 | Author       : chenht2022
 6 | Date         : 2024-07-25 10:32:05
 7 | Version      : 1.0.0
 8 | LastEditors  : chenht2022 
 9 | LastEditTime : 2024-08-06 10:36:59
10 | Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
11 | '''
import os, sys
import time
# Locate the build directory from this file's absolute path. A bare
# dirname(__file__) is '' when the script is started by filename from its own
# directory, which would turn the path into '/../build'.
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'build'))
import cpuinfer_ext
import torch

# Problem size and CPUInfer configuration.
input_size = 16384
output_size = 5120
stride = 32
group_max_len = 1024
proj_type = 1 # ggml_type::GGML_TYPE_F16
hidden_type = 1 # ggml_type::GGML_TYPE_F16
qlen = 30
layer_num = 10
CPUInfer = cpuinfer_ext.CPUInfer(48)
validation_iter = 100

with torch.inference_mode(mode=True):
    # Build `layer_num` linear operators, each over its own random fp16 weight.
    # The weight tensors are kept in `projs` because the operators are given
    # only their raw data pointers.
    linears = []
    projs = []
    for _ in range(layer_num):
        proj = torch.randn((output_size, input_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous()
        config = cpuinfer_ext.linear.LinearConfig(input_size, output_size, stride, group_max_len, proj.data_ptr(), proj_type, hidden_type)
        linear = cpuinfer_ext.linear.Linear(config)
        projs.append(proj)
        linears.append(linear)

    # validation: compare the operator's output with torch.mm on the same data
    for i in range(validation_iter):
        linear = linears[i % layer_num]
        # Named `x` rather than `input` to avoid shadowing the builtin.
        x = torch.randn((qlen, input_size), dtype=torch.float16).contiguous()
        output = torch.empty((qlen, output_size), dtype=torch.float16).contiguous()
        x = x / 100

        CPUInfer.submit(
            linear.forward(
                qlen,
                x.data_ptr(),
                output.data_ptr()
            )
        )
        CPUInfer.sync()
        # print('cpuinfer output', output)

        proj = projs[i%layer_num]
        t_output = torch.mm(x, proj.t())
        # print('torch output', t_output)

        # Mean absolute error relative to the mean magnitude of the reference.
        diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
        print('diff = ', diff)
        assert(diff < 0.001)
63 | 


--------------------------------------------------------------------------------
/csrc/ktransformers_ext/operators/amx/la/utils.hpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @Description  :
 3 |  * @Author       : chenht2022
 4 |  * @Date         : 2025-04-25 18:28:12
 5 |  * @Version      : 1.0.0
 6 |  * @LastEditors  : chenht2022
 7 |  * @LastEditTime : 2025-04-25 18:28:12
 8 |  * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 9 |  **/
10 | 
11 | #pragma once
12 | #include <cstdint>
13 | 
14 | 
// Returns `ptr` advanced by `byte_offset` bytes (not elements), keeping the
// pointee type.
template <typename T>
T* offset_pointer(T* ptr, std::size_t byte_offset) {
  char* const shifted = reinterpret_cast<char*>(ptr) + byte_offset;
  return reinterpret_cast<T*>(shifted);
}

// Const overload: same byte-wise advance for pointers to const data.
template <typename T>
const T* offset_pointer(const T* ptr, std::size_t byte_offset) {
  const char* const shifted = reinterpret_cast<const char*>(ptr) + byte_offset;
  return reinterpret_cast<const T*>(shifted);
}
24 | 
// Address of element (row, col) in a row-major matrix starting at `t`.
// `ld` is the distance between consecutive rows in bytes; `col` is counted in
// elements.
template <typename T>
T* offset_pointer_row_major(T* t, int row, int col, std::size_t ld) {
  T* const row_start = offset_pointer(t, row * ld);
  return row_start + col;
}
29 | 
// Address of element (row, col) in a column-major matrix starting at `t`.
// `ld` is the distance between consecutive columns in bytes; `row` is counted
// in elements.
template <typename T>
T* offset_pointer_col_major(T* t, int row, int col, std::size_t ld) {
  T* const col_start = offset_pointer(t, col * ld);
  return col_start + row;
}
34 | 
// Copies one 512-bit vector (32 bf16 values) from `src` to `dst` using
// unaligned load/store intrinsics.
// NOTE(review): the __m512i type and the intrinsics need <immintrin.h>, which
// this header does not include itself.
static inline void avx512_copy_32xbf16(__m512i* src, __m512i* dst) {
  _mm512_storeu_si512(dst, _mm512_loadu_si512(src));
}
// Converts 32 fp32 values (two 16-float vectors) to 32 bf16 values and stores
// them to `dst` with an unaligned store. The values from `src0` end up in the
// lower half of the result and those from `src1` in the upper half.
// NOTE(review): `*src0` / `*src1` are plain dereferences of __m512 pointers,
// which presumably requires 64-byte-aligned sources - confirm at call sites.
static inline void avx512_32xfp32_to_32xbf16(__m512* src0, __m512* src1, __m512i* dst) {
  _mm512_storeu_si512(dst, __m512i(_mm512_cvtne2ps_pbh(*src1, *src0)));
}
// Expands 32 bf16 values at `src` into 32 fp32 values: the first 16 go to
// `dst0`, the next 16 to `dst1`.
// Each half is loaded as 256 bits, widened from 16-bit to 32-bit lanes with
// zero extension, and shifted left by 16 so the bf16 bits become the upper
// half of the fp32 bit pattern.
static inline void avx512_32xbf16_to_32xfp32(__m512i* src, __m512* dst0, __m512* dst1) {
  _mm512_storeu_ps(dst0, _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)(src))), 16)));
  _mm512_storeu_ps(dst1, _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)(src) + 1)), 16)));
}


--------------------------------------------------------------------------------
/csrc/ktransformers_ext/operators/llamafile/conversion.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @Description  :
 3 |  * @Author       : chenht2022
 4 |  * @Date         : 2024-07-12 10:07:58
 5 |  * @Version      : 1.0.0
 6 |  * @LastEditors  : chenht2022 
 7 |  * @LastEditTime : 2024-07-25 10:34:55
 8 |  * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 9 |  **/
10 | #ifndef CPUINFER_CONVERSION_H
11 | #define CPUINFER_CONVERSION_H
12 | 
13 | #include <memory.h>
14 | #include "llama.cpp/ggml.h"
15 | 
16 | inline void to_float(const void* input, float* output, int size, ggml_type type) {
17 |     if (type == ggml_type::GGML_TYPE_F32) {
18 |         memcpy(output, input, size * sizeof(float));
19 |     } else {
20 |         ggml_internal_get_type_traits(type).to_float(input, output, size);
21 |     }
22 | }
23 | 
24 | inline void from_float(const float* input, void* output, int size, ggml_type type) {
25 |     if (type == ggml_type::GGML_TYPE_F32) {
26 |         memcpy(output, input, size * sizeof(float));
27 |     } else {
28 |         ggml_internal_get_type_traits(type).from_float(input, output, size);
29 |     }
30 | }
31 | 
32 | #endif


--------------------------------------------------------------------------------
/csrc/ktransformers_ext/operators/llamafile/linear.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @Description  :
 3 |  * @Author       : chenht2022
 4 |  * @Date         : 2024-07-12 10:07:58
 5 |  * @Version      : 1.0.0
 6 |  * @LastEditors  : chenht2022
 7 |  * @LastEditTime : 2024-07-25 10:35:00
 8 |  * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 9 |  **/
10 | #ifndef CPUINFER_OPERATOR_LINEAR_H
11 | #define CPUINFER_OPERATOR_LINEAR_H
12 | 
13 | #include <cmath>
14 | #include <cstdio>
15 | #include <functional>
16 | #include <mutex>
17 | #include <vector>
18 | 
19 | #include "../../cpu_backend/backend.h"
20 | #include "../../cpu_backend/shared_mem_buffer.h"
21 | #include "conversion.h"
22 | #include "llama.cpp/ggml-impl.h"
23 | #include "llama.cpp/ggml-quants.h"
24 | #include "llama.cpp/ggml.h"
25 | #include "llamafile/sgemm.h"
26 | 
// Construction parameters for a Linear operator.
struct LinearConfig {
    int input_size;
    int output_size;
    // NOTE(review): presumably the number of output rows handled per task -
    // confirm against the implementation.
    int stride;
    int group_max_len;
    // Weight data; held as a raw pointer, so the caller keeps ownership.
    void* proj;
    ggml_type proj_type;
    ggml_type hidden_type;

    // Leaves every field uninitialised.
    LinearConfig() {}

    // Stores each argument in the field of the same name.
    LinearConfig(int input_size, int output_size, int stride, int group_max_len, void* proj, ggml_type proj_type, ggml_type hidden_type)
        : input_size(input_size), output_size(output_size), stride(stride), group_max_len(group_max_len), proj(proj), proj_type(proj_type), hidden_type(hidden_type) {}
};
41 | 
42 | class Linear {  // Operator built from a LinearConfig; only declarations live in this header, the bodies are defined elsewhere.
43 |    public:
44 |     Linear(LinearConfig);  // takes the configuration by value
45 |     ~Linear();  // NOTE(review): destructor is user-declared and the class holds raw pointers, yet copy ctor/assignment are implicit — confirm copies are safe
46 |     void warm_up(Backend* backend);  // NOTE(review): presumably runs a throwaway pass on `backend` before real use — confirm in the implementation
47 |     void forward_many(int qlen, const void* input, void* output, Backend* backend);  // same parameter list as forward(); NOTE(review): relationship between the two is not visible here
48 |     void forward(int qlen, const void* input, void* output, Backend* backend);  // NOTE(review): input/output presumably hold qlen rows in config_.hidden_type — confirm
49 | 
50 |    private:
51 |     LinearConfig config_;  // copy of the configuration
52 |     void* proj_;  // [output_size * input_size ( /32 if quantized)]
53 | 
54 |     float* input_fp32_;    // [group_max_len * input_size]
55 |     uint8_t* proj_input_;  // [group_max_len * input_size * ggml_type_size(ggml_internal_get_type_traits(proj_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(proj_type).vec_dot_type)]
56 |     float* proj_output_;   // [group_max_len * output_size]
57 | };
58 | 
59 | #endif


--------------------------------------------------------------------------------
/csrc/ktransformers_ext/vendors/cuda.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <cuda_runtime.h>
 4 | #include <cuda.h>
 5 | #include <cublas_v2.h>
 6 | #include <cuda_bf16.h>
 7 | #include <cuda_fp16.h>
 8 | 
 9 | #if CUDART_VERSION < 11020
10 | #define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
11 | #define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
12 | #define CUBLAS_COMPUTE_16F CUDA_R_16F
13 | #define CUBLAS_COMPUTE_32F CUDA_R_32F
14 | #define cublasComputeType_t cudaDataType_t
15 | #endif // CUDART_VERSION < 11020
16 | 


--------------------------------------------------------------------------------
/csrc/ktransformers_ext/vendors/vendor.h:
--------------------------------------------------------------------------------
 1 | #ifndef CPUINFER_VENDOR_VENDOR_H
 2 | #define CPUINFER_VENDOR_VENDOR_H
 3 | 
 4 | #ifdef USE_CUDA
 5 | #include "cuda.h"
 6 | #elif USE_HIP
 7 | #define __HIP_PLATFORM_AMD__
 8 | #include "hip.h"
 9 | #elif USE_MUSA
10 | #include "musa.h"
11 | #endif
12 | 
13 | #endif  // CPUINFER_VENDOR_VENDOR_H


--------------------------------------------------------------------------------
/doc/SUMMARY.md:
--------------------------------------------------------------------------------
 1 | # Ktransformers
 2 | 
 3 | [Introduction](./README.md)
 4 | # Install
 5 | - [Installation Guide](en/install.md)
 6 | 
 7 | # Tutorial 
 8 | - [Deepseek-R1/V3 Show Case/Tutorial](en/DeepseekR1_V3_tutorial.md)
 9 | - [Why KTransformers So Fast](en/deepseek-v2-injection.md)
10 | - [Injection Tutorial](en/injection_tutorial.md)
11 | - [Multi-GPU Tutorial](en/multi-gpu-tutorial.md)
12 | - [Use FP8 GPU Kernel](en/fp8_kernel.md)
13 | - [Use AMD GPU](en/ROCm.md)
14 | # Server
15 |   - [Server](en/api/server/server.md)
16 |   - [Website](en/api/server/website.md)
17 |   - [Tabby](en/api/server/tabby.md)
18 | # For Developer
19 | - [Makefile Usage](en/makefile_usage.md)
20 | 
21 | # FAQ
22 | - [FAQ](en/FAQ.md)
23 | # V3 Reproduction
24 | - [Success List](en/V3-success.md)
25 | # Benchmark
26 | - [Benchmark](en/benchmark.md)
27 | 


--------------------------------------------------------------------------------
/doc/assets/BigCodeBench.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/BigCodeBench.png


--------------------------------------------------------------------------------
/doc/assets/DeepSeek-on-KTransformers.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/DeepSeek-on-KTransformers.png


--------------------------------------------------------------------------------
/doc/assets/Framework_effect.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/Framework_effect.png


--------------------------------------------------------------------------------
/doc/assets/InfLLM_equation.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/InfLLM_equation.jpg


--------------------------------------------------------------------------------
/doc/assets/InfLLM_framework.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/InfLLM_framework.png


--------------------------------------------------------------------------------
/doc/assets/InjectStruction.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/InjectStruction.png


--------------------------------------------------------------------------------
/doc/assets/KTransformers.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/KTransformers.png


--------------------------------------------------------------------------------
/doc/assets/KTransformers_long_context_v1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/KTransformers_long_context_v1.png


--------------------------------------------------------------------------------
/doc/assets/KTransformers_long_context_v2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/KTransformers_long_context_v2.png


--------------------------------------------------------------------------------
/doc/assets/Quest_framework.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/Quest_framework.png


--------------------------------------------------------------------------------
/doc/assets/SnapKV_framework.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/SnapKV_framework.png


--------------------------------------------------------------------------------
/doc/assets/SparQ_attention.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/SparQ_attention.png


--------------------------------------------------------------------------------
/doc/assets/amx.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/amx.png


--------------------------------------------------------------------------------
/doc/assets/amx_avx.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/amx_avx.png


--------------------------------------------------------------------------------
/doc/assets/amx_intro.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/amx_intro.png


--------------------------------------------------------------------------------
/doc/assets/cpuinfer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/cpuinfer.png


--------------------------------------------------------------------------------
/doc/assets/deepseekv2_structure.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/deepseekv2_structure.png


--------------------------------------------------------------------------------
/doc/assets/internlm_memory.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/internlm_memory.png


--------------------------------------------------------------------------------
/doc/assets/long_context_generate.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/long_context_generate.png


--------------------------------------------------------------------------------
/doc/assets/long_context_prefill.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/long_context_prefill.png


--------------------------------------------------------------------------------
/doc/assets/model_structure_guild.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/model_structure_guild.png


--------------------------------------------------------------------------------
/doc/assets/multi_gpu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/multi_gpu.png


--------------------------------------------------------------------------------
/doc/assets/needle_128K.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/needle_128K.png


--------------------------------------------------------------------------------
/doc/assets/needle_1M.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/needle_1M.png


--------------------------------------------------------------------------------
/doc/assets/onednn_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/onednn_1.png


--------------------------------------------------------------------------------
/doc/assets/website.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/website.png


--------------------------------------------------------------------------------
/doc/basic/note1.md:
--------------------------------------------------------------------------------
1 | # basic-first20
2 | 


--------------------------------------------------------------------------------
/doc/basic/note2.md:
--------------------------------------------------------------------------------
1 | # basic-data_structure
2 | 


--------------------------------------------------------------------------------
/doc/en/Docker.md:
--------------------------------------------------------------------------------
 1 | # Docker
 2 | 
 3 | ## Prerequisites
 4 | * Docker must be installed and running on your system.
 5 | * Create a folder to store big models & intermediate files (ex. /mnt/models)
 6 | 
 7 | ## Images
 8 | A Docker image is available for this project. You can pull it with:
 9 | ```
10 | docker pull approachingai/ktransformers:0.2.1
11 | ```
12 | **Notice**: In this image, ktransformers is compiled for CPUs with the AVX512 instruction set. If your CPU does not support AVX512, it is recommended to recompile and reinstall ktransformers in the /workspace/ktransformers directory inside the container.
13 | 
14 | ## Building docker image locally
15 |  - Download the Dockerfile from [here](../../Dockerfile)
16 | 
17 |  - Once the download has finished, run:
18 |    ```bash
19 |    docker build  -t approachingai/ktransformers:0.2.1 .
20 |    ```
21 | 
22 | ## Usage
23 | 
24 | Assuming you have the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) installed so that you can use the GPU in a Docker container:
25 | ```
26 | docker run --gpus all -v /path/to/models:/models --name ktransformers -itd approachingai/ktransformers:0.2.1
27 | docker exec -it ktransformers /bin/bash
28 | python -m ktransformers.local_chat  --gguf_path /models/path/to/gguf_path --model_path /models/path/to/model_path --cpu_infer 33
29 | ```
30 | 
31 | For more usage options, see the [readme](../../README.md).


--------------------------------------------------------------------------------
/doc/en/V3-success.md:
--------------------------------------------------------------------------------
 1 | ## Hello everyone, here is the successfully reproduced environment configuration for your reference:
 2 | ### Case 1
 3 | - Configuration: l40s 48G + 9654 x2 (192 cores) + 768G DDR5 12-channel
 4 | - Performance: prefill 108 tokens/s, decode 10.8 tokens/s
 5 | - Used version: main source code compiled 
 6 | ### Case 2
 7 | - Configuration: Dual Xeon 6430 32C processors, totaling 64 cores and 128 threads, 480GB DDR5 memory, single 4090 24G graphics card
 8 | - Performance: Running speed approximately 6-8 tokens per second 
 9 | ## NOTE
10 | If you have successfully run any other configuration, please feel free to let us know. We will keep this list updated for everyone to refer to when reproducing. (It has also been found to work on 2080, AMD, etc. — doge :)
11 | [click here](https://docs.qq.com/smartsheet/form/AVxgQOYhhNfl%2FBB08J2%2Fv3rnnq?tab=BB08J2)


--------------------------------------------------------------------------------
/doc/en/api/server/run-tabby.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/en/api/server/run-tabby.png


--------------------------------------------------------------------------------
/doc/en/api/server/server-arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/en/api/server/server-arch.png


--------------------------------------------------------------------------------
/doc/en/api/server/visit-api-tags.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/en/api/server/visit-api-tags.png


--------------------------------------------------------------------------------
/doc/en/api/server/website.md:
--------------------------------------------------------------------------------
 1 | # Start with website
 2 | 
 3 | This document provides the necessary steps to set up and run the web service for this project.
 4 | 
 5 | ## 1. Starting the Web Service
 6 | 
 7 | ### 1.1. Compiling the Web Code
 8 | 
 9 | Before you can compile the web code, make sure you have installed [Node.js](https://nodejs.org) version 18.3 or higher
10 | 
11 | Note: The version of Node.js in the Ubuntu or Debian GNU/Linux software repository is too low, causing compilation errors. Users can also install Node.js through the Nodesource repository, provided they uninstall the outdated version first.
12 | 
13 | ```bash
14 | 
15 |   # sudo apt-get remove nodejs npm -y && sudo apt-get autoremove -y
16 |   sudo apt-get update -y && sudo apt-get install -y apt-transport-https ca-certificates curl gnupg
17 |   curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | sudo gpg --dearmor -o /usr/share/keyrings/nodesource.gpg
18 |   sudo chmod 644 /usr/share/keyrings/nodesource.gpg
19 |   echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/nodesource.gpg] https://deb.nodesource.com/node_23.x nodistro main" | sudo tee /etc/apt/sources.list.d/nodesource.list
20 |   sudo apt-get update -y
21 |   sudo apt-get install nodejs -y
22 | 
23 | ```
24 | 
25 | Once npm is installed, navigate to the `ktransformers/website` directory:
26 | 
27 | ```bash
28 | cd ktransformers/website
29 | ```
30 | 
31 | Next, install the Vue CLI with the following command:
32 | 
33 | ```bash
34 | npm install @vue/cli
35 | ```
36 | 
37 | Now you can build the project:
38 | 
39 | ```bash
40 | npm run build
41 | ```
42 | Finally you can build ktransformers with website:
43 | ```
44 | cd ../../
45 | pip install .
46 | ```
47 | 


--------------------------------------------------------------------------------
/doc/en/makefile_usage.md:
--------------------------------------------------------------------------------
 1 | # Makefile
 2 | ## Target
 3 | ### flake_find:
 4 | ```bash
 5 | make flake_find
 6 | ```
 7 | Finds all the Python files under the ./ktransformers directory and collects the codes of the errors, warnings, fatals, etc. that are not consistent with the PEP 8 standard into a list. For now, this list has been placed in the `extend-ignore` section of the .flake8 file so that flake8 ignores them temporarily (we may fix them in the future).
 8 | ### format:
 9 | ```bash
10 | make format
11 | ```
12 | We use black to format all the Python files under the ./ktransformers directory. It follows the PEP 8 standard,
13 | but we change the line length to 120 by adding
14 | ```toml
15 | [tool.black]
16 | line-length = 120
17 | preview = true
18 | unstable = true
19 | ```
20 | in the pyproject.toml file.
21 | 
22 | ### dev_install:
23 | ```bash
24 | make dev_install
25 | ```
26 | Installs the package in development (editable) mode, so if you modify the code you don't need to reinstall the package. We recommend that developers install the package this way.


--------------------------------------------------------------------------------
/doc/en/operators/Combined_MoE_time_per_layer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/en/operators/Combined_MoE_time_per_layer.png


--------------------------------------------------------------------------------
/doc/en/operators/Linear_projection_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/en/operators/Linear_projection_time.png


--------------------------------------------------------------------------------
/doc/zh/api/server/run-tabby.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/zh/api/server/run-tabby.png


--------------------------------------------------------------------------------
/doc/zh/api/server/server-arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/zh/api/server/server-arch.png


--------------------------------------------------------------------------------
/doc/zh/api/server/tabby.md:
--------------------------------------------------------------------------------
 1 | # 如何使用 Tabby 和 ktransformers 在本地利用 236B 的大模型做代码补全？
 2 | 
 3 | [Tabby](https://tabby.tabbyml.com/docs/welcome/) 是一个开源的代码助手，用户可以手动配置后端使用的框架及模型，并在多个 IDE/编辑器 上使用，例如 VSCode 和 IntelliJ。因为 Tabby 在框架侧可以对接到 Ollama，并且 ktransformers server 提供和 Ollama 一致的 API 接口，所以我们可以将 Tabby 对接到 ktransformers server。并在代码补全的场景中体验到 ktransformers 快速的异构推理。
 4 | 
 5 | 1. 启动 ktransformers。
 6 | ```bash
 7 | ./ktransformers --port 9112
 8 | ```
 9 | 2. 安装 Tabby：按照 Tabby 的官方教程在带有英伟达 GPU 的 Linux 服务器或者 Windows PC 上[安装 Tabby](https://tabby.tabbyml.com/docs/quick-start/installation/linux/)。
10 | 3. 配置 Tabby：创建`~/.tabby/config.toml`，并加入以下配置。
11 | ```toml
12 | [model.completion.http]
13 | kind = "ollama/completion"
14 | api_endpoint = "http://127.0.0.1:9112/"
15 | model_name = "DeepSeek-Coder-V2-Instruct"
16 | prompt_template = "<｜fim▁begin｜>{prefix}<｜fim▁hole｜>{suffix}<｜fim▁end｜>" # Prompt Template
17 | ```
18 | 
19 | 在这个配置中，`kind` 指明 ktransformers 使用 Ollama 的标准 API 为 Tabby 提供服务；`api_endpoint` 与 ktransformers 启动时绑定的接口保持一致；`model_name` 设置为 ktransformers 使用的模型，这里使用 `DeepSeek-Coder-V2-Instruct` 作为后台推理的模型；`prompt_template` 是模型的提示词模板，针对不同的模型，使用相对应的模版才能正常使用模型 Fill In the Middle 的功能。
20 | 在这里演示的是 Tabby 使用 Ollama API 提供 Completion 功能的相关配置，有关 Tabby 其他可选功能的配置信息请参照[这里](https://tabby.tabbyml.com/docs/administration/model/)。
21 | 
22 | 
23 | 4. 启动 Tabby 服务：`./tabby serve`。
24 | <img src="run-tabby.png" alt="image-20240709112329577" style="zoom:50%;" />
25 | 
26 |    启动之后，期望会在 ktransformers 的命令行界面看到对 `/api/tags` 接口的访问(在 Tabby 新版本 v0.13.0 中变为对 `/api/show/` 接口的访问)。
27 | <img src="visit-api-tags.png" alt="image-20240709111648215" style="zoom:67%;" />
28 | 
29 | 5. 注册 Tabby 账户，获取 Token：在启动 Tabby 服务后，在浏览器中打开相应的链接(如上图的 0.0.0.0:8080)，并参照[教程](https://tabby.tabbyml.com/docs/quick-start/register-account/) 创建用户并获取 Token。
30 | 
31 | 6. 启动 VScode 安装 Tabby 拓展插件，并在相关提示下，使用上一步获得的 Token 连接 Tabby Server，参照[这里](https://tabby.tabbyml.com/docs/extensions/installation/vscode/)。
32 | 
33 | 7. 打开任意代码文件，体验 ktransformers 的快速异构推理。
34 | 
35 | 


--------------------------------------------------------------------------------
/doc/zh/api/server/visit-api-tags.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/zh/api/server/visit-api-tags.png


--------------------------------------------------------------------------------
/doc/zh/api/server/website.md:
--------------------------------------------------------------------------------
 1 | # Start with website
 2 | 
 3 | This document provides the necessary steps to set up and run the web service for this project.
 4 | 
 5 | ## 1. Starting the Web Service
 6 | 
 7 | ### 1.1. Compiling the Web Code
 8 | 
 9 | Before you can compile the web code, make sure you have installed [Node.js](https://nodejs.org) version 18.3 or higher
10 | 
11 | Once npm is installed, navigate to the `ktransformers/website` directory:
12 | 
13 | ```bash
14 | cd ktransformers/website
15 | ```
16 | 
17 | Next, install the Vue CLI with the following command:
18 | 
19 | ```bash
20 | npm install @vue/cli
21 | ```
22 | 
23 | Now you can build the project:
24 | 
25 | ```bash
26 | npm run build
27 | ```
28 | Finally you can build ktransformers with website:
29 | ```
30 | cd ../../
31 | pip install .
32 | ```
33 | 


--------------------------------------------------------------------------------
/install-with-cache.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -e  
 3 | 
 4 | # clear build dirs
 5 | # rm -rf build
 6 | # rm -rf *.egg-info
 7 | # rm -rf csrc/build
 8 | # rm -rf csrc/ktransformers_ext/build
 9 | # rm -rf csrc/ktransformers_ext/cuda/build
10 | # rm -rf csrc/ktransformers_ext/cuda/dist
11 | # rm -rf csrc/ktransformers_ext/cuda/*.egg-info
12 | rm -rf ~/.ktransformers
13 | echo "Installing python dependencies from requirements.txt"
14 | pip install -r requirements-local_chat.txt
15 | pip install -r ktransformers/server/requirements.txt
16 | echo "Installing ktransformers"
17 | KTRANSFORMERS_FORCE_BUILD=TRUE USE_BALANCE_SERVE=1 pip install -v . --no-build-isolation
18 | pip install third_party/custom_flashinfer/ -v
19 | 
20 | # SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])")
21 | # echo "Copying thirdparty libs to $SITE_PACKAGES"
22 | # cp -a csrc/balance_serve/build/third_party/prometheus-cpp/lib/libprometheus-cpp-*.so* $SITE_PACKAGES/
23 | # patchelf --set-rpath '$ORIGIN' $SITE_PACKAGES/sched_ext.cpython*
24 | 
25 | 
26 | echo "Installation completed successfully"
27 | 


--------------------------------------------------------------------------------
/install.bat:
--------------------------------------------------------------------------------
 1 | @echo off
 2 | 
 3 | REM clear build dirs
 4 | rmdir /S /Q ktransformers\ktransformers_ext\build
 5 | rmdir /S /Q ktransformers\ktransformers_ext\cuda\build
 6 | rmdir /S /Q ktransformers\ktransformers_ext\cuda\dist
 7 | rmdir /S /Q ktransformers\ktransformers_ext\out
 8 | del /F /Q ktransformers\ktransformers_ext\cuda\*.egg-info
 9 | 
10 | echo Installing python dependencies from requirements.txt
11 | pip install -r requirements-local_chat.txt
12 | 
13 | echo Installing ktransformers
14 | set KTRANSFORMERS_FORCE_BUILD=TRUE
15 | pip install . --no-build-isolation
16 | echo Installation completed successfully


--------------------------------------------------------------------------------
/install.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -e  
 3 | 
 4 | # default backend
 5 | DEV="cuda"
 6 | 
 7 | # parse --dev argument (a value is mandatory: a bare `--dev` used to make the trailing `shift` fail and `set -e` abort with no message)
 8 | while [[ "$#" -gt 0 ]]; do
 9 |     case $1 in
10 |         --dev) [[ "$#" -ge 2 ]] || { echo "Missing value for --dev" >&2; exit 1; }; DEV="$2"; shift ;;
11 |         *) echo "Unknown parameter passed: $1"; exit 1 ;;
12 |     esac
13 |     shift
14 | done
15 | export DEV_BACKEND="$DEV"
16 | echo "Selected backend: $DEV_BACKEND"
17 | 
18 | # clear build dirs
19 | rm -rf build
20 | rm -rf *.egg-info
21 | rm -rf csrc/build
22 | rm -rf csrc/ktransformers_ext/build
23 | rm -rf csrc/ktransformers_ext/cuda/build
24 | rm -rf csrc/ktransformers_ext/cuda/dist
25 | rm -rf csrc/ktransformers_ext/cuda/*.egg-info
26 | rm -rf ~/.ktransformers
27 | echo "Installing python dependencies from requirements.txt"
28 | pip install -r requirements-local_chat.txt
29 | pip install -r ktransformers/server/requirements.txt
30 | 
31 | echo "Installing ktransformers"
32 | KTRANSFORMERS_FORCE_BUILD=TRUE pip install -v . --no-build-isolation
33 | 
34 | if [[ "$DEV_BACKEND" == "cuda" ]]; then
35 |     echo "Installing custom_flashinfer for CUDA backend"
36 |     pip install third_party/custom_flashinfer/
37 | fi
38 | # SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])")
39 | # echo "Copying thirdparty libs to $SITE_PACKAGES"
40 | # cp -a csrc/balance_serve/build/third_party/prometheus-cpp/lib/libprometheus-cpp-*.so* $SITE_PACKAGES/
41 | # patchelf --set-rpath '$ORIGIN' $SITE_PACKAGES/sched_ext.cpython*
42 | 
43 | echo "Installation completed successfully"


--------------------------------------------------------------------------------
/ktransformers/__init__.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | '''
 4 | Description  : 
 5 | Author       : kkk1nak0
 6 | Date         : 2024-08-15 07:34:46
 7 | Version      : 1.0.0
 8 | LastEditors  : chenxl 
 9 | LastEditTime : 2025-02-15 03:53:02
10 | '''
11 | __version__ = "0.3.1"
12 | 


--------------------------------------------------------------------------------
/ktransformers/configs/config.yaml:
--------------------------------------------------------------------------------
 1 | log:
 2 |   dir: "logs"
 3 |   file: "lexllama.log"
 4 |   #log level: debug, info, warn, error, crit
 5 |   level: "debug"
 6 |   backup_count: -1
 7 | 
 8 | server:
 9 |   ip: 0.0.0.0
10 |   port: 10002
11 | 
12 | db:
13 |   type: "sqllite"
14 |   database: "server.db"
15 |   host: "./"
16 |   pool_size: 10
17 | 
18 | user:
19 |   secret_key: "981f1dd2a44e27d68759d0252a486568ed43480b4e616a26e3af3709c3a7ce73"
20 |   algorithm: "HS256"
21 | 
22 | model:
23 |   # type: transformers
24 |   # type: balance_serve
25 |   type: ktransformers
26 | 
27 |   name: DeepSeek-Coder-V2-Instruct
28 |   path: deepseek-ai/DeepSeek-V2-Lite-Chat
29 |   gguf_path: ./DeepSeek-V2-Lite-Chat-GGUF
30 | 
31 |   device: cuda:0
32 |   cache_lens: 16384
33 |   max_new_tokens: 500
34 | web:
35 |   mount: False
36 |   open_cross_domain: True
37 | 
38 | ext:
39 |   cpu_infer: 10
40 | 
41 | long_context:
42 |   max_seq_len: 32000
43 |   block_size: 128
44 |   local_windows_len: 4096
45 |   second_select_num: 32
46 |   anchor_type: DYNAMIC
47 |   kv_type: FP16
48 |   dense_layer_num: 2
49 |   anchor_num: 1
50 |   preselect_block: True
51 |   head_select_mode: SHARED
52 |   preselect_block_count: 32
53 |   layer_step: 1
54 |   token_step: 
55 | 
56 | local_chat:
57 |   prompt_file: ""
58 | 
59 | async_server:
60 |   sched_strategy: "FCFS"
61 |   sched_port: 56441
62 |   sched_metrics_port: 54321
63 |   kvc2_metrics_port: 54391
64 |   max_batch_size: 4  # decode count + prefill count, in one mini batch
65 | 
66 | attn:
67 |   page_size: 256
68 |   chunk_size: 256
69 | kvc2:
70 |   gpu_only: true 
71 |   utilization_percentage: 1.0
72 |   cpu_memory_size_GB: 500
73 | 


--------------------------------------------------------------------------------
/ktransformers/configs/log_config.ini:
--------------------------------------------------------------------------------
 1 | [loggers]
 2 | keys=root,uvicorn,uvicornError,uvicornAccess
 3 | 
 4 | [handlers]
 5 | keys=consoleHandler,fileHandler
 6 | 
 7 | [formatters]
 8 | keys=detailedFormatter
 9 | 
10 | [logger_root]
11 | level=INFO
12 | handlers=consoleHandler
13 | 
14 | [logger_uvicorn]
15 | level=INFO
16 | handlers=consoleHandler,fileHandler
17 | qualname=uvicorn
18 | propagate=0
19 | 
20 | [logger_uvicornError]
21 | level=ERROR
22 | handlers=consoleHandler,fileHandler
23 | qualname=uvicorn.error
24 | propagate=0
25 | 
26 | [logger_uvicornAccess]
27 | level=INFO
28 | handlers=consoleHandler,fileHandler
29 | qualname=uvicorn.access
30 | propagate=0
31 | 
32 | [handler_consoleHandler]
33 | class=StreamHandler
34 | level=INFO
35 | formatter=detailedFormatter
36 | args=(sys.stdout,)
37 | 
38 | [handler_fileHandler]
39 | class=logging.FileHandler
40 | level=INFO
41 | formatter=detailedFormatter
42 | args=('uvicorn_logs.log', 'a')
43 | 
44 | [formatter_detailedFormatter]
45 | format=%(asctime)s - %(name)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s
46 | datefmt=%Y-%m-%d %H:%M:%S
47 | 


--------------------------------------------------------------------------------
/ktransformers/ktransformers_ext/operators/custom_marlin/quantize/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/ktransformers_ext/operators/custom_marlin/quantize/utils/__init__.py


--------------------------------------------------------------------------------
/ktransformers/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/models/__init__.py


--------------------------------------------------------------------------------
/ktransformers/operators/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/ktransformers/operators/mlp.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from ktransformers.operators.base_operator import BaseInjectedModule
 3 | from ktransformers.util.custom_loader import GGUFLoader
 4 | from transformers import PretrainedConfig
 5 | import torch.nn as nn
 6 | from ktransformers.models.modeling_deepseek_v3 import DeepseekV3MLP
 7 | from ktransformers.models.modeling_qwen2_moe import Qwen2MoeMLP
 8 | class kDeepseekV3MLP(DeepseekV3MLP, BaseInjectedModule):
 9 |     def __init__(self,
10 |                  key: str,
11 |                  gguf_loader : GGUFLoader,
12 |                  config: PretrainedConfig,
13 |                  orig_module: nn.Module,
14 |                  prefill_device: str = "cuda",
15 |                  generate_device: str = "cuda",
16 |                  **kwargs):
17 |         BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
18 |         self.orig_module.__init__(orig_module.config,
19 |             orig_module.hidden_size, orig_module.intermediate_size)
20 |     def forward(self, x, bsz_tensor):
21 |         down_proj = self.down_proj(self.act_fn(self.gate_proj(x, bsz_tensor)) * self.up_proj(x, bsz_tensor), bsz_tensor)
22 |         return down_proj
23 | class KQwen2MoeMLP(Qwen2MoeMLP, BaseInjectedModule):
24 |     def __init__(self,
25 |                  key: str,
26 |                  gguf_loader : GGUFLoader,
27 |                  config: PretrainedConfig,
28 |                  orig_module: nn.Module,
29 |                  prefill_device: str = "cuda",
30 |                  generate_device: str = "cuda",
31 |                  **kwargs):
32 |         BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
33 |         self.orig_module.__init__(orig_module.config,
34 |             orig_module.intermediate_size)
35 |     def forward(self, x, bsz_tensor):
36 |         down_proj = self.down_proj(self.act_fn(self.gate_proj(x, bsz_tensor)) * self.up_proj(x, bsz_tensor), bsz_tensor)
37 |         return down_proj


--------------------------------------------------------------------------------
/ktransformers/optimize/optimize_rules/Internlm2_5-7b-Chat-1m.yaml:
--------------------------------------------------------------------------------
 1 | - match:
 2 |     class: ktransformers.models.modeling_llama.LlamaRotaryEmbedding
 3 |   replace:
 4 |     class: ktransformers.operators.RoPE.RotaryEmbeddingV2
 5 | - match:
 6 |     name: "^model.embed_tokens"
 7 |   replace:
 8 |     class: "default"
 9 |     kwargs:
10 |         generate_device: "cpu"
11 |         prefill_device: "cpu"
12 | - match:
13 |     class: ktransformers.models.modeling_llama.LlamaModel
14 |   replace:
15 |     class: ktransformers.operators.models.KLlamaModel
16 |     kwargs:
17 |       generate_device: "cuda"
18 |       prefill_device: "cuda"
19 |       per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
20 | 
21 | - match:
22 |     name: "^model\\.layers\\..*\\.self_attn$"
23 |   replace:
24 |     class: ktransformers.operators.attention.KLlamaAttention
25 |     kwargs:
26 |       generate_device: "cuda"
27 |       prefill_device: "cuda"
28 | 
29 | 


--------------------------------------------------------------------------------
/ktransformers/optimize/optimize_rules/Mixtral.yaml:
--------------------------------------------------------------------------------
 1 | - match:
 2 |     class: ktransformers.models.modeling_mixtral.MixtralRotaryEmbedding
 3 |   replace:
 4 |     class: ktransformers.operators.RoPE.RotaryEmbedding
 5 |     kwargs:
 6 |       generate_device: "cuda"
 7 |       prefill_device: "cuda"
 8 | - match:
 9 |     name: "^model\\.layers\\..*$"
10 |     class: torch.nn.Linear  # only match modules matching name and class simultaneously
11 |   replace:
12 |     class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
13 |     kwargs:
14 |       generate_device: "cuda"
15 |       prefill_device: "cuda"
16 |       generate_op: "KLinearMarlin"
17 |       prefill_op: "KLinearTorch"
18 | - match:
19 |     name: "^lm_head"
20 |     class: torch.nn.Linear
21 |   replace:
22 |     class: ktransformers.operators.linear.KTransformersLinear
23 |     kwargs:
24 |       generate_device: "cuda"
25 |       prefill_device: "cuda"
26 |       generate_op: "KLinearMarlin"
27 |       prefill_op: "KLinearTorch"
28 | - match:
29 |     name: "^model\\.layers\\..*\\.block_sparse_moe$"
30 |     class: ktransformers.models.modeling_mixtral.MixtralSparseMoeBlock
31 |   replace: 
32 |     class: ktransformers.operators.experts.KMistralSparseMoEBlock
33 | - match:
34 |     name: "^model\\.layers\\..*\\.block_sparse_moe\\.experts$"
35 |   replace: 
36 |     class: ktransformers.operators.experts.KTransformersExperts
37 |     kwargs:
38 |       prefill_device: "cuda"
39 |       prefill_op: "KExpertsTorch"
40 |       generate_device: "cpu"
41 |       generate_op:  "KExpertsCPU"
42 |       out_device: "cuda"
43 |   recursive: False # don't recursively inject submodules of this module
44 | 
45 | - match:
46 |     name: "^model.embed_tokens"
47 |   replace:
48 |     class: "default"
49 |     kwargs:
50 |         generate_device: "cpu"
51 |         prefill_device: "cpu"
52 | 
53 | - match:
54 |     name: "^model\\.layers\\..*\\."
55 |   replace:
56 |     class: "default"
57 |     kwargs:
58 |       generate_device: "cuda"
59 |       prefill_device: "cuda"


--------------------------------------------------------------------------------
/ktransformers/server/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/server/__init__.py


--------------------------------------------------------------------------------
/ktransformers/server/api/__init__.py:
--------------------------------------------------------------------------------
 1 | from fastapi import APIRouter
 2 | 
 3 | from .ollama import router as ollama_router
 4 | from .openai import router as openai_router,post_db_creation_operations
 5 | from .web import router as web_router
 6 | 
 7 | router = APIRouter()  # top-level router; the three sub-routers below are mounted on it
 8 | router.include_router(ollama_router)
 9 | router.include_router(openai_router)
10 | router.include_router(web_router)
11 | 


--------------------------------------------------------------------------------
/ktransformers/server/api/ollama/__init__.py:
--------------------------------------------------------------------------------
1 | from fastapi import APIRouter
2 | 
3 | from .completions import router as completions_router
4 | 
5 | router = APIRouter()  # package-level router; only the completions router is mounted on it
6 | router.include_router(completions_router)
7 | 


--------------------------------------------------------------------------------
/ktransformers/server/api/openai/__init__.py:
--------------------------------------------------------------------------------
 1 | from fastapi import APIRouter
 2 | 
 3 | from .assistants import router as assistants_router,create_default_assistant
 4 | from .endpoints.chat import router as chat_router
 5 | from .legacy import router as legacy_router
 6 | 
 7 | router = APIRouter(prefix='/v1')
 8 | 
 9 | # Every router included below is served under the '/v1' prefix set above.
10 | router.include_router(assistants_router)
11 | router.include_router(chat_router)
12 | router.include_router(legacy_router)
13 | 
14 | def post_db_creation_operations():
15 |     """Call create_default_assistant(); presumably run once the database exists (per the name) -- confirm with the caller."""
16 |     create_default_assistant()


--------------------------------------------------------------------------------
/ktransformers/server/api/openai/assistants/__init__.py:
--------------------------------------------------------------------------------
 1 | from fastapi import APIRouter
 2 | 
 3 | from .assistants import router as assistants_router, create_default_assistant
 4 | from .messages import router as messages_router
 5 | from .runs import router as runs_router
 6 | from .threads import router as threads_router
 7 | 
 8 | router = APIRouter()
 9 | # Attach the runs and messages routers to the threads router ...
10 | threads_router.include_router(runs_router)
11 | threads_router.include_router(messages_router)
12 | # ... then mount the assistants router and the extended threads router here.
13 | router.include_router(assistants_router)
14 | router.include_router(threads_router)
15 | 


--------------------------------------------------------------------------------
/ktransformers/server/api/openai/assistants/threads.py:
--------------------------------------------------------------------------------
 1 | from typing import List,Optional
 2 | from fastapi import APIRouter
 3 | 
 4 | from ktransformers.server.crud.assistants.threads import ThreadsDatabaseManager,Order,ObjectID
 5 | from ktransformers.server.schemas.assistants.threads import ThreadObject,ThreadCreate,ThreadModify
 6 | from ktransformers.server.schemas.base import DeleteResponse
 7 | from ktransformers.server.schemas.conversation import ThreadPreview
 8 | 
 9 | router = APIRouter(prefix='/threads')
10 | threads_manager = ThreadsDatabaseManager()
11 | 
12 | 
13 | @router.post("/",tags=['openai'], response_model=ThreadObject)
14 | async def create_thread(thread: ThreadCreate):
15 |     """Create a thread by delegating to threads_manager.db_create_thread and return its result."""
16 |     return threads_manager.db_create_thread(thread)
17 | 
18 | @router.get("/", tags=['openai-ext'],response_model=List[ThreadPreview])
19 | async def list_threads(limit: Optional[int] = 20, order: Order = Order.DESC):
20 |     """Return thread previews; `limit` (default 20) and `order` (default Order.DESC) go to db_list_threads_preview."""
21 |     return threads_manager.db_list_threads_preview(limit, order)
22 | 
23 | @router.get("/{thread_id}",tags=['openai'], response_model=ThreadObject)
24 | async def retrieve_thread(thread_id: ObjectID):
25 |     """Return whatever threads_manager.db_get_thread_by_id yields for `thread_id`."""
26 |     return threads_manager.db_get_thread_by_id(thread_id)
27 | 
28 | @router.post("/{thread_id}",tags=['openai'], response_model=ThreadObject)
29 | async def modify_thread(thread_id: ObjectID, thread: ThreadModify):
30 |     """Placeholder: the route is registered but always raises NotImplementedError."""
31 |     raise NotImplementedError
32 | 
33 | @router.delete("/{thread_id}",tags=['openai'], response_model=DeleteResponse)
34 | async def delete_thread(thread_id: ObjectID):
35 |     """Delete the thread via threads_manager, then return a DeleteResponse with object='thread.deleted'."""
36 |     threads_manager.db_delete_thread_by_id(thread_id=thread_id)
37 |     return DeleteResponse(id=thread_id, object='thread.deleted')


--------------------------------------------------------------------------------
/ktransformers/server/api/openai/endpoints/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/server/api/openai/endpoints/__init__.py


--------------------------------------------------------------------------------
/ktransformers/server/api/openai/legacy/__init__.py:
--------------------------------------------------------------------------------
1 | from fastapi import APIRouter
2 | 
3 | from . import completions
4 | 
5 | router = APIRouter()  # package-level router; only completions.router is mounted on it
6 | router.include_router(completions.router)


--------------------------------------------------------------------------------
/ktransformers/server/api/web/__init__.py:
--------------------------------------------------------------------------------
1 | from fastapi import APIRouter
2 | from .system import router as system_router
3 | 
4 | 
5 | router = APIRouter()  # package-level router; only the system router is mounted on it
6 | router.include_router(system_router)
7 | 


--------------------------------------------------------------------------------
/ktransformers/server/api/web/system.py:
--------------------------------------------------------------------------------
 1 | from fastapi import APIRouter
 2 | 
 3 | 
 4 | router = APIRouter()
 5 | 
 6 | 
 7 | @router.get('/system-info',tags=['web'])
 8 | def system_info():
 9 |     """Placeholder for GET /system-info: always raises NotImplementedError."""
10 |     raise NotImplementedError


--------------------------------------------------------------------------------
/ktransformers/server/backend/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/server/backend/__init__.py


--------------------------------------------------------------------------------
/ktransformers/server/backend/interfaces/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/server/backend/interfaces/__init__.py


--------------------------------------------------------------------------------
/ktransformers/server/backend/interfaces/exllamav2.py:
--------------------------------------------------------------------------------
 1 | import sys, os
 2 | from typing import AsyncIterator, Dict, Tuple
 3 | 
 4 | import torch
 5 | 
 6 | from ..args import ConfigArgs, default_args
 7 | 
 8 | from ..base import BackendInterfaceBase, ThreadContext
 9 | from ktransformers.server.schemas.assistants.runs import RunObject
10 | 
11 | 
12 | from ..args import *
13 | 
14 | class ExllamaThreadContext(ThreadContext):
15 |     """ThreadContext stub for ExLlama: get_interface returns None and get_local_messages is unimplemented."""
16 |     def __init__(self, run: RunObject, args: ConfigArgs = default_args) -> None:
17 |         super().__init__(run,args)
18 |         
19 |     def get_interface(self):
20 |         return  # NOTE(review): bare return, so callers receive None
21 | 
22 |     def get_local_messages(self):
23 |         raise NotImplementedError
24 | 
25 | 
26 | 
27 | class ExllamaInterface(BackendInterfaceBase):
28 |     """BackendInterfaceBase stub for ExLlama; every method here raises NotImplementedError."""
29 |     def __init__(self, args: ConfigArgs = ...):
30 |         raise NotImplementedError
31 |     
32 |     def tokenize_prompt(self, prompt: str) -> torch.Tensor:
33 |         raise NotImplementedError
34 |     # NOTE(review): `Optional` has no explicit import in this file; presumably supplied by `from ..args import *` -- confirm.
35 |     async def inference(self,local_messages,request_unique_id:Optional[str])->AsyncIterator:
36 |         raise NotImplementedError
37 |     
38 | 
39 | 
40 | 
41 | 


--------------------------------------------------------------------------------
/ktransformers/server/balance_serve/inference/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/server/balance_serve/inference/__init__.py


--------------------------------------------------------------------------------
/ktransformers/server/balance_serve/inference/distributed/__init__.py:
--------------------------------------------------------------------------------
1 | from .communication_op import *
2 | from .parallel_state import *
3 | from .utils import *
4 | 


--------------------------------------------------------------------------------
/ktransformers/server/balance_serve/inference/distributed/communication_op.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Date: 2024-12-11 06:02:42
 3 | LastEditors: djw
 4 | LastEditTime: 2024-12-12 09:52:06
 5 | """
 6 | 
 7 | from typing import Any, Dict, Optional, Union
 8 | 
 9 | import torch
10 | import torch.distributed
11 | 
12 | from .parallel_state import get_tp_group
13 | 
14 | 
15 | def tensor_model_parallel_all_reduce(input_: torch.Tensor, bsz_tensor: torch.Tensor, is_compute_bound=False, overlap=False) -> torch.Tensor:
16 |     """All-reduce the input tensor across model parallel group."""
17 |     return get_tp_group().all_reduce(input_, bsz_tensor, is_compute_bound=is_compute_bound, overlap=overlap)
18 | 
19 | 
20 | def tensor_model_parallel_all_gather(
21 |     input_: torch.Tensor, dim: int = -1
22 | ) -> torch.Tensor:
23 |     """All-gather the input tensor across model parallel group."""
24 |     return get_tp_group().all_gather(input_, dim)
25 | 
26 | 
27 | def tensor_model_parallel_gather(
28 |     input_: torch.Tensor, dst: int = 0, dim: int = -1
29 | ) -> Optional[torch.Tensor]:
30 |     """Gather the input tensor across model parallel group."""
31 |     return get_tp_group().gather(input_, dst, dim)
32 | 
33 | 
34 | def broadcast_tensor_dict(
35 |     tensor_dict: Optional[Dict[Any, Union[torch.Tensor, Any]]] = None, src: int = 0
36 | ):
37 |     if not torch.distributed.is_initialized():
38 |         return tensor_dict
39 |     return get_tp_group().broadcast_tensor_dict(tensor_dict, src)
40 | 


--------------------------------------------------------------------------------
/ktransformers/server/balance_serve/inference/sampling/penaltylib/__init__.py:
--------------------------------------------------------------------------------
 1 | from .orchestrator import BatchedPenalizerOrchestrator
 2 | from .penalizers.frequency_penalty import BatchedFrequencyPenalizer
 3 | from .penalizers.min_new_tokens import BatchedMinNewTokensPenalizer
 4 | from .penalizers.presence_penalty import BatchedPresencePenalizer
 5 | from .penalizers.repetition_penalty import BatchedRepetitionPenalizer
 6 | 
 7 | __all__ = [  # the five names imported above, re-exported from this package
 8 |     "BatchedFrequencyPenalizer",
 9 |     "BatchedMinNewTokensPenalizer",
10 |     "BatchedPresencePenalizer",
11 |     "BatchedRepetitionPenalizer",
12 |     "BatchedPenalizerOrchestrator",
13 | ]
14 | 


--------------------------------------------------------------------------------
/ktransformers/server/config/singleton.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | '''
 4 | Description  : Implement singleton
 5 | Author       : unicornchan
 6 | Date         : 2024-06-11 17:08:36
 7 | Version      : 1.0.0
 8 | LastEditors  : chenxl 
 9 | LastEditTime : 2024-07-27 01:55:56
10 | '''
11 | import abc
12 | 
13 | class Singleton(abc.ABCMeta, type):
14 |     """_summary_
15 | 
16 |     Args:
17 |         abc.ABCMeta: Provide a mechanism for defining abstract methods and properties,
18 |             enforcing subclasses to implement these methods and properties.
19 |         type: Inherit from 'type' to make 'Singleton' a metaclass,
20 |             enabling the implementation of the Singleton
21 |     """
22 |     _instances = {}
23 | 
24 |     def __call__(cls, *args, **kwds):
25 |         if cls not in cls._instances:
26 |             cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwds)
27 |         return cls._instances[cls]
28 | 
29 | class AbstractSingleton(abc.ABC, metaclass=Singleton):
30 |     """Base class that combines abc.ABC with the Singleton metaclass.
31 | 
32 |     Subclasses get the Singleton behavior (one stored instance per class)
33 |     and may declare abstract methods; a subclass that leaves abstract
34 |     methods unimplemented cannot be instantiated.
35 |     """
36 | 


--------------------------------------------------------------------------------
/ktransformers/server/crud/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/server/crud/__init__.py


--------------------------------------------------------------------------------
/ktransformers/server/crud/assistants/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/server/crud/assistants/__init__.py


--------------------------------------------------------------------------------
/ktransformers/server/crud/assistants/runs.py:
--------------------------------------------------------------------------------
 1 | from time import time
 2 | from uuid import uuid4
 3 | 
 4 | from ktransformers.server.models.assistants.runs import Run
 5 | from ktransformers.server.schemas.assistants.runs import RunCreate,RunObject
 6 | from ktransformers.server.schemas.base import ObjectID
 7 | from ktransformers.server.utils.sql_utils import SQLUtil
 8 | 
 9 | 
10 | class RunsDatabaseManager:
11 |     def __init__(self) -> None:
12 |         self.sql_util = SQLUtil()
13 | 
14 |     def create_run_object(self, thread_id: ObjectID, run: RunCreate) -> RunObject:
15 |         run_obj = RunObject(
16 |             **run.model_dump(mode='json', exclude={"stream"}),
17 |             id=str(uuid4()),
18 |             object='run',
19 |             created_at=int(time()),
20 |             thread_id=thread_id,
21 |             status=RunObject.Status.queued,
22 |         )
23 |         run_obj.set_compute_save(0)
24 |         return run_obj
25 | 
26 |     def db_create_run(self, thread_id: str, run: RunCreate):
27 |         db_run = Run(
28 |             **run.model_dump(mode="json", exclude={"stream"}),
29 |             id=str(uuid4()),
30 |             created_at=int(time()),
31 |             status="queued",
32 |             thread_id=thread_id,
33 |         )
34 |         with self.sql_util.get_db() as db:
35 |             self.sql_util.db_add_commit_refresh(db, db_run)
36 |             run_obj = RunObject.model_validate(db_run.__dict__)
37 |             run_obj.set_compute_save(0)
38 |         return run_obj
39 | 
40 |     def db_sync_run(self, run: RunObject) -> None:
41 |         db_run = Run(
42 |             **run.model_dump(mode='json'),
43 |         )
44 |         with self.sql_util.get_db() as db:
45 |             self.sql_util.db_merge_commit(db, db_run)
46 | 
47 |     def db_get_run(self, run_id: ObjectID) -> RunObject:
48 |         with self.sql_util.get_db() as db:
49 |             db_run = db.query(Run).filter(Run.id == run_id).first()
50 |             return RunObject.model_validate(db_run.__dict__)
51 | 


--------------------------------------------------------------------------------
/ktransformers/server/exceptions.py:
--------------------------------------------------------------------------------
 1 | from fastapi import HTTPException, status
 2 | 
 3 | 
 4 | def db_exception():
 5 |     return HTTPException(
 6 |         status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
 7 |         detail="DB Error",
 8 |     )
 9 | 
10 | 
11 | def not_implemented(what):
12 |     return HTTPException(
13 |         status_code=status.HTTP_501_NOT_IMPLEMENTED,
14 |         detail=f"{what} not implemented",
15 |     )
16 | 
17 | 
18 | def internal_server_error(what):
19 |     return HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"{what}")
20 | 
21 | 
22 | def request_error(what):
23 |     return HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=f"{what}")
24 | 


--------------------------------------------------------------------------------
/ktransformers/server/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/server/models/__init__.py


--------------------------------------------------------------------------------
/ktransformers/server/models/assistants/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/server/models/assistants/__init__.py


--------------------------------------------------------------------------------
/ktransformers/server/models/assistants/assistants.py:
--------------------------------------------------------------------------------
 1 | from sqlalchemy import JSON, Column, Float, Integer, String, Text
 2 | from sqlalchemy.orm import relationship
 3 | 
 4 | from ktransformers.server.utils.sql_utils import Base
 5 | 
 6 | 
 7 | class Assistant(Base):
 8 |     __tablename__ = "assistants"
 9 |     # SQLAlchemy model mapped to the "assistants" table.
10 |     id = Column(String, primary_key=True, index=True)
11 |     object = Column(String, default="assistant")
12 |     created_at = Column(Integer)
13 | 
14 |     name = Column(String, nullable=True)
15 |     description = Column(String, nullable=True)
16 |     model = Column(String)
17 |     instructions = Column(Text, nullable=True)
18 |     tools = Column(JSON)
19 |     tool_resources = Column(JSON)
20 |     temperature = Column(Float, nullable=True)
21 |     meta_data = Column(JSON, nullable=True)
22 |     top_p = Column(Float, nullable=True)
23 |     response_format = Column(JSON, default="auto")
24 | 
25 |     build_status = Column(JSON, nullable=True)
26 |     # The relationships below pair with the `assistant` attribute declared on Run and Message.
27 |     runs = relationship("Run", back_populates="assistant")
28 | 
29 |     messages = relationship("Message", back_populates="assistant")
30 | 


--------------------------------------------------------------------------------
/ktransformers/server/models/assistants/messages.py:
--------------------------------------------------------------------------------
 1 | from sqlalchemy import JSON, Column, ForeignKey, Integer, String
 2 | from sqlalchemy.orm import relationship
 3 | 
 4 | from ktransformers.server.utils.sql_utils import Base
 5 | 
 6 | 
 7 | class Message(Base):
 8 |     __tablename__ = "messages"
 9 |     # SQLAlchemy model mapped to the "messages" table.
10 |     id = Column(String, primary_key=True, index=True)
11 |     object = Column(String, default="thread.message")
12 |     created_at = Column(Integer)
13 | 
14 |     thread_id = Column(String, ForeignKey("threads.id"))
15 |     status = Column(String, default="in_progress")
16 |     incomplete_details = Column(JSON, nullable=True)
17 |     completed_at = Column(Integer, nullable=True)
18 |     incomplete_at = Column(Integer, nullable=True)
19 |     role = Column(JSON)
20 |     content = Column(JSON)
21 |     assistant_id = Column(String, ForeignKey("assistants.id"), nullable=True)
22 |     run_id = Column(String, ForeignKey("runs.id"), nullable=True)
23 |     attachments = Column(JSON, nullable=True)
24 |     meta_data = Column(JSON, nullable=True)
25 |     # Each relationship names its counterpart attribute on the other model via back_populates.
26 |     thread = relationship("Thread", back_populates="messages")
27 |     assistant = relationship("Assistant", back_populates="messages")
28 |     run = relationship("Run", back_populates="message")
29 | 


--------------------------------------------------------------------------------
/ktransformers/server/models/assistants/run_steps.py:
--------------------------------------------------------------------------------
 1 | from sqlalchemy import JSON, Column, ForeignKey, Integer, String
 2 | from sqlalchemy.orm import relationship
 3 | 
 4 | from ktransformers.server.utils.sql_utils import Base
 5 | 
 6 | 
 7 | class RunStep(Base):
 8 |     __tablename__ = "run_steps"
 9 |     # todo
10 |     id = Column(String, primary_key=True, index=True)
11 |     object = Column(String, default="thread.run.step")
12 |     created_at = Column(Integer)
13 | 
14 |     assistant_id = Column(String, ForeignKey("assistants.id"))
15 |     thread_id = Column(String, ForeignKey("threads.id"))
16 |     run_id = Column(String, ForeignKey("runs.id"))
17 |     type = Column(String)
18 |     status = Column(String)
19 |     step_details = Column(JSON)
20 |     last_error = Column(JSON, nullable=True)
21 |     expires_at = Column(Integer, nullable=True)
22 |     cancelled_at = Column(Integer, nullable=True)
23 |     failed_at = Column(Integer, nullable=True)
24 |     completed_at = Column(Integer, nullable=True)
25 | 
26 |     meta_data = Column(JSON, nullable=True)
27 |     usage = Column(JSON, nullable=True)
28 | 
29 |     assistant = relationship("Assistant", back_populates="run_steps")
30 |     thread = relationship("Thread", back_populates="run_steps")
31 |     run = relationship("Run", back_populates="run_steps")
32 | 


--------------------------------------------------------------------------------
/ktransformers/server/models/assistants/runs.py:
--------------------------------------------------------------------------------
 1 | from sqlalchemy import JSON, Column, Float, ForeignKey, Integer, String, Text
 2 | from sqlalchemy.orm import relationship
 3 | 
 4 | from ktransformers.server.utils.sql_utils import Base
 5 | 
 6 | 
 7 | class Run(Base):
 8 |     __tablename__ = "runs"
 9 |     # SQLAlchemy model mapped to the "runs" table.
10 |     id = Column(String, primary_key=True, index=True)
11 |     object = Column(String, default="thread.run")
12 |     created_at = Column(Integer)
13 |     thread_id = Column(String, ForeignKey("threads.id"))
14 |     assistant_id = Column(String, ForeignKey("assistants.id"))
15 |     status = Column(String)
16 |     required_action = Column(JSON, nullable=True)
17 |     last_error = Column(JSON, nullable=True)
18 |     expires_at = Column(Integer, nullable=True)
19 |     started_at = Column(Integer, nullable=True)
20 |     cancelled_at = Column(Integer, nullable=True)
21 |     failed_at = Column(Integer, nullable=True)
22 |     completed_at = Column(Integer, nullable=True)
23 |     incomplete_details = Column(JSON, nullable=True)
24 |     # get from assistant
25 |     model = Column(String)
26 |     instructions = Column(Text, nullable=True)
27 |     tools = Column(JSON)
28 |     meta_data = Column(JSON, nullable=True)
29 |     usage = Column(JSON, nullable=True)
30 |     temperature = Column(Float, nullable=True)
31 |     top_p = Column(Float, nullable=True)
32 |     max_propmp_tokens = Column(Integer, nullable=True)  # NOTE(review): "propmp" looks like a typo for "prompt"; renaming would change the column name
33 |     truncation_strategy = Column(JSON)
34 |     tool_choice = Column(JSON)
35 |     response_format = Column(JSON, default="auto")
36 |     # Each relationship names its counterpart attribute on the other model via back_populates.
37 |     thread = relationship("Thread", back_populates="runs")
38 |     assistant = relationship("Assistant", back_populates="runs")
39 |     message = relationship("Message", back_populates="run")
40 | 


--------------------------------------------------------------------------------
/ktransformers/server/models/assistants/threads.py:
--------------------------------------------------------------------------------
 1 | from sqlalchemy import JSON, Column, Integer, String
 2 | from sqlalchemy.orm import relationship
 3 | 
 4 | from ktransformers.server.utils.sql_utils import Base
 5 | 
 6 | 
 7 | class Thread(Base):
 8 |     __tablename__ = "threads"
 9 |     # SQLAlchemy model mapped to the "threads" table.
10 |     id = Column(String, primary_key=True, index=True)
11 |     object = Column(String, default="thread")
12 |     created_at = Column(Integer)
13 | 
14 |     tool_resources = Column(JSON, nullable=True)
15 |     meta_data = Column(JSON, nullable=True)
16 |     # The relationships below pair with the `thread` attribute declared on Run and Message.
17 |     runs = relationship("Run", back_populates="thread")
18 |     messages = relationship("Message", back_populates="thread")
19 | 


--------------------------------------------------------------------------------
/ktransformers/server/requirements.txt:
--------------------------------------------------------------------------------
 1 | torch >= 2.3.0
 2 | transformers == 4.51.3
 3 | fastapi >= 0.111.0
 4 | langchain >= 0.2.0
 5 | blessed >= 1.20.0
 6 | accelerate >= 0.31.0
 7 | sentencepiece >= 0.1.97
 8 | openai
 9 | setuptools
10 | build
11 | ninja
12 | wheel
13 | colorlog
14 | fire
15 | zmq
16 | psutil


--------------------------------------------------------------------------------
/ktransformers/server/schemas/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/server/schemas/__init__.py


--------------------------------------------------------------------------------
/ktransformers/server/schemas/assistants/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/server/schemas/assistants/__init__.py


--------------------------------------------------------------------------------
/ktransformers/server/schemas/assistants/threads.py:
--------------------------------------------------------------------------------
 1 | from enum import Enum
 2 | from typing import List
 3 | from typing_extensions import Self 
 4 | 
 5 | from pydantic import BaseModel, Field, model_validator
 6 | 
 7 | from ktransformers.server.schemas.base import Metadata, MetadataField, ObjectWithCreatedTime
 8 | from ktransformers.server.schemas.assistants.tool import ToolResource
 9 | from ktransformers.server.schemas.assistants.messages import MessageCore
10 | 
11 | 
class ThreadBase(BaseModel):
    """Fields shared by the thread schemas defined in this module.

    ``meta_data`` is declared with the alias ``metadata`` (see ``MetadataField``),
    so incoming payloads are expected to use the ``metadata`` key.
    """

    meta_data: Metadata = MetadataField
    tool_resources: List[ToolResource] = Field([], max_length=128)

    @model_validator(mode='before')
    @classmethod
    def convert_meta_data(cls, values):
        """Accept ``meta_data`` given by field name in addition to the alias.

        When the raw input is a dict containing ``meta_data``, that value is
        mirrored onto the ``metadata`` key (overriding an existing
        ``metadata`` entry, as before).

        Any non-dict input -- for example an object validated through
        ``from_attributes`` -- is returned untouched; the membership test
        would raise ``TypeError`` on objects that do not support ``in``.
        """
        if isinstance(values, dict) and 'meta_data' in values:
            # Shallow copy: do not mutate the dict owned by the caller.
            values = {**values, 'metadata': values['meta_data']}
        return values
22 | 
23 | 
class ThreadObject(ThreadBase, ObjectWithCreatedTime):
    """Thread schema with id / object / created_at plus an internal flag.

    ``is_related_threads`` is excluded from serialisation (``exclude=True``).
    """

    is_related_threads: bool = Field(False, exclude=True)

    @model_validator(mode='after')
    def check_is_related_threads(self) -> Self:
        """Raise the flag when the metadata contains an ``assistant_id`` key.

        The flag is only ever switched on here; a value of ``True`` supplied
        by the caller is left as it is.
        """
        has_assistant = 'assistant_id' in self.meta_data
        if has_assistant:
            self.is_related_threads = True
        return self

    class StreamEvent(Enum):
        """Event names that can be emitted for a thread in a stream."""
        created = 'created'

    def to_stream_reply(self, event: StreamEvent):
        """Format this thread as one ``event:`` / ``data:`` block of a stream."""
        event_name = event.value
        payload = self.model_dump_json()
        return f"event: thread.{event_name}\ndata: {payload}\n\n"
39 |     
40 | 
class ThreadCreate(ThreadBase):
    """Payload for creating a thread: the shared thread fields plus initial messages."""
    # Messages to seed the thread with; defaults to an empty list.
    messages: List[MessageCore] = Field(default=[])


class ThreadModify(ThreadBase):
    """Payload for modifying a thread; adds nothing beyond the shared thread fields."""
    pass
47 | 
48 | 
49 | # other than OpenAI API
50 | 


--------------------------------------------------------------------------------
/ktransformers/server/schemas/assistants/tool.py:
--------------------------------------------------------------------------------
 1 | from enum import Enum
 2 | from typing import List, Optional, Union
 3 | 
 4 | from pydantic import BaseModel, Field
 5 | 
 6 | from ktransformers.server.schemas.base import ObjectID
 7 | 
 8 | 
class ToolType(str, Enum):
    """Tool kinds recognised by the schemas in this module.

    Subclasses ``str``, so members compare equal to their string values.
    """
    CODE_INTERPRETER = "code_interpreter"
    FILE_SEARCH = "file_search"
    RELATED_THREADS = "related_threads"
    FUNCTION = "function"
14 | 
15 | 
class ToolBase(BaseModel):
    """Common base for tool descriptions: every tool carries a ``type``."""
    type: ToolType


class CodeInterpreter(ToolBase):
    """Tool description with no fields beyond ``type``."""
    pass


class FileSearch(ToolBase):
    """Tool description with no fields beyond ``type``."""
    pass


class RelatedThreads(ToolBase):
    """Tool description with no fields beyond ``type``."""
    pass


# NOTE(review): the class name is misspelled ("Funtion"); it is left as is
# because other modules may import it under this name.
class FuntionTool(ToolBase):
    """Function tool: a name, a description and a list of parameter strings."""
    description: str
    name: str
    parameters: List[str]


# Any of the tool descriptions above.
# NOTE(review): the first three members have identical fields and `type` is not
# narrowed per class, so validation alone cannot tell them apart -- confirm
# whether callers rely on which member of the union is chosen.
Tool = Union[CodeInterpreter, FileSearch, RelatedThreads, FuntionTool]
39 | 
40 | 
class CodeInterpreterResource(BaseModel):
    """Resources for the code-interpreter tool: up to 20 file ids."""
    # Defaults to a fresh empty list; None is also accepted (Optional).
    file_ids: Optional[List[str]] = Field(default_factory=list, max_length=20)


class FileSearchResource(BaseModel):
    """Resources for the file-search tool; each list holds at most one entry."""
    vector_store_ids: Optional[List[str]] = Field(default_factory=list, max_length=1)
    vector_stores: Optional[List[str]] = Field(default_factory=list, max_length=1)


class RelatedThreadsResource(BaseModel):
    """Resources for the related-threads tool: a list of thread ids."""
    thread_ids: List[ObjectID] = Field(default=[])


# Any of the resource models above.
# NOTE(review): every field in these models has a default, so an arbitrary dict
# can validate against more than one member -- confirm how the union is resolved.
ToolResource = Union[CodeInterpreterResource,FileSearchResource,RelatedThreadsResource] 
55 | 


--------------------------------------------------------------------------------
/ktransformers/server/schemas/base.py:
--------------------------------------------------------------------------------
 1 | from enum import Enum
 2 | from typing import Dict
 3 | 
 4 | import sqlalchemy
 5 | from pydantic import BaseModel, ConfigDict, Field
 6 | 
 7 | TODO = BaseModel
 8 | 
 9 | ObjectID = str
10 | 
11 | 
class Object(BaseModel):
    """Base schema for API objects: an identifier plus an object-type string."""
    id: ObjectID
    object: str

    # from_attributes=True allows building the model from objects that expose
    # the fields as attributes, not only from dicts.
    model_config = ConfigDict(from_attributes=True)


# Pydantic Base Models
class ObjectWithCreatedTime(Object):
    """``Object`` extended with an integer ``created_at`` field."""
    created_at: int
22 | 
23 | 
24 | 
class Order(str, Enum):
    """Sort direction, as the strings ``"asc"`` / ``"desc"``."""

    ASC = "asc"
    DESC = "desc"

    def to_sqlalchemy_order(self):
        """Return the matching SQLAlchemy ordering function.

        ``sqlalchemy.asc`` for ``ASC``, ``sqlalchemy.desc`` for ``DESC``;
        ``None`` if neither comparison matches.
        """
        if self == Order.ASC:
            return sqlalchemy.asc
        if self == Order.DESC:
            return sqlalchemy.desc
        return None
35 | 
36 | 
# String-to-string mapping used for object metadata.
Metadata = Dict[str, str]
# Reusable field definition: defaults to {}, length capped at 16, and exposed
# under the alias "metadata".
MetadataField: Metadata = Field({},max_length=16, alias="metadata")


class DeleteResponse(Object):
    """Response for a delete request: the object's id / type plus a ``deleted`` flag."""
    deleted: bool = True

class OperationResponse(BaseModel):
    """Generic response naming an operation and its resulting status."""
    operation: str
    status: str
47 | 


--------------------------------------------------------------------------------
/ktransformers/server/schemas/conversation.py:
--------------------------------------------------------------------------------
 1 | from typing import Optional
 2 | 
 3 | from pydantic import BaseModel
 4 | 
 5 | from .assistants.assistants import AssistantObject
 6 | from .assistants.threads import ThreadObject
 7 | from .assistants.messages import MessageObject
 8 | 
class ThreadPreview(BaseModel):
    """Bundle of a thread with, optionally, an assistant and a first message."""
    # May be absent; defaults to None.
    assistant: Optional[AssistantObject] = None
    # Always required.
    thread: ThreadObject
    # May be absent; defaults to None.
    first_message: Optional[MessageObject] = None
13 | 


--------------------------------------------------------------------------------
/ktransformers/server/schemas/legacy/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/server/schemas/legacy/__init__.py


--------------------------------------------------------------------------------
/ktransformers/server/schemas/legacy/completions.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Optional
 2 | from enum import Enum
 3 | from pydantic import BaseModel, Field
 4 | from ktransformers.server.config.config import Config
 5 | from ..base import Object
 6 | 
 7 | class CompletionCreate(BaseModel):
 8 |     model: str
 9 |     prompt: str | List[str]
10 |     stream: bool = False
11 |     temperature: Optional[float] = Field(default=Config().temperature)
12 |     top_p: Optional[float] = Field(default=Config().top_p)
13 |     max_tokens: Optional[int] = Field(default=None)
14 |     max_completion_tokens: Optional[int] = Field(default=None)
15 |     
16 |     def get_tokenizer_messages(self):
17 |         if isinstance(self.prompt,List):
18 |             self.get_tokenizer_messages('\n'.join(self.prompt))
19 |         return [{'content':self.prompt,'role':'user'}]
20 | 
21 | 
class FinishReason(Enum):
    """Reasons a completion choice can end with."""
    stop = 'stop'
    length = 'length'

class Choice(BaseModel):
    """One completion choice: its index, text and optional end-of-generation info."""
    index: int
    text: str
    logprobs: Optional[str] = None
    # None while the choice has no finish reason yet. Declared Optional so the
    # None default (and an explicit None) matches the annotated type.
    finish_reason: Optional[FinishReason] = None
31 | 
32 | 
class CompletionObject(Object):
    """Completion response object carrying a list of choices.

    The helper methods below only ever touch the first choice and create it
    on demand with empty text.
    """
    created: int
    choices: List[Choice] = []
    model: str = 'not implmented'
    system_fingerprint: str = 'not implmented'
    usage: Optional[str] = None

    def set_token(self, token: str):
        """Replace the text of the first choice with ``token``."""
        if len(self.choices) == 0:
            self.choices.append(Choice(index=0, text=''))
        first_choice = self.choices[0]
        first_choice.text = token

    def append_token(self, token: str):
        """Append ``token`` to the text of the first choice."""
        if len(self.choices) == 0:
            self.choices.append(Choice(index=0, text=''))
        first_choice = self.choices[0]
        first_choice.text = first_choice.text + token

    def to_stream_reply(self):
        """Serialise this object as a single ``data:`` block of a stream."""
        payload = self.model_dump_json()
        return f"data:{payload}\n\n"
52 | 


--------------------------------------------------------------------------------
/ktransformers/server/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/server/utils/__init__.py


--------------------------------------------------------------------------------
/ktransformers/server/utils/create_interface.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | '''
 4 | Description  :  
 5 | Author       : qiyuxinlin
 6 | Date         : 2024-07-25 11:50:16
 7 | Version      : 1.0.0
 8 | LastEditors  : qiyuxinlin 
 9 | LastEditTime : 2024-07-25 12:54:48
10 | Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
11 | '''
12 | from ktransformers.server.config.config import Config
13 | from ktransformers.server.backend.args import ConfigArgs
14 | from ktransformers.server.backend.context_manager import ThreadContextManager
15 | from ktransformers.server.backend.interfaces.exllamav2 import ExllamaInterface
16 | from ktransformers.server.backend.interfaces.transformers import TransformersInterface
17 | from ktransformers.server.backend.interfaces.ktransformers import KTransformersInterface
18 | 
19 | def create_interface(config: Config, default_args: ConfigArgs):
20 |     if config.backend_type=='transformers':
21 |         from ktransformers.server.backend.interfaces.transformers import  TransformersInterface as BackendInterface
22 |     elif config.backend_type == 'exllamav2':
23 |         from ktransformers.server.backend.interfaces.exllamav2 import  ExllamaInterface as BackendInterface
24 |     elif config.backend_type == 'ktransformers':
25 |         from ktransformers.server.backend.interfaces.ktransformers import  KTransformersInterface as BackendInterface
26 |     elif config.backend_type == 'balance_serve':
27 |         from ktransformers.server.backend.interfaces.balance_serve import BalanceServeInterface as BackendInterface
28 |     else:
29 |         raise NotImplementedError(f'{config.backend_type} not implemented')
30 |     GlobalInterface.interface = BackendInterface(default_args)
31 |     GlobalContextManager.context_manager = ThreadContextManager(GlobalInterface.interface)
32 | 
class GlobalContextManager:
    # Shared holder for the ThreadContextManager; assigned by create_interface().
    # Only annotated here, so the attribute does not exist until it is assigned.
    context_manager: ThreadContextManager
class GlobalInterface:
    # Shared holder for the backend interface instance; assigned by create_interface().
    # NOTE(review): create_interface() can also store a BalanceServeInterface,
    # which this annotation does not list.
    interface:  TransformersInterface | KTransformersInterface | ExllamaInterface 
37 |     
def get_thread_context_manager() -> ThreadContextManager:
    """Return the shared ``ThreadContextManager``.

    Raises ``AttributeError`` if ``create_interface()`` has not run yet,
    because the holder attribute is only assigned there.
    """
    return GlobalContextManager.context_manager


def get_interface() -> TransformersInterface | KTransformersInterface | ExllamaInterface:
    """Return the shared backend interface instance.

    Raises ``AttributeError`` if ``create_interface()`` has not run yet.
    With the 'balance_serve' backend the returned object is a
    ``BalanceServeInterface``, which is not importable at module level here
    and is therefore not part of the annotation.
    """
    return GlobalInterface.interface


--------------------------------------------------------------------------------
/ktransformers/tests/.gitignore:
--------------------------------------------------------------------------------
1 | results/


--------------------------------------------------------------------------------
/ktransformers/tests/AIME_2024/evaluation.py:
--------------------------------------------------------------------------------
 1 | # reference: https://github.com/declare-lab/instruct-eval/blob/main/human_eval/main.py#L35
 2 | def filter_answer(completion: str) -> str:
 3 |     # the answer is the last part of the completion, it's a int64 number
 4 |     # get the last line
 5 |     completion = completion.strip().split("\n")[-1]
 6 |     # handle the $\\boxed{...}$ format
 7 |     if "$\\boxed{" in completion:
 8 |         return completion.split("}")[0].split("{")[-1]
 9 |     return completion.split()[-1]
10 | 
11 | 


--------------------------------------------------------------------------------
/ktransformers/tests/AIME_2024/prompts.py:
--------------------------------------------------------------------------------
def instruct_prompt(prompt: str) -> str:
    """Wrap a math problem in the instruction/response prompt template.

    The template asks for a single answer wrapped in ``$\\boxed{}$`` and ends
    with the ``### Response:`` header, without a trailing newline.
    """
    template = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n"
        "Solve the following math problem without any tests or explanation "
        "only one answer surrounede by '$\\boxed{{}}$'\n"
        "{prompt}\n\n"
        "### Response:"
    )
    return template.format(prompt=prompt)
3 | 


--------------------------------------------------------------------------------
/ktransformers/tests/dequant_gpu.py:
--------------------------------------------------------------------------------
 1 | import os 
 2 | # os.environ["CUDA_VISIBLE_DEVICES"]="1,2"
 3 | # add path
 4 | import sys
 5 | current_path = os.path.abspath(os.path.dirname(__file__))
 6 | sys.path.append(current_path+"/../..")
 7 | import numpy as np
 8 | # from ktransformers.operators.linear import KTransformersLinear, KLinearMarlin
 9 | # from ktransformers.operators.experts import KTransformersExperts, KExpertsTorch
10 | from ktransformers.util.custom_loader import GGUFLoader
11 | import torch
12 | import KTransformersOps
13 | torch.set_default_dtype(torch.bfloat16)
14 | import time
15 | from transformers import (
16 |     AutoConfig,
17 | )
18 | import os
19 | # CUDA_LAUNCH_BLOCKING=1
20 | os.environ["CUDA_LAUNCH_BLOCKING"]="1"
21 | 
gguf_config = GGUFLoader("/data/Qwen2-57B-A14B-Instruct-GGUF/q4_k_m")
model_name = "/data/Qwen2-57B-A14B-Instruct"

# Each case loads one tensor on the CPU and on cuda:0, times both loads and
# compares the results. The last entry is the dtype the GPU result is cast to
# on the host before comparing (None = compare as loaded).
cases = (
    ("Q4k", "blk.1.", "attn_q.weight", None),
    ("Q6k", "blk.0.", "ffn_down_exps.weight", torch.float32),
)

for label, key, target, host_dtype in cases:
    start = time.time()
    q_weight_cpu = gguf_config.load_gguf_tensor(key+target, "cpu")
    cpu_done = time.time()
    q_weight_gpu = gguf_config.load_gguf_tensor(key+target, "cuda:0")
    gpu_done = time.time()
    print()

    gpu_on_host = q_weight_gpu.cpu()
    if host_dtype is not None:
        gpu_on_host = gpu_on_host.to(host_dtype)
    allclose = torch.allclose(q_weight_cpu, gpu_on_host, atol=1e-6)

    print(f"{label} {key+target}")
    print("load gguf tensor from cpu cost: ", cpu_done-start)
    print("load gguf tensor from gpu cost: ", gpu_done-cpu_done)
    print("allclose: ", allclose)
59 | 


--------------------------------------------------------------------------------
/ktransformers/tests/dequant_gpu_t.py:
--------------------------------------------------------------------------------
 1 | import os 
 2 | os.environ["CUDA_VISIBLE_DEVICES"]="1"
 3 | # add path
 4 | import sys
 5 | sys.path.append("../..")
 6 | import pycuda.autoinit
 7 | import pycuda.driver as cuda
 8 | from pycuda.compiler import SourceModule
 9 | import numpy as np
10 | from ktransformers.operators.linear import KTransformersLinear, KLinearMarlin
11 | from ktransformers.operators.experts import KTransformersExperts, KExpertsTorch
12 | from ktransformers.util.custom_loader import GGUFLoader, dequantize_q4_k_gpu, dequantize_q4_k
13 | import torch
14 | import KTransformersOps
15 | torch.set_default_dtype(torch.bfloat16)
16 | import time
17 | from transformers import (
18 |     AutoConfig,
19 | )
20 | 
gguf_config = GGUFLoader("/data/Qwen2-57B-A14B-Instruct-GGUF/q4_k_m")
model_name = "/data/Qwen2-57B-A14B-Instruct"
key = "blk.0."
target = "ffn_up_exps.weight"

data = gguf_config.get_mmap_tensor(key + target)

# CPU reference: the four parts after the first return value are converted
# from numpy into torch tensors.
_, cpu_factors, cpu_offsets, cpu_qs1, cpu_qs2 = dequantize_q4_k(data)
cpu_reference = [
    torch.from_numpy(part)
    for part in (cpu_factors, cpu_offsets, cpu_qs1, cpu_qs2)
]

# GPU variant on the same raw data.
_, gpu_factors, gpu_offsets, gpu_qs1, gpu_qs2 = dequantize_q4_k_gpu(data)
gpu_results = (gpu_factors, gpu_offsets, gpu_qs1, gpu_qs2)

# One True/False line per part, in the order factors, offsets, qs1, qs2.
for gpu_part, cpu_part in zip(gpu_results, cpu_reference):
    print(torch.allclose(gpu_part.cpu(), cpu_part))


--------------------------------------------------------------------------------
/ktransformers/tests/function_call_test.py:
--------------------------------------------------------------------------------
 1 | from openai import OpenAI
 2 | 
 3 | def send_messages(messages):
 4 |     response = client.chat.completions.create(
 5 |         model="deepseek-chat",
 6 |         messages=messages,
 7 |         tools=tools
 8 |     )
 9 |     return response.choices[0].message
10 | 
# Client pointed at a locally running OpenAI-compatible server.
client = OpenAI(
    api_key="placeholder",
    base_url="http://0.0.0.0:10002/v1",
)

# A single function tool the model may call.
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get weather of an location, the user shoud supply a location first",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city and state, e.g. San Francisco, CA",
                    }
                },
                "required": ["location"]
            },
        }
    },
]

# Round 1: ask a question that should trigger the tool.
messages = [{"role": "user", "content": "How's the weather in Hangzhou?"}]
message = send_messages(messages)
print(f"User>\t {messages[0]['content']}")
print(message)

# The reply may carry no tool call; stop with a clear message instead of
# failing on `None[0]` / an empty list.
if not message.tool_calls:
    raise SystemExit("Model returned no tool call; cannot continue the function-call test.")
tool = message.tool_calls[0]
messages.append(message)

# Round 2: feed a canned tool result back and print the final answer.
messages.append({"role": "tool", "tool_call_id": tool.id, "content": "24℃"})
message = send_messages(messages)
print(f"Model>\t {message.content}")


--------------------------------------------------------------------------------
/ktransformers/tests/humaneval/evaluation.py:
--------------------------------------------------------------------------------
 1 | # reference: https://github.com/declare-lab/instruct-eval/blob/main/human_eval/main.py#L35
 2 | def filter_code(completion: str) -> str:
 3 |     # The program tends to overwrite, we only take the first function
 4 |     completion = completion.lstrip("\n")
 5 |     # we also remove ```python\n and ```
 6 |     completion = completion.replace("```python\n", "").replace("```", "")
 7 |     if 'if __name__ == "__main__":' in completion:
 8 |         completion = completion.split('if __name__ == "__main__":')[0]
 9 |     if "# Example usage" in completion:
10 |         completion = completion.split("# Example usage")[0]
11 |     return completion
12 | 
13 | 
def fix_indents(text: str) -> str:
    """Replace every tab character with four spaces."""
    four_spaces = "    "
    return four_spaces.join(text.split("\t"))
16 | 


--------------------------------------------------------------------------------
/ktransformers/tests/humaneval/prompts.py:
--------------------------------------------------------------------------------
 1 | def instruct_prompt(prompt: str) -> str:
 2 |     return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nComplete the following Python code without any tests or explanation\n{prompt}\n\n### Response:"""
 3 | 
 4 | 
 5 | def standard_prompt(prompt: str) -> str:
 6 |     return f"""Complete the following Python code without any tests or explanation\n{prompt}"""
 7 | 
 8 | 
 9 | def write_prompt(prompt: str) -> str:
10 |     return f"""Write a python program to complete the following code:\n{prompt}"""
11 | 
12 | 
def replit_glaive_prompt(prompt: str) -> str:
    """Wrap the task in an instruction / input / response prompt template.

    The single spaces after some newlines are part of the template and are
    preserved exactly.
    """
    template = (
        "Below is an instruction that describes a task, "
        "paired with an input that provides further context.\n"
        " Write a response that appropriately completes the request.\n\n"
        " ### Instruction:\n"
        "Write a program to perform the given task.\n\n"
        " Input:\n"
        "{prompt}\n\n"
        "### Response:"
    )
    return template.format(prompt=prompt)
15 | 


--------------------------------------------------------------------------------
/ktransformers/tests/test_pytorch_q8.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | # 定义一个包含线性层的浮点模型
 4 | class LinearModel(torch.nn.Module):
 5 |     def __init__(self, in_features, out_features):
 6 |         super().__init__()
 7 |         self.linear = torch.nn.Linear(in_features, out_features)
 8 |     
 9 |     def forward(self, x):
10 |         return self.linear(x)
11 | 
# Create the floating-point model instance
in_features = 64
out_features = 128
model_fp32 = LinearModel(in_features, out_features)

# Create the quantized model instance
model_int8 = torch.ao.quantization.quantize_dynamic(
    model_fp32,          # original floating-point model
    {torch.nn.Linear},   # set of layer types to quantize
    dtype=torch.qint8    # target dtype for quantization
)

# Exercise the model
batch_size = 32
input_fp32 = torch.randn(1, batch_size, in_features)  # random input of shape (1, batch_size, in_features)
output_int8 = model_int8(input_fp32)               # run the data through the quantized model

# Print the shapes as a sanity check
print(f"输入形状: {input_fp32.shape}")
print(f"输出形状: {output_int8.shape}")

# Compare the outputs of the original and the quantized model
with torch.no_grad():
    output_fp32 = model_fp32(input_fp32)
    
# NOTE: with a 3-D output, [0, :5] selects the first five rows of batch entry 0
# (each of length out_features), not five scalar values.
print(f"FP32输出的前几个值: {output_fp32[0, :5]}")
print(f"INT8输出的前几个值: {output_int8[0, :5]}")

# Compute the mean absolute error
error = torch.abs(output_fp32 - output_int8).mean().item()
print(f"平均绝对误差: {error}")

# Print the layer types before and after quantization
print(f"量化前模型类型: {type(model_fp32.linear)}")
print(f"量化后模型类型: {type(model_int8.linear)}")


--------------------------------------------------------------------------------
/ktransformers/website/.browserslistrc:
--------------------------------------------------------------------------------
1 | > 1%
2 | last 2 versions
3 | not dead
4 | not ie 11
5 | 


--------------------------------------------------------------------------------
/ktransformers/website/.eslintrc.js:
--------------------------------------------------------------------------------
 1 | module.exports = {
 2 |   root: true,
 3 |   env: {
 4 |     node: true
 5 |   },
 6 |   'extends': [
 7 |     'plugin:vue/vue3-essential',
 8 |     'eslint:recommended',
 9 |     '@vue/typescript/recommended'
10 |   ],
11 |   parserOptions: {
12 |     ecmaVersion: 2020
13 |   },
14 |   rules: {
15 |     'no-console': process.env.NODE_ENV === 'production' ? 'warn' : 'off',
16 |     'no-debugger': process.env.NODE_ENV === 'production' ? 'warn' : 'off'
17 |   },
18 |   overrides: [
19 |     {
20 |       files: [
21 |         '**/__tests__/*.{j,t}s?(x)',
22 |         '**/tests/unit/**/*.spec.{j,t}s?(x)'
23 |       ],
24 |       env: {
25 |         jest: true
26 |       }
27 |     }
28 |   ]
29 | }
30 | 


--------------------------------------------------------------------------------
/ktransformers/website/.gitignore:
--------------------------------------------------------------------------------
 1 | .DS_Store
 2 | node_modules
 3 | /dist
 4 | 
 5 | 
 6 | # local env files
 7 | .env.local
 8 | .env.*.local
 9 | 
10 | # Log files
11 | npm-debug.log*
12 | yarn-debug.log*
13 | yarn-error.log*
14 | pnpm-debug.log*
15 | 
16 | # Editor directories and files
17 | .idea
18 | .vscode
19 | *.suo
20 | *.ntvs*
21 | *.njsproj
22 | *.sln
23 | *.sw?
24 | 


--------------------------------------------------------------------------------
/ktransformers/website/README.md:
--------------------------------------------------------------------------------
 1 | # 
 2 | 
 3 | ## Project setup
 4 | ```
 5 | npm install
 6 | ```
 7 | 
 8 | ### Compiles and hot-reloads for development
 9 | ```
10 | npm run serve
11 | ```
12 | 
13 | ### Compiles and minifies for production
14 | ```
15 | npm run build
16 | ```
17 | 
18 | ### Run your unit tests
19 | ```
20 | npm run test:unit
21 | ```
22 | 
23 | ### Lints and fixes files
24 | ```
25 | npm run lint
26 | ```
27 | 
28 | ### Customize configuration
29 | See [Configuration Reference](https://cli.vuejs.org/config/).
30 | 


--------------------------------------------------------------------------------
/ktransformers/website/config.d.ts:
--------------------------------------------------------------------------------
1 | declare module '*.js' {
2 |     const config: {
3 |       apiUrl: string;
4 |       port:number;
5 |     };
6 |     export { config };
7 |   }


--------------------------------------------------------------------------------
/ktransformers/website/jest.config.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 |   preset: '@vue/cli-plugin-unit-jest/presets/typescript'
3 | }
4 | 


--------------------------------------------------------------------------------
/ktransformers/website/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "",
 3 |   "version": "",
 4 |   "private": true,
 5 |   "scripts": {
 6 |     "serve": "vue-cli-service serve",
 7 |     "build": "vue-cli-service build",
 8 |     "test:unit": "vue-cli-service test:unit",
 9 |     "lint": "vue-cli-service lint"
10 |   },
11 |   "dependencies": {
12 |     "@types/pdfjs-dist": "^2.10.378",
13 |     "@types/websocket": "^1.0.10",
14 |     "@vue/cli": "^5.0.8",
15 |     "ant-design-vue": "^4.2.1",
16 |     "apexcharts": "^3.49.1",
17 |     "axios": "^1.7.0",
18 |     "axios-extensions": "^3.1.6",
19 |     "better-scroll": "^2.5.1",
20 |     "element-plus": "^2.7.3",
21 |     "marked": "^12.0.2",
22 |     "marked-highlight": "^2.1.1",
23 |     "pdf-lib": "^1.17.1",
24 |     "pdfobject": "^2.3.0",
25 |     "v-clipboard": "^3.0.0-next.1",
26 |     "vue": "^3.4.27",
27 |     "vue-i18n": "^9.13.1",
28 |     "vue-pdf": "^4.3.0",
29 |     "vue-router": "^4.0.3",
30 |     "vue3-apexcharts": "^1.5.3",
31 |     "vuex": "^4.0.0",
32 |     "webpack": "^5.91.0",
33 |     "webpack-cli": "^5.1.4",
34 |     "websocket": "^1.0.35"
35 |   },
36 |   "devDependencies": {
37 |     "@types/jest": "^27.0.1",
38 |     "@types/pdfobject": "^2.2.5",
39 |     "@typescript-eslint/eslint-plugin": "^5.4.0",
40 |     "@typescript-eslint/parser": "^5.4.0",
41 |     "@vue/cli-plugin-eslint": "~5.0.0",
42 |     "@vue/cli-plugin-router": "~5.0.0",
43 |     "@vue/cli-plugin-typescript": "~5.0.0",
44 |     "@vue/cli-plugin-unit-jest": "~5.0.0",
45 |     "@vue/cli-plugin-vuex": "~5.0.0",
46 |     "@vue/cli-service": "~5.0.0",
47 |     "@vue/eslint-config-typescript": "^9.1.0",
48 |     "@vue/test-utils": "^2.0.0-0",
49 |     "@vue/vue3-jest": "^27.0.0-alpha.1",
50 |     "babel-jest": "^27.0.6",
51 |     "eslint": "^7.32.0",
52 |     "eslint-plugin-vue": "^8.0.3",
53 |     "jest": "^27.0.5",
54 |     "stylus": "^0.55.0",
55 |     "stylus-loader": "^6.1.0",
56 |     "ts-jest": "^27.0.4",
57 |     "typescript": "~4.5.5"
58 |   },
59 |   "_id": "@",
60 |   "readme": "ERROR: No README data found!"
61 | }
62 | 


--------------------------------------------------------------------------------
/ktransformers/website/public/balck.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/website/public/balck.ico


--------------------------------------------------------------------------------
/ktransformers/website/public/config.js:
--------------------------------------------------------------------------------
1 | window.configWeb = {
2 |     apiUrl: 'http://119.255.238.12:15670/v1',
3 |     port: 8080,
4 |   };


--------------------------------------------------------------------------------
/ktransformers/website/public/css/reset.css:
--------------------------------------------------------------------------------
 1 | html, body, div, span, applet, object, iframe,
 2 | h1, h2, h3, h4, h5, h6, p, blockquote, pre,
 3 | a, abbr, acronym, address, big, cite, code,
 4 | del, dfn, em, img, ins, kbd, q, s, samp,
 5 | small, strike, strong, sub, sup, tt, var,
 6 | b, u, i, center,
 7 | dl, dt, dd, ol, ul, li,
 8 | fieldset, form, label, legend,textarea,
 9 | table, caption, tbody, tfoot, thead, tr, th, td,
10 | article, aside, canvas, details, embed,
11 | figure, figcaption, footer, header, hgroup,
12 | menu, nav, output, ruby, section, summary,
13 | time, mark, audio, video {
14 |     margin: 0;
15 |     padding: 0;
16 |     border: 0;
17 |     font-size: 100%;
18 |     *font: inherit;
19 |     font-family: Arial, Microsoft YaHei, SimHei, Tahoma, sans-serif !important;
20 |     vertical-align: baseline;
21 | }
22 | /* HTML5 display-role reset for older browsers */
23 | article, aside, details, figcaption, figure,
24 | footer, header, hgroup, menu, nav, section {
25 |     display: block;
26 | }
27 | body {
28 |     line-height: 1;
29 |     -webkit-text-size-adjust: 100%!important;
30 |     margin: 0;
31 | }
32 | html,body {
33 |     height: 100%;
34 |     width: 100%;
35 |     overflow: hidden;
36 | }
37 | ol, ul {
38 |     list-style: none;
39 | }
40 | blockquote, q {
41 |     quotes: none;
42 | }
43 | blockquote:before, blockquote:after,
44 | q:before, q:after {
45 |     content: '';
46 |     content: none;
47 | }
48 | table {
49 |     border-collapse: collapse;
50 |     border-spacing: 0;
51 | }
52 | 
53 | .clearfix:before,
54 | .clearfix:after {
55 |     content:"";
56 |     display:table
57 | }
58 | .clearfix:after {
59 |     clear:both
60 | }
61 | 
62 | /*显示省略号*/
63 | .ellipsis{
64 |     overflow: hidden;
65 |     text-overflow: ellipsis;
66 |     white-space: nowrap;
67 | }
68 | 


--------------------------------------------------------------------------------
/ktransformers/website/public/images/assistant-avatar.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/website/public/images/assistant-avatar.png


--------------------------------------------------------------------------------
/ktransformers/website/public/images/avatar.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/website/public/images/avatar.png


--------------------------------------------------------------------------------
/ktransformers/website/public/images/bgbg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/website/public/images/bgbg.png


--------------------------------------------------------------------------------
/ktransformers/website/public/images/logo.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/website/public/images/logo.ico


--------------------------------------------------------------------------------
/ktransformers/website/public/images/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/website/public/images/logo.png


--------------------------------------------------------------------------------
/ktransformers/website/public/images/three.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/website/public/images/three.png


--------------------------------------------------------------------------------
/ktransformers/website/public/images/user-filling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/website/public/images/user-filling.png


--------------------------------------------------------------------------------
/ktransformers/website/public/index.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="">
 3 |   <head>
 4 |     <meta charset="utf-8">
 5 |     <meta http-equiv="X-UA-Compatible" content="IE=edge">
 6 |     <meta name="viewport" content="width=device-width,initial-scale=1.0,maximum-scale=1.0,minimum-scale=1.0,user-scalable=no">
 7 |     <script src="./config.js"></script>
 8 |     <link rel="icon" href="./balck.ico" />
 9 |     <link type="text/css" rel="stylesheet" href="<%= BASE_URL %>css/reset.css"> <!-- BASE_URL (publicPath) already ends with '/', so no extra slash is added here -->
10 |     <title>KTransformers</title>
11 |   </head>
12 |   <body onselectstart='return false' onselect='return false'>
13 |     <noscript>
14 |       <strong>We're sorry but <%= htmlWebpackPlugin.options.title %> doesn't work properly without JavaScript enabled. Please enable it to continue.</strong>
15 |     </noscript>
16 |     <div id="app"></div>
17 |     <!-- built files will be auto injected -->
18 |   </body>
19 | </html>
20 | 


--------------------------------------------------------------------------------
/ktransformers/website/src/App.vue:
--------------------------------------------------------------------------------
 1 | <template>
 2 |   <div class="app-container" @contextmenu.prevent.stop="">
 3 |     <keep-alive>
 4 |       <router-view/>
 5 |     </keep-alive>
 6 |   </div>
 7 | </template>
 8 | 
 9 | <script setup lang="ts">
10 | </script>
11 | 
12 | <style lang="stylus">
13 |   @import "assets/iconfont/iconfont.css"
14 |   #app
15 |   .app-container
16 |     width: 100%
17 |     height: 100%
18 |     position: relative
19 | </style>


--------------------------------------------------------------------------------
/ktransformers/website/src/api/api-client.ts:
--------------------------------------------------------------------------------
 1 | import axios, { AxiosInstance } from 'axios';
 2 | import {baseURL} from '@/conf/config';
 3 | const apiClient: AxiosInstance = axios.create({
 4 |     baseURL: baseURL,
 5 |     // baseURL: '/api',
 6 |     headers: {
 7 |         'Content-Type': 'application/json',
 8 |     },
 9 |     withCredentials: true,
10 | });
11 | export default apiClient;
12 | 


--------------------------------------------------------------------------------
/ktransformers/website/src/api/message.ts:
--------------------------------------------------------------------------------
 1 | import apiClient from './api-client';
 2 | import { IMessage,IDeleteResult } from '../utils/types';
 3 | 
 4 | export const createMessage = async (
 5 |     thread_id: string,
 6 |     content: string,
 7 |     role?: string,
 8 |     attachments?: any[],
 9 |     metadata?:{[key:string]:any}
10 | ): Promise<IMessage> => {
11 |     const message_data: {
12 |         content: string;
13 |         role?: string;
14 |         attachments?: any[];
15 |         metadata?:{[key:string]:any}
16 |     } = {
17 |         content,
18 |     };
19 | 
20 |     if (metadata) {
21 |         message_data.metadata = metadata;
22 |     }
23 |     if (role) {
24 |         message_data.role = role;
25 |     }
26 |     if (attachments) {
27 |         message_data.attachments = attachments;
28 |     }
29 |     const response = await apiClient.post<IMessage>(`/threads/${thread_id}/messages`, message_data);
30 |     return response.data;
31 | };
32 | 
33 | 
34 | export const listMessages = async (
35 |     thread_id: string,
36 |     limit?: number,
37 |     order?: string,
38 |     after?: string,
39 |     before?: string,
40 |     run_id?: string,
41 | ): Promise<IMessage[]> => {
42 |     const params: {
43 |         limit?: number,
44 |         order?: string,
45 |         after?: string,
46 |         before?: string,
47 |         run_id?: string
48 |     } = {};
49 | 
50 |     if (typeof limit !== 'undefined') {
51 |         params.limit = limit;
52 |     }
53 |     if (typeof order !== 'undefined') {
54 |         params.order = order;
55 |     }
56 |     if (typeof after !== 'undefined') {
57 |         params.after = after;
58 |     }
59 |     if (typeof before !== 'undefined') {
60 |         params.before = before;
61 |     }
62 |     if (typeof run_id !== 'undefined') {
63 |         params.run_id = run_id;
64 |     }
65 | 
66 |     const response = await apiClient.get<IMessage[]>(`/threads/${thread_id}/messages`, {
67 |         params
68 |     });
69 | 
70 |     return response.data;
71 | };
72 | export const deleteMessage = async(thread_id:string, message_id:string): Promise<IDeleteResult> => {
73 |     const response = await apiClient.delete<IDeleteResult>(`/threads/${thread_id}/messages/${message_id}`);
74 |     return response.data;
75 | }
76 | 


--------------------------------------------------------------------------------
/ktransformers/website/src/api/thread.ts:
--------------------------------------------------------------------------------
 1 | import apiClient from './api-client';
 2 | import { IThread, IMessage, IThreadAndMessageAndAssistant, IDeleteResult } from '../utils/types';
 3 | export const createThread = async ( // POSTs a new thread to '/threads' and resolves with the response body
 4 |     message?: IMessage, // optional; sent as `message` only when truthy
 5 |     tool_resources?: object, // NOTE(review): accepted but never added to the request body — confirm whether it should be sent
 6 |     metadata?: { [key: string]: any } // optional; sent as `metadata` only when truthy
 7 | ): Promise<IThread> => {
 8 |     const thread_data: { message?: object, metadata?: { [key: string]: any } } = {};
 9 |     if (message) {
10 |         thread_data.message = message;
11 |     }
12 |     if (metadata) {
13 |         thread_data.metadata = metadata;
14 |     }
15 |     const response = await apiClient.post<IThread>(
16 |         '/threads',
17 |         thread_data);
18 |     return response.data;
19 | };
20 | 
21 | export const listThreads = async (
22 |     limit?: number,
23 |     order?: string,
24 | ): Promise<IThreadAndMessageAndAssistant[]> => {
25 |     const params: {
26 |         limit?: number,
27 |         order?: string,
28 |     } = { limit, order };
29 |     const response = await apiClient.get<IThreadAndMessageAndAssistant[]>('/threads', {
30 |         params
31 |     });
32 | 
33 |     return response.data;
34 | };
35 | 
36 | export const deleteThread = async (
37 |     thread_id: string
38 | ): Promise<IDeleteResult> => {
39 |     const response = await apiClient.delete<IDeleteResult>(`/threads/${thread_id}`);
40 |     return response.data;
41 | }
42 | 
43 | export const getThread = async (
44 |     thread_id: string
45 | ): Promise<IThread> => {
46 |     const response = await apiClient.get<IThread>(`/threads/${thread_id}`);
47 |     return response.data;
48 | }


--------------------------------------------------------------------------------
/ktransformers/website/src/assets/iconfont/iconfont.css:
--------------------------------------------------------------------------------
 1 | @font-face {
 2 |   font-family: "iconfont"; /* Project id 4550268 */
 3 |   src: url('iconfont.woff2?t=1717950820214') format('woff2'),
 4 |        url('iconfont.woff?t=1717950820214') format('woff'),
 5 |        url('iconfont.ttf?t=1717950820214') format('truetype'),
 6 |        url('iconfont.svg?t=1717950820214#iconfont') format('svg');
 7 | }
 8 | 
 9 | .iconfont {
10 |   font-family: "iconfont" !important;
11 |   font-size: 16px;
12 |   font-style: normal;
13 |   -webkit-font-smoothing: antialiased;
14 |   -moz-osx-font-smoothing: grayscale;
15 | }
16 | 
17 | .icon-copy:before {
18 |   content: "\e8b0";
19 | }
20 | 
21 | .icon-arrow-down:before {
22 |   content: "\e85e";
23 | }
24 | 
25 | .icon-usage-progress:before {
26 |   content: "\e651";
27 | }
28 | 
29 | .icon-gen-progress:before {
30 |   content: "\e617";
31 | }
32 | 
33 | .icon-back:before {
34 |   content: "\e779";
35 | }
36 | 
37 | .icon-point:before {
38 |   content: "\e608";
39 | }
40 | 
41 | .icon-edit:before {
42 |   content: "\e7dd";
43 | }
44 | 
45 | .icon-delete:before {
46 |   content: "\e614";
47 | }
48 | 
49 | .icon-upload-1:before {
50 |   content: "\e618";
51 | }
52 | 
53 | .icon-explore:before {
54 |   content: "\e621";
55 | }
56 | 
57 | .icon-ellipsis:before {
58 |   content: "\e657";
59 | }
60 | 
61 | .icon-sent:before {
62 |   content: "\e60c";
63 | }
64 | 
65 | .icon-list-list:before {
66 |   content: "\e62d";
67 | }
68 | 
69 | .icon-list-icon:before {
70 |   content: "\e639";
71 | }
72 | 
73 | .icon-zhongshi:before {
74 |   content: "\e6bd";
75 | }
76 | 
77 | .icon-log:before {
78 |   content: "\e826";
79 | }
80 | 
81 | 


--------------------------------------------------------------------------------
/ktransformers/website/src/assets/iconfont/iconfont.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/website/src/assets/iconfont/iconfont.ttf


--------------------------------------------------------------------------------
/ktransformers/website/src/assets/iconfont/iconfont.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/website/src/assets/iconfont/iconfont.woff


--------------------------------------------------------------------------------
/ktransformers/website/src/assets/iconfont/iconfont.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/website/src/assets/iconfont/iconfont.woff2


--------------------------------------------------------------------------------
/ktransformers/website/src/conf/config.ts:
--------------------------------------------------------------------------------
 1 | declare global {
 2 |     interface Window {
 3 |       configWeb: {
 4 |         apiUrl: string;
 5 |         port: string;
 6 |        };
 7 |      }
 8 |   }
 9 | 
10 | export const baseURL = window.configWeb.apiUrl;
11 | export const basePort = window.configWeb.port;
12 | 


--------------------------------------------------------------------------------
/ktransformers/website/src/locals/index.js:
--------------------------------------------------------------------------------
 1 | // index.js
 2 | import { createI18n } from 'vue-i18n'
 3 | import zh from './zh'
 4 | import en from './en'
 5 | 
 6 | const messages = {
 7 |   en,
 8 |   zh,
 9 | }
10 | const language = (navigator.language || 'en').toLocaleLowerCase() // the browser's language, lower-cased
11 | const i18n = createI18n({
12 |   legacy: false, // you must set `false`, to use Composition API
13 |   locale: localStorage.getItem('lang') || language.split('-')[0] || 'en', // take the cached choice first; otherwise fall back to the browser language
14 |   fallbackLocale: 'en', // fallback language
15 |   messages, 
16 | })
17 | 
18 | export default i18n


--------------------------------------------------------------------------------
/ktransformers/website/src/locals/zh.js:
--------------------------------------------------------------------------------
 1 | // zh.js
 2 | export default {
 3 |     home: {
 4 |         explore: '探索',
 5 |         language: '选择语言',
 6 |         english: '英语',
 7 |         chinese: '中文',
 8 |         today: '今天',
 9 |         previous:'历史',
10 |         withoutAssistantTip:'本记录的KTransformers已被删除，用户只能查看历史对话信息而无法继续对话!',
11 |         deleteThreadTip:'删除记录会清除历史信息哦～'
12 |     },
13 |     chat:{
14 |         inputTip:"发送信息和 KTransformers 畅聊吧～",
15 |     },
16 |     explore:{
17 |         description: "基于Lexllama，一起来创建你的专属KTransformers吧~",
18 |         configuring: "配置中",
19 |         completed: "完成",
20 |         assistantName: "名称",
21 |         assistantDescription: "描述",
22 |         assistantStatus: "Status",
23 |         createAssistant: "创建新的KTransformers",
24 |         deleteAssistant: "是否确认删除KTransformers，删除KTransformers之后其KVCache也会被同步清理掉哦~",
25 |     },
26 |     config:{
27 |         title:'配置你的KTransformers',
28 |         fileTip:"仅支持上传文件格式为 .text, docx, .ppt, .pdf format.",
29 |         secletFile:'选择文件',
30 |         outOfSize:'文件大小超出10MB，请重新选择',
31 |         fileExist:'文件已存在，请重新选择',
32 |         createAssistant:'KTransformers创建成功，点击build按钮开始构建KVCache',
33 |     },
34 |     build:{
35 |         title:'构建日志',
36 |         step1:'解析上传文件',
37 |         parsingFileStep1:'文件上传接收完成',
38 |         parsingFileStep2:{
39 |             parse:"正在解析第",
40 |             file:"文件",
41 |             total:'共',
42 |         },
43 |         parsingFileStep3:'Prompt装载完毕，准备生成KVCache',
44 |         step2:'生成 KVCache',
45 |         generateStep1:'生成KVCache计算计划',
46 |         generateStep2:{
47 |             calculate:"正在计算",
48 |             token:"tokens",
49 |             total:'共',
50 |         },
51 |         generateStep3:'KVCache已生成完成',
52 |         durationTime:'持续时间：',
53 |         remainTime:'剩余时间：',
54 |         buildProgress:'构建进度',
55 |         storageUsage:'存储使用：',
56 |         
57 |     }
58 | }
59 | 


--------------------------------------------------------------------------------
/ktransformers/website/src/main.ts:
--------------------------------------------------------------------------------
 1 | import { createApp } from 'vue'
 2 | import App from './App.vue'
 3 | import router from './router'
 4 | import store from './store'
 5 | import ElementPlus from 'element-plus'
 6 | import 'element-plus/dist/index.css'
 7 | import VueApexCharts from "vue3-apexcharts"
 8 | import i18n from '@/locals'
 9 | 
10 | const app = createApp(App)
11 | 
12 | app.use(ElementPlus)
13 | 
14 | app.use(i18n)
15 | app.use(VueApexCharts)
16 | app.use(store)
17 | app.use(router)
18 | app.mount('#app')
19 | 


--------------------------------------------------------------------------------
/ktransformers/website/src/router/index.ts:
--------------------------------------------------------------------------------
 1 | import { createRouter, createWebHashHistory, RouteRecordRaw, createWebHistory } from 'vue-router'
 2 | import HomeView from '@/views/home.vue'
 3 | 
 4 | const routes: Array<RouteRecordRaw> = [
 5 |   {
 6 |     path: '/',
 7 |     name: 'home',
 8 |     component: HomeView,
 9 |     redirect: '/chat',
10 |     children: [{
11 |       path: '/chat',
12 |       name: '',
13 |       component: () => import(/* webpackChunkName: "about" */ '../components/chat/index.vue')
14 |     },]
15 |   },
16 | 
17 | ]
18 | 
19 | const router = createRouter({
20 |   history: createWebHashHistory(),
21 |   routes
22 | })
23 | 
24 | export default router
25 | 


--------------------------------------------------------------------------------
/ktransformers/website/src/shims-vue.d.ts:
--------------------------------------------------------------------------------
 1 | /* eslint-disable */
 2 | declare module '*.vue' {
 3 |   import type { DefineComponent } from 'vue'
 4 |   const component: DefineComponent<{}, {}, any>
 5 |   export default component
 6 |   
 7 | }
 8 | 
 9 | declare module '@/locals'
10 | declare module 'pdfobject';
11 | 


--------------------------------------------------------------------------------
/ktransformers/website/src/store/index.ts:
--------------------------------------------------------------------------------
 1 | import { createStore } from 'vuex'
 2 | 
 3 | export default createStore({
 4 |   state: {
 5 |   },
 6 |   getters: {
 7 |   },
 8 |   mutations: {
 9 |   },
10 |   actions: {
11 |   },
12 |   modules: {
13 |   }
14 | })
15 | 


--------------------------------------------------------------------------------
/ktransformers/website/tests/unit/example.spec.ts:
--------------------------------------------------------------------------------
 1 | import { shallowMount } from '@vue/test-utils'
 2 | import HelloWorld from '@/components/HelloWorld.vue'
 3 | 
 4 | describe('HelloWorld.vue', () => {
 5 |   it('renders props.msg when passed', () => {
 6 |     const msg = 'new message'
 7 |     const wrapper = shallowMount(HelloWorld, {
 8 |       props: { msg }
 9 |     })
10 |     expect(wrapper.text()).toMatch(msg)
11 |   })
12 | })
13 | 


--------------------------------------------------------------------------------
/ktransformers/website/tsconfig.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "compilerOptions": {
 3 |     "target": "es5",
 4 |     "module": "esnext",
 5 |     "strict": true,
 6 |     "jsx": "preserve",
 7 |     "importHelpers": true,
 8 |     "moduleResolution": "node",
 9 |     "skipLibCheck": true,
10 |     "esModuleInterop": true,
11 |     "allowSyntheticDefaultImports": true,
12 |     "forceConsistentCasingInFileNames": true,
13 |     "useDefineForClassFields": true,
14 |     "sourceMap": true,
15 |     "allowJs": true,
16 |     "baseUrl": ".",
17 |     "types": [
18 |       "webpack-env",
19 |       "jest"
20 |     ],
21 |     "paths": {
22 |       "@/*": [
23 |         "src/*"
24 |       ]
25 |     },
26 |     "lib": [
27 |       "esnext",
28 |       "dom",
29 |       "dom.iterable",
30 |       "scripthost"
31 |     ]
32 |   },
33 |   "include": [
34 |     "src/**/*.ts",
35 |     "src/**/*.tsx",
36 |     "src/**/*.vue",
37 |     "tests/**/*.ts",
38 |     "tests/**/*.tsx",
39 |     "config.d.ts"
40 |   ],
41 |  
42 |   "exclude": [
43 |     "node_modules"
44 |   ]
45 | }


--------------------------------------------------------------------------------
/ktransformers/website/vue.config.js:
--------------------------------------------------------------------------------
 1 | 
 2 | module.exports = {
 3 |   // Configure webpack-dev-server behavior.
 4 |   devServer: {
 5 |     open: false, // whether to open the browser after compiling
 6 |     host: '0.0.0.0',  // host
 7 |     port: 8082,  // port
 8 |     https: false,  // whether to use https
 9 |     proxy: {
10 |         '/api': {
11 |           target: 'http://localhost:9016/v1', // your backend server address
12 |           changeOrigin: true, // whether to allow cross-origin requests
13 |           pathRewrite: {
14 |             '/api': '' // replace the '/api' prefix with an empty string, if your backend does not need this prefix
15 |           }
16 |         }
17 |       }
18 | },
19 | publicPath: '/web/',  // base path
20 | outputDir: 'dist', // output directory of the build
21 | assetsDir: 'static', // directory for static assets
22 | indexPath: 'index.html', // output path of the html
23 | filenameHashing: true, // hash in file names
24 | lintOnSave: false, // whether to check with `eslint-loader` on save.
25 | 
26 | // How is a component rendered into the page? (ast: abstract syntax tree; vDom: virtual DOM)
27 | // template ---> ast ---> render ---> vDom ---> real DOM ---> page
28 | // runtime-only: the template is already compiled into a render function at bundle time
29 | // runtime-compiler: the template is compiled only at run time
30 | runtimeCompiler: false,
31 | 
32 | transpileDependencies: [], // babel-loader skips node_modules dependencies by default.
33 | productionSourceMap: false, // whether to generate source maps for the production build
34 | 
35 | // adjust the internal webpack configuration
36 | configureWebpack: () => {},
37 | 
38 | chainWebpack: () => {},
39 |   
40 | }


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = [
 3 |   "setuptools",
 4 |   "torch >= 2.3.0", 
 5 |   "ninja",
 6 |   "packaging",
 7 |   "cpufeature"
 8 |   ]
 9 | build-backend = "setuptools.build_meta"
10 | 
11 | [project]
12 | 
13 | name = "ktransformers"
14 | 
15 | dynamic = ["version"]
16 | 
17 | dependencies = [
18 |   "torch >= 2.3.0",
19 |   "transformers == 4.51.3",
20 |   "fastapi >= 0.111.0",
21 |   "uvicorn >= 0.30.1",
22 |   "langchain >= 0.2.0",
23 |   "blessed >= 1.20.0",
24 |   "accelerate >= 0.31.0",
25 |   "sentencepiece >= 0.1.97",
26 |   "setuptools",
27 |   "ninja",
28 |   "wheel",
29 |   "colorlog",
30 |   "build",
31 |   "fire",
32 |   "protobuf",
33 | ]
34 | 
35 | requires-python = ">=3.10"
36 | 
37 | authors = [
38 |   {name = "KVCache.AI", email = "zhang.mingxing@outlook.com"}
39 | ]
40 | 
41 | maintainers = [
42 |   {name = "james0zan", email = "zhang.mingxing@outlook.com"},
43 |   {name = "awake", email = "awake@approaching.ai"},
44 |   {name = "unicorn chan", email = "nl@approaching.ai"}
45 | ]
46 | 
47 | description = "KTransformers, pronounced as Quick Transformers, is designed to enhance your Transformers experience with advanced kernel optimizations and placement/parallelism strategies."
48 | 
49 | readme = "README.md"
50 | license = {file = "LICENSE"}
51 | 
52 | keywords = ["ktransformers", "llm"]
53 | 
54 | classifiers = [
55 |   "Development Status :: 4 - Beta",
56 |   "Programming Language :: Python :: 3.10",
57 |   "Programming Language :: Python :: 3.11",
58 |   "Programming Language :: Python :: 3.12"
59 | ]
60 | 
61 | [project.urls]
62 | Homepage = "https://kvcache.ai"
63 | Repository = "https://github.com/kvcache-ai/ktransformers.git"
64 | Issues = "https://github.com/kvcache-ai/ktransformers/issues"
65 | 
66 | 
67 | [project.scripts]
68 | ktransformers = "ktransformers.server.main:main"
69 | 
70 | [tool.setuptools.packages.find]
71 | where = ["./", ]
72 | include = ["ktransformers"]
73 | [tool.black]
74 | line-length = 120
75 | preview = true
76 | unstable = true
77 | 


--------------------------------------------------------------------------------
/requirements-local_chat.txt:
--------------------------------------------------------------------------------
 1 | fire
 2 | transformers==4.51.3
 3 | numpy
 4 | torch>=2.3.0
 5 | packaging
 6 | cpufeature; sys_platform == 'win32' or sys_platform == 'Windows'
 7 | protobuf
 8 | tiktoken
 9 | blobfile
10 | 


--------------------------------------------------------------------------------
/third_party/llamafile/README.md:
--------------------------------------------------------------------------------
1 | The code in this folder is copied from [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile). Special thanks to the Mozilla-Ocho team.
2 | 


--------------------------------------------------------------------------------
/third_party/llamafile/bench.h:
--------------------------------------------------------------------------------
 1 | // Adapted from
 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/bench.h
 3 | // Copyright 2024 Mozilla Foundation.
 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
 5 | 
 6 | // -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
 7 | // vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
 8 | #pragma once
 9 | 
10 | #include <stdio.h>
11 | 
12 | #include "micros.h"
13 | 
14 | #define BENCH(x)                                                                       \
15 |     do {                                                                               \
16 |         x;                                                                             \
17 |         __asm__ volatile("" ::: "memory");                                             \
18 |         long long start = micros();                                                    \
19 |         for (int i = 0; i < ITERATIONS; ++i) {                                         \
20 |             __asm__ volatile("" ::: "memory");                                         \
21 |             x;                                                                         \
22 |             __asm__ volatile("" ::: "memory");                                         \
23 |         }                                                                              \
24 |         printf("%9lld us %s\n", (micros() - start + ITERATIONS - 1) / ITERATIONS, #x); \
25 |     } while (0)
26 | 


--------------------------------------------------------------------------------
/third_party/llamafile/flags.cpp:
--------------------------------------------------------------------------------
1 | // Adapted from
2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/flags.cpp
3 | // Copyright 2024 Mozilla Foundation.
4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
5 | 
6 | #include "flags.h"
7 | 
8 | bool FLAG_precise = false;
9 | 


--------------------------------------------------------------------------------
/third_party/llamafile/flags.h:
--------------------------------------------------------------------------------
1 | // Adapted from
2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/flags.cpp
3 | // Copyright 2024 Mozilla Foundation.
4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
5 | 
6 | #pragma once
7 | 
8 | extern bool FLAG_precise;
9 | 


--------------------------------------------------------------------------------
/third_party/llamafile/iqk_mul_mat_amd_avx2.cpp:
--------------------------------------------------------------------------------
1 | // Adapted from
2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/iqk_mul_mat_amd_avx2.cpp
3 | // Copyright 2024 Iwan Kawrakow.
4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
5 | 
6 | #if defined(__x86_64__) || defined(_M_X64)
7 | #include "iqk_mul_mat.inc"
8 | #endif  // __x86_64__
9 | 


--------------------------------------------------------------------------------
/third_party/llamafile/iqk_mul_mat_amd_zen4.cpp:
--------------------------------------------------------------------------------
 1 | // Adapted from
 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/iqk_mul_mat_amd_zen4.cpp
 3 | // Copyright 2024 Iwan Kawrakow.
 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
 5 | 
 6 | #if defined(__x86_64__) || defined(_M_X64)
 7 | #define iqk_mul_mat iqk_mul_mat_zen4
 8 | #define iqk_mul_mat_moe iqk_mul_mat_moe_zen4
 9 | #include "iqk_mul_mat.inc"
10 | #endif  // __x86_64__
11 | 


--------------------------------------------------------------------------------
/third_party/llamafile/iqk_mul_mat_arm82.cpp:
--------------------------------------------------------------------------------
 1 | // Adapted from
 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/iqk_mul_mat_arm82.cpp
 3 | // Copyright 2024 Iwan Kawrakow.
 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
 5 | 
 6 | #ifdef __aarch64__
 7 | #define iqk_mul_mat iqk_mul_mat_arm82
 8 | #define iqk_mul_mat_moe iqk_mul_mat_moe_arm82
 9 | #include "iqk_mul_mat.inc"
10 | #endif  // __aarch64__
11 | 


--------------------------------------------------------------------------------
/third_party/llamafile/macros.h:
--------------------------------------------------------------------------------
 1 | // Adapted from
 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/macros.h
 3 | // Copyright 2024 Mozilla Foundation.
 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
 5 | 
 6 | // -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
 7 | // vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
 8 | #pragma once
 9 | 
10 | #define MIN(X, Y) ((Y) > (X) ? (X) : (Y))
11 | #define MAX(X, Y) ((Y) < (X) ? (X) : (Y))
12 | #define CEIL_DIV(M, N) (((M) + (N) - 1) / (N))
13 | #define ROUNDUP(X, K) (((X) + (K) - 1) & -(K))
14 | #define ARRAYLEN(A) ((sizeof(A) / sizeof(*(A))) / ((unsigned)!(sizeof(A) % sizeof(*(A)))))
15 | 


--------------------------------------------------------------------------------
/third_party/llamafile/micros.h:
--------------------------------------------------------------------------------
 1 | // Adapted from
 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/micros.h
 3 | // Copyright 2024 Mozilla Foundation.
 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
 5 | 
 6 | // -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
 7 | // vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
 8 | #pragma once
 9 | 
10 | #include <ctime>
11 | 
12 | #ifndef _WIN32
13 | #include <unistd.h>
14 | #else
15 | #include <windows.h>
16 | #endif
17 | 
18 | #ifdef _WIN32
19 | static long long GetQueryPerformanceFrequency() {
20 |     LARGE_INTEGER t;
21 |     QueryPerformanceFrequency(&t);
22 |     return t.QuadPart;
23 | }
24 | static long long GetQueryPerformanceCounter() {
25 |     LARGE_INTEGER t;
26 |     QueryPerformanceCounter(&t);
27 |     return t.QuadPart;
28 | }
29 | #endif
30 | 
31 | static long long micros(void) {  // Returns a timestamp in microseconds: CLOCK_REALTIME-based on POSIX, elapsed since the first call on Windows.
32 | #ifndef _WIN32
33 |     struct timespec ts;
34 |     clock_gettime(CLOCK_REALTIME, &ts);
35 |     return (long long)ts.tv_sec * 1000000 + (ts.tv_nsec + 999) / 1000;  // widen before multiplying so a 32-bit time_t cannot overflow; nanoseconds round up
36 | #else
37 |     static long long timer_freq = GetQueryPerformanceFrequency();  // ticks per second, queried once
38 |     static long long timer_start = GetQueryPerformanceCounter();  // counter value captured on the first call
39 |     return ((GetQueryPerformanceCounter() - timer_start) * 1000000) / timer_freq;
40 | #endif
41 | }
42 | 


--------------------------------------------------------------------------------
/third_party/llamafile/numba.h:
--------------------------------------------------------------------------------
 1 | // Adapted from
 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/numba.h
 3 | // Copyright 2024 Mozilla Foundation.
 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
 5 | 
 6 | #pragma once
 7 | 
 8 | inline int rand32(void) {
 9 |     static unsigned long long lcg = 1;
10 |     lcg *= 6364136223846793005;
11 |     lcg += 1442695040888963407;
12 |     return lcg >> 32;
13 | }
14 | 
15 | inline int popcount(unsigned x) {
16 |     x = x - ((x >> 1) & 0x55555555);
17 |     x = ((x >> 2) & 0x33333333) + (x & 0x33333333);
18 |     x = (x + (x >> 4)) & 0x0F0F0F0F;
19 |     x = (x + (x >> 16));
20 |     return (x + (x >> 8)) & 0x0000003F;
21 | }
22 | 
23 | inline int hamming(int x, int y) {
24 |     return popcount(x ^ y);
25 | }
26 | 
27 | inline float float01(unsigned x) {  // (0,1)
28 |     return 1.f / 8388608 * ((x >> 9) + .5f);
29 | }
30 | 
31 | inline float numba(void) {  // (-1,1)
32 |     return float01(rand32()) * 2.f - 1.f;
33 | }
34 | 
35 | template <typename T>
36 | void randomize(T* A, int n) {
37 |     for (int i = 0; i < n; ++i)
38 |         A[i] = numba();
39 | }
40 | 
41 | template <typename T>
42 | void randomize(int m, int n, T* A, int lda) {
43 |     for (int j = 0; j < n; ++j)
44 |         for (int i = 0; i < m; ++i)
45 |             A[lda * j + i] = numba();
46 | }
47 | 
48 | template <typename T, typename U>
49 | void broadcast(T* A, int n, U x) {
50 |     for (int i = 0; i < n; ++i)
51 |         A[i] = x;
52 | }
53 | 
54 | template <typename T, typename U>
55 | void broadcast(int m, int n, T* A, int lda, U x) {
56 |     for (int j = 0; j < n; ++j)
57 |         for (int i = 0; i < m; ++i)
58 |             A[lda * j + i] = x;
59 | }
60 | 


--------------------------------------------------------------------------------
/third_party/llamafile/tinyblas_cpu_mixmul_amd_avx.cpp:
--------------------------------------------------------------------------------
 1 | // Adapted from
 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_amd_avx.cpp
 3 | // Copyright 2024 Mozilla Foundation.
 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
 5 | 
 6 | #if defined(__x86_64__) || defined(_M_X64)
 7 | #define llamafile_mixmul llamafile_mixmul_amd_avx
 8 | #include "tinyblas_cpu_mixmul.inc"
 9 | 
10 | /**
11 |  * Returns number of shared memory bytes llamafile_mixmul() needs.
12 |  */
13 | size_t llamafile_mixmul_needs(const ggml_tensor* weights, const ggml_tensor* thought, const ggml_tensor* plan) {
14 |     ggml_compute_params params{};
15 |     params.wsize = 0x7ffff000;
16 |     params.wdata = (void*)0x1000;
17 |     MixMul mm{&params, weights, thought, plan, 0};
18 |     if (mm.allocate_shared_memory())
19 |         return mm.get_allocated_bytes();
20 |     else
21 |         return 0;
22 | }
23 | 
24 | #endif  // __x86_64__
25 | 


--------------------------------------------------------------------------------
/third_party/llamafile/tinyblas_cpu_mixmul_amd_avx2.cpp:
--------------------------------------------------------------------------------
 1 | // Adapted from
 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_amd_avx2.cpp
 3 | // Copyright 2024 Mozilla Foundation.
 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
 5 | 
 6 | #if defined(__x86_64__) || defined(_M_X64)
 7 | #define llamafile_mixmul llamafile_mixmul_amd_avx2
 8 | #include "tinyblas_cpu_mixmul.inc"
 9 | #endif  // __x86_64__
10 | 


--------------------------------------------------------------------------------
/third_party/llamafile/tinyblas_cpu_mixmul_amd_avx512f.cpp:
--------------------------------------------------------------------------------
 1 | // Adapted from
 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_amd_avx512f.cpp
 3 | // Copyright 2024 Mozilla Foundation.
 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
 5 | 
 6 | #if defined(__x86_64__) || defined(_M_X64)
 7 | #define llamafile_mixmul llamafile_mixmul_amd_avx512f
 8 | #include "tinyblas_cpu_mixmul.inc"
 9 | #endif  // __x86_64__
10 | 


--------------------------------------------------------------------------------
/third_party/llamafile/tinyblas_cpu_mixmul_amd_avxvnni.cpp:
--------------------------------------------------------------------------------
 1 | // Adapted from
 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_amd_avxvnni.cpp
 3 | // Copyright 2024 Mozilla Foundation.
 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
 5 | 
 6 | #if defined(__x86_64__) || defined(_M_X64)
 7 | #define llamafile_mixmul llamafile_mixmul_amd_avxvnni
 8 | #include "tinyblas_cpu_mixmul.inc"
 9 | #endif  // __x86_64__
10 | 


--------------------------------------------------------------------------------
/third_party/llamafile/tinyblas_cpu_mixmul_amd_fma.cpp:
--------------------------------------------------------------------------------
 1 | // Adapted from
 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_amd_fma.cpp
 3 | // Copyright 2024 Mozilla Foundation.
 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
 5 | 
 6 | #if defined(__x86_64__) || defined(_M_X64)
 7 | #define llamafile_mixmul llamafile_mixmul_amd_fma
 8 | #include "tinyblas_cpu_mixmul.inc"
 9 | #endif  // __x86_64__
10 | 


--------------------------------------------------------------------------------
/third_party/llamafile/tinyblas_cpu_mixmul_amd_zen4.cpp:
--------------------------------------------------------------------------------
 1 | // Adapted from
 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_amd_zen4.cpp
 3 | // Copyright 2024 Mozilla Foundation.
 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
 5 | 
 6 | #if defined(__x86_64__) || defined(_M_X64)
 7 | #define llamafile_mixmul llamafile_mixmul_amd_zen4
 8 | #include "tinyblas_cpu_mixmul.inc"
 9 | #endif  // __x86_64__
10 | 


--------------------------------------------------------------------------------
/third_party/llamafile/tinyblas_cpu_mixmul_arm80.cpp:
--------------------------------------------------------------------------------
 1 | // Adapted from
 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_arm80.cpp
 3 | // Copyright 2024 Mozilla Foundation.
 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
 5 | 
 6 | #ifdef __aarch64__
 7 | #define llamafile_mixmul llamafile_mixmul_arm80
 8 | #include "tinyblas_cpu_mixmul.inc"
 9 | 
10 | /**
11 |  * Returns number of shared memory bytes llamafile_mixmul() needs, or 0 when allocate_shared_memory() returns false.
12 |  */
13 | size_t llamafile_mixmul_needs(const ggml_tensor* weights, const ggml_tensor* thought, const ggml_tensor* plan) {
14 |     ggml_compute_params params{};  // value-initialized; only wsize and wdata are assigned below
15 |     params.wsize = 0x7ffff000;  // NOTE(review): looks like a deliberately large size limit for the sizing pass; confirm
16 |     params.wdata = (void*)0x1000;  // NOTE(review): fixed placeholder address, presumably never dereferenced here; confirm in MixMul
17 |     MixMul mm{&params, weights, thought, plan, 0};  // fifth argument is 0; presumably the result tensor, confirm against MixMul
18 |     if (mm.allocate_shared_memory())
19 |         return mm.get_allocated_bytes();  // byte count reported by mm after allocate_shared_memory() returned true
20 |     else
21 |         return 0;  // allocate_shared_memory() returned false
22 | }
23 | 
24 | #endif  // __aarch64__
25 | 


--------------------------------------------------------------------------------
/third_party/llamafile/tinyblas_cpu_mixmul_arm82.cpp:
--------------------------------------------------------------------------------
 1 | // Adapted from
 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_arm82.cpp
 3 | // Copyright 2024 Mozilla Foundation.
 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
 5 | 
 6 | #ifdef __aarch64__
 7 | #define llamafile_mixmul llamafile_mixmul_arm82
 8 | #include "tinyblas_cpu_mixmul.inc"
 9 | #endif  // __aarch64__
10 | 


--------------------------------------------------------------------------------
/third_party/llamafile/tinyblas_cpu_sgemm_amd_avx.cpp:
--------------------------------------------------------------------------------
 1 | // Adapted from
 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_amd_avx.cpp
 3 | // Copyright 2024 Mozilla Foundation.
 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
 5 | 
 6 | #if defined(__x86_64__) || defined(_M_X64)
 7 | #define llamafile_sgemm llamafile_sgemm_amd_avx
 8 | #include "tinyblas_cpu_sgemm.inc"
 9 | #endif  // __x86_64__
10 | 


--------------------------------------------------------------------------------
/third_party/llamafile/tinyblas_cpu_sgemm_amd_avx2.cpp:
--------------------------------------------------------------------------------
 1 | // Adapted from
 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_amd_avx2.cpp
 3 | // Copyright 2024 Mozilla Foundation.
 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
 5 | 
 6 | #if defined(__x86_64__) || defined(_M_X64)
 7 | #define llamafile_sgemm llamafile_sgemm_amd_avx2
 8 | #include "tinyblas_cpu_sgemm.inc"
 9 | #endif  // __x86_64__
10 | 


--------------------------------------------------------------------------------
/third_party/llamafile/tinyblas_cpu_sgemm_amd_avx512f.cpp:
--------------------------------------------------------------------------------
 1 | // Adapted from
 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_amd_avx512f.cpp
 3 | // Copyright 2024 Mozilla Foundation.
 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
 5 | 
 6 | #if defined(__x86_64__) || defined(_M_X64)
 7 | #define llamafile_sgemm llamafile_sgemm_amd_avx512f
 8 | #include "tinyblas_cpu_sgemm.inc"
 9 | #endif  // __x86_64__
10 | 


--------------------------------------------------------------------------------
/third_party/llamafile/tinyblas_cpu_sgemm_amd_avxvnni.cpp:
--------------------------------------------------------------------------------
 1 | // Adapted from
 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_amd_avxvnni.cpp
 3 | // Copyright 2024 Mozilla Foundation.
 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
 5 | 
 6 | #if defined(__x86_64__) || defined(_M_X64)
 7 | #define llamafile_sgemm llamafile_sgemm_amd_avxvnni
 8 | #include "tinyblas_cpu_sgemm.inc"
 9 | #endif  // __x86_64__
10 | 


--------------------------------------------------------------------------------
/third_party/llamafile/tinyblas_cpu_sgemm_amd_fma.cpp:
--------------------------------------------------------------------------------
 1 | // Adapted from
 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_amd_fma.cpp
 3 | // Copyright 2024 Mozilla Foundation.
 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
 5 | 
 6 | #if defined(__x86_64__) || defined(_M_X64)
 7 | #define llamafile_sgemm llamafile_sgemm_amd_fma
 8 | #include "tinyblas_cpu_sgemm.inc"
 9 | #endif  // __x86_64__
10 | 


--------------------------------------------------------------------------------
/third_party/llamafile/tinyblas_cpu_sgemm_amd_zen4.cpp:
--------------------------------------------------------------------------------
 1 | // Adapted from
 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_amd_zen4.cpp
 3 | // Copyright 2024 Mozilla Foundation.
 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
 5 | 
 6 | #if defined(__x86_64__) || defined(_M_X64)
 7 | #define llamafile_sgemm llamafile_sgemm_amd_zen4
 8 | #define iqk_mul_mat iqk_mul_mat_zen4
 9 | #include "tinyblas_cpu_sgemm.inc"
10 | #endif  // __x86_64__
11 | 


--------------------------------------------------------------------------------
/third_party/llamafile/tinyblas_cpu_sgemm_arm80.cpp:
--------------------------------------------------------------------------------
 1 | // Adapted from
 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_arm80.cpp
 3 | // Copyright 2024 Mozilla Foundation.
 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
 5 | 
 6 | #ifdef __aarch64__
 7 | #define llamafile_sgemm llamafile_sgemm_arm80
 8 | #include "tinyblas_cpu_sgemm.inc"
 9 | #endif  // __aarch64__
10 | 


--------------------------------------------------------------------------------
/third_party/llamafile/tinyblas_cpu_sgemm_arm82.cpp:
--------------------------------------------------------------------------------
 1 | // Adapted from
 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_arm82.cpp
 3 | // Copyright 2024 Mozilla Foundation.
 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
 5 | 
 6 | #ifdef __aarch64__
 7 | #define llamafile_sgemm llamafile_sgemm_arm82
 8 | #define iqk_mul_mat iqk_mul_mat_arm82
 9 | #include "tinyblas_cpu_sgemm.inc"
10 | #endif  // __aarch64__
11 | 


--------------------------------------------------------------------------------
/third_party/llamafile/tinyblas_cpu_unsupported.cpp:
--------------------------------------------------------------------------------
 1 | // Adapted from
 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_unsupported.cpp
 3 | // Copyright 2024 Mozilla Foundation.
 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
 5 | 
 6 | // -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
 7 | // vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
 8 | //
 9 | // Copyright 2024 Mozilla Foundation
10 | //
11 | // Licensed under the Apache License, Version 2.0 (the "License");
12 | // you may not use this file except in compliance with the License.
13 | // You may obtain a copy of the License at
14 | //
15 | //     http://www.apache.org/licenses/LICENSE-2.0
16 | //
17 | // Unless required by applicable law or agreed to in writing, software
18 | // distributed under the License is distributed on an "AS IS" BASIS,
19 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | // See the License for the specific language governing permissions and
21 | // limitations under the License.
22 | 
23 | #include "sgemm.h"
24 | 
25 | bool llamafile_sgemm_unsupported(long m, long n, long k, const void* A, long lda, const void* B, long ldb, void* C, long ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype, int precision) {  // stub: no argument is read and C is never written
26 |     return false;  // unconditional; presumably signals that no kernel handled the request (confirm against sgemm.h)
27 | }
28 | 
29 | bool llamafile_mixmul_unsupported(const struct ggml_compute_params* params,  // unused
30 |                                   const struct ggml_tensor* weights,  // unused
31 |                                   const struct ggml_tensor* thought,  // unused
32 |                                   const struct ggml_tensor* plan,  // unused
33 |                                   struct ggml_tensor* result) {  // unused; never written by this stub
34 |     return false;  // unconditional; presumably signals that no kernel handled the request (confirm against sgemm.h)
35 | }
36 | 
37 | bool iqk_mul_mat_moe_unsupported(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int) {  // stub: all parameters are unnamed and therefore unused
38 |     return false;  // unconditional; presumably signals that no kernel handled the request (confirm against sgemm.h)
39 | }
40 | 


--------------------------------------------------------------------------------