├── .devcontainer ├── Dockerfile └── devcontainer.json ├── .flake8 ├── .github ├── ISSUE_TEMPLATE │ ├── -bug-.yaml │ ├── -bug2-.yaml │ ├── -feature-.yaml │ └── -feature2-.yaml └── workflows │ ├── book-ci.yml │ ├── deploy.yml │ ├── docker-image.yml │ ├── install.yml │ ├── package_wheel_release.yml │ ├── package_wheel_test.yml │ └── score.yml ├── .gitignore ├── .gitmodules ├── .pylintrc ├── Dockerfile ├── Dockerfile.xpu ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── README_ZH.md ├── SECURITY.md ├── WeChatGroup.png ├── book.toml ├── csrc ├── balance_serve │ ├── CMakeLists.txt │ ├── kvc2 │ │ ├── .clang-format │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ ├── config │ │ │ ├── model_configs.json │ │ │ └── quant_configs.json │ │ ├── export_envs_before_run.sh │ │ ├── install_deps.sh │ │ ├── mkfs.sh │ │ ├── src │ │ │ ├── CMakeLists.txt │ │ │ ├── async_store.cpp │ │ │ ├── async_store.hh │ │ │ ├── bind.cpp │ │ │ ├── cache_entry.cpp │ │ │ ├── cache_entry.hh │ │ │ ├── common.h │ │ │ ├── cuda_stream_manager.cpp │ │ │ ├── cuda_stream_manager.hh │ │ │ ├── defs.h │ │ │ ├── gpu_cache.cpp │ │ │ ├── gpu_cache.hh │ │ │ ├── hasher.hpp │ │ │ ├── io_helper.hpp │ │ │ ├── kvc2.h │ │ │ ├── kvc2_utils.py │ │ │ ├── metrics.cpp │ │ │ ├── metrics.h │ │ │ ├── model_config.h │ │ │ ├── page_aligned_memory_pool.cpp │ │ │ ├── page_aligned_memory_pool.h │ │ │ ├── prefix.cpp │ │ │ └── utils │ │ │ │ ├── all.hpp │ │ │ │ ├── arithmetic.hpp │ │ │ │ ├── easy_format.hpp │ │ │ │ ├── lock_free_queue.hpp │ │ │ │ ├── mpsc.hpp │ │ │ │ ├── mutex_extend.hpp │ │ │ │ ├── periodic_task.hpp │ │ │ │ ├── spin_lock.hpp │ │ │ │ └── timer.hpp │ │ ├── test │ │ │ ├── CMakeLists.txt │ │ │ ├── hashmap_test.cpp │ │ │ ├── kvc2_export_header_test.cpp │ │ │ ├── kvc2_export_load_test.cpp │ │ │ ├── kvc2_test_utils.cpp │ │ │ ├── kvc2test │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── append-tokens.cpp │ │ │ │ ├── check-flush-back.cpp │ │ │ │ ├── common.hpp │ │ │ │ ├── flush-back.cpp │ │ │ │ ├── lookup-alt-gpu.cpp │ │ │ │ ├── 
lookup-alt.cpp │ │ │ │ ├── lookup-gpu-async.cpp │ │ │ │ ├── lookup-gpu-mt-without-vcache.cpp │ │ │ │ ├── lookup-gpu-mt.cpp │ │ │ │ ├── lookup-gpu.cpp │ │ │ │ ├── lookup-mt.cpp │ │ │ │ ├── lookup-without-vcache.cpp │ │ │ │ ├── lookup.cpp │ │ │ │ └── raw_insert_read.cpp │ │ │ ├── kvcache_disk_insert_read_test.cpp │ │ │ ├── kvcache_mem_eviction_test.cpp │ │ │ ├── kvcache_mem_insert_read_test.cpp │ │ │ ├── kvcache_save_load_test.cpp │ │ │ ├── kvcache_test_utils.cpp │ │ │ ├── page_pool_test.cpp │ │ │ ├── prefix_test.cpp │ │ │ ├── pytest_load.py │ │ │ ├── pytest_mem_prefix_test.py │ │ │ ├── pytest_mem_read.py │ │ │ ├── pytest_raw_insert_and_read.py │ │ │ ├── test_align.py │ │ │ ├── test_cuda_stream.cpp │ │ │ ├── test_cuda_stream_manager.cpp │ │ │ ├── test_lock_free_queue.cpp │ │ │ ├── test_periodic_task.cpp │ │ │ ├── test_queue_perf.cpp │ │ │ ├── test_std_list.cpp │ │ │ └── xxHash_test.cpp │ │ └── unit_test.sh │ └── sched │ │ ├── CMakeLists.txt │ │ ├── bind.cpp │ │ ├── metrics.cpp │ │ ├── metrics.h │ │ ├── model_config.h │ │ ├── scheduler.cpp │ │ ├── scheduler.h │ │ └── utils │ │ ├── all.hpp │ │ ├── arithmetic.hpp │ │ ├── atomic_ptr_with_flags.hpp │ │ ├── csv.hpp │ │ ├── easy_format.hpp │ │ ├── mpsc.hpp │ │ ├── readable_number.hpp │ │ ├── statistics.hpp │ │ └── timer.hpp ├── custom_marlin │ ├── __init__.py │ ├── binding.cpp │ ├── gptq_marlin │ │ ├── gptq_marlin.cu │ │ ├── gptq_marlin.cuh │ │ ├── gptq_marlin_dtypes.cuh │ │ ├── gptq_marlin_repack.cu │ │ └── ops.h │ ├── setup.py │ ├── test_cuda_graph.py │ └── utils │ │ ├── __init__.py │ │ ├── format24.py │ │ ├── marlin_24_perms.py │ │ ├── marlin_perms.py │ │ ├── marlin_utils.py │ │ └── quant_utils.py └── ktransformers_ext │ ├── CMakeLists.txt │ ├── bench │ ├── bench_attention.py │ ├── bench_attention_torch.py │ ├── bench_linear.py │ ├── bench_linear_torch.py │ ├── bench_mlp.py │ ├── bench_mlp_torch.py │ ├── bench_moe.py │ ├── bench_moe_amx.py │ └── bench_moe_torch.py │ ├── cmake │ └── FindSIMD.cmake │ ├── cpu_backend │ ├── 
backend.cpp │ ├── backend.h │ ├── cpuinfer.h │ ├── shared_mem_buffer.cpp │ ├── shared_mem_buffer.h │ ├── task_queue.cpp │ ├── task_queue.h │ └── vendors │ │ ├── README.md │ │ ├── cuda.h │ │ ├── hip.h │ │ ├── musa.h │ │ └── vendor.h │ ├── cuda │ ├── binding.cpp │ ├── custom_gguf │ │ ├── dequant.cu │ │ └── ops.h │ ├── gptq_marlin │ │ ├── gptq_marlin.cu │ │ ├── gptq_marlin.cuh │ │ ├── gptq_marlin_dtypes.cuh │ │ └── ops.h │ ├── setup.py │ └── test_dequant.py │ ├── examples │ ├── test_attention.py │ ├── test_linear.py │ ├── test_mlp.py │ └── test_moe.py │ ├── ext_bindings.cpp │ ├── operators │ ├── amx │ │ ├── la │ │ │ ├── amx.hpp │ │ │ └── utils.hpp │ │ └── moe.hpp │ ├── kvcache │ │ ├── kvcache.h │ │ ├── kvcache_attn.cpp │ │ ├── kvcache_load_dump.cpp │ │ ├── kvcache_read_write.cpp │ │ └── kvcache_utils.cpp │ └── llamafile │ │ ├── conversion.h │ │ ├── linear.cpp │ │ ├── linear.h │ │ ├── mlp.cpp │ │ ├── mlp.h │ │ ├── moe.cpp │ │ └── moe.h │ └── vendors │ ├── cuda.h │ ├── hip.h │ ├── musa.h │ └── vendor.h ├── doc ├── README.md ├── SUMMARY.md ├── assets │ ├── BigCodeBench.png │ ├── DeepSeek-on-KTransformers.png │ ├── Framework_effect.png │ ├── InfLLM_equation.jpg │ ├── InfLLM_framework.png │ ├── InjectStruction.png │ ├── KTransformers.png │ ├── KTransformers_long_context_v1.png │ ├── KTransformers_long_context_v2.png │ ├── Quest_framework.png │ ├── SnapKV_framework.png │ ├── SparQ_attention.png │ ├── amx.png │ ├── amx_avx.png │ ├── amx_intro.png │ ├── cpuinfer.png │ ├── deepseekv2_structure.png │ ├── internlm_memory.png │ ├── long_context_generate.png │ ├── long_context_prefill.png │ ├── model_structure_guild.png │ ├── multi_gpu.png │ ├── needle_128K.png │ ├── needle_1M.png │ ├── onednn_1.png │ └── website.png ├── basic │ ├── note1.md │ └── note2.md ├── en │ ├── AMX.md │ ├── DeepseekR1_V3_tutorial.md │ ├── Docker.md │ ├── Docker_xpu.md │ ├── FAQ.md │ ├── ROCm.md │ ├── V3-success.md │ ├── api │ │ └── server │ │ │ ├── api.md │ │ │ ├── run-tabby.png │ │ │ ├── server-arch.png │ 
│ │ ├── server.md │ │ │ ├── tabby.md │ │ │ ├── visit-api-tags.png │ │ │ └── website.md │ ├── balance-serve.md │ ├── benchmark.md │ ├── deepseek-v2-injection.md │ ├── fp8_kernel.md │ ├── injection_tutorial.md │ ├── install.md │ ├── llama4.md │ ├── long_context_introduction.md │ ├── long_context_tutorial.md │ ├── makefile_usage.md │ ├── multi-gpu-tutorial.md │ ├── operators │ │ ├── Combined_MoE_time_per_layer.png │ │ ├── Linear_projection_time.png │ │ └── llamafile.md │ └── xpu.md └── zh │ ├── DeepseekR1_V3_tutorial_zh.md │ └── api │ └── server │ ├── api.md │ ├── run-tabby.png │ ├── server-arch.png │ ├── server.md │ ├── tabby.md │ ├── visit-api-tags.png │ └── website.md ├── install-with-cache.sh ├── install.bat ├── install.sh ├── ktransformers ├── __init__.py ├── configs │ ├── config.yaml │ └── log_config.ini ├── ktransformers_ext │ ├── operators │ │ └── custom_marlin │ │ │ └── quantize │ │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── format_24.py │ │ │ ├── marlin_24_perms.py │ │ │ ├── marlin_perms.py │ │ │ ├── marlin_utils.py │ │ │ └── quant_utils.py │ └── triton │ │ └── fp8gemm.py ├── local_chat.py ├── local_chat_test.py ├── models │ ├── __init__.py │ ├── configuration_deepseek.py │ ├── configuration_deepseek_v3.py │ ├── configuration_llama.py │ ├── configuration_qwen2_moe.py │ ├── configuration_qwen3_moe.py │ ├── custom_cache.py │ ├── custom_modeling_deepseek_v2.py │ ├── custom_modeling_deepseek_v3.py │ ├── custom_modeling_qwen2_moe.py │ ├── custom_modeling_qwen3_moe.py │ ├── modeling_deepseek.py │ ├── modeling_deepseek_v3.py │ ├── modeling_llama.py │ ├── modeling_mixtral.py │ ├── modeling_qwen2_moe.py │ └── modeling_qwen3_moe.py ├── operators │ ├── RoPE.py │ ├── __init__.py │ ├── attention.py │ ├── balance_serve_attention.py │ ├── base_operator.py │ ├── cpuinfer.py │ ├── dynamic_attention.py │ ├── experts.py │ ├── flashinfer_batch_prefill_wrapper.py │ ├── flashinfer_wrapper.py │ ├── gate.py │ ├── layernorm.py │ ├── linear.py │ ├── mlp.py │ ├── models.py │ ├── 
triton_attention.py │ └── triton_attention_prefill.py ├── optimize │ ├── optimize.py │ └── optimize_rules │ │ ├── DeepSeek-V2-Chat-multi-gpu-4.yaml │ │ ├── DeepSeek-V2-Chat-multi-gpu.yaml │ │ ├── DeepSeek-V2-Chat.yaml │ │ ├── DeepSeek-V2-Lite-Chat-gpu-cpu.yaml │ │ ├── DeepSeek-V2-Lite-Chat-multi-gpu.yaml │ │ ├── DeepSeek-V2-Lite-Chat.yaml │ │ ├── DeepSeek-V3-Chat-amx.yaml │ │ ├── DeepSeek-V3-Chat-fp8-linear-ggml-experts-serve-amx.yaml │ │ ├── DeepSeek-V3-Chat-fp8-linear-ggml-experts-serve.yaml │ │ ├── DeepSeek-V3-Chat-fp8-linear-ggml-experts.yaml │ │ ├── DeepSeek-V3-Chat-multi-gpu-4.yaml │ │ ├── DeepSeek-V3-Chat-multi-gpu-8.yaml │ │ ├── DeepSeek-V3-Chat-multi-gpu-fp8-linear-ggml-experts.yaml │ │ ├── DeepSeek-V3-Chat-multi-gpu-marlin.yaml │ │ ├── DeepSeek-V3-Chat-multi-gpu.yaml │ │ ├── DeepSeek-V3-Chat-serve.yaml │ │ ├── DeepSeek-V3-Chat.yaml │ │ ├── Internlm2_5-7b-Chat-1m.yaml │ │ ├── Mixtral.yaml │ │ ├── Moonlight-16B-A3B-serve.yaml │ │ ├── Moonlight-16B-A3B.yaml │ │ ├── Qwen2-57B-A14B-Instruct-multi-gpu.yaml │ │ ├── Qwen2-57B-A14B-Instruct.yaml │ │ ├── Qwen2-serve-amx.yaml │ │ ├── Qwen2-serve.yaml │ │ ├── Qwen3Moe-serve-amx.yaml │ │ ├── Qwen3Moe-serve.yaml │ │ ├── rocm │ │ └── DeepSeek-V3-Chat.yaml │ │ └── xpu │ │ ├── DeepSeek-V2-Chat.yaml │ │ ├── DeepSeek-V3-Chat.yaml │ │ └── Qwen3Moe-Chat.yaml ├── server │ ├── __init__.py │ ├── api │ │ ├── __init__.py │ │ ├── ollama │ │ │ ├── __init__.py │ │ │ └── completions.py │ │ ├── openai │ │ │ ├── __init__.py │ │ │ ├── assistants │ │ │ │ ├── __init__.py │ │ │ │ ├── assistants.py │ │ │ │ ├── messages.py │ │ │ │ ├── runs.py │ │ │ │ └── threads.py │ │ │ ├── endpoints │ │ │ │ ├── __init__.py │ │ │ │ └── chat.py │ │ │ └── legacy │ │ │ │ ├── __init__.py │ │ │ │ └── completions.py │ │ └── web │ │ │ ├── __init__.py │ │ │ └── system.py │ ├── args.py │ ├── backend │ │ ├── __init__.py │ │ ├── args.py │ │ ├── base.py │ │ ├── context_manager.py │ │ └── interfaces │ │ │ ├── __init__.py │ │ │ ├── balance_serve.py │ │ │ ├── exllamav2.py 
│ │ │ ├── ktransformers.py │ │ │ └── transformers.py │ ├── balance_serve │ │ ├── inference │ │ │ ├── __init__.py │ │ │ ├── config.py │ │ │ ├── distributed │ │ │ │ ├── __init__.py │ │ │ │ ├── communication_op.py │ │ │ │ ├── cuda_wrapper.py │ │ │ │ ├── custom_all_reduce.py │ │ │ │ ├── custom_all_reduce_utils.py │ │ │ │ ├── parallel_state.py │ │ │ │ ├── pynccl.py │ │ │ │ ├── pynccl_wrapper.py │ │ │ │ └── utils.py │ │ │ ├── forward_batch.py │ │ │ ├── model_runner.py │ │ │ ├── query_manager.py │ │ │ └── sampling │ │ │ │ ├── penaltylib │ │ │ │ ├── __init__.py │ │ │ │ ├── orchestrator.py │ │ │ │ └── penalizers │ │ │ │ │ ├── frequency_penalty.py │ │ │ │ │ ├── min_new_tokens.py │ │ │ │ │ ├── presence_penalty.py │ │ │ │ │ └── repetition_penalty.py │ │ │ │ └── sampler.py │ │ ├── sched_rpc.py │ │ └── settings.py │ ├── config │ │ ├── config.py │ │ ├── log.py │ │ └── singleton.py │ ├── crud │ │ ├── __init__.py │ │ └── assistants │ │ │ ├── __init__.py │ │ │ ├── assistants.py │ │ │ ├── messages.py │ │ │ ├── runs.py │ │ │ └── threads.py │ ├── exceptions.py │ ├── main.py │ ├── models │ │ ├── __init__.py │ │ └── assistants │ │ │ ├── __init__.py │ │ │ ├── assistants.py │ │ │ ├── messages.py │ │ │ ├── run_steps.py │ │ │ ├── runs.py │ │ │ └── threads.py │ ├── requirements.txt │ ├── schemas │ │ ├── __init__.py │ │ ├── assistants │ │ │ ├── __init__.py │ │ │ ├── assistants.py │ │ │ ├── messages.py │ │ │ ├── runs.py │ │ │ ├── streaming.py │ │ │ ├── threads.py │ │ │ └── tool.py │ │ ├── base.py │ │ ├── conversation.py │ │ ├── endpoints │ │ │ └── chat.py │ │ └── legacy │ │ │ ├── __init__.py │ │ │ └── completions.py │ └── utils │ │ ├── __init__.py │ │ ├── create_interface.py │ │ ├── multi_timer.py │ │ └── sql_utils.py ├── tests │ ├── .gitignore │ ├── AIME_2024 │ │ ├── eval_api.py │ │ ├── evaluation.py │ │ └── prompts.py │ ├── dequant_gpu.py │ ├── dequant_gpu_t.py │ ├── function_call_test.py │ ├── humaneval │ │ ├── eval_api.py │ │ ├── evaluation.py │ │ └── prompts.py │ ├── mmlu_pro_test.py │ ├── 
mmlu_test.py │ ├── mmlu_test_multi.py │ ├── score.py │ ├── test_client.py │ ├── test_pytorch_q8.py │ ├── test_speed.py │ └── triton_fp8gemm_test.py ├── util │ ├── cuda_graph_runner.py │ ├── custom_gguf.py │ ├── custom_loader.py │ ├── modeling_rope_utils.py │ ├── textstream.py │ ├── utils.py │ ├── vendors.py │ └── weight_loader.py └── website │ ├── .browserslistrc │ ├── .eslintrc.js │ ├── .gitignore │ ├── README.md │ ├── config.d.ts │ ├── jest.config.js │ ├── package-lock.json │ ├── package.json │ ├── public │ ├── balck.ico │ ├── config.js │ ├── css │ │ └── reset.css │ ├── images │ │ ├── assistant-avatar.png │ │ ├── avatar.png │ │ ├── bgbg.png │ │ ├── logo.ico │ │ ├── logo.png │ │ ├── three.png │ │ └── user-filling.png │ └── index.html │ ├── src │ ├── App.vue │ ├── api │ │ ├── api-client.ts │ │ ├── assistant.ts │ │ ├── message.ts │ │ ├── run.ts │ │ └── thread.ts │ ├── assets │ │ ├── css │ │ │ └── mixins.styl │ │ └── iconfont │ │ │ ├── demo.css │ │ │ ├── demo_index.html │ │ │ ├── iconfont.css │ │ │ ├── iconfont.js │ │ │ ├── iconfont.json │ │ │ ├── iconfont.svg │ │ │ ├── iconfont.ttf │ │ │ ├── iconfont.woff │ │ │ └── iconfont.woff2 │ ├── components │ │ └── chat │ │ │ └── index.vue │ ├── conf │ │ └── config.ts │ ├── locals │ │ ├── en.js │ │ ├── index.js │ │ └── zh.js │ ├── main.ts │ ├── router │ │ └── index.ts │ ├── shims-vue.d.ts │ ├── store │ │ └── index.ts │ ├── utils │ │ ├── copy.ts │ │ └── types.ts │ └── views │ │ └── home.vue │ ├── tests │ └── unit │ │ └── example.spec.ts │ ├── tsconfig.json │ └── vue.config.js ├── merge_tensors └── merge_safetensor_gguf.py ├── pyproject.toml ├── requirements-local_chat.txt ├── setup.py └── third_party ├── llamafile ├── README.md ├── bench.h ├── flags.cpp ├── flags.h ├── iqk_mul_mat.inc ├── iqk_mul_mat_amd_avx2.cpp ├── iqk_mul_mat_amd_zen4.cpp ├── iqk_mul_mat_arm82.cpp ├── macros.h ├── micros.h ├── numba.h ├── sgemm.cpp ├── sgemm.h ├── tinyblas_cpu.h ├── tinyblas_cpu_mixmul.inc ├── tinyblas_cpu_mixmul_amd_avx.cpp ├── 
tinyblas_cpu_mixmul_amd_avx2.cpp ├── tinyblas_cpu_mixmul_amd_avx512f.cpp ├── tinyblas_cpu_mixmul_amd_avxvnni.cpp ├── tinyblas_cpu_mixmul_amd_fma.cpp ├── tinyblas_cpu_mixmul_amd_zen4.cpp ├── tinyblas_cpu_mixmul_arm80.cpp ├── tinyblas_cpu_mixmul_arm82.cpp ├── tinyblas_cpu_sgemm.inc ├── tinyblas_cpu_sgemm_amd_avx.cpp ├── tinyblas_cpu_sgemm_amd_avx2.cpp ├── tinyblas_cpu_sgemm_amd_avx512f.cpp ├── tinyblas_cpu_sgemm_amd_avxvnni.cpp ├── tinyblas_cpu_sgemm_amd_fma.cpp ├── tinyblas_cpu_sgemm_amd_zen4.cpp ├── tinyblas_cpu_sgemm_arm80.cpp ├── tinyblas_cpu_sgemm_arm82.cpp └── tinyblas_cpu_unsupported.cpp └── nlohmann ├── json.hpp └── json_fwd.hpp /.devcontainer/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:2.5.1-cuda12.1-cudnn9-devel as compile_server 2 | WORKDIR /workspace 3 | ENV CUDA_HOME /usr/local/cuda 4 | RUN <> ~/.bashrc && \ 65 | echo "conda activate ktransformers" >> ~/.bashrc 66 | 67 | WORKDIR /ktransformers/ 68 | CMD ["bash"] 69 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | graft third_party 2 | graft ktransformers 3 | graft local_chat.py 4 | graft csrc 5 | include LICENSE README.md 6 | prune ktransformers/website 7 | prune ktransformers/logs 8 | prune ktransformers.egg-info 9 | prune third_party/llama.cpp/models 10 | graft ktransformers/website/dist 11 | global-exclude __pycache__ 12 | include KTransformersOps.*.so 13 | include cpuinfer_ext.*.so 14 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | flake_find: 2 | cd ktransformers && flake8 | grep -Eo '[A-Z][0-9]{3}' | sort | uniq| paste -sd ',' - 3 | format: 4 | @cd ktransformers && black . 
5 | @black setup.py 6 | dev_install: 7 | # clear build dirs 8 | rm -rf build 9 | rm -rf *.egg-info 10 | rm -rf ktransformers/ktransformers_ext/build 11 | rm -rf ktransformers/ktransformers_ext/cuda/build 12 | rm -rf ktransformers/ktransformers_ext/cuda/dist 13 | rm -rf ktransformers/ktransformers_ext/cuda/*.egg-info 14 | 15 | # install ktransformers 16 | echo "Installing python dependencies from requirements.txt" 17 | pip install -r requirements-local_chat.txt 18 | 19 | echo "Installing ktransformers" 20 | KTRANSFORMERS_FORCE_BUILD=TRUE pip install -e . -v --no-build-isolation 21 | echo "Installation completed successfully" 22 | clean: 23 | rm -rf build 24 | rm -rf *.egg-info 25 | rm -rf ktransformers/ktransformers_ext/build 26 | rm -rf ktransformers/ktransformers_ext/cuda/build 27 | rm -rf ktransformers/ktransformers_ext/cuda/dist 28 | rm -rf ktransformers/ktransformers_ext/cuda/*.egg-info 29 | install_numa: 30 | USE_NUMA=1 make dev_install 31 | install_no_numa: 32 | env -u USE_NUMA make dev_install -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Supported Versions 4 | 5 | Use this section to tell people about which versions of your project are 6 | currently being supported with security updates. 7 | 8 | | Version | Supported | 9 | | ------- | ------------------ | 10 | | 5.1.x | :white_check_mark: | 11 | | 5.0.x | :x: | 12 | | 4.0.x | :white_check_mark: | 13 | | < 4.0 | :x: | 14 | 15 | ## Reporting a Vulnerability 16 | 17 | Use this section to tell people how to report a vulnerability. 18 | 19 | Tell them where to go, how often they can expect to get an update on a 20 | reported vulnerability, what to expect if the vulnerability is accepted or 21 | declined, etc. 
22 | -------------------------------------------------------------------------------- /WeChatGroup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/WeChatGroup.png -------------------------------------------------------------------------------- /book.toml: -------------------------------------------------------------------------------- 1 | [book] 2 | authors = ["kvcache-ai"] 3 | language = "zh-CN" 4 | title = "Ktransformers" 5 | src = "doc" 6 | 7 | [output.html] 8 | git-repository-url = "https://github.com/kvcache-ai/ktransformers" 9 | edit-url-template = "https://github.com/kvcache-ai/ktransformers/edit/main/{path}" 10 | 11 | [output.html.playground] 12 | editable = true 13 | copy-js = true 14 | # line-numbers = true 15 | 16 | [output.html.fold] 17 | enable = true 18 | level = 0 -------------------------------------------------------------------------------- /csrc/balance_serve/kvc2/.clang-format: -------------------------------------------------------------------------------- 1 | Language: Cpp 2 | # 格式化风格,可以是LLVM, Google, Chromium, Mozilla, WebKit等,或者自定义 3 | BasedOnStyle: Google 4 | 5 | # 缩进设置 6 | IndentWidth: 2 7 | TabWidth: 2 8 | UseTab: Never 9 | 10 | # 换行相关设置 11 | BreakBeforeBraces: Attach 12 | AllowShortIfStatementsOnASingleLine: false 13 | AllowShortFunctionsOnASingleLine: Inline 14 | AllowShortLoopsOnASingleLine: false 15 | 16 | # 类与结构体 17 | DerivePointerAlignment: false 18 | PointerAlignment: Left 19 | 20 | # 包含文件的排序和分组 21 | IncludeBlocks: Preserve 22 | SortIncludes: true 23 | 24 | # 控制最大行宽 25 | ColumnLimit: 120 26 | -------------------------------------------------------------------------------- /csrc/balance_serve/kvc2/README.md: -------------------------------------------------------------------------------- 1 | # KVC2 2 | 3 | # Build 4 | 运行以下命令编译kvc2,注意可能需要 sudo 权限安装一些依赖 5 | ```shell 6 | git clone 
https://github.com/kvcache-ai/kvc2 7 | cd kvc2 8 | ./install_deps.sh 9 | mkdir build 10 | cd build 11 | cmake .. 12 | make -j && make install 13 | ``` 14 | 编译完成后会生成`build/output`,包含`kvc2_ext.cpython-312-x86_64-linux-gnu.so`和`kvc2_utils.py`方便调用。 15 | 16 | 25 | 26 | # Troubleshooting 27 | 在 Python 环境运行时,可能需要在 conda 中安装相关的依赖。 28 | ```shell 29 | conda install -c conda-forge gcc_linux-64 gxx_linux-64 30 | ``` 31 | 32 | 也可以尝试设置一下环境变量,然后再运行。 33 | ```shell 34 | export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH 35 | export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libffi.so.7 36 | ``` 37 | 38 | 39 | -------------------------------------------------------------------------------- /csrc/balance_serve/kvc2/config/model_configs.json: -------------------------------------------------------------------------------- 1 | { 2 | "DeepSeek-Coder-V2-Instruct": { 3 | "hidden_size": 5120, 4 | "intermediate_size": 12288, 5 | "max_position_embeddings": 163840, 6 | "model_type": "deepseek_v2", 7 | "num_attention_heads": 128, 8 | "num_hidden_layers": 60, 9 | "num_key_value_heads": 128, 10 | "vocab_size": 102400 11 | }, 12 | "LLaMA-2-7B-32K": { 13 | "hidden_size": 4096, 14 | "intermediate_size": 11008, 15 | "max_position_embeddings": 32768, 16 | "model_type": "llama", 17 | "num_attention_heads": 32, 18 | "num_hidden_layers": 32, 19 | "num_key_value_heads": 32, 20 | "vocab_size": 32000 21 | }, 22 | "Qwen2.5-7B-Instruct": { 23 | "hidden_size": 3584, 24 | "intermediate_size": 18944, 25 | "max_position_embeddings": 32768, 26 | "model_type": "qwen2", 27 | "num_attention_heads": 28, 28 | "num_hidden_layers": 28, 29 | "num_key_value_heads": 4, 30 | "vocab_size": 152064 31 | }, 32 | "qwen2-72b-instruct": { 33 | "hidden_size": 8192, 34 | "intermediate_size": 29568, 35 | "max_position_embeddings": 32768, 36 | "model_type": "qwen2", 37 | "num_attention_heads": 64, 38 | "num_hidden_layers": 80, 39 | "num_key_value_heads": 8, 40 | "vocab_size": 152064 41 | } 42 | } 
-------------------------------------------------------------------------------- /csrc/balance_serve/kvc2/config/quant_configs.json: -------------------------------------------------------------------------------- 1 | { 2 | "BF16": { 3 | "block_element_count": 1, 4 | "block_element_size": 2, 5 | "bytes_per_element": 2.0, 6 | "can_be_used_as_vector": true, 7 | "has_min": false, 8 | "has_scale": false, 9 | "name": "BF16", 10 | "reference": "", 11 | "type_of_dot_vector": "BF16" 12 | }, 13 | "FP16": { 14 | "block_element_count": 1, 15 | "block_element_size": 2, 16 | "bytes_per_element": 2.0, 17 | "can_be_used_as_vector": true, 18 | "has_min": false, 19 | "has_scale": false, 20 | "name": "FP16", 21 | "reference": "", 22 | "type_of_dot_vector": "FP16" 23 | }, 24 | "FP32": { 25 | "block_element_count": 1, 26 | "block_element_size": 4, 27 | "bytes_per_element": 4.0, 28 | "can_be_used_as_vector": true, 29 | "has_min": false, 30 | "has_scale": false, 31 | "name": "FP32", 32 | "reference": "", 33 | "type_of_dot_vector": "FP32" 34 | }, 35 | "Q4_0": { 36 | "block_element_count": 32, 37 | "block_element_size": 18, 38 | "bytes_per_element": 0.5625, 39 | "can_be_used_as_vector": false, 40 | "has_min": false, 41 | "has_scale": true, 42 | "name": "Q4_0", 43 | "reference": "https://huggingface.co/docs/hub/gguf", 44 | "type_of_dot_vector": "Q8_0" 45 | }, 46 | "Q8_0": { 47 | "block_element_count": 32, 48 | "block_element_size": 34, 49 | "bytes_per_element": 1.0625, 50 | "can_be_used_as_vector": true, 51 | "has_min": false, 52 | "has_scale": true, 53 | "name": "Q8_0", 54 | "reference": "https://huggingface.co/docs/hub/gguf", 55 | "type_of_dot_vector": "Q8_0" 56 | } 57 | } -------------------------------------------------------------------------------- /csrc/balance_serve/kvc2/export_envs_before_run.sh: -------------------------------------------------------------------------------- 1 | export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH 2 | export 
LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libffi.so.7 3 | -------------------------------------------------------------------------------- /csrc/balance_serve/kvc2/install_deps.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd "${0%/*}" 4 | git submodule update --init --recursive 5 | 6 | sudo apt update 7 | sudo apt install libtbb-dev 8 | sudo apt install libcurl4-openssl-dev 9 | sudo apt install libaio-dev 10 | 11 | cd third_party/xxHash/ 12 | make -j 13 | sudo make install 14 | cd ../.. 15 | 16 | -------------------------------------------------------------------------------- /csrc/balance_serve/kvc2/mkfs.sh: -------------------------------------------------------------------------------- 1 | sudo umount /mnt/xwy 2 | sudo mkfs.xfs /dev/nvme0n1 -f 3 | sudo mount /dev/nvme0n1 /mnt/xwy 4 | sudo chown -R xwy /mnt/xwy/ -------------------------------------------------------------------------------- /csrc/balance_serve/kvc2/src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include_directories(${THIRD_PARTY_DIR}/asyncio/include) 2 | 3 | add_library(kvc2_metrics STATIC metrics.cpp) 4 | target_link_libraries(kvc2_metrics PUBLIC prometheus-cpp::pull) 5 | 6 | add_library(page_aligned_memory_pool page_aligned_memory_pool.cpp) 7 | target_include_directories(page_aligned_memory_pool PRIVATE ${THIRD_PARTY_DIR}/spdlog/include) 8 | 9 | function(add_third_party_includes TARGET_NAME) 10 | target_include_directories(${TARGET_NAME} PRIVATE 11 | ${THIRD_PARTY_BUILD_DIR}/prometheus-cpp/core/include 12 | ${THIRD_PARTY_BUILD_DIR}/prometheus-cpp/pull/include 13 | ${THIRD_PARTY_DIR}/prometheus-cpp/core/include 14 | ${THIRD_PARTY_DIR}/prometheus-cpp/pull/include 15 | ${THIRD_PARTY_DIR}/spdlog/include 16 | ) 17 | endfunction() 18 | 19 | 20 | add_library(cache_entry cache_entry.cpp) 21 | add_third_party_includes(cache_entry) 22 | target_link_libraries(cache_entry PUBLIC gpu_cache) 
23 | 24 | add_library(gpu_cache gpu_cache.cpp) 25 | add_third_party_includes(gpu_cache) 26 | target_link_libraries(gpu_cache PUBLIC xxHash::xxhash ${TORCH_LIBRARIES} cuda_stream_manager) 27 | 28 | add_library(kvc2 prefix.cpp) 29 | target_include_directories(kvc2 PRIVATE ${THIRD_PARTY_DIR}/nlohmann/single_include) 30 | add_third_party_includes(kvc2) 31 | target_link_libraries(kvc2 PUBLIC TBB::tbb xxHash::xxhash cache_entry cuda_stream_manager page_aligned_memory_pool ${TORCH_LIBRARIES} prometheus-cpp::pull kvc2_metrics) 32 | 33 | message(STATUS "CMAKE_SOURCE_DIR: " ${CMAKE_SOURCE_DIR}) 34 | add_library(async_store async_store.cpp) 35 | target_include_directories(async_store PRIVATE ${THIRD_PARTY_DIR}/nlohmann/single_include) 36 | target_include_directories(async_store PRIVATE ${THIRD_PARTY_DIR}/spdlog/include) 37 | target_link_libraries(async_store PUBLIC pthread) 38 | 39 | 40 | 41 | add_library(cuda_stream_manager cuda_stream_manager.cpp) 42 | target_include_directories(cuda_stream_manager PUBLIC ${THIRD_PARTY_DIR}/nlohmann/single_include) 43 | target_include_directories(cuda_stream_manager PUBLIC ${THIRD_PARTY_DIR}/spdlog/include) 44 | target_include_directories(cuda_stream_manager PUBLIC ${CUDAToolkit_INCLUDE_DIRS}) 45 | target_link_libraries(cuda_stream_manager PUBLIC CUDA::cudart) 46 | -------------------------------------------------------------------------------- /csrc/balance_serve/kvc2/src/async_store.hh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | #define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG 6 | #define FMT_HEADER_ONLY 7 | #include "spdlog/spdlog.h" 8 | 9 | #include "io_helper.hpp" 10 | 11 | namespace async_store { 12 | 13 | struct ArrayStore; 14 | 15 | ArrayStore* create_or_open_store(size_t element_size, size_t size, std::filesystem::path data_path); 16 | void close_store(ArrayStore* store); 17 | size_t capacity(ArrayStore* store); 18 | void extend(ArrayStore* 
store, size_t to); 19 | 20 | 21 | 22 | struct IORequest { 23 | ArrayStore* store; 24 | bool write; 25 | void* data; 26 | size_t index; 27 | 28 | // for sync 29 | bool need_promise = false; 30 | BatchPromise* promise; 31 | }; 32 | 33 | std::string request_to_string(IORequest* req); 34 | 35 | struct IODealerImpl; 36 | struct IODealer { 37 | IODealerImpl* io_impl; 38 | 39 | IODealer(bool use_io_uring = false, int IO_DEPTH = 128); 40 | ~IODealer(); 41 | IODealer(const IODealer&) = delete; 42 | IODealer& operator=(const IODealer&) = delete; 43 | IODealer(IODealer&&) = default; 44 | IODealer& operator=(IODealer&&) = default; 45 | 46 | void enqueue(std::shared_ptr req); 47 | std::thread start_io_thread(); 48 | void stop(); 49 | }; 50 | 51 | } // namespace async_store 52 | -------------------------------------------------------------------------------- /csrc/balance_serve/kvc2/src/common.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/csrc/balance_serve/kvc2/src/common.h -------------------------------------------------------------------------------- /csrc/balance_serve/kvc2/src/cuda_stream_manager.hh: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: Xie Weiyu ervinxie@qq.com 3 | * @Date: 2024-11-19 09:24:47 4 | * @LastEditors: Xie Weiyu ervinxie@qq.com 5 | * @LastEditTime: 2024-11-20 02:55:49 6 | * @FilePath: /kvc2/src/cuda_stream_manager.hh 7 | * @Description: 这是默认设置,请设置`customMade`, 打开koroFileHeader查看配置 进行设置: https://github.com/OBKoro1/koro1FileHeader/wiki/%E9%85%8D%E7%BD%AE 8 | */ 9 | #pragma once 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include "utils/mpsc.hpp" 18 | 19 | class CudaStreamManager { 20 | public: 21 | // 构造函数,接受要使用的设备 ID 列表和每个设备的流数量 22 | CudaStreamManager(const std::vector& device_ids, int num_streams_per_device); 
23 | ~CudaStreamManager(); 24 | 25 | // 请求结构体 26 | struct Request { 27 | bool should_exit = false; 28 | int device_id; 29 | std::vector host_mem_addresses; 30 | std::vector device_mem_addresses; 31 | std::vector sizes; 32 | cudaMemcpyKind direction; 33 | std::function callback; 34 | }; 35 | 36 | void submitRequest(std::shared_ptr request); 37 | 38 | private: 39 | // 每个设备的信息 40 | struct DeviceInfo { 41 | int device_id; 42 | std::thread worker_thread; 43 | std::vector streams; 44 | int next_stream_index; 45 | MPSCQueueConsumerLock> request_queue; 46 | std::atomic_bool stop_flag; 47 | }; 48 | 49 | // 设备 ID 到 DeviceInfo 的映射 50 | std::vector> devices_; 51 | 52 | // 私有方法 53 | void deviceWorker(DeviceInfo& device_info); 54 | }; 55 | -------------------------------------------------------------------------------- /csrc/balance_serve/kvc2/src/defs.h: -------------------------------------------------------------------------------- 1 | #ifndef __DEFS_H_ 2 | #define __DEFS_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include "model_config.h" 8 | 9 | namespace kvc2 { 10 | using kvc2_ptr = void*; 11 | // using data_block_ptr = std::intptr_t; 12 | using data_block_ptr = void*; 13 | using layer_data = std::vector; 14 | using kvc2_handle = void*; 15 | 16 | using Token = uint32_t; 17 | using Tokens = std::vector; 18 | using TokenPtr = std::intptr_t; 19 | using TokenLength = size_t; 20 | using BlockLength = size_t; 21 | 22 | struct CacheInfo { 23 | ModelName model_name; 24 | bool is_key_cache; 25 | QuantType quant_type; 26 | 27 | size_t hidden_layer_count(); 28 | std::filesystem::path path(std::optional which_layer = std::nullopt); 29 | bool operator==(const CacheInfo& other) const; 30 | size_t element_size(size_t block_length); 31 | size_t hash_value() const; 32 | }; 33 | 34 | }; // namespace kvc2 35 | #endif 36 | -------------------------------------------------------------------------------- /csrc/balance_serve/kvc2/src/hasher.hpp: 
import torch
import ctypes

# Lazily-created handle to the process's C library. Creating it on first use
# (instead of at import time) keeps `import kvc2_utils` side-effect free, as
# it was before.
_LIBC = None


def _libc():
    """Return a cached libc handle with explicit prototypes for the calls we use.

    Declaring ``argtypes``/``restype`` makes ctypes pass pointers at full width
    even if a caller hands in a plain integer address, rather than relying on
    the default C ``int`` conversion.
    """
    global _LIBC
    if _LIBC is None:
        lib = ctypes.CDLL(None)
        lib.posix_memalign.argtypes = [
            ctypes.POINTER(ctypes.c_void_p),
            ctypes.c_size_t,
            ctypes.c_size_t,
        ]
        lib.posix_memalign.restype = ctypes.c_int
        lib.free.argtypes = [ctypes.c_void_p]
        lib.free.restype = None
        _LIBC = lib
    return _LIBC


def aligned_tensor(size, alignment=4096):
    """Allocate ``size`` bytes aligned to ``alignment`` and view them as an int8 tensor.

    Args:
        size: Number of bytes to allocate (also the number of tensor elements).
        alignment: Required address alignment in bytes, passed straight to
            ``posix_memalign``.

    Returns:
        ``(tensor, mem)`` where ``tensor`` is a 1-D ``torch.int8`` tensor built
        with ``torch.frombuffer`` over the allocation and ``mem`` is the
        ``ctypes.c_void_p`` holding its address. The tensor does not own the
        memory; release it with ``dealloc_aligned_cache`` (or libc ``free``)
        once the tensor is no longer used.

    Raises:
        MemoryError: ``posix_memalign`` returned a non-zero error code.
        ValueError: The resulting tensor's data pointer is not aligned.
    """
    libc = _libc()
    mem = ctypes.c_void_p()
    error_code = libc.posix_memalign(
        ctypes.byref(mem), ctypes.c_size_t(alignment), ctypes.c_size_t(size)
    )
    if error_code != 0:
        raise MemoryError(f"posix_memalign failed with error code {error_code}")

    try:
        array_type = ctypes.c_int8 * size
        raw_array = array_type.from_address(mem.value)
        tensor = torch.frombuffer(raw_array, dtype=torch.int8)
        if tensor.data_ptr() % alignment != 0:
            raise ValueError(f"Tensor data_ptr {tensor.data_ptr()} is not aligned to {alignment} bytes")
    except BaseException:
        # The allocation already succeeded; without this the buffer would be
        # leaked whenever wrapping or validation fails.
        libc.free(mem)
        raise

    return tensor, mem


def alloc_aligned_cache(layer_count, block_count, element_size):
    """Allocate a ``layer_count`` x ``block_count`` grid of 4096-byte-aligned buffers.

    Args:
        layer_count: Number of outer lists (layers).
        block_count: Number of buffers per layer.
        element_size: Size in bytes of every buffer.

    Returns:
        ``(cache, cache_mem)``: two nested lists of identical shape holding,
        respectively, the int8 tensors and the matching ``ctypes.c_void_p``
        handles to pass to ``dealloc_aligned_cache``.
    """
    cache = []
    cache_mem = []
    try:
        for _ in range(layer_count):
            layer_data = []
            layer_mem = []
            # Register the per-layer lists before filling them so the cleanup
            # path below can see buffers from a partially built layer.
            cache.append(layer_data)
            cache_mem.append(layer_mem)
            for _ in range(block_count):
                tensor, mem_ptr = aligned_tensor(element_size, alignment=4096)
                layer_data.append(tensor)
                layer_mem.append(mem_ptr)
    except BaseException:
        # Do not leak the buffers that were allocated before the failure.
        dealloc_aligned_cache(cache_mem)
        raise
    return cache, cache_mem


def dealloc_aligned_cache(cache_mem):
    """Free every buffer referenced by ``cache_mem``.

    Args:
        cache_mem: Nested list of allocation handles as returned by
            ``alloc_aligned_cache``.

    Each ``ctypes.c_void_p`` handle is reset to NULL after being freed, so
    calling this function again on the same structure is a harmless no-op
    instead of a double free. Tensors created over the freed buffers must not
    be used afterwards.
    """
    libc = _libc()
    for layer_mem in cache_mem:
        for mem_ptr in layer_mem:
            libc.free(mem_ptr)
            if isinstance(mem_ptr, ctypes.c_void_p):
                mem_ptr.value = None


def get_tensor_ptr(tensors):
    """Return the ``data_ptr()`` of every tensor, preserving the nested-list shape."""
    return [[data.data_ptr() for data in layer] for layer in tensors]


def get_tensor_from_data_ptr(matched_data, element_size):
    """Wrap raw addresses as int8 tensors of ``element_size`` bytes each.

    Args:
        matched_data: Nested list (layers of blocks) of integer addresses.
        element_size: Number of bytes exposed by each returned tensor.

    Returns:
        Nested list with the same shape as ``matched_data`` containing
        ``torch.int8`` tensors that alias the given memory (no copy is made,
        and the tensors do not own the memory).
    """
    array_type = ctypes.c_int8 * element_size
    re = []
    for layer in matched_data:
        re_layer = []
        for data_ptr in layer:
            raw_array = array_type.from_address(data_ptr)
            re_layer.append(torch.frombuffer(raw_array, dtype=torch.int8))
        re.append(re_layer)
    return re


if __name__ == "__main__":
    pass
16 | 17 | namespace kvc2 { 18 | 19 | // 指标前缀宏定义 20 | #define METRIC_PREFIX "kvc2" 21 | 22 | struct MetricsConfig { 23 | std::string endpoint; // 监听端点,如 "0.0.0.0:8080" 24 | }; 25 | 26 | class Metrics { 27 | public: 28 | // 构造函数传入 MetricsConfig 29 | Metrics(const MetricsConfig& config); 30 | ~Metrics(); 31 | 32 | // 禁止拷贝和赋值 33 | Metrics(const Metrics&) = delete; 34 | Metrics& operator=(const Metrics&) = delete; 35 | 36 | // 指标指针 37 | prometheus::Counter* prefix_nodes; 38 | prometheus::Counter* prefix_block_count; 39 | 40 | prometheus::Histogram* raw_insert_time_ms; 41 | prometheus::Histogram* lookup_time_ms; 42 | prometheus::Histogram* lookup_prefixmatch_length; 43 | prometheus::Histogram* matched_length_percentage; 44 | 45 | prometheus::Gauge* disk_usage; 46 | 47 | prometheus::Gauge* memory_pool_size(const std::string& type); 48 | prometheus::Gauge* memory_pool_node_count(const std::string& type); 49 | 50 | prometheus::Gauge* lru_entry_count(const std::string& type); 51 | prometheus::Gauge* gpu_page_count(std::string type); 52 | 53 | prometheus::Histogram* append_tokens_time_ms; 54 | prometheus::Histogram* gpu_flush_back_time_ms; 55 | prometheus::Histogram* cpu_flush_back_time_ms; 56 | 57 | private: 58 | std::shared_ptr registry_; 59 | prometheus::Exposer exposer_; 60 | 61 | prometheus::Family* memory_pool_size_family_; 62 | prometheus::Family* memory_pool_node_count_family_; 63 | prometheus::Family* lru_entry_count_family_; 64 | prometheus::Family* gpu_page_count_family_; 65 | }; 66 | 67 | class TimeObserver { 68 | public: 69 | TimeObserver(prometheus::Histogram* h); 70 | ~TimeObserver(); 71 | 72 | private: 73 | Timer timer_; 74 | prometheus::Histogram* histogram_; 75 | }; 76 | 77 | } // namespace kvc2 -------------------------------------------------------------------------------- /csrc/balance_serve/kvc2/src/page_aligned_memory_pool.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 
// std::sort 5 | #include 6 | #include 7 | #include // size_t 8 | #include // std::mutex 9 | #include 10 | 11 | constexpr size_t PageSize = 4096; 12 | 13 | /// PageAlignedMemoryPool 类的声明 14 | struct PageAlignedMemoryPool { 15 | private: 16 | constexpr static size_t Blocks = 16; 17 | 18 | void* data = nullptr; 19 | 20 | size_t total_size = 0, total_pages = 0; 21 | 22 | std::atomic_size_t now_block = 0; 23 | std::atomic_size_t allocated = 0; // allocated_size 24 | std::atomic_size_t alloc_count = 0; 25 | std::atomic_size_t free_count = 0; 26 | 27 | std::mutex lock[Blocks]; 28 | size_t page_per_block = 0; 29 | void* first_page[Blocks]; 30 | size_t count_page[Blocks]; 31 | std::vector bitmap[Blocks]; 32 | void* alloc_in_block(size_t block_index, size_t alloc_size); 33 | 34 | public: 35 | /// 构造函数和析构函数 36 | explicit PageAlignedMemoryPool(size_t size_in_bytes); 37 | ~PageAlignedMemoryPool(); 38 | 39 | /// 禁用拷贝和移动 40 | PageAlignedMemoryPool(PageAlignedMemoryPool&& other) = delete; 41 | PageAlignedMemoryPool& operator=(PageAlignedMemoryPool&& other) = delete; 42 | PageAlignedMemoryPool(const PageAlignedMemoryPool& other) = delete; 43 | PageAlignedMemoryPool& operator=(const PageAlignedMemoryPool& other) = delete; 44 | 45 | /// 成员函数 46 | size_t page_count(); 47 | size_t page_padded_size(size_t size); 48 | 49 | void* alloc(size_t size); 50 | std::vector alloc_multiple(size_t size, size_t count); 51 | void free(void* data, size_t size); 52 | void defragment(); 53 | std::string debug(); 54 | }; 55 | -------------------------------------------------------------------------------- /csrc/balance_serve/kvc2/src/utils/all.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "easy_format.hpp" 3 | #include "timer.hpp" -------------------------------------------------------------------------------- /csrc/balance_serve/kvc2/src/utils/arithmetic.hpp: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | template 5 | T div_up(T x, U by) { 6 | static_assert(std::is_integral_v); 7 | static_assert(std::is_integral_v); 8 | return (x + by - 1) / by; 9 | } 10 | 11 | template 12 | T* offset_by_bytes(T* t, size_t n) { 13 | return reinterpret_cast(reinterpret_cast(t) + n); 14 | } 15 | -------------------------------------------------------------------------------- /csrc/balance_serve/kvc2/src/utils/easy_format.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __EASY_FORMAT_HPP_ 2 | #define __EASY_FORMAT_HPP_ 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | template 11 | inline std::string format_vector(const std::vector& v) { 12 | std::ostringstream oss; 13 | if (v.empty()) 14 | return "[]"; 15 | for (size_t i = 0; i < v.size(); ++i) { 16 | oss << v[i]; 17 | if (i < v.size() - 1) 18 | oss << ", "; // 逗号分隔 19 | } 20 | return oss.str(); 21 | } 22 | 23 | inline std::array units = {"", "K", "M", "G", "T", "P", "E"}; 24 | 25 | inline std::string readable_number(size_t size) { 26 | size_t unit_index = 0; 27 | double readable_size = size; 28 | while (readable_size >= 1000 && unit_index < units.size() - 1) { 29 | readable_size /= 1000; 30 | unit_index++; 31 | } 32 | std::ostringstream ss; 33 | ss << std::fixed << std::setprecision(2) << readable_size; 34 | std::string str = ss.str(); 35 | return str + "" + units[unit_index]; 36 | } 37 | #endif -------------------------------------------------------------------------------- /csrc/balance_serve/kvc2/src/utils/lock_free_queue.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | template 9 | class MPSCQueue { 10 | struct Node { 11 | std::shared_ptr data; 12 | std::atomic next; 13 | 14 | Node() : next(nullptr) {} 15 | 
Node(std::shared_ptr data_) : data(std::move(data_)), next(nullptr) {} 16 | }; 17 | 18 | std::atomic head; 19 | Node* tail; 20 | 21 | public: 22 | std::atomic_size_t enqueue_count = 0; 23 | size_t dequeue_count = 0; 24 | MPSCQueue() { 25 | Node* dummy = new Node(); 26 | head.store(dummy, std::memory_order_relaxed); 27 | tail = dummy; 28 | } 29 | 30 | ~MPSCQueue() { 31 | // 清理剩余的节点 32 | Node* node = tail; 33 | while (node) { 34 | Node* next = node->next.load(std::memory_order_relaxed); 35 | delete node; 36 | node = next; 37 | } 38 | } 39 | 40 | // 生产者调用 41 | void enqueue(std::shared_ptr data) { 42 | enqueue_count.fetch_add(1); 43 | Node* node = new Node(std::move(data)); 44 | Node* prev_head = head.exchange(node, std::memory_order_acq_rel); 45 | prev_head->next.store(node, std::memory_order_release); 46 | } 47 | 48 | // 消费者调用 49 | std::shared_ptr dequeue() { 50 | Node* next = tail->next.load(std::memory_order_acquire); 51 | if (next) { 52 | std::shared_ptr res = std::move(next->data); 53 | delete tail; 54 | tail = next; 55 | dequeue_count += 1; 56 | return res; 57 | } 58 | return nullptr; 59 | } 60 | }; -------------------------------------------------------------------------------- /csrc/balance_serve/kvc2/src/utils/mutex_extend.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __MUTEX_EXTEND_HPP_ 2 | #define __MUTEX_EXTEND_HPP_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | class non_recursive_mutex { 10 | public: 11 | non_recursive_mutex() = default; 12 | 13 | // 使用 try_lock 实现非递归锁 14 | bool try_lock() { 15 | std::thread::id this_id = std::this_thread::get_id(); 16 | 17 | // 检查当前线程是否已经持有该锁 18 | if (owner.load(std::memory_order_acquire) == this_id) { 19 | return false; // 如果是当前线程,返回失败 20 | } 21 | 22 | // 尝试加锁 23 | if (mtx.try_lock()) { 24 | owner.store(this_id, std::memory_order_release); // 设置锁的拥有者 25 | return true; 26 | } 27 | 28 | return false; 29 | } 30 | 31 | // lock 会阻塞,直到获得锁 32 | void lock() { 
33 | std::thread::id this_id = std::this_thread::get_id(); 34 | 35 | while (true) { 36 | // 检查当前线程是否已经持有该锁 37 | if (owner.load(std::memory_order_acquire) == this_id) { 38 | throw std::runtime_error("Thread is trying to lock a mutex it already holds"); 39 | } 40 | 41 | // 尝试加锁 42 | if (mtx.try_lock()) { 43 | owner.store(this_id, std::memory_order_release); // 设置锁的拥有者 44 | return; 45 | } 46 | 47 | // 如果锁未获得,则稍微等待,防止忙等 48 | std::this_thread::yield(); 49 | } 50 | } 51 | 52 | // 解锁 53 | void unlock() { 54 | std::thread::id this_id = std::this_thread::get_id(); 55 | 56 | // 确保只有持有锁的线程可以解锁 57 | if (owner.load(std::memory_order_acquire) == this_id) { 58 | owner.store(std::thread::id(), std::memory_order_release); // 清除锁的拥有者 59 | mtx.unlock(); 60 | } else { 61 | throw std::runtime_error("Thread attempting to unlock a mutex it doesn't own"); 62 | } 63 | } 64 | 65 | private: 66 | std::mutex mtx; // 实际的互斥量 67 | std::atomic owner; // 原子变量,记录当前锁的拥有者 68 | }; 69 | 70 | #endif 71 | -------------------------------------------------------------------------------- /csrc/balance_serve/kvc2/src/utils/spin_lock.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: Xie Weiyu ervinxie@qq.com 3 | * @Date: 2024-11-21 06:35:47 4 | * @LastEditors: Xie Weiyu ervinxie@qq.com 5 | * @LastEditTime: 2024-11-21 06:35:50 6 | * @FilePath: /kvc2/src/utils/spin_lock.hpp 7 | * @Description: 这是默认设置,请设置`customMade`, 打开koroFileHeader查看配置 进行设置: 8 | * https://github.com/OBKoro1/koro1FileHeader/wiki/%E9%85%8D%E7%BD%AE 9 | */ 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | class SpinLock { 16 | public: 17 | SpinLock() { flag.clear(); } 18 | 19 | void lock() { 20 | const int max_delay = 1024; // Maximum delay in microseconds 21 | int delay = 1; // Initial delay in microseconds 22 | 23 | while (flag.test_and_set(std::memory_order_acquire)) { 24 | std::this_thread::sleep_for(std::chrono::microseconds(delay)); 25 | delay *= 2; 26 | if (delay > max_delay) 
{ 27 | delay = max_delay; 28 | } 29 | } 30 | } 31 | 32 | void unlock() { flag.clear(std::memory_order_release); } 33 | 34 | private: 35 | std::atomic_flag flag = ATOMIC_FLAG_INIT; 36 | }; 37 | -------------------------------------------------------------------------------- /csrc/balance_serve/kvc2/test/hashmap_test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main() { 5 | tbb::concurrent_hash_map map; 6 | map.insert({1, 2}); 7 | decltype(map)::accessor a; 8 | std::cout << map.find(a, 1) << std::endl; 9 | 10 | return 0; 11 | } 12 | -------------------------------------------------------------------------------- /csrc/balance_serve/kvc2/test/kvc2test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | set(CMAKE_CXX_FLAGS "-Og -march=native -Wall -Wextra -g -fopenmp") 3 | 4 | function(add_kvc2_test source_file) 5 | get_filename_component(target_name ${source_file} NAME_WE) # 获取不带扩展名的文件名作为目标名 6 | add_executable(${target_name} ${source_file}) 7 | # target_compile_options(${target_name} PRIVATE -fopenmp -fno-strict-aliasing) 8 | target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../src) 9 | target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/nlohmann/single_include) 10 | target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/spdlog/include) 11 | target_link_libraries(${target_name} PRIVATE kvc2 async_store) 12 | endfunction() 13 | 14 | add_kvc2_test(raw_insert_read.cpp) 15 | add_kvc2_test(lookup.cpp) 16 | add_kvc2_test(lookup-alt.cpp) 17 | add_kvc2_test(lookup-alt-gpu.cpp) 18 | add_kvc2_test(lookup-mt.cpp) 19 | add_kvc2_test(lookup-gpu.cpp) 20 | add_kvc2_test(lookup-gpu-mt.cpp) 21 | add_kvc2_test(lookup-gpu-async.cpp) 22 | add_kvc2_test(append-tokens.cpp) 23 | add_kvc2_test(flush-back.cpp) 24 | 
add_kvc2_test(check-flush-back.cpp) 25 | add_kvc2_test(lookup-without-vcache.cpp) 26 | add_kvc2_test(lookup-gpu-mt-without-vcache.cpp) 27 | -------------------------------------------------------------------------------- /csrc/balance_serve/kvc2/test/kvc2test/check-flush-back.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "common.hpp" 3 | 4 | int main(int argc, char* argv[]) { 5 | init(argc, argv); 6 | spdlog::set_level(spdlog::level::debug); 7 | config.gpu_cache_config->total_kvcache_pages = 12; 8 | auto kvc2 = kvc2::create_kvc2(config); 9 | kvc2->load(); 10 | // #pragma omp parallel for 11 | for (size_t ti = 0; ti < 2; ti++) { 12 | SPDLOG_WARN("Test {}", ti); 13 | auto [kcache, vcache] = kvc2->get_kvcache(); 14 | std::mt19937 gen(ti + 123); 15 | size_t total_page = 10; 16 | TokenLength total_length = total_page * config.num_token_per_page; 17 | auto tokens = random_ids(total_length, gen); 18 | auto k1 = random_kvcache(total_page, gen); 19 | auto v1 = random_kvcache(total_page, gen); 20 | 21 | { 22 | std::promise> p; 23 | kvc2->lookup_to_gpu_async(test_model_name, test_quant_type, tokens.data(), total_length, total_length, 24 | [&p](std::shared_ptr h) { p.set_value(h); }); 25 | auto fut = p.get_future(); 26 | fut.wait(); 27 | auto h = fut.get(); 28 | assert(h->matched_length() == total_length); 29 | size_t matched_block = h->matched_length() / config.num_token_per_page; 30 | auto block_idx = h->get_gpu_block_idx(); 31 | cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, matched_block); 32 | } 33 | } 34 | SPDLOG_CRITICAL("All Test Passed: {}", argv[0]); 35 | return 0; 36 | } 37 | -------------------------------------------------------------------------------- /csrc/balance_serve/kvc2/test/kvc2test/lookup-gpu-async.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @Description : 3 | * @Author : Xie Weiyu 4 | * @Date : 2024-11-22 09:52:48 5 | * @Version : 
1.0.0 6 | * @LastEditors : Xie Weiyu 7 | * @LastEditTime : 2024-11-25 07:51:09 8 | * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 9 | **/ 10 | 11 | #include 12 | #include "common.hpp" 13 | 14 | int main(int argc, char* argv[]) { 15 | init(argc, argv); 16 | spdlog::set_level(spdlog::level::debug); 17 | auto kvc2 = kvc2::create_kvc2(config); 18 | 19 | std::mt19937 gen(123); 20 | auto ids1 = random_ids(10 * config.num_token_per_page, gen); 21 | auto k1 = random_kvcache(10, gen); 22 | auto v1 = random_kvcache(10, gen); 23 | 24 | kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, v1); 25 | 26 | // complete same 27 | #pragma omp parallel for 28 | for (size_t ti = 0; ti < 3; ti++) { 29 | std::promise> p; 30 | kvc2->lookup_to_gpu_async(test_model_name, test_quant_type, ids1.data(), ids1.size(), 31 | ids1.size() + 2 * config.num_token_per_page, 32 | [&p](std::shared_ptr h) { p.set_value(h); }); 33 | auto fut = p.get_future(); 34 | fut.wait(); 35 | auto h = fut.get(); 36 | auto k = h->handle_data(true); 37 | auto v = h->handle_data(false); 38 | cmp_handle_data(k1, k, 10); 39 | cmp_handle_data(v1, v, 10); 40 | 41 | auto block_idx = h->get_gpu_block_idx(); 42 | auto [kcache, vcache] = kvc2->get_kvcache(); 43 | 44 | cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, 10); 45 | } 46 | 47 | SPDLOG_CRITICAL("All Test Passed: {}", argv[0]); 48 | return 0; 49 | } 50 | -------------------------------------------------------------------------------- /csrc/balance_serve/kvc2/test/kvc2test/lookup-gpu-mt-without-vcache.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @Description : 3 | * @Author : Xie Weiyu 4 | * @Date : 2024-11-22 09:52:48 5 | * @Version : 1.0.0 6 | * @LastEditors : Xie Weiyu 7 | * @LastEditTime : 2024-11-25 07:51:09 8 | * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
9 | **/ 10 | 11 | #include "common.hpp" 12 | 13 | int main(int argc, char* argv[]) { 14 | qw25_7B_gpu_config.v_cache_on = false; 15 | config.gpu_cache_config = qw25_7B_gpu_config; 16 | config.v_cache_on = false; 17 | 18 | init(argc, argv); 19 | spdlog::set_level(spdlog::level::debug); 20 | auto kvc2 = kvc2::create_kvc2(config); 21 | 22 | std::mt19937 gen(123); 23 | auto ids1 = random_ids(10 * config.num_token_per_page, gen); 24 | auto k1 = random_kvcache(10, gen); 25 | 26 | kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, {}); 27 | 28 | // complete same 29 | #pragma omp parallel for 30 | for (size_t ti = 0; ti < 3; ti++) { 31 | auto h = kvc2->lookup_to_gpu(test_model_name, test_quant_type, ids1.data(), ids1.size(), 32 | ids1.size() + 2 * config.num_token_per_page); 33 | auto k = h->handle_data(true); 34 | cmp_handle_data(k1, k, 10); 35 | 36 | auto block_idx = h->get_gpu_block_idx(); 37 | auto [kcache, vcache] = kvc2->get_kvcache(); 38 | 39 | auto k_from_gpu = empty_kvcache(15); 40 | 41 | size_t gpu_count = config.gpu_cache_config->gpu_devices_id.size(); 42 | size_t element_size_per_gpu = test_cache_info.element_size(config.num_token_per_page) / gpu_count; 43 | for (size_t i = 0; i < k_from_gpu.size(); i++) { 44 | for (size_t j = 0; j < block_idx.size(); j++) { 45 | size_t b_idx = block_idx[j]; 46 | for (size_t gpu_idx = 0; gpu_idx < gpu_count; gpu_idx++) { 47 | { 48 | auto kt = kcache[gpu_idx][i][b_idx].to(torch::kCPU); 49 | void* src = kt.data_ptr(); 50 | void* dst = offset_by_bytes(k_from_gpu[i][j], gpu_idx * element_size_per_gpu); 51 | memcpy(dst, src, element_size_per_gpu); 52 | } 53 | } 54 | } 55 | } 56 | cmp_handle_data(k1, k_from_gpu, 10); 57 | } 58 | 59 | SPDLOG_CRITICAL("All Test Passed: {}", argv[0]); 60 | return 0; 61 | } 62 | -------------------------------------------------------------------------------- /csrc/balance_serve/kvc2/test/kvcache_mem_eviction_test.cpp: 
-------------------------------------------------------------------------------- 1 | #include "kvcache_test_utils.cpp" 2 | 3 | int main(int argc, char* argv[]) { 4 | parse_and_check(argc, argv); 5 | spdlog::set_level(spdlog::level::debug); 6 | std::mt19937 gen(123); 7 | 8 | KVC2 kvc2(FLAGS_disk_cache_path); 9 | auto io = kvc2.io_dealer->start_io_thread(); 10 | 11 | SPDLOG_WARN("Insert 10 x 10 KVCache"); 12 | std::vector handles(10); 13 | for (int i = 0; i < 10; i++) { 14 | handles[i] = random_kvcache(qwen_cache_info, 10, gen); 15 | auto& h1 = handles[i]; 16 | h1.ids = random_ids(10 * BlockLength, gen); 17 | kvc2.raw_insert(h1); 18 | } 19 | 20 | SPDLOG_WARN("Cache Eviction Test"); 21 | { 22 | for (int i = 0; i < 10; i++) { 23 | auto& h = handles[i]; 24 | SPDLOG_WARN("Lookup {}", i); 25 | auto x = kvc2.lookup(qwen_cache_info, h.ids.data(), h.ids.size()); 26 | cmp_handle_data(h, *x); 27 | } 28 | SPDLOG_WARN("Simple Eviction OK"); 29 | } 30 | 31 | { 32 | std::vector> lookup_handles; 33 | for (int i = 0; i < 10; i++) { 34 | auto& h = handles[i]; 35 | SPDLOG_WARN("Lookup {}", i); 36 | auto x = kvc2.lookup(qwen_cache_info, h.ids.data(), h.ids.size()); 37 | if (i >= 5) { 38 | assert(x == nullptr); 39 | continue; 40 | } 41 | lookup_handles.push_back(x); 42 | cmp_handle_data(h, *x); 43 | } 44 | SPDLOG_WARN("Cannot Eviction OK"); 45 | } 46 | 47 | kvc2.io_dealer->stop(); 48 | io.join(); 49 | 50 | SPDLOG_WARN("{} Test Passed", __FILE__); 51 | return 0; 52 | } -------------------------------------------------------------------------------- /csrc/balance_serve/kvc2/test/kvcache_test_utils.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/csrc/balance_serve/kvc2/test/kvcache_test_utils.cpp -------------------------------------------------------------------------------- /csrc/balance_serve/kvc2/test/page_pool_test.cpp: 
-------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "page_aligned_memory_pool.cpp" 8 | 9 | #define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG 10 | #define FMT_HEADER_ONLY 11 | #include "spdlog/spdlog.h" 12 | 13 | // 每个线程执行的任务 14 | void thread_task(PageAlignedMemoryPool& pool) { 15 | std::mt19937 gen(123); 16 | std::vector> allocated; 17 | size_t cnt = 40000; 18 | for (size_t i = 0; i < cnt; ++i) { 19 | // 随机分配一个大小 20 | size_t size = (gen() % 100 + 1) * 4096 * 4; 21 | void* ptr = pool.alloc(size); 22 | // SPDLOG_DEBUG(pool.debug()); 23 | if (ptr) { 24 | pool.free(ptr, size); 25 | // allocated.push_back({ptr, size}); 26 | } 27 | // sleep((int)(gen() % 1000) / 1000.0); 28 | } 29 | // free all memory 30 | for (auto& p : allocated) { 31 | pool.free(p.first, p.second); 32 | } 33 | } 34 | 35 | int main(int argc, char* argv[]) { 36 | spdlog::set_level(spdlog::level::debug); 37 | 38 | // 创建一个内存池 39 | PageAlignedMemoryPool pool(40ll * 1024 * 1024 * 1024); // 40 G 40 | 41 | // 创建线程 42 | const int num_threads = 32; 43 | std::vector threads; 44 | for (int i = 0; i < num_threads; ++i) { 45 | threads.emplace_back(thread_task, std::ref(pool)); 46 | } 47 | 48 | // 等待所有线程完成 49 | for (auto& t : threads) { 50 | t.join(); 51 | } 52 | 53 | // 输出调试信息 54 | std::cout << pool.debug() << std::endl; 55 | 56 | return 0; 57 | } -------------------------------------------------------------------------------- /csrc/balance_serve/kvc2/test/prefix_test.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/csrc/balance_serve/kvc2/test/prefix_test.cpp -------------------------------------------------------------------------------- /csrc/balance_serve/kvc2/test/pytest_load.py: -------------------------------------------------------------------------------- 1 | import 
sys 2 | sys.path.append('./build') 3 | sys.path.append('./src') 4 | import torch 5 | import kvc2_ext 6 | from kvc2_utils import get_tensor_from_data_ptr 7 | 8 | # Create a kvc2 instance 9 | path = "/mnt/data/kvc2" 10 | kvc2_instance = kvc2_ext.create_kvc2(path,int(10e9)) # 10 G memory pool 11 | kvc2_ext.load(kvc2_instance) 12 | 13 | # Start IO thread 14 | print("Start IO thread") 15 | kvc2_ext.start_io_thread(kvc2_instance) 16 | print("IO thread started") 17 | 18 | # Create CacheInfoInput 19 | test_info = kvc2_ext.CacheInfoInput() 20 | test_info.model_type = kvc2_ext.ModelType.MT_DeepseekV2 21 | test_info.cache_type = kvc2_ext.CacheType.CT_KeyCache 22 | test_info.quant_type = kvc2_ext.QuantType.QT_F32 23 | 24 | print("Element size: ", test_info.element_size()) 25 | 26 | # Generate random test IDs (length = 2560) 27 | torch.manual_seed(123) 28 | length = 2560 29 | test_id = torch.randint(0, 65536, (length,), dtype=torch.uint16).contiguous() 30 | block_count = (length+255) // 256 31 | # print("Test ID: ", test_id) 32 | 33 | # Generate test data based on element size and hidden layer count 34 | element_size = test_info.element_size() 35 | hidden_layer_count = test_info.hidden_layer_count() 36 | 37 | def read_cmp_and_release(kvc2_instance,cache_info,ids,length): 38 | handle = kvc2_ext.lookup(kvc2_instance, cache_info, ids, length) 39 | if kvc2_ext.is_nullptr(handle): 40 | print("Handle is nullptr.") 41 | exit() 42 | matched_length = kvc2_ext.matched_length(handle) 43 | matched_data = kvc2_ext.handle_data(handle) 44 | print('Matched length: ', matched_length) 45 | if matched_length >0: 46 | print(f'First layer address {[hex(x) for x in matched_data[0]]}') 47 | read_data = get_tensor_from_data_ptr(matched_data,element_size) 48 | 49 | print("Just read check ok.") 50 | kvc2_ext.release(handle) 51 | 52 | 53 | l = 128 54 | while l<=length: 55 | read_cmp_and_release(kvc2_instance,test_info,test_id.data_ptr(),l) 56 | l+=128 57 | 58 | kvc2_ext.destroy_kvc2(kvc2_instance) 59 | 
60 | 61 | print("Test completed successfully.") 62 | -------------------------------------------------------------------------------- /csrc/balance_serve/kvc2/test/test_align.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | import torch 3 | 4 | def aligned_tensor(size, alignment=4096): 5 | num_bytes = size 6 | mem = ctypes.c_void_p() 7 | error_code = ctypes.CDLL(None).posix_memalign( 8 | ctypes.byref(mem), ctypes.c_size_t(alignment), ctypes.c_size_t(num_bytes) 9 | ) 10 | 11 | if error_code != 0: 12 | raise MemoryError(f"posix_memalign failed with error code {error_code}") 13 | 14 | array_type = (ctypes.c_int8 * size) 15 | raw_array = array_type.from_address(mem.value) 16 | 17 | tensor = torch.frombuffer(raw_array, dtype=torch.int8) 18 | 19 | if tensor.data_ptr() % alignment != 0: 20 | raise ValueError(f"Tensor data_ptr {tensor.data_ptr()} is not aligned to {alignment} bytes") 21 | 22 | return tensor, mem 23 | 24 | 25 | size = 5124380 26 | tensor, mem_ptr = aligned_tensor(size, alignment=4096) 27 | 28 | print(f"Tensor: {tensor}, size: {tensor.size()}, dataptr: {tensor.data_ptr()}") 29 | print(f"Tensor memory alignment: {tensor.data_ptr() % 4096 == 0}") 30 | print(f"Allocated memory address: {mem_ptr.value}") 31 | 32 | ctypes.CDLL(None).free(mem_ptr) 33 | -------------------------------------------------------------------------------- /csrc/balance_serve/kvc2/test/test_lock_free_queue.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "utils/lock_free_queue.hpp" 6 | 7 | struct Item { 8 | int value; 9 | std::promise promise; 10 | }; 11 | 12 | int main() { 13 | MPSCQueue queue; 14 | 15 | std::vector producers; 16 | const int num_producers = 4; 17 | const int items_per_producer = 5; 18 | 19 | // 启动生产者线程 20 | for (int i = 0; i < num_producers; ++i) { 21 | producers.emplace_back([&queue, i]() { 22 | for (int j = 0; j 
< items_per_producer; ++j) { 23 | auto item = std::make_shared(); 24 | item->value = i * items_per_producer + j; 25 | std::future future = item->promise.get_future(); 26 | queue.enqueue(item); 27 | future.wait(); // 等待消费者处理完成 28 | } 29 | }); 30 | } 31 | 32 | // 启动消费者线程 33 | std::thread consumer([&queue, num_producers, items_per_producer]() { 34 | int total_items = num_producers * items_per_producer; 35 | int processed = 0; 36 | while (processed < total_items) { 37 | std::shared_ptr item = queue.dequeue(); 38 | if (item) { 39 | std::cout << "Consumed item with value: " << item->value << std::endl; 40 | item->promise.set_value(); // 通知生产者 41 | ++processed; 42 | } else { 43 | // 如果队列为空,可以选择休眠或让出线程 44 | std::this_thread::yield(); 45 | } 46 | } 47 | }); 48 | 49 | // 等待所有线程完成 50 | for (auto& producer : producers) { 51 | producer.join(); 52 | } 53 | consumer.join(); 54 | 55 | return 0; 56 | } -------------------------------------------------------------------------------- /csrc/balance_serve/kvc2/test/test_queue_perf.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "utils/lock_free_queue.hpp" 4 | 5 | #define STDQ 6 | 7 | int main() { 8 | const int num_producers = 48; 9 | const int num_items = 1e6; 10 | 11 | #ifdef STDQ 12 | std::mutex lock; 13 | std::queue queue; 14 | #else 15 | MPSCQueue queue; 16 | #endif 17 | 18 | auto start_time = std::chrono::high_resolution_clock::now(); 19 | 20 | // Launch multiple producer threads 21 | std::vector producers; 22 | for (int i = 0; i < num_producers; ++i) { 23 | producers.emplace_back([&queue, i 24 | #ifdef STDQ 25 | , 26 | &lock 27 | #endif 28 | ]() { 29 | for (int j = 0; j < num_items; ++j) { 30 | #ifdef STDQ 31 | std::lock_guard guard(lock); 32 | queue.push(i * num_items + j); 33 | #else 34 | queue.enqueue(std::make_shared(i * num_items + j)); 35 | #endif 36 | } 37 | }); 38 | } 39 | 40 | // Consumer thread 41 | std::thread consumer([&queue, num_producers 
42 | #ifdef STDQ 43 | , 44 | &lock 45 | #endif 46 | ]() { 47 | int count = 0; 48 | while (count < num_producers * num_items) { 49 | #ifdef STDQ 50 | std::lock_guard guard(lock); 51 | if (!queue.empty()) { 52 | queue.pop(); 53 | count++; 54 | } 55 | #else 56 | if (auto item = queue.dequeue()) { 57 | count++; 58 | } 59 | #endif 60 | } 61 | }); 62 | 63 | // Wait for all producers to finish 64 | for (auto& producer : producers) { 65 | producer.join(); 66 | } 67 | 68 | // Wait for the consumer to finish 69 | consumer.join(); 70 | 71 | auto end_time = std::chrono::high_resolution_clock::now(); 72 | auto duration = std::chrono::duration_cast(end_time - start_time).count(); 73 | 74 | #ifdef STDQ 75 | std::cout << "std::queue with mutex "; 76 | #else 77 | std::cout << "lock free queue "; 78 | #endif 79 | 80 | std::cout << "Processed " << num_producers * num_items / 1e6 << "M items in " << duration << " milliseconds " 81 | << num_producers * num_items / 1e3 / duration << " MOps." << std::endl; 82 | 83 | return 0; 84 | } -------------------------------------------------------------------------------- /csrc/balance_serve/kvc2/test/test_std_list.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | int main() { 6 | std::vector v = {0, 1, 2, 3, 4, 5}; 7 | 8 | using RevIt = std::reverse_iterator::iterator>; 9 | 10 | const auto it = v.begin() + 3; 11 | RevIt r_it{it}; 12 | 13 | std::cout << "*it == " << *it << '\n' 14 | << "*r_it == " << *r_it << '\n' 15 | << "*r_it.base() == " << *r_it.base() << '\n' 16 | << "*(r_it.base()-1) == " << *(r_it.base() - 1) << '\n'; 17 | 18 | RevIt r_end{v.begin()}; 19 | RevIt r_begin{v.end()}; 20 | 21 | for (auto it = r_end.base(); it != r_begin.base(); ++it) 22 | std::cout << *it << ' '; 23 | std::cout << '\n'; 24 | 25 | for (auto it = r_begin; it != r_end; ++it) 26 | std::cout << *it << ' '; 27 | std::cout << '\n'; 28 | 29 | for (auto it = r_begin; it != r_end; ++it) { 
30 | if (*it == 3) { 31 | v.erase(std::next(it).base()); 32 | } 33 | } 34 | 35 | for (auto it : v) 36 | std::cout << it << ' '; 37 | std::cout << '\n'; 38 | } -------------------------------------------------------------------------------- /csrc/balance_serve/kvc2/test/xxHash_test.cpp: -------------------------------------------------------------------------------- 1 | #include "xxhash.h" 2 | #include 3 | 4 | int main() { 5 | std::string t = "hello world"; 6 | XXH64_hash_t hash = XXH64(t.data(), t.size(), 123); 7 | std::cout << hash << std::endl; 8 | { 9 | /* create a hash state */ 10 | XXH64_state_t* const state = XXH64_createState(); 11 | if (state == NULL) 12 | abort(); 13 | 14 | if (XXH64_reset(state, 123) == XXH_ERROR) 15 | abort(); 16 | 17 | if (XXH64_update(state, t.data(), 5) == XXH_ERROR) 18 | abort(); 19 | 20 | if (XXH64_update(state, t.data() + 5, t.size() - 5) == XXH_ERROR) 21 | abort(); 22 | /* Produce the final hash value */ 23 | XXH64_hash_t const hash = XXH64_digest(state); 24 | 25 | /* State could be re-used; but in this example, it is simply freed */ 26 | XXH64_freeState(state); 27 | std::cout << hash << std::endl; 28 | } 29 | 30 | return 0; 31 | } 32 | -------------------------------------------------------------------------------- /csrc/balance_serve/kvc2/unit_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 检查是否提供了 disk_cache_path 参数 4 | if [ -z "$1" ]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # 将 disk_cache_path 参数赋值给变量 10 | disk_cache_path=$1 11 | 12 | # 定义测试命令数组,并使用变量替换 disk_cache_path 13 | tests=( 14 | "./build/test/kvc2_export_header_test --disk_cache_path=$disk_cache_path" 15 | "./build/test/kvcache_disk_insert_read_test --disk_cache_path=$disk_cache_path" 16 | "./build/test/kvcache_mem_eviction_test --disk_cache_path=$disk_cache_path" 17 | "./build/test/kvcache_mem_insert_read_test --disk_cache_path=$disk_cache_path" 18 | "./build/test/kvcache_save_load_test 
--disk_cache_path=$disk_cache_path" 19 | ) 20 | 21 | 22 | # 遍历每个测试命令 23 | for test in "${tests[@]}"; do 24 | echo "Running: $test" 25 | # 运行测试并捕获输出 26 | output=$($test) 27 | 28 | # 检查测试输出中是否包含 "Test Passed" 29 | if echo "$output" | grep -q "Test Passed"; then 30 | echo " Test Passed" 31 | else 32 | echo " Test Failed" 33 | fi 34 | 35 | sleep 1 36 | done -------------------------------------------------------------------------------- /csrc/balance_serve/sched/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(CMAKE_CXX_FLAGS "-Og -march=native -Wall -Wextra -g -fPIC") 2 | # set(CMAKE_CXX_FLAGS "-O3 -march=native -Wall -Wextra -fPIC") 3 | add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI}) 4 | 5 | set(UTILS_DIR ${CMAKE_CURRENT_SOURCE_DIR}/utils) 6 | 7 | add_library(sched_metrics metrics.cpp) 8 | target_include_directories(sched_metrics PRIVATE ${UTILS_DIR}) 9 | target_link_libraries(sched_metrics PUBLIC prometheus-cpp::pull) 10 | 11 | 12 | add_library(sched scheduler.cpp) 13 | target_include_directories(sched PRIVATE ${SPDLOG_DIR}/include ${FMT_DIR}/include ${UTILS_DIR} ${KVC2_INCLUDE_DIR}) 14 | target_link_libraries(sched PUBLIC pthread ${TORCH_LIBRARIES} kvc2 async_store sched_metrics) 15 | 16 | pybind11_add_module(sched_ext bind.cpp) 17 | target_link_libraries(sched_ext PUBLIC sched ${TORCH_LIBRARIES} ${TORCH_PYTHON_LIBRARY}) 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /csrc/balance_serve/sched/utils/all.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "readable_number.hpp" 3 | #include "timer.hpp" -------------------------------------------------------------------------------- /csrc/balance_serve/sched/utils/arithmetic.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | template T div_up(T x, U by) { 4 | 
static_assert(std::is_integral_v); 5 | static_assert(std::is_integral_v); 6 | return (x + by - 1) / by; 7 | } -------------------------------------------------------------------------------- /csrc/balance_serve/sched/utils/atomic_ptr_with_flags.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | template struct AtomicPtrWithFlag { 4 | constexpr static uint64_t mask = 1ull << 63; 5 | std::atomic_uint64_t ptr = 0; 6 | 7 | std::pair 8 | load(std::memory_order order = std::memory_order_seq_cst) { 9 | uint64_t val = ptr.load(order); 10 | return {reinterpret_cast(val & (~mask)), val & mask}; 11 | } 12 | 13 | void store(T *p, bool flag, 14 | std::memory_order order = std::memory_order_seq_cst) { 15 | ptr.store(reinterpret_cast(p) | (flag ? mask : 0), order); 16 | } 17 | 18 | std::pair 19 | exchange(T *p, bool flag, 20 | std::memory_order order = std::memory_order_seq_cst) { 21 | uint64_t val = 22 | ptr.exchange(reinterpret_cast(p) | (flag ? 
mask : 0), order); 23 | return {reinterpret_cast(val & (~mask)), val & mask}; 24 | } 25 | 26 | std::pair 27 | touch_load(std::memory_order order = std::memory_order_seq_cst) { 28 | uint64_t val = ptr.fetch_and(~mask, order); 29 | return {reinterpret_cast(val & (~mask)), val & mask}; 30 | } 31 | 32 | bool check_flag(std::memory_order order = std::memory_order_seq_cst) { 33 | return ptr.load(order) & mask; 34 | } 35 | }; 36 | -------------------------------------------------------------------------------- /csrc/balance_serve/sched/utils/easy_format.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | template std::string format_vector(const std::vector &v) { 6 | std::ostringstream oss; 7 | if (v.empty()) 8 | return "[]"; 9 | for (size_t i = 0; i < v.size(); ++i) { 10 | oss << v[i]; 11 | if (i < v.size() - 1) 12 | oss << ", "; // 逗号分隔 13 | } 14 | return oss.str(); 15 | } 16 | -------------------------------------------------------------------------------- /csrc/balance_serve/sched/utils/readable_number.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | inline std::array units = {"", "K", "M", "G", "T", "P", "E"}; 8 | 9 | inline std::string readable_number(size_t size) { 10 | size_t unit_index = 0; 11 | double readable_size = size; 12 | while (readable_size >= 1000 && unit_index < units.size() - 1) { 13 | readable_size /= 1000; 14 | unit_index++; 15 | } 16 | std::ostringstream ss; 17 | ss << std::fixed << std::setprecision(2) << readable_size; 18 | std::string str = ss.str(); 19 | return str + "" + units[unit_index]; 20 | } -------------------------------------------------------------------------------- /csrc/custom_marlin/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/csrc/custom_marlin/__init__.py -------------------------------------------------------------------------------- /csrc/custom_marlin/binding.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @Description : 3 | * @Author : Azure-Tang 4 | * @Date : 2024-07-25 13:38:30 5 | * @Version : 1.0.0 6 | * @LastEditors : kkk1nak0 7 | * @LastEditTime : 2024-08-12 03:05:04 8 | * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 9 | **/ 10 | 11 | #include "gptq_marlin/ops.h" 12 | // Python bindings 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | // namespace py = pybind11; 19 | 20 | PYBIND11_MODULE(vLLMMarlin, m) { 21 | 22 | /*m.def("dequantize_q8_0", &dequantize_q8_0, "Function to dequantize q8_0 23 | data.", py::arg("data"), py::arg("blk_size"), py::arg("device")); 24 | m.def("dequantize_q6_k", &dequantize_q6_k, "Function to dequantize q6_k 25 | data.", py::arg("data"), py::arg("blk_size"), py::arg("device")); 26 | m.def("dequantize_q5_k", &dequantize_q5_k, "Function to dequantize q5_k 27 | data.", py::arg("data"), py::arg("blk_size"), py::arg("device")); 28 | m.def("dequantize_q4_k", &dequantize_q4_k, "Function to dequantize q4_k 29 | data.", py::arg("data"), py::arg("blk_size"), py::arg("device")); 30 | m.def("dequantize_q3_k", &dequantize_q3_k, "Function to dequantize q3_k 31 | data.", py::arg("data"), py::arg("blk_size"), py::arg("device")); 32 | m.def("dequantize_q2_k", &dequantize_q2_k, "Function to dequantize q2_k 33 | data.", py::arg("data"), py::arg("blk_size"), py::arg("device")); 34 | m.def("dequantize_iq4_xs", &dequantize_iq4_xs, "Function to dequantize 35 | iq4_xs data.", py::arg("data"), py::arg("blk_size"), py::arg("device"));*/ 36 | m.def("gptq_marlin_gemm", &gptq_marlin_gemm, 37 | "Function to perform GEMM using Marlin quantization.", py::arg("a"), 38 | py::arg("b_q_weight"), 
py::arg("b_scales"), py::arg("g_idx"), 39 | py::arg("perm"), py::arg("workspace"), py::arg("num_bits"), py::arg("size_m_tensor"), 40 | py::arg("size_m"), py::arg("size_n"), py::arg("size_k"), 41 | py::arg("sms"), py::arg("is_k_full")); 42 | m.def("gptq_marlin_repack", &gptq_marlin_repack, 43 | "gptq_marlin repack from GPTQ"); 44 | } -------------------------------------------------------------------------------- /csrc/custom_marlin/gptq_marlin/ops.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @Description : 3 | * @Author : Azure 4 | * @Date : 2024-07-22 09:27:55 5 | * @Version : 1.0.0 6 | * @LastEditors : Azure 7 | * @LastEditTime : 2024-07-26 08:35:00 8 | * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 9 | **/ 10 | #pragma once 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | torch::Tensor gptq_marlin_gemm(torch::Tensor &a, torch::Tensor &b_q_weight, 17 | torch::Tensor &b_scales, torch::Tensor &g_idx, 18 | torch::Tensor &perm, torch::Tensor &workspace, 19 | int64_t num_bits, torch::Tensor size_m_tensor, int64_t size_m, int64_t size_n, 20 | int64_t size_k, int sms, bool is_k_full); 21 | 22 | torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor&perm, 23 | int64_t size_k, int64_t size_n, 24 | int64_t num_bits); -------------------------------------------------------------------------------- /csrc/custom_marlin/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, Extension 2 | from torch.utils import cpp_extension 3 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 4 | setup( 5 | name='vLLMMarlin', 6 | ext_modules=[ 7 | CUDAExtension( 8 | 'vLLMMarlin', [ 9 | #'custom_gguf/dequant.cu', 10 | 'binding.cpp', 11 | 'gptq_marlin/gptq_marlin.cu', 12 | 'gptq_marlin/gptq_marlin_repack.cu', 13 | ], 14 | extra_compile_args={ 15 | 'cxx': ['-O3'], 16 | 'nvcc': [ 17 | '-O3', 18 | '--use_fast_math', 19 | 
'-Xcompiler', '-fPIC', 20 | ] 21 | }, 22 | ) 23 | ], 24 | cmdclass={'build_ext': BuildExtension} 25 | ) -------------------------------------------------------------------------------- /csrc/custom_marlin/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/csrc/custom_marlin/utils/__init__.py -------------------------------------------------------------------------------- /csrc/ktransformers_ext/cpu_backend/backend.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @Description : 3 | * @Author : chenht2022 4 | * @Date : 2024-07-22 02:03:05 5 | * @Version : 1.0.0 6 | * @LastEditors : chenht2022 7 | * @LastEditTime : 2024-07-25 10:33:38 8 | * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 9 | **/ 10 | #ifndef CPUINFER_BACKEND_H 11 | #define CPUINFER_BACKEND_H 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | enum ThreadStatus { 22 | WORKING, 23 | WAITING, 24 | EXIT, 25 | }; 26 | 27 | struct ThreadState { 28 | std::unique_ptr> status; 29 | std::unique_ptr> curr; 30 | int end; 31 | }; 32 | 33 | class Backend { 34 | public: 35 | Backend(int); 36 | ~Backend(); 37 | int get_thread_num(); 38 | void do_work_stealing_job(int, std::function, 39 | std::function, 40 | std::function); 41 | #ifdef USE_NUMA 42 | static thread_local int numa_node; 43 | #endif 44 | static thread_local int thread_local_id; 45 | 46 | private: 47 | int thread_num_; 48 | int max_thread_num_; 49 | std::vector thread_state_; // [thread_num] 50 | std::function init_func_; 51 | std::function compute_func_; 52 | std::function finalize_func_; 53 | std::vector workers_; 54 | 55 | void process_tasks(int); 56 | void worker_thread(int); 57 | }; 58 | #endif -------------------------------------------------------------------------------- 
/csrc/ktransformers_ext/cpu_backend/shared_mem_buffer.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @Description : 3 | * @Author : chenht2022 4 | * @Date : 2024-08-05 04:49:08 5 | * @Version : 1.0.0 6 | * @LastEditors : chenht2022 7 | * @LastEditTime : 2024-08-05 09:21:29 8 | * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 9 | **/ 10 | #include "shared_mem_buffer.h" 11 | #include 12 | 13 | SharedMemBuffer::SharedMemBuffer() { 14 | buffer_ = nullptr; 15 | size_ = 0; 16 | } 17 | 18 | SharedMemBuffer::~SharedMemBuffer() { 19 | if (buffer_) { 20 | free(buffer_); 21 | } 22 | } 23 | 24 | void SharedMemBuffer::alloc(void* object, std::vector> requests) { 25 | uint64_t size = 0; 26 | for (auto& request : requests) { 27 | size += request.second; 28 | } 29 | if (size > size_) { 30 | if (buffer_) { 31 | free(buffer_); 32 | } 33 | buffer_ = std::aligned_alloc(64, size); 34 | 35 | size_ = size; 36 | for (auto& obj_requests : hist_requests_) { 37 | for (auto& requests : obj_requests.second) { 38 | arrange(requests); 39 | } 40 | } 41 | } 42 | arrange(requests); 43 | hist_requests_[object].push_back(requests); 44 | } 45 | 46 | void SharedMemBuffer::dealloc(void* object) { 47 | hist_requests_.erase(object); 48 | } 49 | 50 | void SharedMemBuffer::arrange(std::vector> requests) { 51 | uint64_t offset = 0; 52 | for (auto& request : requests) { 53 | *(request.first) = (uint8_t*)buffer_ + offset; 54 | offset += request.second; 55 | } 56 | } -------------------------------------------------------------------------------- /csrc/ktransformers_ext/cpu_backend/shared_mem_buffer.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @Description : 3 | * @Author : chenht2022 4 | * @Date : 2024-08-05 04:49:08 5 | * @Version : 1.0.0 6 | * @LastEditors : chenht2022 7 | * @LastEditTime : 2024-08-05 06:36:41 8 | * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
9 | **/
10 |
11 | #ifndef CPUINFER_SHAREDMEMBUFFER_H
12 | #define CPUINFER_SHAREDMEMBUFFER_H
13 |
14 | #include
15 | #include
16 | #include
17 | #include
18 |
// One growable scratch buffer that is carved up between several client objects.
// NOTE(review): the include targets and template arguments in this listing were
// lost during extraction; recover them from the repository before compiling.
19 | class SharedMemBuffer {
20 | public:
21 | SharedMemBuffer();
22 | ~SharedMemBuffer();
23 |
// alloc: lays the requested regions out back to back from the start of the
// buffer, growing the buffer first when the total does not fit, and records the
// requests under `object`.
// dealloc: drops the requests recorded for `object`; the buffer is not freed.
24 | void alloc(void* object, std::vector> requests);
25 | void dealloc(void* object);
26 |
27 | private:
// buffer_: backing storage; size_: total bytes of the largest request set so far.
// hist_requests_: requests served so far, keyed by owning object, replayed
// whenever the buffer is reallocated.
28 | void* buffer_;
29 | uint64_t size_;
30 | std::map>>> hist_requests_;
31 |
// arrange: writes consecutive buffer addresses into each request's pointer.
32 | void arrange(std::vector> requests);
33 | };
34 |
// NOTE(review): `static` at namespace scope in a header gives every translation
// unit that includes it its own SharedMemBuffer instance — confirm that
// per-file buffers (rather than one process-wide buffer) are intended.
35 | static SharedMemBuffer shared_mem_buffer;
36 |
37 | #endif
-------------------------------------------------------------------------------- /csrc/ktransformers_ext/cpu_backend/task_queue.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @Description : 3 | * @Author : chenht2022 4 | * @Date : 2024-07-17 12:25:51 5 | * @Version : 1.0.0 6 | * @LastEditors : chenht2022 7 | * @LastEditTime : 2024-10-09 11:08:10 8 | * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
9 | **/ 10 | #include "task_queue.h" 11 | 12 | TaskQueue::TaskQueue() { 13 | worker = std::thread(&TaskQueue::processTasks, this); 14 | sync_flag.store(true, std::memory_order_seq_cst); 15 | exit_flag.store(false, std::memory_order_seq_cst); 16 | } 17 | 18 | TaskQueue::~TaskQueue() { 19 | { 20 | mutex.lock(); 21 | exit_flag.store(true, std::memory_order_seq_cst); 22 | mutex.unlock(); 23 | } 24 | cv.notify_all(); 25 | if (worker.joinable()) { 26 | worker.join(); 27 | } 28 | } 29 | 30 | void TaskQueue::enqueue(std::function task) { 31 | { 32 | mutex.lock(); 33 | tasks.push(task); 34 | sync_flag.store(false, std::memory_order_seq_cst); 35 | mutex.unlock(); 36 | } 37 | cv.notify_one(); 38 | } 39 | 40 | void TaskQueue::sync() { 41 | while (!sync_flag.load(std::memory_order_seq_cst)) 42 | ; 43 | } 44 | 45 | void TaskQueue::processTasks() { 46 | while (true) { 47 | std::function task; 48 | { 49 | mutex.lock(); 50 | cv.wait(mutex, [this]() { return !tasks.empty() || exit_flag.load(std::memory_order_seq_cst); }); 51 | if (exit_flag.load(std::memory_order_seq_cst) && tasks.empty()) { 52 | return; 53 | } 54 | task = tasks.front(); 55 | tasks.pop(); 56 | mutex.unlock(); 57 | } 58 | task(); 59 | { 60 | mutex.lock(); 61 | if (tasks.empty()) { 62 | sync_flag.store(true, std::memory_order_seq_cst); 63 | } 64 | mutex.unlock(); 65 | } 66 | } 67 | } -------------------------------------------------------------------------------- /csrc/ktransformers_ext/cpu_backend/vendors/README.md: -------------------------------------------------------------------------------- 1 | ## TODO 2 | 3 | This directory can be removed after updating the version of `llama.cpp`. 
-------------------------------------------------------------------------------- /csrc/ktransformers_ext/cpu_backend/vendors/cuda.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #if CUDART_VERSION < 11020 10 | #define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED 11 | #define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH 12 | #define CUBLAS_COMPUTE_16F CUDA_R_16F 13 | #define CUBLAS_COMPUTE_32F CUDA_R_32F 14 | #define cublasComputeType_t cudaDataType_t 15 | #endif // CUDART_VERSION < 11020 16 | -------------------------------------------------------------------------------- /csrc/ktransformers_ext/cpu_backend/vendors/vendor.h: -------------------------------------------------------------------------------- 1 | #ifndef CPUINFER_VENDOR_VENDOR_H 2 | #define CPUINFER_VENDOR_VENDOR_H 3 | 4 | #ifdef USE_CUDA 5 | #include "cuda.h" 6 | #elif USE_HIP 7 | #define __HIP_PLATFORM_AMD__ 8 | #include "hip.h" 9 | #elif USE_MUSA 10 | #include "musa.h" 11 | #endif 12 | 13 | #endif // CPUINFER_VENDOR_VENDOR_H -------------------------------------------------------------------------------- /csrc/ktransformers_ext/cuda/custom_gguf/ops.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @Description : 3 | * @Author : Azure-Tang 4 | * @Date : 2024-07-22 09:27:55 5 | * @Version : 1.0.0 6 | * @LastEditors : kkk1nak0 7 | * @LastEditTime : 2024-08-12 03:48:46 8 | * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
9 | **/ 10 | #pragma once 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | torch::Tensor dequantize_q8_0(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype); 17 | torch::Tensor dequantize_q6_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype); 18 | torch::Tensor dequantize_q5_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype); 19 | torch::Tensor dequantize_q4_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype); 20 | torch::Tensor dequantize_q3_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype); 21 | torch::Tensor dequantize_q2_k(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype); 22 | torch::Tensor dequantize_iq4_xs(const int8_t* data, const int num_bytes, const int blk_size, const int ele_per_blk, const torch::Device device, const torch::Dtype target_dtype); -------------------------------------------------------------------------------- /csrc/ktransformers_ext/cuda/gptq_marlin/ops.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @Description : 3 | * @Author : Azure 4 | * @Date : 2024-07-22 09:27:55 5 | * @Version : 1.0.0 6 | * @LastEditors : Azure 7 | * @LastEditTime : 2024-07-26 08:35:00 8 | * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
9 | **/ 10 | #pragma once 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, 17 | torch::Tensor& b_scales, torch::Tensor& g_idx, 18 | torch::Tensor& perm, torch::Tensor& workspace, 19 | int64_t num_bits, int64_t size_m, int64_t size_n, 20 | int64_t size_k, bool is_k_full); 21 | 22 | // torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm, 23 | // int64_t size_k, int64_t size_n, 24 | // int64_t num_bits); -------------------------------------------------------------------------------- /csrc/ktransformers_ext/cuda/setup.py: -------------------------------------------------------------------------------- 1 | 2 | from setuptools import setup, Extension 3 | from torch.utils import cpp_extension 4 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 5 | setup( 6 | name='KTransformersOps', 7 | ext_modules=[ 8 | CUDAExtension( 9 | 'KTransformersOps', [ 10 | 'custom_gguf/dequant.cu', 11 | 'binding.cpp', 12 | 'gptq_marlin/gptq_marlin.cu', 13 | # 'gptq_marlin_repack.cu', 14 | ], 15 | extra_compile_args={ 16 | 'cxx': ['-O3'], 17 | 'nvcc': [ 18 | '-O3', 19 | '--use_fast_math', 20 | '-Xcompiler', '-fPIC', 21 | ] 22 | }, 23 | ) 24 | ], 25 | cmdclass={'build_ext': BuildExtension} 26 | ) -------------------------------------------------------------------------------- /csrc/ktransformers_ext/cuda/test_dequant.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.insert(0,"/home/zbx/ktransformers") 4 | from ktransformers.util.custom_loader import GGUFLoader 5 | import torch 6 | 7 | gguf_loader_1 = GGUFLoader("/mnt/data/model/DeepseekV3-q4km-gguf") 8 | gguf_loader_2 = GGUFLoader("/mnt/data/chenht/model/gguf_for_ktransformers/DeepSeek-V3-bf16/") 9 | 10 | torch.set_default_dtype(torch.bfloat16) 11 | 12 | tensor_1 = gguf_loader_1.load_gguf_tensor("blk.0.attn_kv_a_mqa.weight", "cuda") 13 | 
tensor_2 = gguf_loader_2.load_gguf_tensor("blk.0.attn_kv_a_mqa.weight", "cuda") 14 | 15 | print(tensor_1[0, -64:]) 16 | print(tensor_2[0, -64:]) -------------------------------------------------------------------------------- /csrc/ktransformers_ext/examples/test_linear.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Description : 5 | Author : chenht2022 6 | Date : 2024-07-25 10:32:05 7 | Version : 1.0.0 8 | LastEditors : chenht2022 9 | LastEditTime : 2024-08-06 10:36:59 10 | Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 11 | ''' 12 | import os, sys 13 | import time 14 | sys.path.append(os.path.dirname(__file__) + '/../build') 15 | import cpuinfer_ext 16 | import torch 17 | 18 | input_size = 16384 19 | output_size = 5120 20 | stride = 32 21 | group_max_len = 1024 22 | proj_type = 1 # ggml_type::GGML_TYPE_F16 23 | hidden_type = 1 # ggml_type::GGML_TYPE_F16 24 | qlen = 30 25 | layer_num = 10 26 | CPUInfer = cpuinfer_ext.CPUInfer(48) 27 | validation_iter = 100 28 | 29 | with torch.inference_mode(mode=True): 30 | linears = [] 31 | projs = [] 32 | for _ in range(layer_num): 33 | proj = torch.randn((output_size, input_size), dtype=torch.float16, device = "cuda").to("cpu").contiguous() 34 | config = cpuinfer_ext.linear.LinearConfig(input_size, output_size, stride, group_max_len, proj.data_ptr(), proj_type, hidden_type) 35 | linear = cpuinfer_ext.linear.Linear(config) 36 | projs.append(proj) 37 | linears.append(linear) 38 | 39 | # validation 40 | for i in range(validation_iter): 41 | linear = linears[i % layer_num] 42 | input = torch.randn((qlen, input_size), dtype=torch.float16).contiguous() 43 | output = torch.empty((qlen, output_size), dtype=torch.float16).contiguous() 44 | input = input / 100 45 | 46 | CPUInfer.submit( 47 | linear.forward( 48 | qlen, 49 | input.data_ptr(), 50 | output.data_ptr() 51 | ) 52 | ) 53 | CPUInfer.sync() 54 | # print('cpuinfer output', 
output) 55 | 56 | proj = projs[i%layer_num] 57 | t_output = torch.mm(input, proj.t()) 58 | # print('torch output', t_output) 59 | 60 | diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output)) 61 | print('diff = ', diff) 62 | assert(diff < 0.001) 63 | -------------------------------------------------------------------------------- /csrc/ktransformers_ext/operators/amx/la/utils.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @Description : 3 | * @Author : chenht2022 4 | * @Date : 2025-04-25 18:28:12 5 | * @Version : 1.0.0 6 | * @LastEditors : chenht2022 7 | * @LastEditTime : 2025-04-25 18:28:12 8 | * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 9 | **/ 10 | 11 | #pragma once 12 | #include 13 | 14 | 15 | template 16 | T* offset_pointer(T* ptr, std::size_t byte_offset) { 17 | return reinterpret_cast(reinterpret_cast(ptr) + byte_offset); 18 | } 19 | 20 | template 21 | const T* offset_pointer(const T* ptr, std::size_t byte_offset) { 22 | return reinterpret_cast(reinterpret_cast(ptr) + byte_offset); 23 | } 24 | 25 | template 26 | T* offset_pointer_row_major(T* t, int row, int col, std::size_t ld) { 27 | return offset_pointer(t, row * ld) + col; 28 | } 29 | 30 | template 31 | T* offset_pointer_col_major(T* t, int row, int col, std::size_t ld) { 32 | return offset_pointer(t, col * ld) + row; 33 | } 34 | 35 | static inline void avx512_copy_32xbf16(__m512i* src, __m512i* dst) { 36 | _mm512_storeu_si512(dst, _mm512_loadu_si512(src)); 37 | } 38 | 39 | static inline void avx512_32xfp32_to_32xbf16(__m512* src0, __m512* src1, __m512i* dst) { 40 | _mm512_storeu_si512(dst, __m512i(_mm512_cvtne2ps_pbh(*src1, *src0))); 41 | } 42 | 43 | static inline void avx512_32xbf16_to_32xfp32(__m512i* src, __m512* dst0, __m512* dst1) { 44 | _mm512_storeu_ps(dst0, _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)(src))), 16))); 45 | _mm512_storeu_ps(dst1, 
_mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)(src) + 1)), 16))); 46 | } -------------------------------------------------------------------------------- /csrc/ktransformers_ext/operators/llamafile/conversion.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @Description : 3 | * @Author : chenht2022 4 | * @Date : 2024-07-12 10:07:58 5 | * @Version : 1.0.0 6 | * @LastEditors : chenht2022 7 | * @LastEditTime : 2024-07-25 10:34:55 8 | * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 9 | **/ 10 | #ifndef CPUINFER_CONVERSION_H 11 | #define CPUINFER_CONVERSION_H 12 | 13 | #include 14 | #include "llama.cpp/ggml.h" 15 | 16 | inline void to_float(const void* input, float* output, int size, ggml_type type) { 17 | if (type == ggml_type::GGML_TYPE_F32) { 18 | memcpy(output, input, size * sizeof(float)); 19 | } else { 20 | ggml_internal_get_type_traits(type).to_float(input, output, size); 21 | } 22 | } 23 | 24 | inline void from_float(const float* input, void* output, int size, ggml_type type) { 25 | if (type == ggml_type::GGML_TYPE_F32) { 26 | memcpy(output, input, size * sizeof(float)); 27 | } else { 28 | ggml_internal_get_type_traits(type).from_float(input, output, size); 29 | } 30 | } 31 | 32 | #endif -------------------------------------------------------------------------------- /csrc/ktransformers_ext/operators/llamafile/linear.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @Description : 3 | * @Author : chenht2022 4 | * @Date : 2024-07-12 10:07:58 5 | * @Version : 1.0.0 6 | * @LastEditors : chenht2022 7 | * @LastEditTime : 2024-07-25 10:35:00 8 | * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 
9 | **/ 10 | #ifndef CPUINFER_OPERATOR_LINEAR_H 11 | #define CPUINFER_OPERATOR_LINEAR_H 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #include "../../cpu_backend/backend.h" 20 | #include "../../cpu_backend/shared_mem_buffer.h" 21 | #include "conversion.h" 22 | #include "llama.cpp/ggml-impl.h" 23 | #include "llama.cpp/ggml-quants.h" 24 | #include "llama.cpp/ggml.h" 25 | #include "llamafile/sgemm.h" 26 | 27 | struct LinearConfig { 28 | int input_size; 29 | int output_size; 30 | int stride; 31 | int group_max_len; 32 | void* proj; 33 | ggml_type proj_type; 34 | ggml_type hidden_type; 35 | 36 | LinearConfig() {} 37 | 38 | LinearConfig(int input_size, int output_size, int stride, int group_max_len, void* proj, ggml_type proj_type, ggml_type hidden_type) 39 | : input_size(input_size), output_size(output_size), stride(stride), group_max_len(group_max_len), proj(proj), proj_type(proj_type), hidden_type(hidden_type) {} 40 | }; 41 | 42 | class Linear { 43 | public: 44 | Linear(LinearConfig); 45 | ~Linear(); 46 | void warm_up(Backend* backend); 47 | void forward_many(int qlen, const void* input, void* output, Backend* backend); 48 | void forward(int qlen, const void* input, void* output, Backend* backend); 49 | 50 | private: 51 | LinearConfig config_; 52 | void* proj_; // [output_size * input_size ( /32 if quantized)] 53 | 54 | float* input_fp32_; // [group_max_len * input_size] 55 | uint8_t* proj_input_; // [group_max_len * input_size * ggml_type_size(ggml_internal_get_type_traits(proj_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(proj_type).vec_dot_type)] 56 | float* proj_output_; // [group_max_len * output_size] 57 | }; 58 | 59 | #endif -------------------------------------------------------------------------------- /csrc/ktransformers_ext/vendors/cuda.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 
#include 8 | 9 | #if CUDART_VERSION < 11020 10 | #define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED 11 | #define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH 12 | #define CUBLAS_COMPUTE_16F CUDA_R_16F 13 | #define CUBLAS_COMPUTE_32F CUDA_R_32F 14 | #define cublasComputeType_t cudaDataType_t 15 | #endif // CUDART_VERSION < 11020 16 | -------------------------------------------------------------------------------- /csrc/ktransformers_ext/vendors/vendor.h: -------------------------------------------------------------------------------- 1 | #ifndef CPUINFER_VENDOR_VENDOR_H 2 | #define CPUINFER_VENDOR_VENDOR_H 3 | 4 | #ifdef USE_CUDA 5 | #include "cuda.h" 6 | #elif USE_HIP 7 | #define __HIP_PLATFORM_AMD__ 8 | #include "hip.h" 9 | #elif USE_MUSA 10 | #include "musa.h" 11 | #endif 12 | 13 | #endif // CPUINFER_VENDOR_VENDOR_H -------------------------------------------------------------------------------- /doc/SUMMARY.md: -------------------------------------------------------------------------------- 1 | # Ktransformers 2 | 3 | [Introduction](./README.md) 4 | # Install 5 | - [Installation Guide](en/install.md) 6 | 7 | # Tutorial 8 | - [Deepseek-R1/V3 Show Case/Tutorial](en/DeepseekR1_V3_tutorial.md) 9 | - [Why KTransformers So Fast](en/deepseek-v2-injection.md) 10 | - [Injection Tutorial](en/injection_tutorial.md) 11 | - [Multi-GPU Tutorial](en/multi-gpu-tutorial.md) 12 | - [Use FP8 GPU Kernel](en/fp8_kernel.md) 13 | - [Use AMD GPU](en/ROCm.md) 14 | # Server 15 | - [Server](en/api/server/server.md) 16 | - [Website](en/api/server/website.md) 17 | - [Tabby](en/api/server/tabby.md) 18 | # For Developer 19 | - [Makefile Usage](en/makefile_usage.md) 20 | 21 | # FAQ 22 | - [FAQ](en/FAQ.md) 23 | # V3 Reproduction 24 | - [Success List](en/V3-success.md) 25 | # Benchmark 26 | - [Benchmark](en/benchmark.md) 27 | -------------------------------------------------------------------------------- 
/doc/assets/BigCodeBench.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/BigCodeBench.png -------------------------------------------------------------------------------- /doc/assets/DeepSeek-on-KTransformers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/DeepSeek-on-KTransformers.png -------------------------------------------------------------------------------- /doc/assets/Framework_effect.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/Framework_effect.png -------------------------------------------------------------------------------- /doc/assets/InfLLM_equation.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/InfLLM_equation.jpg -------------------------------------------------------------------------------- /doc/assets/InfLLM_framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/InfLLM_framework.png -------------------------------------------------------------------------------- /doc/assets/InjectStruction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/InjectStruction.png -------------------------------------------------------------------------------- 
/doc/assets/KTransformers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/KTransformers.png -------------------------------------------------------------------------------- /doc/assets/KTransformers_long_context_v1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/KTransformers_long_context_v1.png -------------------------------------------------------------------------------- /doc/assets/KTransformers_long_context_v2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/KTransformers_long_context_v2.png -------------------------------------------------------------------------------- /doc/assets/Quest_framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/Quest_framework.png -------------------------------------------------------------------------------- /doc/assets/SnapKV_framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/SnapKV_framework.png -------------------------------------------------------------------------------- /doc/assets/SparQ_attention.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/SparQ_attention.png 
-------------------------------------------------------------------------------- /doc/assets/amx.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/amx.png -------------------------------------------------------------------------------- /doc/assets/amx_avx.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/amx_avx.png -------------------------------------------------------------------------------- /doc/assets/amx_intro.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/amx_intro.png -------------------------------------------------------------------------------- /doc/assets/cpuinfer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/cpuinfer.png -------------------------------------------------------------------------------- /doc/assets/deepseekv2_structure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/deepseekv2_structure.png -------------------------------------------------------------------------------- /doc/assets/internlm_memory.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/internlm_memory.png -------------------------------------------------------------------------------- 
/doc/assets/long_context_generate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/long_context_generate.png -------------------------------------------------------------------------------- /doc/assets/long_context_prefill.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/long_context_prefill.png -------------------------------------------------------------------------------- /doc/assets/model_structure_guild.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/model_structure_guild.png -------------------------------------------------------------------------------- /doc/assets/multi_gpu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/multi_gpu.png -------------------------------------------------------------------------------- /doc/assets/needle_128K.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/needle_128K.png -------------------------------------------------------------------------------- /doc/assets/needle_1M.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/needle_1M.png -------------------------------------------------------------------------------- /doc/assets/onednn_1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/onednn_1.png -------------------------------------------------------------------------------- /doc/assets/website.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/assets/website.png -------------------------------------------------------------------------------- /doc/basic/note1.md: -------------------------------------------------------------------------------- 1 | # basic-first20 2 | -------------------------------------------------------------------------------- /doc/basic/note2.md: -------------------------------------------------------------------------------- 1 | # basic-data_structure 2 | -------------------------------------------------------------------------------- /doc/en/Docker.md: -------------------------------------------------------------------------------- 1 | # Docker 2 | 3 | ## Prerequisites 4 | * Docker must be installed and running on your system. 5 | * Create a folder to store big models & intermediate files (e.g. /mnt/models) 6 | 7 | ## Images 8 | A Docker image is available for this project. You can pull it with: 9 | ``` 10 | docker pull approachingai/ktransformers:0.2.1 11 | ``` 12 | **Notice**: In this image, ktransformers is compiled for CPUs with the AVX512 instruction set. If your CPU does not support AVX512, it is suggested to recompile and install ktransformers in the /workspace/ktransformers directory within the container. 13 | 14 | ## Building docker image locally 15 | - Download the Dockerfile from [here](../../Dockerfile) 16 | 17 | - Once the download has finished, execute 18 | ```bash 19 | docker build -t approachingai/ktransformers:0.2.1 . 
20 | ``` 21 | 22 | ## Usage 23 | 24 | Assuming you have the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) installed so that you can use the GPU in a Docker container: 25 | ``` 26 | docker run --gpus all -v /path/to/models:/models --name ktransformers -itd approachingai/ktransformers:0.2.1 27 | docker exec -it ktransformers /bin/bash 28 | python -m ktransformers.local_chat --gguf_path /models/path/to/gguf_path --model_path /models/path/to/model_path --cpu_infer 33 29 | ``` 30 | 31 | More usage options can be found in the [readme](../../README.md) -------------------------------------------------------------------------------- /doc/en/V3-success.md: -------------------------------------------------------------------------------- 1 | ## Hello everyone, here is the successfully reproduced environment configuration for your reference: 2 | ### Case 1 3 | - Configuration: l40s 48G + 9654 x2 (192 cores) + 768G DDR5 12-channel 4 | - Performance: prefill 108 tokens/s, decode 10.8 tokens/s 5 | - Used version: main source code compiled 6 | ### Case 2 7 | - Configuration: Dual Xeon 6430 32C processors, totaling 64 cores and 128 threads, 480GB DDR5 memory, single 4090 24G graphics card 8 | - Performance: Running speed approximately 6-8 tokens per second 9 | ## NOTE 10 | If there are any other configurations that have been successfully run, please feel free to let us know. We will keep updating for everyone to refer to when reproducing. (It has been found that it also works on 2080, AMD, etc. 
(doge : ) 11 | [click here](https://docs.qq.com/smartsheet/form/AVxgQOYhhNfl%2FBB08J2%2Fv3rnnq?tab=BB08J2) -------------------------------------------------------------------------------- /doc/en/api/server/run-tabby.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/en/api/server/run-tabby.png -------------------------------------------------------------------------------- /doc/en/api/server/server-arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/en/api/server/server-arch.png -------------------------------------------------------------------------------- /doc/en/api/server/visit-api-tags.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/en/api/server/visit-api-tags.png -------------------------------------------------------------------------------- /doc/en/api/server/website.md: -------------------------------------------------------------------------------- 1 | # Start with website 2 | 3 | This document provides the necessary steps to set up and run the web service for this project. 4 | 5 | ## 1. Starting the Web Service 6 | 7 | ### 1.1. Compiling the Web Code 8 | 9 | Before you can compile the web code, make sure you have installed [Node.js](https://nodejs.org) version 18.3 or higher 10 | 11 | Note: The version of Node.js in the Ubuntu or Debian GNU/Linux software repository is too low, causing compilation errors. Users can also install Node.js through the Nodesource repository, provided they uninstall the outdated version first. 
12 | 13 | ```bash 14 | 15 | # sudo apt-get remove nodejs npm -y && sudo apt-get autoremove -y 16 | sudo apt-get update -y && sudo apt-get install -y apt-transport-https ca-certificates curl gnupg 17 | curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | sudo gpg --dearmor -o /usr/share/keyrings/nodesource.gpg 18 | sudo chmod 644 /usr/share/keyrings/nodesource.gpg 19 | echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/nodesource.gpg] https://deb.nodesource.com/node_23.x nodistro main" | sudo tee /etc/apt/sources.list.d/nodesource.list 20 | sudo apt-get update -y 21 | sudo apt-get install nodejs -y 22 | 23 | ``` 24 | 25 | Once npm is installed, navigate to the `ktransformers/website` directory: 26 | 27 | ```bash 28 | cd ktransformers/website 29 | ``` 30 | 31 | Next, install the Vue CLI with the following command: 32 | 33 | ```bash 34 | npm install @vue/cli 35 | ``` 36 | 37 | Now you can build the project: 38 | 39 | ```bash 40 | npm run build 41 | ``` 42 | Finally, you can build ktransformers with the website: 43 | ``` 44 | cd ../../ 45 | pip install . 46 | ``` 47 | -------------------------------------------------------------------------------- /doc/en/makefile_usage.md: -------------------------------------------------------------------------------- 1 | # Makefile 2 | ## Target 3 | ### flake_find: 4 | ```bash 5 | make flake_find 6 | ``` 7 | Finds all the Python files under the ./ktransformers directory and collects into a list the codes of the errors, warnings, fatals, etc. that are not consistent with the PEP 8 standard. For now, all of these codes are listed in the extend-ignore section of the .flake8 file so that flake8 ignores them temporarily (we may fix them in the future). 8 | ### format: 9 | ```bash 10 | make format 11 | ``` 12 | We use black to format all the Python files under the ./ktransformers directory. 
It obeys the PEP 8 standard, 13 | but we change the line length to 120 by adding 14 | ```toml 15 | [tool.black] 16 | line-length = 120 17 | preview = true 18 | unstable = true 19 | ``` 20 | to the pyproject.toml file. 21 | 22 | ### dev_install: 23 | ```bash 24 | make dev_install 25 | ``` 26 | Installs the package in development mode, i.e. the package is installed in editable mode, so if you modify the code, you don't need to reinstall the package. We recommend that developers use this method to install the package. -------------------------------------------------------------------------------- /doc/en/operators/Combined_MoE_time_per_layer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/en/operators/Combined_MoE_time_per_layer.png -------------------------------------------------------------------------------- /doc/en/operators/Linear_projection_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/en/operators/Linear_projection_time.png -------------------------------------------------------------------------------- /doc/zh/api/server/run-tabby.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/zh/api/server/run-tabby.png -------------------------------------------------------------------------------- /doc/zh/api/server/server-arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/zh/api/server/server-arch.png -------------------------------------------------------------------------------- 
/doc/zh/api/server/tabby.md: -------------------------------------------------------------------------------- 1 | # 如何使用 Tabby 和 ktransformers 在本地利用 236B 的大模型做代码补全? 2 | 3 | [Tabby](https://tabby.tabbyml.com/docs/welcome/) 是一个开源的代码助手,用户可以手动配置后端使用的框架及模型,并在多个 IDE/编辑器 上使用,例如 VSCode 和 IntelliJ。因为 Tabby 在框架侧可以对接到 Ollama,并且 ktransformers server 提供和 Ollama 一致的 API 接口,所以我们可以将 Tabby 对接到 ktransformers server。并在代码补全的场景中体验到 ktransformers 快速的异构推理。 4 | 5 | 1. 启动 ktransformers。 6 | ```bash 7 | ./ktransformers --port 9112 8 | ``` 9 | 2. 安装 Tabby:按照 Tabby 的官方教程在带有英伟达 GPU 的 Linux 服务器或者 Windows PC 上[安装 Tabby](https://tabby.tabbyml.com/docs/quick-start/installation/linux/)。 10 | 3. 配置 Tabby:创建`~/.tabby/config.toml`,并加入以下配置。 11 | ```toml 12 | [model.completion.http] 13 | kind = "ollama/completion" 14 | api_endpoint = "http://127.0.0.1:9112/" 15 | model_name = "DeepSeek-Coder-V2-Instruct" 16 | prompt_template = "<|fim▁begin|>{prefix}<|fim▁hole|>{suffix}<|fim▁end|>" # Prompt Template 17 | ``` 18 | 19 | 在这个配置中,`kind` 指明 ktransformers 使用 Ollama 的标准 API 为 Tabby 提供服务;`api_endpoint` 与 ktransformers 启动时绑定的接口保持一致;`model_name` 设置为 ktransformers 使用的模型,这里使用 `DeepSeek-Coder-V2-Instruct` 作为后台推理的模型;`prompt_template` 是模型的提示词模板,针对不同的模型,使用相对应的模板才能正常使用模型 Fill In the Middle 的功能。 20 | 在这里演示的是 Tabby 使用 Ollama API 提供 Completion 功能的相关配置,有关 Tabby 其他可选功能的配置信息请参照[这里](https://tabby.tabbyml.com/docs/administration/model/)。 21 | 22 | 23 | 4. 启动 Tabby 服务:`./tabby serve`。 24 | image-20240709112329577 25 | 26 | ​ 启动之后,期望会在 ktransformers 的命令行界面看到对 `/api/tags` 接口的访问(在 Tabby 新版本 v0.13.0 中变为对 `/api/show/` 接口的访问)。 27 | image-20240709111648215 28 | 29 | 6. 注册 Tabby 账户,获取 Token:在启动 Tabby 服务后,在浏览器中打开相应的链接(如上图的 0.0.0.0:8080),并参照[教程](https://tabby.tabbyml.com/docs/quick-start/register-account/) 创建用户并获取 Token。 30 | 31 | 7. 启动 VScode 安装 Tabby 拓展插件,并在相关提示下,使用上一步获得的 Token 连接 Tabby Server,参照[这里](https://tabby.tabbyml.com/docs/extensions/installation/vscode/)。 32 | 33 | 8. 
打开任意代码文件,体验 ktransformers 的快速异构推理。 34 | 35 | -------------------------------------------------------------------------------- /doc/zh/api/server/visit-api-tags.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/doc/zh/api/server/visit-api-tags.png -------------------------------------------------------------------------------- /doc/zh/api/server/website.md: -------------------------------------------------------------------------------- 1 | # Start with website 2 | 3 | This document provides the necessary steps to set up and run the web service for this project. 4 | 5 | ## 1. Starting the Web Service 6 | 7 | ### 1.1. Compiling the Web Code 8 | 9 | Before you can compile the web code, make sure you have installed [Node.js](https://nodejs.org) version 18.3 or higher 10 | 11 | Once npm is installed, navigate to the `ktransformers/website` directory: 12 | 13 | ```bash 14 | cd ktransformers/website 15 | ``` 16 | 17 | Next, install the Vue CLI with the following command: 18 | 19 | ```bash 20 | npm install @vue/cli 21 | ``` 22 | 23 | Now you can build the project: 24 | 25 | ```bash 26 | npm run build 27 | ``` 28 | Finally you can build ktransformers with website: 29 | ``` 30 | cd ../../ 31 | pip install . 
32 | ``` 33 | -------------------------------------------------------------------------------- /install-with-cache.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # clear build dirs 5 | # rm -rf build 6 | # rm -rf *.egg-info 7 | # rm -rf csrc/build 8 | # rm -rf csrc/ktransformers_ext/build 9 | # rm -rf csrc/ktransformers_ext/cuda/build 10 | # rm -rf csrc/ktransformers_ext/cuda/dist 11 | # rm -rf csrc/ktransformers_ext/cuda/*.egg-info 12 | rm -rf ~/.ktransformers 13 | echo "Installing python dependencies from requirements.txt" 14 | pip install -r requirements-local_chat.txt 15 | pip install -r ktransformers/server/requirements.txt 16 | echo "Installing ktransformers" 17 | KTRANSFORMERS_FORCE_BUILD=TRUE USE_BALANCE_SERVE=1 pip install -v . --no-build-isolation 18 | pip install third_party/custom_flashinfer/ -v 19 | 20 | # SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])") 21 | # echo "Copying thirdparty libs to $SITE_PACKAGES" 22 | # cp -a csrc/balance_serve/build/third_party/prometheus-cpp/lib/libprometheus-cpp-*.so* $SITE_PACKAGES/ 23 | # patchelf --set-rpath '$ORIGIN' $SITE_PACKAGES/sched_ext.cpython* 24 | 25 | 26 | echo "Installation completed successfully" 27 | -------------------------------------------------------------------------------- /install.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | REM clear build dirs 4 | rmdir /S /Q ktransformers\ktransformers_ext\build 5 | rmdir /S /Q ktransformers\ktransformers_ext\cuda\build 6 | rmdir /S /Q ktransformers\ktransformers_ext\cuda\dist 7 | rmdir /S /Q ktransformers\ktransformers_ext\out 8 | del /F /Q ktransformers\ktransformers_ext\cuda\*.egg-info 9 | 10 | echo Installing python dependencies from requirements.txt 11 | pip install -r requirements-local_chat.txt 12 | 13 | echo Installing ktransformers 14 | set KTRANSFORMERS_FORCE_BUILD=TRUE 15 | pip install . 
--no-build-isolation 16 | echo Installation completed successfully -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # default backend 5 | DEV="cuda" 6 | 7 | # parse --dev argument 8 | while [[ "$#" -gt 0 ]]; do 9 | case $1 in 10 | --dev) DEV="$2"; shift ;; 11 | *) echo "Unknown parameter passed: $1"; exit 1 ;; 12 | esac 13 | shift 14 | done 15 | export DEV_BACKEND="$DEV" 16 | echo "Selected backend: $DEV_BACKEND" 17 | 18 | # clear build dirs 19 | rm -rf build 20 | rm -rf *.egg-info 21 | rm -rf csrc/build 22 | rm -rf csrc/ktransformers_ext/build 23 | rm -rf csrc/ktransformers_ext/cuda/build 24 | rm -rf csrc/ktransformers_ext/cuda/dist 25 | rm -rf csrc/ktransformers_ext/cuda/*.egg-info 26 | rm -rf ~/.ktransformers 27 | echo "Installing python dependencies from requirements.txt" 28 | pip install -r requirements-local_chat.txt 29 | pip install -r ktransformers/server/requirements.txt 30 | 31 | echo "Installing ktransformers" 32 | KTRANSFORMERS_FORCE_BUILD=TRUE pip install -v . 
--no-build-isolation 33 | 34 | if [[ "$DEV_BACKEND" == "cuda" ]]; then 35 | echo "Installing custom_flashinfer for CUDA backend" 36 | pip install third_party/custom_flashinfer/ 37 | fi 38 | # SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])") 39 | # echo "Copying thirdparty libs to $SITE_PACKAGES" 40 | # cp -a csrc/balance_serve/build/third_party/prometheus-cpp/lib/libprometheus-cpp-*.so* $SITE_PACKAGES/ 41 | # patchelf --set-rpath '$ORIGIN' $SITE_PACKAGES/sched_ext.cpython* 42 | 43 | echo "Installation completed successfully" -------------------------------------------------------------------------------- /ktransformers/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Description : 5 | Author : kkk1nak0 6 | Date : 2024-08-15 07:34:46 7 | Version : 1.0.0 8 | LastEditors : chenxl 9 | LastEditTime : 2025-02-15 03:53:02 10 | ''' 11 | __version__ = "0.3.1" 12 | -------------------------------------------------------------------------------- /ktransformers/configs/config.yaml: -------------------------------------------------------------------------------- 1 | log: 2 | dir: "logs" 3 | file: "lexllama.log" 4 | #log level: debug, info, warn, error, crit 5 | level: "debug" 6 | backup_count: -1 7 | 8 | server: 9 | ip: 0.0.0.0 10 | port: 10002 11 | 12 | db: 13 | type: "sqllite" 14 | database: "server.db" 15 | host: "./" 16 | pool_size: 10 17 | 18 | user: 19 | secret_key: "981f1dd2a44e27d68759d0252a486568ed43480b4e616a26e3af3709c3a7ce73" 20 | algorithm: "HS256" 21 | 22 | model: 23 | # type: transformers 24 | # type: balance_serve 25 | type: ktransformers 26 | 27 | name: DeepSeek-Coder-V2-Instruct 28 | path: deepseek-ai/DeepSeek-V2-Lite-Chat 29 | gguf_path: ./DeepSeek-V2-Lite-Chat-GGUF 30 | 31 | device: cuda:0 32 | cache_lens: 16384 33 | max_new_tokens: 500 34 | web: 35 | mount: False 36 | open_cross_domain: True 37 | 38 | ext: 39 | cpu_infer: 10 40 | 
41 | long_context: 42 | max_seq_len: 32000 43 | block_size: 128 44 | local_windows_len: 4096 45 | second_select_num: 32 46 | anchor_type: DYNAMIC 47 | kv_type: FP16 48 | dense_layer_num: 2 49 | anchor_num: 1 50 | preselect_block: True 51 | head_select_mode: SHARED 52 | preselect_block_count: 32 53 | layer_step: 1 54 | token_step: 55 | 56 | local_chat: 57 | prompt_file: "" 58 | 59 | async_server: 60 | sched_strategy: "FCFS" 61 | sched_port: 56441 62 | sched_metrics_port: 54321 63 | kvc2_metrics_port: 54391 64 | max_batch_size: 4 # decode count + prefill count, in one mini batch 65 | 66 | attn: 67 | page_size: 256 68 | chunk_size: 256 69 | kvc2: 70 | gpu_only: true 71 | utilization_percentage: 1.0 72 | cpu_memory_size_GB: 500 73 | -------------------------------------------------------------------------------- /ktransformers/configs/log_config.ini: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root,uvicorn,uvicornError,uvicornAccess 3 | 4 | [handlers] 5 | keys=consoleHandler,fileHandler 6 | 7 | [formatters] 8 | keys=detailedFormatter 9 | 10 | [logger_root] 11 | level=INFO 12 | handlers=consoleHandler 13 | 14 | [logger_uvicorn] 15 | level=INFO 16 | handlers=consoleHandler,fileHandler 17 | qualname=uvicorn 18 | propagate=0 19 | 20 | [logger_uvicornError] 21 | level=ERROR 22 | handlers=consoleHandler,fileHandler 23 | qualname=uvicorn.error 24 | propagate=0 25 | 26 | [logger_uvicornAccess] 27 | level=INFO 28 | handlers=consoleHandler,fileHandler 29 | qualname=uvicorn.access 30 | propagate=0 31 | 32 | [handler_consoleHandler] 33 | class=StreamHandler 34 | level=INFO 35 | formatter=detailedFormatter 36 | args=(sys.stdout,) 37 | 38 | [handler_fileHandler] 39 | class=logging.FileHandler 40 | level=INFO 41 | formatter=detailedFormatter 42 | args=('uvicorn_logs.log', 'a') 43 | 44 | [formatter_detailedFormatter] 45 | format=%(asctime)s - %(name)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s 46 | 
datefmt=%Y-%m-%d %H:%M:%S 47 | -------------------------------------------------------------------------------- /ktransformers/ktransformers_ext/operators/custom_marlin/quantize/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/ktransformers_ext/operators/custom_marlin/quantize/utils/__init__.py -------------------------------------------------------------------------------- /ktransformers/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/models/__init__.py -------------------------------------------------------------------------------- /ktransformers/operators/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /ktransformers/operators/mlp.py: -------------------------------------------------------------------------------- 1 | 2 | from ktransformers.operators.base_operator import BaseInjectedModule 3 | from ktransformers.util.custom_loader import GGUFLoader 4 | from transformers import PretrainedConfig 5 | import torch.nn as nn 6 | from ktransformers.models.modeling_deepseek_v3 import DeepseekV3MLP 7 | from ktransformers.models.modeling_qwen2_moe import Qwen2MoeMLP 8 | class kDeepseekV3MLP(DeepseekV3MLP, BaseInjectedModule): 9 | def __init__(self, 10 | key: str, 11 | gguf_loader : GGUFLoader, 12 | config: PretrainedConfig, 13 | orig_module: nn.Module, 14 | prefill_device: str = "cuda", 15 | generate_device: str = "cuda", 16 | **kwargs): 17 | BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs) 18 | self.orig_module.__init__(orig_module.config, 19 | 
orig_module.hidden_size, orig_module.intermediate_size) 20 | def forward(self, x, bsz_tensor): 21 | down_proj = self.down_proj(self.act_fn(self.gate_proj(x, bsz_tensor)) * self.up_proj(x, bsz_tensor), bsz_tensor) 22 | return down_proj 23 | class KQwen2MoeMLP(Qwen2MoeMLP, BaseInjectedModule): 24 | def __init__(self, 25 | key: str, 26 | gguf_loader : GGUFLoader, 27 | config: PretrainedConfig, 28 | orig_module: nn.Module, 29 | prefill_device: str = "cuda", 30 | generate_device: str = "cuda", 31 | **kwargs): 32 | BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs) 33 | self.orig_module.__init__(orig_module.config, 34 | orig_module.intermediate_size) 35 | def forward(self, x, bsz_tensor): 36 | down_proj = self.down_proj(self.act_fn(self.gate_proj(x, bsz_tensor)) * self.up_proj(x, bsz_tensor), bsz_tensor) 37 | return down_proj -------------------------------------------------------------------------------- /ktransformers/optimize/optimize_rules/Internlm2_5-7b-Chat-1m.yaml: -------------------------------------------------------------------------------- 1 | - match: 2 | class: ktransformers.models.modeling_llama.LlamaRotaryEmbedding 3 | replace: 4 | class: ktransformers.operators.RoPE.RotaryEmbeddingV2 5 | - match: 6 | name: "^model.embed_tokens" 7 | replace: 8 | class: "default" 9 | kwargs: 10 | generate_device: "cpu" 11 | prefill_device: "cpu" 12 | - match: 13 | class: ktransformers.models.modeling_llama.LlamaModel 14 | replace: 15 | class: ktransformers.operators.models.KLlamaModel 16 | kwargs: 17 | generate_device: "cuda" 18 | prefill_device: "cuda" 19 | per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill 20 | 21 | - match: 22 | name: "^model\\.layers\\..*\\.self_attn$" 23 | replace: 24 | class: ktransformers.operators.attention.KLlamaAttention 25 | kwargs: 26 | generate_device: "cuda" 27 | prefill_device: "cuda" 28 | 29 | -------------------------------------------------------------------------------- 
/ktransformers/optimize/optimize_rules/Mixtral.yaml: -------------------------------------------------------------------------------- 1 | - match: 2 | class: ktransformers.models.modeling_mixtral.MixtralRotaryEmbedding 3 | replace: 4 | class: ktransformers.operators.RoPE.RotaryEmbedding 5 | kwargs: 6 | generate_device: "cuda" 7 | prefill_device: "cuda" 8 | - match: 9 | name: "^model\\.layers\\..*$" 10 | class: torch.nn.Linear # only match modules matching name and class simultaneously 11 | replace: 12 | class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types 13 | kwargs: 14 | generate_device: "cuda" 15 | prefill_device: "cuda" 16 | generate_op: "KLinearMarlin" 17 | prefill_op: "KLinearTorch" 18 | - match: 19 | name: "^lm_head" 20 | class: torch.nn.Linear 21 | replace: 22 | class: ktransformers.operators.linear.KTransformersLinear 23 | kwargs: 24 | generate_device: "cuda" 25 | prefill_device: "cuda" 26 | generate_op: "KLinearMarlin" 27 | prefill_op: "KLinearTorch" 28 | - match: 29 | name: "^model\\.layers\\..*\\.block_sparse_moe$" 30 | class: ktransformers.models.modeling_mixtral.MixtralSparseMoeBlock 31 | replace: 32 | class: ktransformers.operators.experts.KMistralSparseMoEBlock 33 | - match: 34 | name: "^model\\.layers\\..*\\.block_sparse_moe\\.experts$" 35 | replace: 36 | class: ktransformers.operators.experts.KTransformersExperts 37 | kwargs: 38 | prefill_device: "cuda" 39 | prefill_op: "KExpertsTorch" 40 | generate_device: "cpu" 41 | generate_op: "KExpertsCPU" 42 | out_device: "cuda" 43 | recursive: False # don't recursively inject submodules of this module 44 | 45 | - match: 46 | name: "^model.embed_tokens" 47 | replace: 48 | class: "default" 49 | kwargs: 50 | generate_device: "cpu" 51 | prefill_device: "cpu" 52 | 53 | - match: 54 | name: "^model\\.layers\\..*\\." 
55 | replace: 56 | class: "default" 57 | kwargs: 58 | generate_device: "cuda" 59 | prefill_device: "cuda" -------------------------------------------------------------------------------- /ktransformers/server/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/server/__init__.py -------------------------------------------------------------------------------- /ktransformers/server/api/__init__.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter 2 | 3 | from .ollama import router as ollama_router 4 | from .openai import router as openai_router,post_db_creation_operations 5 | from .web import router as web_router 6 | 7 | router = APIRouter() 8 | router.include_router(ollama_router) 9 | router.include_router(openai_router) 10 | router.include_router(web_router) 11 | -------------------------------------------------------------------------------- /ktransformers/server/api/ollama/__init__.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter 2 | 3 | from .completions import router as completions_router 4 | 5 | router = APIRouter() 6 | router.include_router(completions_router) 7 | -------------------------------------------------------------------------------- /ktransformers/server/api/openai/__init__.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter 2 | 3 | from .assistants import router as assistants_router,create_default_assistant 4 | from .endpoints.chat import router as chat_router 5 | from .legacy import router as legacy_router 6 | 7 | router = APIRouter(prefix='/v1') 8 | 9 | 10 | router.include_router(assistants_router) 11 | router.include_router(chat_router) 12 | router.include_router(legacy_router) 13 | 14 | def 
post_db_creation_operations(): 15 | create_default_assistant() 16 | -------------------------------------------------------------------------------- /ktransformers/server/api/openai/assistants/__init__.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter 2 | 3 | from .assistants import router as assistants_router, create_default_assistant 4 | from .messages import router as messages_router 5 | from .runs import router as runs_router 6 | from .threads import router as threads_router 7 | 8 | router = APIRouter() 9 | 10 | threads_router.include_router(runs_router) 11 | threads_router.include_router(messages_router) 12 | 13 | router.include_router(assistants_router) 14 | router.include_router(threads_router) 15 | -------------------------------------------------------------------------------- /ktransformers/server/api/openai/assistants/threads.py: -------------------------------------------------------------------------------- 1 | from typing import List,Optional 2 | from fastapi import APIRouter 3 | 4 | from ktransformers.server.crud.assistants.threads import ThreadsDatabaseManager,Order,ObjectID 5 | from ktransformers.server.schemas.assistants.threads import ThreadObject,ThreadCreate,ThreadModify 6 | from ktransformers.server.schemas.base import DeleteResponse 7 | from ktransformers.server.schemas.conversation import ThreadPreview 8 | 9 | router = APIRouter(prefix='/threads') 10 | threads_manager = ThreadsDatabaseManager() 11 | 12 | 13 | @router.post("/",tags=['openai'], response_model=ThreadObject) 14 | async def create_thread(thread: ThreadCreate): 15 | return threads_manager.db_create_thread(thread) 16 | 17 | 18 | @router.get("/", tags=['openai-ext'],response_model=List[ThreadPreview]) 19 | async def list_threads(limit: Optional[int] = 20, order: Order = Order.DESC): 20 | return threads_manager.db_list_threads_preview(limit, order) 21 | 22 | 23 | @router.get("/{thread_id}",tags=['openai'], 
response_model=ThreadObject) 24 | async def retrieve_thread(thread_id: ObjectID): 25 | return threads_manager.db_get_thread_by_id(thread_id) 26 | 27 | 28 | @router.post("/{thread_id}",tags=['openai'], response_model=ThreadObject) 29 | async def modify_thread(thread_id: ObjectID, thread: ThreadModify): 30 | raise NotImplementedError 31 | 32 | 33 | @router.delete("/{thread_id}",tags=['openai'], response_model=DeleteResponse) 34 | async def delete_thread(thread_id: ObjectID): 35 | threads_manager.db_delete_thread_by_id(thread_id=thread_id) 36 | return DeleteResponse(id=thread_id, object='thread.deleted') 37 | -------------------------------------------------------------------------------- /ktransformers/server/api/openai/endpoints/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/server/api/openai/endpoints/__init__.py -------------------------------------------------------------------------------- /ktransformers/server/api/openai/legacy/__init__.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter 2 | 3 | from . 
import completions 4 | 5 | router = APIRouter() 6 | router.include_router(completions.router) -------------------------------------------------------------------------------- /ktransformers/server/api/web/__init__.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter 2 | from .system import router as system_router 3 | 4 | 5 | router = APIRouter() 6 | router.include_router(system_router) 7 | -------------------------------------------------------------------------------- /ktransformers/server/api/web/system.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter 2 | 3 | 4 | router = APIRouter() 5 | 6 | 7 | @router.get('/system-info',tags=['web']) 8 | def system_info(): 9 | raise NotImplementedError 10 | -------------------------------------------------------------------------------- /ktransformers/server/backend/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/server/backend/__init__.py -------------------------------------------------------------------------------- /ktransformers/server/backend/interfaces/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/server/backend/interfaces/__init__.py -------------------------------------------------------------------------------- /ktransformers/server/backend/interfaces/exllamav2.py: -------------------------------------------------------------------------------- 1 | import sys, os 2 | from typing import AsyncIterator, Dict, Tuple 3 | 4 | import torch 5 | 6 | from ..args import ConfigArgs, default_args 7 | 8 | from ..base import BackendInterfaceBase, ThreadContext 9 | from 
ktransformers.server.schemas.assistants.runs import RunObject 10 | 11 | 12 | from ..args import * 13 | 14 | class ExllamaThreadContext(ThreadContext): 15 | def __init__(self, run: RunObject, args: ConfigArgs = default_args) -> None: 16 | super().__init__(run,args) 17 | 18 | def get_interface(self): 19 | return 20 | 21 | def get_local_messages(self): 22 | raise NotImplementedError 23 | 24 | 25 | 26 | 27 | class ExllamaInterface(BackendInterfaceBase): 28 | 29 | def __init__(self, args: ConfigArgs = ...): 30 | raise NotImplementedError 31 | 32 | def tokenize_prompt(self, prompt: str) -> torch.Tensor: 33 | raise NotImplementedError 34 | 35 | async def inference(self,local_messages,request_unique_id:Optional[str])->AsyncIterator: 36 | raise NotImplementedError 37 | 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /ktransformers/server/balance_serve/inference/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/server/balance_serve/inference/__init__.py -------------------------------------------------------------------------------- /ktransformers/server/balance_serve/inference/distributed/__init__.py: -------------------------------------------------------------------------------- 1 | from .communication_op import * 2 | from .parallel_state import * 3 | from .utils import * 4 | -------------------------------------------------------------------------------- /ktransformers/server/balance_serve/inference/distributed/communication_op.py: -------------------------------------------------------------------------------- 1 | """ 2 | Date: 2024-12-11 06:02:42 3 | LastEditors: djw 4 | LastEditTime: 2024-12-12 09:52:06 5 | """ 6 | 7 | from typing import Any, Dict, Optional, Union 8 | 9 | import torch 10 | import torch.distributed 11 | 12 | from .parallel_state import 
get_tp_group 13 | 14 | 15 | def tensor_model_parallel_all_reduce(input_: torch.Tensor, bsz_tensor: torch.Tensor, is_compute_bound=False, overlap=False) -> torch.Tensor: 16 | """All-reduce the input tensor across model parallel group.""" 17 | return get_tp_group().all_reduce(input_, bsz_tensor, is_compute_bound=is_compute_bound, overlap=overlap) 18 | 19 | 20 | def tensor_model_parallel_all_gather( 21 | input_: torch.Tensor, dim: int = -1 22 | ) -> torch.Tensor: 23 | """All-gather the input tensor across model parallel group.""" 24 | return get_tp_group().all_gather(input_, dim) 25 | 26 | 27 | def tensor_model_parallel_gather( 28 | input_: torch.Tensor, dst: int = 0, dim: int = -1 29 | ) -> Optional[torch.Tensor]: 30 | """Gather the input tensor across model parallel group.""" 31 | return get_tp_group().gather(input_, dst, dim) 32 | 33 | 34 | def broadcast_tensor_dict( 35 | tensor_dict: Optional[Dict[Any, Union[torch.Tensor, Any]]] = None, src: int = 0 36 | ): 37 | if not torch.distributed.is_initialized(): 38 | return tensor_dict 39 | return get_tp_group().broadcast_tensor_dict(tensor_dict, src) 40 | -------------------------------------------------------------------------------- /ktransformers/server/balance_serve/inference/sampling/penaltylib/__init__.py: -------------------------------------------------------------------------------- 1 | from .orchestrator import BatchedPenalizerOrchestrator 2 | from .penalizers.frequency_penalty import BatchedFrequencyPenalizer 3 | from .penalizers.min_new_tokens import BatchedMinNewTokensPenalizer 4 | from .penalizers.presence_penalty import BatchedPresencePenalizer 5 | from .penalizers.repetition_penalty import BatchedRepetitionPenalizer 6 | 7 | __all__ = [ 8 | "BatchedFrequencyPenalizer", 9 | "BatchedMinNewTokensPenalizer", 10 | "BatchedPresencePenalizer", 11 | "BatchedRepetitionPenalizer", 12 | "BatchedPenalizerOrchestrator", 13 | ] 14 | -------------------------------------------------------------------------------- 
/ktransformers/server/config/singleton.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Description : Implement singleton 5 | Author : unicornchan 6 | Date : 2024-06-11 17:08:36 7 | Version : 1.0.0 8 | LastEditors : chenxl 9 | LastEditTime : 2024-07-27 01:55:56 10 | ''' 11 | import abc 12 | 13 | class Singleton(abc.ABCMeta, type): 14 | """_summary_ 15 | 16 | Args: 17 | abc.ABCMeta: Provide a mechanism for defining abstract methods and properties, 18 | enforcing subclasses to implement these methods and properties. 19 | type: Inherit from 'type' to make 'Singleton' a metaclass, 20 | enabling the implementation of the Singleton 21 | """ 22 | _instances = {} 23 | 24 | def __call__(cls, *args, **kwds): 25 | if cls not in cls._instances: 26 | cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwds) 27 | return cls._instances[cls] 28 | 29 | class AbstractSingleton(abc.ABC, metaclass=Singleton): 30 | """Provided an abstract Singleton base class, any class inheriting from 31 | this base class will automatically become a Singleton class. 32 | 33 | Args: 34 | abc.ABC: Abstract base class, it cannot be instantiated, only inherited. 
35 | """ 36 | -------------------------------------------------------------------------------- /ktransformers/server/crud/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/server/crud/__init__.py -------------------------------------------------------------------------------- /ktransformers/server/crud/assistants/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/server/crud/assistants/__init__.py -------------------------------------------------------------------------------- /ktransformers/server/crud/assistants/runs.py: -------------------------------------------------------------------------------- 1 | from time import time 2 | from uuid import uuid4 3 | 4 | from ktransformers.server.models.assistants.runs import Run 5 | from ktransformers.server.schemas.assistants.runs import RunCreate,RunObject 6 | from ktransformers.server.schemas.base import ObjectID 7 | from ktransformers.server.utils.sql_utils import SQLUtil 8 | 9 | 10 | class RunsDatabaseManager: 11 | def __init__(self) -> None: 12 | self.sql_util = SQLUtil() 13 | 14 | def create_run_object(self, thread_id: ObjectID, run: RunCreate) -> RunObject: 15 | run_obj = RunObject( 16 | **run.model_dump(mode='json', exclude={"stream"}), 17 | id=str(uuid4()), 18 | object='run', 19 | created_at=int(time()), 20 | thread_id=thread_id, 21 | status=RunObject.Status.queued, 22 | ) 23 | run_obj.set_compute_save(0) 24 | return run_obj 25 | 26 | def db_create_run(self, thread_id: str, run: RunCreate): 27 | db_run = Run( 28 | **run.model_dump(mode="json", exclude={"stream"}), 29 | id=str(uuid4()), 30 | created_at=int(time()), 31 | status="queued", 32 | thread_id=thread_id, 33 | ) 34 | with self.sql_util.get_db() as 
db: 35 | self.sql_util.db_add_commit_refresh(db, db_run) 36 | run_obj = RunObject.model_validate(db_run.__dict__) 37 | run_obj.set_compute_save(0) 38 | return run_obj 39 | 40 | def db_sync_run(self, run: RunObject) -> None: 41 | db_run = Run( 42 | **run.model_dump(mode='json'), 43 | ) 44 | with self.sql_util.get_db() as db: 45 | self.sql_util.db_merge_commit(db, db_run) 46 | 47 | def db_get_run(self, run_id: ObjectID) -> RunObject: 48 | with self.sql_util.get_db() as db: 49 | db_run = db.query(Run).filter(Run.id == run_id).first() 50 | return RunObject.model_validate(db_run.__dict__) 51 | -------------------------------------------------------------------------------- /ktransformers/server/exceptions.py: -------------------------------------------------------------------------------- 1 | from fastapi import HTTPException, status 2 | 3 | 4 | def db_exception(): 5 | return HTTPException( 6 | status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, 7 | detail="DB Error", 8 | ) 9 | 10 | 11 | def not_implemented(what): 12 | return HTTPException( 13 | status_code=status.HTTP_501_NOT_IMPLEMENTED, 14 | detail=f"{what} not implemented", 15 | ) 16 | 17 | 18 | def internal_server_error(what): 19 | return HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"{what}") 20 | 21 | 22 | def request_error(what): 23 | return HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=f"{what}") 24 | -------------------------------------------------------------------------------- /ktransformers/server/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/server/models/__init__.py -------------------------------------------------------------------------------- /ktransformers/server/models/assistants/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/server/models/assistants/__init__.py -------------------------------------------------------------------------------- /ktransformers/server/models/assistants/assistants.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import JSON, Column, Float, Integer, String, Text 2 | from sqlalchemy.orm import relationship 3 | 4 | from ktransformers.server.utils.sql_utils import Base 5 | 6 | 7 | class Assistant(Base): 8 | __tablename__ = "assistants" 9 | 10 | id = Column(String, primary_key=True, index=True) 11 | object = Column(String, default="assistant") 12 | created_at = Column(Integer) 13 | 14 | name = Column(String, nullable=True) 15 | description = Column(String, nullable=True) 16 | model = Column(String) 17 | instructions = Column(Text, nullable=True) 18 | tools = Column(JSON) 19 | tool_resources = Column(JSON) 20 | temperature = Column(Float, nullable=True) 21 | meta_data = Column(JSON, nullable=True) 22 | top_p = Column(Float, nullable=True) 23 | response_format = Column(JSON, default="auto") 24 | 25 | build_status = Column(JSON, nullable=True) 26 | 27 | runs = relationship("Run", back_populates="assistant") 28 | 29 | messages = relationship("Message", back_populates="assistant") 30 | -------------------------------------------------------------------------------- /ktransformers/server/models/assistants/messages.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import JSON, Column, ForeignKey, Integer, String 2 | from sqlalchemy.orm import relationship 3 | 4 | from ktransformers.server.utils.sql_utils import Base 5 | 6 | 7 | class Message(Base): 8 | __tablename__ = "messages" 9 | 10 | id = Column(String, primary_key=True, index=True) 11 | object = Column(String, default="thread.message") 12 | created_at = Column(Integer) 13 | 14 | thread_id = 
Column(String, ForeignKey("threads.id")) 15 | status = Column(String, default="in_progress") 16 | incomplete_details = Column(JSON, nullable=True) 17 | completed_at = Column(Integer, nullable=True) 18 | incomplete_at = Column(Integer, nullable=True) 19 | role = Column(JSON) 20 | content = Column(JSON) 21 | assistant_id = Column(String, ForeignKey("assistants.id"), nullable=True) 22 | run_id = Column(String, ForeignKey("runs.id"), nullable=True) 23 | attachments = Column(JSON, nullable=True) 24 | meta_data = Column(JSON, nullable=True) 25 | 26 | thread = relationship("Thread", back_populates="messages") 27 | assistant = relationship("Assistant", back_populates="messages") 28 | run = relationship("Run", back_populates="message") 29 | -------------------------------------------------------------------------------- /ktransformers/server/models/assistants/run_steps.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import JSON, Column, ForeignKey, Integer, String 2 | from sqlalchemy.orm import relationship 3 | 4 | from ktransformers.server.utils.sql_utils import Base 5 | 6 | 7 | class RunStep(Base): 8 | __tablename__ = "run_steps" 9 | # todo 10 | id = Column(String, primary_key=True, index=True) 11 | object = Column(String, default="thread.run.step") 12 | created_at = Column(Integer) 13 | 14 | assistant_id = Column(String, ForeignKey("assistants.id")) 15 | thread_id = Column(String, ForeignKey("threads.id")) 16 | run_id = Column(String, ForeignKey("runs.id")) 17 | type = Column(String) 18 | status = Column(String) 19 | step_details = Column(JSON) 20 | last_error = Column(JSON, nullable=True) 21 | expires_at = Column(Integer, nullable=True) 22 | cancelled_at = Column(Integer, nullable=True) 23 | failed_at = Column(Integer, nullable=True) 24 | completed_at = Column(Integer, nullable=True) 25 | 26 | meta_data = Column(JSON, nullable=True) 27 | usage = Column(JSON, nullable=True) 28 | 29 | assistant = 
relationship("Assistant", back_populates="run_steps") 30 | thread = relationship("Thread", back_populates="run_steps") 31 | run = relationship("Run", back_populates="run_steps") 32 | -------------------------------------------------------------------------------- /ktransformers/server/models/assistants/runs.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import JSON, Column, Float, ForeignKey, Integer, String, Text 2 | from sqlalchemy.orm import relationship 3 | 4 | from ktransformers.server.utils.sql_utils import Base 5 | 6 | 7 | class Run(Base): 8 | __tablename__ = "runs" 9 | 10 | id = Column(String, primary_key=True, index=True) 11 | object = Column(String, default="thread.run") 12 | created_at = Column(Integer) 13 | thread_id = Column(String, ForeignKey("threads.id")) 14 | assistant_id = Column(String, ForeignKey("assistants.id")) 15 | status = Column(String) 16 | required_action = Column(JSON, nullable=True) 17 | last_error = Column(JSON, nullable=True) 18 | expires_at = Column(Integer, nullable=True) 19 | started_at = Column(Integer, nullable=True) 20 | cancelled_at = Column(Integer, nullable=True) 21 | failed_at = Column(Integer, nullable=True) 22 | completed_at = Column(Integer, nullable=True) 23 | incomplete_details = Column(JSON, nullable=True) 24 | # get from assistant 25 | model = Column(String) 26 | instructions = Column(Text, nullable=True) 27 | tools = Column(JSON) 28 | meta_data = Column(JSON, nullable=True) 29 | usage = Column(JSON, nullable=True) 30 | temperature = Column(Float, nullable=True) 31 | top_p = Column(Float, nullable=True) 32 | max_propmp_tokens = Column(Integer, nullable=True) 33 | truncation_strategy = Column(JSON) 34 | tool_choice = Column(JSON) 35 | response_format = Column(JSON, default="auto") 36 | 37 | thread = relationship("Thread", back_populates="runs") 38 | assistant = relationship("Assistant", back_populates="runs") 39 | message = relationship("Message", 
back_populates="run") 40 | -------------------------------------------------------------------------------- /ktransformers/server/models/assistants/threads.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import JSON, Column, Integer, String 2 | from sqlalchemy.orm import relationship 3 | 4 | from ktransformers.server.utils.sql_utils import Base 5 | 6 | 7 | class Thread(Base): 8 | __tablename__ = "threads" 9 | 10 | id = Column(String, primary_key=True, index=True) 11 | object = Column(String, default="thread") 12 | created_at = Column(Integer) 13 | 14 | tool_resources = Column(JSON, nullable=True) 15 | meta_data = Column(JSON, nullable=True) 16 | 17 | runs = relationship("Run", back_populates="thread") 18 | messages = relationship("Message", back_populates="thread") 19 | -------------------------------------------------------------------------------- /ktransformers/server/requirements.txt: -------------------------------------------------------------------------------- 1 | torch >= 2.3.0 2 | transformers == 4.51.3 3 | fastapi >= 0.111.0 4 | langchain >= 0.2.0 5 | blessed >= 1.20.0 6 | accelerate >= 0.31.0 7 | sentencepiece >= 0.1.97 8 | openai 9 | setuptools 10 | build 11 | ninja 12 | wheel 13 | colorlog 14 | fire 15 | zmq 16 | psutil -------------------------------------------------------------------------------- /ktransformers/server/schemas/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/server/schemas/__init__.py -------------------------------------------------------------------------------- /ktransformers/server/schemas/assistants/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/server/schemas/assistants/__init__.py -------------------------------------------------------------------------------- /ktransformers/server/schemas/assistants/threads.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from typing import List 3 | from typing_extensions import Self 4 | 5 | from pydantic import BaseModel, Field, model_validator 6 | 7 | from ktransformers.server.schemas.base import Metadata, MetadataField, ObjectWithCreatedTime 8 | from ktransformers.server.schemas.assistants.tool import ToolResource 9 | from ktransformers.server.schemas.assistants.messages import MessageCore 10 | 11 | 12 | class ThreadBase(BaseModel): 13 | meta_data: Metadata = MetadataField 14 | @model_validator(mode='before') 15 | @classmethod 16 | def convert_meta_data(cls,values): 17 | if 'meta_data' in values: 18 | values['metadata'] = values['meta_data'] 19 | return values 20 | 21 | tool_resources: List[ToolResource] = Field([], max_length=128) 22 | 23 | 24 | class ThreadObject(ThreadBase, ObjectWithCreatedTime): 25 | is_related_threads:bool = Field(False,exclude=True) 26 | 27 | @model_validator(mode='after') 28 | def check_is_related_threads(self)->Self: 29 | # logger.debug(f'check thread {self.id} is related thread? 
by {self}') 30 | if 'assistant_id' in self.meta_data: 31 | self.is_related_threads = True 32 | return self 33 | 34 | class StreamEvent(Enum): 35 | created = 'created' 36 | 37 | def to_stream_reply(self,event:StreamEvent): 38 | return f"event: thread.{event.value}\ndata: {self.model_dump_json()}\n\n" 39 | 40 | 41 | class ThreadCreate(ThreadBase): 42 | messages: List[MessageCore] = Field(default=[]) 43 | 44 | 45 | class ThreadModify(ThreadBase): 46 | pass 47 | 48 | 49 | # other than OpenAI API 50 | -------------------------------------------------------------------------------- /ktransformers/server/schemas/assistants/tool.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from typing import List, Optional, Union 3 | 4 | from pydantic import BaseModel, Field 5 | 6 | from ktransformers.server.schemas.base import ObjectID 7 | 8 | 9 | class ToolType(str, Enum): 10 | CODE_INTERPRETER = "code_interpreter" 11 | FILE_SEARCH = "file_search" 12 | RELATED_THREADS = "related_threads" 13 | FUNCTION = "function" 14 | 15 | 16 | class ToolBase(BaseModel): 17 | type: ToolType 18 | 19 | 20 | class CodeInterpreter(ToolBase): 21 | pass 22 | 23 | 24 | class FileSearch(ToolBase): 25 | pass 26 | 27 | 28 | class RelatedThreads(ToolBase): 29 | pass 30 | 31 | 32 | class FuntionTool(ToolBase): 33 | description: str 34 | name: str 35 | parameters: List[str] 36 | 37 | 38 | Tool = Union[CodeInterpreter, FileSearch, RelatedThreads, FuntionTool] 39 | 40 | 41 | class CodeInterpreterResource(BaseModel): 42 | file_ids: Optional[List[str]] = Field(default_factory=list, max_length=20) 43 | 44 | 45 | class FileSearchResource(BaseModel): 46 | vector_store_ids: Optional[List[str]] = Field(default_factory=list, max_length=1) 47 | vector_stores: Optional[List[str]] = Field(default_factory=list, max_length=1) 48 | 49 | 50 | class RelatedThreadsResource(BaseModel): 51 | thread_ids: List[ObjectID] = Field(default=[]) 52 | 53 | 54 | ToolResource = 
Union[CodeInterpreterResource,FileSearchResource,RelatedThreadsResource] 55 | -------------------------------------------------------------------------------- /ktransformers/server/schemas/base.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from typing import Dict 3 | 4 | import sqlalchemy 5 | from pydantic import BaseModel, ConfigDict, Field 6 | 7 | TODO = BaseModel 8 | 9 | ObjectID = str 10 | 11 | 12 | class Object(BaseModel): 13 | id: ObjectID 14 | object: str 15 | 16 | model_config = ConfigDict(from_attributes=True) 17 | 18 | 19 | # Pydantic Base Models 20 | class ObjectWithCreatedTime(Object): 21 | created_at: int 22 | 23 | 24 | 25 | class Order(str, Enum): 26 | ASC = "asc" 27 | DESC = "desc" 28 | 29 | def to_sqlalchemy_order(self): 30 | match self: 31 | case Order.ASC: 32 | return sqlalchemy.asc 33 | case Order.DESC: 34 | return sqlalchemy.desc 35 | 36 | 37 | Metadata = Dict[str, str] 38 | MetadataField: Metadata = Field({},max_length=16, alias="metadata") 39 | 40 | 41 | class DeleteResponse(Object): 42 | deleted: bool = True 43 | 44 | class OperationResponse(BaseModel): 45 | operation: str 46 | status: str 47 | -------------------------------------------------------------------------------- /ktransformers/server/schemas/conversation.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from pydantic import BaseModel 4 | 5 | from .assistants.assistants import AssistantObject 6 | from .assistants.threads import ThreadObject 7 | from .assistants.messages import MessageObject 8 | 9 | class ThreadPreview(BaseModel): 10 | assistant: Optional[AssistantObject] = None 11 | thread: ThreadObject 12 | first_message: Optional[MessageObject] = None 13 | -------------------------------------------------------------------------------- /ktransformers/server/schemas/legacy/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/server/schemas/legacy/__init__.py -------------------------------------------------------------------------------- /ktransformers/server/schemas/legacy/completions.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | from enum import Enum 3 | from pydantic import BaseModel, Field 4 | from ktransformers.server.config.config import Config 5 | from ..base import Object 6 | 7 | class CompletionCreate(BaseModel): 8 | model: str 9 | prompt: str | List[str] 10 | stream: bool = False 11 | temperature: Optional[float] = Field(default=Config().temperature) 12 | top_p: Optional[float] = Field(default=Config().top_p) 13 | max_tokens: Optional[int] = Field(default=None) 14 | max_completion_tokens: Optional[int] = Field(default=None) 15 | 16 | def get_tokenizer_messages(self): 17 | if isinstance(self.prompt,List): 18 | self.get_tokenizer_messages('\n'.join(self.prompt)) 19 | return [{'content':self.prompt,'role':'user'}] 20 | 21 | 22 | class FinishReason(Enum): 23 | stop = 'stop' 24 | length = 'length' 25 | 26 | class Choice(BaseModel): 27 | index: int 28 | text: str 29 | logprobs: Optional[str] = None 30 | finish_reason: FinishReason = None 31 | 32 | 33 | class CompletionObject(Object): 34 | created:int 35 | choices: List[Choice] = [] 36 | model:str = 'not implmented' 37 | system_fingerprint:str = 'not implmented' 38 | usage: Optional[str] = None 39 | 40 | def set_token(self,token:str): 41 | if len(self.choices)==0: 42 | self.choices.append(Choice(index=0,text='')) 43 | self.choices[0].text = token 44 | 45 | def append_token(self,token:str): 46 | if len(self.choices)==0: 47 | self.choices.append(Choice(index=0,text='')) 48 | self.choices[0].text += token 49 | 50 | def to_stream_reply(self): 51 | return 
f"data:{self.model_dump_json()}\n\n" 52 | -------------------------------------------------------------------------------- /ktransformers/server/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/server/utils/__init__.py -------------------------------------------------------------------------------- /ktransformers/server/utils/create_interface.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | ''' 4 | Description : 5 | Author : qiyuxinlin 6 | Date : 2024-07-25 11:50:16 7 | Version : 1.0.0 8 | LastEditors : qiyuxinlin 9 | LastEditTime : 2024-07-25 12:54:48 10 | Copyright (c) 2024 by KVCache.AI, All Rights Reserved. 11 | ''' 12 | from ktransformers.server.config.config import Config 13 | from ktransformers.server.backend.args import ConfigArgs 14 | from ktransformers.server.backend.context_manager import ThreadContextManager 15 | from ktransformers.server.backend.interfaces.exllamav2 import ExllamaInterface 16 | from ktransformers.server.backend.interfaces.transformers import TransformersInterface 17 | from ktransformers.server.backend.interfaces.ktransformers import KTransformersInterface 18 | 19 | def create_interface(config: Config, default_args: ConfigArgs): 20 | if config.backend_type=='transformers': 21 | from ktransformers.server.backend.interfaces.transformers import TransformersInterface as BackendInterface 22 | elif config.backend_type == 'exllamav2': 23 | from ktransformers.server.backend.interfaces.exllamav2 import ExllamaInterface as BackendInterface 24 | elif config.backend_type == 'ktransformers': 25 | from ktransformers.server.backend.interfaces.ktransformers import KTransformersInterface as BackendInterface 26 | elif config.backend_type == 'balance_serve': 27 | from 
# reference: https://github.com/declare-lab/instruct-eval/blob/main/human_eval/main.py#L35
def filter_answer(completion: str) -> str:
    """Extract the final answer token from a model completion.

    Only the last line of the stripped completion is inspected. If that line
    contains ``$\\boxed{...}$``, the text inside the first such group is
    returned (brace nesting is respected). Otherwise the last
    whitespace-separated token of the line is returned.

    Args:
        completion: Raw text produced by the model.

    Returns:
        The extracted answer, or ``""`` when the completion is empty or
        whitespace-only (previously this raised ``IndexError``).
    """
    last_line = completion.strip().split("\n")[-1]

    marker = "$\\boxed{"
    if marker in last_line:
        # Start right after the marker so earlier "{...}" groups on the same
        # line cannot be mistaken for the answer.
        inner = last_line.split(marker, 1)[1]
        depth = 1
        for pos, char in enumerate(inner):
            if char == "{":
                depth += 1
            elif char == "}":
                depth -= 1
                if depth == 0:
                    return inner[:pos]
        # Unbalanced braces: fall back to everything after the marker.
        return inner

    words = last_line.split()
    return words[-1] if words else ""
Write a response that appropriately completes the request.\n\n### Instruction:\nSolve the following math problem without any tests or explanation only one answer surrounede by '$\\boxed{{}}$'\n{prompt}\n\n### Response:""" 3 | -------------------------------------------------------------------------------- /ktransformers/tests/dequant_gpu.py: -------------------------------------------------------------------------------- 1 | import os 2 | # os.environ["CUDA_VISIBLE_DEVICES"]="1,2" 3 | # add path 4 | import sys 5 | current_path = os.path.abspath(os.path.dirname(__file__)) 6 | sys.path.append(current_path+"/../..") 7 | import numpy as np 8 | # from ktransformers.operators.linear import KTransformersLinear, KLinearMarlin 9 | # from ktransformers.operators.experts import KTransformersExperts, KExpertsTorch 10 | from ktransformers.util.custom_loader import GGUFLoader 11 | import torch 12 | import KTransformersOps 13 | torch.set_default_dtype(torch.bfloat16) 14 | import time 15 | from transformers import ( 16 | AutoConfig, 17 | ) 18 | import os 19 | # CUDA_LAUNCH_BLOCKING=1 20 | os.environ["CUDA_LAUNCH_BLOCKING"]="1" 21 | 22 | gguf_config = GGUFLoader("/data/Qwen2-57B-A14B-Instruct-GGUF/q4_k_m") 23 | model_name = "/data/Qwen2-57B-A14B-Instruct" 24 | 25 | # Q4k 26 | key = "blk.1." 27 | target = "attn_q.weight" 28 | 29 | t1 = time.time() 30 | q_weight_cpu = gguf_config.load_gguf_tensor(key+target, "cpu") 31 | # q_weight_cpu = torch.from_numpy(q_weight_cpu) 32 | 33 | t2 = time.time() 34 | q_weight_gpu = gguf_config.load_gguf_tensor(key+target, "cuda:0") 35 | t3 = time.time() 36 | print() 37 | allclose = torch.allclose(q_weight_cpu, q_weight_gpu.cpu(), atol=1e-6) 38 | print(f"Q4k {key+target}") 39 | print("load gguf tensor from cpu cost: ", t2-t1) 40 | print("load gguf tensor from gpu cost: ", t3-t2) 41 | print("allclose: ", allclose) 42 | 43 | 44 | # Q6k 45 | key = "blk.0." 
46 | target = "ffn_down_exps.weight" 47 | 48 | t1 = time.time() 49 | q_weight_cpu = gguf_config.load_gguf_tensor(key+target, "cpu") 50 | t2 = time.time() 51 | q_weight_gpu = gguf_config.load_gguf_tensor(key+target, "cuda:0") 52 | t3 = time.time() 53 | print() 54 | allclose = torch.allclose(q_weight_cpu, q_weight_gpu.cpu().to(torch.float32), atol=1e-6) 55 | print(f"Q6k {key+target}") 56 | print("load gguf tensor from cpu cost: ", t2-t1) 57 | print("load gguf tensor from gpu cost: ", t3-t2) 58 | print("allclose: ", allclose) 59 | -------------------------------------------------------------------------------- /ktransformers/tests/dequant_gpu_t.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["CUDA_VISIBLE_DEVICES"]="1" 3 | # add path 4 | import sys 5 | sys.path.append("../..") 6 | import pycuda.autoinit 7 | import pycuda.driver as cuda 8 | from pycuda.compiler import SourceModule 9 | import numpy as np 10 | from ktransformers.operators.linear import KTransformersLinear, KLinearMarlin 11 | from ktransformers.operators.experts import KTransformersExperts, KExpertsTorch 12 | from ktransformers.util.custom_loader import GGUFLoader, dequantize_q4_k_gpu, dequantize_q4_k 13 | import torch 14 | import KTransformersOps 15 | torch.set_default_dtype(torch.bfloat16) 16 | import time 17 | from transformers import ( 18 | AutoConfig, 19 | ) 20 | 21 | gguf_config = GGUFLoader("/data/Qwen2-57B-A14B-Instruct-GGUF/q4_k_m") 22 | model_name = "/data/Qwen2-57B-A14B-Instruct" 23 | key = "blk.0." 
24 | target = "ffn_up_exps.weight" 25 | 26 | data = gguf_config.get_mmap_tensor(key + target) 27 | 28 | _, factors, offsets, qs1, qs2= dequantize_q4_k(data) 29 | factors_cpu = torch.from_numpy(factors) 30 | offsets_cpu = torch.from_numpy(offsets) 31 | qs1_cpu = torch.from_numpy(qs1) 32 | qs2_cpu = torch.from_numpy(qs2) 33 | 34 | 35 | _, factors, offsets, qs1, qs2 = dequantize_q4_k_gpu(data) 36 | 37 | print(torch.allclose(factors.cpu(), factors_cpu)) 38 | print(torch.allclose(offsets.cpu(), offsets_cpu)) 39 | print(torch.allclose(qs1.cpu(), qs1_cpu)) 40 | print(torch.allclose(qs2.cpu(), qs2_cpu)) -------------------------------------------------------------------------------- /ktransformers/tests/function_call_test.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | 3 | def send_messages(messages): 4 | response = client.chat.completions.create( 5 | model="deepseek-chat", 6 | messages=messages, 7 | tools=tools 8 | ) 9 | return response.choices[0].message 10 | 11 | client = OpenAI( 12 | api_key="placeholder", 13 | base_url="http://0.0.0.0:10002/v1", 14 | ) 15 | 16 | tools = [ 17 | { 18 | "type": "function", 19 | "function": { 20 | "name": "get_weather", 21 | "description": "Get weather of an location, the user shoud supply a location first", 22 | "parameters": { 23 | "type": "object", 24 | "properties": { 25 | "location": { 26 | "type": "string", 27 | "description": "The city and state, e.g. 
# reference: https://github.com/declare-lab/instruct-eval/blob/main/human_eval/main.py#L35
def filter_code(completion: str) -> str:
    """Trim a model completion down to the code that should be evaluated.

    Leading newlines are dropped, markdown code fences are removed, and
    anything from a ``__main__`` guard or an "# Example usage" comment
    onwards is cut off.
    """
    code = completion.lstrip("\n")

    # Strip the opening fence (with its language tag) before the bare fence.
    for fence in ("```python\n", "```"):
        code = code.replace(fence, "")

    # Keep only the text before each trailing-section marker; split() returns
    # the whole string as its first element when the marker is absent.
    for marker in ('if __name__ == "__main__":', "# Example usage"):
        code = code.split(marker)[0]

    return code
import torch

# Define a floating-point model containing a single linear layer
class LinearModel(torch.nn.Module):
    def __init__(self, in_features, out_features):
        super().__init__()
        self.linear = torch.nn.Linear(in_features, out_features)

    def forward(self, x):
        return self.linear(x)

# Create the floating-point model instance
in_features = 64
out_features = 128
model_fp32 = LinearModel(in_features, out_features)

# Create the dynamically quantized model instance
model_int8 = torch.ao.quantization.quantize_dynamic(
    model_fp32,  # the original floating-point model
    {torch.nn.Linear},  # set of layer types to quantize
    dtype=torch.qint8  # target dtype for quantization
)

# Exercise the model
batch_size = 32
input_fp32 = torch.randn(1, batch_size, in_features)  # random input data
output_int8 = model_int8(input_fp32)  # run the data through the quantized model

# Print the shapes as a sanity check
print(f"输入形状: {input_fp32.shape}")
print(f"输出形状: {output_int8.shape}")

# Compare the outputs of the original and the quantized model
with torch.no_grad():
    output_fp32 = model_fp32(input_fp32)

# NOTE(review): the outputs are 3-D (1, batch_size, out_features), so [0, :5]
# selects five rows rather than five scalars — confirm this is intended.
print(f"FP32输出的前几个值: {output_fp32[0, :5]}")
print(f"INT8输出的前几个值: {output_int8[0, :5]}")

# Compute the mean absolute error between the two outputs
error = torch.abs(output_fp32 - output_int8).mean().item()
print(f"平均绝对误差: {error}")

# Print the layer types before and after quantization
print(f"量化前模型类型: {type(model_fp32.linear)}")
print(f"量化后模型类型: {type(model_int8.linear)}")
24 | -------------------------------------------------------------------------------- /ktransformers/website/README.md: -------------------------------------------------------------------------------- 1 | # 2 | 3 | ## Project setup 4 | ``` 5 | npm install 6 | ``` 7 | 8 | ### Compiles and hot-reloads for development 9 | ``` 10 | npm run serve 11 | ``` 12 | 13 | ### Compiles and minifies for production 14 | ``` 15 | npm run build 16 | ``` 17 | 18 | ### Run your unit tests 19 | ``` 20 | npm run test:unit 21 | ``` 22 | 23 | ### Lints and fixes files 24 | ``` 25 | npm run lint 26 | ``` 27 | 28 | ### Customize configuration 29 | See [Configuration Reference](https://cli.vuejs.org/config/). 30 | -------------------------------------------------------------------------------- /ktransformers/website/config.d.ts: -------------------------------------------------------------------------------- 1 | declare module '*.js' { 2 | const config: { 3 | apiUrl: string; 4 | port:number; 5 | }; 6 | export { config }; 7 | } -------------------------------------------------------------------------------- /ktransformers/website/jest.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | preset: '@vue/cli-plugin-unit-jest/presets/typescript' 3 | } 4 | -------------------------------------------------------------------------------- /ktransformers/website/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "", 3 | "version": "", 4 | "private": true, 5 | "scripts": { 6 | "serve": "vue-cli-service serve", 7 | "build": "vue-cli-service build", 8 | "test:unit": "vue-cli-service test:unit", 9 | "lint": "vue-cli-service lint" 10 | }, 11 | "dependencies": { 12 | "@types/pdfjs-dist": "^2.10.378", 13 | "@types/websocket": "^1.0.10", 14 | "@vue/cli": "^5.0.8", 15 | "ant-design-vue": "^4.2.1", 16 | "apexcharts": "^3.49.1", 17 | "axios": "^1.7.0", 18 | "axios-extensions": "^3.1.6", 
19 | "better-scroll": "^2.5.1", 20 | "element-plus": "^2.7.3", 21 | "marked": "^12.0.2", 22 | "marked-highlight": "^2.1.1", 23 | "pdf-lib": "^1.17.1", 24 | "pdfobject": "^2.3.0", 25 | "v-clipboard": "^3.0.0-next.1", 26 | "vue": "^3.4.27", 27 | "vue-i18n": "^9.13.1", 28 | "vue-pdf": "^4.3.0", 29 | "vue-router": "^4.0.3", 30 | "vue3-apexcharts": "^1.5.3", 31 | "vuex": "^4.0.0", 32 | "webpack": "^5.91.0", 33 | "webpack-cli": "^5.1.4", 34 | "websocket": "^1.0.35" 35 | }, 36 | "devDependencies": { 37 | "@types/jest": "^27.0.1", 38 | "@types/pdfobject": "^2.2.5", 39 | "@typescript-eslint/eslint-plugin": "^5.4.0", 40 | "@typescript-eslint/parser": "^5.4.0", 41 | "@vue/cli-plugin-eslint": "~5.0.0", 42 | "@vue/cli-plugin-router": "~5.0.0", 43 | "@vue/cli-plugin-typescript": "~5.0.0", 44 | "@vue/cli-plugin-unit-jest": "~5.0.0", 45 | "@vue/cli-plugin-vuex": "~5.0.0", 46 | "@vue/cli-service": "~5.0.0", 47 | "@vue/eslint-config-typescript": "^9.1.0", 48 | "@vue/test-utils": "^2.0.0-0", 49 | "@vue/vue3-jest": "^27.0.0-alpha.1", 50 | "babel-jest": "^27.0.6", 51 | "eslint": "^7.32.0", 52 | "eslint-plugin-vue": "^8.0.3", 53 | "jest": "^27.0.5", 54 | "stylus": "^0.55.0", 55 | "stylus-loader": "^6.1.0", 56 | "ts-jest": "^27.0.4", 57 | "typescript": "~4.5.5" 58 | }, 59 | "_id": "@", 60 | "readme": "ERROR: No README data found!" 
61 | } 62 | -------------------------------------------------------------------------------- /ktransformers/website/public/balck.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/website/public/balck.ico -------------------------------------------------------------------------------- /ktransformers/website/public/config.js: -------------------------------------------------------------------------------- 1 | window.configWeb = { 2 | apiUrl: 'http://119.255.238.12:15670/v1', 3 | port: 8080, 4 | }; -------------------------------------------------------------------------------- /ktransformers/website/public/css/reset.css: -------------------------------------------------------------------------------- 1 | html, body, div, span, applet, object, iframe, 2 | h1, h2, h3, h4, h5, h6, p, blockquote, pre, 3 | a, abbr, acronym, address, big, cite, code, 4 | del, dfn, em, img, ins, kbd, q, s, samp, 5 | small, strike, strong, sub, sup, tt, var, 6 | b, u, i, center, 7 | dl, dt, dd, ol, ul, li, 8 | fieldset, form, label, legend,textarea, 9 | table, caption, tbody, tfoot, thead, tr, th, td, 10 | article, aside, canvas, details, embed, 11 | figure, figcaption, footer, header, hgroup, 12 | menu, nav, output, ruby, section, summary, 13 | time, mark, audio, video { 14 | margin: 0; 15 | padding: 0; 16 | border: 0; 17 | font-size: 100%; 18 | *font: inherit; 19 | font-family: Arial, Microsoft YaHei, SimHei, Tahoma, sans-serif !important; 20 | vertical-align: baseline; 21 | } 22 | /* HTML5 display-role reset for older browsers */ 23 | article, aside, details, figcaption, figure, 24 | footer, header, hgroup, menu, nav, section { 25 | display: block; 26 | } 27 | body { 28 | line-height: 1; 29 | -webkit-text-size-adjust: 100%!important; 30 | margin: 0; 31 | } 32 | html,body { 33 | height: 100%; 34 | width: 100%; 35 | overflow: hidden; 36 | } 37 
| ol, ul { 38 | list-style: none; 39 | } 40 | blockquote, q { 41 | quotes: none; 42 | } 43 | blockquote:before, blockquote:after, 44 | q:before, q:after { 45 | content: ''; 46 | content: none; 47 | } 48 | table { 49 | border-collapse: collapse; 50 | border-spacing: 0; 51 | } 52 | 53 | .clearfix:before, 54 | .clearfix:after { 55 | content:""; 56 | display:table 57 | } 58 | .clearfix:after { 59 | clear:both 60 | } 61 | 62 | /*显示省略号*/ 63 | .ellipsis{ 64 | overflow: hidden; 65 | text-overflow: ellipsis; 66 | white-space: nowrap; 67 | } 68 | -------------------------------------------------------------------------------- /ktransformers/website/public/images/assistant-avatar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/website/public/images/assistant-avatar.png -------------------------------------------------------------------------------- /ktransformers/website/public/images/avatar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/website/public/images/avatar.png -------------------------------------------------------------------------------- /ktransformers/website/public/images/bgbg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/website/public/images/bgbg.png -------------------------------------------------------------------------------- /ktransformers/website/public/images/logo.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/website/public/images/logo.ico 
-------------------------------------------------------------------------------- /ktransformers/website/public/images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/website/public/images/logo.png -------------------------------------------------------------------------------- /ktransformers/website/public/images/three.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/website/public/images/three.png -------------------------------------------------------------------------------- /ktransformers/website/public/images/user-filling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvcache-ai/ktransformers/64ec0ec148b472850bb9145161da43947508d9f7/ktransformers/website/public/images/user-filling.png -------------------------------------------------------------------------------- /ktransformers/website/public/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | KTransformers 11 | 12 | 13 | 16 |
17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /ktransformers/website/src/App.vue: -------------------------------------------------------------------------------- 1 | 8 | 9 | 11 | 12 | -------------------------------------------------------------------------------- /ktransformers/website/src/api/api-client.ts: -------------------------------------------------------------------------------- 1 | import axios, { AxiosInstance } from 'axios'; 2 | import {baseURL} from '@/conf/config'; 3 | const apiClient: AxiosInstance = axios.create({ 4 | baseURL: baseURL, 5 | // baseURL: '/api', 6 | headers: { 7 | 'Content-Type': 'application/json', 8 | }, 9 | withCredentials: true, 10 | }); 11 | export default apiClient; 12 | -------------------------------------------------------------------------------- /ktransformers/website/src/api/message.ts: -------------------------------------------------------------------------------- 1 | import apiClient from './api-client'; 2 | import { IMessage,IDeleteResult } from '../utils/types'; 3 | 4 | export const createMessage = async ( 5 | thread_id: string, 6 | content: string, 7 | role?: string, 8 | attachments?: any[], 9 | metadata?:{[key:string]:any} 10 | ): Promise => { 11 | const message_data: { 12 | content: string; 13 | role?: string; 14 | attachments?: any[]; 15 | metadata?:{[key:string]:any} 16 | } = { 17 | content, 18 | }; 19 | 20 | if (metadata) { 21 | message_data.metadata = metadata; 22 | } 23 | if (role) { 24 | message_data.role = role; 25 | } 26 | if (attachments) { 27 | message_data.attachments = attachments; 28 | } 29 | const response = await apiClient.post(`/threads/${thread_id}/messages`, message_data); 30 | return response.data; 31 | }; 32 | 33 | 34 | export const listMessages = async ( 35 | thread_id: string, 36 | limit?: number, 37 | order?: string, 38 | after?: string, 39 | before?: string, 40 | run_id?: string, 41 | ): Promise => { 42 | const params: { 43 | limit?: 
/**
 * Create a new thread via POST /threads.
 *
 * Only the arguments that are provided are included in the request body.
 *
 * @param message        Optional initial message, sent as `message`.
 * @param tool_resources Optional tool resources, sent as `tool_resources`
 *                       (previously accepted but silently dropped).
 * @param metadata       Optional key/value metadata, sent as `metadata`.
 * @returns The `data` payload of the response.
 */
// NOTE(review): return type assumed to be IThread (imported above) — confirm
// against utils/types.
export const createThread = async (
  message?: IMessage,
  tool_resources?: object,
  metadata?: { [key: string]: any }
): Promise<IThread> => {
  const thread_data: {
    message?: object;
    tool_resources?: object;
    metadata?: { [key: string]: any };
  } = {};
  if (message) {
    thread_data.message = message;
  }
  if (tool_resources) {
    thread_data.tool_resources = tool_resources;
  }
  if (metadata) {
    thread_data.metadata = metadata;
  }
  const response = await apiClient.post('/threads', thread_data);
  return response.data;
};
}; 35 | 36 | export const deleteThread = async ( 37 | thread_id: string 38 | ): Promise => { 39 | const response = await apiClient.delete(`/threads/${thread_id}`); 40 | return response.data; 41 | } 42 | 43 | export const getThread = async ( 44 | thread_id: string 45 | ): Promise => { 46 | const response = await apiClient.get(`/threads/${thread_id}`); 47 | return response.data; 48 | } -------------------------------------------------------------------------------- /ktransformers/website/src/assets/iconfont/iconfont.css: -------------------------------------------------------------------------------- 1 | @font-face { 2 | font-family: "iconfont"; /* Project id 4550268 */ 3 | src: url('iconfont.woff2?t=1717950820214') format('woff2'), 4 | url('iconfont.woff?t=1717950820214') format('woff'), 5 | url('iconfont.ttf?t=1717950820214') format('truetype'), 6 | url('iconfont.svg?t=1717950820214#iconfont') format('svg'); 7 | } 8 | 9 | .iconfont { 10 | font-family: "iconfont" !important; 11 | font-size: 16px; 12 | font-style: normal; 13 | -webkit-font-smoothing: antialiased; 14 | -moz-osx-font-smoothing: grayscale; 15 | } 16 | 17 | .icon-copy:before { 18 | content: "\e8b0"; 19 | } 20 | 21 | .icon-arrow-down:before { 22 | content: "\e85e"; 23 | } 24 | 25 | .icon-usage-progress:before { 26 | content: "\e651"; 27 | } 28 | 29 | .icon-gen-progress:before { 30 | content: "\e617"; 31 | } 32 | 33 | .icon-back:before { 34 | content: "\e779"; 35 | } 36 | 37 | .icon-point:before { 38 | content: "\e608"; 39 | } 40 | 41 | .icon-edit:before { 42 | content: "\e7dd"; 43 | } 44 | 45 | .icon-delete:before { 46 | content: "\e614"; 47 | } 48 | 49 | .icon-upload-1:before { 50 | content: "\e618"; 51 | } 52 | 53 | .icon-explore:before { 54 | content: "\e621"; 55 | } 56 | 57 | .icon-ellipsis:before { 58 | content: "\e657"; 59 | } 60 | 61 | .icon-sent:before { 62 | content: "\e60c"; 63 | } 64 | 65 | .icon-list-list:before { 66 | content: "\e62d"; 67 | } 68 | 69 | .icon-list-icon:before { 70 | content: 
// Shape of the runtime configuration object that public/config.js assigns to
// window.configWeb before the application bundle runs.
declare global {
  interface Window {
    configWeb: {
      apiUrl: string;
      // public/config.js assigns a numeric literal (port: 8080), so this is
      // typed as number rather than string.
      port: number;
    };
  }
}

// Base URL used for API requests.
export const baseURL = window.configWeb.apiUrl;
// Port value from the runtime configuration.
export const basePort = window.configWeb.port;
'./zh' 4 | import en from './en' 5 | 6 | const messages = { 7 | en, 8 | zh, 9 | } 10 | const language = (navigator.language || 'en').toLocaleLowerCase() // 这是获取浏览器的语言 11 | const i18n = createI18n({ 12 | legacy: false, // you must set `false`, to use Compostion API 13 | locale: localStorage.getItem('lang') || language.split('-')[0] || 'en', // 首先从缓存里拿,没有的话就用浏览器语言, 14 | fallbackLocale: 'en', // 设置备用语言 15 | messages, 16 | }) 17 | 18 | export default i18n -------------------------------------------------------------------------------- /ktransformers/website/src/locals/zh.js: -------------------------------------------------------------------------------- 1 | // zh.js 2 | export default { 3 | home: { 4 | explore: '探索', 5 | language: '选择语言', 6 | english: '英语', 7 | chinese: '中文', 8 | today: '今天', 9 | previous:'历史', 10 | withoutAssistantTip:'本记录的KTransformers已被删除,用户只能查看历史对话信息而无法继续对话!', 11 | deleteThreadTip:'删除记录会清除历史信息哦~' 12 | }, 13 | chat:{ 14 | inputTip:"发送信息和 KTransformers 畅聊吧~", 15 | }, 16 | explore:{ 17 | description: "基于Lexllama,一起来创建你的专属KTransformers吧~", 18 | configuring: "配置中", 19 | completed: "完成", 20 | assistantName: "名称", 21 | assistantDescription: "描述", 22 | assistantStatus: "Status", 23 | createAssistant: "创建新的KTransformers", 24 | deleteAssistant: "是否确认删除KTransformers,删除KTransformers之后其KVCache也会被同步清理掉哦~", 25 | }, 26 | config:{ 27 | title:'配置你的KTransformers', 28 | fileTip:"仅支持上传文件格式为 .text, docx, .ppt, .pdf format.", 29 | secletFile:'选择文件', 30 | outOfSize:'文件大小超出10MB,请重新选择', 31 | fileExist:'文件已存在,请重新选择', 32 | createAssistant:'KTransformers创建成功,点击build按钮开始构建KVCache', 33 | }, 34 | build:{ 35 | title:'构建日志', 36 | step1:'解析上传文件', 37 | parsingFileStep1:'文件上传接收完成', 38 | parsingFileStep2:{ 39 | parse:"正在解析第", 40 | file:"文件", 41 | total:'共', 42 | }, 43 | parsingFileStep3:'Prompt装载完毕,准备生成KVCache', 44 | step2:'生成 KVCache', 45 | generateStep1:'生成KVCache计算计划', 46 | generateStep2:{ 47 | calculate:"正在计算", 48 | token:"tokens", 49 | total:'共', 50 | }, 51 | 
generateStep3:'KVCache已生成完成', 52 | durationTime:'持续时间:', 53 | remainTime:'剩余时间:', 54 | buildProgress:'构建进度', 55 | storageUsage:'存储使用:', 56 | 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /ktransformers/website/src/main.ts: -------------------------------------------------------------------------------- 1 | import { createApp } from 'vue' 2 | import App from './App.vue' 3 | import router from './router' 4 | import store from './store' 5 | import ElementPlus from 'element-plus' 6 | import 'element-plus/dist/index.css' 7 | import VueApexCharts from "vue3-apexcharts" 8 | import i18n from '@/locals' 9 | 10 | const app = createApp(App) 11 | 12 | app.use(ElementPlus) 13 | 14 | app.use(i18n) 15 | app.use(VueApexCharts) 16 | app.use(store) 17 | app.use(router) 18 | app.mount('#app') 19 | -------------------------------------------------------------------------------- /ktransformers/website/src/router/index.ts: -------------------------------------------------------------------------------- 1 | import { createRouter, createWebHashHistory, RouteRecordRaw, createWebHistory } from 'vue-router' 2 | import HomeView from '@/views/home.vue' 3 | 4 | const routes: Array = [ 5 | { 6 | path: '/', 7 | name: 'home', 8 | component: HomeView, 9 | redirect: '/chat', 10 | children: [{ 11 | path: '/chat', 12 | name: '', 13 | component: () => import(/* webpackChunkName: "about" */ '../components/chat/index.vue') 14 | },] 15 | }, 16 | 17 | ] 18 | 19 | const router = createRouter({ 20 | history: createWebHashHistory(), 21 | routes 22 | }) 23 | 24 | export default router 25 | -------------------------------------------------------------------------------- /ktransformers/website/src/shims-vue.d.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable */ 2 | declare module '*.vue' { 3 | import type { DefineComponent } from 'vue' 4 | const component: DefineComponent<{}, {}, any> 5 | export default 
component 6 | 7 | } 8 | 9 | declare module '@/locals' 10 | declare module 'pdfobject'; 11 | -------------------------------------------------------------------------------- /ktransformers/website/src/store/index.ts: -------------------------------------------------------------------------------- 1 | import { createStore } from 'vuex' 2 | 3 | export default createStore({ 4 | state: { 5 | }, 6 | getters: { 7 | }, 8 | mutations: { 9 | }, 10 | actions: { 11 | }, 12 | modules: { 13 | } 14 | }) 15 | -------------------------------------------------------------------------------- /ktransformers/website/tests/unit/example.spec.ts: -------------------------------------------------------------------------------- 1 | import { shallowMount } from '@vue/test-utils' 2 | import HelloWorld from '@/components/HelloWorld.vue' 3 | 4 | describe('HelloWorld.vue', () => { 5 | it('renders props.msg when passed', () => { 6 | const msg = 'new message' 7 | const wrapper = shallowMount(HelloWorld, { 8 | props: { msg } 9 | }) 10 | expect(wrapper.text()).toMatch(msg) 11 | }) 12 | }) 13 | -------------------------------------------------------------------------------- /ktransformers/website/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "es5", 4 | "module": "esnext", 5 | "strict": true, 6 | "jsx": "preserve", 7 | "importHelpers": true, 8 | "moduleResolution": "node", 9 | "skipLibCheck": true, 10 | "esModuleInterop": true, 11 | "allowSyntheticDefaultImports": true, 12 | "forceConsistentCasingInFileNames": true, 13 | "useDefineForClassFields": true, 14 | "sourceMap": true, 15 | "allowJs": true, 16 | "baseUrl": ".", 17 | "types": [ 18 | "webpack-env", 19 | "jest" 20 | ], 21 | "paths": { 22 | "@/*": [ 23 | "src/*" 24 | ] 25 | }, 26 | "lib": [ 27 | "esnext", 28 | "dom", 29 | "dom.iterable", 30 | "scripthost" 31 | ] 32 | }, 33 | "include": [ 34 | "src/**/*.ts", 35 | "src/**/*.tsx", 36 | "src/**/*.vue", 37 | 
"tests/**/*.ts", 38 | "tests/**/*.tsx", 39 | "config.d.ts" 40 | ], 41 | 42 | "exclude": [ 43 | "node_modules" 44 | ] 45 | } -------------------------------------------------------------------------------- /ktransformers/website/vue.config.js: -------------------------------------------------------------------------------- 1 | 2 | module.exports = { 3 | // 配置 webpack-dev-server 行为。 4 | devServer: { 5 | open: false, // 编译后默认打开浏览器 6 | host: '0.0.0.0', // 域名 7 | port: 8082, // 端口 8 | https: false, // 是否https 9 | proxy: { 10 | '/api': { 11 | target: 'http://localhost:9016/v1', // 你的后端服务器地址 12 | changeOrigin: true, // 是否允许跨域 13 | pathRewrite: { 14 | '/api': '' // 将 '/api' 前缀替换为空,如果你的后端不需要这个前缀 15 | } 16 | } 17 | } 18 | }, 19 | publicPath: '/web/', // 基本路径 20 | outputDir: 'dist', // 构建时的输出目录 21 | assetsDir: 'static', // 放置静态资源的目录 22 | indexPath: 'index.html', // html 的输出路径 23 | filenameHashing: true, // 文件名哈希值 24 | lintOnSave: false, // 是否在保存的时候使用 `eslint-loader` 进行检查。 25 | 26 | // 组件是如何被渲染到页面中的? 
(ast:抽象语法树;vDom:虚拟DOM) 27 | // template ---> ast ---> render ---> vDom ---> 真实的Dom ---> 页面 28 | // runtime-only:将template在打包的时候,就已经编译为render函数 29 | // runtime-compiler:在运行的时候才去编译template 30 | runtimeCompiler: false, 31 | 32 | transpileDependencies: [], // babel-loader 默认会跳过 node_modules 依赖。 33 | productionSourceMap: false, // 是否为生产环境构建生成 source map 34 | 35 | //调整内部的 webpack 配置 36 | configureWebpack: () => {}, 37 | 38 | chainWebpack: () => {}, 39 | 40 | } -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools", 4 | "torch >= 2.3.0", 5 | "ninja", 6 | "packaging", 7 | "cpufeature" 8 | ] 9 | build-backend = "setuptools.build_meta" 10 | 11 | [project] 12 | 13 | name = "ktransformers" 14 | 15 | dynamic = ["version"] 16 | 17 | dependencies = [ 18 | "torch >= 2.3.0", 19 | "transformers == 4.51.3", 20 | "fastapi >= 0.111.0", 21 | "uvicorn >= 0.30.1", 22 | "langchain >= 0.2.0", 23 | "blessed >= 1.20.0", 24 | "accelerate >= 0.31.0", 25 | "sentencepiece >= 0.1.97", 26 | "setuptools", 27 | "ninja", 28 | "wheel", 29 | "colorlog", 30 | "build", 31 | "fire", 32 | "protobuf", 33 | ] 34 | 35 | requires-python = ">=3.10" 36 | 37 | authors = [ 38 | {name = "KVCache.AI", email = "zhang.mingxing@outlook.com"} 39 | ] 40 | 41 | maintainers = [ 42 | {name = "james0zan", email = "zhang.mingxing@outlook.com"}, 43 | {name = "awake", email = "awake@approaching.ai"}, 44 | {name = "unicorn chan", email = "nl@approaching.ai"} 45 | ] 46 | 47 | description = "KTransformers, pronounced as Quick Transformers, is designed to enhance your Transformers experience with advanced kernel optimizations and placement/parallelism strategies." 
48 | 49 | readme = "README.md" 50 | license = {file = "LICENSE"} 51 | 52 | keywords = ["ktransformers", "llm"] 53 | 54 | classifiers = [ 55 | "Development Status :: 4 - Beta", 56 | "Programming Language :: Python :: 3.10", 57 | "Programming Language :: Python :: 3.11", 58 | "Programming Language :: Python :: 3.12" 59 | ] 60 | 61 | [project.urls] 62 | Homepage = "https://kvcache.ai" 63 | Repository = "https://github.com/kvcache-ai/ktransformers.git" 64 | Issues = "https://github.com/kvcache-ai/ktransformers/issues" 65 | 66 | 67 | [project.scripts] 68 | ktransformers = "ktransformers.server.main:main" 69 | 70 | [tool.setuptools.packages.find] 71 | where = ["./", ] 72 | include = ["ktransformers"] 73 | [tool.black] 74 | line-length = 120 75 | preview = true 76 | unstable = true 77 | -------------------------------------------------------------------------------- /requirements-local_chat.txt: -------------------------------------------------------------------------------- 1 | fire 2 | transformers==4.51.3 3 | numpy 4 | torch>=2.3.0 5 | packaging 6 | cpufeature; sys_platform == 'win32' or sys_platform == 'Windows' 7 | protobuf 8 | tiktoken 9 | blobfile 10 | -------------------------------------------------------------------------------- /third_party/llamafile/README.md: -------------------------------------------------------------------------------- 1 | The code in this folder is copied from [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile). Special thanks to the Mozilla-Ocho team. 2 | -------------------------------------------------------------------------------- /third_party/llamafile/bench.h: -------------------------------------------------------------------------------- 1 | // Adapted from 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/bench.h 3 | // Copyrigth 2024 Mozilla Foundation. 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved. 
5 | 6 | // -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*- 7 | // vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi 8 | #pragma once 9 | 10 | #include 11 | 12 | #include "micros.h" 13 | 14 | #define BENCH(x) \ 15 | do { \ 16 | x; \ 17 | __asm__ volatile("" ::: "memory"); \ 18 | long long start = micros(); \ 19 | for (int i = 0; i < ITERATIONS; ++i) { \ 20 | __asm__ volatile("" ::: "memory"); \ 21 | x; \ 22 | __asm__ volatile("" ::: "memory"); \ 23 | } \ 24 | printf("%9lld us %s\n", (micros() - start + ITERATIONS - 1) / ITERATIONS, #x); \ 25 | } while (0) 26 | -------------------------------------------------------------------------------- /third_party/llamafile/flags.cpp: -------------------------------------------------------------------------------- 1 | // Adapted from 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/flags.cpp 3 | // Copyright 2024 Mozilla Foundation. 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved. 5 | 6 | #include "flags.h" 7 | 8 | bool FLAG_precise = false; 9 | -------------------------------------------------------------------------------- /third_party/llamafile/flags.h: -------------------------------------------------------------------------------- 1 | // Adapted from 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/flags.cpp 3 | // Copyright 2024 Mozilla Foundation. 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved. 5 | 6 | #pragma once 7 | 8 | extern bool FLAG_precise; 9 | -------------------------------------------------------------------------------- /third_party/llamafile/iqk_mul_mat_amd_avx2.cpp: -------------------------------------------------------------------------------- 1 | // Adapted from 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/iqk_mul_mat_amd_avx2.cpp 3 | // Copyright 2024 Iwan Kawrakow. 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved. 
5 | 6 | #if defined(__x86_64__) || defined(_M_X64) 7 | #include "iqk_mul_mat.inc" 8 | #endif // __x86_64__ 9 | -------------------------------------------------------------------------------- /third_party/llamafile/iqk_mul_mat_amd_zen4.cpp: -------------------------------------------------------------------------------- 1 | // Adapted from 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/iqk_mul_mat_amd_zen4.cpp 3 | // Copyright 2024 Iwan Kawrakow. 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved. 5 | 6 | #if defined(__x86_64__) || defined(_M_X64) 7 | #define iqk_mul_mat iqk_mul_mat_zen4 8 | #define iqk_mul_mat_moe iqk_mul_mat_moe_zen4 9 | #include "iqk_mul_mat.inc" 10 | #endif // __x86_64__ 11 | -------------------------------------------------------------------------------- /third_party/llamafile/iqk_mul_mat_arm82.cpp: -------------------------------------------------------------------------------- 1 | // Adapted from 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/iqk_mul_mat_arm82.cpp 3 | // Copyright 2024 Iwan Kawrakow. 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved. 5 | 6 | #ifdef __aarch64__ 7 | #define iqk_mul_mat iqk_mul_mat_arm82 8 | #define iqk_mul_mat_moe iqk_mul_mat_moe_arm82 9 | #include "iqk_mul_mat.inc" 10 | #endif // __aarch64__ 11 | -------------------------------------------------------------------------------- /third_party/llamafile/macros.h: -------------------------------------------------------------------------------- 1 | // Adapted from 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/macros.h 3 | // Copyright 2024 Mozilla Foundation. 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved. 5 | 6 | // -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*- 7 | // vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi 8 | #pragma once 9 | 10 | #define MIN(X, Y) ((Y) > (X) ? (X) : (Y)) 11 | #define MAX(X, Y) ((Y) < (X) ? 
(X) : (Y)) 12 | #define CEIL_DIV(M, N) (((M) + (N) - 1) / (N)) 13 | #define ROUNDUP(X, K) (((X) + (K) - 1) & -(K)) 14 | #define ARRAYLEN(A) ((sizeof(A) / sizeof(*(A))) / ((unsigned)!(sizeof(A) % sizeof(*(A))))) 15 | -------------------------------------------------------------------------------- /third_party/llamafile/micros.h: -------------------------------------------------------------------------------- 1 | // Adapted from 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/micros.h 3 | // Copyright 2024 Mozilla Foundation. 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved. 5 | 6 | // -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*- 7 | // vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi 8 | #pragma once 9 | 10 | #include 11 | 12 | #ifndef _WIN32 13 | #include 14 | #else 15 | #include 16 | #endif 17 | 18 | #ifdef _WIN32 19 | static long long GetQueryPerformanceFrequency() { 20 | LARGE_INTEGER t; 21 | QueryPerformanceFrequency(&t); 22 | return t.QuadPart; 23 | } 24 | static long long GetQueryPerformanceCounter() { 25 | LARGE_INTEGER t; 26 | QueryPerformanceCounter(&t); 27 | return t.QuadPart; 28 | } 29 | #endif 30 | 31 | static long long micros(void) { 32 | #ifndef _WIN32 33 | struct timespec ts; 34 | clock_gettime(CLOCK_REALTIME, &ts); 35 | return ts.tv_sec * 1000000 + (ts.tv_nsec + 999) / 1000; 36 | #else 37 | static long long timer_freq = GetQueryPerformanceFrequency(); 38 | static long long timer_start = GetQueryPerformanceCounter(); 39 | return ((GetQueryPerformanceCounter() - timer_start) * 1000000) / timer_freq; 40 | #endif 41 | } 42 | -------------------------------------------------------------------------------- /third_party/llamafile/numba.h: -------------------------------------------------------------------------------- 1 | // Adapted from 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/numba.h 3 | // Copyright 2024 Mozilla Foundation. 
4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved. 5 | 6 | #pragma once 7 | 8 | inline int rand32(void) { 9 | static unsigned long long lcg = 1; 10 | lcg *= 6364136223846793005; 11 | lcg += 1442695040888963407; 12 | return lcg >> 32; 13 | } 14 | 15 | inline int popcount(unsigned x) { 16 | x = x - ((x >> 1) & 0x55555555); 17 | x = ((x >> 2) & 0x33333333) + (x & 0x33333333); 18 | x = (x + (x >> 4)) & 0x0F0F0F0F; 19 | x = (x + (x >> 16)); 20 | return (x + (x >> 8)) & 0x0000003F; 21 | } 22 | 23 | inline int hamming(int x, int y) { 24 | return popcount(x ^ y); 25 | } 26 | 27 | inline float float01(unsigned x) { // (0,1) 28 | return 1.f / 8388608 * ((x >> 9) + .5f); 29 | } 30 | 31 | inline float numba(void) { // (-1,1) 32 | return float01(rand32()) * 2.f - 1.f; 33 | } 34 | 35 | template <typename T> 36 | void randomize(T* A, int n) { 37 | for (int i = 0; i < n; ++i) 38 | A[i] = numba(); 39 | } 40 | 41 | template <typename T> 42 | void randomize(int m, int n, T* A, int lda) { 43 | for (int j = 0; j < n; ++j) 44 | for (int i = 0; i < m; ++i) 45 | A[lda * j + i] = numba(); 46 | } 47 | 48 | template <typename T, typename U> 49 | void broadcast(T* A, int n, U x) { 50 | for (int i = 0; i < n; ++i) 51 | A[i] = x; 52 | } 53 | 54 | template <typename T, typename U> 55 | void broadcast(int m, int n, T* A, int lda, U x) { 56 | for (int j = 0; j < n; ++j) 57 | for (int i = 0; i < m; ++i) 58 | A[lda * j + i] = x; 59 | } 60 | -------------------------------------------------------------------------------- /third_party/llamafile/tinyblas_cpu_mixmul_amd_avx.cpp: -------------------------------------------------------------------------------- 1 | // Adapted from 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_amd_avx.cpp 3 | // Copyright 2024 Mozilla Foundation. 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved. 
5 | 6 | #if defined(__x86_64__) || defined(_M_X64) 7 | #define llamafile_mixmul llamafile_mixmul_amd_avx 8 | #include "tinyblas_cpu_mixmul.inc" 9 | 10 | /** 11 | * Returns number of shared memory bytes llamafile_mixmul() needs. 12 | */ 13 | size_t llamafile_mixmul_needs(const ggml_tensor* weights, const ggml_tensor* thought, const ggml_tensor* plan) { 14 | ggml_compute_params params{}; 15 | params.wsize = 0x7ffff000; 16 | params.wdata = (void*)0x1000; 17 | MixMul mm{&params, weights, thought, plan, 0}; 18 | if (mm.allocate_shared_memory()) 19 | return mm.get_allocated_bytes(); 20 | else 21 | return 0; 22 | } 23 | 24 | #endif // __x86_64__ 25 | -------------------------------------------------------------------------------- /third_party/llamafile/tinyblas_cpu_mixmul_amd_avx2.cpp: -------------------------------------------------------------------------------- 1 | // Adapted from 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_amd_avx2.cpp 3 | // Copyright 2024 Mozilla Foundation. 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved. 5 | 6 | #if defined(__x86_64__) || defined(_M_X64) 7 | #define llamafile_mixmul llamafile_mixmul_amd_avx2 8 | #include "tinyblas_cpu_mixmul.inc" 9 | #endif // __x86_64__ 10 | -------------------------------------------------------------------------------- /third_party/llamafile/tinyblas_cpu_mixmul_amd_avx512f.cpp: -------------------------------------------------------------------------------- 1 | // Adapted from 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_amd_avx512f.cpp 3 | // Copyright 2024 Mozilla Foundation. 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved. 
5 | 6 | #if defined(__x86_64__) || defined(_M_X64) 7 | #define llamafile_mixmul llamafile_mixmul_amd_avx512f 8 | #include "tinyblas_cpu_mixmul.inc" 9 | #endif // __x86_64__ 10 | -------------------------------------------------------------------------------- /third_party/llamafile/tinyblas_cpu_mixmul_amd_avxvnni.cpp: -------------------------------------------------------------------------------- 1 | // Adapted from 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_amd_avxvnni.cpp 3 | // Copyright 2024 Mozilla Foundation. 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved. 5 | 6 | #if defined(__x86_64__) || defined(_M_X64) 7 | #define llamafile_mixmul llamafile_mixmul_amd_avxvnni 8 | #include "tinyblas_cpu_mixmul.inc" 9 | #endif // __x86_64__ 10 | -------------------------------------------------------------------------------- /third_party/llamafile/tinyblas_cpu_mixmul_amd_fma.cpp: -------------------------------------------------------------------------------- 1 | // Adapted from 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_amd_fma.cpp 3 | // Copyright 2024 Mozilla Foundation. 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved. 5 | 6 | #if defined(__x86_64__) || defined(_M_X64) 7 | #define llamafile_mixmul llamafile_mixmul_amd_fma 8 | #include "tinyblas_cpu_mixmul.inc" 9 | #endif // __x86_64__ 10 | -------------------------------------------------------------------------------- /third_party/llamafile/tinyblas_cpu_mixmul_amd_zen4.cpp: -------------------------------------------------------------------------------- 1 | // Adapted from 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_amd_zen4.cpp 3 | // Copyright 2024 Mozilla Foundation. 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved. 
5 | 6 | #if defined(__x86_64__) || defined(_M_X64) 7 | #define llamafile_mixmul llamafile_mixmul_amd_zen4 8 | #include "tinyblas_cpu_mixmul.inc" 9 | #endif // __x86_64__ 10 | -------------------------------------------------------------------------------- /third_party/llamafile/tinyblas_cpu_mixmul_arm80.cpp: -------------------------------------------------------------------------------- 1 | // Adapted from 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_arm80.cpp 3 | // Copyright 2024 Mozilla Foundation. 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved. 5 | 6 | #ifdef __aarch64__ 7 | #define llamafile_mixmul llamafile_mixmul_arm80 8 | #include "tinyblas_cpu_mixmul.inc" 9 | 10 | /** 11 | * Returns number of shared memory bytes llamafile_mixmul() needs. 12 | */ 13 | size_t llamafile_mixmul_needs(const ggml_tensor* weights, const ggml_tensor* thought, const ggml_tensor* plan) { 14 | ggml_compute_params params{}; 15 | params.wsize = 0x7ffff000; 16 | params.wdata = (void*)0x1000; 17 | MixMul mm{&params, weights, thought, plan, 0}; 18 | if (mm.allocate_shared_memory()) 19 | return mm.get_allocated_bytes(); 20 | else 21 | return 0; 22 | } 23 | 24 | #endif // __aarch64__ 25 | -------------------------------------------------------------------------------- /third_party/llamafile/tinyblas_cpu_mixmul_arm82.cpp: -------------------------------------------------------------------------------- 1 | // Adapted from 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_mixmul_arm82.cpp 3 | // Copyright 2024 Mozilla Foundation. 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved. 
5 | 6 | #ifdef __aarch64__ 7 | #define llamafile_mixmul llamafile_mixmul_arm82 8 | #include "tinyblas_cpu_mixmul.inc" 9 | #endif // __aarch64__ 10 | -------------------------------------------------------------------------------- /third_party/llamafile/tinyblas_cpu_sgemm_amd_avx.cpp: -------------------------------------------------------------------------------- 1 | // Adapted from 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_amd_avx.cpp 3 | // Copyright 2024 Mozilla Foundation. 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved. 5 | 6 | #if defined(__x86_64__) || defined(_M_X64) 7 | #define llamafile_sgemm llamafile_sgemm_amd_avx 8 | #include "tinyblas_cpu_sgemm.inc" 9 | #endif // __x86_64__ 10 | -------------------------------------------------------------------------------- /third_party/llamafile/tinyblas_cpu_sgemm_amd_avx2.cpp: -------------------------------------------------------------------------------- 1 | // Adapted from 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_amd_avx2.cpp 3 | // Copyright 2024 Mozilla Foundation. 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved. 5 | 6 | #if defined(__x86_64__) || defined(_M_X64) 7 | #define llamafile_sgemm llamafile_sgemm_amd_avx2 8 | #include "tinyblas_cpu_sgemm.inc" 9 | #endif // __x86_64__ 10 | -------------------------------------------------------------------------------- /third_party/llamafile/tinyblas_cpu_sgemm_amd_avx512f.cpp: -------------------------------------------------------------------------------- 1 | // Adapted from 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_amd_avx512f.cpp 3 | // Copyright 2024 Mozilla Foundation. 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved. 
5 | 6 | #if defined(__x86_64__) || defined(_M_X64) 7 | #define llamafile_sgemm llamafile_sgemm_amd_avx512f 8 | #include "tinyblas_cpu_sgemm.inc" 9 | #endif // __x86_64__ 10 | -------------------------------------------------------------------------------- /third_party/llamafile/tinyblas_cpu_sgemm_amd_avxvnni.cpp: -------------------------------------------------------------------------------- 1 | // Adapted from 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_amd_avxvnni.cpp 3 | // Copyright 2024 Mozilla Foundation. 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved. 5 | 6 | #if defined(__x86_64__) || defined(_M_X64) 7 | #define llamafile_sgemm llamafile_sgemm_amd_avxvnni 8 | #include "tinyblas_cpu_sgemm.inc" 9 | #endif // __x86_64__ 10 | -------------------------------------------------------------------------------- /third_party/llamafile/tinyblas_cpu_sgemm_amd_fma.cpp: -------------------------------------------------------------------------------- 1 | // Adapted from 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_amd_fma.cpp 3 | // Copyright 2024 Mozilla Foundation. 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved. 5 | 6 | #if defined(__x86_64__) || defined(_M_X64) 7 | #define llamafile_sgemm llamafile_sgemm_amd_fma 8 | #include "tinyblas_cpu_sgemm.inc" 9 | #endif // __x86_64__ 10 | -------------------------------------------------------------------------------- /third_party/llamafile/tinyblas_cpu_sgemm_amd_zen4.cpp: -------------------------------------------------------------------------------- 1 | // Adapted from 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_amd_zen4.cpp 3 | // Copyright 2024 Mozilla Foundation. 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved. 
5 | 6 | #if defined(__x86_64__) || defined(_M_X64) 7 | #define llamafile_sgemm llamafile_sgemm_amd_zen4 8 | #define iqk_mul_mat iqk_mul_mat_zen4 9 | #include "tinyblas_cpu_sgemm.inc" 10 | #endif // __x86_64__ 11 | -------------------------------------------------------------------------------- /third_party/llamafile/tinyblas_cpu_sgemm_arm80.cpp: -------------------------------------------------------------------------------- 1 | // Adapted from 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_arm80.cpp 3 | // Copyright 2024 Mozilla Foundation. 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved. 5 | 6 | #ifdef __aarch64__ 7 | #define llamafile_sgemm llamafile_sgemm_arm80 8 | #include "tinyblas_cpu_sgemm.inc" 9 | #endif // __aarch64__ 10 | -------------------------------------------------------------------------------- /third_party/llamafile/tinyblas_cpu_sgemm_arm82.cpp: -------------------------------------------------------------------------------- 1 | // Adapted from 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_sgemm_arm82.cpp 3 | // Copyright 2024 Mozilla Foundation. 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved. 5 | 6 | #ifdef __aarch64__ 7 | #define llamafile_sgemm llamafile_sgemm_arm82 8 | #define iqk_mul_mat iqk_mul_mat_arm82 9 | #include "tinyblas_cpu_sgemm.inc" 10 | #endif // __aarch64__ 11 | -------------------------------------------------------------------------------- /third_party/llamafile/tinyblas_cpu_unsupported.cpp: -------------------------------------------------------------------------------- 1 | // Adapted from 2 | // https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/tinyblas_cpu_unsupported.cpp 3 | // Copyright 2024 Mozilla Foundation. 4 | // Copyright(c) 2024 by KVCache.AI, All Rights Reserved. 
5 | 6 | // -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*- 7 | // vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi 8 | // 9 | // Copyright 2024 Mozilla Foundation 10 | // 11 | // Licensed under the Apache License, Version 2.0 (the "License"); 12 | // you may not use this file except in compliance with the License. 13 | // You may obtain a copy of the License at 14 | // 15 | // http://www.apache.org/licenses/LICENSE-2.0 16 | // 17 | // Unless required by applicable law or agreed to in writing, software 18 | // distributed under the License is distributed on an "AS IS" BASIS, 19 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 20 | // See the License for the specific language governing permissions and 21 | // limitations under the License. 22 | 23 | #include "sgemm.h" 24 | 25 | bool llamafile_sgemm_unsupported(long m, long n, long k, const void* A, long lda, const void* B, long ldb, void* C, long ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype, int precision) { 26 | return false; 27 | } 28 | 29 | bool llamafile_mixmul_unsupported(const struct ggml_compute_params* params, 30 | const struct ggml_tensor* weights, 31 | const struct ggml_tensor* thought, 32 | const struct ggml_tensor* plan, 33 | struct ggml_tensor* result) { 34 | return false; 35 | } 36 | 37 | bool iqk_mul_mat_moe_unsupported(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int) { 38 | return false; 39 | } 40 | --------------------------------------------------------------------------------