├── mistralrs-core ├── src │ ├── cuda │ │ └── mod.rs │ ├── diffusion_models │ │ ├── clip │ │ │ └── mod.rs │ │ ├── flux │ │ │ └── mod.rs │ │ └── mod.rs │ ├── paged_attention │ │ ├── layers │ │ │ └── mod.rs │ │ ├── block_engine_sequence.rs │ │ └── config.rs │ ├── dummy_paged_attention │ │ ├── layers │ │ │ └── mod.rs │ │ ├── block_engine_sequence.rs │ │ ├── config.rs │ │ └── cache_engine.rs │ ├── aici │ │ ├── README.md │ │ ├── mod.rs │ │ └── bytes.rs │ ├── vision_models │ │ ├── llava │ │ │ └── mod.rs │ │ ├── processor_config.rs │ │ ├── mod.rs │ │ ├── image_processor.rs │ │ └── preprocessor_config.rs │ ├── layers_utils.rs │ ├── models │ │ └── mod.rs │ ├── utils │ │ ├── log.rs │ │ ├── debug.rs │ │ ├── tokens.rs │ │ ├── tokenizer.rs │ │ └── memory_usage.rs │ ├── tools │ │ ├── response.rs │ │ └── request.rs │ ├── gguf │ │ ├── chat_template.rs │ │ └── mod.rs │ ├── amoe │ │ ├── macros.rs │ │ └── inputs.rs │ ├── xlora_models │ │ └── config.rs │ └── scheduler │ │ └── mod.rs └── README.md ├── .gitignore ├── mistralrs-paged-attn ├── README.md ├── src │ ├── attention │ │ ├── attention_dtypes.h │ │ ├── attention_generic.cuh │ │ └── attention_utils.cuh │ ├── lib.rs │ ├── cuda_compat.h │ ├── ffi.rs │ └── backend │ │ └── mod.rs └── Cargo.toml ├── .dockerignore ├── mistralrs-pyo3 ├── build.rs ├── pyproject.toml ├── pyproject_template.toml ├── .gitignore ├── Cargo_template.toml ├── Cargo.toml └── src │ └── stream.rs ├── toml-selectors ├── plain.toml ├── lora.toml ├── xlora.toml ├── gguf.toml ├── speculative-gguf.toml ├── speculative-same-gguf.toml ├── anymoe.toml └── anymoe_lora.toml ├── topologies ├── isq.yml └── isq_and_device.yml ├── mistralrs-server ├── resources │ ├── rust-logo-32x32.png │ └── LICENSE.md └── Cargo.toml ├── examples ├── README.md ├── server │ ├── flux.py │ ├── streaming_completion.py │ ├── yacc.py │ ├── streaming.py │ ├── stream_completion_bench.py │ ├── regex.py │ ├── completion.py │ ├── chat.py │ ├── adapter_chat.py │ ├── idefics2.py │ ├── phi3v_local_img.py │ ├── llava.py │ ├── phi3v.py │ ├── llama_vision.py │ ├── llava_next.py │ └── phi3v_base64.py ├── python │ ├── flux.py │ ├── plain.py │ ├── isq.py │ ├── paged_attention.py │ ├── mixture_of_quant_experts.py │ ├── topology.py │ ├── gguf.py │ ├── streaming.py │ ├── xlora_gemma.py │ ├── token_source.py │ ├── lora_activation.py │ ├── xlora_zephyr.py │ ├── speculative.py │ ├── lora_zephyr.py │ ├── idefics2.py │ ├── speculative_xlora.py │ ├── phi3v_local_img.py │ ├── anymoe.py │ ├── phi3v.py │ ├── llava_next.py │ ├── anymoe_inference.py │ ├── anymoe_lora.py │ ├── phi3v_base64.py │ └── llama_vision.py └── amoe.json ├── orderings └── lora-paper-ordering.json ├── scripts ├── get_tokenizers_json.py ├── testgen_text.py ├── set_names.py ├── lora_add_preload_adapters.py ├── testgen_vision.py └── create_ordering.py ├── mistralrs-vision ├── README.md ├── Cargo.toml ├── src │ ├── utils.rs │ └── ops.rs └── tests │ └── integration.rs ├── .github ├── ISSUE_TEMPLATE │ ├── feature_request.md │ ├── bug_report.md │ └── build_failure.md └── workflows │ ├── release_python.yml │ ├── docs.yml │ └── analysis.yaml ├── chat_templates ├── chatml.json ├── phi3.json ├── llama3.json ├── phi3.5.json ├── mistral.json ├── llama2.json ├── default.json └── vicuna.json ├── .typos.toml ├── mistralrs-quant ├── src │ ├── gptq │ │ ├── mod.rs │ │ ├── ffi.rs │ │ └── gptq_cpu.rs │ ├── utils │ │ ├── ffi.rs │ │ └── mod.rs │ ├── dummy │ │ └── mod.rs │ └── hqq │ │ └── ffi.rs ├── README.md ├── kernels │ └── gptq │ │ ├── qdq_8.cuh │ │ ├── qdq_util.cuh │ │ └── compat.cuh └── Cargo.toml 
├── docs ├── IMAGEGEN_MODELS.md ├── SAMPLING.md ├── TOOL_CALLING.md ├── VISION_MODELS.md ├── LORA_XLORA.md ├── README.md ├── DEVICE_MAPPING.md ├── QUANTS.md ├── NON_GRANULAR.md ├── GEMMA2.md ├── ISQ.md └── CHAT_TOK.md ├── .cargo └── config.toml ├── mistralrs ├── README.md └── examples │ ├── gemma2 │ └── main.rs │ ├── grammar │ └── main.rs │ ├── flux │ └── main.rs │ ├── xlora │ └── main.rs │ ├── gguf │ └── main.rs │ ├── phi3_5_moe │ └── main.rs │ ├── lora │ └── main.rs │ ├── mixture_of_quant_experts │ └── main.rs │ ├── idefics2 │ └── main.rs │ ├── llava_next │ └── main.rs │ ├── phi3v │ └── main.rs │ ├── llava │ └── main.rs │ ├── paged_attn │ └── main.rs │ ├── llama_vision │ └── main.rs │ ├── isq │ └── main.rs │ ├── custom_logits_processor │ └── main.rs │ ├── simple │ └── main.rs │ ├── anymoe_lora │ └── main.rs │ ├── lora_activation │ └── main.rs │ ├── anymoe │ └── main.rs │ ├── batching │ └── main.rs │ ├── gguf_locally │ └── main.rs │ └── topology │ └── main.rs ├── .gitattributes ├── mistralrs-bench ├── Cargo.toml └── README.md ├── LICENSE ├── Dockerfile ├── Cargo.toml └── Dockerfile.cuda-all /mistralrs-core/src/cuda/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod ffi; 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | .ruff_cache 3 | .vscode 4 | *.a -------------------------------------------------------------------------------- /mistralrs-paged-attn/README.md: -------------------------------------------------------------------------------- 1 | # mistralrs-paged-attn -------------------------------------------------------------------------------- /mistralrs-core/src/diffusion_models/clip/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod text; 2 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | docs 2 | examples 3 | orderings 4 | scripts 5 | target 6 | -------------------------------------------------------------------------------- /mistralrs-pyo3/build.rs: -------------------------------------------------------------------------------- 1 | fn main() { 2 | pyo3_build_config::add_extension_module_link_args(); 3 | } 4 | -------------------------------------------------------------------------------- /toml-selectors/plain.toml: -------------------------------------------------------------------------------- 1 | [model] 2 | model_id = "mistralai/Mistral-7B-Instruct-v0.1" 3 | arch = "mistral" -------------------------------------------------------------------------------- /mistralrs-core/src/paged_attention/layers/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod paged_attention; 2 | 3 | pub use paged_attention::PagedAttention; 4 | -------------------------------------------------------------------------------- /toml-selectors/lora.toml: -------------------------------------------------------------------------------- 1 | [model] 2 | adapters_model_id = "lamm-mit/x-lora" 3 | order = "ordering-file.json" 4 | arch = "mistral" -------------------------------------------------------------------------------- /toml-selectors/xlora.toml: -------------------------------------------------------------------------------- 1 | [model] 2 | xlora_model_id = "lamm-mit/x-lora" 3 | order = 
"ordering-file.json" 4 | arch = "mistral" -------------------------------------------------------------------------------- /mistralrs-core/src/dummy_paged_attention/layers/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod paged_attention; 2 | 3 | pub use paged_attention::PagedAttention; 4 | -------------------------------------------------------------------------------- /topologies/isq.yml: -------------------------------------------------------------------------------- 1 | 0-8: 2 | isq: Q3K 3 | 8-16: 4 | isq: Q4K 5 | 16-24: 6 | isq: Q6K 7 | # Skip 24-28 8 | 28-32: 9 | isq: Q8_0 -------------------------------------------------------------------------------- /mistralrs-core/src/diffusion_models/flux/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod autoencoder; 2 | pub mod model; 3 | pub mod sampling; 4 | pub mod stepper; 5 | -------------------------------------------------------------------------------- /mistralrs-server/resources/rust-logo-32x32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLukas22/mistral.rs/master/mistralrs-server/resources/rust-logo-32x32.png -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | - Python: [examples here](python) 3 | - HTTP Server: [examples here](server) 4 | - Rust: [examples here](../mistralrs/examples/) -------------------------------------------------------------------------------- /mistralrs-core/src/aici/README.md: -------------------------------------------------------------------------------- 1 | // Originally from https://github.com/microsoft/aici/blob/64f0b551dee49e320e9b3b92289f3d6f2e888276 2 | // Licensed under the MIT license 3 | -------------------------------------------------------------------------------- /orderings/lora-paper-ordering.json: -------------------------------------------------------------------------------- 1 | {"base_model_id": "HuggingFaceH4/zephyr-7b-beta", "order": ["adapter_1"], "preload_adapters": [{"name":"adapter_2","adapter_model_id":"lamm-mit/x-lora"}]} -------------------------------------------------------------------------------- /scripts/get_tokenizers_json.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer 2 | 3 | model = input("Enter model ID: ") 4 | tok = AutoTokenizer.from_pretrained(model) 5 | tok.save_pretrained(".") 6 | -------------------------------------------------------------------------------- /mistralrs-core/README.md: -------------------------------------------------------------------------------- 1 | # `mistralrs-core` 2 | 3 | Core crate of `mistral.rs` including the models and associated executors. 
4 | 5 | Documentation: https://ericlbuehler.github.io/mistral.rs/mistralrs/ 6 | -------------------------------------------------------------------------------- /mistralrs-paged-attn/src/attention/attention_dtypes.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "attention_generic.cuh" 4 | #include "dtype_float16.cuh" 5 | #include "dtype_float32.cuh" 6 | #include "dtype_bfloat16.cuh" 7 | -------------------------------------------------------------------------------- /toml-selectors/gguf.toml: -------------------------------------------------------------------------------- 1 | [model] 2 | tok_model_id = "mistralai/Mistral-7B-Instruct-v0.1" 3 | quantized_model_id = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF" 4 | quantized_filename = "mistral-7b-instruct-v0.1.Q4_K_M.gguf" -------------------------------------------------------------------------------- /topologies/isq_and_device.yml: -------------------------------------------------------------------------------- 1 | 0-8: 2 | isq: Q3K 3 | device: cuda[0] 4 | 8-16: 5 | isq: Q4K 6 | device: cpu 7 | 16-24: 8 | isq: Q6K 9 | # Skip 24-28 10 | 28-32: 11 | isq: Q8_0 12 | device: cuda[0] -------------------------------------------------------------------------------- /mistralrs-core/src/vision_models/llava/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod config; 2 | pub mod llava15; 3 | pub mod llava_inputs_processor; 4 | pub mod llava_llm; 5 | pub mod llava_next; 6 | pub mod llava_next_inputs_processor; 7 | mod utils; 8 | -------------------------------------------------------------------------------- /mistralrs-vision/README.md: -------------------------------------------------------------------------------- 1 | # `mistralrs-vision` 2 | 3 | This crate provides vision utilities for mistral.rs inspired by torchvision. 
4 | 5 | Documentation: https://ericlbuehler.github.io/mistral.rs/mistralrs_vision/index.html -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Feature request, such as new models or new technologies 4 | title: '' 5 | labels: ["new feature"] 6 | assignees: '' 7 | 8 | --- 9 | 10 | -------------------------------------------------------------------------------- /mistralrs-core/src/paged_attention/block_engine_sequence.rs: -------------------------------------------------------------------------------- 1 | pub trait BlockEngineSequence { 2 | fn blocks_to_add_new_tok(&self) -> usize; 3 | fn get_id(&self) -> usize; 4 | fn get_logical_token_blocks(&self) -> usize; 5 | } 6 | -------------------------------------------------------------------------------- /mistralrs-core/src/dummy_paged_attention/block_engine_sequence.rs: -------------------------------------------------------------------------------- 1 | pub trait BlockEngineSequence { 2 | fn blocks_to_add_new_tok(&self) -> usize; 3 | fn get_id(&self) -> usize; 4 | fn get_logical_token_blocks(&self) -> usize; 5 | } 6 | -------------------------------------------------------------------------------- /chat_templates/chatml.json: -------------------------------------------------------------------------------- 1 | { 2 | "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" 3 | } -------------------------------------------------------------------------------- /mistralrs-core/src/vision_models/processor_config.rs: -------------------------------------------------------------------------------- 1 | use serde::Deserialize; 2 | 3 | #[allow(dead_code)] 4 | #[derive(Deserialize, Debug)] 5 | pub struct ProcessorConfig { 6 | pub(crate) chat_template: Option, 7 | pub(crate) image_seq_len: Option, 8 | } 9 | -------------------------------------------------------------------------------- /.typos.toml: -------------------------------------------------------------------------------- 1 | [default] 2 | extend-ignore-identifiers-re = [ 3 | "Mmaped", 4 | "mmaped", 5 | "arange", 6 | "Nd", 7 | "nin" 8 | ] 9 | 10 | [files] 11 | extend-exclude = [ 12 | "mistralrs-pyo3/pdoc/*", 13 | "examples/server/phi3_duckduckgo_mistral.rs.ipynb" 14 | ] -------------------------------------------------------------------------------- /mistralrs-quant/src/gptq/mod.rs: -------------------------------------------------------------------------------- 1 | #[cfg(feature = "cuda")] 2 | mod ffi; 3 | #[cfg(not(feature = "cuda"))] 4 | mod gptq_cpu; 5 | #[cfg(feature = "cuda")] 6 | mod gptq_cuda; 7 | 8 | #[cfg(not(feature = "cuda"))] 9 | pub use gptq_cpu::GptqLayer; 10 | #[cfg(feature = "cuda")] 11 | pub use gptq_cuda::GptqLayer; 12 | -------------------------------------------------------------------------------- /mistralrs-server/resources/LICENSE.md: -------------------------------------------------------------------------------- 1 | [Rust Logo](https://www.rust-lang.org/logos/rust-logo-32x32.png)(`rust-logo-32x32.png`) by Rust Foundation is licensed under 2 | [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/?ref=chooser-v1). 3 | 4 | This project is not affiliated with or endorsed by the Rust Foundation. 
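The chat templates shipped in `chat_templates/` (such as `chat_templates/chatml.json` above) each hold a Jinja-style template under the `chat_template` key. As a minimal sketch, not a file from this repository, the following renders the ChatML template with the `jinja2` package so the resulting prompt string can be inspected; the example messages are invented, and mistral.rs applies these templates internally rather than through this code.

```python
# Illustration only: render chat_templates/chatml.json with jinja2 to see the
# prompt string it produces. The messages below are invented examples.
import json

from jinja2 import Template

with open("chat_templates/chatml.json") as f:
    template_str = json.load(f)["chat_template"]

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]

# The template loops over `messages` and optionally appends the assistant
# generation prompt when `add_generation_prompt` is true.
prompt = Template(template_str).render(messages=messages, add_generation_prompt=True)
print(prompt)
# Expected output:
# <|im_start|>system
# You are a helpful assistant.<|im_end|>
# <|im_start|>user
# Hello!<|im_end|>
# <|im_start|>assistant
```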
-------------------------------------------------------------------------------- /docs/IMAGEGEN_MODELS.md: -------------------------------------------------------------------------------- 1 | # Image generation model support in mistral.rs 2 | 3 | Mistral.rs supports various modalities of models, including image generation models. Image generation models take text as input and generate images. 4 | 5 | Please see docs for the following model types: 6 | 7 | - FLUX.1 [FLUX.md](FLUX.md) 8 | -------------------------------------------------------------------------------- /chat_templates/phi3.json: -------------------------------------------------------------------------------- 1 | { 2 | "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}" 3 | } -------------------------------------------------------------------------------- /examples/server/flux.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | 3 | client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/") 4 | 5 | result = client.images.generate( 6 | model="flux", 7 | prompt="A vibrant sunset in the mountains, 4k, high quality.", 8 | n=1, 9 | ) 10 | print(result.data[0].url) 11 | -------------------------------------------------------------------------------- /mistralrs-core/src/aici/mod.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)] 2 | 3 | pub(crate) mod bintokens; 4 | pub(crate) mod bytes; 5 | pub(crate) mod cfg; 6 | pub(crate) mod lex; 7 | pub(crate) mod recognizer; 8 | pub(crate) mod rx; 9 | pub(crate) mod svob; 10 | pub(crate) mod toktree; 11 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Report a bug. 4 | title: '' 5 | labels: ["bug"] 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Describe the bug 11 | A clear and concise description of what the bug is. 12 | 13 | ## Latest commit or version 14 | Which commit or version you ran with. 15 | 16 | -------------------------------------------------------------------------------- /.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [target.x86_64-unknown-linux-gnu] 2 | rustflags = ["-C", "target-cpu=native"] 3 | 4 | [target.aarch64-apple-darwin] 5 | [build] 6 | rustflags = ["-C", "target-cpu=native"] 7 | 8 | [target.wasm32-unknown-unknown] 9 | rustflags = ["-C", "target-feature=+simd128"] 10 | 11 | [target.x86_64-apple-darwin] 12 | rustflags = ["-C", "target-feature=-avx,-avx2"] -------------------------------------------------------------------------------- /docs/SAMPLING.md: -------------------------------------------------------------------------------- 1 | # Sampling and penalty techniques in mistral.rs 2 | 3 | We currently support the following sampling and penalty techniques in mistral.rs: 4 | 5 | - Top K 6 | - Top P 7 | - Min P 8 | - [Dry Penalty](https://github.com/oobabooga/text-generation-webui/pull/5677) 9 | - Frequency Penalty 10 | - Presence Penalty 11 | 12 | Please suggest more by raising an issue! 
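To make the sampling and penalty options listed in `docs/SAMPLING.md` concrete, here is a minimal sketch of how several of them are set per request through the Python API, mirroring `examples/python/plain.py` found later in this repository. Only fields that already appear in those examples are used (`max_tokens`, `presence_penalty`, `top_p`, `temperature`); Top-K, Min-P, and the Dry penalty are exposed through analogous request fields whose exact names should be checked against the Python API documentation.

```python
# Minimal sketch: per-request sampling/penalty settings via the Python API.
# Field names below are taken from the repository's own examples.
from mistralrs import Runner, Which, ChatCompletionRequest, Architecture

runner = Runner(
    which=Which.Plain(
        model_id="mistralai/Mistral-7B-Instruct-v0.1",
        arch=Architecture.Mistral,
    ),
)

res = runner.send_chat_completion_request(
    ChatCompletionRequest(
        model="mistral",
        messages=[{"role": "user", "content": "Write a haiku about Rust."}],
        max_tokens=64,
        presence_penalty=1.0,  # discourage tokens that have already appeared
        top_p=0.1,  # nucleus (Top-P) sampling cutoff
        temperature=0.1,  # lower values make sampling more deterministic
    )
)
print(res.choices[0].message.content)
```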
-------------------------------------------------------------------------------- /toml-selectors/speculative-gguf.toml: -------------------------------------------------------------------------------- 1 | [model] 2 | model_id = "mistralai/Mistral-7B-Instruct-v0.1" 3 | arch = "mistral" 4 | 5 | [speculative] 6 | gamma = 32 7 | 8 | [speculative.draft_model] 9 | tok_model_id = "mistralai/Mistral-7B-Instruct-v0.1" 10 | quantized_model_id = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF" 11 | quantized_filename = "mistral-7b-instruct-v0.1.Q2_K.gguf" -------------------------------------------------------------------------------- /mistralrs-core/src/layers_utils.rs: -------------------------------------------------------------------------------- 1 | use candle_core::{Result, Tensor}; 2 | 3 | pub fn repeat_kv(x: Tensor, n_rep: usize) -> Result<Tensor> { 4 | if n_rep == 1 { 5 | Ok(x) 6 | } else { 7 | let (b_sz, n_kv_head, seq_len, head_dim) = x.dims4()?; 8 | Tensor::cat(&vec![&x; n_rep], 2)?.reshape((b_sz, n_kv_head * n_rep, seq_len, head_dim)) 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /mistralrs/README.md: -------------------------------------------------------------------------------- 1 | # mistral.rs Rust API: `mistralrs` 2 | [![Documentation](https://github.com/EricLBuehler/mistral.rs/actions/workflows/docs.yml/badge.svg)](https://ericlbuehler.github.io/mistral.rs/mistralrs/) 3 | 4 | Mistral.rs provides a convenient Rust multithreaded/async API. To install, add `mistralrs = { git = "https://github.com/EricLBuehler/mistral.rs.git" }` to your Cargo.toml file. 5 | 6 | Examples can be found [here](examples). -------------------------------------------------------------------------------- /chat_templates/llama3.json: -------------------------------------------------------------------------------- 1 | { 2 | "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" 3 | } -------------------------------------------------------------------------------- /mistralrs-core/src/models/mod.rs: -------------------------------------------------------------------------------- 1 | pub(crate) mod gemma; 2 | pub(crate) mod gemma2; 3 | pub(crate) mod llama; 4 | pub(crate) mod mistral; 5 | pub(crate) mod mixtral; 6 | pub(crate) mod phi2; 7 | pub(crate) mod phi3; 8 | pub(crate) mod phi3_5_moe; 9 | pub(crate) mod quantized_llama; 10 | pub(crate) mod quantized_phi2; 11 | pub(crate) mod quantized_phi3; 12 | pub(crate) mod quantized_starcoder2; 13 | pub(crate) mod qwen2; 14 | pub(crate) mod starcoder2; 15 | -------------------------------------------------------------------------------- /examples/server/streaming_completion.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from openai import OpenAI 3 | 4 | client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/") 5 | 6 | 7 | response = client.completions.create( 8 | model="mistral", 9 | prompt="My favorite theorem is", 10 | max_tokens=32, 11 | stream=True, 12 | ) 13 | for chunk in response: 14 | delta = chunk.choices[0].text 15 | print(delta, end="") 16 | sys.stdout.flush() 17 |
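A small variation on `examples/server/streaming_completion.py` above (a sketch, not a file from this repository): it collects the streamed chunks into one string and reports a rough client-side rate, using the chunk count as an approximation of the token count.

```python
# Sketch: accumulate the streamed completion and estimate chunks/second.
import time

from openai import OpenAI

client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/")

start = time.time()
chunks = []
response = client.completions.create(
    model="mistral",
    prompt="My favorite theorem is",
    max_tokens=32,
    stream=True,
)
for chunk in response:
    # Each streamed chunk carries a text delta for the first (only) choice.
    chunks.append(chunk.choices[0].text)
elapsed = time.time() - start

print("".join(chunks))
print(f"~{len(chunks) / elapsed:.1f} chunks/s over {elapsed:.2f}s")
```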
-------------------------------------------------------------------------------- /mistralrs-vision/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "mistralrs-vision" 3 | readme = "README.md" 4 | authors = ["Eric Buehler"] 5 | version.workspace = true 6 | edition.workspace = true 7 | description.workspace = true 8 | repository.workspace = true 9 | keywords.workspace = true 10 | categories.workspace = true 11 | license.workspace = true 12 | homepage.workspace = true 13 | 14 | [dependencies] 15 | candle-core.workspace = true 16 | image.workspace = true 17 | -------------------------------------------------------------------------------- /chat_templates/phi3.5.json: -------------------------------------------------------------------------------- 1 | { 2 | "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}" 3 | } -------------------------------------------------------------------------------- /toml-selectors/speculative-same-gguf.toml: -------------------------------------------------------------------------------- 1 | [model] 2 | tok_model_id = "mistralai/Mistral-7B-Instruct-v0.1" 3 | quantized_model_id = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF" 4 | quantized_filename = "mistral-7b-instruct-v0.1.Q2_K.gguf" 5 | 6 | [speculative] 7 | gamma = 32 8 | 9 | [speculative.draft_model] 10 | tok_model_id = "mistralai/Mistral-7B-Instruct-v0.1" 11 | quantized_model_id = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF" 12 | quantized_filename = "mistral-7b-instruct-v0.1.Q2_K.gguf" -------------------------------------------------------------------------------- /toml-selectors/anymoe.toml: -------------------------------------------------------------------------------- 1 | [model] 2 | model_id = "mistralai/Mistral-7B-Instruct-v0.1" 3 | arch = "mistral" 4 | 5 | [anymoe] 6 | dataset_json = "examples/amoe.json" 7 | prefix = "model.layers" 8 | mlp = "mlp" 9 | model_ids = ["HuggingFaceH4/zephyr-7b-beta"] 10 | layers = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] 11 | 12 | [anymoe.config] 13 | hidden_size = 4096 14 | epochs = 25 15 | expert_type = "fine_tuned" 16 | gate_model_id = "saved_gate" 17 | loss_csv_path = "loss.csv" 18 | -------------------------------------------------------------------------------- /mistralrs-quant/README.md: -------------------------------------------------------------------------------- 1 | # `mistralrs-quant` 2 | 3 | Quantization techniques for mistral.rs. This implements a common trait for all quantization methods to implement for ease of extension and development. 
4 | 5 | Currently supported: 6 | - GGUF: `GgufMatMul` 7 | - Gptq: `GptqLayer` 8 | - Hqq: `HqqLayer` 9 | - Unquantized (used for ISQ): `UnquantLinear` 10 | 11 | Some kernels are copied or based on implementations in: 12 | - https://github.com/vllm-project/vllm 13 | - https://github.com/mobiusml/hqq 14 | -------------------------------------------------------------------------------- /toml-selectors/anymoe_lora.toml: -------------------------------------------------------------------------------- 1 | [model] 2 | model_id = "mistralai/Mistral-7B-Instruct-v0.1" 3 | arch = "mistral" 4 | 5 | [anymoe] 6 | dataset_json = "examples/amoe.json" 7 | prefix = "model.layers" 8 | mlp = "mlp" 9 | model_ids = ["typeof/zephyr-7b-beta-lora"] 10 | 11 | [anymoe.config] 12 | hidden_size = 4096 13 | epochs = 25 14 | gate_model_id = "saved_gate" 15 | loss_csv_path = "loss.csv" 16 | 17 | [anymoe.config.expert_type.lora_adapter] 18 | rank = 64 19 | alpha = 16 20 | target_modules = ["gate_proj"] 21 | -------------------------------------------------------------------------------- /chat_templates/mistral.json: -------------------------------------------------------------------------------- 1 | { 2 | "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" 3 | } -------------------------------------------------------------------------------- /examples/python/flux.py: -------------------------------------------------------------------------------- 1 | from mistralrs import ( 2 | Runner, 3 | Which, 4 | DiffusionArchitecture, 5 | ImageGenerationResponseFormat, 6 | ) 7 | 8 | runner = Runner( 9 | which=Which.DiffusionPlain( 10 | model_id="mistralai/Mistral-7B-Instruct-v0.1", 11 | arch=DiffusionArchitecture.FluxOffloaded, 12 | ), 13 | ) 14 | 15 | res = runner.generate_image( 16 | "A vibrant sunset in the mountains, 4k, high quality.", 17 | ImageGenerationResponseFormat.Url, 18 | ) 19 | print(res.choices[0].url) 20 | -------------------------------------------------------------------------------- /scripts/testgen_text.py: -------------------------------------------------------------------------------- 1 | import transformers 2 | 3 | tokenizer = transformers.AutoTokenizer.from_pretrained( 4 | "mistralai/Mistral-7B-Instruct-v0.3" 5 | ) 6 | 7 | res = tokenizer.apply_chat_template( 8 | [ 9 | {"role": "user", "content": "Hello"}, 10 | {"role": "assistant", "content": "Hi there"}, 11 | {"role": "user", "content": "Who are you"}, 12 | {"role": "assistant", "content": " I am an assistant "}, 13 | {"role": "user", "content": "Another question"}, 14 | ], 15 | add_generation_prompt=True, 16 | tokenize=False, 17 | ) 18 | print(res.replace("\n", "\\n")) 19 | -------------------------------------------------------------------------------- /mistralrs-pyo3/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["maturin==1.4"] 3 | build-backend = "maturin" 4 | 5 | [project] 6 | name = "mistralrs" 7 | version = "0.3.1" 8 | requires-python = ">=3.8" 9 | classifiers = [ 10 | "Programming Language :: Rust", 11 | "Programming Language :: 
Python :: Implementation :: CPython", 12 | "Programming Language :: Python :: Implementation :: PyPy", 13 | "License :: OSI Approved :: MIT License", 14 | "Programming Language :: Python :: 3.8", 15 | "Programming Language :: Rust", 16 | ] 17 | dynamic = ["description"] 18 | 19 | [tool.maturin] 20 | features = ["pyo3/extension-module"] 21 | profile = "release" 22 | -------------------------------------------------------------------------------- /mistralrs-paged-attn/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "mistralrs-paged-attn" 3 | readme = "README.md" 4 | authors = ["Eric Buehler"] 5 | version.workspace = true 6 | edition.workspace = true 7 | description.workspace = true 8 | repository.workspace = true 9 | keywords.workspace = true 10 | categories.workspace = true 11 | license.workspace = true 12 | homepage.workspace = true 13 | 14 | [dependencies] 15 | candle-core.workspace = true 16 | half.workspace = true 17 | float8.workspace = true 18 | 19 | [build-dependencies] 20 | bindgen_cuda = {git = "https://github.com/guoqingbao/bindgen_cuda.git", version = "0.1.6"} 21 | anyhow.workspace = true 22 | 23 | [features] 24 | cuda = [] -------------------------------------------------------------------------------- /mistralrs-pyo3/pyproject_template.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["maturin==1.4"] 3 | build-backend = "maturin" 4 | 5 | [project] 6 | name = "$name" 7 | version = "0.3.1" 8 | requires-python = ">=3.8" 9 | classifiers = [ 10 | "Programming Language :: Rust", 11 | "Programming Language :: Python :: Implementation :: CPython", 12 | "Programming Language :: Python :: Implementation :: PyPy", 13 | "License :: OSI Approved :: MIT License", 14 | "Programming Language :: Python :: 3.8", 15 | "Programming Language :: Rust", 16 | ] 17 | dynamic = ["description"] 18 | 19 | [tool.maturin] 20 | features = ["pyo3/extension-module"] 21 | profile = "release" 22 | -------------------------------------------------------------------------------- /examples/python/plain.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest, Architecture 2 | 3 | runner = Runner( 4 | which=Which.Plain( 5 | model_id="mistralai/Mistral-7B-Instruct-v0.1", 6 | arch=Architecture.Mistral, 7 | ), 8 | ) 9 | 10 | res = runner.send_chat_completion_request( 11 | ChatCompletionRequest( 12 | model="mistral", 13 | messages=[ 14 | {"role": "user", "content": "Tell me a story about the Rust type system."} 15 | ], 16 | max_tokens=256, 17 | presence_penalty=1.0, 18 | top_p=0.1, 19 | temperature=0.1, 20 | ) 21 | ) 22 | print(res.choices[0].message.content) 23 | print(res.usage) 24 | -------------------------------------------------------------------------------- /mistralrs-quant/src/utils/ffi.rs: -------------------------------------------------------------------------------- 1 | use std::ffi::c_void; 2 | 3 | #[allow(dead_code)] 4 | extern "C" { 5 | // Linking to definitions in mistralrs-core 6 | pub(crate) fn mq_bitwise_or_u8( 7 | d_in1: *const c_void, 8 | d_in2: *const c_void, 9 | d_out: *mut c_void, 10 | N: u32, 11 | ); 12 | pub(crate) fn mq_bitwise_or_i32( 13 | d_in1: *const c_void, 14 | d_in2: *const c_void, 15 | d_out: *mut c_void, 16 | N: u32, 17 | ); 18 | 19 | pub(crate) fn mq_leftshift_u8(d_in1: *const c_void, d_out: *mut c_void, N: u32, k: i32); 20 | pub(crate) fn 
mq_leftshift_i32(d_in1: *const c_void, d_out: *mut c_void, N: u32, k: i32); 21 | } 22 | -------------------------------------------------------------------------------- /examples/server/yacc.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | 3 | client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/") 4 | 5 | with open("examples/server/c.y", "r") as f: 6 | c_yacc = f.read() 7 | 8 | completion = client.chat.completions.create( 9 | model="mistral", 10 | messages=[ 11 | { 12 | "role": "user", 13 | "content": "Write the main function in C, returning 42. Answer with just the code, no explanation.", 14 | } 15 | ], 16 | max_tokens=256, 17 | frequency_penalty=1.0, 18 | top_p=0.1, 19 | temperature=0, 20 | extra_body={"grammar": {"type": "yacc", "value": c_yacc}}, 21 | ) 22 | 23 | print(completion.choices[0].message.content) 24 | -------------------------------------------------------------------------------- /mistralrs-vision/src/utils.rs: -------------------------------------------------------------------------------- 1 | use image::{DynamicImage, Pixel}; 2 | 3 | pub(crate) fn empty_image(h: usize, w: usize) -> Vec>> { 4 | vec![vec![vec![]; w]; h] 5 | } 6 | 7 | pub(crate) fn get_pixel_data( 8 | n_channels: usize, 9 | pixels: image::ImageBuffer, Vec>, 10 | h: usize, 11 | w: usize, 12 | ) -> Vec>> { 13 | let mut pixel_data = empty_image(h, w); 14 | for (x, y, pixel) in pixels.enumerate_pixels() { 15 | pixel_data[y as usize][x as usize] = pixel.channels()[..n_channels].to_vec() 16 | } 17 | pixel_data 18 | } 19 | 20 | pub(crate) fn n_channels(image: &DynamicImage) -> usize { 21 | image.color().channel_count() as usize 22 | } 23 | -------------------------------------------------------------------------------- /examples/python/isq.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest, Architecture 2 | 3 | runner = Runner( 4 | which=Which.Plain( 5 | model_id="mistralai/Mistral-7B-Instruct-v0.1", 6 | arch=Architecture.Mistral, 7 | ), 8 | in_situ_quant="Q4K", 9 | ) 10 | 11 | res = runner.send_chat_completion_request( 12 | ChatCompletionRequest( 13 | model="mistral", 14 | messages=[ 15 | {"role": "user", "content": "Tell me a story about the Rust type system."} 16 | ], 17 | max_tokens=256, 18 | presence_penalty=1.0, 19 | top_p=0.1, 20 | temperature=0.1, 21 | ) 22 | ) 23 | print(res.choices[0].message.content) 24 | print(res.usage) 25 | -------------------------------------------------------------------------------- /mistralrs-core/src/utils/log.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | hash::{DefaultHasher, Hash, Hasher}, 3 | sync::Mutex, 4 | }; 5 | 6 | use once_cell::sync::Lazy; 7 | use tracing::info; 8 | 9 | static HASHED_AUTOLOADER_LOGS: Lazy>> = Lazy::new(|| Mutex::new(Vec::new())); 10 | 11 | pub fn once_log_info>(msg: M) { 12 | let msg = msg.as_ref(); 13 | let mut hasher = DefaultHasher::new(); 14 | msg.hash(&mut hasher); 15 | let hash = hasher.finish(); 16 | 17 | let mut log = HASHED_AUTOLOADER_LOGS.lock().expect("Poisoned Lock"); 18 | if !log.contains(&hash) { 19 | info!("{msg}"); 20 | log.push(hasher.finish()); 21 | } else { 22 | log.push(hasher.finish()); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /docs/TOOL_CALLING.md: -------------------------------------------------------------------------------- 1 | # 
Tool calling 2 | 3 | Tool calling makes LLMs smarter. 4 | 5 | LLMs use tool calling to interact with the outside world. Mistral.rs has OpenAI-compatible support for tool calling in all APIs: HTTP, Python, and Rust. 6 | 7 | OpenAI docs: https://cookbook.openai.com/examples/how_to_call_functions_with_chat_models 8 | 9 | ## OpenAI-compatible HTTP example 10 | Please see [our example here](../examples/server/tool_calling.py). 11 | 12 | > OpenAI docs: https://platform.openai.com/docs/api-reference/chat/create?lang=curl 13 | 14 | ## Rust example 15 | Please see [our example here](../mistralrs/examples/tools/main.rs). 16 | 17 | ## Python example 18 | Please see [our notebook here](../examples/python/tool_calling.ipynb). 19 | -------------------------------------------------------------------------------- /docs/VISION_MODELS.md: -------------------------------------------------------------------------------- 1 | # Vision model support in mistral.rs 2 | 3 | Mistral.rs supports various modalities of models, including vision models. Vision models take images and text as input and have the capability to reason over both. 4 | 5 | Please see docs for the following model types: 6 | 7 | - Phi 3 Vision: [PHI3V.md](PHI3V.md) 8 | - Idefics2: [IDEFICS2.md](IDEFICS2.md) 9 | - LLaVA and LLaVA Next: [LLAVA.md](LLaVA.md) 10 | - Llama 3.2 Vision: [VLLAMA.md](VLLAMA.md) 11 | 12 | > Note for the Python and HTTP APIs: 13 | > We follow the OpenAI specification for structuring the image messages and allow both base64-encoded images as well as a URL/path to the image. There are many examples of this; see [this Python example](../examples/python/phi3v.py). -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/build_failure.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Build failure 3 | about: Report a build failure 4 | title: '' 5 | labels: ["bug", "build"] 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Minimum reproducible example 11 | The minimum example to reproduce the error. Simpler examples make it easier and faster to fix! 12 | 13 | ## Error 14 | What is the error? If the error is very long, please try to extract an excerpt of it and post a link to a [Gist](https://gist.github.com/). 15 | 16 | ## Other information 17 | Please specify: 18 | - Operating system (Windows, macOS, WSL2, Linux, etc.) 19 | - GPU or accelerator information 20 | - If CUDA, please run `nvcc --version`, `nvidia-smi` 21 | 22 | ## Latest commit or version 23 | Which commit or version you ran with.
24 | -------------------------------------------------------------------------------- /examples/python/paged_attention.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest, Architecture 2 | 3 | runner = Runner( 4 | which=Which.Plain( 5 | model_id="mistralai/Mistral-7B-Instruct-v0.1", 6 | arch=Architecture.Mistral, 7 | ), 8 | pa_gpu_mem=4096, 9 | pa_blk_size=32, 10 | ) 11 | 12 | res = runner.send_chat_completion_request( 13 | ChatCompletionRequest( 14 | model="mistral", 15 | messages=[ 16 | {"role": "user", "content": "Tell me a story about the Rust type system."} 17 | ], 18 | max_tokens=256, 19 | presence_penalty=1.0, 20 | top_p=0.1, 21 | temperature=0.1, 22 | ) 23 | ) 24 | print(res.choices[0].message.content) 25 | print(res.usage) 26 | -------------------------------------------------------------------------------- /examples/python/mixture_of_quant_experts.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest, Architecture 2 | 3 | runner = Runner( 4 | which=Which.Plain( 5 | model_id="microsoft/Phi-3.5-MoE-instruct", 6 | arch=Architecture.Mistral, 7 | organization="moqe", 8 | ), 9 | in_situ_quant="Q4K", 10 | ) 11 | 12 | res = runner.send_chat_completion_request( 13 | ChatCompletionRequest( 14 | model="mistral", 15 | messages=[ 16 | {"role": "user", "content": "Tell me a story about the Rust type system."} 17 | ], 18 | max_tokens=256, 19 | presence_penalty=1.0, 20 | top_p=0.1, 21 | temperature=0.1, 22 | ) 23 | ) 24 | print(res.choices[0].message.content) 25 | print(res.usage) 26 | -------------------------------------------------------------------------------- /examples/python/topology.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest, Architecture 2 | 3 | runner = Runner( 4 | which=Which.Plain( 5 | model_id="mistralai/Mistral-7B-Instruct-v0.1", 6 | arch=Architecture.Mistral, 7 | topology="topologies/isq.yml", 8 | ), 9 | in_situ_quant="Q4K", 10 | ) 11 | 12 | res = runner.send_chat_completion_request( 13 | ChatCompletionRequest( 14 | model="mistral", 15 | messages=[ 16 | {"role": "user", "content": "Tell me a story about the Rust type system."} 17 | ], 18 | max_tokens=256, 19 | presence_penalty=1.0, 20 | top_p=0.1, 21 | temperature=0.1, 22 | ) 23 | ) 24 | print(res.choices[0].message.content) 25 | print(res.usage) 26 | -------------------------------------------------------------------------------- /examples/python/gguf.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest 2 | 3 | runner = Runner( 4 | which=Which.GGUF( 5 | tok_model_id="mistralai/Mistral-7B-Instruct-v0.1", 6 | quantized_model_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF", 7 | quantized_filename="mistral-7b-instruct-v0.1.Q4_K_M.gguf", 8 | ) 9 | ) 10 | 11 | res = runner.send_chat_completion_request( 12 | ChatCompletionRequest( 13 | model="mistral", 14 | messages=[ 15 | {"role": "user", "content": "Tell me a story about the Rust type system."} 16 | ], 17 | max_tokens=256, 18 | presence_penalty=1.0, 19 | top_p=0.1, 20 | temperature=0.1, 21 | ) 22 | ) 23 | print(res.choices[0].message.content) 24 | print(res.usage) 25 | -------------------------------------------------------------------------------- /scripts/set_names.py: 
-------------------------------------------------------------------------------- 1 | import json 2 | 3 | filename = input("Enter input ordering file: ") 4 | 5 | with open(filename, "r") as f: 6 | data = json.loads(f.read()) 7 | print( 8 | "Note: if you are using an X-LoRA model, it is very important that the adapter names are specified in the correct order" 9 | ", which is the order used during training. If you are using a LoRA model this is not necessary." 10 | ) 11 | adapters = input("Enter a comma delimited list of adapter names: ") 12 | split = adapters.split(",") 13 | split = [x for x in split if len(x) > 0] 14 | split = [x.strip() for x in split] 15 | data["order"] = split 16 | outname = input("Enter output ordering file: ") 17 | with open(outname, "w") as f: 18 | f.write(json.dumps(data)) 19 | -------------------------------------------------------------------------------- /examples/python/streaming.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest 2 | 3 | runner = Runner( 4 | which=Which.GGUF( 5 | tok_model_id="mistralai/Mistral-7B-Instruct-v0.1", 6 | quantized_model_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF", 7 | quantized_filename="mistral-7b-instruct-v0.1.Q4_K_M.gguf", 8 | ) 9 | ) 10 | 11 | res = runner.send_chat_completion_request( 12 | ChatCompletionRequest( 13 | model="mistral", 14 | messages=[ 15 | {"role": "user", "content": "Tell me a story about the Rust type system."} 16 | ], 17 | max_tokens=256, 18 | presence_penalty=1.0, 19 | top_p=0.1, 20 | temperature=0.1, 21 | stream=True, 22 | ) 23 | ) 24 | for chunk in res: 25 | print(chunk) 26 | -------------------------------------------------------------------------------- /mistralrs-paged-attn/src/lib.rs: -------------------------------------------------------------------------------- 1 | #[cfg(all(feature = "cuda", target_family = "unix"))] 2 | pub const COPY_BLOCKS_KERNEL: &str = 3 | include_str!(concat!(env!("OUT_DIR"), "/copy_blocks_kernel.ptx")); 4 | #[cfg(all(feature = "cuda", target_family = "unix"))] 5 | pub const PAGEDATTENTION: &str = include_str!(concat!(env!("OUT_DIR"), "/pagedattention.ptx")); 6 | #[cfg(all(feature = "cuda", target_family = "unix"))] 7 | pub const RESHAPE_AND_CACHE_KERNEL: &str = 8 | include_str!(concat!(env!("OUT_DIR"), "/reshape_and_cache_kernel.ptx")); 9 | 10 | #[cfg(all(feature = "cuda", target_family = "unix"))] 11 | mod backend; 12 | #[cfg(all(feature = "cuda", target_family = "unix"))] 13 | mod ffi; 14 | 15 | #[cfg(all(feature = "cuda", target_family = "unix"))] 16 | pub use backend::{copy_blocks, paged_attention, reshape_and_cache, swap_blocks}; 17 | -------------------------------------------------------------------------------- /chat_templates/llama2.json: -------------------------------------------------------------------------------- 1 | { 2 | "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' 
%}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}" 3 | } -------------------------------------------------------------------------------- /mistralrs/examples/gemma2/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use mistralrs::{ 3 | IsqType, PagedAttentionMetaBuilder, RequestBuilder, TextMessageRole, TextModelBuilder, 4 | }; 5 | 6 | #[tokio::main] 7 | async fn main() -> Result<()> { 8 | let model = TextModelBuilder::new("google/gemma-2-9b-it") 9 | .with_isq(IsqType::Q4K) 10 | .with_logging() 11 | .with_paged_attn(|| PagedAttentionMetaBuilder::default().build())? 12 | .build() 13 | .await?; 14 | 15 | let request = RequestBuilder::new().add_message( 16 | TextMessageRole::User, 17 | "Please write a mathematical equation where a few numbers are added.", 18 | ); 19 | 20 | let response = model.send_chat_request(request).await?; 21 | 22 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 23 | 24 | Ok(()) 25 | } 26 | -------------------------------------------------------------------------------- /scripts/lora_add_preload_adapters.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | filename = input("Enter input ordering file: ") 4 | 5 | with open(filename, "r") as f: 6 | data = json.loads(f.read()) 7 | preload_adapters = input( 8 | "Enter a comma delimited list of *preloaded* adapter names to preload: " 9 | ) 10 | preload_adapters_model_id = input( 11 | "Enter the model id where the preload adapters will be loaded: " 12 | ) 13 | split = preload_adapters.split(",") 14 | split = [x for x in split if len(x) > 0] 15 | split = [x.strip() for x in split] 16 | res = [] 17 | for s in split: 18 | res.append({"name": s, "adapter_model_id": preload_adapters_model_id}) 19 | data["preload_adapters"] = res 20 | outname = input("Enter output ordering file: ") 21 | with open(outname, "w") as f: 22 | f.write(json.dumps(data)) 23 | -------------------------------------------------------------------------------- /examples/server/streaming.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from openai import OpenAI 3 | 4 | client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/") 5 | 6 | messages = [] 7 | prompt = input("Enter system prompt >>> ") 8 | if len(prompt) > 0: 9 | messages.append({"role": "system", "content": prompt}) 10 | 11 | 12 | while True: 13 | prompt = input(">>> ") 14 | messages.append({"role": "user", "content": prompt}) 15 | resp = "" 16 | response = client.chat.completions.create( 17 | model="mistral", 18 | messages=messages, 19 | max_tokens=256, 20 | stream=True, 21 | ) 22 | for chunk in response: 23 | delta = chunk.choices[0].delta.content 24 | print(delta, end="") 25 | sys.stdout.flush() 26 | resp += delta 27 | messages.append({"role": "assistant", "content": resp}) 28 | print() 29 | -------------------------------------------------------------------------------- /mistralrs-quant/kernels/gptq/qdq_8.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | Copied from https://github.com/turboderp/exllamav2 3 | */ 4 | 5 | #ifndef _qdq_8_cuh 6 | #define _qdq_8_cuh 7 | 8 | #include "qdq_util.cuh" 9 | 10 | __forceinline__ __device__ void shuffle_8bit_4(uint32_t* q, int stride) {} 11 | 12 | __forceinline__ __device__ void 
dequant_8bit_8(const uint32_t q_0, 13 | const uint32_t q_1, 14 | half2 (&dq)[4], int stride, 15 | const uint32_t zero) { 16 | half dqh[8]; 17 | for (int i = 0; i < 4; i++) dqh[i] = dq_ns(exb(q_0, i * 8, 0xff), zero); 18 | for (int i = 0; i < 4; i++) dqh[i + 4] = dq_ns(exb(q_1, i * 8, 0xff), zero); 19 | 20 | for (int i = 0; i < 4; i++) 21 | dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]); 22 | } 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /examples/python/xlora_gemma.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest, Architecture 2 | 3 | runner = Runner( 4 | which=Which.XLora( 5 | model_id=None, # Automatically determine from ordering file 6 | xlora_model_id="lamm-mit/x-lora-gemma-7b", 7 | order="orderings/xlora-gemma-paper-ordering.json", 8 | tgt_non_granular_index=None, 9 | arch=Architecture.Mistral, 10 | ) 11 | ) 12 | 13 | res = runner.send_chat_completion_request( 14 | ChatCompletionRequest( 15 | model="mistral", 16 | messages=[ 17 | {"role": "user", "content": "Tell me a story about the Rust type system."} 18 | ], 19 | max_tokens=256, 20 | presence_penalty=1.0, 21 | top_p=0.1, 22 | temperature=0.5, 23 | ) 24 | ) 25 | print(res.choices[0].message.content) 26 | print(res.usage) 27 | -------------------------------------------------------------------------------- /mistralrs-core/src/tools/response.rs: -------------------------------------------------------------------------------- 1 | #[cfg_attr(feature = "pyo3_macros", pyo3::pyclass(eq, eq_int))] 2 | #[cfg_attr(feature = "pyo3_macros", pyo3(get_all))] 3 | #[derive(Clone, Debug, serde::Serialize, PartialEq)] 4 | #[serde(rename_all = "snake_case")] 5 | pub enum ToolCallType { 6 | Function, 7 | } 8 | 9 | #[cfg_attr(feature = "pyo3_macros", pyo3::pyclass)] 10 | #[cfg_attr(feature = "pyo3_macros", pyo3(get_all))] 11 | #[derive(Clone, Debug, serde::Serialize, serde::Deserialize)] 12 | pub struct CalledFunction { 13 | pub name: String, 14 | pub arguments: String, 15 | } 16 | 17 | #[cfg_attr(feature = "pyo3_macros", pyo3::pyclass)] 18 | #[cfg_attr(feature = "pyo3_macros", pyo3(get_all))] 19 | #[derive(Clone, Debug, serde::Serialize)] 20 | pub struct ToolCallResponse { 21 | pub id: String, 22 | #[serde(rename = "type")] 23 | pub tp: ToolCallType, 24 | pub function: CalledFunction, 25 | } 26 | -------------------------------------------------------------------------------- /examples/python/token_source.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest 2 | 3 | runner = Runner( 4 | which=Which.GGUF( 5 | tok_model_id="mistralai/Mistral-7B-Instruct-v0.1", 6 | quantized_model_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF", 7 | quantized_filename="mistral-7b-instruct-v0.1.Q4_K_M.gguf", 8 | ), 9 | token_source="literal: ...", # One of: "literal:", "env:", "path:", "cache", "none" 10 | ) 11 | 12 | res = runner.send_chat_completion_request( 13 | ChatCompletionRequest( 14 | model="mistral", 15 | messages=[ 16 | {"role": "user", "content": "Tell me a story about the Rust type system."} 17 | ], 18 | max_tokens=256, 19 | presence_penalty=1.0, 20 | top_p=0.1, 21 | temperature=0.1, 22 | ) 23 | ) 24 | print(res.choices[0].message.content) 25 | print(res.usage) 26 | -------------------------------------------------------------------------------- /.gitattributes: 
-------------------------------------------------------------------------------- 1 | # Pattern syntax: 2 | # https://git-scm.com/docs/gitignore#_pattern_format 3 | 4 | # Normalize line endings of all non-binary files to LF upon check-in (`git add` / `git commit`): 5 | * text=auto 6 | 7 | # Use `eol=lf` / `eol=crlf` to enforce specific line endings on checkout for compatibility: 8 | # https://www.git-scm.com/docs/gitattributes/#_eol 9 | # NOTE: 10 | # - This setting implies the `text` attribute. 11 | # - `eol=lf` may not work as expected, if a file was committed with CRLF prior to the introduction of `.gitattribtues`. 12 | # 13 | # Relevant files for this setting: 14 | # - `.sh` (LF) / `.bat` (CRLF) and similar scripts that are platform specific. 15 | # - Scripts that utilize a shebang (`#!/usr/bin/env python3`) to hint the interpreter to run. 16 | # - `Dockerfile` (base image environment may require LF): 17 | # https://github.com/EricLBuehler/mistral.rs/pull/361 18 | 19 | Dockerfile* eol=lf 20 | -------------------------------------------------------------------------------- /mistralrs-quant/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "mistralrs-quant" 3 | readme = "README.md" 4 | authors = ["Eric Buehler"] 5 | version.workspace = true 6 | edition.workspace = true 7 | description.workspace = true 8 | repository.workspace = true 9 | keywords.workspace = true 10 | categories.workspace = true 11 | license.workspace = true 12 | homepage.workspace = true 13 | 14 | [dependencies] 15 | candle-core.workspace = true 16 | candle-nn.workspace = true 17 | half.workspace = true 18 | serde.workspace = true 19 | lazy_static = "1.4" 20 | paste = "1.0.15" 21 | tracing.workspace = true 22 | rayon.workspace = true 23 | byteorder = "1.5.0" 24 | float8.workspace = true 25 | once_cell.workspace = true 26 | 27 | [features] 28 | cuda = ["candle-core/cuda", "candle-nn/cuda", "dep:bindgen_cuda"] 29 | metal = ["candle-core/metal", "candle-nn/metal"] 30 | 31 | [build-dependencies] 32 | bindgen_cuda = { version = "0.1.5", optional = true } 33 | -------------------------------------------------------------------------------- /examples/python/lora_activation.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest 2 | 3 | runner = Runner( 4 | which=Which.LoraGGUF( 5 | tok_model_id=None, # Automatically determine from ordering file 6 | quantized_model_id="TheBloke/zephyr-7B-beta-GGUF", 7 | quantized_filename="zephyr-7b-beta.Q4_0.gguf", 8 | xlora_model_id="lamm-mit/x-lora", 9 | order="orderings/xlora-paper-ordering.json", 10 | ) 11 | ) 12 | 13 | res = runner.send_chat_completion_request( 14 | ChatCompletionRequest( 15 | model="mistral", 16 | messages=[ 17 | {"role": "user", "content": "Tell me a story about the Rust type system."} 18 | ], 19 | max_tokens=256, 20 | presence_penalty=1.0, 21 | top_p=0.1, 22 | temperature=0.5, 23 | adapters=["adapter_4"], 24 | ) 25 | ) 26 | print(res.choices[0].message.content) 27 | print(res.usage) 28 | -------------------------------------------------------------------------------- /examples/python/xlora_zephyr.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest 2 | 3 | runner = Runner( 4 | which=Which.XLoraGGUF( 5 | tok_model_id=None, # Automatically determine from ordering file 6 | quantized_model_id="TheBloke/zephyr-7B-beta-GGUF", 
7 | quantized_filename="zephyr-7b-beta.Q4_0.gguf", 8 | xlora_model_id="lamm-mit/x-lora", 9 | order="orderings/xlora-paper-ordering.json", 10 | tgt_non_granular_index=None, 11 | ) 12 | ) 13 | 14 | res = runner.send_chat_completion_request( 15 | ChatCompletionRequest( 16 | model="mistral", 17 | messages=[ 18 | {"role": "user", "content": "Tell me a story about the Rust type system."} 19 | ], 20 | max_tokens=256, 21 | presence_penalty=1.0, 22 | top_p=0.1, 23 | temperature=0.5, 24 | ) 25 | ) 26 | print(res.choices[0].message.content) 27 | print(res.usage) 28 | -------------------------------------------------------------------------------- /docs/LORA_XLORA.md: -------------------------------------------------------------------------------- 1 | # Examples of LoRA and X-LoRA models 2 | 3 | - X-LoRA with no quantization 4 | 5 | To start an X-LoRA server with the exactly as presented in [the paper](https://arxiv.org/abs/2402.07148): 6 | 7 | ```bash 8 | ./mistralrs-server --port 1234 x-lora-plain -o orderings/xlora-paper-ordering.json -x lamm-mit/x-lora 9 | ``` 10 | - LoRA with a model from GGUF 11 | 12 | To start an LoRA server with adapters from the X-LoRA paper (you should modify the ordering file to use only one adapter, as the adapter static scalings are all 1 and so the signal will become distorted): 13 | 14 | ```bash 15 | ./mistralrs-server --port 1234 lora-gguf -o orderings/xlora-paper-ordering.json -m TheBloke/zephyr-7B-beta-GGUF -f zephyr-7b-beta.Q8_0.gguf -a lamm-mit/x-lora 16 | ``` 17 | 18 | Normally with a LoRA model you would use a custom ordering file. However, for this example we use the ordering from the X-LoRA paper because we are using the adapters from the X-LoRA paper. -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # Documentation 2 | 3 | ## Models 4 | - Image generation [models](IMAGEGEN_MODELS.md) 5 | - Vision [models](VISION_MODELS.md) 6 | 7 | - [FLUX](FLUX.md) 8 | - [Gemma 2](GEMMA2.md) 9 | - [Idefics 2](IDEFICS2.md) 10 | - [LLaVA](LLaVA.md) 11 | - [Phi 3.5 MoE](PHI3.5MOE.md) 12 | - [Phi 3.5 Vision](PHI3V.md) 13 | - [Llama 3.2 Vision](VLLAMA.md) 14 | 15 | ## Adapters 16 | - [Docs](ADAPTER_MODELS.md) 17 | - [X-LoRA non-granular](NON_GRANULAR.md) 18 | - [LoRA and X-LoRA examples](LORA_XLORA.md) 19 | 20 | ## Quantization 21 | - [Docs](QUANTS.md) 22 | - [ISQ](ISQ.md) 23 | - [UQFF](UQFF.md) 24 | - [Topology](TOPOLOLGY.md) 25 | 26 | ## Other 27 | - [Chat templates and tokenizers](CHAT_TOK.md) 28 | - [Paged Attention](PAGED_ATTENTION.md) 29 | - [Sampling](SAMPLING.md) 30 | - [TOML selector](TOML_SELECTOR.md) 31 | - [Tool calling](TOOL_CALLING.md) 32 | 33 | ## Cross-device inference 34 | - [Device mapping](DEVICE_MAPPING.md) 35 | - [Topology](TOPOLOLGY.md) 36 | 37 | -------------------------------------------------------------------------------- /mistralrs/examples/grammar/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use mistralrs::{ 3 | IsqType, PagedAttentionMetaBuilder, RequestBuilder, TextMessageRole, TextModelBuilder, 4 | }; 5 | 6 | #[tokio::main] 7 | async fn main() -> Result<()> { 8 | let model = TextModelBuilder::new("microsoft/Phi-3.5-mini-instruct") 9 | .with_isq(IsqType::Q4K) 10 | .with_logging() 11 | .with_paged_attn(|| PagedAttentionMetaBuilder::default().build())? 
12 | .build() 13 | .await?; 14 | 15 | // Bullet list regex 16 | let request = RequestBuilder::new() 17 | .set_constraint(mistralrs::Constraint::Regex( 18 | "(- [^\n]*\n)+(- [^\n]*)(\n\n)?".to_string(), 19 | )) 20 | .add_message(TextMessageRole::User, "Please write a few jokes."); 21 | 22 | let response = model.send_chat_request(request).await?; 23 | 24 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 25 | 26 | Ok(()) 27 | } 28 | -------------------------------------------------------------------------------- /mistralrs-paged-attn/src/cuda_compat.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | 4 | #ifndef USE_ROCM 5 | #define VLLM_LDG(arg) __ldg(arg) 6 | #else 7 | #define VLLM_LDG(arg) *(arg) 8 | #endif 9 | 10 | #ifndef USE_ROCM 11 | #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor_sync(uint32_t(-1), var, lane_mask) 12 | #else 13 | #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask) 14 | #endif 15 | 16 | #ifndef USE_ROCM 17 | #define VLLM_SHFL_SYNC(var, src_lane) __shfl_sync(uint32_t(-1), var, src_lane) 18 | #else 19 | #define VLLM_SHFL_SYNC(var, src_lane) __shfl(var, src_lane) 20 | #endif 21 | 22 | #ifndef USE_ROCM 23 | #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \ 24 | cudaFuncSetAttribute(FUNC, cudaFuncAttributeMaxDynamicSharedMemorySize, VAL) 25 | #else 26 | #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \ 27 | hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL) 28 | #endif 29 | 30 | -------------------------------------------------------------------------------- /mistralrs-core/src/aici/bytes.rs: -------------------------------------------------------------------------------- 1 | use std::mem::size_of; 2 | 3 | use bytemuck_derive::{Pod, Zeroable}; 4 | 5 | pub(crate) type TokenId = u32; 6 | 7 | #[derive(Clone, Copy, PartialEq, Eq, Debug, Zeroable, Pod)] 8 | #[repr(C)] 9 | pub struct TokRxInfo { 10 | pub vocab_size: u32, 11 | pub tok_eos: TokenId, 12 | } 13 | 14 | #[derive(Clone, Copy, PartialEq, Eq, Debug, Zeroable, Pod)] 15 | #[repr(C)] 16 | pub struct U32Pair(pub u32, pub u32); 17 | 18 | pub fn vec_from_bytes<T: bytemuck::Pod>(bytes: &[u8]) -> Vec<T> { 19 | if bytes.len() % size_of::<T>() != 0 { 20 | panic!( 21 | "vecT: got {} bytes, needed multiple of {}", 22 | bytes.len(), 23 | size_of::<T>() 24 | ); 25 | } 26 | bytemuck::cast_slice(bytes).to_vec() 27 | } 28 | 29 | pub fn to_hex_string(bytes: &[u8]) -> String { 30 | bytes 31 | .iter() 32 | .map(|b| format!("{:02x}", b)) 33 | .collect::<Vec<String>>() 34 | .join("") 35 | } 36 | -------------------------------------------------------------------------------- /examples/python/speculative.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest, Architecture 2 | 3 | runner = Runner( 4 | which=Which.Plain( 5 | model_id="mistralai/Mistral-7B-Instruct-v0.1", 6 | arch=Architecture.Mistral, 7 | ), 8 | which_draft=Which.GGUF( 9 | tok_model_id="mistralai/Mistral-7B-Instruct-v0.1", 10 | quantized_model_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF", 11 | quantized_filename="mistral-7b-instruct-v0.1.Q4_K_M.gguf", 12 | ), 13 | speculative_gamma=32, 14 | ) 15 | 16 | res = runner.send_chat_completion_request( 17 | ChatCompletionRequest( 18 | model="mistral", 19 | messages=[ 20 | {"role": "user", "content": "Tell me a story about the Rust type system."} 21 | ], 22 | max_tokens=256, 23 | presence_penalty=1.0, 24 |
top_p=0.1, 25 | temperature=0.1, 26 | ) 27 | ) 28 | print(res.choices[0].message.content) 29 | print(res.usage) 30 | -------------------------------------------------------------------------------- /docs/DEVICE_MAPPING.md: -------------------------------------------------------------------------------- 1 | # Device mapping 2 | 3 | There are 2 ways to do device mapping: 4 | 1) Specify the number of layers to put on the GPU - this uses the GPU with ordinal 0. 5 | 2) Specify the ordinals and number of layers - this allows for cross-GPU device mapping. 6 | 7 | The format for the ordinals and number of layers is `ORD:NUM;...` where ORD is the unique ordinal and NUM is the number of layers for that GPU. This may be repeated as many times as necessary. 8 | 9 | > Note: We refer to GPU layers as "device layers" throughout mistral.rs. 10 | 11 | ## Example of specifying ordinals 12 | ``` 13 | cargo run --release --features cuda -- -n "0:16;1:16" -i plain -m gradientai/Llama-3-8B-Instruct-262k -a llama 14 | ``` 15 | 16 | > Note: In the Python API, the "0:16;1:16" string is passed as the list `["0:16", "1:16"]`. 17 | 18 | ## Example of specifying the number of GPU layers 19 | ``` 20 | cargo run --release --features cuda -- -n 16 -i plain -m gradientai/Llama-3-8B-Instruct-262k -a llama 21 | ``` -------------------------------------------------------------------------------- /examples/python/lora_zephyr.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest 2 | 3 | runner = Runner( 4 | which=Which.LoraGGUF( 5 | tok_model_id=None, # Automatically determine from ordering file 6 | quantized_model_id="TheBloke/zephyr-7B-beta-GGUF", 7 | quantized_filename="zephyr-7b-beta.Q4_0.gguf", 8 | xlora_model_id="lamm-mit/x-lora", 9 | order="orderings/xlora-paper-ordering.json", 10 | ) 11 | ) 12 | 13 | # Example: Make adapter_3 the active adapter 14 | runner.activate_adapters(["adapter_3"]) 15 | 16 | res = runner.send_chat_completion_request( 17 | ChatCompletionRequest( 18 | model="mistral", 19 | messages=[ 20 | {"role": "user", "content": "Tell me a story about the Rust type system."} 21 | ], 22 | max_tokens=256, 23 | presence_penalty=1.0, 24 | top_p=0.1, 25 | temperature=0.5, 26 | ) 27 | ) 28 | print(res.choices[0].message.content) 29 | print(res.usage) 30 | -------------------------------------------------------------------------------- /mistralrs-core/src/diffusion_models/mod.rs: -------------------------------------------------------------------------------- 1 | pub(crate) mod clip; 2 | pub(crate) mod flux; 3 | pub(crate) mod processor; 4 | pub(crate) mod response; 5 | pub(crate) mod t5; 6 | 7 | macro_rules! generate_repr { 8 | ($t:ident) => { 9 | #[cfg(feature = "pyo3_macros")] 10 | #[pyo3::pymethods] 11 | impl $t { 12 | fn __repr__(&self) -> String { 13 | format!("{self:#?}") 14 | } 15 | } 16 | }; 17 | } 18 | 19 | #[cfg_attr(feature = "pyo3_macros", pyo3::pyclass)] 20 | #[cfg_attr(feature = "pyo3_macros", pyo3(get_all))] 21 | #[derive(Debug, Clone)] 22 | pub struct DiffusionGenerationParams { 23 | pub height: usize, 24 | pub width: usize, 25 | } 26 | 27 | generate_repr!(DiffusionGenerationParams); 28 | 29 | impl Default for DiffusionGenerationParams { 30 | /// Image dimensions will be 720x1280. 
31 | fn default() -> Self { 32 | Self { 33 | height: 720, 34 | width: 1280, 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /mistralrs-bench/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "mistralrs-bench" 3 | publish = false 4 | version.workspace = true 5 | edition.workspace = true 6 | description.workspace = true 7 | homepage.workspace = true 8 | repository.workspace = true 9 | keywords.workspace = true 10 | categories.workspace = true 11 | license.workspace = true 12 | 13 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 14 | 15 | [dependencies] 16 | anyhow.workspace = true 17 | candle-core.workspace = true 18 | serde.workspace = true 19 | serde_json.workspace = true 20 | clap.workspace = true 21 | mistralrs-core = { version = "0.3.1", path = "../mistralrs-core" } 22 | tracing.workspace = true 23 | tokio.workspace = true 24 | cli-table = "0.4.7" 25 | 26 | [features] 27 | cuda = ["mistralrs-core/cuda"] 28 | cudnn = ["mistralrs-core/cudnn"] 29 | metal = ["mistralrs-core/metal"] 30 | flash-attn = ["cuda", "mistralrs-core/flash-attn"] 31 | accelerate = ["mistralrs-core/accelerate"] 32 | mkl = ["mistralrs-core/mkl"] 33 | -------------------------------------------------------------------------------- /mistralrs-pyo3/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | .pytest_cache/ 6 | *.py[cod] 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | .venv/ 14 | env/ 15 | bin/ 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | include/ 26 | man/ 27 | venv/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | pip-selfcheck.json 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | 45 | # Translations 46 | *.mo 47 | 48 | # Mr Developer 49 | .mr.developer.cfg 50 | .project 51 | .pydevproject 52 | 53 | # Rope 54 | .ropeproject 55 | 56 | # Django stuff: 57 | *.log 58 | *.pot 59 | 60 | .DS_Store 61 | 62 | # Sphinx documentation 63 | docs/_build/ 64 | 65 | # PyCharm 66 | .idea/ 67 | 68 | # VSCode 69 | .vscode/ 70 | 71 | # Pyenv 72 | .python-version 73 | -------------------------------------------------------------------------------- /mistralrs-core/src/tools/request.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | 3 | use serde_json::Value; 4 | 5 | #[derive(Clone, Debug, serde::Deserialize, serde::Serialize)] 6 | pub enum ToolType { 7 | #[serde(rename = "function")] 8 | Function, 9 | } 10 | 11 | #[derive(Clone, Debug, serde::Deserialize, serde::Serialize)] 12 | pub enum ToolChoice { 13 | #[serde(rename = "none")] 14 | /// Disallow selection of tools. 15 | None, 16 | #[serde(rename = "auto")] 17 | /// Allow automatic selection of any given tool, or none. 18 | Auto, 19 | #[serde(untagged)] 20 | /// Force selection of a given tool. 
21 | Tool(Tool), 22 | } 23 | 24 | #[derive(Clone, Debug, serde::Deserialize, serde::Serialize)] 25 | pub struct Function { 26 | pub description: Option<String>, 27 | pub name: String, 28 | pub parameters: Option<HashMap<String, Value>>, 29 | } 30 | 31 | #[derive(Clone, Debug, serde::Deserialize, serde::Serialize)] 32 | pub struct Tool { 33 | #[serde(rename = "type")] 34 | pub tp: ToolType, 35 | pub function: Function, 36 | } 37 | -------------------------------------------------------------------------------- /mistralrs/examples/flux/main.rs: -------------------------------------------------------------------------------- 1 | use std::time::Instant; 2 | 3 | use anyhow::Result; 4 | use mistralrs::{ 5 | DiffusionGenerationParams, DiffusionLoaderType, DiffusionModelBuilder, 6 | ImageGenerationResponseFormat, 7 | }; 8 | 9 | #[tokio::main] 10 | async fn main() -> Result<()> { 11 | let model = DiffusionModelBuilder::new( 12 | "black-forest-labs/FLUX.1-schnell", 13 | DiffusionLoaderType::FluxOffloaded, 14 | ) 15 | .with_logging() 16 | .build() 17 | .await?; 18 | 19 | let start = Instant::now(); 20 | 21 | let response = model 22 | .generate_image( 23 | "A vibrant sunset in the mountains, 4k, high quality.".to_string(), 24 | ImageGenerationResponseFormat::Url, 25 | DiffusionGenerationParams::default(), 26 | ) 27 | .await?; 28 | 29 | let finished = Instant::now(); 30 | 31 | println!( 32 | "Done! Took {} s. Image saved at: {}", 33 | finished.duration_since(start).as_secs_f32(), 34 | response.data[0].url.as_ref().unwrap() 35 | ); 36 | 37 | Ok(()) 38 | } 39 | -------------------------------------------------------------------------------- /mistralrs/examples/xlora/main.rs: -------------------------------------------------------------------------------- 1 | use std::fs::File; 2 | 3 | use anyhow::Result; 4 | use mistralrs::{TextMessageRole, TextMessages, TextModelBuilder, XLoraModelBuilder}; 5 | 6 | #[tokio::main] 7 | async fn main() -> Result<()> { 8 | let model = 9 | XLoraModelBuilder::from_text_model_builder( 10 | TextModelBuilder::new("HuggingFaceH4/zephyr-7b-beta").with_logging(), 11 | "lamm-mit/x-lora", 12 | serde_json::from_reader(File::open("my-ordering-file.json").unwrap_or_else(|_| { 13 | panic!("Could not load ordering file at my-ordering-file.json") 14 | }))?, 15 | ) 16 | .build() 17 | .await?; 18 | 19 | let messages = 20 | TextMessages::new().add_message(TextMessageRole::User, "Hello! What is graphene."); 21 | 22 | let response = model.send_chat_request(messages).await?; 23 | 24 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 25 | dbg!( 26 | response.usage.avg_prompt_tok_per_sec, 27 | response.usage.avg_compl_tok_per_sec 28 | ); 29 | 30 | Ok(()) 31 | } 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Eric Buehler 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software.
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /mistralrs-core/src/paged_attention/config.rs: -------------------------------------------------------------------------------- 1 | pub trait ModelConfigLike { 2 | fn num_layers(&self) -> usize; 3 | fn hidden_size(&self) -> usize; 4 | fn num_kv_heads(&self) -> usize; 5 | fn num_attn_heads(&self) -> usize; 6 | fn head_dim(&self) -> usize { 7 | self.hidden_size() / self.num_attn_heads() 8 | } 9 | } 10 | 11 | pub struct ModelConfigMetadata { 12 | pub num_layers: usize, 13 | pub hidden_size: usize, 14 | pub num_kv_heads: usize, 15 | pub num_attn_heads: usize, 16 | pub sliding_window: Option<usize>, 17 | pub head_dim: Option<usize>, 18 | } 19 | 20 | impl ModelConfigLike for ModelConfigMetadata { 21 | fn hidden_size(&self) -> usize { 22 | self.hidden_size 23 | } 24 | fn num_attn_heads(&self) -> usize { 25 | self.num_attn_heads 26 | } 27 | fn num_kv_heads(&self) -> usize { 28 | self.num_kv_heads 29 | } 30 | fn num_layers(&self) -> usize { 31 | self.num_layers 32 | } 33 | fn head_dim(&self) -> usize { 34 | self.head_dim 35 | .unwrap_or(self.hidden_size() / self.num_attn_heads()) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /mistralrs-core/src/dummy_paged_attention/config.rs: -------------------------------------------------------------------------------- 1 | pub trait ModelConfigLike { 2 | fn num_layers(&self) -> usize; 3 | fn hidden_size(&self) -> usize; 4 | fn num_kv_heads(&self) -> usize; 5 | fn num_attn_heads(&self) -> usize; 6 | fn head_dim(&self) -> usize { 7 | self.hidden_size() / self.num_attn_heads() 8 | } 9 | } 10 | 11 | pub struct ModelConfigMetadata { 12 | pub num_layers: usize, 13 | pub hidden_size: usize, 14 | pub num_kv_heads: usize, 15 | pub num_attn_heads: usize, 16 | pub sliding_window: Option<usize>, 17 | pub head_dim: Option<usize>, 18 | } 19 | 20 | impl ModelConfigLike for ModelConfigMetadata { 21 | fn hidden_size(&self) -> usize { 22 | self.hidden_size 23 | } 24 | fn num_attn_heads(&self) -> usize { 25 | self.num_attn_heads 26 | } 27 | fn num_kv_heads(&self) -> usize { 28 | self.num_kv_heads 29 | } 30 | fn num_layers(&self) -> usize { 31 | self.num_layers 32 | } 33 | fn head_dim(&self) -> usize { 34 | self.head_dim 35 | .unwrap_or(self.hidden_size() / self.num_attn_heads()) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /mistralrs-core/src/vision_models/mod.rs: -------------------------------------------------------------------------------- 1 | use std::any::Any; 2 | 3 | use candle_core::Tensor; 4 | 5 | pub(crate) mod clip; 6 | pub(crate) mod idefics2; 7 | pub(crate) mod idefics2_input_processor; 8 | pub(crate) mod image_processor; 9 | pub(crate) mod mllama; 10 | 11 | pub(crate) mod llava; 12 | pub(crate) mod phi3; 13 | pub(crate) mod phi3_inputs_processor; 14 | pub(crate) mod preprocessor_config; 15 | pub(crate) mod processor_config; 16 | pub(crate) use llava::llava15; 17 | pub(crate) use llava::llava_inputs_processor; 18 |
pub(crate) use llava::llava_next; 19 | pub(crate) use llava::llava_next_inputs_processor; 20 | 21 | use crate::pipeline::text_models_inputs_processor::{FlashParams, PagedAttentionInputMetadata}; 22 | 23 | pub struct ModelInputs { 24 | pub input_ids: Tensor, 25 | pub seqlen_offsets: Vec<usize>, 26 | pub seqlen_offsets_kernel: Tensor, 27 | pub context_lens: Vec<(usize, usize)>, 28 | pub position_ids: Vec<usize>, 29 | pub pixel_values: Option<Tensor>, 30 | pub model_specific_args: Box<dyn Any>, 31 | pub paged_attn_meta: Option<PagedAttentionInputMetadata>, 32 | pub flash_meta: FlashParams, 33 | } 34 | -------------------------------------------------------------------------------- /mistralrs/examples/gguf/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use mistralrs::{GgufModelBuilder, TextMessageRole, TextMessages}; 3 | 4 | #[tokio::main] 5 | async fn main() -> Result<()> { 6 | let model = GgufModelBuilder::new( 7 | "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF", 8 | vec!["Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf"], 9 | ) 10 | .with_tok_model_id("meta-llama/Meta-Llama-3.1-8B-Instruct") 11 | .with_logging() 12 | .build() 13 | .await?; 14 | 15 | let messages = TextMessages::new() 16 | .add_message( 17 | TextMessageRole::System, 18 | "You are an AI agent with a specialty in programming.", 19 | ) 20 | .add_message( 21 | TextMessageRole::User, 22 | "Hello! How are you? Please write generic binary search function in Rust.", 23 | ); 24 | 25 | let response = model.send_chat_request(messages).await?; 26 | 27 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 28 | dbg!( 29 | response.usage.avg_prompt_tok_per_sec, 30 | response.usage.avg_compl_tok_per_sec 31 | ); 32 | 33 | Ok(()) 34 | } 35 | -------------------------------------------------------------------------------- /mistralrs/examples/phi3_5_moe/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use mistralrs::{ 3 | IsqType, PagedAttentionMetaBuilder, TextMessageRole, TextMessages, TextModelBuilder, 4 | }; 5 | 6 | #[tokio::main] 7 | async fn main() -> Result<()> { 8 | let model = TextModelBuilder::new("microsoft/Phi-3.5-MoE-instruct") 9 | .with_isq(IsqType::Q4K) 10 | .with_logging() 11 | .with_paged_attn(|| PagedAttentionMetaBuilder::default().build())? 12 | .build() 13 | .await?; 14 | 15 | let messages = TextMessages::new() 16 | .add_message( 17 | TextMessageRole::System, 18 | "You are an AI agent with a specialty in programming.", 19 | ) 20 | .add_message( 21 | TextMessageRole::User, 22 | "Hello! How are you? Please write generic binary search function in Rust.", 23 | ); 24 | 25 | let response = model.send_chat_request(messages).await?; 26 | 27 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 28 | dbg!( 29 | response.usage.avg_prompt_tok_per_sec, 30 | response.usage.avg_compl_tok_per_sec 31 | ); 32 | 33 | Ok(()) 34 | } 35 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM rust:latest AS builder 2 | 3 | RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ 4 | && rm -rf /var/lib/apt/lists/* 5 | 6 | WORKDIR /mistralrs 7 | 8 | COPY . .
9 | 10 | RUN cargo build --release --workspace --exclude mistralrs-pyo3 11 | 12 | FROM debian:bookworm-slim AS base 13 | 14 | ENV HUGGINGFACE_HUB_CACHE=/data \ 15 | PORT=80 \ 16 | MKL_ENABLE_INSTRUCTIONS=AVX512_E4 \ 17 | RAYON_NUM_THREADS=8 \ 18 | LD_LIBRARY_PATH=/usr/local/lib 19 | 20 | RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ 21 | libomp-dev \ 22 | ca-certificates \ 23 | libssl-dev \ 24 | curl \ 25 | pkg-config \ 26 | && rm -rf /var/lib/apt/lists/* 27 | 28 | FROM base 29 | 30 | COPY --from=builder /mistralrs/target/release/mistralrs-bench /usr/local/bin/mistralrs-bench 31 | RUN chmod +x /usr/local/bin/mistralrs-bench 32 | COPY --from=builder /mistralrs/target/release/mistralrs-server /usr/local/bin/mistralrs-server 33 | RUN chmod +x /usr/local/bin/mistralrs-server 34 | ENTRYPOINT ["mistralrs-server", "--port", "80", "--token-source", "env:HUGGING_FACE_HUB_TOKEN"] -------------------------------------------------------------------------------- /mistralrs/examples/lora/main.rs: -------------------------------------------------------------------------------- 1 | use std::fs::File; 2 | 3 | use anyhow::Result; 4 | use mistralrs::{LoraModelBuilder, TextMessageRole, TextMessages, TextModelBuilder}; 5 | 6 | #[tokio::main] 7 | async fn main() -> Result<()> { 8 | let model = 9 | LoraModelBuilder::from_text_model_builder( 10 | TextModelBuilder::new("HuggingFaceH4/zephyr-7b-beta").with_logging(), 11 | "lamm-mit/x-lora", 12 | serde_json::from_reader(File::open("my-ordering-file.json").unwrap_or_else(|_| { 13 | panic!("Could not load ordering file at my-ordering-file.json") 14 | }))?, 15 | ) 16 | .build() 17 | .await?; 18 | 19 | let messages = TextMessages::new().add_message( 20 | TextMessageRole::User, 21 | "Hello! How are you? 
Please write generic binary search function in Rust.", 22 | ); 23 | 24 | let response = model.send_chat_request(messages).await?; 25 | 26 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 27 | dbg!( 28 | response.usage.avg_prompt_tok_per_sec, 29 | response.usage.avg_compl_tok_per_sec 30 | ); 31 | 32 | Ok(()) 33 | } 34 | -------------------------------------------------------------------------------- /examples/python/idefics2.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest, VisionArchitecture 2 | 3 | runner = Runner( 4 | which=Which.VisionPlain( 5 | model_id="lamm-mit/Cephalo-Idefics-2-vision-8b-beta", 6 | arch=VisionArchitecture.Idefics2, 7 | ), 8 | ) 9 | 10 | res = runner.send_chat_completion_request( 11 | ChatCompletionRequest( 12 | model="idefics2", 13 | messages=[ 14 | { 15 | "role": "user", 16 | "content": [ 17 | { 18 | "type": "image_url", 19 | "image_url": { 20 | "url": "https://d2r55xnwy6nx47.cloudfront.net/uploads/2018/02/Ants_Lede1300.jpg" 21 | }, 22 | }, 23 | { 24 | "type": "text", 25 | "text": "What is shown in this image?", 26 | }, 27 | ], 28 | }, 29 | ], 30 | max_tokens=256, 31 | presence_penalty=1.0, 32 | top_p=0.1, 33 | temperature=0.1, 34 | ) 35 | ) 36 | print(res.choices[0].message.content) 37 | print(res.usage) 38 | -------------------------------------------------------------------------------- /examples/python/speculative_xlora.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest 2 | 3 | runner = Runner( 4 | which=Which.XLoraGGUF( 5 | tok_model_id=None, # Automatically determine from ordering file 6 | quantized_model_id="TheBloke/zephyr-7B-beta-GGUF", 7 | quantized_filename="zephyr-7b-beta.Q4_0.gguf", 8 | xlora_model_id="lamm-mit/x-lora", 9 | order="orderings/xlora-paper-ordering.json", 10 | tgt_non_granular_index=None, 11 | ), 12 | which_draft=Which.GGUF( 13 | tok_model_id="mistralai/Mistral-7B-Instruct-v0.1", 14 | quantized_model_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF", 15 | quantized_filename="mistral-7b-instruct-v0.1.Q4_K_M.gguf", 16 | ), 17 | speculative_gamma=32, 18 | ) 19 | 20 | res = runner.send_chat_completion_request( 21 | ChatCompletionRequest( 22 | model="mistral", 23 | messages=[ 24 | {"role": "user", "content": "Tell me a story about the Rust type system."} 25 | ], 26 | max_tokens=256, 27 | presence_penalty=1.0, 28 | top_p=0.1, 29 | temperature=0.1, 30 | ) 31 | ) 32 | print(res.choices[0].message.content) 33 | print(res.usage) 34 | -------------------------------------------------------------------------------- /mistralrs/examples/mixture_of_quant_experts/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use mistralrs::{ 3 | IsqType, PagedAttentionMetaBuilder, TextMessageRole, TextMessages, TextModelBuilder, 4 | }; 5 | 6 | #[tokio::main] 7 | async fn main() -> Result<()> { 8 | let model = TextModelBuilder::new("microsoft/Phi-3.5-MoE-instruct") 9 | .with_isq(IsqType::Q4K) 10 | .with_mixture_qexperts_isq() 11 | .with_logging() 12 | .with_paged_attn(|| PagedAttentionMetaBuilder::default().build())? 13 | .build() 14 | .await?; 15 | 16 | let messages = TextMessages::new() 17 | .add_message( 18 | TextMessageRole::System, 19 | "You are an AI agent with a specialty in programming.", 20 | ) 21 | .add_message( 22 | TextMessageRole::User, 23 | "Hello! How are you? 
Please write generic binary search function in Rust.", 24 | ); 25 | 26 | let response = model.send_chat_request(messages).await?; 27 | 28 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 29 | dbg!( 30 | response.usage.avg_prompt_tok_per_sec, 31 | response.usage.avg_compl_tok_per_sec 32 | ); 33 | 34 | Ok(()) 35 | } 36 | -------------------------------------------------------------------------------- /mistralrs-quant/src/utils/mod.rs: -------------------------------------------------------------------------------- 1 | #[cfg(feature = "cuda")] 2 | mod ffi; 3 | pub(crate) mod isq; 4 | mod ops; 5 | 6 | mod uqff; 7 | 8 | pub use ops::{BitWiseOp, LeftshiftOp}; 9 | pub(crate) use uqff::{ 10 | deserialize_tensor, read_dtype, serialize_tensor, version_is_compatible, write_dtype, 11 | HQFF_VERSION, 12 | }; 13 | 14 | #[cfg(feature = "cuda")] 15 | use candle_core::{ 16 | cuda::{cudarc::driver::DevicePtr, CudaDType}, 17 | CudaDevice, Device, Storage, Tensor, WithDType, 18 | }; 19 | 20 | #[cfg(feature = "cuda")] 21 | pub(crate) fn get_cuda_slice<T: WithDType + CudaDType>( 22 | x: &Tensor, 23 | ) -> candle_core::Result<*const T> { 24 | let offset = x.layout().start_offset(); 25 | match &*x.storage_and_layout().0 { 26 | Storage::Cuda(a_storage) => { 27 | Ok(*a_storage.as_cuda_slice::<T>()?.slice(offset..).device_ptr() as *const T) 28 | } 29 | _ => candle_core::bail!("Expected CUDA storage."), 30 | } 31 | } 32 | 33 | #[cfg(feature = "cuda")] 34 | pub(crate) fn get_cuda_device(x: &Tensor) -> candle_core::Result<&CudaDevice> { 35 | match x.device() { 36 | Device::Cuda(dev) => Ok(dev), 37 | _ => candle_core::bail!("Expected CUDA device"), 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /examples/python/phi3v_local_img.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest, VisionArchitecture 2 | 3 | runner = Runner( 4 | which=Which.VisionPlain( 5 | model_id="microsoft/Phi-3.5-vision-instruct", 6 | arch=VisionArchitecture.Phi3V, 7 | ), 8 | ) 9 | 10 | FILENAME = "picture.jpg" 11 | 12 | res = runner.send_chat_completion_request( 13 | ChatCompletionRequest( 14 | model="phi3v", 15 | messages=[ 16 | { 17 | "role": "user", 18 | "content": [ 19 | { 20 | "type": "image_url", 21 | "image_url": { 22 | "url": FILENAME, 23 | }, 24 | }, 25 | { 26 | "type": "text", 27 | "text": "<|image_1|>\nWhat is shown in this image? 
Write a detailed response analyzing the scene.", 28 | }, 29 | ], 30 | } 31 | ], 32 | max_tokens=256, 33 | presence_penalty=1.0, 34 | top_p=0.1, 35 | temperature=0.1, 36 | ) 37 | ) 38 | print(res.choices[0].message.content) 39 | print(res.usage) 40 | -------------------------------------------------------------------------------- /mistralrs-pyo3/Cargo_template.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "mistralrs-pyo3" 3 | authors = ["Eric Buehler"] 4 | version.workspace = true 5 | edition.workspace = true 6 | description.workspace = true 7 | repository.workspace = true 8 | keywords.workspace = true 9 | categories.workspace = true 10 | license.workspace = true 11 | homepage.workspace = true 12 | 13 | [lib] 14 | name = "mistralrs" 15 | crate-type = ["cdylib"] 16 | doc = false 17 | 18 | [dependencies] 19 | pyo3.workspace = true 20 | mistralrs-core = { version = "0.3.1", path = "../mistralrs-core", features=["pyo3_macros","$feature_name"] } 21 | serde.workspace = true 22 | serde_json.workspace = true 23 | candle-core = { git = "https://github.com/EricLBuehler/candle.git", version = "0.7.0", rev = "60eb251", features=["$feature_name"] } 24 | indexmap.workspace = true 25 | accelerate-src = { workspace = true, optional = true } 26 | intel-mkl-src = { workspace = true, optional = true } 27 | either.workspace = true 28 | futures.workspace = true 29 | tokio.workspace = true 30 | image.workspace = true 31 | reqwest.workspace = true 32 | base64.workspace = true 33 | url.workspace = true 34 | data-url.workspace = true 35 | anyhow.workspace = true 36 | 37 | [build-dependencies] 38 | pyo3-build-config = "0.22" 39 | -------------------------------------------------------------------------------- /examples/python/anymoe.py: -------------------------------------------------------------------------------- 1 | from mistralrs import ( 2 | Runner, 3 | Which, 4 | ChatCompletionRequest, 5 | Architecture, 6 | AnyMoeConfig, 7 | AnyMoeExpertType, 8 | ) 9 | 10 | runner = Runner( 11 | which=Which.Plain( 12 | model_id="mistralai/Mistral-7B-Instruct-v0.1", 13 | arch=Architecture.Mistral, 14 | ), 15 | anymoe_config=AnyMoeConfig( 16 | hidden_size=4096, 17 | dataset_json="examples/amoe.json", 18 | prefix="model.layers", 19 | mlp="mlp", 20 | expert_type=AnyMoeExpertType.FineTuned(), 21 | lr=1e-3, 22 | epochs=100, 23 | batch_size=4, 24 | model_ids=["HuggingFaceH4/zephyr-7b-beta"], 25 | layers=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], 26 | loss_csv_path="loss.csv", 27 | ), 28 | ) 29 | 30 | res = runner.send_chat_completion_request( 31 | ChatCompletionRequest( 32 | model="mistral", 33 | messages=[ 34 | {"role": "user", "content": "Tell me a story about the Rust type system."} 35 | ], 36 | max_tokens=256, 37 | presence_penalty=1.0, 38 | top_p=0.1, 39 | temperature=0.1, 40 | ) 41 | ) 42 | print(res.choices[0].message.content) 43 | print(res.usage) 44 | -------------------------------------------------------------------------------- /examples/server/stream_completion_bench.py: -------------------------------------------------------------------------------- 1 | import openai 2 | from datetime import datetime 3 | 4 | Runs = 4 5 | 6 | ENDPOINT = "http://localhost:1234/v1/" 7 | 8 | 9 | def request(stream: bool): 10 | client = openai.Client(api_key="foobar", base_url=ENDPOINT) 11 | return client.chat.completions.create( 12 | model="mistral", 13 | messages=[ 14 | { 15 | "role": "user", 16 | "content": "What is the meaning of life? 
Write a long story.", 17 | } 18 | ], 19 | stream=stream, 20 | max_tokens=400, 21 | temperature=0.0, 22 | ) 23 | 24 | 25 | def run(): 26 | for run in range(Runs): 27 | print("\nCompletion: ") 28 | print("=" * 15) 29 | 30 | now = datetime.now() 31 | request(stream=False) 32 | finished = datetime.now() 33 | 34 | print(f"Duration: {finished-now}") 35 | 36 | print("\nStreaming: ") 37 | print("=" * 15) 38 | 39 | now = datetime.now() 40 | stream = request(stream=True) 41 | for _message in stream: 42 | pass 43 | finished = datetime.now() 44 | 45 | print(f"Duration: {finished-now}") 46 | 47 | 48 | if __name__ == "__main__": 49 | run() 50 | -------------------------------------------------------------------------------- /examples/python/phi3v.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest, VisionArchitecture 2 | 3 | runner = Runner( 4 | which=Which.VisionPlain( 5 | model_id="microsoft/Phi-3.5-vision-instruct", 6 | arch=VisionArchitecture.Phi3V, 7 | ), 8 | ) 9 | 10 | res = runner.send_chat_completion_request( 11 | ChatCompletionRequest( 12 | model="phi3v", 13 | messages=[ 14 | { 15 | "role": "user", 16 | "content": [ 17 | { 18 | "type": "image_url", 19 | "image_url": { 20 | "url": "https://www.nhmagazine.com/content/uploads/2019/05/mtwashingtonFranconia-2-19-18-108-Edit-Edit.jpg" 21 | }, 22 | }, 23 | { 24 | "type": "text", 25 | "text": "<|image_1|>\nWhat is shown in this image? Write a detailed response analyzing the scene.", 26 | }, 27 | ], 28 | } 29 | ], 30 | max_tokens=256, 31 | presence_penalty=1.0, 32 | top_p=0.1, 33 | temperature=0.1, 34 | ) 35 | ) 36 | print(res.choices[0].message.content) 37 | print(res.usage) 38 | -------------------------------------------------------------------------------- /mistralrs-core/src/gguf/chat_template.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use tracing::info; 3 | 4 | use crate::utils::gguf_metadata::ContentMetadata; 5 | 6 | use super::Content; 7 | 8 | struct PropsGGUFTemplate { 9 | chat_template: Option<String>, 10 | } 11 | 12 | impl TryFrom<ContentMetadata<'_>> for PropsGGUFTemplate { 13 | type Error = anyhow::Error; 14 | 15 | fn try_from(c: ContentMetadata) -> Result<Self, Self::Error> { 16 | // No required keys 17 | 18 | let props = Self { 19 | chat_template: c.get_option_value("chat_template")?, 20 | }; 21 | 22 | Ok(props) 23 | } 24 | } 25 | 26 | // Get chat template from GGUF metadata if it exists 27 | pub fn get_gguf_chat_template<R: std::io::Seek + std::io::Read>( 28 | content: &Content<'_, R>, 29 | ) -> Result<Option<String>> { 30 | let metadata = ContentMetadata { 31 | path_prefix: "tokenizer", 32 | metadata: content.get_metadata(), 33 | }; 34 | let props = PropsGGUFTemplate::try_from(metadata)?; 35 | if let Some(ref chat_template) = props.chat_template { 36 | info!( 37 | "Discovered and using GGUF chat template: `{}`", 38 | chat_template.replace('\n', "\\n") 39 | ); 40 | } 41 | Ok(props.chat_template) 42 | } 43 | -------------------------------------------------------------------------------- /mistralrs/examples/idefics2/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use mistralrs::{IsqType, TextMessageRole, VisionLoaderType, VisionMessages, VisionModelBuilder}; 3 | 4 | #[tokio::main] 5 | async fn main() -> Result<()> { 6 | let model = VisionModelBuilder::new( 7 | "HuggingFaceM4/idefics2-8b-chatty", 8 | VisionLoaderType::Idefics2, 9 | ) 10 | .with_isq(IsqType::Q4K) 11 | .with_logging() 12 | .build() 13 | 
.await?; 14 | 15 | let bytes = match reqwest::blocking::get( 16 | "https://d2r55xnwy6nx47.cloudfront.net/uploads/2018/02/Ants_Lede1300.jpg", 17 | ) { 18 | Ok(http_resp) => http_resp.bytes()?.to_vec(), 19 | Err(e) => anyhow::bail!(e), 20 | }; 21 | let image = image::load_from_memory(&bytes)?; 22 | 23 | let messages = VisionMessages::new().add_idefics_image_message( 24 | TextMessageRole::User, 25 | "What is depicted here? Please describe the scene in detail.", 26 | image, 27 | ); 28 | 29 | let response = model.send_chat_request(messages).await?; 30 | 31 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 32 | dbg!( 33 | response.usage.avg_prompt_tok_per_sec, 34 | response.usage.avg_compl_tok_per_sec 35 | ); 36 | 37 | Ok(()) 38 | } 39 | -------------------------------------------------------------------------------- /mistralrs/examples/llava_next/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use mistralrs::{IsqType, TextMessageRole, VisionLoaderType, VisionMessages, VisionModelBuilder}; 3 | 4 | #[tokio::main] 5 | async fn main() -> Result<()> { 6 | let model = VisionModelBuilder::new( 7 | "llava-hf/llava-v1.6-mistral-7b-hf", 8 | VisionLoaderType::LLaVANext, 9 | ) 10 | .with_isq(IsqType::Q4K) 11 | .with_logging() 12 | .build() 13 | .await?; 14 | 15 | let bytes = match reqwest::blocking::get( 16 | "https://d2r55xnwy6nx47.cloudfront.net/uploads/2018/02/Ants_Lede1300.jpg", 17 | ) { 18 | Ok(http_resp) => http_resp.bytes()?.to_vec(), 19 | Err(e) => anyhow::bail!(e), 20 | }; 21 | let image = image::load_from_memory(&bytes)?; 22 | 23 | let messages = VisionMessages::new().add_llava_image_message( 24 | TextMessageRole::User, 25 | "What is depicted here? Please describe the scene in detail.", 26 | image, 27 | ); 28 | 29 | let response = model.send_chat_request(messages).await?; 30 | 31 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 32 | dbg!( 33 | response.usage.avg_prompt_tok_per_sec, 34 | response.usage.avg_compl_tok_per_sec 35 | ); 36 | 37 | Ok(()) 38 | } 39 | -------------------------------------------------------------------------------- /mistralrs/examples/phi3v/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use mistralrs::{IsqType, TextMessageRole, VisionLoaderType, VisionMessages, VisionModelBuilder}; 3 | 4 | #[tokio::main] 5 | async fn main() -> Result<()> { 6 | let model = 7 | VisionModelBuilder::new("microsoft/Phi-3.5-vision-instruct", VisionLoaderType::Phi3V) 8 | .with_isq(IsqType::Q4K) 9 | .with_logging() 10 | .build() 11 | .await?; 12 | 13 | let bytes = match reqwest::blocking::get( 14 | "https://d2r55xnwy6nx47.cloudfront.net/uploads/2018/02/Ants_Lede1300.jpg", 15 | ) { 16 | Ok(http_resp) => http_resp.bytes()?.to_vec(), 17 | Err(e) => anyhow::bail!(e), 18 | }; 19 | let image = image::load_from_memory(&bytes)?; 20 | 21 | let messages = VisionMessages::new().add_phiv_image_message( 22 | TextMessageRole::User, 23 | "What is depicted here? 
Please describe the scene in detail.", 24 | image, 25 | ); 26 | 27 | let response = model.send_chat_request(messages).await?; 28 | 29 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 30 | dbg!( 31 | response.usage.avg_prompt_tok_per_sec, 32 | response.usage.avg_compl_tok_per_sec 33 | ); 34 | 35 | Ok(()) 36 | } 37 | -------------------------------------------------------------------------------- /examples/python/llava_next.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest, VisionArchitecture 2 | 3 | runner = Runner( 4 | which=Which.VisionPlain( 5 | model_id="llava-hf/llava-v1.6-mistral-7b-hf", 6 | arch=VisionArchitecture.LLaVANext, 7 | ), 8 | ) 9 | 10 | res = runner.send_chat_completion_request( 11 | ChatCompletionRequest( 12 | model="llava_next", 13 | messages=[ 14 | { 15 | "role": "user", 16 | "content": [ 17 | { 18 | "type": "image_url", 19 | "image_url": { 20 | "url": "https://www.nhmagazine.com/content/uploads/2019/05/mtwashingtonFranconia-2-19-18-108-Edit-Edit.jpg" 21 | }, 22 | }, 23 | { 24 | "type": "text", 25 | "text": "What is shown in this image? Write a detailed response analyzing the scene.", 26 | }, 27 | ], 28 | } 29 | ], 30 | max_tokens=256, 31 | presence_penalty=1.0, 32 | top_p=0.1, 33 | temperature=0.1, 34 | ) 35 | ) 36 | print(res.choices[0].message.content) 37 | print(res.usage) 38 | -------------------------------------------------------------------------------- /mistralrs-core/src/gguf/mod.rs: -------------------------------------------------------------------------------- 1 | mod chat_template; 2 | mod content; 3 | mod gguf_tokenizer; 4 | use strum::EnumString; 5 | 6 | use anyhow::{Context, Result}; 7 | pub(crate) use chat_template::get_gguf_chat_template; 8 | pub(crate) use content::Content; 9 | pub(crate) use gguf_tokenizer::{convert_gguf_to_hf_tokenizer, GgufTokenizerConversion}; 10 | use std::str::FromStr; 11 | 12 | pub const GGUF_MULTI_FILE_DELIMITER: &str = " "; 13 | 14 | #[derive(Debug, EnumString, Clone, Copy)] 15 | #[strum(serialize_all = "kebab-case")] 16 | pub enum GGUFArchitecture { 17 | Llama, 18 | Mpt, 19 | Gptneox, 20 | Gptj, 21 | Gpt2, 22 | Bloom, 23 | Falcon, 24 | Mamba, 25 | Rwkv, 26 | Phi2, 27 | Phi3, 28 | Starcoder2, 29 | } 30 | 31 | // Wraps from_str() for some convenience: 32 | // - Case-insensitive variant matching (TODO: is this desirable?) 
33 | // - Customized error until potential upstream support: https://github.com/Peternator7/strum/issues/332 34 | impl GGUFArchitecture { 35 | pub fn from_value<T: AsRef<str> + std::fmt::Display>(value: T) -> Result<Self> { 36 | Self::from_str(&value.as_ref().to_ascii_lowercase()) 37 | .with_context(|| format!("Unknown GGUF architecture `{value}`")) 38 | .map_err(anyhow::Error::msg) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /.github/workflows/release_python.yml: -------------------------------------------------------------------------------- 1 | name: py-release 2 | 3 | # gh workflow run py-release 4 | # This also runs on release deploy 5 | on: 6 | workflow_dispatch: 7 | release: 8 | types: [published] 9 | push: 10 | tags: 11 | - '**[0-9]+.[0-9]+.[0-9]+*' 12 | 13 | permissions: 14 | contents: write 15 | pages: write 16 | id-token: write 17 | 18 | jobs: 19 | upload: 20 | runs-on: ubuntu-latest 21 | strategy: 22 | fail-fast: false 23 | matrix: 24 | rust: [stable] 25 | 26 | steps: 27 | - name: Checkout 28 | uses: actions/checkout@v4 29 | 30 | - uses: actions-rs/toolchain@v1 31 | with: 32 | profile: minimal 33 | toolchain: ${{ matrix.rust }} 34 | override: true 35 | 36 | - name: Set up Python 37 | uses: actions/setup-python@v5 38 | with: 39 | python-version: 3.8 40 | 41 | - name: Upload 42 | env: 43 | PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} 44 | run: | 45 | sudo apt install libssl-dev 46 | sudo apt install pkg-config 47 | pip install maturin[patchelf]==1.4.0 twine 48 | python3 -m venv venv 49 | source venv/bin/activate 50 | cd mistralrs-pyo3 51 | python3 upload.py 52 | -------------------------------------------------------------------------------- /mistralrs-core/src/amoe/macros.rs: -------------------------------------------------------------------------------- 1 | #[macro_export] 2 | #[doc(hidden)] 3 | macro_rules! get_delta_from_lora_ab { 4 | ($vb_mlp:expr, $rank:expr, $alpha:expr, ($in_d:expr, $out_d:expr), $name:expr) => {{ 5 | let proj_a = $vb_mlp 6 | .pp($name) 7 | .pp("lora_A") 8 | .get(($rank, $in_d), "weight")?; 9 | let proj_b = $vb_mlp 10 | .pp($name) 11 | .pp("lora_B") 12 | .get(($out_d, $rank), "weight")?; 13 | let scale = if $rank > 0 { 14 | $alpha / $rank as f64 15 | } else { 16 | 1.0 17 | }; 18 | (proj_b.matmul(&proj_a)? * scale)? 19 | }}; 20 | } 21 | 22 | #[macro_export] 23 | #[doc(hidden)] 24 | macro_rules!
merge_delta { 25 | ($qmatmul:expr, $delta:expr) => { 26 | match &$qmatmul { 27 | QMatMul::Tensor(w) => QMatMul::Tensor((w + $delta)?), 28 | QMatMul::TensorF16(w) => QMatMul::TensorF16((w + $delta)?), 29 | QMatMul::QTensor(w) => { 30 | let (w, dtype) = (w.dequantize(&w.device())?, w.dtype()); 31 | QMatMul::QTensor(std::sync::Arc::new( 32 | candle_core::quantized::QTensor::quantize(&(w + $delta)?, dtype)?, 33 | )) 34 | } 35 | } 36 | }; 37 | } 38 | -------------------------------------------------------------------------------- /mistralrs-vision/tests/integration.rs: -------------------------------------------------------------------------------- 1 | use candle_core::Device; 2 | use image::{ColorType, DynamicImage}; 3 | use mistralrs_vision::{ApplyTransforms, InterpolateResize, Normalize, ToTensor, Transforms}; 4 | 5 | #[test] 6 | fn normalize() { 7 | let image = DynamicImage::new(3, 4, ColorType::Rgb8); 8 | let transforms = Transforms { 9 | input: &ToTensor, 10 | inner_transforms: &[&Normalize { 11 | mean: vec![0.5, 0.5, 0.5], 12 | std: vec![0.5, 0.5, 0.5], 13 | }], 14 | }; 15 | let transformed = image.apply(transforms, &Device::Cpu).unwrap(); 16 | assert_eq!(transformed.dims(), &[3, 4, 3]); 17 | } 18 | 19 | #[test] 20 | fn normalize_and_interpolate_resize() { 21 | let image = DynamicImage::new(300, 400, ColorType::Rgb8); 22 | let transforms = Transforms { 23 | input: &ToTensor, 24 | inner_transforms: &[ 25 | &Normalize { 26 | mean: vec![0.5, 0.5, 0.5], 27 | std: vec![0.5, 0.5, 0.5], 28 | }, 29 | &InterpolateResize { 30 | target_h: 336, 31 | target_w: 336, 32 | }, 33 | ], 34 | }; 35 | let transformed = image.apply(transforms, &Device::Cpu).unwrap(); 36 | assert_eq!(transformed.dims(), &[3, 336, 336]); 37 | } 38 | -------------------------------------------------------------------------------- /mistralrs/examples/llava/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use mistralrs::{IsqType, TextMessageRole, VisionLoaderType, VisionMessages, VisionModelBuilder}; 3 | 4 | #[tokio::main] 5 | async fn main() -> Result<()> { 6 | let model = VisionModelBuilder::new("llava-hf/llava-1.5-7b-hf", VisionLoaderType::LLaVA) 7 | .with_isq(IsqType::Q4K) 8 | .with_chat_template("chat_templates/vicuna.json") 9 | .with_logging() 10 | .build() 11 | .await?; 12 | 13 | let bytes = match reqwest::blocking::get( 14 | "https://d2r55xnwy6nx47.cloudfront.net/uploads/2018/02/Ants_Lede1300.jpg", 15 | ) { 16 | Ok(http_resp) => http_resp.bytes()?.to_vec(), 17 | Err(e) => anyhow::bail!(e), 18 | }; 19 | let image = image::load_from_memory(&bytes)?; 20 | 21 | let messages = VisionMessages::new().add_llava_image_message( 22 | TextMessageRole::User, 23 | "What is depicted here? 
Please describe the scene in detail.", 24 | image, 25 | ); 26 | 27 | let response = model.send_chat_request(messages).await?; 28 | 29 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 30 | dbg!( 31 | response.usage.avg_prompt_tok_per_sec, 32 | response.usage.avg_compl_tok_per_sec 33 | ); 34 | 35 | Ok(()) 36 | } 37 | -------------------------------------------------------------------------------- /examples/python/anymoe_inference.py: -------------------------------------------------------------------------------- 1 | from mistralrs import ( 2 | Runner, 3 | Which, 4 | ChatCompletionRequest, 5 | Architecture, 6 | AnyMoeConfig, 7 | AnyMoeExpertType, 8 | ) 9 | 10 | runner = Runner( 11 | which=Which.Plain( 12 | model_id="mistralai/Mistral-7B-Instruct-v0.1", 13 | arch=Architecture.Mistral, 14 | ), 15 | anymoe_config=AnyMoeConfig( 16 | hidden_size=4096, 17 | dataset_json="examples/amoe.json", 18 | prefix="model.layers", 19 | mlp="mlp", 20 | expert_type=AnyMoeExpertType.FineTuned(), 21 | lr=1e-3, 22 | epochs=100, 23 | batch_size=4, 24 | model_ids=["HuggingFaceH4/zephyr-7b-beta"], 25 | layers=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], 26 | gate_model_id="path/to/pretrained/gating_model_id", 27 | loss_csv_path="loss.csv", 28 | ), 29 | ) 30 | 31 | res = runner.send_chat_completion_request( 32 | ChatCompletionRequest( 33 | model="mistral", 34 | messages=[ 35 | {"role": "user", "content": "Tell me a story about the Rust type system."} 36 | ], 37 | max_tokens=256, 38 | presence_penalty=1.0, 39 | top_p=0.1, 40 | temperature=0.1, 41 | ) 42 | ) 43 | print(res.choices[0].message.content) 44 | print(res.usage) 45 | -------------------------------------------------------------------------------- /examples/python/anymoe_lora.py: -------------------------------------------------------------------------------- 1 | from mistralrs import ( 2 | Runner, 3 | Which, 4 | ChatCompletionRequest, 5 | Architecture, 6 | AnyMoeConfig, 7 | AnyMoeExpertType, 8 | ) 9 | 10 | runner = Runner( 11 | which=Which.Plain( 12 | model_id="mistralai/Mistral-7B-Instruct-v0.1", 13 | arch=Architecture.Mistral, 14 | ), 15 | anymoe_config=AnyMoeConfig( 16 | hidden_size=4096, 17 | dataset_json="examples/amoe.json", 18 | prefix="model.layers", 19 | mlp="mlp", 20 | expert_type=AnyMoeExpertType.LoraAdapter( 21 | rank=64, alpha=16.0, target_modules=["gate_proj"] 22 | ), 23 | lr=1e-3, 24 | epochs=100, 25 | batch_size=4, 26 | model_ids=["typeof/zephyr-7b-beta-lora"], 27 | # For inference (use a pretrained gating layer) see `anymoe_inference.py` 28 | loss_csv_path="loss.csv", 29 | ), 30 | ) 31 | 32 | res = runner.send_chat_completion_request( 33 | ChatCompletionRequest( 34 | model="mistral", 35 | messages=[ 36 | {"role": "user", "content": "Tell me a story about the Rust type system."} 37 | ], 38 | max_tokens=256, 39 | presence_penalty=1.0, 40 | top_p=0.1, 41 | temperature=0.1, 42 | ) 43 | ) 44 | print(res.choices[0].message.content) 45 | print(res.usage) 46 | -------------------------------------------------------------------------------- /mistralrs/examples/paged_attn/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use mistralrs::{ 3 | IsqType, MemoryGpuConfig, PagedAttentionMetaBuilder, TextMessageRole, TextMessages, 4 | TextModelBuilder, 5 | }; 6 | 7 | #[tokio::main] 8 | async fn main() -> Result<()> { 9 | let model = TextModelBuilder::new("microsoft/Phi-3.5-mini-instruct") 10 | .with_isq(IsqType::Q8_0) 11 | .with_logging() 12 | 
.with_paged_attn(|| { 13 | PagedAttentionMetaBuilder::default() 14 | .with_block_size(32) 15 | .with_gpu_memory(MemoryGpuConfig::ContextSize(1024)) 16 | .build() 17 | })? 18 | .build() 19 | .await?; 20 | 21 | let messages = TextMessages::new() 22 | .add_message( 23 | TextMessageRole::System, 24 | "You are an AI agent with a specialty in programming.", 25 | ) 26 | .add_message( 27 | TextMessageRole::User, 28 | "Hello! How are you? Please write generic binary search function in Rust.", 29 | ); 30 | 31 | let response = model.send_chat_request(messages).await?; 32 | 33 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 34 | dbg!( 35 | response.usage.avg_prompt_tok_per_sec, 36 | response.usage.avg_compl_tok_per_sec 37 | ); 38 | 39 | Ok(()) 40 | } 41 | -------------------------------------------------------------------------------- /examples/python/phi3v_base64.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest, VisionArchitecture 2 | import base64 3 | 4 | runner = Runner( 5 | which=Which.VisionPlain( 6 | model_id="microsoft/Phi-3.5-vision-instruct", 7 | arch=VisionArchitecture.Phi3V, 8 | ), 9 | ) 10 | 11 | FILENAME = "picture.jpg" 12 | with open(FILENAME, "rb") as image_file: 13 | encoded_string = base64.b64encode(image_file.read()).decode("utf-8") 14 | 15 | res = runner.send_chat_completion_request( 16 | ChatCompletionRequest( 17 | model="phi3v", 18 | messages=[ 19 | { 20 | "role": "user", 21 | "content": [ 22 | { 23 | "type": "image_url", 24 | "image_url": { 25 | "url": str(encoded_string), 26 | }, 27 | }, 28 | { 29 | "type": "text", 30 | "text": "<|image_1|>\nWhat is shown in this image? Write a detailed response analyzing the scene.", 31 | }, 32 | ], 33 | } 34 | ], 35 | max_tokens=256, 36 | presence_penalty=1.0, 37 | top_p=0.1, 38 | temperature=0.1, 39 | ) 40 | ) 41 | print(res.choices[0].message.content) 42 | print(res.usage) 43 | -------------------------------------------------------------------------------- /examples/python/llama_vision.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest, VisionArchitecture 2 | 3 | # MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct" 4 | MODEL_ID = "lamm-mit/Cephalo-Llama-3.2-11B-Vision-Instruct-128k" 5 | 6 | runner = Runner( 7 | which=Which.VisionPlain( 8 | model_id=MODEL_ID, 9 | arch=VisionArchitecture.VLlama, 10 | ), 11 | ) 12 | 13 | res = runner.send_chat_completion_request( 14 | ChatCompletionRequest( 15 | model="llama-vision", 16 | messages=[ 17 | { 18 | "role": "user", 19 | "content": [ 20 | { 21 | "type": "image_url", 22 | "image_url": { 23 | "url": "https://www.nhmagazine.com/content/uploads/2019/05/mtwashingtonFranconia-2-19-18-108-Edit-Edit.jpg" 24 | }, 25 | }, 26 | { 27 | "type": "text", 28 | "text": "What is shown in this image? 
Write a detailed response analyzing the scene.", 29 | }, 30 | ], 31 | } 32 | ], 33 | max_tokens=256, 34 | presence_penalty=1.0, 35 | top_p=0.1, 36 | temperature=0.1, 37 | ) 38 | ) 39 | print(res.choices[0].message.content) 40 | print(res.usage) 41 | -------------------------------------------------------------------------------- /mistralrs/examples/llama_vision/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use mistralrs::{IsqType, TextMessageRole, VisionLoaderType, VisionMessages, VisionModelBuilder}; 3 | 4 | // const MODEL_ID: &str = "meta-llama/Llama-3.2-11B-Vision-Instruct"; 5 | const MODEL_ID: &str = "lamm-mit/Cephalo-Llama-3.2-11B-Vision-Instruct-128k"; 6 | 7 | #[tokio::main] 8 | async fn main() -> Result<()> { 9 | let model = VisionModelBuilder::new(MODEL_ID, VisionLoaderType::VLlama) 10 | .with_isq(IsqType::Q4K) 11 | .with_logging() 12 | .build() 13 | .await?; 14 | 15 | let bytes = match reqwest::blocking::get( 16 | "https://d2r55xnwy6nx47.cloudfront.net/uploads/2018/02/Ants_Lede1300.jpg", 17 | ) { 18 | Ok(http_resp) => http_resp.bytes()?.to_vec(), 19 | Err(e) => anyhow::bail!(e), 20 | }; 21 | let image = image::load_from_memory(&bytes)?; 22 | 23 | let messages = VisionMessages::new().add_vllama_image_message( 24 | TextMessageRole::User, 25 | "What is depicted here? Please describe the scene in detail.", 26 | image, 27 | ); 28 | 29 | let response = model.send_chat_request(messages).await?; 30 | 31 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 32 | dbg!( 33 | response.usage.avg_prompt_tok_per_sec, 34 | response.usage.avg_compl_tok_per_sec 35 | ); 36 | 37 | Ok(()) 38 | } 39 | -------------------------------------------------------------------------------- /mistralrs-pyo3/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "mistralrs-pyo3" 3 | authors = ["Eric Buehler"] 4 | version.workspace = true 5 | edition.workspace = true 6 | description.workspace = true 7 | repository.workspace = true 8 | keywords.workspace = true 9 | categories.workspace = true 10 | license.workspace = true 11 | homepage.workspace = true 12 | 13 | [lib] 14 | name = "mistralrs" 15 | crate-type = ["cdylib"] 16 | doc = false 17 | 18 | [dependencies] 19 | pyo3.workspace = true 20 | mistralrs-core = { version = "0.3.1", path = "../mistralrs-core", features = ["pyo3_macros"] } 21 | serde.workspace = true 22 | serde_json.workspace = true 23 | candle-core.workspace = true 24 | indexmap.workspace = true 25 | accelerate-src = { workspace = true, optional = true } 26 | intel-mkl-src = { workspace = true, optional = true } 27 | either.workspace = true 28 | futures.workspace = true 29 | tokio.workspace = true 30 | image.workspace = true 31 | reqwest.workspace = true 32 | base64.workspace = true 33 | url.workspace = true 34 | data-url.workspace = true 35 | anyhow.workspace = true 36 | 37 | [build-dependencies] 38 | pyo3-build-config = "0.22" 39 | 40 | [features] 41 | cuda = ["candle-core/cuda", "mistralrs-core/cuda"] 42 | cudnn = ["candle-core/cudnn", "mistralrs-core/cudnn"] 43 | metal = ["candle-core/metal", "mistralrs-core/metal"] 44 | flash-attn = ["cuda", "mistralrs-core/flash-attn"] 45 | accelerate = ["mistralrs-core/accelerate"] 46 | mkl = ["mistralrs-core/mkl"] 47 | -------------------------------------------------------------------------------- /docs/QUANTS.md: -------------------------------------------------------------------------------- 1 | # 
Quantization in mistral.rs 2 | 3 | Mistral.rs supports the following quantization: 4 | - GGUF/GGML 5 | - Q, K type 6 | - Supported in GGUF/GGML and GGUF/GGML adapter models 7 | - Supported in all plain and adapter models 8 | - I quants coming! 9 | - CPU, CUDA, Metal (all supported devices) 10 | - 2, 3, 4, 5, 6, 8 bit 11 | - GPTQ 12 | - Supported in all plain and adapter models 13 | - CUDA only 14 | - 2, 3, 4, 8 bit 15 | - HQQ 16 | - Supported in all plain and adapter models via ISQ 17 | - CUDA and CPU only 18 | - 4, 8 bit 19 | - ISQ 20 | - Q, K type GGUF quants 21 | - Supported in all plain and adapter models 22 | - HQQ quants 23 | - CPU, CUDA, Metal (all supported devices) 24 | 25 | ## Using a GGUF quantized model 26 | - Use the `gguf` (cli) / `GGUF` (Python) model selector 27 | - Provide the GGUF file 28 | 29 | ``` 30 | cargo run --features cuda -- -i gguf -f my-gguf-file.gguf 31 | ``` 32 | 33 | ## Using ISQ 34 | See the [docs](ISQ.md) 35 | 36 | ``` 37 | cargo run --features cuda -- -i --isq Q4K plain -m microsoft/Phi-3-mini-4k-instruct -a phi3 38 | ``` 39 | 40 | ## Using a GPTQ quantized model 41 | - Use the `plain` (cli) / `Plain` (Python) model selector 42 | - Provide the model ID for the GPTQ model 43 | - Mistral.rs will automatically detect and use GPTQ quantization. 44 | 45 | ``` 46 | cargo run --features cuda -- -i plain -m kaitchup/Phi-3-mini-4k-instruct-gptq-4bit -a phi3 47 | ``` -------------------------------------------------------------------------------- /mistralrs-core/src/utils/debug.rs: -------------------------------------------------------------------------------- 1 | use candle_core::{Device, DeviceLocation}; 2 | use tracing::level_filters::LevelFilter; 3 | use tracing_subscriber::EnvFilter; 4 | 5 | use crate::DEBUG; 6 | 7 | static LOGGER: std::sync::OnceLock<()> = std::sync::OnceLock::new(); 8 | 9 | /// This should be called to initialize the debug flag and logging. 10 | /// This should not be called in mistralrs-core code due to Rust usage. 
11 | pub fn initialize_logging() { 12 | let is_debug = std::env::var("MISTRALRS_DEBUG") 13 | .unwrap_or_default() 14 | .contains('1'); 15 | DEBUG.store(is_debug, std::sync::atomic::Ordering::Relaxed); 16 | 17 | LOGGER.get_or_init(|| { 18 | let filter = EnvFilter::builder() 19 | .with_default_directive(if is_debug { 20 | LevelFilter::DEBUG.into() 21 | } else { 22 | LevelFilter::INFO.into() 23 | }) 24 | .from_env_lossy(); 25 | tracing_subscriber::fmt().with_env_filter(filter).init(); 26 | }); 27 | } 28 | 29 | pub(crate) trait DeviceRepr { 30 | fn device_pretty_repr(&self) -> String; 31 | } 32 | 33 | impl DeviceRepr for Device { 34 | fn device_pretty_repr(&self) -> String { 35 | match self.location() { 36 | DeviceLocation::Cpu => "cpu".to_string(), 37 | DeviceLocation::Cuda { gpu_id } => format!("cuda[{gpu_id}]"), 38 | DeviceLocation::Metal { gpu_id } => format!("metal[{gpu_id}]"), 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /scripts/testgen_vision.py: -------------------------------------------------------------------------------- 1 | import transformers 2 | 3 | processor = transformers.AutoProcessor.from_pretrained( 4 | "microsoft/Phi-3.5-vision-instruct", trust_remote_code=True 5 | ) 6 | 7 | res = processor.tokenizer.apply_chat_template( 8 | [ 9 | { 10 | "role": "system", 11 | "content": [{"type": "text", "text": "You are a helpful assistant"}], 12 | }, 13 | { 14 | "role": "user", 15 | "content": [ 16 | {"type": "image"}, 17 | { 18 | "type": "text", 19 | "text": "Hello, please describe the above.", 20 | }, 21 | ], 22 | }, 23 | {"role": "assistant", "content": [{"type": "text", "text": "Hi there"}]}, 24 | { 25 | "role": "user", 26 | "content": [ 27 | {"type": "image"}, 28 | {"type": "text", "text": "This is me, who are you"}, 29 | ], 30 | }, 31 | { 32 | "role": "assistant", 33 | "content": [{"type": "text", "text": " I am an assistant "}], 34 | }, 35 | { 36 | "role": "user", 37 | "content": [ 38 | {"type": "image"}, 39 | {"type": "text", "text": "Another question, what is this?"}, 40 | ], 41 | }, 42 | ], 43 | add_generation_prompt=True, 44 | tokenize=False, 45 | ) 46 | print(res.replace("\n", "\\n")) 47 | -------------------------------------------------------------------------------- /mistralrs-quant/src/gptq/ffi.rs: -------------------------------------------------------------------------------- 1 | use half::f16; 2 | 3 | #[allow(dead_code)] 4 | extern "C" { 5 | pub(crate) fn reconstruct_exllama( 6 | b_q_weight: *const u32, 7 | b_gptq_qzeros: *const u32, 8 | b_gptq_scales: *const f16, 9 | b_q_perm: *const i32, 10 | out: *mut f16, 11 | size_k: i32, 12 | size_n: i32, 13 | groups: i32, 14 | bit: i32, 15 | ); 16 | 17 | pub(crate) fn reconstruct_gptq( 18 | b_q_weight: *const u32, 19 | b_gptq_qzeros: *const u32, 20 | b_gptq_scales: *const f16, 21 | b_q_perm: *const i32, 22 | out: *mut f16, 23 | size_k: i32, 24 | size_n: i32, 25 | groups: i32, 26 | bit: i32, 27 | ); 28 | 29 | pub(crate) fn gemm_half_q_half_cuda_part( 30 | a: *const f16, 31 | b_q_weight: *const u32, 32 | b_gptq_qzeros: *const u32, 33 | b_gptq_scales: *const f16, 34 | b_q_perm: *const i32, 35 | out: *mut f16, 36 | m: i32, 37 | n: i32, 38 | k: i32, 39 | m_count: i32, 40 | groups: i32, 41 | bit: i32, 42 | ); 43 | 44 | pub(crate) fn gemm_half_q_half_alt( 45 | a: *const f16, 46 | b_q_weight: *const u32, 47 | b_gptq_qzeros: *const u32, 48 | b_gptq_scales: *const f16, 49 | b_g_idx: *const i32, 50 | out: *mut f16, 51 | m: i32, 52 | n: i32, 53 | k: i32, 54 | bit: i32, 55 | ); 
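// Hedged reading of the signatures above: these mirror the exllama/GPTQ CUDA kernels —
// `b_q_weight` is the bit-packed quantized weight, `b_gptq_qzeros`/`b_gptq_scales` hold the
// per-group zero points and scales, `b_q_perm`/`b_g_idx` carry the optional act-order
// permutation/group index, `m`/`n`/`k` are the usual GEMM dimensions, and `bit` is the
// quantization width. All pointers are raw device pointers, so callers must uphold the usual
// unsafe-FFI invariants (valid device allocations, matching shapes and dtypes).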
56 | } 57 | -------------------------------------------------------------------------------- /examples/server/regex.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | 3 | client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/") 4 | 5 | BULLET_LIST_REGEX = "(- [^\n]*\n)+(- [^\n]*)(\n\n)?" 6 | 7 | completion = client.chat.completions.create( 8 | model="mistral", 9 | messages=[ 10 | { 11 | "role": "user", 12 | "content": "Write a list of jokes. Return a markdown list where each item is a joke.", 13 | } 14 | ], 15 | max_tokens=256, 16 | frequency_penalty=1.0, 17 | top_p=0.1, 18 | temperature=0, 19 | extra_body={"grammar": {"type": "regex", "value": BULLET_LIST_REGEX}}, 20 | ) 21 | 22 | print(completion.choices[0].message.content) 23 | 24 | print("---") 25 | 26 | # The following does token healing. Prompting the model to continue after a space usually breaks 27 | # the text because the model wants to start the new token with a space. By setting the a space after 28 | # "Sure!" we guarantee a space after "Sure!" but we haven't forced which token that starts with space should be used yet. 29 | 30 | completion = client.chat.completions.create( 31 | model="mistral", 32 | messages=[ 33 | { 34 | "role": "user", 35 | "content": "Tell me a joke.", 36 | } 37 | ], 38 | max_tokens=256, 39 | frequency_penalty=1.0, 40 | top_p=0.1, 41 | temperature=0, 42 | extra_body={"grammar": {"type": "regex", "value": "Sure! (?s:.)*"}}, 43 | ) 44 | 45 | print(completion.choices[0].message.content) 46 | -------------------------------------------------------------------------------- /examples/amoe.json: -------------------------------------------------------------------------------- 1 | { 2 | "rows": [ 3 | { 4 | "prompt": "Discuss the impact of Renaissance art on modern aesthetics", 5 | "expert": 0 6 | }, 7 | { 8 | "prompt": "Explain the significance of the theory of relativity in modern physics", 9 | "expert": 1 10 | }, 11 | { 12 | "prompt": "Analyze the themes of existentialism in 20th-century literature", 13 | "expert": 0 14 | }, 15 | { 16 | "prompt": "Describe the process of photosynthesis and its importance to ecosystems", 17 | "expert": 1 18 | }, 19 | { 20 | "prompt": "Evaluate the role of classical music in contemporary film scores", 21 | "expert": 0 22 | }, 23 | { 24 | "prompt": "Outline the steps of the scientific method and their importance in experiments", 25 | "expert": 1 26 | }, 27 | { 28 | "prompt": "Compare and contrast the philosophies of Socrates and Nietzsche", 29 | "expert": 0 30 | }, 31 | { 32 | "prompt": "Discuss the ethical implications of artificial intelligence in society", 33 | "expert": 1 34 | }, 35 | { 36 | "prompt": "Interpret the symbolism in Salvador Dalí's paintings", 37 | "expert": 0 38 | }, 39 | { 40 | "prompt": "Describe the function and structure of DNA in genetic inheritance", 41 | "expert": 1 42 | } 43 | ] 44 | } 45 | -------------------------------------------------------------------------------- /mistralrs-core/src/utils/tokens.rs: -------------------------------------------------------------------------------- 1 | use std::{env, fs}; 2 | use thiserror::Error; 3 | 4 | use anyhow::Result; 5 | use tracing::info; 6 | 7 | use crate::pipeline::TokenSource; 8 | 9 | #[derive(Error, Debug)] 10 | enum TokenRetrievalError { 11 | #[error("No home directory.")] 12 | HomeDirectoryMissing, 13 | } 14 | 15 | /// This reads a token from a specified source. 
If the token cannot be read, a warning is logged with `tracing` 16 | /// and *no token is used*. 17 | pub(crate) fn get_token(source: &TokenSource) -> Result> { 18 | fn skip_token(input: &str) -> Option { 19 | info!("Could not load token at {input:?}, using no HF token."); 20 | None 21 | } 22 | 23 | let token = match source { 24 | TokenSource::Literal(data) => Some(data.clone()), 25 | TokenSource::EnvVar(envvar) => env::var(envvar).ok().or_else(|| skip_token(envvar)), 26 | TokenSource::Path(path) => fs::read_to_string(path).ok().or_else(|| skip_token(path)), 27 | TokenSource::CacheToken => { 28 | let home = format!( 29 | "{}/.cache/huggingface/token", 30 | dirs::home_dir() 31 | .ok_or(TokenRetrievalError::HomeDirectoryMissing)? 32 | .display() 33 | ); 34 | 35 | fs::read_to_string(home.clone()) 36 | .ok() 37 | .or_else(|| skip_token(&home)) 38 | } 39 | TokenSource::None => None, 40 | }; 41 | 42 | Ok(token.map(|s| s.trim().to_string())) 43 | } 44 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: docs 2 | #https://dev.to/deciduously/prepare-your-rust-api-docs-for-github-pages-2n5i 3 | on: 4 | push: 5 | branches: ["master"] 6 | 7 | workflow_dispatch: 8 | 9 | permissions: 10 | contents: write 11 | pages: write 12 | id-token: write 13 | 14 | concurrency: 15 | group: "pages" 16 | cancel-in-progress: false 17 | 18 | jobs: 19 | deploy: 20 | runs-on: ubuntu-latest 21 | strategy: 22 | matrix: 23 | rust: [stable] 24 | steps: 25 | - name: Checkout 26 | uses: actions/checkout@v4 27 | - uses: actions-rs/toolchain@v1 28 | with: 29 | profile: minimal 30 | toolchain: ${{ matrix.rust }} 31 | override: true 32 | - name: Setup Pages 33 | uses: actions/configure-pages@v5 34 | - uses: actions-rs/cargo@v1 35 | with: 36 | command: doc 37 | args: --no-deps 38 | - name: Build docs 39 | run: | 40 | rm -rf ./docs 41 | echo "" > target/doc/index.html 42 | cp -r target/doc ./docs 43 | - name: Build Python docs 44 | run: | 45 | python3 -m venv myenv 46 | source myenv/bin/activate 47 | pip install maturin[patchelf] pdoc 48 | cd mistralrs-pyo3 49 | maturin develop 50 | cd .. 51 | pdoc mistralrs -o ./docs/pyo3 52 | - name: Deploy 53 | uses: JamesIves/github-pages-deploy-action@v4 54 | with: 55 | folder: ./docs -------------------------------------------------------------------------------- /chat_templates/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\\n\\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don\\'t know the answer to a question, please don\\'t share false information.' 
%}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<>\\n' + content.strip() + '\\n<>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}" 3 | } -------------------------------------------------------------------------------- /mistralrs-server/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "mistralrs-server" 3 | readme = "README.md" 4 | authors = ["Eric Buehler"] 5 | version.workspace = true 6 | edition.workspace = true 7 | description.workspace = true 8 | repository.workspace = true 9 | keywords.workspace = true 10 | categories.workspace = true 11 | license.workspace = true 12 | homepage.workspace = true 13 | default-run = "mistralrs-server" 14 | 15 | [dependencies] 16 | anyhow.workspace = true 17 | ctrlc = "3.4.4" 18 | candle-core.workspace = true 19 | serde.workspace = true 20 | serde_json.workspace = true 21 | axum = { version = "0.7.4", features = ["tokio"] } 22 | tower-http = { version = "0.5.1", features = ["cors"]} 23 | utoipa = { version = "4.2", features = ["axum_extras"] } 24 | utoipa-swagger-ui = { version = "7.1.0", features = ["axum"]} 25 | mistralrs-core = { version = "0.3.1", path = "../mistralrs-core" } 26 | indexmap.workspace = true 27 | accelerate-src = { workspace = true, optional = true } 28 | intel-mkl-src = { workspace = true, optional = true } 29 | futures.workspace = true 30 | tracing.workspace = true 31 | tokio.workspace = true 32 | either.workspace = true 33 | clap.workspace = true 34 | once_cell.workspace=true 35 | reqwest.workspace = true 36 | image.workspace = true 37 | url.workspace = true 38 | data-url.workspace = true 39 | 40 | [features] 41 | cuda = ["mistralrs-core/cuda"] 42 | cudnn = ["mistralrs-core/cudnn"] 43 | metal = ["mistralrs-core/metal"] 44 | flash-attn = ["cuda", "mistralrs-core/flash-attn"] 45 | accelerate = ["mistralrs-core/accelerate"] 46 | mkl = ["mistralrs-core/mkl"] 47 | -------------------------------------------------------------------------------- /docs/NON_GRANULAR.md: -------------------------------------------------------------------------------- 1 | # X-LoRA non-granular scalings 2 | 3 | A key limitation of the X-LoRA architecture is the need for 2 forward passes of the model per generation step. To trade off model performance for speed, mistral.rs allows the user to reduce the granularity of the scalings by caching them in a technique we call Non Granular Scalings. 4 | 5 | ## How it works 6 | For the first $k$ generation steps, the scalings are calculated normally for each token. However, for the rest of the tokens, it is cached and re-used. In this way, we are able to avoid the second forward pass and the performance is increased significantly. To maintain correctness, enabling non-granular scalings will restrict the engine to processing one sequence at a time. 
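The caching rule itself is small. A minimal sketch, assuming a single cached scalings value per sequence (the type and names below are illustrative, not the actual mistral.rs internals):

```rust
/// Return the X-LoRA scalings for one generation step, re-using the cached value
/// once `step >= k` (the `--tgt-non-granular-index`).
fn scalings_for_step<T: Clone>(
    step: usize,
    k: usize,
    cache: &mut Option<T>,
    compute_scalings: impl FnOnce() -> T, // the extra "scalings" forward pass
) -> T {
    match cache {
        // After the first k steps, skip the second forward pass entirely.
        Some(cached) if step >= k => cached.clone(),
        _ => {
            let scalings = compute_scalings(); // normal two-pass behavior
            *cache = Some(scalings.clone());
            scalings
        }
    }
}
```

Because the cached value is tied to a single sequence, this is also why the engine is restricted to one sequence at a time when the option is enabled.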
7 | 8 | ## How to use it 9 | ### Command line 10 | This can be enabled by passing `--tgt-non-granular-index` followed by $k$: 11 | ``` 12 | ./mistralrs-server --port 1234 x-lora-plain -o orderings/xlora-paper-ordering.json -x lamm-mit/x-lora --tgt-non-granular-index 5 13 | ``` 14 | 15 | ### Python 16 | Set the `tgt_non_granular_index` attribute to a non-`None` value in the `Which` selection: 17 | ```py 18 | from mistralrs import Runner, Which 19 | 20 | runner = Runner( 21 | which=Which.XLoraGGUF( 22 | tok_model_id=None, # Automatically determine from ordering file 23 | quantized_model_id="TheBloke/zephyr-7B-beta-GGUF", 24 | quantized_filename="zephyr-7b-beta.Q4_0.gguf", 25 | xlora_model_id="lamm-mit/x-lora", 26 | order="orderings/xlora-paper-ordering.json", 27 | tgt_non_granular_index=5, 28 | ) 29 | ) 30 | 31 | ... 32 | ``` -------------------------------------------------------------------------------- /mistralrs-quant/kernels/gptq/qdq_util.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | Copied from https://github.com/turboderp/exllamav2 3 | */ 4 | 5 | #ifndef _qdq_util_cuh 6 | #define _qdq_util_cuh 7 | 8 | union half2_uint32 { 9 | uint32_t as_uint32; 10 | half2 as_half2; 11 | __device__ half2_uint32(uint32_t val) : as_uint32(val) {} 12 | __device__ half2_uint32(half2 val) : as_half2(val) {} 13 | }; 14 | 15 | union half_uint16 { 16 | uint16_t as_uint16; 17 | half as_half; 18 | __device__ half_uint16(uint16_t val) : as_uint16(val) {} 19 | __device__ half_uint16(half val) : as_half(val) {} 20 | }; 21 | 22 | // Max_scale premultiplied by 1/256 23 | 24 | __forceinline__ __device__ half dq_scale(const int qs, const half max_scale) { 25 | int qs_i = qs + 1; 26 | half qs_h = __int2half_rn(qs_i * qs_i); 27 | qs_h = __hmul(qs_h, max_scale); 28 | return qs_h; 29 | } 30 | 31 | __forceinline__ __device__ half dq(const int q, const int qzero, 32 | const half scale) { 33 | return __hmul(__int2half_rn(q - qzero), scale); 34 | } 35 | 36 | __forceinline__ __device__ half dq_ns(const int q, const int qzero) { 37 | // return __hsub(__int2half_rn(q), __int2half_rn(qzero)); 38 | return __int2half_rn(q - qzero); 39 | } 40 | 41 | __forceinline__ __device__ int exb(const uint32_t q, const int shift, 42 | const int mask) { 43 | return (int)((q >> shift) & mask); 44 | } 45 | 46 | __forceinline__ __device__ int exb(const uint32_t q1, const uint32_t q0, 47 | const int shift, const int mask) { 48 | return (int)(__funnelshift_rc(q0, q1, shift) & mask); 49 | } 50 | 51 | #endif 52 | -------------------------------------------------------------------------------- /scripts/create_ordering.py: -------------------------------------------------------------------------------- 1 | from peft.tuners import lora 2 | from transformers import AutoModelForCausalLM # type: ignore 3 | import json 4 | from peft.tuners.lora.config import LoraConfig 5 | 6 | model_id = input("Enter the base model ID: ") 7 | target_modules_in = input("Enter the target modules as a comma delimited list: ") 8 | target_modules = target_modules_in.split(",") 9 | target_modules = [x for x in target_modules if len(x) > 0] 10 | target_modules = [x.strip() for x in target_modules] 11 | 12 | model = AutoModelForCausalLM.from_pretrained(model_id) 13 | lora_config = LoraConfig(target_modules=target_modules, init_lora_weights=False) 14 | 15 | model.add_adapter(lora_config, "default") 16 | 17 | total_swapped = 0 18 | loras = {} 19 | for n, module in model.named_modules(): 20 | if isinstance(module, lora.Linear): 21 | 
loras[n.split("lora_A.")[0]] = total_swapped 22 | total_swapped += 1 23 | elif isinstance(module, lora.Embedding): 24 | loras[n.split("lora_embedding_A.")[0]] = total_swapped 25 | total_swapped += 1 26 | elif isinstance(module, lora.Conv2d): 27 | loras[n.split("lora_A.")[0]] = total_swapped 28 | total_swapped += 1 29 | 30 | adapters_in = input( 31 | "Enter a comma delimited list of adapter names as they were specified when training: " 32 | ) 33 | adapters = adapters_in.split(",") 34 | adapters = [x for x in adapters if len(x) > 0] 35 | adapters = [x.strip() for x in adapters] 36 | 37 | out = {"order": adapters, "layers": loras, "base_model_id": model_id} 38 | 39 | outfile = input("Enter output file: ") 40 | with open(outfile, "w") as f: 41 | f.write(json.dumps(out)) 42 | -------------------------------------------------------------------------------- /mistralrs/examples/isq/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use mistralrs::{ 3 | IsqType, PagedAttentionMetaBuilder, TextMessageRole, TextMessages, TextModelBuilder, 4 | }; 5 | 6 | #[tokio::main] 7 | async fn main() -> Result<()> { 8 | let model = TextModelBuilder::new("microsoft/Phi-3.5-mini-instruct") 9 | .with_isq(IsqType::Q8_0) 10 | .with_logging() 11 | .with_paged_attn(|| PagedAttentionMetaBuilder::default().build())? 12 | .build() 13 | .await?; 14 | 15 | let messages = TextMessages::new() 16 | .add_message( 17 | TextMessageRole::System, 18 | "You are an AI agent with a specialty in programming.", 19 | ) 20 | .add_message( 21 | TextMessageRole::User, 22 | "Hello! How are you? Please write generic binary search function in Rust.", 23 | ); 24 | 25 | let response = model.send_chat_request(messages).await?; 26 | 27 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 28 | dbg!( 29 | response.usage.avg_prompt_tok_per_sec, 30 | response.usage.avg_compl_tok_per_sec 31 | ); 32 | 33 | // Next example: re-ISQ the model at runtime 34 | model.re_isq_model(IsqType::HQQ4).await?; 35 | 36 | let messages = TextMessages::new().add_message(TextMessageRole::User, "Why is the sky blue?"); 37 | 38 | let response = model.send_chat_request(messages).await?; 39 | 40 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 41 | dbg!( 42 | response.usage.avg_prompt_tok_per_sec, 43 | response.usage.avg_compl_tok_per_sec 44 | ); 45 | 46 | Ok(()) 47 | } 48 | -------------------------------------------------------------------------------- /mistralrs-bench/README.md: -------------------------------------------------------------------------------- 1 | # `mistralrs-bench` 2 | 3 | This is our official benchmarking application, which allows you to collect structured information about the speed of `mistral.rs`. 4 | 5 | > [!NOTE] 6 | > You should replace `--features ...` with one of the features specified [here](../README.md#supported-accelerators), or remove it for pure CPU inference. 7 | 8 | To run: `cargo run --release --features ... --package mistralrs-bench` 9 | 10 | ```bash 11 | Fast and easy LLM serving. 
12 | 13 | Usage: mistralrs-bench [OPTIONS] 14 | 15 | Commands: 16 | plain Select a plain model 17 | x-lora Select an X-LoRA architecture 18 | lora Select a LoRA architecture 19 | gguf Select a GGUF model 20 | x-lora-gguf Select a GGUF model with X-LoRA 21 | lora-gguf Select a GGUF model with LoRA 22 | ggml Select a GGML model 23 | x-lora-ggml Select a GGML model with X-LoRA 24 | lora-ggml Select a GGML model with LoRA 25 | help Print this message or the help of the given subcommand(s) 26 | 27 | Options: 28 | -p, --n-prompt 29 | Number of prompt tokens to run [default: 512] 30 | -g, --n-gen 31 | Number of generations tokens to run [default: 128] 32 | -c, --concurrency 33 | Number of concurrent requests to run. Default is 1 34 | -r, --repetitions 35 | Number of times to repeat each test [default: 5] 36 | -n, --num-device-layers 37 | Number of device layers to load and run on the device. All others will be on the CPU 38 | -h, --help 39 | Print help 40 | -V, --version 41 | Print version 42 | ``` -------------------------------------------------------------------------------- /examples/server/completion.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | import httpx 3 | import textwrap 4 | import json 5 | 6 | 7 | def log_response(response: httpx.Response): 8 | request = response.request 9 | print(f"Request: {request.method} {request.url}") 10 | print(" Headers:") 11 | for key, value in request.headers.items(): 12 | if key.lower() == "authorization": 13 | value = "[...]" 14 | if key.lower() == "cookie": 15 | value = value.split("=")[0] + "=..." 16 | print(f" {key}: {value}") 17 | print(" Body:") 18 | try: 19 | request_body = json.loads(request.content) 20 | print(textwrap.indent(json.dumps(request_body, indent=2), " ")) 21 | except json.JSONDecodeError: 22 | print(textwrap.indent(request.content.decode(), " ")) 23 | print(f"Response: status_code={response.status_code}") 24 | print(" Headers:") 25 | for key, value in response.headers.items(): 26 | if key.lower() == "set-cookie": 27 | value = value.split("=")[0] + "=..." 
28 | print(f" {key}: {value}") 29 | 30 | 31 | client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/") 32 | 33 | # Enable this to log requests and responses 34 | # client._client = httpx.Client( 35 | # event_hooks={"request": [print], "response": [log_response]} 36 | # ) 37 | 38 | while True: 39 | prompt = input(">>> ") 40 | completion = client.completions.create( 41 | model="mistral", 42 | prompt=prompt, 43 | max_tokens=256, 44 | frequency_penalty=1.0, 45 | top_p=0.1, 46 | temperature=0, 47 | ) 48 | resp = completion.choices[0].text 49 | print(resp) 50 | -------------------------------------------------------------------------------- /mistralrs-core/src/vision_models/image_processor.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)] 2 | 3 | use candle_core::{Device, Result, Tensor}; 4 | use image::DynamicImage; 5 | 6 | use crate::pipeline::InputsProcessor; 7 | 8 | use super::preprocessor_config::PreProcessorConfig; 9 | 10 | #[allow(dead_code)] 11 | pub(crate) struct PreprocessedImages { 12 | /// Without batch size, safe to unsqueeze & concat in dim0 13 | pub(crate) pixel_values: Tensor, 14 | /// Without batch size, safe to unsqueeze & concat in dim0 15 | pub(crate) pixel_attention_mask: Option, 16 | pub(crate) image_sizes: Option<(usize, usize)>, 17 | pub(crate) num_img_tokens: Option>, 18 | /// Without batch size, safe to unsqueeze & concat in dim0 19 | pub(crate) aspect_ratio_ids: Option, 20 | /// Without batch size, safe to unsqueeze & concat in dim0 21 | pub(crate) aspect_ratio_mask: Option, 22 | /// Without batch size 23 | pub(crate) num_tiles: Option>, 24 | } 25 | 26 | /// ImagePreProcessor: process images for the model (similar to `InputsProcessor`, typically called by it) 27 | pub trait ImagePreProcessor: InputsProcessor { 28 | const DEFAULT_MEAN: [f64; 3]; 29 | const DEFAULT_STD: [f64; 3]; 30 | 31 | /// Preprocess the images for a specific batch. 32 | /// `(bs, max_num_images)`, max_num_images is the max images per batches. 33 | #[allow(clippy::too_many_arguments)] 34 | fn preprocess( 35 | &self, 36 | images: Vec, 37 | config: &PreProcessorConfig, 38 | device: &Device, 39 | batch_info: (usize, usize), 40 | ) -> Result; 41 | } 42 | -------------------------------------------------------------------------------- /mistralrs-core/src/amoe/inputs.rs: -------------------------------------------------------------------------------- 1 | use std::{fs::File, path::Path}; 2 | 3 | use csv::Reader; 4 | use serde::Deserialize; 5 | 6 | pub struct AnyMoeTrainingResult { 7 | pub steps: usize, 8 | /// One for each gating layer 9 | pub final_loss: Vec, 10 | } 11 | 12 | #[derive(Deserialize, Debug)] 13 | pub struct AnyMoeTrainingInputRow { 14 | pub prompt: String, 15 | pub expert: usize, 16 | pub image_urls: Option>, 17 | } 18 | 19 | #[derive(Deserialize, Debug)] 20 | pub struct AnyMoeTrainingInputs { 21 | rows: Vec, 22 | } 23 | 24 | impl AnyMoeTrainingInputs { 25 | /// From a CSV file with the mandatory columns `prompt` (String), `expert` (usize), and the optional 26 | /// column `image_urls` (`Vec`). 
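///
/// A hedged illustration of the expected layout (rows borrowed from `examples/amoe.json`;
/// the optional `image_urls` column is omitted here, and exact quoting/column requirements
/// follow whatever the `csv` crate accepts):
///
/// ```text
/// prompt,expert
/// "Discuss the impact of Renaissance art on modern aesthetics",0
/// "Explain the significance of the theory of relativity in modern physics",1
/// ```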
27 | pub fn from_csv>(file: P) -> anyhow::Result { 28 | let file = File::open(file)?; 29 | let mut reader = Reader::from_reader(file); 30 | let mut rows = Vec::new(); 31 | for result in reader.deserialize() { 32 | let row: AnyMoeTrainingInputRow = result?; 33 | rows.push(row); 34 | } 35 | Ok(Self { rows }) 36 | } 37 | 38 | /// From a JSON file with the top-level key being `rows` (array), which contains objects with the 39 | /// keys `prompt` (String), `expert` (usize), `image_urls` (Option>). 40 | pub fn from_json>(file: P) -> anyhow::Result { 41 | let file = File::open(file)?; 42 | Ok(serde_json::from_reader(file)?) 43 | } 44 | 45 | pub fn len(&self) -> usize { 46 | self.rows.len() 47 | } 48 | 49 | pub fn into_inner(self) -> Vec { 50 | self.rows 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /docs/GEMMA2.md: -------------------------------------------------------------------------------- 1 | # Gemma 2 Model 2 | 3 | **[See the Gemma 2 model Collection](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)** 4 | 5 | The Gemma 2 models are a family of text-to-text decoder-only LLMs. As such, the methods to use them are the same as with all other text-to-text LLMs supported by mistral.rs. 6 | 7 | ## HTTP API 8 | 9 | ```py 10 | import openai 11 | 12 | messages = [] 13 | prompt = input("Enter system prompt >>> ") 14 | if len(prompt) > 0: 15 | messages.append({"role": "system", "content": prompt}) 16 | 17 | 18 | while True: 19 | prompt = input(">>> ") 20 | messages.append({"role": "user", "content": prompt}) 21 | completion = client.chat.completions.create( 22 | model="gemma2", 23 | messages=messages, 24 | max_tokens=256, 25 | frequency_penalty=1.0, 26 | top_p=0.1, 27 | temperature=0, 28 | ) 29 | resp = completion.choices[0].message.content 30 | print(resp) 31 | messages.append({"role": "assistant", "content": resp}) 32 | ``` 33 | 34 | ## Python API 35 | ```py 36 | from mistralrs import Runner, Which, ChatCompletionRequest, Architecture 37 | 38 | runner = Runner( 39 | which=Which.Plain( 40 | model_id="google/gemma-2-9b-it", 41 | arch=Architecture.Gemma2, 42 | ), 43 | ) 44 | 45 | res = runner.send_chat_completion_request( 46 | ChatCompletionRequest( 47 | model="mistral", 48 | messages=[ 49 | {"role": "user", "content": "Tell me a story about the Rust type system."} 50 | ], 51 | max_tokens=256, 52 | presence_penalty=1.0, 53 | top_p=0.1, 54 | temperature=0.1, 55 | ) 56 | ) 57 | print(res.choices[0].message.content) 58 | print(res.usage) 59 | ``` -------------------------------------------------------------------------------- /mistralrs/examples/custom_logits_processor/main.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use anyhow::Result; 4 | use mistralrs::{ 5 | CustomLogitsProcessor, IsqType, PagedAttentionMetaBuilder, RequestBuilder, Tensor, 6 | TextMessageRole, TextModelBuilder, 7 | }; 8 | use rand::Rng; 9 | 10 | struct ThresholdLogitsProcessor { 11 | threshold: f64, 12 | } 13 | 14 | impl CustomLogitsProcessor for ThresholdLogitsProcessor { 15 | fn apply(&self, logits: &Tensor, _context: &[u32]) -> mistralrs::Result { 16 | // Mask is 1 for true, 0 for false. 17 | let mask = logits.ge(self.threshold)?; 18 | logits.broadcast_mul(&mask.to_dtype(logits.dtype())?) 
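// `ge` yields a 1/0 mask, so the broadcast multiply zeroes out every logit strictly
// below `self.threshold`; note this acts on raw logits (pre-softmax), not probabilities.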
19 | } 20 | } 21 | 22 | #[tokio::main] 23 | async fn main() -> Result<()> { 24 | let model = TextModelBuilder::new("microsoft/Phi-3.5-mini-instruct") 25 | .with_isq(IsqType::Q4K) 26 | .with_logging() 27 | .with_paged_attn(|| PagedAttentionMetaBuilder::default().build())? 28 | .build() 29 | .await?; 30 | 31 | let mut rng = rand::thread_rng(); 32 | let random_value: f64 = rng.gen_range(0.0..=1.0); 33 | let threshold: f64 = rng.gen_range(0.0..=0.5); 34 | 35 | let request = RequestBuilder::new() 36 | .add_logits_processor(Arc::new(move |logits: &Tensor, _context: &[u32]| { 37 | logits * random_value 38 | })) 39 | .add_logits_processor(Arc::new(ThresholdLogitsProcessor { threshold })) 40 | .add_message( 41 | TextMessageRole::User, 42 | "Please write a mathematical equation where a few numbers are added.", 43 | ); 44 | 45 | let response = model.send_chat_request(request).await?; 46 | 47 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 48 | 49 | Ok(()) 50 | } 51 | -------------------------------------------------------------------------------- /mistralrs-core/src/xlora_models/config.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | 3 | use either::Either; 4 | use serde::Deserialize; 5 | 6 | fn true_default() -> bool { 7 | true 8 | } 9 | 10 | fn false_default() -> bool { 11 | false 12 | } 13 | fn default_1() -> usize { 14 | 1 15 | } 16 | 17 | fn default_2048() -> usize { 18 | 2048 19 | } 20 | fn default_dropout() -> f32 { 21 | 0.2 22 | } 23 | fn default_1f64() -> f64 { 24 | 1.0 25 | } 26 | fn default_0f64() -> f64 { 27 | 0.0 28 | } 29 | 30 | #[derive(Clone, Debug, Deserialize)] 31 | pub struct XLoraConfig { 32 | pub hidden_size: usize, 33 | pub base_model_id: String, 34 | #[serde(rename = "adapters")] 35 | #[serde(with = "either::serde_untagged")] 36 | pub _adapters: Either<Vec<String>, HashMap<String, String>>, 37 | #[serde(default = "false_default")] 38 | pub layerwise_scalings: bool, 39 | #[serde(default = "false_default")] 40 | pub enable_relu_and_dropout: bool, 41 | #[serde(default = "default_1")] 42 | pub xlora_depth: usize, 43 | #[serde(default = "default_2048")] 44 | pub xlora_size: usize, 45 | #[serde(default = "default_dropout")] 46 | pub xlora_dropout_p: f32, 47 | #[serde(default = "true_default")] 48 | pub enable_softmax: bool, 49 | #[serde(default = "default_1f64")] 50 | pub softmax_temperature: f64, 51 | #[serde(default = "default_0f64")] 52 | pub scaling_pass_value: f64, 53 | #[serde(default = "false_default", rename = "use_trainable_adapters")] 54 | pub _use_trainable_adapters: bool, 55 | #[serde(default = "true_default")] 56 | pub use_bias: bool, 57 | #[serde(default = "default_1f64")] 58 | pub global_scaling_weight: f64, 59 | pub top_k_lora: Option<usize>, 60 | #[serde(default = "false_default")] 61 | pub enable_softmax_topk: bool, 62 | } 63 | -------------------------------------------------------------------------------- /mistralrs/examples/simple/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use mistralrs::{ 3 | IsqType, PagedAttentionMetaBuilder, RequestBuilder, TextMessageRole, TextMessages, 4 | TextModelBuilder, 5 | }; 6 | 7 | #[tokio::main] 8 | async fn main() -> Result<()> { 9 | let model = TextModelBuilder::new("microsoft/Phi-3.5-mini-instruct") 10 | .with_isq(IsqType::Q8_0) 11 | .with_logging() 12 | .with_paged_attn(|| PagedAttentionMetaBuilder::default().build())?
13 | .build() 14 | .await?; 15 | 16 | let messages = TextMessages::new() 17 | .add_message( 18 | TextMessageRole::System, 19 | "You are an AI agent with a specialty in programming.", 20 | ) 21 | .add_message( 22 | TextMessageRole::User, 23 | "Hello! How are you? Please write generic binary search function in Rust.", 24 | ); 25 | 26 | let response = model.send_chat_request(messages).await?; 27 | 28 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 29 | dbg!( 30 | response.usage.avg_prompt_tok_per_sec, 31 | response.usage.avg_compl_tok_per_sec 32 | ); 33 | 34 | // Next example: Return some logprobs with the `RequestBuilder`, which enables higher configurability. 35 | let request = RequestBuilder::new().return_logprobs(true).add_message( 36 | TextMessageRole::User, 37 | "Please write a mathematical equation where a few numbers are added.", 38 | ); 39 | 40 | let response = model.send_chat_request(request).await?; 41 | 42 | println!( 43 | "Logprobs: {:?}", 44 | &response.choices[0] 45 | .logprobs 46 | .as_ref() 47 | .unwrap() 48 | .content 49 | .as_ref() 50 | .unwrap()[0..3] 51 | ); 52 | 53 | Ok(()) 54 | } 55 | -------------------------------------------------------------------------------- /mistralrs-quant/kernels/gptq/compat.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | Copied from https://github.com/turboderp/exllamav2 3 | */ 4 | 5 | #ifndef _compat_cuh 6 | #define _compat_cuh 7 | 8 | // atomicAdd for half types, to support CC < 7.x 9 | 10 | __device__ __forceinline__ void atomicAdd_half(half* address, half val) { 11 | unsigned int* address_as_ui = 12 | (unsigned int*)((char*)address - ((size_t)address & 2)); 13 | unsigned int old = *address_as_ui; 14 | unsigned int assumed; 15 | 16 | do { 17 | assumed = old; 18 | __half_raw hsum; 19 | hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); 20 | half tmpres = __hadd(hsum, val); 21 | hsum = __half_raw(tmpres); 22 | old = (size_t)address & 2 ? 
(old & 0xffff) | (hsum.x << 16) 23 | : (old & 0xffff0000) | hsum.x; 24 | old = atomicCAS(address_as_ui, assumed, old); 25 | } while (assumed != old); 26 | } 27 | 28 | // atomicAdd for half2 types 29 | 30 | __device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val) { 31 | unsigned int* address_as_ui = (unsigned int*)address; 32 | unsigned int old = *address_as_ui; 33 | unsigned int assumed; 34 | do { 35 | assumed = old; 36 | half2 old_val = *((half2*)&old); 37 | half2 new_val = __hadd2(old_val, val); 38 | old = atomicCAS(address_as_ui, assumed, *((unsigned int*)&new_val)); 39 | } while (assumed != old); 40 | } 41 | 42 | // 43 | 44 | #if defined(__CUDA_ARCH__) || defined(USE_ROCM) 45 | #if __CUDA_ARCH__ < 700 || defined(USE_ROCM) 46 | 47 | __device__ __forceinline__ void atomicAdd(half* address, half val) { 48 | atomicAdd_half(address, val); 49 | } 50 | 51 | #if __CUDA_ARCH__ < 600 || defined(USE_ROCM) 52 | __device__ __forceinline__ void atomicAdd(half2* address, half2 val) { 53 | atomicAdd_half2(address, val); 54 | } 55 | #endif 56 | 57 | #endif 58 | #endif 59 | 60 | #endif 61 | -------------------------------------------------------------------------------- /chat_templates/vicuna.json: -------------------------------------------------------------------------------- 1 | { 2 | "add_bos_token": true, 3 | "add_eos_token": false, 4 | "bos_token": { 5 | "__type": "AddedToken", 6 | "content": "", 7 | "lstrip": false, 8 | "normalized": false, 9 | "rstrip": false, 10 | "single_word": false 11 | }, 12 | "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = 'A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user\\'s questions.' 
%}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 %}{{ system_message }}{% endif %}{% if message['role'] == 'user' %}{{ ' USER: ' + message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ ' ASSISTANT: ' + message['content'].strip() + eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ ' ASSISTANT:' }}{% endif %}", 13 | "clean_up_tokenization_spaces": false, 14 | "eos_token": { 15 | "__type": "AddedToken", 16 | "content": "", 17 | "lstrip": false, 18 | "normalized": false, 19 | "rstrip": false, 20 | "single_word": false 21 | }, 22 | "legacy": false, 23 | "model_max_length": 4096, 24 | "pad_token": null, 25 | "padding_side": "right", 26 | "sp_model_kwargs": {}, 27 | "tokenizer_class": "LlamaTokenizer", 28 | "unk_token": { 29 | "__type": "AddedToken", 30 | "content": "", 31 | "lstrip": false, 32 | "normalized": false, 33 | "rstrip": false, 34 | "single_word": false 35 | } 36 | } -------------------------------------------------------------------------------- /mistralrs-core/src/utils/tokenizer.rs: -------------------------------------------------------------------------------- 1 | use std::{collections::HashMap, path::Path}; 2 | 3 | use anyhow::Result; 4 | use serde::Deserialize; 5 | use serde_json::Value; 6 | use tokenizers::{tokenizer, Tokenizer}; 7 | 8 | #[derive(Deserialize)] 9 | struct AddedToken { 10 | id: usize, 11 | content: String, 12 | } 13 | 14 | /// May fix the tokenizer according to: https://gist.github.com/jneuff/682d47b786329f19291d166957b3274a 15 | pub(crate) fn get_tokenizer + Clone>( 16 | p: P, 17 | processor_added_tokens: Option<&[&str]>, 18 | ) -> Result { 19 | let mut tokenizer = { 20 | let raw = std::fs::read(p.clone()).map_err(anyhow::Error::msg)?; 21 | let mut tokenizer: Value = serde_json::from_slice(&raw).unwrap(); 22 | let added_tokens: Vec = 23 | serde_json::from_value(tokenizer["added_tokens"].clone()).unwrap(); 24 | let vocab: HashMap = 25 | serde_json::from_value(tokenizer["model"]["vocab"].clone()).unwrap(); 26 | for token in added_tokens { 27 | if !vocab.contains_key(&token.content) { 28 | tokenizer["model"]["vocab"] 29 | .as_object_mut() 30 | .unwrap() 31 | .insert(token.content, token.id.into()) 32 | .ok_or(()) 33 | .unwrap_err(); 34 | } 35 | } 36 | let raw_fixed = serde_json::to_vec_pretty(&tokenizer).unwrap(); 37 | Tokenizer::from_bytes(&raw_fixed).map_err(anyhow::Error::msg)? 38 | }; 39 | if let Some(added_tokens) = processor_added_tokens { 40 | tokenizer.add_special_tokens( 41 | &added_tokens 42 | .iter() 43 | .map(|x| tokenizer::AddedToken::from(x.to_string(), true)) 44 | .collect::>(), 45 | ); 46 | } 47 | Ok(tokenizer) 48 | } 49 | -------------------------------------------------------------------------------- /mistralrs-vision/src/ops.rs: -------------------------------------------------------------------------------- 1 | use candle_core::{Result, Tensor}; 2 | 3 | /// Pad an image of shape (c, h, w) to (c, max_h, max_w) by padding with zeros on the right and bottom. 4 | pub fn pad(image: &Tensor, max_h: usize, max_w: usize) -> Result { 5 | let (c, h, w) = image.dims3()?; 6 | let new_image = Tensor::zeros((c, max_h, max_w), image.dtype(), image.device())?; 7 | new_image.slice_assign(&[&(..c), &(..h), &(..w)], image) 8 | } 9 | 10 | /// Generate pixel mask of shape (c, max_h, max_w). 1 indicates valid pixel, 0 indicates padding. 
11 | /// 12 | /// The input tensor is of shape (c, max_h, max_w) and the output mask is the same shape and 13 | /// represents where pixels are. The valid region sits in the top-left corner and its size is passed as `h` and `w`. 14 | pub fn make_pixel_mask(image: &Tensor, h: usize, w: usize) -> Result<Tensor> { 15 | let (_c, max_h, max_w) = image.dims3()?; 16 | let mask = Tensor::ones((h, w), image.dtype(), image.device())?; 17 | let zeros = Tensor::zeros((max_h, max_w), image.dtype(), image.device())?; 18 | // TODO(EricLBuehler): https://github.com/huggingface/candle/pull/2223 will make this nicer 19 | zeros.slice_assign(&[&(..h), &(..w)], &mask) 20 | } 21 | 22 | /// Given the image sizes (h, w) and the minimum and maximum lengths, calculate the image dimensions 23 | /// which will preserve aspect ratio while respecting the minimum and maximum lengths. 24 | pub fn get_resize_image_size( 25 | (h, w): (usize, usize), 26 | (min_len, max_len): (usize, usize), 27 | ) -> (usize, usize) { 28 | let aspect_ratio = w as f64 / h as f64; 29 | 30 | let (new_h, new_w) = if w >= h && w > max_len { 31 | ((max_len as f64 / aspect_ratio) as usize, max_len) 32 | } else if h > w && h > max_len { 33 | (max_len, (max_len as f64 * aspect_ratio) as usize) 34 | } else { 35 | (h, w) 36 | }; 37 | (new_h.max(min_len), new_w.max(min_len)) 38 | } 39 | -------------------------------------------------------------------------------- /mistralrs-core/src/scheduler/mod.rs: -------------------------------------------------------------------------------- 1 | mod default_scheduler; 2 | 3 | pub use default_scheduler::{DefaultScheduler, DefaultSchedulerMethod, DefaultSchedulerOutput}; 4 | 5 | use crate::{ 6 | paged_attention::{ 7 | BlockEngine, BlockTables, CacheConfig, PagedAttentionScheduler, 8 | PagedAttentionSchedulerConfig, PagedAttentionSchedulerOutput, 9 | }, 10 | sequence::Sequence, 11 | }; 12 | 13 | #[derive(Clone)] 14 | pub enum SchedulerConfig { 15 | DefaultScheduler { 16 | method: DefaultSchedulerMethod, 17 | }, 18 | PagedAttentionMeta { 19 | max_num_seqs: usize, 20 | config: CacheConfig, 21 | }, 22 | } 23 | 24 | impl SchedulerConfig { 25 | pub fn into_scheduler(self) -> Box<dyn Scheduler> { 26 | match self { 27 | Self::DefaultScheduler { method } => Box::new(DefaultScheduler::new(method)), 28 | Self::PagedAttentionMeta { 29 | max_num_seqs, 30 | config, 31 | } => Box::new(PagedAttentionScheduler::new( 32 | PagedAttentionSchedulerConfig { max_num_seqs }, 33 | config, 34 | )), 35 | } 36 | } 37 | } 38 | 39 | pub enum SchedulerOutput<'a> { 40 | DefaultScheduler { 41 | output: DefaultSchedulerOutput<'a>, 42 | }, 43 | PagedAttention { 44 | output: PagedAttentionSchedulerOutput, 45 | }, 46 | } 47 | 48 | pub trait Scheduler { 49 | fn schedule(&mut self) -> SchedulerOutput<'_>; 50 | fn waiting_len(&self) -> usize; 51 | fn running_len(&self) -> usize; 52 | fn add_seq(&mut self, seq: Sequence); 53 | /// This may do nothing.
It depends on the implementation 54 | fn free_finished_sequence_groups(&mut self); 55 | 56 | // PagedAttention metadata 57 | fn block_tables(&self) -> Option<&BlockTables>; 58 | fn block_size(&self) -> Option; 59 | fn block_engine(&mut self) -> Option<&mut BlockEngine>; 60 | } 61 | -------------------------------------------------------------------------------- /mistralrs/examples/anymoe_lora/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use mistralrs::{ 3 | AnyMoeConfig, AnyMoeExpertType, AnyMoeModelBuilder, IsqType, PagedAttentionMetaBuilder, 4 | TextMessageRole, TextMessages, TextModelBuilder, 5 | }; 6 | 7 | #[tokio::main] 8 | async fn main() -> Result<()> { 9 | let text_builder = TextModelBuilder::new("mistralai/Mistral-7B-Instruct-v0.1") 10 | .with_isq(IsqType::Q8_0) 11 | .with_logging() 12 | .with_paged_attn(|| PagedAttentionMetaBuilder::default().build())?; 13 | 14 | let model = AnyMoeModelBuilder::from_text_builder( 15 | text_builder, 16 | AnyMoeConfig { 17 | hidden_size: 4096, 18 | lr: 1e-3, 19 | epochs: 100, 20 | batch_size: 4, 21 | expert_type: AnyMoeExpertType::FineTuned, 22 | gate_model_id: None, // Set this to Some("path/to/model/id") for the pretrained gating model id 23 | training: true, 24 | loss_csv_path: None, 25 | }, 26 | "model.layers", 27 | "mlp", 28 | "examples/amoe.json", 29 | vec!["typeof/zephyr-7b-beta-lora"], 30 | vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], 31 | ) 32 | .build() 33 | .await?; 34 | 35 | let messages = TextMessages::new() 36 | .add_message( 37 | TextMessageRole::System, 38 | "You are an AI agent with a specialty in programming.", 39 | ) 40 | .add_message( 41 | TextMessageRole::User, 42 | "Hello! How are you? Please write generic binary search function in Rust.", 43 | ); 44 | 45 | let response = model.send_chat_request(messages).await?; 46 | 47 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 48 | dbg!( 49 | response.usage.avg_prompt_tok_per_sec, 50 | response.usage.avg_compl_tok_per_sec 51 | ); 52 | 53 | Ok(()) 54 | } 55 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = [ 3 | "mistralrs-server", 4 | "mistralrs-core", 5 | "mistralrs-pyo3", 6 | "mistralrs", 7 | "mistralrs-bench", 8 | "mistralrs-vision", 9 | "mistralrs-quant", 10 | ] 11 | exclude = [ 12 | "mistralrs-paged_attn", 13 | ] 14 | resolver = "2" 15 | 16 | [workspace.package] 17 | version = "0.3.1" 18 | edition = "2021" 19 | description = "Fast and easy LLM serving." 
20 | homepage = "https://github.com/EricLBuehler/mistral.rs" 21 | repository = "https://github.com/EricLBuehler/mistral.rs" 22 | keywords = ["machine-learning"] 23 | categories = ["science"] 24 | license = "MIT" 25 | 26 | [workspace.dependencies] 27 | anyhow = "1.0.80" 28 | candle-core = { git = "https://github.com/EricLBuehler/candle.git", version = "0.7.0", rev = "60eb251" } 29 | candle-nn = { git = "https://github.com/EricLBuehler/candle.git", version = "0.7.0", rev = "60eb251" } 30 | serde = "1.0.197" 31 | serde_json = "1.0.114" 32 | indexmap = { version = "2.2.5", features = ["serde"] } 33 | either = { version = "1.10.0", features = ["serde"] } 34 | accelerate-src = { version = "0.3.2" } 35 | intel-mkl-src = { version = "0.8.1", features = ["mkl-static-lp64-iomp"] } 36 | tracing = "0.1.40" 37 | tracing-subscriber = { version = "0.3.18", features = ["env-filter"] } 38 | futures = "0.3" 39 | clap = { version = "4.5.1", features = ["derive"] } 40 | pyo3 = { version = "0.22.0", features = ["full", "extension-module", "either"] } 41 | tokio = { version = "1.36.0", features = ["full", "rt-multi-thread"] } 42 | once_cell = "1.19.0" 43 | # All features but avif, avif increases the msrv dramatically 44 | image = { version = "0.25.1", default-features = false, features = ['bmp', 'dds', 'exr', 'ff', 'gif', 'hdr', 'ico', 'jpeg', 'png', 'pnm', 'qoi', 'tga', 'tiff', 'webp']} 45 | reqwest = { version = "0.12.4", features = ["blocking"] } 46 | base64 = "0.22.1" 47 | half = "2.4.0" 48 | rayon = "1.1.0" 49 | url = "2.5.2" 50 | data-url = "0.3.1" 51 | buildstructor = "0.5.4" 52 | float8 = "0.1.1" 53 | -------------------------------------------------------------------------------- /Dockerfile.cuda-all: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 AS builder 2 | 3 | RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ 4 | curl \ 5 | libssl-dev \ 6 | pkg-config \ 7 | && rm -rf /var/lib/apt/lists/* 8 | 9 | RUN curl https://sh.rustup.rs -sSf | bash -s -- -y 10 | ENV PATH="/root/.cargo/bin:${PATH}" 11 | RUN rustup update nightly 12 | RUN rustup default nightly 13 | 14 | WORKDIR /mistralrs 15 | 16 | COPY . . 
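# CUDA_COMPUTE_CAP below selects the GPU architecture the CUDA kernels are compiled for.
# 80 targets Ampere data-center GPUs (e.g. A100); common alternatives are 86 (RTX 30xx),
# 89 (Ada / RTX 40xx) and 90 (Hopper / H100). Override it at build time if needed, e.g.
#   docker build --build-arg CUDA_COMPUTE_CAP=89 -f Dockerfile.cuda-all .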
17 | 18 | ARG CUDA_COMPUTE_CAP=80 19 | ENV CUDA_COMPUTE_CAP=${CUDA_COMPUTE_CAP} 20 | ARG FEATURES="cuda cudnn" 21 | ENV RAYON_NUM_THREADS=4 22 | RUN RUSTFLAGS="-Z threads=4" cargo build --release --workspace --exclude mistralrs-pyo3 --features "${FEATURES}" 23 | 24 | FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04 AS base 25 | 26 | ENV HUGGINGFACE_HUB_CACHE=/data \ 27 | PORT=80 \ 28 | RAYON_NUM_THREADS=8 \ 29 | LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH 30 | 31 | # Run the script to create symlinks in /usr/local/cuda/lib64 32 | RUN set -eux; \ 33 | for lib in $(ls /usr/local/cuda/lib64); do \ 34 | base=$(echo $lib | sed -r 's/(.+)\.so\..+/\1.so/'); \ 35 | if [ "$lib" != "$base" ]; then \ 36 | ln -sf "/usr/local/cuda/lib64/$lib" "/usr/local/cuda/lib64/$base"; \ 37 | fi; \ 38 | done 39 | 40 | RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ 41 | libomp-dev \ 42 | ca-certificates \ 43 | libssl-dev \ 44 | curl \ 45 | pkg-config \ 46 | && rm -rf /var/lib/apt/lists/* 47 | 48 | FROM base 49 | 50 | COPY --from=builder /mistralrs/target/release/mistralrs-bench /usr/local/bin/mistralrs-bench 51 | RUN chmod +x /usr/local/bin/mistralrs-bench 52 | COPY --from=builder /mistralrs/target/release/mistralrs-server /usr/local/bin/mistralrs-server 53 | RUN chmod +x /usr/local/bin/mistralrs-server 54 | ENTRYPOINT ["mistralrs-server", "--port", "80", "--token-source", "env:HUGGING_FACE_HUB_TOKEN"] 55 | -------------------------------------------------------------------------------- /mistralrs-quant/src/dummy/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::{QuantMethod, QuantizedSerde}; 2 | 3 | #[derive(Debug)] 4 | pub struct DummyLayer; 5 | 6 | impl QuantMethod for DummyLayer { 7 | fn new(_method: crate::QuantMethodConfig) -> candle_core::Result 8 | where 9 | Self: Sized, 10 | { 11 | Ok(Self) 12 | } 13 | fn add_delta_w( 14 | &self, 15 | _delta: &candle_core::Tensor, 16 | ) -> candle_core::Result> { 17 | candle_core::bail!("DummyLayer should not ever be present in forward pass!") 18 | } 19 | fn apply_isq( 20 | self: std::sync::Arc, 21 | _dtype: Option, 22 | _device: candle_core::Device, 23 | _n_quantized: &std::sync::atomic::AtomicUsize, 24 | ) -> candle_core::Result> { 25 | candle_core::bail!("DummyLayer should not ever be present in forward pass!") 26 | } 27 | fn dtype_and_device(&self) -> (candle_core::DType, candle_core::Device) { 28 | (candle_core::DType::F64, candle_core::Device::Cpu) 29 | } 30 | fn forward(&self, _a: &candle_core::Tensor) -> candle_core::Result { 31 | candle_core::bail!("DummyLayer should not ever be present in forward pass!") 32 | } 33 | fn forward_via_half( 34 | &self, 35 | _a: &candle_core::Tensor, 36 | ) -> candle_core::Result { 37 | candle_core::bail!("DummyLayer should not ever be present in forward pass!") 38 | } 39 | fn get_bias_mut(&mut self) -> Option<&mut candle_core::Tensor> { 40 | None 41 | } 42 | fn get_max_isq_cpu_threads(&self, _dtype: crate::IsqType) -> Option { 43 | None 44 | } 45 | fn quantized_act_type(&self) -> Option { 46 | None 47 | } 48 | } 49 | 50 | impl QuantizedSerde for DummyLayer { 51 | fn name(&self) -> &'static str { 52 | "dummy" 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /examples/server/chat.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | import httpx 3 | import textwrap 4 | import json 5 | 6 | 7 | def 
log_response(response: httpx.Response): 8 | request = response.request 9 | print(f"Request: {request.method} {request.url}") 10 | print(" Headers:") 11 | for key, value in request.headers.items(): 12 | if key.lower() == "authorization": 13 | value = "[...]" 14 | if key.lower() == "cookie": 15 | value = value.split("=")[0] + "=..." 16 | print(f" {key}: {value}") 17 | print(" Body:") 18 | try: 19 | request_body = json.loads(request.content) 20 | print(textwrap.indent(json.dumps(request_body, indent=2), " ")) 21 | except json.JSONDecodeError: 22 | print(textwrap.indent(request.content.decode(), " ")) 23 | print(f"Response: status_code={response.status_code}") 24 | print(" Headers:") 25 | for key, value in response.headers.items(): 26 | if key.lower() == "set-cookie": 27 | value = value.split("=")[0] + "=..." 28 | print(f" {key}: {value}") 29 | 30 | 31 | client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/") 32 | 33 | # Enable this to log requests and responses 34 | # client._client = httpx.Client( 35 | # event_hooks={"request": [print], "response": [log_response]} 36 | # ) 37 | 38 | messages = [] 39 | prompt = input("Enter system prompt >>> ") 40 | if len(prompt) > 0: 41 | messages.append({"role": "system", "content": prompt}) 42 | 43 | 44 | while True: 45 | prompt = input(">>> ") 46 | messages.append({"role": "user", "content": prompt}) 47 | completion = client.chat.completions.create( 48 | model="mistral", 49 | messages=messages, 50 | max_tokens=256, 51 | frequency_penalty=1.0, 52 | top_p=0.1, 53 | temperature=0, 54 | ) 55 | resp = completion.choices[0].message.content 56 | print(resp) 57 | messages.append({"role": "assistant", "content": resp}) 58 | -------------------------------------------------------------------------------- /mistralrs-core/src/vision_models/preprocessor_config.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | 3 | use candle_core::Result; 4 | use image::imageops::FilterType; 5 | use serde::Deserialize; 6 | 7 | #[derive(Deserialize, Debug, Clone)] 8 | #[allow(dead_code)] 9 | pub struct PreProcessorConfig { 10 | pub(crate) do_convert_rgb: Option, 11 | pub(crate) do_image_splitting: Option, 12 | pub(crate) do_normalize: Option, 13 | pub(crate) do_pad: Option, 14 | pub(crate) do_rescale: Option, 15 | pub(crate) do_resize: Option, 16 | pub(crate) do_center_crop: Option, 17 | pub(crate) image_mean: Option<[f64; 3]>, 18 | pub(crate) image_std: Option<[f64; 3]>, 19 | pub(crate) rescale_factor: Option, 20 | pub(crate) resampling: Option, 21 | pub(crate) size: Option>, 22 | pub(crate) crop_size: Option>, 23 | pub(crate) num_img_tokens: Option, 24 | pub(crate) num_crops: Option, 25 | pub(crate) max_image_tiles: Option, 26 | } 27 | 28 | #[allow(dead_code)] 29 | pub(crate) trait ToFilter { 30 | fn to_filter(self) -> Result; 31 | } 32 | 33 | impl ToFilter for Option { 34 | // https://github.com/python-pillow/Pillow/blob/4b68563e8a818fb9c528fa159ddf3f4eaefa35e6/src/PIL/Image.py#L164-L170 35 | // Default: https://github.com/huggingface/transformers/blob/0df888ffb72ea370555efdef45985378d3cc7b2b/src/transformers/models/idefics2/image_processing_idefics2.py#L226 36 | fn to_filter(self) -> Result { 37 | match self { 38 | Some(0) => Ok(FilterType::Nearest), 39 | Some(1) => Ok(FilterType::Lanczos3), 40 | Some(2) | None => Ok(FilterType::Triangle), // BiLinear 41 | Some(3) => Ok(FilterType::CatmullRom), // BiCubic 42 | Some(4) => Ok(FilterType::Nearest), 43 | Some(x) => candle_core::bail!("Filter 
number {x} not supported"), 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /mistralrs-pyo3/src/stream.rs: -------------------------------------------------------------------------------- 1 | use tokio::sync::mpsc::Receiver; 2 | 3 | use mistralrs_core::{ChatCompletionChunkResponse, Response}; 4 | use pyo3::{exceptions::PyValueError, pyclass, pymethods, PyRef, PyRefMut, PyResult}; 5 | 6 | #[pyclass] 7 | pub struct ChatCompletionStreamer { 8 | rx: Receiver, 9 | is_done: bool, 10 | } 11 | 12 | impl ChatCompletionStreamer { 13 | pub fn from_rx(rx: Receiver) -> Self { 14 | Self { rx, is_done: false } 15 | } 16 | } 17 | 18 | #[pymethods] 19 | impl ChatCompletionStreamer { 20 | fn __iter__(this: PyRef<'_, Self>) -> PyRef<'_, Self> { 21 | this 22 | } 23 | fn __next__(mut this: PyRefMut<'_, Self>) -> Option> { 24 | if this.is_done { 25 | return None; 26 | } 27 | match this.rx.blocking_recv() { 28 | Some(resp) => match resp { 29 | Response::ModelError(msg, _) => Some(Err(PyValueError::new_err(msg.to_string()))), 30 | Response::ValidationError(e) => Some(Err(PyValueError::new_err(e.to_string()))), 31 | Response::InternalError(e) => Some(Err(PyValueError::new_err(e.to_string()))), 32 | Response::Chunk(response) => { 33 | if response.choices.iter().all(|x| x.finish_reason.is_some()) { 34 | this.is_done = true; 35 | } 36 | Some(Ok(response)) 37 | } 38 | Response::Done(_) => unreachable!(), 39 | Response::CompletionDone(_) => unreachable!(), 40 | Response::CompletionModelError(_, _) => unreachable!(), 41 | Response::CompletionChunk(_) => unreachable!(), 42 | Response::ImageGeneration(_) => unreachable!(), 43 | }, 44 | None => Some(Err(PyValueError::new_err( 45 | "Received none in ChatCompletionStreamer".to_string(), 46 | ))), 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /mistralrs-paged-attn/src/attention/attention_generic.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h 3 | * Copyright (c) 2023, The vLLM team. 4 | * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | #pragma once 19 | 20 | #include 21 | 22 | namespace vllm { 23 | 24 | // A vector type to store Q, K, V elements. 25 | template 26 | struct Vec {}; 27 | 28 | // A vector type to store FP32 accumulators. 29 | template 30 | struct FloatVec {}; 31 | 32 | // Template vector operations. 
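// (The concrete Vec/FloatVec specializations and the mul/sum overloads are expected to live in
// the per-dtype headers pulled in through attention_dtypes.h; `dot` below is just sum(mul(a, b)),
// with the FP32-accumulator variant used to avoid accumulating in half precision.)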
33 | template 34 | inline __device__ Acc mul(A a, B b); 35 | 36 | template 37 | inline __device__ float sum(T v); 38 | 39 | template 40 | inline __device__ float dot(T a, T b) { 41 | return sum(mul(a, b)); 42 | } 43 | 44 | template 45 | inline __device__ float dot(T a, T b) { 46 | return sum(mul(a, b)); 47 | } 48 | 49 | template 50 | inline __device__ void zero(T& dst) { 51 | constexpr int WORDS = sizeof(T) / 4; 52 | union { 53 | T raw; 54 | uint32_t words[WORDS]; 55 | } tmp; 56 | 57 | #pragma unroll 58 | for (int ii = 0; ii < WORDS; ++ii) { 59 | tmp.words[ii] = 0u; 60 | } 61 | dst = tmp.raw; 62 | } 63 | 64 | } // namespace vllm 65 | -------------------------------------------------------------------------------- /mistralrs/examples/lora_activation/main.rs: -------------------------------------------------------------------------------- 1 | use std::fs::File; 2 | 3 | use anyhow::Result; 4 | use mistralrs::{ 5 | LoraModelBuilder, RequestBuilder, TextMessageRole, TextMessages, TextModelBuilder, 6 | }; 7 | 8 | #[tokio::main] 9 | async fn main() -> Result<()> { 10 | let model = 11 | LoraModelBuilder::from_text_model_builder( 12 | TextModelBuilder::new("HuggingFaceH4/zephyr-7b-beta").with_logging(), 13 | "lamm-mit/x-lora", 14 | serde_json::from_reader(File::open("my-ordering-file.json").unwrap_or_else(|_| { 15 | panic!("Could not load ordering file at my-ordering-file.json") 16 | }))?, 17 | ) 18 | .build() 19 | .await?; 20 | 21 | // First example: activate adapters per-request 22 | let messages = RequestBuilder::new() 23 | .set_adapters(vec!["adapter_2".to_string()]) 24 | .add_message( 25 | TextMessageRole::User, 26 | "Hello! How are you? Please write generic binary search function in Rust.", 27 | ); 28 | 29 | let response = model.send_chat_request(messages).await?; 30 | 31 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 32 | dbg!( 33 | response.usage.avg_prompt_tok_per_sec, 34 | response.usage.avg_compl_tok_per_sec 35 | ); 36 | 37 | // Second example: activate adapters for the whole model, used for all subsequent requests 38 | model 39 | .activate_adapters(vec!["adapter_1".to_string()]) 40 | .await?; 41 | 42 | let messages = TextMessages::new().add_message( 43 | TextMessageRole::User, 44 | "Hello! How are you? Please write generic binary search function in Rust.", 45 | ); 46 | 47 | let response = model.send_chat_request(messages).await?; 48 | 49 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 50 | dbg!( 51 | response.usage.avg_prompt_tok_per_sec, 52 | response.usage.avg_compl_tok_per_sec 53 | ); 54 | 55 | Ok(()) 56 | } 57 | -------------------------------------------------------------------------------- /.github/workflows/analysis.yaml: -------------------------------------------------------------------------------- 1 | name: Analysis 2 | on: 3 | pull_request_target 4 | 5 | jobs: 6 | comment: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - name: Checkout code 10 | uses: actions/checkout@v4 11 | 12 | - name: Install Rust and Cargo 13 | run: | 14 | curl -sSf https://sh.rustup.rs | sh -s -- -y 15 | source $HOME/.cargo/env 16 | 17 | - name: Install Tokei 18 | run: cargo install tokei 19 | 20 | - name: Run Tokei and get the lines of code 21 | run: tokei . > tokei_output.txt 22 | 23 | - name: Comment or Update PR 24 | uses: actions/github-script@v7 25 | with: 26 | script: | 27 | const fs = require('fs'); 28 | const tokeiOutput = fs.readFileSync('tokei_output.txt', 'utf8'); 29 | const uniqueIdentifier = 'Code Metrics Report'; 30 | const codeReport = ` 31 |
32 | <details><summary>${uniqueIdentifier}</summary> 33 | <pre>
34 |               ${tokeiOutput}
35 |               </pre>
36 | </details>
37 | `; 38 | 39 | const issue_number = context.issue.number; 40 | const { owner, repo } = context.repo; 41 | 42 | const comments = await github.rest.issues.listComments({ 43 | issue_number, 44 | owner, 45 | repo 46 | }); 47 | 48 | const existingComment = comments.data.find(comment => comment.body.includes(uniqueIdentifier)); 49 | 50 | if (existingComment) { 51 | await github.rest.issues.updateComment({ 52 | owner, 53 | repo, 54 | comment_id: existingComment.id, 55 | body: codeReport 56 | }); 57 | } else { 58 | await github.rest.issues.createComment({ 59 | issue_number, 60 | owner, 61 | repo, 62 | body: codeReport 63 | }); 64 | } 65 | -------------------------------------------------------------------------------- /mistralrs-paged-attn/src/ffi.rs: -------------------------------------------------------------------------------- 1 | use core::ffi::{c_int, c_long, c_void}; 2 | 3 | extern "C" { 4 | pub fn reshape_and_cache( 5 | key: *const c_void, 6 | value: *const c_void, 7 | key_cache: *const c_void, 8 | value_cache: *const c_void, 9 | slot_mapping: *const c_long, 10 | 11 | num_tokens: c_int, 12 | num_heads: c_int, 13 | head_size: c_int, 14 | block_size: c_int, 15 | x: c_int, 16 | key_stride: c_int, 17 | value_stride: c_int, 18 | 19 | dtype: u32, 20 | ); 21 | 22 | pub fn paged_attention_v1( 23 | out: *const c_void, 24 | query: *const c_void, 25 | key_cache: *const c_void, 26 | value_cache: *const c_void, 27 | num_kv_heads: c_int, 28 | scale: f32, 29 | softcapping: f32, 30 | block_tables: *const c_int, 31 | context_lens: *const c_int, 32 | block_size: c_int, 33 | max_context_len: c_int, 34 | 35 | num_seqs: c_int, 36 | num_heads: c_int, 37 | head_size: c_int, 38 | max_num_blocks_per_seq: c_int, 39 | q_stride: c_int, 40 | kv_block_stride: c_int, 41 | kv_head_stride: c_int, 42 | 43 | dtype: u32, 44 | ); 45 | 46 | pub fn paged_attention_v2( 47 | out: *const c_void, 48 | exp_sums: *const f32, 49 | max_logits: *const f32, 50 | tmp_out: *const c_void, 51 | query: *const c_void, 52 | key_cache: *const c_void, 53 | value_cache: *const c_void, 54 | num_kv_heads: c_int, 55 | scale: f32, 56 | softcapping: f32, 57 | block_tables: *const c_int, 58 | context_lens: *const c_int, 59 | block_size: c_int, 60 | max_context_len: c_int, 61 | 62 | num_seqs: c_int, 63 | num_heads: c_int, 64 | head_size: c_int, 65 | max_num_blocks_per_seq: c_int, 66 | q_stride: c_int, 67 | kv_block_stride: c_int, 68 | kv_head_stride: c_int, 69 | 70 | dtype: u32, 71 | ); 72 | } 73 | -------------------------------------------------------------------------------- /mistralrs/examples/anymoe/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use mistralrs::{ 3 | AnyMoeConfig, AnyMoeExpertType, AnyMoeModelBuilder, IsqType, PagedAttentionMetaBuilder, 4 | TextMessageRole, TextMessages, TextModelBuilder, 5 | }; 6 | 7 | #[tokio::main] 8 | async fn main() -> Result<()> { 9 | let text_builder = TextModelBuilder::new("mistralai/Mistral-7B-Instruct-v0.1") 10 | .with_isq(IsqType::Q8_0) 11 | .with_logging() 12 | .with_paged_attn(|| PagedAttentionMetaBuilder::default().build())?; 13 | 14 | let model = AnyMoeModelBuilder::from_text_builder( 15 | text_builder, 16 | AnyMoeConfig { 17 | hidden_size: 4096, 18 | lr: 1e-3, 19 | epochs: 100, 20 | batch_size: 4, 21 | expert_type: AnyMoeExpertType::LoraAdapter { 22 | rank: 64, 23 | alpha: 16., 24 | target_modules: vec!["gate_proj".to_string()], 25 | }, 26 | gate_model_id: None, // Set this to Some("path/to/model/id") for the pretrained gating 
model id 27 | training: true, 28 | loss_csv_path: None, 29 | }, 30 | "model.layers", 31 | "mlp", 32 | "examples/amoe.json", 33 | vec!["HuggingFaceH4/zephyr-7b-beta"], 34 | vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], 35 | ) 36 | .build() 37 | .await?; 38 | 39 | let messages = TextMessages::new() 40 | .add_message( 41 | TextMessageRole::System, 42 | "You are an AI agent with a specialty in programming.", 43 | ) 44 | .add_message( 45 | TextMessageRole::User, 46 | "Hello! How are you? Please write generic binary search function in Rust.", 47 | ); 48 | 49 | let response = model.send_chat_request(messages).await?; 50 | 51 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 52 | dbg!( 53 | response.usage.avg_prompt_tok_per_sec, 54 | response.usage.avg_compl_tok_per_sec 55 | ); 56 | 57 | Ok(()) 58 | } 59 | -------------------------------------------------------------------------------- /examples/server/adapter_chat.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | import httpx 3 | import textwrap 4 | import json 5 | 6 | 7 | def log_response(response: httpx.Response): 8 | request = response.request 9 | print(f"Request: {request.method} {request.url}") 10 | print(" Headers:") 11 | for key, value in request.headers.items(): 12 | if key.lower() == "authorization": 13 | value = "[...]" 14 | if key.lower() == "cookie": 15 | value = value.split("=")[0] + "=..." 16 | print(f" {key}: {value}") 17 | print(" Body:") 18 | try: 19 | request_body = json.loads(request.content) 20 | print(textwrap.indent(json.dumps(request_body, indent=2), " ")) 21 | except json.JSONDecodeError: 22 | print(textwrap.indent(request.content.decode(), " ")) 23 | print(f"Response: status_code={response.status_code}") 24 | print(" Headers:") 25 | for key, value in response.headers.items(): 26 | if key.lower() == "set-cookie": 27 | value = value.split("=")[0] + "=..." 
28 | print(f" {key}: {value}") 29 | 30 | 31 | client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/") 32 | 33 | # Enable this to log requests and responses 34 | # client._client = httpx.Client( 35 | # event_hooks={"request": [print], "response": [log_response]} 36 | # ) 37 | 38 | messages = [] 39 | prompt = input("Enter system prompt >>> ") 40 | if len(prompt) > 0: 41 | messages.append({"role": "system", "content": prompt}) 42 | 43 | 44 | while True: 45 | prompt = input(">>> ") 46 | adapter = input("Active adapter >>> ") 47 | messages.append({"role": "user", "content": prompt}) 48 | completion = client.chat.completions.create( 49 | model="mistral", 50 | messages=messages, 51 | max_tokens=256, 52 | frequency_penalty=1.0, 53 | top_p=0.1, 54 | temperature=0, 55 | extra_body={"adapters": [adapter]}, 56 | ) 57 | resp = completion.choices[0].message.content 58 | print(resp) 59 | messages.append({"role": "assistant", "content": resp}) 60 | -------------------------------------------------------------------------------- /mistralrs/examples/batching/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use mistralrs::{ 3 | ChatCompletionResponse, IsqType, PagedAttentionMetaBuilder, TextMessageRole, TextMessages, 4 | TextModelBuilder, Usage, 5 | }; 6 | 7 | const N_REQUESTS: usize = 10; 8 | 9 | #[tokio::main] 10 | async fn main() -> Result<()> { 11 | let model = TextModelBuilder::new("microsoft/Phi-3.5-mini-instruct") 12 | .with_isq(IsqType::Q8_0) 13 | .with_logging() 14 | .with_paged_attn(|| PagedAttentionMetaBuilder::default().build())? 15 | .build() 16 | .await?; 17 | 18 | let messages = TextMessages::new() 19 | .add_message( 20 | TextMessageRole::System, 21 | "You are an AI agent with a specialty in programming.", 22 | ) 23 | .add_message( 24 | TextMessageRole::User, 25 | "Hello! How are you? Please write generic binary search function in Rust.", 26 | ); 27 | 28 | let mut handles = Vec::new(); 29 | for _ in 0..N_REQUESTS { 30 | handles.push(model.send_chat_request(messages.clone())); 31 | } 32 | let responses = futures::future::join_all(handles) 33 | .await 34 | .into_iter() 35 | .collect::>>()?; 36 | 37 | let mut max_prompt = f32::MIN; 38 | let mut max_completion = f32::MIN; 39 | 40 | for response in responses { 41 | let ChatCompletionResponse { 42 | usage: 43 | Usage { 44 | avg_compl_tok_per_sec, 45 | avg_prompt_tok_per_sec, 46 | .. 47 | }, 48 | .. 
49 | } = response; 50 | dbg!(avg_compl_tok_per_sec, avg_prompt_tok_per_sec); 51 | if avg_compl_tok_per_sec > max_prompt { 52 | max_prompt = avg_prompt_tok_per_sec; 53 | } 54 | if avg_compl_tok_per_sec > max_completion { 55 | max_completion = avg_compl_tok_per_sec; 56 | } 57 | } 58 | println!("Individual sequence stats: {max_prompt} max PP T/s, {max_completion} max TG T/s"); 59 | 60 | Ok(()) 61 | } 62 | -------------------------------------------------------------------------------- /mistralrs-core/src/dummy_paged_attention/cache_engine.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | collections::HashMap, 3 | sync::{Arc, Mutex, MutexGuard}, 4 | }; 5 | 6 | use candle_core::{DType, Device, Result, Tensor}; 7 | 8 | use super::config::ModelConfigLike; 9 | 10 | #[derive(Clone, Debug)] 11 | pub struct CacheConfig { 12 | pub block_size: usize, 13 | pub num_gpu_blocks: usize, 14 | pub num_cpu_blocks: usize, 15 | } 16 | 17 | pub type KVCache = (Tensor, Tensor); 18 | 19 | pub struct CacheEngine { 20 | dummy_cache: Arc>>, 21 | } 22 | 23 | impl CacheEngine { 24 | pub fn new( 25 | _model_config: &dyn ModelConfigLike, 26 | _cache_config: &CacheConfig, 27 | _dtype: DType, 28 | _device: &Device, 29 | ) -> Result { 30 | Ok(Self { 31 | dummy_cache: Arc::new(Mutex::new(Vec::new())), 32 | }) 33 | } 34 | 35 | pub fn get_kv_cache(&self) -> MutexGuard<'_, Vec> { 36 | loop { 37 | if let Ok(v) = self.dummy_cache.try_lock() { 38 | return v; 39 | } 40 | } 41 | } 42 | } 43 | 44 | impl CacheEngine { 45 | pub fn execute_scheduler_ops( 46 | &self, 47 | blocks_to_swap_in: HashMap, 48 | blocks_to_swap_out: HashMap, 49 | blocks_to_copy: HashMap>, 50 | ) -> Result<()> { 51 | if !blocks_to_swap_in.is_empty() { 52 | self.swap_in(blocks_to_swap_in)?; 53 | } 54 | if !blocks_to_swap_out.is_empty() { 55 | self.swap_out(blocks_to_swap_out)?; 56 | } 57 | if !blocks_to_copy.is_empty() { 58 | self.copy(blocks_to_copy)?; 59 | } 60 | Ok(()) 61 | } 62 | 63 | pub fn swap_in(&self, _src_to_dst: HashMap) -> Result<()> { 64 | Ok(()) 65 | } 66 | 67 | pub fn swap_out(&self, _src_to_dst: HashMap) -> Result<()> { 68 | Ok(()) 69 | } 70 | 71 | pub fn copy(&self, _src_to_dst: HashMap>) -> Result<()> { 72 | Ok(()) 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /docs/ISQ.md: -------------------------------------------------------------------------------- 1 | # In situ quantization 2 | 3 | In situ quantization works by quantizing non GGUF or GGML models in-place. This allows you to take advantage of flash attention, and reduces memory footprint when running the model. Currently, all layers which would be `Linear` are able to be quantized. 4 | 5 | An API is exposed on the Python and Rust APIs which provide the ability to dynamically re-ISQ models at runtime. 6 | 7 | To set the ISQ type for individual layers, use a model [`topology`](TOPOLOGY.md). 8 | 9 | ## ISQ quantization types 10 | - Q4_0 11 | - Q4_1 12 | - Q5_0 13 | - Q5_1 14 | - Q8_0 15 | - Q8_1 (*not available on CUDA*) 16 | - Q2K 17 | - Q3K 18 | - Q4K 19 | - Q5K 20 | - Q6K 21 | - Q8K (*not available on CUDA*) 22 | - HQQ4 23 | - HQQ8 24 | - FP8 25 | 26 | When using ISQ, it will automatically load ISQ-able weights into CPU memory before applying ISQ. The ISQ application process moves the weights to device memory. This process is implemented to avoid memory spikes from loading the model in full precision. 
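To make the runtime re-ISQ API mentioned above concrete, here is a minimal, hedged sketch using the Python bindings. It assumes the `Runner` exposes a `send_re_isq` method taking an ISQ type name; treat that method name (and the GGUF-based construction, borrowed from the Python example below) as assumptions to verify against the current Python API.

```python
# Minimal sketch of dynamic re-ISQ at runtime via the Python API.
# Assumption: the pyo3 bindings expose `Runner.send_re_isq(<ISQ type name>)`;
# verify the exact method name and accepted values before relying on this.
from mistralrs import Runner, Which

runner = Runner(
    which=Which.GGUF(
        tok_model_id="mistralai/Mistral-7B-Instruct-v0.1",
        quantized_model_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF",
        quantized_filename="mistral-7b-instruct-v0.1.Q4_K_M.gguf",
    ),
    in_situ_quant="Q4K",  # ISQ type applied at load time
)

# Later, re-quantize the already-loaded model in place with a different ISQ type.
runner.send_re_isq("Q2K")
```

The intent is to change the quantization level of an already-loaded model without reloading it from scratch.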
27 | 28 | For Mixture of Expert models, a method called [MoQE](https://arxiv.org/abs/2310.02410) can be applied to only quantize MoE layers. This is configured via the ISQ organization parameter in all APIs. 29 | 30 | ## Python Example 31 | ```python 32 | runner = Runner( 33 | which=Which.GGUF( 34 | tok_model_id="mistralai/Mistral-7B-Instruct-v0.1", 35 | quantized_model_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF", 36 | quantized_filename="mistral-7b-instruct-v0.1.Q4_K_M.gguf", 37 | ), 38 | in_situ_quant="Q4K", 39 | ) 40 | ``` 41 | 42 | ## Rust Example 43 | You can find this example [here](../mistralrs/examples/isq/main.rs). 44 | 45 | ```rust 46 | let model = TextModelBuilder::new("microsoft/Phi-3.5-mini-instruct") 47 | .with_isq(IsqType::Q8_0) 48 | .with_logging() 49 | .with_paged_attn(|| PagedAttentionMetaBuilder::default().build())? 50 | .build() 51 | .await?; 52 | ``` 53 | 54 | ## Server example 55 | ``` 56 | cargo run --release --features "cuda flash-attn" -- --port 1234 --log output.txt --isq Q2K plain -m mistralai/Mistral-7B-Instruct-v0.1 -a mistral 57 | ``` -------------------------------------------------------------------------------- /mistralrs-quant/src/hqq/ffi.rs: -------------------------------------------------------------------------------- 1 | macro_rules! dequant_kernel { 2 | ($wq:ty, $scalar:ty, $postfix:tt) => { 3 | paste! { 4 | pub(crate) fn [< dequantize_ $postfix >]( 5 | wq_packed: *const $wq, 6 | scale: *const $scalar, 7 | zero: *const $scalar, 8 | out: *const $scalar, 9 | h: i32, 10 | w: i32 11 | ); 12 | } 13 | }; 14 | } 15 | 16 | pub mod eight_bit { 17 | use half::{bf16, f16}; 18 | use paste::paste; 19 | 20 | #[allow(dead_code)] 21 | extern "C" { 22 | dequant_kernel!(u8, f32, 8bit_u8_kernel_f32); 23 | dequant_kernel!(u8, f16, 8bit_u8_kernel_f16); 24 | dequant_kernel!(u8, bf16, 8bit_u8_kernel_bf16); 25 | } 26 | } 27 | 28 | pub mod four_bit { 29 | use half::{bf16, f16}; 30 | use paste::paste; 31 | 32 | #[allow(dead_code)] 33 | extern "C" { 34 | dequant_kernel!(u8, f32, 4bit_u8_kernel_f32); 35 | dequant_kernel!(u8, f16, 4bit_u8_kernel_f16); 36 | dequant_kernel!(u8, bf16, 4bit_u8_kernel_bf16); 37 | } 38 | } 39 | 40 | pub mod three_bit { 41 | use half::{bf16, f16}; 42 | use paste::paste; 43 | 44 | #[allow(dead_code)] 45 | extern "C" { 46 | dequant_kernel!(i32, f32, 3bit_32_kernel_f32); 47 | dequant_kernel!(i32, f16, 3bit_32_kernel_f16); 48 | dequant_kernel!(i32, bf16, 3bit_32_kernel_bf16); 49 | } 50 | } 51 | 52 | pub mod two_bit { 53 | use half::{bf16, f16}; 54 | use paste::paste; 55 | 56 | #[allow(dead_code)] 57 | extern "C" { 58 | dequant_kernel!(u8, f32, 2bit_u8_kernel_f32); 59 | dequant_kernel!(u8, f16, 2bit_u8_kernel_f16); 60 | dequant_kernel!(u8, bf16, 2bit_u8_kernel_bf16); 61 | } 62 | } 63 | 64 | pub mod one_bit { 65 | use half::{bf16, f16}; 66 | use paste::paste; 67 | 68 | #[allow(dead_code)] 69 | extern "C" { 70 | dequant_kernel!(u8, f32, 1bit_u8_kernel_f32); 71 | dequant_kernel!(u8, f16, 1bit_u8_kernel_f16); 72 | dequant_kernel!(u8, bf16, 1bit_u8_kernel_bf16); 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /mistralrs-paged-attn/src/attention/attention_utils.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp 3 | * Copyright (c) 2023, The vLLM team. 
4 | * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | #pragma once 19 | 20 | #include "../cuda_compat.h" 21 | #include "attention_dtypes.h" 22 | 23 | #include 24 | #include 25 | 26 | namespace vllm { 27 | 28 | // Q*K^T operation. 29 | template 30 | inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) { 31 | using A_vec = typename FloatVec::Type; 32 | // Compute the parallel products for Q*K^T (treat vector lanes separately). 33 | A_vec qk_vec = mul(q[0], k[0]); 34 | #pragma unroll 35 | for (int ii = 1; ii < N; ++ii) { 36 | qk_vec = fma(q[ii], k[ii], qk_vec); 37 | } 38 | 39 | // Finalize the reduction across lanes. 40 | float qk = sum(qk_vec); 41 | #pragma unroll 42 | for (int mask = THREAD_GROUP_SIZE / 2; mask >= 1; mask /= 2) { 43 | qk += VLLM_SHFL_XOR_SYNC(qk, mask); 44 | } 45 | return qk; 46 | } 47 | 48 | template 49 | struct Qk_dot { 50 | template 51 | static inline __device__ float dot(const Vec (&q)[N], const Vec (&k)[N]) { 52 | return qk_dot_(q, k); 53 | } 54 | }; 55 | 56 | } // namespace vllm 57 | -------------------------------------------------------------------------------- /mistralrs/examples/gguf_locally/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use mistralrs::{ 3 | GgufModelBuilder, PagedAttentionMetaBuilder, RequestBuilder, TextMessageRole, TextMessages, 4 | }; 5 | 6 | #[tokio::main] 7 | async fn main() -> Result<()> { 8 | // We do not use any files from remote servers here, and instead load the 9 | // chat template from the specified file, and the tokenizer and model from a 10 | // local GGUF file at the path specified. 11 | let model = GgufModelBuilder::new( 12 | "gguf_models/mistral_v0.1/", 13 | vec!["mistral-7b-instruct-v0.1.Q4_K_M.gguf"], 14 | ) 15 | .with_chat_template("chat_templates/mistral.json") 16 | .with_logging() 17 | .with_paged_attn(|| PagedAttentionMetaBuilder::default().build())? 18 | .build() 19 | .await?; 20 | 21 | let messages = TextMessages::new() 22 | .add_message( 23 | TextMessageRole::System, 24 | "You are an AI agent with a specialty in programming.", 25 | ) 26 | .add_message( 27 | TextMessageRole::User, 28 | "Hello! How are you? Please write generic binary search function in Rust.", 29 | ); 30 | 31 | let response = model.send_chat_request(messages).await?; 32 | 33 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 34 | dbg!( 35 | response.usage.avg_prompt_tok_per_sec, 36 | response.usage.avg_compl_tok_per_sec 37 | ); 38 | 39 | // Next example: Return some logprobs with the `RequestBuilder`, which enables higher configurability. 
40 | let request = RequestBuilder::new().return_logprobs(true).add_message( 41 | TextMessageRole::User, 42 | "Please write a mathematical equation where a few numbers are added.", 43 | ); 44 | 45 | let response = model.send_chat_request(request).await?; 46 | 47 | println!( 48 | "Logprobs: {:?}", 49 | &response.choices[0] 50 | .logprobs 51 | .as_ref() 52 | .unwrap() 53 | .content 54 | .as_ref() 55 | .unwrap()[0..3] 56 | ); 57 | 58 | Ok(()) 59 | } 60 | -------------------------------------------------------------------------------- /mistralrs-quant/src/gptq/gptq_cpu.rs: -------------------------------------------------------------------------------- 1 | use crate::{IsqType, QuantMethod, QuantMethodConfig, QuantizedSerde}; 2 | use candle_core::{DType, Device, Result, Tensor}; 3 | use std::{ 4 | num::NonZeroUsize, 5 | sync::{atomic::AtomicUsize, Arc}, 6 | }; 7 | 8 | #[derive(Debug)] 9 | pub struct GptqLayer; 10 | 11 | impl QuantMethod for GptqLayer { 12 | fn new(method: QuantMethodConfig) -> Result 13 | where 14 | Self: Sized, 15 | { 16 | match method { 17 | QuantMethodConfig::Gptq { 18 | bits: _, 19 | use_exllama: _, 20 | q_weight: _, 21 | gptq_qzeros: _, 22 | gptq_scales: _, 23 | g_idx: _, 24 | bias: _, 25 | } => candle_core::bail!("GPTQ is only supported on CUDA."), 26 | QuantMethodConfig::Gguf { .. } 27 | | QuantMethodConfig::Unquantized(_) 28 | | QuantMethodConfig::Hqq { .. } 29 | | QuantMethodConfig::Dummy 30 | | QuantMethodConfig::FP8 { .. } => { 31 | unreachable!() 32 | } 33 | } 34 | } 35 | 36 | fn forward(&self, _a: &Tensor) -> Result { 37 | todo!() 38 | } 39 | 40 | fn quantized_act_type(&self) -> Option { 41 | todo!() 42 | } 43 | 44 | fn add_delta_w(&self, _delta: &Tensor) -> Result> { 45 | todo!() 46 | } 47 | 48 | fn dtype_and_device(&self) -> (DType, candle_core::Device) { 49 | todo!() 50 | } 51 | 52 | fn get_bias_mut(&mut self) -> Option<&mut Tensor> { 53 | todo!() 54 | } 55 | 56 | fn apply_isq( 57 | self: Arc, 58 | _dtype: Option, 59 | _device: Device, 60 | _n_quantized: &AtomicUsize, 61 | ) -> Result> { 62 | todo!() 63 | } 64 | 65 | fn get_max_isq_cpu_threads(&self, _dtype: IsqType) -> Option { 66 | todo!() 67 | } 68 | } 69 | 70 | impl QuantizedSerde for GptqLayer { 71 | fn name(&self) -> &'static str { 72 | "gptq" 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /examples/server/idefics2.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | import httpx 3 | import textwrap 4 | import json 5 | 6 | 7 | def log_response(response: httpx.Response): 8 | request = response.request 9 | print(f"Request: {request.method} {request.url}") 10 | print(" Headers:") 11 | for key, value in request.headers.items(): 12 | if key.lower() == "authorization": 13 | value = "[...]" 14 | if key.lower() == "cookie": 15 | value = value.split("=")[0] + "=..." 16 | print(f" {key}: {value}") 17 | print(" Body:") 18 | try: 19 | request_body = json.loads(request.content) 20 | print(textwrap.indent(json.dumps(request_body, indent=2), " ")) 21 | except json.JSONDecodeError: 22 | print(textwrap.indent(request.content.decode(), " ")) 23 | print(f"Response: status_code={response.status_code}") 24 | print(" Headers:") 25 | for key, value in response.headers.items(): 26 | if key.lower() == "set-cookie": 27 | value = value.split("=")[0] + "=..." 
28 | print(f" {key}: {value}") 29 | 30 | 31 | client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/") 32 | 33 | # Enable this to log requests and responses 34 | # client._client = httpx.Client( 35 | # event_hooks={"request": [print], "response": [log_response]} 36 | # ) 37 | 38 | completion = client.chat.completions.create( 39 | model="idefics2", 40 | messages=[ 41 | { 42 | "role": "user", 43 | "content": [ 44 | { 45 | "type": "image_url", 46 | "image_url": { 47 | "url": "https://d2r55xnwy6nx47.cloudfront.net/uploads/2018/02/Ants_Lede1300.jpg" 48 | }, 49 | }, 50 | { 51 | "type": "text", 52 | "text": "What is shown in this image?", 53 | }, 54 | ], 55 | }, 56 | ], 57 | max_tokens=256, 58 | frequency_penalty=1.0, 59 | top_p=0.1, 60 | temperature=0, 61 | ) 62 | resp = completion.choices[0].message.content 63 | print(resp) 64 | -------------------------------------------------------------------------------- /examples/server/phi3v_local_img.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import httpx 3 | import textwrap 4 | import json 5 | 6 | 7 | def log_response(response: httpx.Response): 8 | request = response.request 9 | print(f"Request: {request.method} {request.url}") 10 | print(" Headers:") 11 | for key, value in request.headers.items(): 12 | if key.lower() == "authorization": 13 | value = "[...]" 14 | if key.lower() == "cookie": 15 | value = value.split("=")[0] + "=..." 16 | print(f" {key}: {value}") 17 | print(" Body:") 18 | try: 19 | request_body = json.loads(request.content) 20 | print(textwrap.indent(json.dumps(request_body, indent=2), " ")) 21 | except json.JSONDecodeError: 22 | print(textwrap.indent(request.content.decode(), " ")) 23 | print(f"Response: status_code={response.status_code}") 24 | print(" Headers:") 25 | for key, value in response.headers.items(): 26 | if key.lower() == "set-cookie": 27 | value = value.split("=")[0] + "=..." 28 | print(f" {key}: {value}") 29 | 30 | 31 | BASE_URL = "http://localhost:1234/v1" 32 | 33 | # Enable this to log requests and responses 34 | # openai.http_client = httpx.Client( 35 | # event_hooks={"request": [print], "response": [log_response]} 36 | # ) 37 | 38 | FILENAME = "picture.jpg" 39 | 40 | headers = { 41 | "Content-Type": "application/json", 42 | } 43 | 44 | payload = { 45 | "model": "phi3v", 46 | "messages": [ 47 | { 48 | "role": "user", 49 | "content": [ 50 | { 51 | "type": "image_url", 52 | "image_url": { 53 | "url": FILENAME, 54 | }, 55 | }, 56 | { 57 | "type": "text", 58 | "text": "<|image_1|>\nWhat is shown in this image? Write a detailed response analyzing the scene.", 59 | }, 60 | ], 61 | } 62 | ], 63 | "max_tokens": 300, 64 | } 65 | 66 | response = requests.post(f"{BASE_URL}/chat/completions", headers=headers, json=payload) 67 | print(response.json()) 68 | -------------------------------------------------------------------------------- /examples/server/llava.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | import httpx 3 | import textwrap 4 | import json 5 | 6 | 7 | def log_response(response: httpx.Response): 8 | request = response.request 9 | print(f"Request: {request.method} {request.url}") 10 | print(" Headers:") 11 | for key, value in request.headers.items(): 12 | if key.lower() == "authorization": 13 | value = "[...]" 14 | if key.lower() == "cookie": 15 | value = value.split("=")[0] + "=..." 
16 | print(f" {key}: {value}") 17 | print(" Body:") 18 | try: 19 | request_body = json.loads(request.content) 20 | print(textwrap.indent(json.dumps(request_body, indent=2), " ")) 21 | except json.JSONDecodeError: 22 | print(textwrap.indent(request.content.decode(), " ")) 23 | print(f"Response: status_code={response.status_code}") 24 | print(" Headers:") 25 | for key, value in response.headers.items(): 26 | if key.lower() == "set-cookie": 27 | value = value.split("=")[0] + "=..." 28 | print(f" {key}: {value}") 29 | 30 | 31 | client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/") 32 | 33 | # Enable this to log requests and responses 34 | # client._client = httpx.Client( 35 | # event_hooks={"request": [print], "response": [log_response]} 36 | # ) 37 | 38 | completion = client.chat.completions.create( 39 | model="llava", 40 | messages=[ 41 | { 42 | "role": "user", 43 | "content": [ 44 | { 45 | "type": "image_url", 46 | "image_url": { 47 | "url": "https://www.nhmagazine.com/content/uploads/2019/05/mtwashingtonFranconia-2-19-18-108-Edit-Edit.jpg" 48 | }, 49 | }, 50 | { 51 | "type": "text", 52 | "text": "What is shown in this image? Write a detailed response analyzing the scene.", 53 | }, 54 | ], 55 | }, 56 | ], 57 | max_tokens=256, 58 | frequency_penalty=1.0, 59 | top_p=0.1, 60 | temperature=0, 61 | ) 62 | resp = completion.choices[0].message.content 63 | print(resp) 64 | -------------------------------------------------------------------------------- /mistralrs-core/src/utils/memory_usage.rs: -------------------------------------------------------------------------------- 1 | use candle_core::{Device, Result}; 2 | use sysinfo::System; 3 | 4 | const KB_TO_BYTES: usize = 1024; 5 | 6 | pub struct MemoryUsage; 7 | 8 | impl MemoryUsage { 9 | /// Amount of available memory in bytes. 10 | pub fn get_memory_available(&self, device: &Device) -> Result { 11 | match device { 12 | Device::Cpu => { 13 | let mut sys = System::new_all(); 14 | sys.refresh_cpu(); 15 | Ok(usize::try_from(sys.free_memory())? * KB_TO_BYTES) 16 | } 17 | #[cfg(feature = "cuda")] 18 | Device::Cuda(_) => { 19 | use candle_core::cuda_backend::WrapErr; 20 | Ok(candle_core::cuda::cudarc::driver::result::mem_get_info() 21 | .w()? 22 | .0) 23 | } 24 | #[cfg(not(feature = "cuda"))] 25 | Device::Cuda(_) => { 26 | candle_core::bail!("Cannot get memory available for CUDA device") 27 | } 28 | Device::Metal(_) => { 29 | candle_core::bail!("Cannot get memory available for Metal device") 30 | } 31 | } 32 | } 33 | 34 | /// Amount of total memory in bytes. 35 | pub fn get_total_memory(&self, device: &Device) -> Result { 36 | match device { 37 | Device::Cpu => { 38 | let mut sys = System::new_all(); 39 | sys.refresh_cpu(); 40 | Ok(usize::try_from(sys.total_memory())? * KB_TO_BYTES) 41 | } 42 | #[cfg(feature = "cuda")] 43 | Device::Cuda(_) => { 44 | use candle_core::cuda_backend::WrapErr; 45 | Ok(candle_core::cuda::cudarc::driver::result::mem_get_info() 46 | .w()? 
47 | .1) 48 | } 49 | #[cfg(not(feature = "cuda"))] 50 | Device::Cuda(_) => { 51 | candle_core::bail!("Cannot get total memory for CUDA device") 52 | } 53 | Device::Metal(_) => { 54 | candle_core::bail!("Cannot get total memory for Metal device") 55 | } 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /examples/server/phi3v.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | import httpx 3 | import textwrap 4 | import json 5 | 6 | 7 | def log_response(response: httpx.Response): 8 | request = response.request 9 | print(f"Request: {request.method} {request.url}") 10 | print(" Headers:") 11 | for key, value in request.headers.items(): 12 | if key.lower() == "authorization": 13 | value = "[...]" 14 | if key.lower() == "cookie": 15 | value = value.split("=")[0] + "=..." 16 | print(f" {key}: {value}") 17 | print(" Body:") 18 | try: 19 | request_body = json.loads(request.content) 20 | print(textwrap.indent(json.dumps(request_body, indent=2), " ")) 21 | except json.JSONDecodeError: 22 | print(textwrap.indent(request.content.decode(), " ")) 23 | print(f"Response: status_code={response.status_code}") 24 | print(" Headers:") 25 | for key, value in response.headers.items(): 26 | if key.lower() == "set-cookie": 27 | value = value.split("=")[0] + "=..." 28 | print(f" {key}: {value}") 29 | 30 | 31 | client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/") 32 | 33 | # Enable this to log requests and responses 34 | # client._client = httpx.Client( 35 | # event_hooks={"request": [print], "response": [log_response]} 36 | # ) 37 | 38 | completion = client.chat.completions.create( 39 | model="phi3v", 40 | messages=[ 41 | { 42 | "role": "user", 43 | "content": [ 44 | { 45 | "type": "image_url", 46 | "image_url": { 47 | "url": "https://www.nhmagazine.com/content/uploads/2019/05/mtwashingtonFranconia-2-19-18-108-Edit-Edit.jpg" 48 | }, 49 | }, 50 | { 51 | "type": "text", 52 | "text": "<|image_1|>\nWhat is shown in this image? Write a detailed response analyzing the scene.", 53 | }, 54 | ], 55 | }, 56 | ], 57 | max_tokens=256, 58 | frequency_penalty=1.0, 59 | top_p=0.1, 60 | temperature=0, 61 | ) 62 | resp = completion.choices[0].message.content 63 | print(resp) 64 | -------------------------------------------------------------------------------- /examples/server/llama_vision.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | import httpx 3 | import textwrap 4 | import json 5 | 6 | 7 | def log_response(response: httpx.Response): 8 | request = response.request 9 | print(f"Request: {request.method} {request.url}") 10 | print(" Headers:") 11 | for key, value in request.headers.items(): 12 | if key.lower() == "authorization": 13 | value = "[...]" 14 | if key.lower() == "cookie": 15 | value = value.split("=")[0] + "=..." 16 | print(f" {key}: {value}") 17 | print(" Body:") 18 | try: 19 | request_body = json.loads(request.content) 20 | print(textwrap.indent(json.dumps(request_body, indent=2), " ")) 21 | except json.JSONDecodeError: 22 | print(textwrap.indent(request.content.decode(), " ")) 23 | print(f"Response: status_code={response.status_code}") 24 | print(" Headers:") 25 | for key, value in response.headers.items(): 26 | if key.lower() == "set-cookie": 27 | value = value.split("=")[0] + "=..." 
28 | print(f" {key}: {value}") 29 | 30 | 31 | client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/") 32 | 33 | # Enable this to log requests and responses 34 | # client._client = httpx.Client( 35 | # event_hooks={"request": [print], "response": [log_response]} 36 | # ) 37 | 38 | completion = client.chat.completions.create( 39 | model="llama-vision", 40 | messages=[ 41 | { 42 | "role": "user", 43 | "content": [ 44 | { 45 | "type": "image_url", 46 | "image_url": { 47 | "url": "https://www.nhmagazine.com/content/uploads/2019/05/mtwashingtonFranconia-2-19-18-108-Edit-Edit.jpg" 48 | }, 49 | }, 50 | { 51 | "type": "text", 52 | "text": "What is shown in this image? Write a detailed response analyzing the scene.", 53 | }, 54 | ], 55 | }, 56 | ], 57 | max_tokens=256, 58 | frequency_penalty=1.0, 59 | top_p=0.1, 60 | temperature=0, 61 | ) 62 | resp = completion.choices[0].message.content 63 | print(resp) 64 | -------------------------------------------------------------------------------- /examples/server/llava_next.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | import httpx 3 | import textwrap 4 | import json 5 | 6 | 7 | def log_response(response: httpx.Response): 8 | request = response.request 9 | print(f"Request: {request.method} {request.url}") 10 | print(" Headers:") 11 | for key, value in request.headers.items(): 12 | if key.lower() == "authorization": 13 | value = "[...]" 14 | if key.lower() == "cookie": 15 | value = value.split("=")[0] + "=..." 16 | print(f" {key}: {value}") 17 | print(" Body:") 18 | try: 19 | request_body = json.loads(request.content) 20 | print(textwrap.indent(json.dumps(request_body, indent=2), " ")) 21 | except json.JSONDecodeError: 22 | print(textwrap.indent(request.content.decode(), " ")) 23 | print(f"Response: status_code={response.status_code}") 24 | print(" Headers:") 25 | for key, value in response.headers.items(): 26 | if key.lower() == "set-cookie": 27 | value = value.split("=")[0] + "=..." 28 | print(f" {key}: {value}") 29 | 30 | 31 | client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/") 32 | 33 | # Enable this to log requests and responses 34 | # client._client = httpx.Client( 35 | # event_hooks={"request": [print], "response": [log_response]} 36 | # ) 37 | 38 | completion = client.chat.completions.create( 39 | model="llava_next", 40 | messages=[ 41 | { 42 | "role": "user", 43 | "content": [ 44 | { 45 | "type": "image_url", 46 | "image_url": { 47 | "url": "https://www.nhmagazine.com/content/uploads/2019/05/mtwashingtonFranconia-2-19-18-108-Edit-Edit.jpg" 48 | }, 49 | }, 50 | { 51 | "type": "text", 52 | "text": "What is shown in this image? 
Write a detailed response analyzing the scene.", 53 | }, 54 | ], 55 | }, 56 | ], 57 | max_tokens=256, 58 | frequency_penalty=1.0, 59 | top_p=0.1, 60 | temperature=0, 61 | ) 62 | resp = completion.choices[0].message.content 63 | print(resp) 64 | -------------------------------------------------------------------------------- /mistralrs/examples/topology/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use mistralrs::{ 3 | IsqType, LayerTopology, PagedAttentionMetaBuilder, TextMessageRole, TextMessages, 4 | TextModelBuilder, Topology, 5 | }; 6 | 7 | #[tokio::main] 8 | async fn main() -> Result<()> { 9 | let model = TextModelBuilder::new("microsoft/Phi-3.5-mini-instruct") 10 | .with_isq(IsqType::Q8_0) 11 | .with_topology( 12 | Topology::empty() 13 | .with_range( 14 | 0..8, 15 | LayerTopology { 16 | isq: Some(IsqType::Q3K), 17 | device: None, 18 | }, 19 | ) 20 | .with_range( 21 | 8..16, 22 | LayerTopology { 23 | isq: Some(IsqType::Q4K), 24 | device: None, 25 | }, 26 | ) 27 | .with_range( 28 | 16..24, 29 | LayerTopology { 30 | isq: Some(IsqType::Q6K), 31 | device: None, 32 | }, 33 | ) 34 | .with_range( 35 | 24..32, 36 | LayerTopology { 37 | isq: Some(IsqType::Q8_0), 38 | device: None, 39 | }, 40 | ), 41 | ) 42 | .with_logging() 43 | .with_paged_attn(|| PagedAttentionMetaBuilder::default().build())? 44 | .build() 45 | .await?; 46 | 47 | let messages = TextMessages::new() 48 | .add_message( 49 | TextMessageRole::System, 50 | "You are an AI agent with a specialty in programming.", 51 | ) 52 | .add_message( 53 | TextMessageRole::User, 54 | "Hello! How are you? Please write generic binary search function in Rust.", 55 | ); 56 | 57 | let response = model.send_chat_request(messages).await?; 58 | 59 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 60 | dbg!( 61 | response.usage.avg_prompt_tok_per_sec, 62 | response.usage.avg_compl_tok_per_sec 63 | ); 64 | 65 | Ok(()) 66 | } 67 | -------------------------------------------------------------------------------- /examples/server/phi3v_base64.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import httpx 3 | import textwrap 4 | import json 5 | import base64 6 | 7 | 8 | def log_response(response: httpx.Response): 9 | request = response.request 10 | print(f"Request: {request.method} {request.url}") 11 | print(" Headers:") 12 | for key, value in request.headers.items(): 13 | if key.lower() == "authorization": 14 | value = "[...]" 15 | if key.lower() == "cookie": 16 | value = value.split("=")[0] + "=..." 17 | print(f" {key}: {value}") 18 | print(" Body:") 19 | try: 20 | request_body = json.loads(request.content) 21 | print(textwrap.indent(json.dumps(request_body, indent=2), " ")) 22 | except json.JSONDecodeError: 23 | print(textwrap.indent(request.content.decode(), " ")) 24 | print(f"Response: status_code={response.status_code}") 25 | print(" Headers:") 26 | for key, value in response.headers.items(): 27 | if key.lower() == "set-cookie": 28 | value = value.split("=")[0] + "=..." 
29 | print(f" {key}: {value}") 30 | 31 | 32 | BASE_URL = "http://localhost:1234/v1" 33 | 34 | # Enable this to log requests and responses 35 | # openai.http_client = httpx.Client( 36 | # event_hooks={"request": [print], "response": [log_response]} 37 | # ) 38 | 39 | FILENAME = "picture.jpg" 40 | with open(FILENAME, "rb") as image_file: 41 | encoded_string = base64.b64encode(image_file.read()).decode("utf-8") 42 | 43 | headers = { 44 | "Content-Type": "application/json", 45 | } 46 | 47 | payload = { 48 | "model": "phi3v", 49 | "messages": [ 50 | { 51 | "role": "user", 52 | "content": [ 53 | { 54 | "type": "image_url", 55 | "image_url": { 56 | "url": str(encoded_string), 57 | }, 58 | }, 59 | { 60 | "type": "text", 61 | "text": "<|image_1|>\nWhat is shown in this image? Write a detailed response analyzing the scene.", 62 | }, 63 | ], 64 | } 65 | ], 66 | "max_tokens": 300, 67 | } 68 | 69 | response = requests.post(f"{BASE_URL}/chat/completions", headers=headers, json=payload) 70 | print(response.json()) 71 | -------------------------------------------------------------------------------- /mistralrs-paged-attn/src/backend/mod.rs: -------------------------------------------------------------------------------- 1 | mod cache; 2 | mod paged_attention; 3 | 4 | use std::{ 5 | marker::PhantomData, 6 | ptr::{addr_of, NonNull}, 7 | }; 8 | 9 | use candle_core::{ 10 | cuda::cudarc::driver::DeviceRepr, cuda_backend::cudarc::driver::CudaFunction, CudaDevice, 11 | DType, Result, 12 | }; 13 | 14 | pub use cache::{copy_blocks, swap_blocks}; 15 | pub use paged_attention::{paged_attention, reshape_and_cache}; 16 | 17 | const COPY_BLOCKS_KERNEL_NAME: &str = "copy_blocks_kernel"; 18 | 19 | pub fn get_or_load_func( 20 | ptx_file: &'static str, 21 | kernel_base: &str, 22 | dtype: DType, 23 | suffix: Option<&str>, 24 | device: &CudaDevice, 25 | ) -> Result { 26 | let spec = match dtype { 27 | DType::U8 => "_u8", 28 | DType::U32 => "_u32", 29 | DType::I16 => "_i16", 30 | DType::I32 => "_i32", 31 | DType::I64 => "_i64", 32 | DType::BF16 => "_bf16", 33 | DType::F16 => "_f16", 34 | DType::F32 => "_f32", 35 | DType::F64 => "_f64", 36 | DType::F8E4M3 => "_f8_e4m3", 37 | }; 38 | let spec = if let Some(suffix) = suffix { 39 | spec.to_owned() + suffix 40 | } else { 41 | spec.to_owned() 42 | }; 43 | let kernel = kernel_base.to_owned() + &spec; 44 | device.get_or_load_func(&kernel, ptx_file) 45 | } 46 | 47 | #[repr(transparent)] 48 | struct Conjoined<'a, T, R> { 49 | raw: *mut T, 50 | _ref: PhantomData<&'a mut R>, 51 | } 52 | 53 | impl<'a, T, R> Conjoined<'a, T, R> { 54 | fn new(raw: NonNull, _ref: &'a mut R) -> Self { 55 | Self { 56 | raw: raw.as_ptr(), 57 | _ref: PhantomData, 58 | } 59 | } 60 | } 61 | 62 | /// According to the docs: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1gb8f3dc3031b40da29d5f9a7139e52e15 63 | /// Each of the kernel params (*mut c_void) "must point to a region of memory from which the actual kernel parameter will be copied". 64 | /// This means that we must return a pointer to our pointer. 65 | /// 66 | /// ## Safety 67 | /// - The returned pointer **must not** outlive the &self reference. Otherwise, a dangling pointer is created. 
68 | unsafe impl<'a, T, R> DeviceRepr for Conjoined<'a, T, R> { 69 | fn as_kernel_param(&self) -> *mut std::ffi::c_void { 70 | addr_of!(self.raw) as *mut _ 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /docs/CHAT_TOK.md: -------------------------------------------------------------------------------- 1 | # Chat templates and tokenizer customization 2 | 3 | ## Chat templates 4 | Mistral.rs attempts to automatically load a chat template from the `tokenizer_config.json` file. This enables high flexibility across instruction-tuned models and ensures accurate chat templating. However, if the `chat_template` field is missing, then a JINJA chat template should be provided. The JINJA chat template may use `messages`, `add_generation_prompt`, `bos_token`, `eos_token`, and `unk_token` as inputs. 5 | 6 | We provide some chat templates [here](../chat_templates/), and it is easy to modify or create others to customize chat template behavior. 7 | 8 | For example, to use the `chatml` template, specify `--chat-template` *before* the model architecture: 9 | 10 | ```bash 11 | ./mistralrs-server --port 1234 --log output.log --chat-template ./chat_templates/chatml.json llama 12 | ``` 13 | 14 | > Note: For GGUF models, the chat template may be loaded directly from the GGUF file by omitting any other chat template sources. 15 | 16 | ## Tokenizer 17 | 18 | Some models do not provide a `tokenizer.json` file although mistral.rs expects one. To solve this, please run [this](../scripts/get_tokenizers_json.py) script. It will output the `tokenizer.json` file for your specific model. Pass the resulting file with the `--tokenizer-json` flag *after* the model architecture. For example: 19 | 20 | ```bash 21 | $ python3 scripts/get_tokenizers_json.py 22 | Enter model ID: microsoft/Orca-2-13b 23 | $ ./mistralrs-server --port 1234 --log output.log plain -m microsoft/Orca-2-13b --tokenizer-json tokenizer.json 24 | ``` 25 | 26 | Putting it all together, to run, for example, an [Orca](https://huggingface.co/microsoft/Orca-2-13b) model (which does not come with a `tokenizer.json` or chat template): 27 | 1) Generate the `tokenizer.json` by running the script at `scripts/get_tokenizers_json.py`. This will output some files, including `tokenizer.json`, in the working directory. 28 | 2) Find and copy the correct chat template from `chat_templates` to the working directory (e.g., `cp chat_templates/chatml.json .`). 29 | 3) Run `mistralrs-server`, specifying the tokenizer and chat template: `cargo run --release --features cuda -- --port 1234 --log output.txt --chat-template chatml.json plain -m microsoft/Orca-2-13b -t tokenizer.json -a llama` 30 | 31 | > Note: For GGUF models, the tokenizer may be loaded directly from the GGUF file by omitting the tokenizer model ID. 32 | --------------------------------------------------------------------------------