├── mistralrs-core ├── src │ ├── cuda │ │ └── mod.rs │ ├── diffusion_models │ │ ├── clip │ │ │ └── mod.rs │ │ ├── flux │ │ │ └── mod.rs │ │ └── mod.rs │ ├── paged_attention │ │ ├── layers │ │ │ └── mod.rs │ │ ├── block_engine_sequence.rs │ │ └── config.rs │ ├── dummy_paged_attention │ │ ├── layers │ │ │ └── mod.rs │ │ ├── block_engine_sequence.rs │ │ ├── config.rs │ │ └── cache_engine.rs │ ├── aici │ │ ├── README.md │ │ ├── mod.rs │ │ └── bytes.rs │ ├── vision_models │ │ ├── llava │ │ │ └── mod.rs │ │ ├── processor_config.rs │ │ ├── mod.rs │ │ ├── image_processor.rs │ │ └── preprocessor_config.rs │ ├── layers_utils.rs │ ├── models │ │ └── mod.rs │ ├── utils │ │ ├── log.rs │ │ ├── debug.rs │ │ ├── tokens.rs │ │ ├── tokenizer.rs │ │ └── memory_usage.rs │ ├── tools │ │ ├── response.rs │ │ └── request.rs │ ├── gguf │ │ ├── chat_template.rs │ │ └── mod.rs │ ├── amoe │ │ ├── macros.rs │ │ └── inputs.rs │ ├── xlora_models │ │ └── config.rs │ └── scheduler │ │ └── mod.rs └── README.md ├── .gitignore ├── mistralrs-paged-attn ├── README.md ├── src │ ├── attention │ │ ├── attention_dtypes.h │ │ ├── attention_generic.cuh │ │ └── attention_utils.cuh │ ├── lib.rs │ ├── cuda_compat.h │ ├── ffi.rs │ └── backend │ │ └── mod.rs └── Cargo.toml ├── .dockerignore ├── mistralrs-pyo3 ├── build.rs ├── pyproject.toml ├── pyproject_template.toml ├── .gitignore ├── Cargo_template.toml ├── Cargo.toml └── src │ └── stream.rs ├── toml-selectors ├── plain.toml ├── lora.toml ├── xlora.toml ├── gguf.toml ├── speculative-gguf.toml ├── speculative-same-gguf.toml ├── anymoe.toml └── anymoe_lora.toml ├── topologies ├── isq.yml └── isq_and_device.yml ├── mistralrs-server ├── resources │ ├── rust-logo-32x32.png │ └── LICENSE.md └── Cargo.toml ├── examples ├── README.md ├── server │ ├── flux.py │ ├── streaming_completion.py │ ├── yacc.py │ ├── streaming.py │ ├── stream_completion_bench.py │ ├── regex.py │ ├── completion.py │ ├── chat.py │ ├── adapter_chat.py │ ├── idefics2.py │ ├── phi3v_local_img.py │ ├── llava.py │ ├── phi3v.py │ ├── llama_vision.py │ ├── llava_next.py │ └── phi3v_base64.py ├── python │ ├── flux.py │ ├── plain.py │ ├── isq.py │ ├── paged_attention.py │ ├── mixture_of_quant_experts.py │ ├── topology.py │ ├── gguf.py │ ├── streaming.py │ ├── xlora_gemma.py │ ├── token_source.py │ ├── lora_activation.py │ ├── xlora_zephyr.py │ ├── speculative.py │ ├── lora_zephyr.py │ ├── idefics2.py │ ├── speculative_xlora.py │ ├── phi3v_local_img.py │ ├── anymoe.py │ ├── phi3v.py │ ├── llava_next.py │ ├── anymoe_inference.py │ ├── anymoe_lora.py │ ├── phi3v_base64.py │ └── llama_vision.py └── amoe.json ├── orderings └── lora-paper-ordering.json ├── scripts ├── get_tokenizers_json.py ├── testgen_text.py ├── set_names.py ├── lora_add_preload_adapters.py ├── testgen_vision.py └── create_ordering.py ├── mistralrs-vision ├── README.md ├── Cargo.toml ├── src │ ├── utils.rs │ └── ops.rs └── tests │ └── integration.rs ├── .github ├── ISSUE_TEMPLATE │ ├── feature_request.md │ ├── bug_report.md │ └── build_failure.md └── workflows │ ├── release_python.yml │ ├── docs.yml │ └── analysis.yaml ├── chat_templates ├── chatml.json ├── phi3.json ├── llama3.json ├── phi3.5.json ├── mistral.json ├── llama2.json ├── default.json └── vicuna.json ├── .typos.toml ├── mistralrs-quant ├── src │ ├── gptq │ │ ├── mod.rs │ │ ├── ffi.rs │ │ └── gptq_cpu.rs │ ├── utils │ │ ├── ffi.rs │ │ └── mod.rs │ ├── dummy │ │ └── mod.rs │ └── hqq │ │ └── ffi.rs ├── README.md ├── kernels │ └── gptq │ │ ├── qdq_8.cuh │ │ ├── qdq_util.cuh │ │ └── compat.cuh └── Cargo.toml 
├── docs ├── IMAGEGEN_MODELS.md ├── SAMPLING.md ├── TOOL_CALLING.md ├── VISION_MODELS.md ├── LORA_XLORA.md ├── README.md ├── DEVICE_MAPPING.md ├── QUANTS.md ├── NON_GRANULAR.md ├── GEMMA2.md ├── ISQ.md └── CHAT_TOK.md ├── .cargo └── config.toml ├── mistralrs ├── README.md └── examples │ ├── gemma2 │ └── main.rs │ ├── grammar │ └── main.rs │ ├── flux │ └── main.rs │ ├── xlora │ └── main.rs │ ├── gguf │ └── main.rs │ ├── phi3_5_moe │ └── main.rs │ ├── lora │ └── main.rs │ ├── mixture_of_quant_experts │ └── main.rs │ ├── idefics2 │ └── main.rs │ ├── llava_next │ └── main.rs │ ├── phi3v │ └── main.rs │ ├── llava │ └── main.rs │ ├── paged_attn │ └── main.rs │ ├── llama_vision │ └── main.rs │ ├── isq │ └── main.rs │ ├── custom_logits_processor │ └── main.rs │ ├── simple │ └── main.rs │ ├── anymoe_lora │ └── main.rs │ ├── lora_activation │ └── main.rs │ ├── anymoe │ └── main.rs │ ├── batching │ └── main.rs │ ├── gguf_locally │ └── main.rs │ └── topology │ └── main.rs ├── .gitattributes ├── mistralrs-bench ├── Cargo.toml └── README.md ├── LICENSE ├── Dockerfile ├── Cargo.toml └── Dockerfile.cuda-all /mistralrs-core/src/cuda/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod ffi; 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | .ruff_cache 3 | .vscode 4 | *.a -------------------------------------------------------------------------------- /mistralrs-paged-attn/README.md: -------------------------------------------------------------------------------- 1 | # mistralrs-paged-attn -------------------------------------------------------------------------------- /mistralrs-core/src/diffusion_models/clip/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod text; 2 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | docs 2 | examples 3 | orderings 4 | scripts 5 | target 6 | -------------------------------------------------------------------------------- /mistralrs-pyo3/build.rs: -------------------------------------------------------------------------------- 1 | fn main() { 2 | pyo3_build_config::add_extension_module_link_args(); 3 | } 4 | -------------------------------------------------------------------------------- /toml-selectors/plain.toml: -------------------------------------------------------------------------------- 1 | [model] 2 | model_id = "mistralai/Mistral-7B-Instruct-v0.1" 3 | arch = "mistral" -------------------------------------------------------------------------------- /mistralrs-core/src/paged_attention/layers/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod paged_attention; 2 | 3 | pub use paged_attention::PagedAttention; 4 | -------------------------------------------------------------------------------- /toml-selectors/lora.toml: -------------------------------------------------------------------------------- 1 | [model] 2 | adapters_model_id = "lamm-mit/x-lora" 3 | order = "ordering-file.json" 4 | arch = "mistral" -------------------------------------------------------------------------------- /toml-selectors/xlora.toml: -------------------------------------------------------------------------------- 1 | [model] 2 | xlora_model_id = "lamm-mit/x-lora" 3 | order = 
"ordering-file.json" 4 | arch = "mistral" -------------------------------------------------------------------------------- /mistralrs-core/src/dummy_paged_attention/layers/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod paged_attention; 2 | 3 | pub use paged_attention::PagedAttention; 4 | -------------------------------------------------------------------------------- /topologies/isq.yml: -------------------------------------------------------------------------------- 1 | 0-8: 2 | isq: Q3K 3 | 8-16: 4 | isq: Q4K 5 | 16-24: 6 | isq: Q6K 7 | # Skip 24-28 8 | 28-32: 9 | isq: Q8_0 -------------------------------------------------------------------------------- /mistralrs-core/src/diffusion_models/flux/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod autoencoder; 2 | pub mod model; 3 | pub mod sampling; 4 | pub mod stepper; 5 | -------------------------------------------------------------------------------- /mistralrs-server/resources/rust-logo-32x32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLukas22/mistral.rs/master/mistralrs-server/resources/rust-logo-32x32.png -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | - Python: [examples here](python) 3 | - HTTP Server: [examples here](server) 4 | - Rust: [examples here](../mistralrs/examples/) -------------------------------------------------------------------------------- /mistralrs-core/src/aici/README.md: -------------------------------------------------------------------------------- 1 | // Originally from https://github.com/microsoft/aici/blob/64f0b551dee49e320e9b3b92289f3d6f2e888276 2 | // Licensed under the MIT license 3 | -------------------------------------------------------------------------------- /orderings/lora-paper-ordering.json: -------------------------------------------------------------------------------- 1 | {"base_model_id": "HuggingFaceH4/zephyr-7b-beta", "order": ["adapter_1"], "preload_adapters": [{"name":"adapter_2","adapter_model_id":"lamm-mit/x-lora"}]} -------------------------------------------------------------------------------- /scripts/get_tokenizers_json.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer 2 | 3 | model = input("Enter model ID: ") 4 | tok = AutoTokenizer.from_pretrained(model) 5 | tok.save_pretrained(".") 6 | -------------------------------------------------------------------------------- /mistralrs-core/README.md: -------------------------------------------------------------------------------- 1 | # `mistralrs-core` 2 | 3 | Core crate of `mistral.rs` including the models and associated executors. 
4 | 5 | Documentation: https://ericlbuehler.github.io/mistral.rs/mistralrs/ 6 | -------------------------------------------------------------------------------- /mistralrs-paged-attn/src/attention/attention_dtypes.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "attention_generic.cuh" 4 | #include "dtype_float16.cuh" 5 | #include "dtype_float32.cuh" 6 | #include "dtype_bfloat16.cuh" 7 | -------------------------------------------------------------------------------- /toml-selectors/gguf.toml: -------------------------------------------------------------------------------- 1 | [model] 2 | tok_model_id = "mistralai/Mistral-7B-Instruct-v0.1" 3 | quantized_model_id = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF" 4 | quantized_filename = "mistral-7b-instruct-v0.1.Q4_K_M.gguf" -------------------------------------------------------------------------------- /topologies/isq_and_device.yml: -------------------------------------------------------------------------------- 1 | 0-8: 2 | isq: Q3K 3 | device: cuda[0] 4 | 8-16: 5 | isq: Q4K 6 | device: cpu 7 | 16-24: 8 | isq: Q6K 9 | # Skip 24-28 10 | 28-32: 11 | isq: Q8_0 12 | device: cuda[0] -------------------------------------------------------------------------------- /mistralrs-core/src/vision_models/llava/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod config; 2 | pub mod llava15; 3 | pub mod llava_inputs_processor; 4 | pub mod llava_llm; 5 | pub mod llava_next; 6 | pub mod llava_next_inputs_processor; 7 | mod utils; 8 | -------------------------------------------------------------------------------- /mistralrs-vision/README.md: -------------------------------------------------------------------------------- 1 | # `mistralrs-vision` 2 | 3 | This crate provides vision utilities for mistral.rs inspired by torchvision. 
4 | 5 | Documentation: https://ericlbuehler.github.io/mistral.rs/mistralrs_vision/index.html -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Feature request, such as new models or new technologies 4 | title: '' 5 | labels: ["new feature"] 6 | assignees: '' 7 | 8 | --- 9 | 10 | -------------------------------------------------------------------------------- /mistralrs-core/src/paged_attention/block_engine_sequence.rs: -------------------------------------------------------------------------------- 1 | pub trait BlockEngineSequence { 2 | fn blocks_to_add_new_tok(&self) -> usize; 3 | fn get_id(&self) -> usize; 4 | fn get_logical_token_blocks(&self) -> usize; 5 | } 6 | -------------------------------------------------------------------------------- /mistralrs-core/src/dummy_paged_attention/block_engine_sequence.rs: -------------------------------------------------------------------------------- 1 | pub trait BlockEngineSequence { 2 | fn blocks_to_add_new_tok(&self) -> usize; 3 | fn get_id(&self) -> usize; 4 | fn get_logical_token_blocks(&self) -> usize; 5 | } 6 | -------------------------------------------------------------------------------- /chat_templates/chatml.json: -------------------------------------------------------------------------------- 1 | { 2 | "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" 3 | } -------------------------------------------------------------------------------- /mistralrs-core/src/vision_models/processor_config.rs: -------------------------------------------------------------------------------- 1 | use serde::Deserialize; 2 | 3 | #[allow(dead_code)] 4 | #[derive(Deserialize, Debug)] 5 | pub struct ProcessorConfig { 6 | pub(crate) chat_template: Option, 7 | pub(crate) image_seq_len: Option, 8 | } 9 | -------------------------------------------------------------------------------- /.typos.toml: -------------------------------------------------------------------------------- 1 | [default] 2 | extend-ignore-identifiers-re = [ 3 | "Mmaped", 4 | "mmaped", 5 | "arange", 6 | "Nd", 7 | "nin" 8 | ] 9 | 10 | [files] 11 | extend-exclude = [ 12 | "mistralrs-pyo3/pdoc/*", 13 | "examples/server/phi3_duckduckgo_mistral.rs.ipynb" 14 | ] -------------------------------------------------------------------------------- /mistralrs-quant/src/gptq/mod.rs: -------------------------------------------------------------------------------- 1 | #[cfg(feature = "cuda")] 2 | mod ffi; 3 | #[cfg(not(feature = "cuda"))] 4 | mod gptq_cpu; 5 | #[cfg(feature = "cuda")] 6 | mod gptq_cuda; 7 | 8 | #[cfg(not(feature = "cuda"))] 9 | pub use gptq_cpu::GptqLayer; 10 | #[cfg(feature = "cuda")] 11 | pub use gptq_cuda::GptqLayer; 12 | -------------------------------------------------------------------------------- /mistralrs-server/resources/LICENSE.md: -------------------------------------------------------------------------------- 1 | [Rust Logo](https://www.rust-lang.org/logos/rust-logo-32x32.png)(`rust-logo-32x32.png`) by Rust Foundation is licensed under 2 | [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/?ref=chooser-v1). 3 | 4 | This project is not affiliated with or endorsed by the Rust Foundation. 
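The chat templates shipped in `chat_templates/` (such as `chat_templates/chatml.json` above) each hold a Jinja-style template under the `chat_template` key. As a minimal sketch, not a file from this repository, the following renders the ChatML template with the `jinja2` package so the resulting prompt string can be inspected; the example messages are invented, and mistral.rs applies these templates internally rather than through this code.

```python
# Illustration only: render chat_templates/chatml.json with jinja2 to see the
# prompt string it produces. The messages below are invented examples.
import json

from jinja2 import Template

with open("chat_templates/chatml.json") as f:
    template_str = json.load(f)["chat_template"]

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]

# The template loops over `messages` and optionally appends the assistant
# generation prompt when `add_generation_prompt` is true.
prompt = Template(template_str).render(messages=messages, add_generation_prompt=True)
print(prompt)
# Expected output:
# <|im_start|>system
# You are a helpful assistant.<|im_end|>
# <|im_start|>user
# Hello!<|im_end|>
# <|im_start|>assistant
```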
-------------------------------------------------------------------------------- /docs/IMAGEGEN_MODELS.md: -------------------------------------------------------------------------------- 1 | # Image generation model support in mistral.rs 2 | 3 | Mistral.rs supports various modalities of models, including image generation models. Image generation models take text as input and generate images. 4 | 5 | Please see docs for the following model types: 6 | 7 | - FLUX.1 [FLUX.md](FLUX.md) 8 | -------------------------------------------------------------------------------- /chat_templates/phi3.json: -------------------------------------------------------------------------------- 1 | { 2 | "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}" 3 | } -------------------------------------------------------------------------------- /examples/server/flux.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | 3 | client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/") 4 | 5 | result = client.images.generate( 6 | model="flux", 7 | prompt="A vibrant sunset in the mountains, 4k, high quality.", 8 | n=1, 9 | ) 10 | print(result.data[0].url) 11 | -------------------------------------------------------------------------------- /mistralrs-core/src/aici/mod.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)] 2 | 3 | pub(crate) mod bintokens; 4 | pub(crate) mod bytes; 5 | pub(crate) mod cfg; 6 | pub(crate) mod lex; 7 | pub(crate) mod recognizer; 8 | pub(crate) mod rx; 9 | pub(crate) mod svob; 10 | pub(crate) mod toktree; 11 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Report a bug. 4 | title: '' 5 | labels: ["bug"] 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Describe the bug 11 | A clear and concise description of what the bug is. 12 | 13 | ## Latest commit or version 14 | Which commit or version you ran with. 15 | 16 | -------------------------------------------------------------------------------- /.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [target.x86_64-unknown-linux-gnu] 2 | rustflags = ["-C", "target-cpu=native"] 3 | 4 | [target.aarch64-apple-darwin] 5 | [build] 6 | rustflags = ["-C", "target-cpu=native"] 7 | 8 | [target.wasm32-unknown-unknown] 9 | rustflags = ["-C", "target-feature=+simd128"] 10 | 11 | [target.x86_64-apple-darwin] 12 | rustflags = ["-C", "target-feature=-avx,-avx2"] -------------------------------------------------------------------------------- /docs/SAMPLING.md: -------------------------------------------------------------------------------- 1 | # Sampling and penalty techniques in mistral.rs 2 | 3 | We currently support the following sampling and penalty techniques in mistral.rs: 4 | 5 | - Top K 6 | - Top P 7 | - Min P 8 | - [Dry Penalty](https://github.com/oobabooga/text-generation-webui/pull/5677) 9 | - Frequency Penalty 10 | - Presence Penalty 11 | 12 | Please suggest more by raising an issue! 
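To make the sampling and penalty options listed in `docs/SAMPLING.md` concrete, here is a minimal sketch of how several of them are set per request through the Python API, mirroring `examples/python/plain.py` found later in this repository. Only fields that already appear in those examples are used (`max_tokens`, `presence_penalty`, `top_p`, `temperature`); Top-K, Min-P, and the Dry penalty are exposed through analogous request fields whose exact names should be checked against the Python API documentation.

```python
# Minimal sketch: per-request sampling/penalty settings via the Python API.
# Field names below are taken from the repository's own examples.
from mistralrs import Runner, Which, ChatCompletionRequest, Architecture

runner = Runner(
    which=Which.Plain(
        model_id="mistralai/Mistral-7B-Instruct-v0.1",
        arch=Architecture.Mistral,
    ),
)

res = runner.send_chat_completion_request(
    ChatCompletionRequest(
        model="mistral",
        messages=[{"role": "user", "content": "Write a haiku about Rust."}],
        max_tokens=64,
        presence_penalty=1.0,  # discourage tokens that have already appeared
        top_p=0.1,  # nucleus (Top-P) sampling cutoff
        temperature=0.1,  # lower values make sampling more deterministic
    )
)
print(res.choices[0].message.content)
```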
-------------------------------------------------------------------------------- /toml-selectors/speculative-gguf.toml: -------------------------------------------------------------------------------- 1 | [model] 2 | model_id = "mistralai/Mistral-7B-Instruct-v0.1" 3 | arch = "mistral" 4 | 5 | [speculative] 6 | gamma = 32 7 | 8 | [speculative.draft_model] 9 | tok_model_id = "mistralai/Mistral-7B-Instruct-v0.1" 10 | quantized_model_id = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF" 11 | quantized_filename = "mistral-7b-instruct-v0.1.Q2_K.gguf" -------------------------------------------------------------------------------- /mistralrs-core/src/layers_utils.rs: -------------------------------------------------------------------------------- 1 | use candle_core::{Result, Tensor}; 2 | 3 | pub fn repeat_kv(x: Tensor, n_rep: usize) -> Result<Tensor> { 4 | if n_rep == 1 { 5 | Ok(x) 6 | } else { 7 | let (b_sz, n_kv_head, seq_len, head_dim) = x.dims4()?; 8 | Tensor::cat(&vec![&x; n_rep], 2)?.reshape((b_sz, n_kv_head * n_rep, seq_len, head_dim)) 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /mistralrs/README.md: -------------------------------------------------------------------------------- 1 | # mistral.rs Rust API: `mistralrs` 2 | [![Documentation](https://github.com/EricLBuehler/mistral.rs/actions/workflows/docs.yml/badge.svg)](https://ericlbuehler.github.io/mistral.rs/mistralrs/) 3 | 4 | Mistral.rs provides a convenient Rust multithreaded/async API. To install, add `mistralrs = { git = "https://github.com/EricLBuehler/mistral.rs.git" }` to your Cargo.toml file. 5 | 6 | Examples can be found [here](examples). -------------------------------------------------------------------------------- /chat_templates/llama3.json: -------------------------------------------------------------------------------- 1 | { 2 | "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" 3 | } -------------------------------------------------------------------------------- /mistralrs-core/src/models/mod.rs: -------------------------------------------------------------------------------- 1 | pub(crate) mod gemma; 2 | pub(crate) mod gemma2; 3 | pub(crate) mod llama; 4 | pub(crate) mod mistral; 5 | pub(crate) mod mixtral; 6 | pub(crate) mod phi2; 7 | pub(crate) mod phi3; 8 | pub(crate) mod phi3_5_moe; 9 | pub(crate) mod quantized_llama; 10 | pub(crate) mod quantized_phi2; 11 | pub(crate) mod quantized_phi3; 12 | pub(crate) mod quantized_starcoder2; 13 | pub(crate) mod qwen2; 14 | pub(crate) mod starcoder2; 15 | -------------------------------------------------------------------------------- /examples/server/streaming_completion.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from openai import OpenAI 3 | 4 | client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/") 5 | 6 | 7 | response = client.completions.create( 8 | model="mistral", 9 | prompt="My favorite theorem is", 10 | max_tokens=32, 11 | stream=True, 12 | ) 13 | for chunk in response: 14 | delta = chunk.choices[0].text 15 | print(delta, end="") 16 | sys.stdout.flush() 17 |
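A small variation on `examples/server/streaming_completion.py` above (a sketch, not a file from this repository): it collects the streamed chunks into one string and reports a rough client-side rate, using the chunk count as an approximation of the token count.

```python
# Sketch: accumulate the streamed completion and estimate chunks/second.
import time

from openai import OpenAI

client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/")

start = time.time()
chunks = []
response = client.completions.create(
    model="mistral",
    prompt="My favorite theorem is",
    max_tokens=32,
    stream=True,
)
for chunk in response:
    # Each streamed chunk carries a text delta for the first (only) choice.
    chunks.append(chunk.choices[0].text)
elapsed = time.time() - start

print("".join(chunks))
print(f"~{len(chunks) / elapsed:.1f} chunks/s over {elapsed:.2f}s")
```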
-------------------------------------------------------------------------------- /mistralrs-vision/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "mistralrs-vision" 3 | readme = "README.md" 4 | authors = ["Eric Buehler"] 5 | version.workspace = true 6 | edition.workspace = true 7 | description.workspace = true 8 | repository.workspace = true 9 | keywords.workspace = true 10 | categories.workspace = true 11 | license.workspace = true 12 | homepage.workspace = true 13 | 14 | [dependencies] 15 | candle-core.workspace = true 16 | image.workspace = true 17 | -------------------------------------------------------------------------------- /chat_templates/phi3.5.json: -------------------------------------------------------------------------------- 1 | { 2 | "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}" 3 | } -------------------------------------------------------------------------------- /toml-selectors/speculative-same-gguf.toml: -------------------------------------------------------------------------------- 1 | [model] 2 | tok_model_id = "mistralai/Mistral-7B-Instruct-v0.1" 3 | quantized_model_id = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF" 4 | quantized_filename = "mistral-7b-instruct-v0.1.Q2_K.gguf" 5 | 6 | [speculative] 7 | gamma = 32 8 | 9 | [speculative.draft_model] 10 | tok_model_id = "mistralai/Mistral-7B-Instruct-v0.1" 11 | quantized_model_id = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF" 12 | quantized_filename = "mistral-7b-instruct-v0.1.Q2_K.gguf" -------------------------------------------------------------------------------- /toml-selectors/anymoe.toml: -------------------------------------------------------------------------------- 1 | [model] 2 | model_id = "mistralai/Mistral-7B-Instruct-v0.1" 3 | arch = "mistral" 4 | 5 | [anymoe] 6 | dataset_json = "examples/amoe.json" 7 | prefix = "model.layers" 8 | mlp = "mlp" 9 | model_ids = ["HuggingFaceH4/zephyr-7b-beta"] 10 | layers = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] 11 | 12 | [anymoe.config] 13 | hidden_size = 4096 14 | epochs = 25 15 | expert_type = "fine_tuned" 16 | gate_model_id = "saved_gate" 17 | loss_csv_path = "loss.csv" 18 | -------------------------------------------------------------------------------- /mistralrs-quant/README.md: -------------------------------------------------------------------------------- 1 | # `mistralrs-quant` 2 | 3 | Quantization techniques for mistral.rs. This implements a common trait for all quantization methods to implement for ease of extension and development. 
4 | 5 | Currently supported: 6 | - GGUF: `GgufMatMul` 7 | - Gptq: `GptqLayer` 8 | - Hqq: `HqqLayer` 9 | - Unquantized (used for ISQ): `UnquantLinear` 10 | 11 | Some kernels are copied or based on implementations in: 12 | - https://github.com/vllm-project/vllm 13 | - https://github.com/mobiusml/hqq 14 | -------------------------------------------------------------------------------- /toml-selectors/anymoe_lora.toml: -------------------------------------------------------------------------------- 1 | [model] 2 | model_id = "mistralai/Mistral-7B-Instruct-v0.1" 3 | arch = "mistral" 4 | 5 | [anymoe] 6 | dataset_json = "examples/amoe.json" 7 | prefix = "model.layers" 8 | mlp = "mlp" 9 | model_ids = ["typeof/zephyr-7b-beta-lora"] 10 | 11 | [anymoe.config] 12 | hidden_size = 4096 13 | epochs = 25 14 | gate_model_id = "saved_gate" 15 | loss_csv_path = "loss.csv" 16 | 17 | [anymoe.config.expert_type.lora_adapter] 18 | rank = 64 19 | alpha = 16 20 | target_modules = ["gate_proj"] 21 | -------------------------------------------------------------------------------- /chat_templates/mistral.json: -------------------------------------------------------------------------------- 1 | { 2 | "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" 3 | } -------------------------------------------------------------------------------- /examples/python/flux.py: -------------------------------------------------------------------------------- 1 | from mistralrs import ( 2 | Runner, 3 | Which, 4 | DiffusionArchitecture, 5 | ImageGenerationResponseFormat, 6 | ) 7 | 8 | runner = Runner( 9 | which=Which.DiffusionPlain( 10 | model_id="mistralai/Mistral-7B-Instruct-v0.1", 11 | arch=DiffusionArchitecture.FluxOffloaded, 12 | ), 13 | ) 14 | 15 | res = runner.generate_image( 16 | "A vibrant sunset in the mountains, 4k, high quality.", 17 | ImageGenerationResponseFormat.Url, 18 | ) 19 | print(res.choices[0].url) 20 | -------------------------------------------------------------------------------- /scripts/testgen_text.py: -------------------------------------------------------------------------------- 1 | import transformers 2 | 3 | tokenizer = transformers.AutoTokenizer.from_pretrained( 4 | "mistralai/Mistral-7B-Instruct-v0.3" 5 | ) 6 | 7 | res = tokenizer.apply_chat_template( 8 | [ 9 | {"role": "user", "content": "Hello"}, 10 | {"role": "assistant", "content": "Hi there"}, 11 | {"role": "user", "content": "Who are you"}, 12 | {"role": "assistant", "content": " I am an assistant "}, 13 | {"role": "user", "content": "Another question"}, 14 | ], 15 | add_generation_prompt=True, 16 | tokenize=False, 17 | ) 18 | print(res.replace("\n", "\\n")) 19 | -------------------------------------------------------------------------------- /mistralrs-pyo3/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["maturin==1.4"] 3 | build-backend = "maturin" 4 | 5 | [project] 6 | name = "mistralrs" 7 | version = "0.3.1" 8 | requires-python = ">=3.8" 9 | classifiers = [ 10 | "Programming Language :: Rust", 11 | "Programming Language :: 
Python :: Implementation :: CPython", 12 | "Programming Language :: Python :: Implementation :: PyPy", 13 | "License :: OSI Approved :: MIT License", 14 | "Programming Language :: Python :: 3.8", 15 | "Programming Language :: Rust", 16 | ] 17 | dynamic = ["description"] 18 | 19 | [tool.maturin] 20 | features = ["pyo3/extension-module"] 21 | profile = "release" 22 | -------------------------------------------------------------------------------- /mistralrs-paged-attn/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "mistralrs-paged-attn" 3 | readme = "README.md" 4 | authors = ["Eric Buehler"] 5 | version.workspace = true 6 | edition.workspace = true 7 | description.workspace = true 8 | repository.workspace = true 9 | keywords.workspace = true 10 | categories.workspace = true 11 | license.workspace = true 12 | homepage.workspace = true 13 | 14 | [dependencies] 15 | candle-core.workspace = true 16 | half.workspace = true 17 | float8.workspace = true 18 | 19 | [build-dependencies] 20 | bindgen_cuda = {git = "https://github.com/guoqingbao/bindgen_cuda.git", version = "0.1.6"} 21 | anyhow.workspace = true 22 | 23 | [features] 24 | cuda = [] -------------------------------------------------------------------------------- /mistralrs-pyo3/pyproject_template.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["maturin==1.4"] 3 | build-backend = "maturin" 4 | 5 | [project] 6 | name = "$name" 7 | version = "0.3.1" 8 | requires-python = ">=3.8" 9 | classifiers = [ 10 | "Programming Language :: Rust", 11 | "Programming Language :: Python :: Implementation :: CPython", 12 | "Programming Language :: Python :: Implementation :: PyPy", 13 | "License :: OSI Approved :: MIT License", 14 | "Programming Language :: Python :: 3.8", 15 | "Programming Language :: Rust", 16 | ] 17 | dynamic = ["description"] 18 | 19 | [tool.maturin] 20 | features = ["pyo3/extension-module"] 21 | profile = "release" 22 | -------------------------------------------------------------------------------- /examples/python/plain.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest, Architecture 2 | 3 | runner = Runner( 4 | which=Which.Plain( 5 | model_id="mistralai/Mistral-7B-Instruct-v0.1", 6 | arch=Architecture.Mistral, 7 | ), 8 | ) 9 | 10 | res = runner.send_chat_completion_request( 11 | ChatCompletionRequest( 12 | model="mistral", 13 | messages=[ 14 | {"role": "user", "content": "Tell me a story about the Rust type system."} 15 | ], 16 | max_tokens=256, 17 | presence_penalty=1.0, 18 | top_p=0.1, 19 | temperature=0.1, 20 | ) 21 | ) 22 | print(res.choices[0].message.content) 23 | print(res.usage) 24 | -------------------------------------------------------------------------------- /mistralrs-quant/src/utils/ffi.rs: -------------------------------------------------------------------------------- 1 | use std::ffi::c_void; 2 | 3 | #[allow(dead_code)] 4 | extern "C" { 5 | // Linking to definitions in mistralrs-core 6 | pub(crate) fn mq_bitwise_or_u8( 7 | d_in1: *const c_void, 8 | d_in2: *const c_void, 9 | d_out: *mut c_void, 10 | N: u32, 11 | ); 12 | pub(crate) fn mq_bitwise_or_i32( 13 | d_in1: *const c_void, 14 | d_in2: *const c_void, 15 | d_out: *mut c_void, 16 | N: u32, 17 | ); 18 | 19 | pub(crate) fn mq_leftshift_u8(d_in1: *const c_void, d_out: *mut c_void, N: u32, k: i32); 20 | pub(crate) fn 
mq_leftshift_i32(d_in1: *const c_void, d_out: *mut c_void, N: u32, k: i32); 21 | } 22 | -------------------------------------------------------------------------------- /examples/server/yacc.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | 3 | client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/") 4 | 5 | with open("examples/server/c.y", "r") as f: 6 | c_yacc = f.read() 7 | 8 | completion = client.chat.completions.create( 9 | model="mistral", 10 | messages=[ 11 | { 12 | "role": "user", 13 | "content": "Write the main function in C, returning 42. Answer with just the code, no explanation.", 14 | } 15 | ], 16 | max_tokens=256, 17 | frequency_penalty=1.0, 18 | top_p=0.1, 19 | temperature=0, 20 | extra_body={"grammar": {"type": "yacc", "value": c_yacc}}, 21 | ) 22 | 23 | print(completion.choices[0].message.content) 24 | -------------------------------------------------------------------------------- /mistralrs-vision/src/utils.rs: -------------------------------------------------------------------------------- 1 | use image::{DynamicImage, Pixel}; 2 | 3 | pub(crate) fn empty_image(h: usize, w: usize) -> Vec>> { 4 | vec![vec![vec![]; w]; h] 5 | } 6 | 7 | pub(crate) fn get_pixel_data( 8 | n_channels: usize, 9 | pixels: image::ImageBuffer, Vec>, 10 | h: usize, 11 | w: usize, 12 | ) -> Vec>> { 13 | let mut pixel_data = empty_image(h, w); 14 | for (x, y, pixel) in pixels.enumerate_pixels() { 15 | pixel_data[y as usize][x as usize] = pixel.channels()[..n_channels].to_vec() 16 | } 17 | pixel_data 18 | } 19 | 20 | pub(crate) fn n_channels(image: &DynamicImage) -> usize { 21 | image.color().channel_count() as usize 22 | } 23 | -------------------------------------------------------------------------------- /examples/python/isq.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest, Architecture 2 | 3 | runner = Runner( 4 | which=Which.Plain( 5 | model_id="mistralai/Mistral-7B-Instruct-v0.1", 6 | arch=Architecture.Mistral, 7 | ), 8 | in_situ_quant="Q4K", 9 | ) 10 | 11 | res = runner.send_chat_completion_request( 12 | ChatCompletionRequest( 13 | model="mistral", 14 | messages=[ 15 | {"role": "user", "content": "Tell me a story about the Rust type system."} 16 | ], 17 | max_tokens=256, 18 | presence_penalty=1.0, 19 | top_p=0.1, 20 | temperature=0.1, 21 | ) 22 | ) 23 | print(res.choices[0].message.content) 24 | print(res.usage) 25 | -------------------------------------------------------------------------------- /mistralrs-core/src/utils/log.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | hash::{DefaultHasher, Hash, Hasher}, 3 | sync::Mutex, 4 | }; 5 | 6 | use once_cell::sync::Lazy; 7 | use tracing::info; 8 | 9 | static HASHED_AUTOLOADER_LOGS: Lazy>> = Lazy::new(|| Mutex::new(Vec::new())); 10 | 11 | pub fn once_log_info>(msg: M) { 12 | let msg = msg.as_ref(); 13 | let mut hasher = DefaultHasher::new(); 14 | msg.hash(&mut hasher); 15 | let hash = hasher.finish(); 16 | 17 | let mut log = HASHED_AUTOLOADER_LOGS.lock().expect("Poisoned Lock"); 18 | if !log.contains(&hash) { 19 | info!("{msg}"); 20 | log.push(hasher.finish()); 21 | } else { 22 | log.push(hasher.finish()); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /docs/TOOL_CALLING.md: -------------------------------------------------------------------------------- 1 | # 
Tool calling 2 | 3 | Tool calling makes LLMs smarter. 4 | 5 | LLMs use tool calling to interact with the outside world. Mistral.rs has OpenAI-compatible support for tool calling in all APIs: HTTP, Python, and Rust. 6 | 7 | OpenAI docs: https://cookbook.openai.com/examples/how_to_call_functions_with_chat_models 8 | 9 | ## OpenAI-compatible HTTP example 10 | Please see [our example here](../examples/server/tool_calling.py). 11 | 12 | > OpenAI docs: https://platform.openai.com/docs/api-reference/chat/create?lang=curl 13 | 14 | ## Rust example 15 | Please see [our example here](../mistralrs/examples/tools/main.rs). 16 | 17 | ## Python example 18 | Please see [our notebook here](../examples/python/tool_calling.ipynb). 19 | -------------------------------------------------------------------------------- /docs/VISION_MODELS.md: -------------------------------------------------------------------------------- 1 | # Vision model support in mistral.rs 2 | 3 | Mistral.rs supports various modalities of models, including vision models. Vision models take images and text as input and have the capability to reason over both. 4 | 5 | Please see docs for the following model types: 6 | 7 | - Phi 3 Vision: [PHI3V.md](PHI3V.md) 8 | - Idefics2: [IDEFICS2.md](IDEFICS2.md) 9 | - LLaVA and LLaVA Next: [LLAVA.md](LLaVA.md) 10 | - Llama 3.2 Vision: [VLLAMA.md](VLLAMA.md) 11 | 12 | > Note for the Python and HTTP APIs: 13 | > We follow the OpenAI specification for structuring the image messages and allow both base64-encoded images as well as a URL/path to the image. There are many examples of this; see [this Python example](../examples/python/phi3v.py). -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/build_failure.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Build failure 3 | about: Report a build failure 4 | title: '' 5 | labels: ["bug", "build"] 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Minimum reproducible example 11 | The minimum example to reproduce the error. Simpler examples make it easier and faster to fix! 12 | 13 | ## Error 14 | What is the error? If the error is very long, please try to extract an excerpt of it and post a link to a [Gist](https://gist.github.com/). 15 | 16 | ## Other information 17 | Please specify: 18 | - Operating system (Windows, macOS, WSL2, Linux, etc.) 19 | - GPU or accelerator information 20 | - If CUDA, please run `nvcc --version`, `nvidia-smi` 21 | 22 | ## Latest commit or version 23 | Which commit or version you ran with.
24 | -------------------------------------------------------------------------------- /examples/python/paged_attention.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest, Architecture 2 | 3 | runner = Runner( 4 | which=Which.Plain( 5 | model_id="mistralai/Mistral-7B-Instruct-v0.1", 6 | arch=Architecture.Mistral, 7 | ), 8 | pa_gpu_mem=4096, 9 | pa_blk_size=32, 10 | ) 11 | 12 | res = runner.send_chat_completion_request( 13 | ChatCompletionRequest( 14 | model="mistral", 15 | messages=[ 16 | {"role": "user", "content": "Tell me a story about the Rust type system."} 17 | ], 18 | max_tokens=256, 19 | presence_penalty=1.0, 20 | top_p=0.1, 21 | temperature=0.1, 22 | ) 23 | ) 24 | print(res.choices[0].message.content) 25 | print(res.usage) 26 | -------------------------------------------------------------------------------- /examples/python/mixture_of_quant_experts.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest, Architecture 2 | 3 | runner = Runner( 4 | which=Which.Plain( 5 | model_id="microsoft/Phi-3.5-MoE-instruct", 6 | arch=Architecture.Mistral, 7 | organization="moqe", 8 | ), 9 | in_situ_quant="Q4K", 10 | ) 11 | 12 | res = runner.send_chat_completion_request( 13 | ChatCompletionRequest( 14 | model="mistral", 15 | messages=[ 16 | {"role": "user", "content": "Tell me a story about the Rust type system."} 17 | ], 18 | max_tokens=256, 19 | presence_penalty=1.0, 20 | top_p=0.1, 21 | temperature=0.1, 22 | ) 23 | ) 24 | print(res.choices[0].message.content) 25 | print(res.usage) 26 | -------------------------------------------------------------------------------- /examples/python/topology.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest, Architecture 2 | 3 | runner = Runner( 4 | which=Which.Plain( 5 | model_id="mistralai/Mistral-7B-Instruct-v0.1", 6 | arch=Architecture.Mistral, 7 | topology="topologies/isq.yml", 8 | ), 9 | in_situ_quant="Q4K", 10 | ) 11 | 12 | res = runner.send_chat_completion_request( 13 | ChatCompletionRequest( 14 | model="mistral", 15 | messages=[ 16 | {"role": "user", "content": "Tell me a story about the Rust type system."} 17 | ], 18 | max_tokens=256, 19 | presence_penalty=1.0, 20 | top_p=0.1, 21 | temperature=0.1, 22 | ) 23 | ) 24 | print(res.choices[0].message.content) 25 | print(res.usage) 26 | -------------------------------------------------------------------------------- /examples/python/gguf.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest 2 | 3 | runner = Runner( 4 | which=Which.GGUF( 5 | tok_model_id="mistralai/Mistral-7B-Instruct-v0.1", 6 | quantized_model_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF", 7 | quantized_filename="mistral-7b-instruct-v0.1.Q4_K_M.gguf", 8 | ) 9 | ) 10 | 11 | res = runner.send_chat_completion_request( 12 | ChatCompletionRequest( 13 | model="mistral", 14 | messages=[ 15 | {"role": "user", "content": "Tell me a story about the Rust type system."} 16 | ], 17 | max_tokens=256, 18 | presence_penalty=1.0, 19 | top_p=0.1, 20 | temperature=0.1, 21 | ) 22 | ) 23 | print(res.choices[0].message.content) 24 | print(res.usage) 25 | -------------------------------------------------------------------------------- /scripts/set_names.py: 
-------------------------------------------------------------------------------- 1 | import json 2 | 3 | filename = input("Enter input ordering file: ") 4 | 5 | with open(filename, "r") as f: 6 | data = json.loads(f.read()) 7 | print( 8 | "Note: if you are using an X-LoRA model, it is very important that the adapter names are specified in the correct order" 9 | ", which is the order used during training. If you are using a LoRA model this is not necessary." 10 | ) 11 | adapters = input("Enter a comma delimited list of adapter names: ") 12 | split = adapters.split(",") 13 | split = [x for x in split if len(x) > 0] 14 | split = [x.strip() for x in split] 15 | data["order"] = split 16 | outname = input("Enter output ordering file: ") 17 | with open(outname, "w") as f: 18 | f.write(json.dumps(data)) 19 | -------------------------------------------------------------------------------- /examples/python/streaming.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest 2 | 3 | runner = Runner( 4 | which=Which.GGUF( 5 | tok_model_id="mistralai/Mistral-7B-Instruct-v0.1", 6 | quantized_model_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF", 7 | quantized_filename="mistral-7b-instruct-v0.1.Q4_K_M.gguf", 8 | ) 9 | ) 10 | 11 | res = runner.send_chat_completion_request( 12 | ChatCompletionRequest( 13 | model="mistral", 14 | messages=[ 15 | {"role": "user", "content": "Tell me a story about the Rust type system."} 16 | ], 17 | max_tokens=256, 18 | presence_penalty=1.0, 19 | top_p=0.1, 20 | temperature=0.1, 21 | stream=True, 22 | ) 23 | ) 24 | for chunk in res: 25 | print(chunk) 26 | -------------------------------------------------------------------------------- /mistralrs-paged-attn/src/lib.rs: -------------------------------------------------------------------------------- 1 | #[cfg(all(feature = "cuda", target_family = "unix"))] 2 | pub const COPY_BLOCKS_KERNEL: &str = 3 | include_str!(concat!(env!("OUT_DIR"), "/copy_blocks_kernel.ptx")); 4 | #[cfg(all(feature = "cuda", target_family = "unix"))] 5 | pub const PAGEDATTENTION: &str = include_str!(concat!(env!("OUT_DIR"), "/pagedattention.ptx")); 6 | #[cfg(all(feature = "cuda", target_family = "unix"))] 7 | pub const RESHAPE_AND_CACHE_KERNEL: &str = 8 | include_str!(concat!(env!("OUT_DIR"), "/reshape_and_cache_kernel.ptx")); 9 | 10 | #[cfg(all(feature = "cuda", target_family = "unix"))] 11 | mod backend; 12 | #[cfg(all(feature = "cuda", target_family = "unix"))] 13 | mod ffi; 14 | 15 | #[cfg(all(feature = "cuda", target_family = "unix"))] 16 | pub use backend::{copy_blocks, paged_attention, reshape_and_cache, swap_blocks}; 17 | -------------------------------------------------------------------------------- /chat_templates/llama2.json: -------------------------------------------------------------------------------- 1 | { 2 | "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' 
%}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}" 3 | } -------------------------------------------------------------------------------- /mistralrs/examples/gemma2/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use mistralrs::{ 3 | IsqType, PagedAttentionMetaBuilder, RequestBuilder, TextMessageRole, TextModelBuilder, 4 | }; 5 | 6 | #[tokio::main] 7 | async fn main() -> Result<()> { 8 | let model = TextModelBuilder::new("google/gemma-2-9b-it") 9 | .with_isq(IsqType::Q4K) 10 | .with_logging() 11 | .with_paged_attn(|| PagedAttentionMetaBuilder::default().build())? 12 | .build() 13 | .await?; 14 | 15 | let request = RequestBuilder::new().add_message( 16 | TextMessageRole::User, 17 | "Please write a mathematical equation where a few numbers are added.", 18 | ); 19 | 20 | let response = model.send_chat_request(request).await?; 21 | 22 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 23 | 24 | Ok(()) 25 | } 26 | -------------------------------------------------------------------------------- /scripts/lora_add_preload_adapters.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | filename = input("Enter input ordering file: ") 4 | 5 | with open(filename, "r") as f: 6 | data = json.loads(f.read()) 7 | preload_adapters = input( 8 | "Enter a comma delimited list of *preloaded* adapter names to preload: " 9 | ) 10 | preload_adapters_model_id = input( 11 | "Enter the model id where the preload adapters will be loaded: " 12 | ) 13 | split = preload_adapters.split(",") 14 | split = [x for x in split if len(x) > 0] 15 | split = [x.strip() for x in split] 16 | res = [] 17 | for s in split: 18 | res.append({"name": s, "adapter_model_id": preload_adapters_model_id}) 19 | data["preload_adapters"] = res 20 | outname = input("Enter output ordering file: ") 21 | with open(outname, "w") as f: 22 | f.write(json.dumps(data)) 23 | -------------------------------------------------------------------------------- /examples/server/streaming.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from openai import OpenAI 3 | 4 | client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/") 5 | 6 | messages = [] 7 | prompt = input("Enter system prompt >>> ") 8 | if len(prompt) > 0: 9 | messages.append({"role": "system", "content": prompt}) 10 | 11 | 12 | while True: 13 | prompt = input(">>> ") 14 | messages.append({"role": "user", "content": prompt}) 15 | resp = "" 16 | response = client.chat.completions.create( 17 | model="mistral", 18 | messages=messages, 19 | max_tokens=256, 20 | stream=True, 21 | ) 22 | for chunk in response: 23 | delta = chunk.choices[0].delta.content 24 | print(delta, end="") 25 | sys.stdout.flush() 26 | resp += delta 27 | messages.append({"role": "assistant", "content": resp}) 28 | print() 29 | -------------------------------------------------------------------------------- /mistralrs-quant/kernels/gptq/qdq_8.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | Copied from https://github.com/turboderp/exllamav2 3 | */ 4 | 5 | #ifndef _qdq_8_cuh 6 | #define _qdq_8_cuh 7 | 8 | #include "qdq_util.cuh" 9 | 10 | __forceinline__ __device__ void shuffle_8bit_4(uint32_t* q, int stride) {} 11 | 12 | __forceinline__ __device__ void 
dequant_8bit_8(const uint32_t q_0, 13 | const uint32_t q_1, 14 | half2 (&dq)[4], int stride, 15 | const uint32_t zero) { 16 | half dqh[8]; 17 | for (int i = 0; i < 4; i++) dqh[i] = dq_ns(exb(q_0, i * 8, 0xff), zero); 18 | for (int i = 0; i < 4; i++) dqh[i + 4] = dq_ns(exb(q_1, i * 8, 0xff), zero); 19 | 20 | for (int i = 0; i < 4; i++) 21 | dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]); 22 | } 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /examples/python/xlora_gemma.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest, Architecture 2 | 3 | runner = Runner( 4 | which=Which.XLora( 5 | model_id=None, # Automatically determine from ordering file 6 | xlora_model_id="lamm-mit/x-lora-gemma-7b", 7 | order="orderings/xlora-gemma-paper-ordering.json", 8 | tgt_non_granular_index=None, 9 | arch=Architecture.Mistral, 10 | ) 11 | ) 12 | 13 | res = runner.send_chat_completion_request( 14 | ChatCompletionRequest( 15 | model="mistral", 16 | messages=[ 17 | {"role": "user", "content": "Tell me a story about the Rust type system."} 18 | ], 19 | max_tokens=256, 20 | presence_penalty=1.0, 21 | top_p=0.1, 22 | temperature=0.5, 23 | ) 24 | ) 25 | print(res.choices[0].message.content) 26 | print(res.usage) 27 | -------------------------------------------------------------------------------- /mistralrs-core/src/tools/response.rs: -------------------------------------------------------------------------------- 1 | #[cfg_attr(feature = "pyo3_macros", pyo3::pyclass(eq, eq_int))] 2 | #[cfg_attr(feature = "pyo3_macros", pyo3(get_all))] 3 | #[derive(Clone, Debug, serde::Serialize, PartialEq)] 4 | #[serde(rename_all = "snake_case")] 5 | pub enum ToolCallType { 6 | Function, 7 | } 8 | 9 | #[cfg_attr(feature = "pyo3_macros", pyo3::pyclass)] 10 | #[cfg_attr(feature = "pyo3_macros", pyo3(get_all))] 11 | #[derive(Clone, Debug, serde::Serialize, serde::Deserialize)] 12 | pub struct CalledFunction { 13 | pub name: String, 14 | pub arguments: String, 15 | } 16 | 17 | #[cfg_attr(feature = "pyo3_macros", pyo3::pyclass)] 18 | #[cfg_attr(feature = "pyo3_macros", pyo3(get_all))] 19 | #[derive(Clone, Debug, serde::Serialize)] 20 | pub struct ToolCallResponse { 21 | pub id: String, 22 | #[serde(rename = "type")] 23 | pub tp: ToolCallType, 24 | pub function: CalledFunction, 25 | } 26 | -------------------------------------------------------------------------------- /examples/python/token_source.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest 2 | 3 | runner = Runner( 4 | which=Which.GGUF( 5 | tok_model_id="mistralai/Mistral-7B-Instruct-v0.1", 6 | quantized_model_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF", 7 | quantized_filename="mistral-7b-instruct-v0.1.Q4_K_M.gguf", 8 | ), 9 | token_source="literal: ...", # One of: "literal:", "env:", "path:", "cache", "none" 10 | ) 11 | 12 | res = runner.send_chat_completion_request( 13 | ChatCompletionRequest( 14 | model="mistral", 15 | messages=[ 16 | {"role": "user", "content": "Tell me a story about the Rust type system."} 17 | ], 18 | max_tokens=256, 19 | presence_penalty=1.0, 20 | top_p=0.1, 21 | temperature=0.1, 22 | ) 23 | ) 24 | print(res.choices[0].message.content) 25 | print(res.usage) 26 | -------------------------------------------------------------------------------- /.gitattributes: 
-------------------------------------------------------------------------------- 1 | # Pattern syntax: 2 | # https://git-scm.com/docs/gitignore#_pattern_format 3 | 4 | # Normalize line endings of all non-binary files to LF upon check-in (`git add` / `git commit`): 5 | * text=auto 6 | 7 | # Use `eol=lf` / `eol=crlf` to enforce specific line endings on checkout for compatibility: 8 | # https://www.git-scm.com/docs/gitattributes/#_eol 9 | # NOTE: 10 | # - This setting implies the `text` attribute. 11 | # - `eol=lf` may not work as expected, if a file was committed with CRLF prior to the introduction of `.gitattribtues`. 12 | # 13 | # Relevant files for this setting: 14 | # - `.sh` (LF) / `.bat` (CRLF) and similar scripts that are platform specific. 15 | # - Scripts that utilize a shebang (`#!/usr/bin/env python3`) to hint the interpreter to run. 16 | # - `Dockerfile` (base image environment may require LF): 17 | # https://github.com/EricLBuehler/mistral.rs/pull/361 18 | 19 | Dockerfile* eol=lf 20 | -------------------------------------------------------------------------------- /mistralrs-quant/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "mistralrs-quant" 3 | readme = "README.md" 4 | authors = ["Eric Buehler"] 5 | version.workspace = true 6 | edition.workspace = true 7 | description.workspace = true 8 | repository.workspace = true 9 | keywords.workspace = true 10 | categories.workspace = true 11 | license.workspace = true 12 | homepage.workspace = true 13 | 14 | [dependencies] 15 | candle-core.workspace = true 16 | candle-nn.workspace = true 17 | half.workspace = true 18 | serde.workspace = true 19 | lazy_static = "1.4" 20 | paste = "1.0.15" 21 | tracing.workspace = true 22 | rayon.workspace = true 23 | byteorder = "1.5.0" 24 | float8.workspace = true 25 | once_cell.workspace = true 26 | 27 | [features] 28 | cuda = ["candle-core/cuda", "candle-nn/cuda", "dep:bindgen_cuda"] 29 | metal = ["candle-core/metal", "candle-nn/metal"] 30 | 31 | [build-dependencies] 32 | bindgen_cuda = { version = "0.1.5", optional = true } 33 | -------------------------------------------------------------------------------- /examples/python/lora_activation.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest 2 | 3 | runner = Runner( 4 | which=Which.LoraGGUF( 5 | tok_model_id=None, # Automatically determine from ordering file 6 | quantized_model_id="TheBloke/zephyr-7B-beta-GGUF", 7 | quantized_filename="zephyr-7b-beta.Q4_0.gguf", 8 | xlora_model_id="lamm-mit/x-lora", 9 | order="orderings/xlora-paper-ordering.json", 10 | ) 11 | ) 12 | 13 | res = runner.send_chat_completion_request( 14 | ChatCompletionRequest( 15 | model="mistral", 16 | messages=[ 17 | {"role": "user", "content": "Tell me a story about the Rust type system."} 18 | ], 19 | max_tokens=256, 20 | presence_penalty=1.0, 21 | top_p=0.1, 22 | temperature=0.5, 23 | adapters=["adapter_4"], 24 | ) 25 | ) 26 | print(res.choices[0].message.content) 27 | print(res.usage) 28 | -------------------------------------------------------------------------------- /examples/python/xlora_zephyr.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest 2 | 3 | runner = Runner( 4 | which=Which.XLoraGGUF( 5 | tok_model_id=None, # Automatically determine from ordering file 6 | quantized_model_id="TheBloke/zephyr-7B-beta-GGUF", 
7 | quantized_filename="zephyr-7b-beta.Q4_0.gguf", 8 | xlora_model_id="lamm-mit/x-lora", 9 | order="orderings/xlora-paper-ordering.json", 10 | tgt_non_granular_index=None, 11 | ) 12 | ) 13 | 14 | res = runner.send_chat_completion_request( 15 | ChatCompletionRequest( 16 | model="mistral", 17 | messages=[ 18 | {"role": "user", "content": "Tell me a story about the Rust type system."} 19 | ], 20 | max_tokens=256, 21 | presence_penalty=1.0, 22 | top_p=0.1, 23 | temperature=0.5, 24 | ) 25 | ) 26 | print(res.choices[0].message.content) 27 | print(res.usage) 28 | -------------------------------------------------------------------------------- /docs/LORA_XLORA.md: -------------------------------------------------------------------------------- 1 | # Examples of LoRA and X-LoRA models 2 | 3 | - X-LoRA with no quantization 4 | 5 | To start an X-LoRA server with the exactly as presented in [the paper](https://arxiv.org/abs/2402.07148): 6 | 7 | ```bash 8 | ./mistralrs-server --port 1234 x-lora-plain -o orderings/xlora-paper-ordering.json -x lamm-mit/x-lora 9 | ``` 10 | - LoRA with a model from GGUF 11 | 12 | To start an LoRA server with adapters from the X-LoRA paper (you should modify the ordering file to use only one adapter, as the adapter static scalings are all 1 and so the signal will become distorted): 13 | 14 | ```bash 15 | ./mistralrs-server --port 1234 lora-gguf -o orderings/xlora-paper-ordering.json -m TheBloke/zephyr-7B-beta-GGUF -f zephyr-7b-beta.Q8_0.gguf -a lamm-mit/x-lora 16 | ``` 17 | 18 | Normally with a LoRA model you would use a custom ordering file. However, for this example we use the ordering from the X-LoRA paper because we are using the adapters from the X-LoRA paper. -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # Documentation 2 | 3 | ## Models 4 | - Image generation [models](IMAGEGEN_MODELS.md) 5 | - Vision [models](VISION_MODELS.md) 6 | 7 | - [FLUX](FLUX.md) 8 | - [Gemma 2](GEMMA2.md) 9 | - [Idefics 2](IDEFICS2.md) 10 | - [LLaVA](LLaVA.md) 11 | - [Phi 3.5 MoE](PHI3.5MOE.md) 12 | - [Phi 3.5 Vision](PHI3V.md) 13 | - [Llama 3.2 Vision](VLLAMA.md) 14 | 15 | ## Adapters 16 | - [Docs](ADAPTER_MODELS.md) 17 | - [X-LoRA non-granular](NON_GRANULAR.md) 18 | - [LoRA and X-LoRA examples](LORA_XLORA.md) 19 | 20 | ## Quantization 21 | - [Docs](QUANTS.md) 22 | - [ISQ](ISQ.md) 23 | - [UQFF](UQFF.md) 24 | - [Topology](TOPOLOLGY.md) 25 | 26 | ## Other 27 | - [Chat templates and tokenizers](CHAT_TOK.md) 28 | - [Paged Attention](PAGED_ATTENTION.md) 29 | - [Sampling](SAMPLING.md) 30 | - [TOML selector](TOML_SELECTOR.md) 31 | - [Tool calling](TOOL_CALLING.md) 32 | 33 | ## Cross-device inference 34 | - [Device mapping](DEVICE_MAPPING.md) 35 | - [Topology](TOPOLOLGY.md) 36 | 37 | -------------------------------------------------------------------------------- /mistralrs/examples/grammar/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use mistralrs::{ 3 | IsqType, PagedAttentionMetaBuilder, RequestBuilder, TextMessageRole, TextModelBuilder, 4 | }; 5 | 6 | #[tokio::main] 7 | async fn main() -> Result<()> { 8 | let model = TextModelBuilder::new("microsoft/Phi-3.5-mini-instruct") 9 | .with_isq(IsqType::Q4K) 10 | .with_logging() 11 | .with_paged_attn(|| PagedAttentionMetaBuilder::default().build())? 
12 | .build() 13 | .await?; 14 | 15 | // Bullet list regex 16 | let request = RequestBuilder::new() 17 | .set_constraint(mistralrs::Constraint::Regex( 18 | "(- [^\n]*\n)+(- [^\n]*)(\n\n)?".to_string(), 19 | )) 20 | .add_message(TextMessageRole::User, "Please write a few jokes."); 21 | 22 | let response = model.send_chat_request(request).await?; 23 | 24 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 25 | 26 | Ok(()) 27 | } 28 | -------------------------------------------------------------------------------- /mistralrs-paged-attn/src/cuda_compat.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | 4 | #ifndef USE_ROCM 5 | #define VLLM_LDG(arg) __ldg(arg) 6 | #else 7 | #define VLLM_LDG(arg) *(arg) 8 | #endif 9 | 10 | #ifndef USE_ROCM 11 | #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor_sync(uint32_t(-1), var, lane_mask) 12 | #else 13 | #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask) 14 | #endif 15 | 16 | #ifndef USE_ROCM 17 | #define VLLM_SHFL_SYNC(var, src_lane) __shfl_sync(uint32_t(-1), var, src_lane) 18 | #else 19 | #define VLLM_SHFL_SYNC(var, src_lane) __shfl(var, src_lane) 20 | #endif 21 | 22 | #ifndef USE_ROCM 23 | #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \ 24 | cudaFuncSetAttribute(FUNC, cudaFuncAttributeMaxDynamicSharedMemorySize, VAL) 25 | #else 26 | #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \ 27 | hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL) 28 | #endif 29 | 30 | -------------------------------------------------------------------------------- /mistralrs-core/src/aici/bytes.rs: -------------------------------------------------------------------------------- 1 | use std::mem::size_of; 2 | 3 | use bytemuck_derive::{Pod, Zeroable}; 4 | 5 | pub(crate) type TokenId = u32; 6 | 7 | #[derive(Clone, Copy, PartialEq, Eq, Debug, Zeroable, Pod)] 8 | #[repr(C)] 9 | pub struct TokRxInfo { 10 | pub vocab_size: u32, 11 | pub tok_eos: TokenId, 12 | } 13 | 14 | #[derive(Clone, Copy, PartialEq, Eq, Debug, Zeroable, Pod)] 15 | #[repr(C)] 16 | pub struct U32Pair(pub u32, pub u32); 17 | 18 | pub fn vec_from_bytes<T: bytemuck::Pod>(bytes: &[u8]) -> Vec<T> { 19 | if bytes.len() % size_of::<T>() != 0 { 20 | panic!( 21 | "vecT: got {} bytes, needed multiple of {}", 22 | bytes.len(), 23 | size_of::<T>() 24 | ); 25 | } 26 | bytemuck::cast_slice(bytes).to_vec() 27 | } 28 | 29 | pub fn to_hex_string(bytes: &[u8]) -> String { 30 | bytes 31 | .iter() 32 | .map(|b| format!("{:02x}", b)) 33 | .collect::<Vec<String>>() 34 | .join("") 35 | } 36 | -------------------------------------------------------------------------------- /examples/python/speculative.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest, Architecture 2 | 3 | runner = Runner( 4 | which=Which.Plain( 5 | model_id="mistralai/Mistral-7B-Instruct-v0.1", 6 | arch=Architecture.Mistral, 7 | ), 8 | which_draft=Which.GGUF( 9 | tok_model_id="mistralai/Mistral-7B-Instruct-v0.1", 10 | quantized_model_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF", 11 | quantized_filename="mistral-7b-instruct-v0.1.Q4_K_M.gguf", 12 | ), 13 | speculative_gamma=32, 14 | ) 15 | 16 | res = runner.send_chat_completion_request( 17 | ChatCompletionRequest( 18 | model="mistral", 19 | messages=[ 20 | {"role": "user", "content": "Tell me a story about the Rust type system."} 21 | ], 22 | max_tokens=256, 23 | presence_penalty=1.0, 24 |
top_p=0.1, 25 | temperature=0.1, 26 | ) 27 | ) 28 | print(res.choices[0].message.content) 29 | print(res.usage) 30 | -------------------------------------------------------------------------------- /docs/DEVICE_MAPPING.md: -------------------------------------------------------------------------------- 1 | # Device mapping 2 | 3 | There are 2 ways to do device mapping: 4 | 1) Specify the number of layers to put on the GPU - this uses the GPU with ordinal 0. 5 | 2) Specify the ordinals and number of layers - this allows for cross-GPU device mapping. 6 | 7 | The format for the ordinals and number of layers is `ORD:NUM;...` where ORD is the unique ordinal and NUM is the number of layers for that GPU. This may be repeated as many times as necessary. 8 | 9 | > Note: We refer to GPU layers as "device layers" throughout mistral.rs. 10 | 11 | ## Example of specifying ordinals 12 | ``` 13 | cargo run --release --features cuda -- -n "0:16;1:16" -i plain -m gradientai/Llama-3-8B-Instruct-262k -a llama 14 | ``` 15 | 16 | > Note: In the Python API, the "0:16;1:16" string is passed as the list `["0:16", "1:16"]`. 17 | 18 | ## Example of specifying the number of GPU layers 19 | ``` 20 | cargo run --release --features cuda -- -n 16 -i plain -m gradientai/Llama-3-8B-Instruct-262k -a llama 21 | ``` -------------------------------------------------------------------------------- /examples/python/lora_zephyr.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest 2 | 3 | runner = Runner( 4 | which=Which.LoraGGUF( 5 | tok_model_id=None, # Automatically determine from ordering file 6 | quantized_model_id="TheBloke/zephyr-7B-beta-GGUF", 7 | quantized_filename="zephyr-7b-beta.Q4_0.gguf", 8 | xlora_model_id="lamm-mit/x-lora", 9 | order="orderings/xlora-paper-ordering.json", 10 | ) 11 | ) 12 | 13 | # Example: Make adapter_3 the active adapter 14 | runner.activate_adapters(["adapter_3"]) 15 | 16 | res = runner.send_chat_completion_request( 17 | ChatCompletionRequest( 18 | model="mistral", 19 | messages=[ 20 | {"role": "user", "content": "Tell me a story about the Rust type system."} 21 | ], 22 | max_tokens=256, 23 | presence_penalty=1.0, 24 | top_p=0.1, 25 | temperature=0.5, 26 | ) 27 | ) 28 | print(res.choices[0].message.content) 29 | print(res.usage) 30 | -------------------------------------------------------------------------------- /mistralrs-core/src/diffusion_models/mod.rs: -------------------------------------------------------------------------------- 1 | pub(crate) mod clip; 2 | pub(crate) mod flux; 3 | pub(crate) mod processor; 4 | pub(crate) mod response; 5 | pub(crate) mod t5; 6 | 7 | macro_rules! generate_repr { 8 | ($t:ident) => { 9 | #[cfg(feature = "pyo3_macros")] 10 | #[pyo3::pymethods] 11 | impl $t { 12 | fn __repr__(&self) -> String { 13 | format!("{self:#?}") 14 | } 15 | } 16 | }; 17 | } 18 | 19 | #[cfg_attr(feature = "pyo3_macros", pyo3::pyclass)] 20 | #[cfg_attr(feature = "pyo3_macros", pyo3(get_all))] 21 | #[derive(Debug, Clone)] 22 | pub struct DiffusionGenerationParams { 23 | pub height: usize, 24 | pub width: usize, 25 | } 26 | 27 | generate_repr!(DiffusionGenerationParams); 28 | 29 | impl Default for DiffusionGenerationParams { 30 | /// Image dimensions will be 720x1280. 
31 | fn default() -> Self { 32 | Self { 33 | height: 720, 34 | width: 1280, 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /mistralrs-bench/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "mistralrs-bench" 3 | publish = false 4 | version.workspace = true 5 | edition.workspace = true 6 | description.workspace = true 7 | homepage.workspace = true 8 | repository.workspace = true 9 | keywords.workspace = true 10 | categories.workspace = true 11 | license.workspace = true 12 | 13 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 14 | 15 | [dependencies] 16 | anyhow.workspace = true 17 | candle-core.workspace = true 18 | serde.workspace = true 19 | serde_json.workspace = true 20 | clap.workspace = true 21 | mistralrs-core = { version = "0.3.1", path = "../mistralrs-core" } 22 | tracing.workspace = true 23 | tokio.workspace = true 24 | cli-table = "0.4.7" 25 | 26 | [features] 27 | cuda = ["mistralrs-core/cuda"] 28 | cudnn = ["mistralrs-core/cudnn"] 29 | metal = ["mistralrs-core/metal"] 30 | flash-attn = ["cuda", "mistralrs-core/flash-attn"] 31 | accelerate = ["mistralrs-core/accelerate"] 32 | mkl = ["mistralrs-core/mkl"] 33 | -------------------------------------------------------------------------------- /mistralrs-pyo3/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | .pytest_cache/ 6 | *.py[cod] 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | .venv/ 14 | env/ 15 | bin/ 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | include/ 26 | man/ 27 | venv/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | pip-selfcheck.json 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | 45 | # Translations 46 | *.mo 47 | 48 | # Mr Developer 49 | .mr.developer.cfg 50 | .project 51 | .pydevproject 52 | 53 | # Rope 54 | .ropeproject 55 | 56 | # Django stuff: 57 | *.log 58 | *.pot 59 | 60 | .DS_Store 61 | 62 | # Sphinx documentation 63 | docs/_build/ 64 | 65 | # PyCharm 66 | .idea/ 67 | 68 | # VSCode 69 | .vscode/ 70 | 71 | # Pyenv 72 | .python-version 73 | -------------------------------------------------------------------------------- /mistralrs-core/src/tools/request.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | 3 | use serde_json::Value; 4 | 5 | #[derive(Clone, Debug, serde::Deserialize, serde::Serialize)] 6 | pub enum ToolType { 7 | #[serde(rename = "function")] 8 | Function, 9 | } 10 | 11 | #[derive(Clone, Debug, serde::Deserialize, serde::Serialize)] 12 | pub enum ToolChoice { 13 | #[serde(rename = "none")] 14 | /// Disallow selection of tools. 15 | None, 16 | #[serde(rename = "auto")] 17 | /// Allow automatic selection of any given tool, or none. 18 | Auto, 19 | #[serde(untagged)] 20 | /// Force selection of a given tool. 
21 | Tool(Tool), 22 | } 23 | 24 | #[derive(Clone, Debug, serde::Deserialize, serde::Serialize)] 25 | pub struct Function { 26 | pub description: Option<String>, 27 | pub name: String, 28 | pub parameters: Option<HashMap<String, Value>>, 29 | } 30 | 31 | #[derive(Clone, Debug, serde::Deserialize, serde::Serialize)] 32 | pub struct Tool { 33 | #[serde(rename = "type")] 34 | pub tp: ToolType, 35 | pub function: Function, 36 | } 37 | -------------------------------------------------------------------------------- /mistralrs/examples/flux/main.rs: -------------------------------------------------------------------------------- 1 | use std::time::Instant; 2 | 3 | use anyhow::Result; 4 | use mistralrs::{ 5 | DiffusionGenerationParams, DiffusionLoaderType, DiffusionModelBuilder, 6 | ImageGenerationResponseFormat, 7 | }; 8 | 9 | #[tokio::main] 10 | async fn main() -> Result<()> { 11 | let model = DiffusionModelBuilder::new( 12 | "black-forest-labs/FLUX.1-schnell", 13 | DiffusionLoaderType::FluxOffloaded, 14 | ) 15 | .with_logging() 16 | .build() 17 | .await?; 18 | 19 | let start = Instant::now(); 20 | 21 | let response = model 22 | .generate_image( 23 | "A vibrant sunset in the mountains, 4k, high quality.".to_string(), 24 | ImageGenerationResponseFormat::Url, 25 | DiffusionGenerationParams::default(), 26 | ) 27 | .await?; 28 | 29 | let finished = Instant::now(); 30 | 31 | println!( 32 | "Done! Took {} s. Image saved at: {}", 33 | finished.duration_since(start).as_secs_f32(), 34 | response.data[0].url.as_ref().unwrap() 35 | ); 36 | 37 | Ok(()) 38 | } 39 | -------------------------------------------------------------------------------- /mistralrs/examples/xlora/main.rs: -------------------------------------------------------------------------------- 1 | use std::fs::File; 2 | 3 | use anyhow::Result; 4 | use mistralrs::{TextMessageRole, TextMessages, TextModelBuilder, XLoraModelBuilder}; 5 | 6 | #[tokio::main] 7 | async fn main() -> Result<()> { 8 | let model = 9 | XLoraModelBuilder::from_text_model_builder( 10 | TextModelBuilder::new("HuggingFaceH4/zephyr-7b-beta").with_logging(), 11 | "lamm-mit/x-lora", 12 | serde_json::from_reader(File::open("my-ordering-file.json").unwrap_or_else(|_| { 13 | panic!("Could not load ordering file at my-ordering-file.json") 14 | }))?, 15 | ) 16 | .build() 17 | .await?; 18 | 19 | let messages = 20 | TextMessages::new().add_message(TextMessageRole::User, "Hello! What is graphene."); 21 | 22 | let response = model.send_chat_request(messages).await?; 23 | 24 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 25 | dbg!( 26 | response.usage.avg_prompt_tok_per_sec, 27 | response.usage.avg_compl_tok_per_sec 28 | ); 29 | 30 | Ok(()) 31 | } 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Eric Buehler 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software.
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /mistralrs-core/src/paged_attention/config.rs: -------------------------------------------------------------------------------- 1 | pub trait ModelConfigLike { 2 | fn num_layers(&self) -> usize; 3 | fn hidden_size(&self) -> usize; 4 | fn num_kv_heads(&self) -> usize; 5 | fn num_attn_heads(&self) -> usize; 6 | fn head_dim(&self) -> usize { 7 | self.hidden_size() / self.num_attn_heads() 8 | } 9 | } 10 | 11 | pub struct ModelConfigMetadata { 12 | pub num_layers: usize, 13 | pub hidden_size: usize, 14 | pub num_kv_heads: usize, 15 | pub num_attn_heads: usize, 16 | pub sliding_window: Option<usize>, 17 | pub head_dim: Option<usize>, 18 | } 19 | 20 | impl ModelConfigLike for ModelConfigMetadata { 21 | fn hidden_size(&self) -> usize { 22 | self.hidden_size 23 | } 24 | fn num_attn_heads(&self) -> usize { 25 | self.num_attn_heads 26 | } 27 | fn num_kv_heads(&self) -> usize { 28 | self.num_kv_heads 29 | } 30 | fn num_layers(&self) -> usize { 31 | self.num_layers 32 | } 33 | fn head_dim(&self) -> usize { 34 | self.head_dim 35 | .unwrap_or(self.hidden_size() / self.num_attn_heads()) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /mistralrs-core/src/dummy_paged_attention/config.rs: -------------------------------------------------------------------------------- 1 | pub trait ModelConfigLike { 2 | fn num_layers(&self) -> usize; 3 | fn hidden_size(&self) -> usize; 4 | fn num_kv_heads(&self) -> usize; 5 | fn num_attn_heads(&self) -> usize; 6 | fn head_dim(&self) -> usize { 7 | self.hidden_size() / self.num_attn_heads() 8 | } 9 | } 10 | 11 | pub struct ModelConfigMetadata { 12 | pub num_layers: usize, 13 | pub hidden_size: usize, 14 | pub num_kv_heads: usize, 15 | pub num_attn_heads: usize, 16 | pub sliding_window: Option<usize>, 17 | pub head_dim: Option<usize>, 18 | } 19 | 20 | impl ModelConfigLike for ModelConfigMetadata { 21 | fn hidden_size(&self) -> usize { 22 | self.hidden_size 23 | } 24 | fn num_attn_heads(&self) -> usize { 25 | self.num_attn_heads 26 | } 27 | fn num_kv_heads(&self) -> usize { 28 | self.num_kv_heads 29 | } 30 | fn num_layers(&self) -> usize { 31 | self.num_layers 32 | } 33 | fn head_dim(&self) -> usize { 34 | self.head_dim 35 | .unwrap_or(self.hidden_size() / self.num_attn_heads()) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /mistralrs-core/src/vision_models/mod.rs: -------------------------------------------------------------------------------- 1 | use std::any::Any; 2 | 3 | use candle_core::Tensor; 4 | 5 | pub(crate) mod clip; 6 | pub(crate) mod idefics2; 7 | pub(crate) mod idefics2_input_processor; 8 | pub(crate) mod image_processor; 9 | pub(crate) mod mllama; 10 | 11 | pub(crate) mod llava; 12 | pub(crate) mod phi3; 13 | pub(crate) mod phi3_inputs_processor; 14 | pub(crate) mod preprocessor_config; 15 | pub(crate) mod processor_config; 16 | pub(crate) use llava::llava15; 17 | pub(crate) use llava::llava_inputs_processor; 18 |
pub(crate) use llava::llava_next; 19 | pub(crate) use llava::llava_next_inputs_processor; 20 | 21 | use crate::pipeline::text_models_inputs_processor::{FlashParams, PagedAttentionInputMetadata}; 22 | 23 | pub struct ModelInputs { 24 | pub input_ids: Tensor, 25 | pub seqlen_offsets: Vec<usize>, 26 | pub seqlen_offsets_kernel: Tensor, 27 | pub context_lens: Vec<(usize, usize)>, 28 | pub position_ids: Vec<usize>, 29 | pub pixel_values: Option<Tensor>, 30 | pub model_specific_args: Box<dyn Any>, 31 | pub paged_attn_meta: Option<PagedAttentionInputMetadata>, 32 | pub flash_meta: FlashParams, 33 | } 34 | -------------------------------------------------------------------------------- /mistralrs/examples/gguf/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use mistralrs::{GgufModelBuilder, TextMessageRole, TextMessages}; 3 | 4 | #[tokio::main] 5 | async fn main() -> Result<()> { 6 | let model = GgufModelBuilder::new( 7 | "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF", 8 | vec!["Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf"], 9 | ) 10 | .with_tok_model_id("meta-llama/Meta-Llama-3.1-8B-Instruct") 11 | .with_logging() 12 | .build() 13 | .await?; 14 | 15 | let messages = TextMessages::new() 16 | .add_message( 17 | TextMessageRole::System, 18 | "You are an AI agent with a specialty in programming.", 19 | ) 20 | .add_message( 21 | TextMessageRole::User, 22 | "Hello! How are you? Please write generic binary search function in Rust.", 23 | ); 24 | 25 | let response = model.send_chat_request(messages).await?; 26 | 27 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 28 | dbg!( 29 | response.usage.avg_prompt_tok_per_sec, 30 | response.usage.avg_compl_tok_per_sec 31 | ); 32 | 33 | Ok(()) 34 | } 35 | -------------------------------------------------------------------------------- /mistralrs/examples/phi3_5_moe/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use mistralrs::{ 3 | IsqType, PagedAttentionMetaBuilder, TextMessageRole, TextMessages, TextModelBuilder, 4 | }; 5 | 6 | #[tokio::main] 7 | async fn main() -> Result<()> { 8 | let model = TextModelBuilder::new("microsoft/Phi-3.5-MoE-instruct") 9 | .with_isq(IsqType::Q4K) 10 | .with_logging() 11 | .with_paged_attn(|| PagedAttentionMetaBuilder::default().build())? 12 | .build() 13 | .await?; 14 | 15 | let messages = TextMessages::new() 16 | .add_message( 17 | TextMessageRole::System, 18 | "You are an AI agent with a specialty in programming.", 19 | ) 20 | .add_message( 21 | TextMessageRole::User, 22 | "Hello! How are you? Please write generic binary search function in Rust.", 23 | ); 24 | 25 | let response = model.send_chat_request(messages).await?; 26 | 27 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 28 | dbg!( 29 | response.usage.avg_prompt_tok_per_sec, 30 | response.usage.avg_compl_tok_per_sec 31 | ); 32 | 33 | Ok(()) 34 | } 35 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM rust:latest AS builder 2 | 3 | RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ 4 | && rm -rf /var/lib/apt/lists/* 5 | 6 | WORKDIR /mistralrs 7 | 8 | COPY . .
9 | 10 | RUN cargo build --release --workspace --exclude mistralrs-pyo3 11 | 12 | FROM debian:bookworm-slim AS base 13 | 14 | ENV HUGGINGFACE_HUB_CACHE=/data \ 15 | PORT=80 \ 16 | MKL_ENABLE_INSTRUCTIONS=AVX512_E4 \ 17 | RAYON_NUM_THREADS=8 \ 18 | LD_LIBRARY_PATH=/usr/local/lib 19 | 20 | RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ 21 | libomp-dev \ 22 | ca-certificates \ 23 | libssl-dev \ 24 | curl \ 25 | pkg-config \ 26 | && rm -rf /var/lib/apt/lists/* 27 | 28 | FROM base 29 | 30 | COPY --from=builder /mistralrs/target/release/mistralrs-bench /usr/local/bin/mistralrs-bench 31 | RUN chmod +x /usr/local/bin/mistralrs-bench 32 | COPY --from=builder /mistralrs/target/release/mistralrs-server /usr/local/bin/mistralrs-server 33 | RUN chmod +x /usr/local/bin/mistralrs-server 34 | ENTRYPOINT ["mistralrs-server", "--port", "80", "--token-source", "env:HUGGING_FACE_HUB_TOKEN"] -------------------------------------------------------------------------------- /mistralrs/examples/lora/main.rs: -------------------------------------------------------------------------------- 1 | use std::fs::File; 2 | 3 | use anyhow::Result; 4 | use mistralrs::{LoraModelBuilder, TextMessageRole, TextMessages, TextModelBuilder}; 5 | 6 | #[tokio::main] 7 | async fn main() -> Result<()> { 8 | let model = 9 | LoraModelBuilder::from_text_model_builder( 10 | TextModelBuilder::new("HuggingFaceH4/zephyr-7b-beta").with_logging(), 11 | "lamm-mit/x-lora", 12 | serde_json::from_reader(File::open("my-ordering-file.json").unwrap_or_else(|_| { 13 | panic!("Could not load ordering file at my-ordering-file.json") 14 | }))?, 15 | ) 16 | .build() 17 | .await?; 18 | 19 | let messages = TextMessages::new().add_message( 20 | TextMessageRole::User, 21 | "Hello! How are you? 
Please write generic binary search function in Rust.", 22 | ); 23 | 24 | let response = model.send_chat_request(messages).await?; 25 | 26 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 27 | dbg!( 28 | response.usage.avg_prompt_tok_per_sec, 29 | response.usage.avg_compl_tok_per_sec 30 | ); 31 | 32 | Ok(()) 33 | } 34 | -------------------------------------------------------------------------------- /examples/python/idefics2.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest, VisionArchitecture 2 | 3 | runner = Runner( 4 | which=Which.VisionPlain( 5 | model_id="lamm-mit/Cephalo-Idefics-2-vision-8b-beta", 6 | arch=VisionArchitecture.Idefics2, 7 | ), 8 | ) 9 | 10 | res = runner.send_chat_completion_request( 11 | ChatCompletionRequest( 12 | model="idefics2", 13 | messages=[ 14 | { 15 | "role": "user", 16 | "content": [ 17 | { 18 | "type": "image_url", 19 | "image_url": { 20 | "url": "https://d2r55xnwy6nx47.cloudfront.net/uploads/2018/02/Ants_Lede1300.jpg" 21 | }, 22 | }, 23 | { 24 | "type": "text", 25 | "text": "What is shown in this image?", 26 | }, 27 | ], 28 | }, 29 | ], 30 | max_tokens=256, 31 | presence_penalty=1.0, 32 | top_p=0.1, 33 | temperature=0.1, 34 | ) 35 | ) 36 | print(res.choices[0].message.content) 37 | print(res.usage) 38 | -------------------------------------------------------------------------------- /examples/python/speculative_xlora.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest 2 | 3 | runner = Runner( 4 | which=Which.XLoraGGUF( 5 | tok_model_id=None, # Automatically determine from ordering file 6 | quantized_model_id="TheBloke/zephyr-7B-beta-GGUF", 7 | quantized_filename="zephyr-7b-beta.Q4_0.gguf", 8 | xlora_model_id="lamm-mit/x-lora", 9 | order="orderings/xlora-paper-ordering.json", 10 | tgt_non_granular_index=None, 11 | ), 12 | which_draft=Which.GGUF( 13 | tok_model_id="mistralai/Mistral-7B-Instruct-v0.1", 14 | quantized_model_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF", 15 | quantized_filename="mistral-7b-instruct-v0.1.Q4_K_M.gguf", 16 | ), 17 | speculative_gamma=32, 18 | ) 19 | 20 | res = runner.send_chat_completion_request( 21 | ChatCompletionRequest( 22 | model="mistral", 23 | messages=[ 24 | {"role": "user", "content": "Tell me a story about the Rust type system."} 25 | ], 26 | max_tokens=256, 27 | presence_penalty=1.0, 28 | top_p=0.1, 29 | temperature=0.1, 30 | ) 31 | ) 32 | print(res.choices[0].message.content) 33 | print(res.usage) 34 | -------------------------------------------------------------------------------- /mistralrs/examples/mixture_of_quant_experts/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use mistralrs::{ 3 | IsqType, PagedAttentionMetaBuilder, TextMessageRole, TextMessages, TextModelBuilder, 4 | }; 5 | 6 | #[tokio::main] 7 | async fn main() -> Result<()> { 8 | let model = TextModelBuilder::new("microsoft/Phi-3.5-MoE-instruct") 9 | .with_isq(IsqType::Q4K) 10 | .with_mixture_qexperts_isq() 11 | .with_logging() 12 | .with_paged_attn(|| PagedAttentionMetaBuilder::default().build())? 13 | .build() 14 | .await?; 15 | 16 | let messages = TextMessages::new() 17 | .add_message( 18 | TextMessageRole::System, 19 | "You are an AI agent with a specialty in programming.", 20 | ) 21 | .add_message( 22 | TextMessageRole::User, 23 | "Hello! How are you? 
Please write generic binary search function in Rust.", 24 | ); 25 | 26 | let response = model.send_chat_request(messages).await?; 27 | 28 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 29 | dbg!( 30 | response.usage.avg_prompt_tok_per_sec, 31 | response.usage.avg_compl_tok_per_sec 32 | ); 33 | 34 | Ok(()) 35 | } 36 | -------------------------------------------------------------------------------- /mistralrs-quant/src/utils/mod.rs: -------------------------------------------------------------------------------- 1 | #[cfg(feature = "cuda")] 2 | mod ffi; 3 | pub(crate) mod isq; 4 | mod ops; 5 | 6 | mod uqff; 7 | 8 | pub use ops::{BitWiseOp, LeftshiftOp}; 9 | pub(crate) use uqff::{ 10 | deserialize_tensor, read_dtype, serialize_tensor, version_is_compatible, write_dtype, 11 | HQFF_VERSION, 12 | }; 13 | 14 | #[cfg(feature = "cuda")] 15 | use candle_core::{ 16 | cuda::{cudarc::driver::DevicePtr, CudaDType}, 17 | CudaDevice, Device, Storage, Tensor, WithDType, 18 | }; 19 | 20 | #[cfg(feature = "cuda")] 21 | pub(crate) fn get_cuda_slice<T: WithDType + CudaDType>( 22 | x: &Tensor, 23 | ) -> candle_core::Result<*const T> { 24 | let offset = x.layout().start_offset(); 25 | match &*x.storage_and_layout().0 { 26 | Storage::Cuda(a_storage) => { 27 | Ok(*a_storage.as_cuda_slice::<T>()?.slice(offset..).device_ptr() as *const T) 28 | } 29 | _ => candle_core::bail!("Expected CUDA storage."), 30 | } 31 | } 32 | 33 | #[cfg(feature = "cuda")] 34 | pub(crate) fn get_cuda_device(x: &Tensor) -> candle_core::Result<&CudaDevice> { 35 | match x.device() { 36 | Device::Cuda(dev) => Ok(dev), 37 | _ => candle_core::bail!("Expected CUDA device"), 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /examples/python/phi3v_local_img.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest, VisionArchitecture 2 | 3 | runner = Runner( 4 | which=Which.VisionPlain( 5 | model_id="microsoft/Phi-3.5-vision-instruct", 6 | arch=VisionArchitecture.Phi3V, 7 | ), 8 | ) 9 | 10 | FILENAME = "picture.jpg" 11 | 12 | res = runner.send_chat_completion_request( 13 | ChatCompletionRequest( 14 | model="phi3v", 15 | messages=[ 16 | { 17 | "role": "user", 18 | "content": [ 19 | { 20 | "type": "image_url", 21 | "image_url": { 22 | "url": FILENAME, 23 | }, 24 | }, 25 | { 26 | "type": "text", 27 | "text": "<|image_1|>\nWhat is shown in this image? 
Write a detailed response analyzing the scene.", 28 | }, 29 | ], 30 | } 31 | ], 32 | max_tokens=256, 33 | presence_penalty=1.0, 34 | top_p=0.1, 35 | temperature=0.1, 36 | ) 37 | ) 38 | print(res.choices[0].message.content) 39 | print(res.usage) 40 | -------------------------------------------------------------------------------- /mistralrs-pyo3/Cargo_template.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "mistralrs-pyo3" 3 | authors = ["Eric Buehler"] 4 | version.workspace = true 5 | edition.workspace = true 6 | description.workspace = true 7 | repository.workspace = true 8 | keywords.workspace = true 9 | categories.workspace = true 10 | license.workspace = true 11 | homepage.workspace = true 12 | 13 | [lib] 14 | name = "mistralrs" 15 | crate-type = ["cdylib"] 16 | doc = false 17 | 18 | [dependencies] 19 | pyo3.workspace = true 20 | mistralrs-core = { version = "0.3.1", path = "../mistralrs-core", features=["pyo3_macros","$feature_name"] } 21 | serde.workspace = true 22 | serde_json.workspace = true 23 | candle-core = { git = "https://github.com/EricLBuehler/candle.git", version = "0.7.0", rev = "60eb251", features=["$feature_name"] } 24 | indexmap.workspace = true 25 | accelerate-src = { workspace = true, optional = true } 26 | intel-mkl-src = { workspace = true, optional = true } 27 | either.workspace = true 28 | futures.workspace = true 29 | tokio.workspace = true 30 | image.workspace = true 31 | reqwest.workspace = true 32 | base64.workspace = true 33 | url.workspace = true 34 | data-url.workspace = true 35 | anyhow.workspace = true 36 | 37 | [build-dependencies] 38 | pyo3-build-config = "0.22" 39 | -------------------------------------------------------------------------------- /examples/python/anymoe.py: -------------------------------------------------------------------------------- 1 | from mistralrs import ( 2 | Runner, 3 | Which, 4 | ChatCompletionRequest, 5 | Architecture, 6 | AnyMoeConfig, 7 | AnyMoeExpertType, 8 | ) 9 | 10 | runner = Runner( 11 | which=Which.Plain( 12 | model_id="mistralai/Mistral-7B-Instruct-v0.1", 13 | arch=Architecture.Mistral, 14 | ), 15 | anymoe_config=AnyMoeConfig( 16 | hidden_size=4096, 17 | dataset_json="examples/amoe.json", 18 | prefix="model.layers", 19 | mlp="mlp", 20 | expert_type=AnyMoeExpertType.FineTuned(), 21 | lr=1e-3, 22 | epochs=100, 23 | batch_size=4, 24 | model_ids=["HuggingFaceH4/zephyr-7b-beta"], 25 | layers=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], 26 | loss_csv_path="loss.csv", 27 | ), 28 | ) 29 | 30 | res = runner.send_chat_completion_request( 31 | ChatCompletionRequest( 32 | model="mistral", 33 | messages=[ 34 | {"role": "user", "content": "Tell me a story about the Rust type system."} 35 | ], 36 | max_tokens=256, 37 | presence_penalty=1.0, 38 | top_p=0.1, 39 | temperature=0.1, 40 | ) 41 | ) 42 | print(res.choices[0].message.content) 43 | print(res.usage) 44 | -------------------------------------------------------------------------------- /examples/server/stream_completion_bench.py: -------------------------------------------------------------------------------- 1 | import openai 2 | from datetime import datetime 3 | 4 | Runs = 4 5 | 6 | ENDPOINT = "http://localhost:1234/v1/" 7 | 8 | 9 | def request(stream: bool): 10 | client = openai.Client(api_key="foobar", base_url=ENDPOINT) 11 | return client.chat.completions.create( 12 | model="mistral", 13 | messages=[ 14 | { 15 | "role": "user", 16 | "content": "What is the meaning of life? 
Write a long story.", 17 | } 18 | ], 19 | stream=stream, 20 | max_tokens=400, 21 | temperature=0.0, 22 | ) 23 | 24 | 25 | def run(): 26 | for run in range(Runs): 27 | print("\nCompletion: ") 28 | print("=" * 15) 29 | 30 | now = datetime.now() 31 | request(stream=False) 32 | finished = datetime.now() 33 | 34 | print(f"Duration: {finished-now}") 35 | 36 | print("\nStreaming: ") 37 | print("=" * 15) 38 | 39 | now = datetime.now() 40 | stream = request(stream=True) 41 | for _message in stream: 42 | pass 43 | finished = datetime.now() 44 | 45 | print(f"Duration: {finished-now}") 46 | 47 | 48 | if __name__ == "__main__": 49 | run() 50 | -------------------------------------------------------------------------------- /examples/python/phi3v.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest, VisionArchitecture 2 | 3 | runner = Runner( 4 | which=Which.VisionPlain( 5 | model_id="microsoft/Phi-3.5-vision-instruct", 6 | arch=VisionArchitecture.Phi3V, 7 | ), 8 | ) 9 | 10 | res = runner.send_chat_completion_request( 11 | ChatCompletionRequest( 12 | model="phi3v", 13 | messages=[ 14 | { 15 | "role": "user", 16 | "content": [ 17 | { 18 | "type": "image_url", 19 | "image_url": { 20 | "url": "https://www.nhmagazine.com/content/uploads/2019/05/mtwashingtonFranconia-2-19-18-108-Edit-Edit.jpg" 21 | }, 22 | }, 23 | { 24 | "type": "text", 25 | "text": "<|image_1|>\nWhat is shown in this image? Write a detailed response analyzing the scene.", 26 | }, 27 | ], 28 | } 29 | ], 30 | max_tokens=256, 31 | presence_penalty=1.0, 32 | top_p=0.1, 33 | temperature=0.1, 34 | ) 35 | ) 36 | print(res.choices[0].message.content) 37 | print(res.usage) 38 | -------------------------------------------------------------------------------- /mistralrs-core/src/gguf/chat_template.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use tracing::info; 3 | 4 | use crate::utils::gguf_metadata::ContentMetadata; 5 | 6 | use super::Content; 7 | 8 | struct PropsGGUFTemplate { 9 | chat_template: Option<String>, 10 | } 11 | 12 | impl TryFrom<ContentMetadata<'_>> for PropsGGUFTemplate { 13 | type Error = anyhow::Error; 14 | 15 | fn try_from(c: ContentMetadata) -> Result<Self, Self::Error> { 16 | // No required keys 17 | 18 | let props = Self { 19 | chat_template: c.get_option_value("chat_template")?, 20 | }; 21 | 22 | Ok(props) 23 | } 24 | } 25 | 26 | // Get chat template from GGUF metadata if it exists 27 | pub fn get_gguf_chat_template<R: std::io::Seek + std::io::Read>( 28 | content: &Content<'_, R>, 29 | ) -> Result<Option<String>> { 30 | let metadata = ContentMetadata { 31 | path_prefix: "tokenizer", 32 | metadata: content.get_metadata(), 33 | }; 34 | let props = PropsGGUFTemplate::try_from(metadata)?; 35 | if let Some(ref chat_template) = props.chat_template { 36 | info!( 37 | "Discovered and using GGUF chat template: `{}`", 38 | chat_template.replace('\n', "\\n") 39 | ); 40 | } 41 | Ok(props.chat_template) 42 | } 43 | -------------------------------------------------------------------------------- /mistralrs/examples/idefics2/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use mistralrs::{IsqType, TextMessageRole, VisionLoaderType, VisionMessages, VisionModelBuilder}; 3 | 4 | #[tokio::main] 5 | async fn main() -> Result<()> { 6 | let model = VisionModelBuilder::new( 7 | "HuggingFaceM4/idefics2-8b-chatty", 8 | VisionLoaderType::Idefics2, 9 | ) 10 | .with_isq(IsqType::Q4K) 11 | .with_logging() 12 | .build() 13 | 
.await?; 14 | 15 | let bytes = match reqwest::blocking::get( 16 | "https://d2r55xnwy6nx47.cloudfront.net/uploads/2018/02/Ants_Lede1300.jpg", 17 | ) { 18 | Ok(http_resp) => http_resp.bytes()?.to_vec(), 19 | Err(e) => anyhow::bail!(e), 20 | }; 21 | let image = image::load_from_memory(&bytes)?; 22 | 23 | let messages = VisionMessages::new().add_idefics_image_message( 24 | TextMessageRole::User, 25 | "What is depicted here? Please describe the scene in detail.", 26 | image, 27 | ); 28 | 29 | let response = model.send_chat_request(messages).await?; 30 | 31 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 32 | dbg!( 33 | response.usage.avg_prompt_tok_per_sec, 34 | response.usage.avg_compl_tok_per_sec 35 | ); 36 | 37 | Ok(()) 38 | } 39 | -------------------------------------------------------------------------------- /mistralrs/examples/llava_next/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use mistralrs::{IsqType, TextMessageRole, VisionLoaderType, VisionMessages, VisionModelBuilder}; 3 | 4 | #[tokio::main] 5 | async fn main() -> Result<()> { 6 | let model = VisionModelBuilder::new( 7 | "llava-hf/llava-v1.6-mistral-7b-hf", 8 | VisionLoaderType::LLaVANext, 9 | ) 10 | .with_isq(IsqType::Q4K) 11 | .with_logging() 12 | .build() 13 | .await?; 14 | 15 | let bytes = match reqwest::blocking::get( 16 | "https://d2r55xnwy6nx47.cloudfront.net/uploads/2018/02/Ants_Lede1300.jpg", 17 | ) { 18 | Ok(http_resp) => http_resp.bytes()?.to_vec(), 19 | Err(e) => anyhow::bail!(e), 20 | }; 21 | let image = image::load_from_memory(&bytes)?; 22 | 23 | let messages = VisionMessages::new().add_llava_image_message( 24 | TextMessageRole::User, 25 | "What is depicted here? Please describe the scene in detail.", 26 | image, 27 | ); 28 | 29 | let response = model.send_chat_request(messages).await?; 30 | 31 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 32 | dbg!( 33 | response.usage.avg_prompt_tok_per_sec, 34 | response.usage.avg_compl_tok_per_sec 35 | ); 36 | 37 | Ok(()) 38 | } 39 | -------------------------------------------------------------------------------- /mistralrs/examples/phi3v/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use mistralrs::{IsqType, TextMessageRole, VisionLoaderType, VisionMessages, VisionModelBuilder}; 3 | 4 | #[tokio::main] 5 | async fn main() -> Result<()> { 6 | let model = 7 | VisionModelBuilder::new("microsoft/Phi-3.5-vision-instruct", VisionLoaderType::Phi3V) 8 | .with_isq(IsqType::Q4K) 9 | .with_logging() 10 | .build() 11 | .await?; 12 | 13 | let bytes = match reqwest::blocking::get( 14 | "https://d2r55xnwy6nx47.cloudfront.net/uploads/2018/02/Ants_Lede1300.jpg", 15 | ) { 16 | Ok(http_resp) => http_resp.bytes()?.to_vec(), 17 | Err(e) => anyhow::bail!(e), 18 | }; 19 | let image = image::load_from_memory(&bytes)?; 20 | 21 | let messages = VisionMessages::new().add_phiv_image_message( 22 | TextMessageRole::User, 23 | "What is depicted here? 
Please describe the scene in detail.", 24 | image, 25 | ); 26 | 27 | let response = model.send_chat_request(messages).await?; 28 | 29 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 30 | dbg!( 31 | response.usage.avg_prompt_tok_per_sec, 32 | response.usage.avg_compl_tok_per_sec 33 | ); 34 | 35 | Ok(()) 36 | } 37 | -------------------------------------------------------------------------------- /examples/python/llava_next.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest, VisionArchitecture 2 | 3 | runner = Runner( 4 | which=Which.VisionPlain( 5 | model_id="llava-hf/llava-v1.6-mistral-7b-hf", 6 | arch=VisionArchitecture.LLaVANext, 7 | ), 8 | ) 9 | 10 | res = runner.send_chat_completion_request( 11 | ChatCompletionRequest( 12 | model="llava_next", 13 | messages=[ 14 | { 15 | "role": "user", 16 | "content": [ 17 | { 18 | "type": "image_url", 19 | "image_url": { 20 | "url": "https://www.nhmagazine.com/content/uploads/2019/05/mtwashingtonFranconia-2-19-18-108-Edit-Edit.jpg" 21 | }, 22 | }, 23 | { 24 | "type": "text", 25 | "text": "What is shown in this image? Write a detailed response analyzing the scene.", 26 | }, 27 | ], 28 | } 29 | ], 30 | max_tokens=256, 31 | presence_penalty=1.0, 32 | top_p=0.1, 33 | temperature=0.1, 34 | ) 35 | ) 36 | print(res.choices[0].message.content) 37 | print(res.usage) 38 | -------------------------------------------------------------------------------- /mistralrs-core/src/gguf/mod.rs: -------------------------------------------------------------------------------- 1 | mod chat_template; 2 | mod content; 3 | mod gguf_tokenizer; 4 | use strum::EnumString; 5 | 6 | use anyhow::{Context, Result}; 7 | pub(crate) use chat_template::get_gguf_chat_template; 8 | pub(crate) use content::Content; 9 | pub(crate) use gguf_tokenizer::{convert_gguf_to_hf_tokenizer, GgufTokenizerConversion}; 10 | use std::str::FromStr; 11 | 12 | pub const GGUF_MULTI_FILE_DELIMITER: &str = " "; 13 | 14 | #[derive(Debug, EnumString, Clone, Copy)] 15 | #[strum(serialize_all = "kebab-case")] 16 | pub enum GGUFArchitecture { 17 | Llama, 18 | Mpt, 19 | Gptneox, 20 | Gptj, 21 | Gpt2, 22 | Bloom, 23 | Falcon, 24 | Mamba, 25 | Rwkv, 26 | Phi2, 27 | Phi3, 28 | Starcoder2, 29 | } 30 | 31 | // Wraps from_str() for some convenience: 32 | // - Case-insensitive variant matching (TODO: is this desirable?) 
33 | // - Customized error until potential upstream support: https://github.com/Peternator7/strum/issues/332 34 | impl GGUFArchitecture { 35 | pub fn from_value<T: AsRef<str> + std::fmt::Display>(value: T) -> Result<Self> { 36 | Self::from_str(&value.as_ref().to_ascii_lowercase()) 37 | .with_context(|| format!("Unknown GGUF architecture `{value}`")) 38 | .map_err(anyhow::Error::msg) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /.github/workflows/release_python.yml: -------------------------------------------------------------------------------- 1 | name: py-release 2 | 3 | # gh workflow run py-release 4 | # This also runs on release deploy 5 | on: 6 | workflow_dispatch: 7 | release: 8 | types: [published] 9 | push: 10 | tags: 11 | - '**[0-9]+.[0-9]+.[0-9]+*' 12 | 13 | permissions: 14 | contents: write 15 | pages: write 16 | id-token: write 17 | 18 | jobs: 19 | upload: 20 | runs-on: ubuntu-latest 21 | strategy: 22 | fail-fast: false 23 | matrix: 24 | rust: [stable] 25 | 26 | steps: 27 | - name: Checkout 28 | uses: actions/checkout@v4 29 | 30 | - uses: actions-rs/toolchain@v1 31 | with: 32 | profile: minimal 33 | toolchain: ${{ matrix.rust }} 34 | override: true 35 | 36 | - name: Set up Python 37 | uses: actions/setup-python@v5 38 | with: 39 | python-version: 3.8 40 | 41 | - name: Upload 42 | env: 43 | PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} 44 | run: | 45 | sudo apt install libssl-dev 46 | sudo apt install pkg-config 47 | pip install maturin[patchelf]==1.4.0 twine 48 | python3 -m venv venv 49 | source venv/bin/activate 50 | cd mistralrs-pyo3 51 | python3 upload.py 52 | -------------------------------------------------------------------------------- /mistralrs-core/src/amoe/macros.rs: -------------------------------------------------------------------------------- 1 | #[macro_export] 2 | #[doc(hidden)] 3 | macro_rules! get_delta_from_lora_ab { 4 | ($vb_mlp:expr, $rank:expr, $alpha:expr, ($in_d:expr, $out_d:expr), $name:expr) => {{ 5 | let proj_a = $vb_mlp 6 | .pp($name) 7 | .pp("lora_A") 8 | .get(($rank, $in_d), "weight")?; 9 | let proj_b = $vb_mlp 10 | .pp($name) 11 | .pp("lora_B") 12 | .get(($out_d, $rank), "weight")?; 13 | let scale = if $rank > 0 { 14 | $alpha / $rank as f64 15 | } else { 16 | 1.0 17 | }; 18 | (proj_b.matmul(&proj_a)? * scale)? 19 | }}; 20 | } 21 | 22 | #[macro_export] 23 | #[doc(hidden)] 24 | macro_rules!
merge_delta { 25 | ($qmatmul:expr, $delta:expr) => { 26 | match &$qmatmul { 27 | QMatMul::Tensor(w) => QMatMul::Tensor((w + $delta)?), 28 | QMatMul::TensorF16(w) => QMatMul::TensorF16((w + $delta)?), 29 | QMatMul::QTensor(w) => { 30 | let (w, dtype) = (w.dequantize(&w.device())?, w.dtype()); 31 | QMatMul::QTensor(std::sync::Arc::new( 32 | candle_core::quantized::QTensor::quantize(&(w + $delta)?, dtype)?, 33 | )) 34 | } 35 | } 36 | }; 37 | } 38 | -------------------------------------------------------------------------------- /mistralrs-vision/tests/integration.rs: -------------------------------------------------------------------------------- 1 | use candle_core::Device; 2 | use image::{ColorType, DynamicImage}; 3 | use mistralrs_vision::{ApplyTransforms, InterpolateResize, Normalize, ToTensor, Transforms}; 4 | 5 | #[test] 6 | fn normalize() { 7 | let image = DynamicImage::new(3, 4, ColorType::Rgb8); 8 | let transforms = Transforms { 9 | input: &ToTensor, 10 | inner_transforms: &[&Normalize { 11 | mean: vec![0.5, 0.5, 0.5], 12 | std: vec![0.5, 0.5, 0.5], 13 | }], 14 | }; 15 | let transformed = image.apply(transforms, &Device::Cpu).unwrap(); 16 | assert_eq!(transformed.dims(), &[3, 4, 3]); 17 | } 18 | 19 | #[test] 20 | fn normalize_and_interpolate_resize() { 21 | let image = DynamicImage::new(300, 400, ColorType::Rgb8); 22 | let transforms = Transforms { 23 | input: &ToTensor, 24 | inner_transforms: &[ 25 | &Normalize { 26 | mean: vec![0.5, 0.5, 0.5], 27 | std: vec![0.5, 0.5, 0.5], 28 | }, 29 | &InterpolateResize { 30 | target_h: 336, 31 | target_w: 336, 32 | }, 33 | ], 34 | }; 35 | let transformed = image.apply(transforms, &Device::Cpu).unwrap(); 36 | assert_eq!(transformed.dims(), &[3, 336, 336]); 37 | } 38 | -------------------------------------------------------------------------------- /mistralrs/examples/llava/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use mistralrs::{IsqType, TextMessageRole, VisionLoaderType, VisionMessages, VisionModelBuilder}; 3 | 4 | #[tokio::main] 5 | async fn main() -> Result<()> { 6 | let model = VisionModelBuilder::new("llava-hf/llava-1.5-7b-hf", VisionLoaderType::LLaVA) 7 | .with_isq(IsqType::Q4K) 8 | .with_chat_template("chat_templates/vicuna.json") 9 | .with_logging() 10 | .build() 11 | .await?; 12 | 13 | let bytes = match reqwest::blocking::get( 14 | "https://d2r55xnwy6nx47.cloudfront.net/uploads/2018/02/Ants_Lede1300.jpg", 15 | ) { 16 | Ok(http_resp) => http_resp.bytes()?.to_vec(), 17 | Err(e) => anyhow::bail!(e), 18 | }; 19 | let image = image::load_from_memory(&bytes)?; 20 | 21 | let messages = VisionMessages::new().add_llava_image_message( 22 | TextMessageRole::User, 23 | "What is depicted here? 
Please describe the scene in detail.", 24 | image, 25 | ); 26 | 27 | let response = model.send_chat_request(messages).await?; 28 | 29 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 30 | dbg!( 31 | response.usage.avg_prompt_tok_per_sec, 32 | response.usage.avg_compl_tok_per_sec 33 | ); 34 | 35 | Ok(()) 36 | } 37 | -------------------------------------------------------------------------------- /examples/python/anymoe_inference.py: -------------------------------------------------------------------------------- 1 | from mistralrs import ( 2 | Runner, 3 | Which, 4 | ChatCompletionRequest, 5 | Architecture, 6 | AnyMoeConfig, 7 | AnyMoeExpertType, 8 | ) 9 | 10 | runner = Runner( 11 | which=Which.Plain( 12 | model_id="mistralai/Mistral-7B-Instruct-v0.1", 13 | arch=Architecture.Mistral, 14 | ), 15 | anymoe_config=AnyMoeConfig( 16 | hidden_size=4096, 17 | dataset_json="examples/amoe.json", 18 | prefix="model.layers", 19 | mlp="mlp", 20 | expert_type=AnyMoeExpertType.FineTuned(), 21 | lr=1e-3, 22 | epochs=100, 23 | batch_size=4, 24 | model_ids=["HuggingFaceH4/zephyr-7b-beta"], 25 | layers=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], 26 | gate_model_id="path/to/pretrained/gating_model_id", 27 | loss_csv_path="loss.csv", 28 | ), 29 | ) 30 | 31 | res = runner.send_chat_completion_request( 32 | ChatCompletionRequest( 33 | model="mistral", 34 | messages=[ 35 | {"role": "user", "content": "Tell me a story about the Rust type system."} 36 | ], 37 | max_tokens=256, 38 | presence_penalty=1.0, 39 | top_p=0.1, 40 | temperature=0.1, 41 | ) 42 | ) 43 | print(res.choices[0].message.content) 44 | print(res.usage) 45 | -------------------------------------------------------------------------------- /examples/python/anymoe_lora.py: -------------------------------------------------------------------------------- 1 | from mistralrs import ( 2 | Runner, 3 | Which, 4 | ChatCompletionRequest, 5 | Architecture, 6 | AnyMoeConfig, 7 | AnyMoeExpertType, 8 | ) 9 | 10 | runner = Runner( 11 | which=Which.Plain( 12 | model_id="mistralai/Mistral-7B-Instruct-v0.1", 13 | arch=Architecture.Mistral, 14 | ), 15 | anymoe_config=AnyMoeConfig( 16 | hidden_size=4096, 17 | dataset_json="examples/amoe.json", 18 | prefix="model.layers", 19 | mlp="mlp", 20 | expert_type=AnyMoeExpertType.LoraAdapter( 21 | rank=64, alpha=16.0, target_modules=["gate_proj"] 22 | ), 23 | lr=1e-3, 24 | epochs=100, 25 | batch_size=4, 26 | model_ids=["typeof/zephyr-7b-beta-lora"], 27 | # For inference (use a pretrained gating layer) see `anymoe_inference.py` 28 | loss_csv_path="loss.csv", 29 | ), 30 | ) 31 | 32 | res = runner.send_chat_completion_request( 33 | ChatCompletionRequest( 34 | model="mistral", 35 | messages=[ 36 | {"role": "user", "content": "Tell me a story about the Rust type system."} 37 | ], 38 | max_tokens=256, 39 | presence_penalty=1.0, 40 | top_p=0.1, 41 | temperature=0.1, 42 | ) 43 | ) 44 | print(res.choices[0].message.content) 45 | print(res.usage) 46 | -------------------------------------------------------------------------------- /mistralrs/examples/paged_attn/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use mistralrs::{ 3 | IsqType, MemoryGpuConfig, PagedAttentionMetaBuilder, TextMessageRole, TextMessages, 4 | TextModelBuilder, 5 | }; 6 | 7 | #[tokio::main] 8 | async fn main() -> Result<()> { 9 | let model = TextModelBuilder::new("microsoft/Phi-3.5-mini-instruct") 10 | .with_isq(IsqType::Q8_0) 11 | .with_logging() 12 | 
.with_paged_attn(|| { 13 | PagedAttentionMetaBuilder::default() 14 | .with_block_size(32) 15 | .with_gpu_memory(MemoryGpuConfig::ContextSize(1024)) 16 | .build() 17 | })? 18 | .build() 19 | .await?; 20 | 21 | let messages = TextMessages::new() 22 | .add_message( 23 | TextMessageRole::System, 24 | "You are an AI agent with a specialty in programming.", 25 | ) 26 | .add_message( 27 | TextMessageRole::User, 28 | "Hello! How are you? Please write generic binary search function in Rust.", 29 | ); 30 | 31 | let response = model.send_chat_request(messages).await?; 32 | 33 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 34 | dbg!( 35 | response.usage.avg_prompt_tok_per_sec, 36 | response.usage.avg_compl_tok_per_sec 37 | ); 38 | 39 | Ok(()) 40 | } 41 | -------------------------------------------------------------------------------- /examples/python/phi3v_base64.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest, VisionArchitecture 2 | import base64 3 | 4 | runner = Runner( 5 | which=Which.VisionPlain( 6 | model_id="microsoft/Phi-3.5-vision-instruct", 7 | arch=VisionArchitecture.Phi3V, 8 | ), 9 | ) 10 | 11 | FILENAME = "picture.jpg" 12 | with open(FILENAME, "rb") as image_file: 13 | encoded_string = base64.b64encode(image_file.read()).decode("utf-8") 14 | 15 | res = runner.send_chat_completion_request( 16 | ChatCompletionRequest( 17 | model="phi3v", 18 | messages=[ 19 | { 20 | "role": "user", 21 | "content": [ 22 | { 23 | "type": "image_url", 24 | "image_url": { 25 | "url": str(encoded_string), 26 | }, 27 | }, 28 | { 29 | "type": "text", 30 | "text": "<|image_1|>\nWhat is shown in this image? Write a detailed response analyzing the scene.", 31 | }, 32 | ], 33 | } 34 | ], 35 | max_tokens=256, 36 | presence_penalty=1.0, 37 | top_p=0.1, 38 | temperature=0.1, 39 | ) 40 | ) 41 | print(res.choices[0].message.content) 42 | print(res.usage) 43 | -------------------------------------------------------------------------------- /examples/python/llama_vision.py: -------------------------------------------------------------------------------- 1 | from mistralrs import Runner, Which, ChatCompletionRequest, VisionArchitecture 2 | 3 | # MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct" 4 | MODEL_ID = "lamm-mit/Cephalo-Llama-3.2-11B-Vision-Instruct-128k" 5 | 6 | runner = Runner( 7 | which=Which.VisionPlain( 8 | model_id=MODEL_ID, 9 | arch=VisionArchitecture.VLlama, 10 | ), 11 | ) 12 | 13 | res = runner.send_chat_completion_request( 14 | ChatCompletionRequest( 15 | model="llama-vision", 16 | messages=[ 17 | { 18 | "role": "user", 19 | "content": [ 20 | { 21 | "type": "image_url", 22 | "image_url": { 23 | "url": "https://www.nhmagazine.com/content/uploads/2019/05/mtwashingtonFranconia-2-19-18-108-Edit-Edit.jpg" 24 | }, 25 | }, 26 | { 27 | "type": "text", 28 | "text": "What is shown in this image? 
Write a detailed response analyzing the scene.", 29 | }, 30 | ], 31 | } 32 | ], 33 | max_tokens=256, 34 | presence_penalty=1.0, 35 | top_p=0.1, 36 | temperature=0.1, 37 | ) 38 | ) 39 | print(res.choices[0].message.content) 40 | print(res.usage) 41 | -------------------------------------------------------------------------------- /mistralrs/examples/llama_vision/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use mistralrs::{IsqType, TextMessageRole, VisionLoaderType, VisionMessages, VisionModelBuilder}; 3 | 4 | // const MODEL_ID: &str = "meta-llama/Llama-3.2-11B-Vision-Instruct"; 5 | const MODEL_ID: &str = "lamm-mit/Cephalo-Llama-3.2-11B-Vision-Instruct-128k"; 6 | 7 | #[tokio::main] 8 | async fn main() -> Result<()> { 9 | let model = VisionModelBuilder::new(MODEL_ID, VisionLoaderType::VLlama) 10 | .with_isq(IsqType::Q4K) 11 | .with_logging() 12 | .build() 13 | .await?; 14 | 15 | let bytes = match reqwest::blocking::get( 16 | "https://d2r55xnwy6nx47.cloudfront.net/uploads/2018/02/Ants_Lede1300.jpg", 17 | ) { 18 | Ok(http_resp) => http_resp.bytes()?.to_vec(), 19 | Err(e) => anyhow::bail!(e), 20 | }; 21 | let image = image::load_from_memory(&bytes)?; 22 | 23 | let messages = VisionMessages::new().add_vllama_image_message( 24 | TextMessageRole::User, 25 | "What is depicted here? Please describe the scene in detail.", 26 | image, 27 | ); 28 | 29 | let response = model.send_chat_request(messages).await?; 30 | 31 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 32 | dbg!( 33 | response.usage.avg_prompt_tok_per_sec, 34 | response.usage.avg_compl_tok_per_sec 35 | ); 36 | 37 | Ok(()) 38 | } 39 | -------------------------------------------------------------------------------- /mistralrs-pyo3/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "mistralrs-pyo3" 3 | authors = ["Eric Buehler"] 4 | version.workspace = true 5 | edition.workspace = true 6 | description.workspace = true 7 | repository.workspace = true 8 | keywords.workspace = true 9 | categories.workspace = true 10 | license.workspace = true 11 | homepage.workspace = true 12 | 13 | [lib] 14 | name = "mistralrs" 15 | crate-type = ["cdylib"] 16 | doc = false 17 | 18 | [dependencies] 19 | pyo3.workspace = true 20 | mistralrs-core = { version = "0.3.1", path = "../mistralrs-core", features = ["pyo3_macros"] } 21 | serde.workspace = true 22 | serde_json.workspace = true 23 | candle-core.workspace = true 24 | indexmap.workspace = true 25 | accelerate-src = { workspace = true, optional = true } 26 | intel-mkl-src = { workspace = true, optional = true } 27 | either.workspace = true 28 | futures.workspace = true 29 | tokio.workspace = true 30 | image.workspace = true 31 | reqwest.workspace = true 32 | base64.workspace = true 33 | url.workspace = true 34 | data-url.workspace = true 35 | anyhow.workspace = true 36 | 37 | [build-dependencies] 38 | pyo3-build-config = "0.22" 39 | 40 | [features] 41 | cuda = ["candle-core/cuda", "mistralrs-core/cuda"] 42 | cudnn = ["candle-core/cudnn", "mistralrs-core/cudnn"] 43 | metal = ["candle-core/metal", "mistralrs-core/metal"] 44 | flash-attn = ["cuda", "mistralrs-core/flash-attn"] 45 | accelerate = ["mistralrs-core/accelerate"] 46 | mkl = ["mistralrs-core/mkl"] 47 | -------------------------------------------------------------------------------- /docs/QUANTS.md: -------------------------------------------------------------------------------- 1 | # 
Quantization in mistral.rs 2 | 3 | Mistral.rs supports the following quantization: 4 | - GGUF/GGML 5 | - Q, K type 6 | - Supported in GGUF/GGML and GGUF/GGML adapter models 7 | - Supported in all plain and adapter models 8 | - I quants coming! 9 | - CPU, CUDA, Metal (all supported devices) 10 | - 2, 3, 4, 5, 6, 8 bit 11 | - GPTQ 12 | - Supported in all plain and adapter models 13 | - CUDA only 14 | - 2, 3, 4, 8 bit 15 | - HQQ 16 | - Supported in all plain and adapter models via ISQ 17 | - CUDA and CPU only 18 | - 4, 8 bit 19 | - ISQ 20 | - Q, K type GGUF quants 21 | - Supported in all plain and adapter models 22 | - HQQ quants 23 | - CPU, CUDA, Metal (all supported devices) 24 | 25 | ## Using a GGUF quantized model 26 | - Use the `gguf` (cli) / `GGUF` (Python) model selector 27 | - Provide the GGUF file 28 | 29 | ``` 30 | cargo run --features cuda -- -i gguf -f my-gguf-file.gguf 31 | ``` 32 | 33 | ## Using ISQ 34 | See the [docs](ISQ.md) 35 | 36 | ``` 37 | cargo run --features cuda -- -i --isq Q4K plain -m microsoft/Phi-3-mini-4k-instruct -a phi3 38 | ``` 39 | 40 | ## Using a GPTQ quantized model 41 | - Use the `plain` (cli) / `Plain` (Python) model selector 42 | - Provide the model ID for the GPTQ model 43 | - Mistral.rs will automatically detect and use GPTQ quantization. 44 | 45 | ``` 46 | cargo run --features cuda -- -i plain -m kaitchup/Phi-3-mini-4k-instruct-gptq-4bit -a phi3 47 | ``` -------------------------------------------------------------------------------- /mistralrs-core/src/utils/debug.rs: -------------------------------------------------------------------------------- 1 | use candle_core::{Device, DeviceLocation}; 2 | use tracing::level_filters::LevelFilter; 3 | use tracing_subscriber::EnvFilter; 4 | 5 | use crate::DEBUG; 6 | 7 | static LOGGER: std::sync::OnceLock<()> = std::sync::OnceLock::new(); 8 | 9 | /// This should be called to initialize the debug flag and logging. 10 | /// This should not be called in mistralrs-core code due to Rust usage. 
11 | pub fn initialize_logging() { 12 | let is_debug = std::env::var("MISTRALRS_DEBUG") 13 | .unwrap_or_default() 14 | .contains('1'); 15 | DEBUG.store(is_debug, std::sync::atomic::Ordering::Relaxed); 16 | 17 | LOGGER.get_or_init(|| { 18 | let filter = EnvFilter::builder() 19 | .with_default_directive(if is_debug { 20 | LevelFilter::DEBUG.into() 21 | } else { 22 | LevelFilter::INFO.into() 23 | }) 24 | .from_env_lossy(); 25 | tracing_subscriber::fmt().with_env_filter(filter).init(); 26 | }); 27 | } 28 | 29 | pub(crate) trait DeviceRepr { 30 | fn device_pretty_repr(&self) -> String; 31 | } 32 | 33 | impl DeviceRepr for Device { 34 | fn device_pretty_repr(&self) -> String { 35 | match self.location() { 36 | DeviceLocation::Cpu => "cpu".to_string(), 37 | DeviceLocation::Cuda { gpu_id } => format!("cuda[{gpu_id}]"), 38 | DeviceLocation::Metal { gpu_id } => format!("metal[{gpu_id}]"), 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /scripts/testgen_vision.py: -------------------------------------------------------------------------------- 1 | import transformers 2 | 3 | processor = transformers.AutoProcessor.from_pretrained( 4 | "microsoft/Phi-3.5-vision-instruct", trust_remote_code=True 5 | ) 6 | 7 | res = processor.tokenizer.apply_chat_template( 8 | [ 9 | { 10 | "role": "system", 11 | "content": [{"type": "text", "text": "You are a helpful assistant"}], 12 | }, 13 | { 14 | "role": "user", 15 | "content": [ 16 | {"type": "image"}, 17 | { 18 | "type": "text", 19 | "text": "Hello, please describe the above.", 20 | }, 21 | ], 22 | }, 23 | {"role": "assistant", "content": [{"type": "text", "text": "Hi there"}]}, 24 | { 25 | "role": "user", 26 | "content": [ 27 | {"type": "image"}, 28 | {"type": "text", "text": "This is me, who are you"}, 29 | ], 30 | }, 31 | { 32 | "role": "assistant", 33 | "content": [{"type": "text", "text": " I am an assistant "}], 34 | }, 35 | { 36 | "role": "user", 37 | "content": [ 38 | {"type": "image"}, 39 | {"type": "text", "text": "Another question, what is this?"}, 40 | ], 41 | }, 42 | ], 43 | add_generation_prompt=True, 44 | tokenize=False, 45 | ) 46 | print(res.replace("\n", "\\n")) 47 | -------------------------------------------------------------------------------- /mistralrs-quant/src/gptq/ffi.rs: -------------------------------------------------------------------------------- 1 | use half::f16; 2 | 3 | #[allow(dead_code)] 4 | extern "C" { 5 | pub(crate) fn reconstruct_exllama( 6 | b_q_weight: *const u32, 7 | b_gptq_qzeros: *const u32, 8 | b_gptq_scales: *const f16, 9 | b_q_perm: *const i32, 10 | out: *mut f16, 11 | size_k: i32, 12 | size_n: i32, 13 | groups: i32, 14 | bit: i32, 15 | ); 16 | 17 | pub(crate) fn reconstruct_gptq( 18 | b_q_weight: *const u32, 19 | b_gptq_qzeros: *const u32, 20 | b_gptq_scales: *const f16, 21 | b_q_perm: *const i32, 22 | out: *mut f16, 23 | size_k: i32, 24 | size_n: i32, 25 | groups: i32, 26 | bit: i32, 27 | ); 28 | 29 | pub(crate) fn gemm_half_q_half_cuda_part( 30 | a: *const f16, 31 | b_q_weight: *const u32, 32 | b_gptq_qzeros: *const u32, 33 | b_gptq_scales: *const f16, 34 | b_q_perm: *const i32, 35 | out: *mut f16, 36 | m: i32, 37 | n: i32, 38 | k: i32, 39 | m_count: i32, 40 | groups: i32, 41 | bit: i32, 42 | ); 43 | 44 | pub(crate) fn gemm_half_q_half_alt( 45 | a: *const f16, 46 | b_q_weight: *const u32, 47 | b_gptq_qzeros: *const u32, 48 | b_gptq_scales: *const f16, 49 | b_g_idx: *const i32, 50 | out: *mut f16, 51 | m: i32, 52 | n: i32, 53 | k: i32, 54 | bit: i32, 55 | ); 
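// Hedged reading of the signatures above: these mirror the exllama/GPTQ CUDA kernels —
// `b_q_weight` is the bit-packed quantized weight, `b_gptq_qzeros`/`b_gptq_scales` hold the
// per-group zero points and scales, `b_q_perm`/`b_g_idx` carry the optional act-order
// permutation/group index, `m`/`n`/`k` are the usual GEMM dimensions, and `bit` is the
// quantization width. All pointers are raw device pointers, so callers must uphold the usual
// unsafe-FFI invariants (valid device allocations, matching shapes and dtypes).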
56 | } 57 | -------------------------------------------------------------------------------- /examples/server/regex.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | 3 | client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/") 4 | 5 | BULLET_LIST_REGEX = "(- [^\n]*\n)+(- [^\n]*)(\n\n)?" 6 | 7 | completion = client.chat.completions.create( 8 | model="mistral", 9 | messages=[ 10 | { 11 | "role": "user", 12 | "content": "Write a list of jokes. Return a markdown list where each item is a joke.", 13 | } 14 | ], 15 | max_tokens=256, 16 | frequency_penalty=1.0, 17 | top_p=0.1, 18 | temperature=0, 19 | extra_body={"grammar": {"type": "regex", "value": BULLET_LIST_REGEX}}, 20 | ) 21 | 22 | print(completion.choices[0].message.content) 23 | 24 | print("---") 25 | 26 | # The following does token healing. Prompting the model to continue after a space usually breaks 27 | # the text because the model wants to start the new token with a space. By setting the a space after 28 | # "Sure!" we guarantee a space after "Sure!" but we haven't forced which token that starts with space should be used yet. 29 | 30 | completion = client.chat.completions.create( 31 | model="mistral", 32 | messages=[ 33 | { 34 | "role": "user", 35 | "content": "Tell me a joke.", 36 | } 37 | ], 38 | max_tokens=256, 39 | frequency_penalty=1.0, 40 | top_p=0.1, 41 | temperature=0, 42 | extra_body={"grammar": {"type": "regex", "value": "Sure! (?s:.)*"}}, 43 | ) 44 | 45 | print(completion.choices[0].message.content) 46 | -------------------------------------------------------------------------------- /examples/amoe.json: -------------------------------------------------------------------------------- 1 | { 2 | "rows": [ 3 | { 4 | "prompt": "Discuss the impact of Renaissance art on modern aesthetics", 5 | "expert": 0 6 | }, 7 | { 8 | "prompt": "Explain the significance of the theory of relativity in modern physics", 9 | "expert": 1 10 | }, 11 | { 12 | "prompt": "Analyze the themes of existentialism in 20th-century literature", 13 | "expert": 0 14 | }, 15 | { 16 | "prompt": "Describe the process of photosynthesis and its importance to ecosystems", 17 | "expert": 1 18 | }, 19 | { 20 | "prompt": "Evaluate the role of classical music in contemporary film scores", 21 | "expert": 0 22 | }, 23 | { 24 | "prompt": "Outline the steps of the scientific method and their importance in experiments", 25 | "expert": 1 26 | }, 27 | { 28 | "prompt": "Compare and contrast the philosophies of Socrates and Nietzsche", 29 | "expert": 0 30 | }, 31 | { 32 | "prompt": "Discuss the ethical implications of artificial intelligence in society", 33 | "expert": 1 34 | }, 35 | { 36 | "prompt": "Interpret the symbolism in Salvador Dalí's paintings", 37 | "expert": 0 38 | }, 39 | { 40 | "prompt": "Describe the function and structure of DNA in genetic inheritance", 41 | "expert": 1 42 | } 43 | ] 44 | } 45 | -------------------------------------------------------------------------------- /mistralrs-core/src/utils/tokens.rs: -------------------------------------------------------------------------------- 1 | use std::{env, fs}; 2 | use thiserror::Error; 3 | 4 | use anyhow::Result; 5 | use tracing::info; 6 | 7 | use crate::pipeline::TokenSource; 8 | 9 | #[derive(Error, Debug)] 10 | enum TokenRetrievalError { 11 | #[error("No home directory.")] 12 | HomeDirectoryMissing, 13 | } 14 | 15 | /// This reads a token from a specified source. 
If the token cannot be read, a warning is logged with `tracing` 16 | /// and *no token is used*. 17 | pub(crate) fn get_token(source: &TokenSource) -> Result> { 18 | fn skip_token(input: &str) -> Option { 19 | info!("Could not load token at {input:?}, using no HF token."); 20 | None 21 | } 22 | 23 | let token = match source { 24 | TokenSource::Literal(data) => Some(data.clone()), 25 | TokenSource::EnvVar(envvar) => env::var(envvar).ok().or_else(|| skip_token(envvar)), 26 | TokenSource::Path(path) => fs::read_to_string(path).ok().or_else(|| skip_token(path)), 27 | TokenSource::CacheToken => { 28 | let home = format!( 29 | "{}/.cache/huggingface/token", 30 | dirs::home_dir() 31 | .ok_or(TokenRetrievalError::HomeDirectoryMissing)? 32 | .display() 33 | ); 34 | 35 | fs::read_to_string(home.clone()) 36 | .ok() 37 | .or_else(|| skip_token(&home)) 38 | } 39 | TokenSource::None => None, 40 | }; 41 | 42 | Ok(token.map(|s| s.trim().to_string())) 43 | } 44 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: docs 2 | #https://dev.to/deciduously/prepare-your-rust-api-docs-for-github-pages-2n5i 3 | on: 4 | push: 5 | branches: ["master"] 6 | 7 | workflow_dispatch: 8 | 9 | permissions: 10 | contents: write 11 | pages: write 12 | id-token: write 13 | 14 | concurrency: 15 | group: "pages" 16 | cancel-in-progress: false 17 | 18 | jobs: 19 | deploy: 20 | runs-on: ubuntu-latest 21 | strategy: 22 | matrix: 23 | rust: [stable] 24 | steps: 25 | - name: Checkout 26 | uses: actions/checkout@v4 27 | - uses: actions-rs/toolchain@v1 28 | with: 29 | profile: minimal 30 | toolchain: ${{ matrix.rust }} 31 | override: true 32 | - name: Setup Pages 33 | uses: actions/configure-pages@v5 34 | - uses: actions-rs/cargo@v1 35 | with: 36 | command: doc 37 | args: --no-deps 38 | - name: Build docs 39 | run: | 40 | rm -rf ./docs 41 | echo "" > target/doc/index.html 42 | cp -r target/doc ./docs 43 | - name: Build Python docs 44 | run: | 45 | python3 -m venv myenv 46 | source myenv/bin/activate 47 | pip install maturin[patchelf] pdoc 48 | cd mistralrs-pyo3 49 | maturin develop 50 | cd .. 51 | pdoc mistralrs -o ./docs/pyo3 52 | - name: Deploy 53 | uses: JamesIves/github-pages-deploy-action@v4 54 | with: 55 | folder: ./docs -------------------------------------------------------------------------------- /chat_templates/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\\n\\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don\\'t know the answer to a question, please don\\'t share false information.' 
%}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<>\\n' + content.strip() + '\\n<>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}" 3 | } -------------------------------------------------------------------------------- /mistralrs-server/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "mistralrs-server" 3 | readme = "README.md" 4 | authors = ["Eric Buehler"] 5 | version.workspace = true 6 | edition.workspace = true 7 | description.workspace = true 8 | repository.workspace = true 9 | keywords.workspace = true 10 | categories.workspace = true 11 | license.workspace = true 12 | homepage.workspace = true 13 | default-run = "mistralrs-server" 14 | 15 | [dependencies] 16 | anyhow.workspace = true 17 | ctrlc = "3.4.4" 18 | candle-core.workspace = true 19 | serde.workspace = true 20 | serde_json.workspace = true 21 | axum = { version = "0.7.4", features = ["tokio"] } 22 | tower-http = { version = "0.5.1", features = ["cors"]} 23 | utoipa = { version = "4.2", features = ["axum_extras"] } 24 | utoipa-swagger-ui = { version = "7.1.0", features = ["axum"]} 25 | mistralrs-core = { version = "0.3.1", path = "../mistralrs-core" } 26 | indexmap.workspace = true 27 | accelerate-src = { workspace = true, optional = true } 28 | intel-mkl-src = { workspace = true, optional = true } 29 | futures.workspace = true 30 | tracing.workspace = true 31 | tokio.workspace = true 32 | either.workspace = true 33 | clap.workspace = true 34 | once_cell.workspace=true 35 | reqwest.workspace = true 36 | image.workspace = true 37 | url.workspace = true 38 | data-url.workspace = true 39 | 40 | [features] 41 | cuda = ["mistralrs-core/cuda"] 42 | cudnn = ["mistralrs-core/cudnn"] 43 | metal = ["mistralrs-core/metal"] 44 | flash-attn = ["cuda", "mistralrs-core/flash-attn"] 45 | accelerate = ["mistralrs-core/accelerate"] 46 | mkl = ["mistralrs-core/mkl"] 47 | -------------------------------------------------------------------------------- /docs/NON_GRANULAR.md: -------------------------------------------------------------------------------- 1 | # X-LoRA non-granular scalings 2 | 3 | A key limitation of the X-LoRA architecture is the need for 2 forward passes of the model per generation step. To trade off model performance for speed, mistral.rs allows the user to reduce the granularity of the scalings by caching them in a technique we call Non Granular Scalings. 4 | 5 | ## How it works 6 | For the first $k$ generation steps, the scalings are calculated normally for each token. However, for the rest of the tokens, it is cached and re-used. In this way, we are able to avoid the second forward pass and the performance is increased significantly. To maintain correctness, enabling non-granular scalings will restrict the engine to processing one sequence at a time. 
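The caching rule itself is small. A minimal sketch, assuming a single cached scalings value per sequence (the type and names below are illustrative, not the actual mistral.rs internals):

```rust
/// Return the X-LoRA scalings for one generation step, re-using the cached value
/// once `step >= k` (the `--tgt-non-granular-index`).
fn scalings_for_step<T: Clone>(
    step: usize,
    k: usize,
    cache: &mut Option<T>,
    compute_scalings: impl FnOnce() -> T, // the extra "scalings" forward pass
) -> T {
    match cache {
        // After the first k steps, skip the second forward pass entirely.
        Some(cached) if step >= k => cached.clone(),
        _ => {
            let scalings = compute_scalings(); // normal two-pass behavior
            *cache = Some(scalings.clone());
            scalings
        }
    }
}
```

Because the cached value is tied to a single sequence, this is also why the engine is restricted to one sequence at a time when the option is enabled.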
7 | 8 | ## How to use it 9 | ### Command line 10 | This can be enabled by passing `--tgt-non-granular-index` followed by $k$: 11 | ``` 12 | ./mistralrs-server --port 1234 x-lora-plain -o orderings/xlora-paper-ordering.json -x lamm-mit/x-lora --tgt-non-granular-index 5 13 | ``` 14 | 15 | ### Python 16 | Set the `tgt_non_granular_index` attribute to a non-`None` value in the `Which` selection: 17 | ```py 18 | from mistralrs import Runner, Which 19 | 20 | runner = Runner( 21 | which=Which.XLoraGGUF( 22 | tok_model_id=None, # Automatically determine from ordering file 23 | quantized_model_id="TheBloke/zephyr-7B-beta-GGUF", 24 | quantized_filename="zephyr-7b-beta.Q4_0.gguf", 25 | xlora_model_id="lamm-mit/x-lora", 26 | order="orderings/xlora-paper-ordering.json", 27 | tgt_non_granular_index=5, 28 | ) 29 | ) 30 | 31 | ... 32 | ``` -------------------------------------------------------------------------------- /mistralrs-quant/kernels/gptq/qdq_util.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | Copied from https://github.com/turboderp/exllamav2 3 | */ 4 | 5 | #ifndef _qdq_util_cuh 6 | #define _qdq_util_cuh 7 | 8 | union half2_uint32 { 9 | uint32_t as_uint32; 10 | half2 as_half2; 11 | __device__ half2_uint32(uint32_t val) : as_uint32(val) {} 12 | __device__ half2_uint32(half2 val) : as_half2(val) {} 13 | }; 14 | 15 | union half_uint16 { 16 | uint16_t as_uint16; 17 | half as_half; 18 | __device__ half_uint16(uint16_t val) : as_uint16(val) {} 19 | __device__ half_uint16(half val) : as_half(val) {} 20 | }; 21 | 22 | // Max_scale premultiplied by 1/256 23 | 24 | __forceinline__ __device__ half dq_scale(const int qs, const half max_scale) { 25 | int qs_i = qs + 1; 26 | half qs_h = __int2half_rn(qs_i * qs_i); 27 | qs_h = __hmul(qs_h, max_scale); 28 | return qs_h; 29 | } 30 | 31 | __forceinline__ __device__ half dq(const int q, const int qzero, 32 | const half scale) { 33 | return __hmul(__int2half_rn(q - qzero), scale); 34 | } 35 | 36 | __forceinline__ __device__ half dq_ns(const int q, const int qzero) { 37 | // return __hsub(__int2half_rn(q), __int2half_rn(qzero)); 38 | return __int2half_rn(q - qzero); 39 | } 40 | 41 | __forceinline__ __device__ int exb(const uint32_t q, const int shift, 42 | const int mask) { 43 | return (int)((q >> shift) & mask); 44 | } 45 | 46 | __forceinline__ __device__ int exb(const uint32_t q1, const uint32_t q0, 47 | const int shift, const int mask) { 48 | return (int)(__funnelshift_rc(q0, q1, shift) & mask); 49 | } 50 | 51 | #endif 52 | -------------------------------------------------------------------------------- /scripts/create_ordering.py: -------------------------------------------------------------------------------- 1 | from peft.tuners import lora 2 | from transformers import AutoModelForCausalLM # type: ignore 3 | import json 4 | from peft.tuners.lora.config import LoraConfig 5 | 6 | model_id = input("Enter the base model ID: ") 7 | target_modules_in = input("Enter the target modules as a comma delimited list: ") 8 | target_modules = target_modules_in.split(",") 9 | target_modules = [x for x in target_modules if len(x) > 0] 10 | target_modules = [x.strip() for x in target_modules] 11 | 12 | model = AutoModelForCausalLM.from_pretrained(model_id) 13 | lora_config = LoraConfig(target_modules=target_modules, init_lora_weights=False) 14 | 15 | model.add_adapter(lora_config, "default") 16 | 17 | total_swapped = 0 18 | loras = {} 19 | for n, module in model.named_modules(): 20 | if isinstance(module, lora.Linear): 21 | 
loras[n.split("lora_A.")[0]] = total_swapped 22 | total_swapped += 1 23 | elif isinstance(module, lora.Embedding): 24 | loras[n.split("lora_embedding_A.")[0]] = total_swapped 25 | total_swapped += 1 26 | elif isinstance(module, lora.Conv2d): 27 | loras[n.split("lora_A.")[0]] = total_swapped 28 | total_swapped += 1 29 | 30 | adapters_in = input( 31 | "Enter a comma delimited list of adapter names as they were specified when training: " 32 | ) 33 | adapters = adapters_in.split(",") 34 | adapters = [x for x in adapters if len(x) > 0] 35 | adapters = [x.strip() for x in adapters] 36 | 37 | out = {"order": adapters, "layers": loras, "base_model_id": model_id} 38 | 39 | outfile = input("Enter output file: ") 40 | with open(outfile, "w") as f: 41 | f.write(json.dumps(out)) 42 | -------------------------------------------------------------------------------- /mistralrs/examples/isq/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use mistralrs::{ 3 | IsqType, PagedAttentionMetaBuilder, TextMessageRole, TextMessages, TextModelBuilder, 4 | }; 5 | 6 | #[tokio::main] 7 | async fn main() -> Result<()> { 8 | let model = TextModelBuilder::new("microsoft/Phi-3.5-mini-instruct") 9 | .with_isq(IsqType::Q8_0) 10 | .with_logging() 11 | .with_paged_attn(|| PagedAttentionMetaBuilder::default().build())? 12 | .build() 13 | .await?; 14 | 15 | let messages = TextMessages::new() 16 | .add_message( 17 | TextMessageRole::System, 18 | "You are an AI agent with a specialty in programming.", 19 | ) 20 | .add_message( 21 | TextMessageRole::User, 22 | "Hello! How are you? Please write generic binary search function in Rust.", 23 | ); 24 | 25 | let response = model.send_chat_request(messages).await?; 26 | 27 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 28 | dbg!( 29 | response.usage.avg_prompt_tok_per_sec, 30 | response.usage.avg_compl_tok_per_sec 31 | ); 32 | 33 | // Next example: re-ISQ the model at runtime 34 | model.re_isq_model(IsqType::HQQ4).await?; 35 | 36 | let messages = TextMessages::new().add_message(TextMessageRole::User, "Why is the sky blue?"); 37 | 38 | let response = model.send_chat_request(messages).await?; 39 | 40 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 41 | dbg!( 42 | response.usage.avg_prompt_tok_per_sec, 43 | response.usage.avg_compl_tok_per_sec 44 | ); 45 | 46 | Ok(()) 47 | } 48 | -------------------------------------------------------------------------------- /mistralrs-bench/README.md: -------------------------------------------------------------------------------- 1 | # `mistralrs-bench` 2 | 3 | This is our official benchmarking application, which allows you to collect structured information about the speed of `mistral.rs`. 4 | 5 | > [!NOTE] 6 | > You should replace `--features ...` with one of the features specified [here](../README.md#supported-accelerators), or remove it for pure CPU inference. 7 | 8 | To run: `cargo run --release --features ... --package mistralrs-bench` 9 | 10 | ```bash 11 | Fast and easy LLM serving. 
12 | 13 | Usage: mistralrs-bench [OPTIONS] 14 | 15 | Commands: 16 | plain Select a plain model 17 | x-lora Select an X-LoRA architecture 18 | lora Select a LoRA architecture 19 | gguf Select a GGUF model 20 | x-lora-gguf Select a GGUF model with X-LoRA 21 | lora-gguf Select a GGUF model with LoRA 22 | ggml Select a GGML model 23 | x-lora-ggml Select a GGML model with X-LoRA 24 | lora-ggml Select a GGML model with LoRA 25 | help Print this message or the help of the given subcommand(s) 26 | 27 | Options: 28 | -p, --n-prompt 29 | Number of prompt tokens to run [default: 512] 30 | -g, --n-gen 31 | Number of generations tokens to run [default: 128] 32 | -c, --concurrency 33 | Number of concurrent requests to run. Default is 1 34 | -r, --repetitions 35 | Number of times to repeat each test [default: 5] 36 | -n, --num-device-layers 37 | Number of device layers to load and run on the device. All others will be on the CPU 38 | -h, --help 39 | Print help 40 | -V, --version 41 | Print version 42 | ``` -------------------------------------------------------------------------------- /examples/server/completion.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | import httpx 3 | import textwrap 4 | import json 5 | 6 | 7 | def log_response(response: httpx.Response): 8 | request = response.request 9 | print(f"Request: {request.method} {request.url}") 10 | print(" Headers:") 11 | for key, value in request.headers.items(): 12 | if key.lower() == "authorization": 13 | value = "[...]" 14 | if key.lower() == "cookie": 15 | value = value.split("=")[0] + "=..." 16 | print(f" {key}: {value}") 17 | print(" Body:") 18 | try: 19 | request_body = json.loads(request.content) 20 | print(textwrap.indent(json.dumps(request_body, indent=2), " ")) 21 | except json.JSONDecodeError: 22 | print(textwrap.indent(request.content.decode(), " ")) 23 | print(f"Response: status_code={response.status_code}") 24 | print(" Headers:") 25 | for key, value in response.headers.items(): 26 | if key.lower() == "set-cookie": 27 | value = value.split("=")[0] + "=..." 
28 | print(f" {key}: {value}") 29 | 30 | 31 | client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/") 32 | 33 | # Enable this to log requests and responses 34 | # client._client = httpx.Client( 35 | # event_hooks={"request": [print], "response": [log_response]} 36 | # ) 37 | 38 | while True: 39 | prompt = input(">>> ") 40 | completion = client.completions.create( 41 | model="mistral", 42 | prompt=prompt, 43 | max_tokens=256, 44 | frequency_penalty=1.0, 45 | top_p=0.1, 46 | temperature=0, 47 | ) 48 | resp = completion.choices[0].text 49 | print(resp) 50 | -------------------------------------------------------------------------------- /mistralrs-core/src/vision_models/image_processor.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)] 2 | 3 | use candle_core::{Device, Result, Tensor}; 4 | use image::DynamicImage; 5 | 6 | use crate::pipeline::InputsProcessor; 7 | 8 | use super::preprocessor_config::PreProcessorConfig; 9 | 10 | #[allow(dead_code)] 11 | pub(crate) struct PreprocessedImages { 12 | /// Without batch size, safe to unsqueeze & concat in dim0 13 | pub(crate) pixel_values: Tensor, 14 | /// Without batch size, safe to unsqueeze & concat in dim0 15 | pub(crate) pixel_attention_mask: Option, 16 | pub(crate) image_sizes: Option<(usize, usize)>, 17 | pub(crate) num_img_tokens: Option>, 18 | /// Without batch size, safe to unsqueeze & concat in dim0 19 | pub(crate) aspect_ratio_ids: Option, 20 | /// Without batch size, safe to unsqueeze & concat in dim0 21 | pub(crate) aspect_ratio_mask: Option, 22 | /// Without batch size 23 | pub(crate) num_tiles: Option>, 24 | } 25 | 26 | /// ImagePreProcessor: process images for the model (similar to `InputsProcessor`, typically called by it) 27 | pub trait ImagePreProcessor: InputsProcessor { 28 | const DEFAULT_MEAN: [f64; 3]; 29 | const DEFAULT_STD: [f64; 3]; 30 | 31 | /// Preprocess the images for a specific batch. 32 | /// `(bs, max_num_images)`, max_num_images is the max images per batches. 33 | #[allow(clippy::too_many_arguments)] 34 | fn preprocess( 35 | &self, 36 | images: Vec, 37 | config: &PreProcessorConfig, 38 | device: &Device, 39 | batch_info: (usize, usize), 40 | ) -> Result; 41 | } 42 | -------------------------------------------------------------------------------- /mistralrs-core/src/amoe/inputs.rs: -------------------------------------------------------------------------------- 1 | use std::{fs::File, path::Path}; 2 | 3 | use csv::Reader; 4 | use serde::Deserialize; 5 | 6 | pub struct AnyMoeTrainingResult { 7 | pub steps: usize, 8 | /// One for each gating layer 9 | pub final_loss: Vec, 10 | } 11 | 12 | #[derive(Deserialize, Debug)] 13 | pub struct AnyMoeTrainingInputRow { 14 | pub prompt: String, 15 | pub expert: usize, 16 | pub image_urls: Option>, 17 | } 18 | 19 | #[derive(Deserialize, Debug)] 20 | pub struct AnyMoeTrainingInputs { 21 | rows: Vec, 22 | } 23 | 24 | impl AnyMoeTrainingInputs { 25 | /// From a CSV file with the mandatory columns `prompt` (String), `expert` (usize), and the optional 26 | /// column `image_urls` (`Vec`). 
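///
/// A hedged illustration of the expected layout (rows borrowed from `examples/amoe.json`;
/// the optional `image_urls` column is omitted here, and exact quoting/column requirements
/// follow whatever the `csv` crate accepts):
///
/// ```text
/// prompt,expert
/// "Discuss the impact of Renaissance art on modern aesthetics",0
/// "Explain the significance of the theory of relativity in modern physics",1
/// ```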
27 | pub fn from_csv>(file: P) -> anyhow::Result { 28 | let file = File::open(file)?; 29 | let mut reader = Reader::from_reader(file); 30 | let mut rows = Vec::new(); 31 | for result in reader.deserialize() { 32 | let row: AnyMoeTrainingInputRow = result?; 33 | rows.push(row); 34 | } 35 | Ok(Self { rows }) 36 | } 37 | 38 | /// From a JSON file with the top-level key being `rows` (array), which contains objects with the 39 | /// keys `prompt` (String), `expert` (usize), `image_urls` (Option>). 40 | pub fn from_json>(file: P) -> anyhow::Result { 41 | let file = File::open(file)?; 42 | Ok(serde_json::from_reader(file)?) 43 | } 44 | 45 | pub fn len(&self) -> usize { 46 | self.rows.len() 47 | } 48 | 49 | pub fn into_inner(self) -> Vec { 50 | self.rows 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /docs/GEMMA2.md: -------------------------------------------------------------------------------- 1 | # Gemma 2 Model 2 | 3 | **[See the Gemma 2 model Collection](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)** 4 | 5 | The Gemma 2 models are a family of text-to-text decoder-only LLMs. As such, the methods to use them are the same as with all other text-to-text LLMs supported by mistral.rs. 6 | 7 | ## HTTP API 8 | 9 | ```py 10 | import openai 11 | 12 | messages = [] 13 | prompt = input("Enter system prompt >>> ") 14 | if len(prompt) > 0: 15 | messages.append({"role": "system", "content": prompt}) 16 | 17 | 18 | while True: 19 | prompt = input(">>> ") 20 | messages.append({"role": "user", "content": prompt}) 21 | completion = client.chat.completions.create( 22 | model="gemma2", 23 | messages=messages, 24 | max_tokens=256, 25 | frequency_penalty=1.0, 26 | top_p=0.1, 27 | temperature=0, 28 | ) 29 | resp = completion.choices[0].message.content 30 | print(resp) 31 | messages.append({"role": "assistant", "content": resp}) 32 | ``` 33 | 34 | ## Python API 35 | ```py 36 | from mistralrs import Runner, Which, ChatCompletionRequest, Architecture 37 | 38 | runner = Runner( 39 | which=Which.Plain( 40 | model_id="google/gemma-2-9b-it", 41 | arch=Architecture.Gemma2, 42 | ), 43 | ) 44 | 45 | res = runner.send_chat_completion_request( 46 | ChatCompletionRequest( 47 | model="mistral", 48 | messages=[ 49 | {"role": "user", "content": "Tell me a story about the Rust type system."} 50 | ], 51 | max_tokens=256, 52 | presence_penalty=1.0, 53 | top_p=0.1, 54 | temperature=0.1, 55 | ) 56 | ) 57 | print(res.choices[0].message.content) 58 | print(res.usage) 59 | ``` -------------------------------------------------------------------------------- /mistralrs/examples/custom_logits_processor/main.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use anyhow::Result; 4 | use mistralrs::{ 5 | CustomLogitsProcessor, IsqType, PagedAttentionMetaBuilder, RequestBuilder, Tensor, 6 | TextMessageRole, TextModelBuilder, 7 | }; 8 | use rand::Rng; 9 | 10 | struct ThresholdLogitsProcessor { 11 | threshold: f64, 12 | } 13 | 14 | impl CustomLogitsProcessor for ThresholdLogitsProcessor { 15 | fn apply(&self, logits: &Tensor, _context: &[u32]) -> mistralrs::Result { 16 | // Mask is 1 for true, 0 for false. 17 | let mask = logits.ge(self.threshold)?; 18 | logits.broadcast_mul(&mask.to_dtype(logits.dtype())?) 
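// `ge` yields a 1/0 mask, so the broadcast multiply zeroes out every logit strictly
// below `self.threshold`; note this acts on raw logits (pre-softmax), not probabilities.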
19 | } 20 | } 21 | 22 | #[tokio::main] 23 | async fn main() -> Result<()> { 24 | let model = TextModelBuilder::new("microsoft/Phi-3.5-mini-instruct") 25 | .with_isq(IsqType::Q4K) 26 | .with_logging() 27 | .with_paged_attn(|| PagedAttentionMetaBuilder::default().build())? 28 | .build() 29 | .await?; 30 | 31 | let mut rng = rand::thread_rng(); 32 | let random_value: f64 = rng.gen_range(0.0..=1.0); 33 | let threshold: f64 = rng.gen_range(0.0..=0.5); 34 | 35 | let request = RequestBuilder::new() 36 | .add_logits_processor(Arc::new(move |logits: &Tensor, _context: &[u32]| { 37 | logits * random_value 38 | })) 39 | .add_logits_processor(Arc::new(ThresholdLogitsProcessor { threshold })) 40 | .add_message( 41 | TextMessageRole::User, 42 | "Please write a mathematical equation where a few numbers are added.", 43 | ); 44 | 45 | let response = model.send_chat_request(request).await?; 46 | 47 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 48 | 49 | Ok(()) 50 | } 51 | -------------------------------------------------------------------------------- /mistralrs-core/src/xlora_models/config.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | 3 | use either::Either; 4 | use serde::Deserialize; 5 | 6 | fn true_default() -> bool { 7 | true 8 | } 9 | 10 | fn false_default() -> bool { 11 | false 12 | } 13 | fn default_1() -> usize { 14 | 1 15 | } 16 | 17 | fn default_2048() -> usize { 18 | 2048 19 | } 20 | fn default_dropout() -> f32 { 21 | 0.2 22 | } 23 | fn default_1f64() -> f64 { 24 | 1.0 25 | } 26 | fn default_0f64() -> f64 { 27 | 0.0 28 | } 29 | 30 | #[derive(Clone, Debug, Deserialize)] 31 | pub struct XLoraConfig { 32 | pub hidden_size: usize, 33 | pub base_model_id: String, 34 | #[serde(rename = "adapters")] 35 | #[serde(with = "either::serde_untagged")] 36 | pub _adapters: Either<Vec<String>, HashMap<String, String>>, 37 | #[serde(default = "false_default")] 38 | pub layerwise_scalings: bool, 39 | #[serde(default = "false_default")] 40 | pub enable_relu_and_dropout: bool, 41 | #[serde(default = "default_1")] 42 | pub xlora_depth: usize, 43 | #[serde(default = "default_2048")] 44 | pub xlora_size: usize, 45 | #[serde(default = "default_dropout")] 46 | pub xlora_dropout_p: f32, 47 | #[serde(default = "true_default")] 48 | pub enable_softmax: bool, 49 | #[serde(default = "default_1f64")] 50 | pub softmax_temperature: f64, 51 | #[serde(default = "default_0f64")] 52 | pub scaling_pass_value: f64, 53 | #[serde(default = "false_default", rename = "use_trainable_adapters")] 54 | pub _use_trainable_adapters: bool, 55 | #[serde(default = "true_default")] 56 | pub use_bias: bool, 57 | #[serde(default = "default_1f64")] 58 | pub global_scaling_weight: f64, 59 | pub top_k_lora: Option<usize>, 60 | #[serde(default = "false_default")] 61 | pub enable_softmax_topk: bool, 62 | } 63 | -------------------------------------------------------------------------------- /mistralrs/examples/simple/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use mistralrs::{ 3 | IsqType, PagedAttentionMetaBuilder, RequestBuilder, TextMessageRole, TextMessages, 4 | TextModelBuilder, 5 | }; 6 | 7 | #[tokio::main] 8 | async fn main() -> Result<()> { 9 | let model = TextModelBuilder::new("microsoft/Phi-3.5-mini-instruct") 10 | .with_isq(IsqType::Q8_0) 11 | .with_logging() 12 | .with_paged_attn(|| PagedAttentionMetaBuilder::default().build())?
13 | .build() 14 | .await?; 15 | 16 | let messages = TextMessages::new() 17 | .add_message( 18 | TextMessageRole::System, 19 | "You are an AI agent with a specialty in programming.", 20 | ) 21 | .add_message( 22 | TextMessageRole::User, 23 | "Hello! How are you? Please write generic binary search function in Rust.", 24 | ); 25 | 26 | let response = model.send_chat_request(messages).await?; 27 | 28 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 29 | dbg!( 30 | response.usage.avg_prompt_tok_per_sec, 31 | response.usage.avg_compl_tok_per_sec 32 | ); 33 | 34 | // Next example: Return some logprobs with the `RequestBuilder`, which enables higher configurability. 35 | let request = RequestBuilder::new().return_logprobs(true).add_message( 36 | TextMessageRole::User, 37 | "Please write a mathematical equation where a few numbers are added.", 38 | ); 39 | 40 | let response = model.send_chat_request(request).await?; 41 | 42 | println!( 43 | "Logprobs: {:?}", 44 | &response.choices[0] 45 | .logprobs 46 | .as_ref() 47 | .unwrap() 48 | .content 49 | .as_ref() 50 | .unwrap()[0..3] 51 | ); 52 | 53 | Ok(()) 54 | } 55 | -------------------------------------------------------------------------------- /mistralrs-quant/kernels/gptq/compat.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | Copied from https://github.com/turboderp/exllamav2 3 | */ 4 | 5 | #ifndef _compat_cuh 6 | #define _compat_cuh 7 | 8 | // atomicAdd for half types, to support CC < 7.x 9 | 10 | __device__ __forceinline__ void atomicAdd_half(half* address, half val) { 11 | unsigned int* address_as_ui = 12 | (unsigned int*)((char*)address - ((size_t)address & 2)); 13 | unsigned int old = *address_as_ui; 14 | unsigned int assumed; 15 | 16 | do { 17 | assumed = old; 18 | __half_raw hsum; 19 | hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); 20 | half tmpres = __hadd(hsum, val); 21 | hsum = __half_raw(tmpres); 22 | old = (size_t)address & 2 ? 
(old & 0xffff) | (hsum.x << 16) 23 | : (old & 0xffff0000) | hsum.x; 24 | old = atomicCAS(address_as_ui, assumed, old); 25 | } while (assumed != old); 26 | } 27 | 28 | // atomicAdd for half2 types 29 | 30 | __device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val) { 31 | unsigned int* address_as_ui = (unsigned int*)address; 32 | unsigned int old = *address_as_ui; 33 | unsigned int assumed; 34 | do { 35 | assumed = old; 36 | half2 old_val = *((half2*)&old); 37 | half2 new_val = __hadd2(old_val, val); 38 | old = atomicCAS(address_as_ui, assumed, *((unsigned int*)&new_val)); 39 | } while (assumed != old); 40 | } 41 | 42 | // 43 | 44 | #if defined(__CUDA_ARCH__) || defined(USE_ROCM) 45 | #if __CUDA_ARCH__ < 700 || defined(USE_ROCM) 46 | 47 | __device__ __forceinline__ void atomicAdd(half* address, half val) { 48 | atomicAdd_half(address, val); 49 | } 50 | 51 | #if __CUDA_ARCH__ < 600 || defined(USE_ROCM) 52 | __device__ __forceinline__ void atomicAdd(half2* address, half2 val) { 53 | atomicAdd_half2(address, val); 54 | } 55 | #endif 56 | 57 | #endif 58 | #endif 59 | 60 | #endif 61 | -------------------------------------------------------------------------------- /chat_templates/vicuna.json: -------------------------------------------------------------------------------- 1 | { 2 | "add_bos_token": true, 3 | "add_eos_token": false, 4 | "bos_token": { 5 | "__type": "AddedToken", 6 | "content": "", 7 | "lstrip": false, 8 | "normalized": false, 9 | "rstrip": false, 10 | "single_word": false 11 | }, 12 | "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = 'A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user\\'s questions.' 
%}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 %}{{ system_message }}{% endif %}{% if message['role'] == 'user' %}{{ ' USER: ' + message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ ' ASSISTANT: ' + message['content'].strip() + eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ ' ASSISTANT:' }}{% endif %}", 13 | "clean_up_tokenization_spaces": false, 14 | "eos_token": { 15 | "__type": "AddedToken", 16 | "content": "", 17 | "lstrip": false, 18 | "normalized": false, 19 | "rstrip": false, 20 | "single_word": false 21 | }, 22 | "legacy": false, 23 | "model_max_length": 4096, 24 | "pad_token": null, 25 | "padding_side": "right", 26 | "sp_model_kwargs": {}, 27 | "tokenizer_class": "LlamaTokenizer", 28 | "unk_token": { 29 | "__type": "AddedToken", 30 | "content": "", 31 | "lstrip": false, 32 | "normalized": false, 33 | "rstrip": false, 34 | "single_word": false 35 | } 36 | } -------------------------------------------------------------------------------- /mistralrs-core/src/utils/tokenizer.rs: -------------------------------------------------------------------------------- 1 | use std::{collections::HashMap, path::Path}; 2 | 3 | use anyhow::Result; 4 | use serde::Deserialize; 5 | use serde_json::Value; 6 | use tokenizers::{tokenizer, Tokenizer}; 7 | 8 | #[derive(Deserialize)] 9 | struct AddedToken { 10 | id: usize, 11 | content: String, 12 | } 13 | 14 | /// May fix the tokenizer according to: https://gist.github.com/jneuff/682d47b786329f19291d166957b3274a 15 | pub(crate) fn get_tokenizer + Clone>( 16 | p: P, 17 | processor_added_tokens: Option<&[&str]>, 18 | ) -> Result { 19 | let mut tokenizer = { 20 | let raw = std::fs::read(p.clone()).map_err(anyhow::Error::msg)?; 21 | let mut tokenizer: Value = serde_json::from_slice(&raw).unwrap(); 22 | let added_tokens: Vec = 23 | serde_json::from_value(tokenizer["added_tokens"].clone()).unwrap(); 24 | let vocab: HashMap = 25 | serde_json::from_value(tokenizer["model"]["vocab"].clone()).unwrap(); 26 | for token in added_tokens { 27 | if !vocab.contains_key(&token.content) { 28 | tokenizer["model"]["vocab"] 29 | .as_object_mut() 30 | .unwrap() 31 | .insert(token.content, token.id.into()) 32 | .ok_or(()) 33 | .unwrap_err(); 34 | } 35 | } 36 | let raw_fixed = serde_json::to_vec_pretty(&tokenizer).unwrap(); 37 | Tokenizer::from_bytes(&raw_fixed).map_err(anyhow::Error::msg)? 38 | }; 39 | if let Some(added_tokens) = processor_added_tokens { 40 | tokenizer.add_special_tokens( 41 | &added_tokens 42 | .iter() 43 | .map(|x| tokenizer::AddedToken::from(x.to_string(), true)) 44 | .collect::>(), 45 | ); 46 | } 47 | Ok(tokenizer) 48 | } 49 | -------------------------------------------------------------------------------- /mistralrs-vision/src/ops.rs: -------------------------------------------------------------------------------- 1 | use candle_core::{Result, Tensor}; 2 | 3 | /// Pad an image of shape (c, h, w) to (c, max_h, max_w) by padding with zeros on the right and bottom. 4 | pub fn pad(image: &Tensor, max_h: usize, max_w: usize) -> Result { 5 | let (c, h, w) = image.dims3()?; 6 | let new_image = Tensor::zeros((c, max_h, max_w), image.dtype(), image.device())?; 7 | new_image.slice_assign(&[&(..c), &(..h), &(..w)], image) 8 | } 9 | 10 | /// Generate pixel mask of shape (c, max_h, max_w). 1 indicates valid pixel, 0 indicates padding. 
11 | /// 12 | /// The input tensor is of shape (c, max_h, max_w) and the output mask is the same shape and 13 | /// represents where pixels are. The valid region sits in the top-left corner and its size is passed as `h` and `w`. 14 | pub fn make_pixel_mask(image: &Tensor, h: usize, w: usize) -> Result<Tensor> { 15 | let (_c, max_h, max_w) = image.dims3()?; 16 | let mask = Tensor::ones((h, w), image.dtype(), image.device())?; 17 | let zeros = Tensor::zeros((max_h, max_w), image.dtype(), image.device())?; 18 | // TODO(EricLBuehler): https://github.com/huggingface/candle/pull/2223 will make this nicer 19 | zeros.slice_assign(&[&(..h), &(..w)], &mask) 20 | } 21 | 22 | /// Given the image sizes (h, w) and the minimum and maximum lengths, calculate the image dimensions 23 | /// which will preserve aspect ratio while respecting the minimum and maximum lengths. 24 | pub fn get_resize_image_size( 25 | (h, w): (usize, usize), 26 | (min_len, max_len): (usize, usize), 27 | ) -> (usize, usize) { 28 | let aspect_ratio = w as f64 / h as f64; 29 | 30 | let (new_h, new_w) = if w >= h && w > max_len { 31 | ((max_len as f64 / aspect_ratio) as usize, max_len) 32 | } else if h > w && h > max_len { 33 | (max_len, (max_len as f64 * aspect_ratio) as usize) 34 | } else { 35 | (h, w) 36 | }; 37 | (new_h.max(min_len), new_w.max(min_len)) 38 | } 39 | -------------------------------------------------------------------------------- /mistralrs-core/src/scheduler/mod.rs: -------------------------------------------------------------------------------- 1 | mod default_scheduler; 2 | 3 | pub use default_scheduler::{DefaultScheduler, DefaultSchedulerMethod, DefaultSchedulerOutput}; 4 | 5 | use crate::{ 6 | paged_attention::{ 7 | BlockEngine, BlockTables, CacheConfig, PagedAttentionScheduler, 8 | PagedAttentionSchedulerConfig, PagedAttentionSchedulerOutput, 9 | }, 10 | sequence::Sequence, 11 | }; 12 | 13 | #[derive(Clone)] 14 | pub enum SchedulerConfig { 15 | DefaultScheduler { 16 | method: DefaultSchedulerMethod, 17 | }, 18 | PagedAttentionMeta { 19 | max_num_seqs: usize, 20 | config: CacheConfig, 21 | }, 22 | } 23 | 24 | impl SchedulerConfig { 25 | pub fn into_scheduler(self) -> Box<dyn Scheduler> { 26 | match self { 27 | Self::DefaultScheduler { method } => Box::new(DefaultScheduler::new(method)), 28 | Self::PagedAttentionMeta { 29 | max_num_seqs, 30 | config, 31 | } => Box::new(PagedAttentionScheduler::new( 32 | PagedAttentionSchedulerConfig { max_num_seqs }, 33 | config, 34 | )), 35 | } 36 | } 37 | } 38 | 39 | pub enum SchedulerOutput<'a> { 40 | DefaultScheduler { 41 | output: DefaultSchedulerOutput<'a>, 42 | }, 43 | PagedAttention { 44 | output: PagedAttentionSchedulerOutput, 45 | }, 46 | } 47 | 48 | pub trait Scheduler { 49 | fn schedule(&mut self) -> SchedulerOutput<'_>; 50 | fn waiting_len(&self) -> usize; 51 | fn running_len(&self) -> usize; 52 | fn add_seq(&mut self, seq: Sequence); 53 | /// This may do nothing.
It depends on the implementation 54 | fn free_finished_sequence_groups(&mut self); 55 | 56 | // PagedAttention metadata 57 | fn block_tables(&self) -> Option<&BlockTables>; 58 | fn block_size(&self) -> Option; 59 | fn block_engine(&mut self) -> Option<&mut BlockEngine>; 60 | } 61 | -------------------------------------------------------------------------------- /mistralrs/examples/anymoe_lora/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use mistralrs::{ 3 | AnyMoeConfig, AnyMoeExpertType, AnyMoeModelBuilder, IsqType, PagedAttentionMetaBuilder, 4 | TextMessageRole, TextMessages, TextModelBuilder, 5 | }; 6 | 7 | #[tokio::main] 8 | async fn main() -> Result<()> { 9 | let text_builder = TextModelBuilder::new("mistralai/Mistral-7B-Instruct-v0.1") 10 | .with_isq(IsqType::Q8_0) 11 | .with_logging() 12 | .with_paged_attn(|| PagedAttentionMetaBuilder::default().build())?; 13 | 14 | let model = AnyMoeModelBuilder::from_text_builder( 15 | text_builder, 16 | AnyMoeConfig { 17 | hidden_size: 4096, 18 | lr: 1e-3, 19 | epochs: 100, 20 | batch_size: 4, 21 | expert_type: AnyMoeExpertType::FineTuned, 22 | gate_model_id: None, // Set this to Some("path/to/model/id") for the pretrained gating model id 23 | training: true, 24 | loss_csv_path: None, 25 | }, 26 | "model.layers", 27 | "mlp", 28 | "examples/amoe.json", 29 | vec!["typeof/zephyr-7b-beta-lora"], 30 | vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], 31 | ) 32 | .build() 33 | .await?; 34 | 35 | let messages = TextMessages::new() 36 | .add_message( 37 | TextMessageRole::System, 38 | "You are an AI agent with a specialty in programming.", 39 | ) 40 | .add_message( 41 | TextMessageRole::User, 42 | "Hello! How are you? Please write generic binary search function in Rust.", 43 | ); 44 | 45 | let response = model.send_chat_request(messages).await?; 46 | 47 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 48 | dbg!( 49 | response.usage.avg_prompt_tok_per_sec, 50 | response.usage.avg_compl_tok_per_sec 51 | ); 52 | 53 | Ok(()) 54 | } 55 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = [ 3 | "mistralrs-server", 4 | "mistralrs-core", 5 | "mistralrs-pyo3", 6 | "mistralrs", 7 | "mistralrs-bench", 8 | "mistralrs-vision", 9 | "mistralrs-quant", 10 | ] 11 | exclude = [ 12 | "mistralrs-paged_attn", 13 | ] 14 | resolver = "2" 15 | 16 | [workspace.package] 17 | version = "0.3.1" 18 | edition = "2021" 19 | description = "Fast and easy LLM serving." 
20 | homepage = "https://github.com/EricLBuehler/mistral.rs" 21 | repository = "https://github.com/EricLBuehler/mistral.rs" 22 | keywords = ["machine-learning"] 23 | categories = ["science"] 24 | license = "MIT" 25 | 26 | [workspace.dependencies] 27 | anyhow = "1.0.80" 28 | candle-core = { git = "https://github.com/EricLBuehler/candle.git", version = "0.7.0", rev = "60eb251" } 29 | candle-nn = { git = "https://github.com/EricLBuehler/candle.git", version = "0.7.0", rev = "60eb251" } 30 | serde = "1.0.197" 31 | serde_json = "1.0.114" 32 | indexmap = { version = "2.2.5", features = ["serde"] } 33 | either = { version = "1.10.0", features = ["serde"] } 34 | accelerate-src = { version = "0.3.2" } 35 | intel-mkl-src = { version = "0.8.1", features = ["mkl-static-lp64-iomp"] } 36 | tracing = "0.1.40" 37 | tracing-subscriber = { version = "0.3.18", features = ["env-filter"] } 38 | futures = "0.3" 39 | clap = { version = "4.5.1", features = ["derive"] } 40 | pyo3 = { version = "0.22.0", features = ["full", "extension-module", "either"] } 41 | tokio = { version = "1.36.0", features = ["full", "rt-multi-thread"] } 42 | once_cell = "1.19.0" 43 | # All features but avif, avif increases the msrv dramatically 44 | image = { version = "0.25.1", default-features = false, features = ['bmp', 'dds', 'exr', 'ff', 'gif', 'hdr', 'ico', 'jpeg', 'png', 'pnm', 'qoi', 'tga', 'tiff', 'webp']} 45 | reqwest = { version = "0.12.4", features = ["blocking"] } 46 | base64 = "0.22.1" 47 | half = "2.4.0" 48 | rayon = "1.1.0" 49 | url = "2.5.2" 50 | data-url = "0.3.1" 51 | buildstructor = "0.5.4" 52 | float8 = "0.1.1" 53 | -------------------------------------------------------------------------------- /Dockerfile.cuda-all: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 AS builder 2 | 3 | RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ 4 | curl \ 5 | libssl-dev \ 6 | pkg-config \ 7 | && rm -rf /var/lib/apt/lists/* 8 | 9 | RUN curl https://sh.rustup.rs -sSf | bash -s -- -y 10 | ENV PATH="/root/.cargo/bin:${PATH}" 11 | RUN rustup update nightly 12 | RUN rustup default nightly 13 | 14 | WORKDIR /mistralrs 15 | 16 | COPY . . 
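# CUDA_COMPUTE_CAP below selects the GPU architecture the CUDA kernels are compiled for.
# 80 targets Ampere data-center GPUs (e.g. A100); common alternatives are 86 (RTX 30xx),
# 89 (Ada / RTX 40xx) and 90 (Hopper / H100). Override it at build time if needed, e.g.
#   docker build --build-arg CUDA_COMPUTE_CAP=89 -f Dockerfile.cuda-all .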
17 | 18 | ARG CUDA_COMPUTE_CAP=80 19 | ENV CUDA_COMPUTE_CAP=${CUDA_COMPUTE_CAP} 20 | ARG FEATURES="cuda cudnn" 21 | ENV RAYON_NUM_THREADS=4 22 | RUN RUSTFLAGS="-Z threads=4" cargo build --release --workspace --exclude mistralrs-pyo3 --features "${FEATURES}" 23 | 24 | FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04 AS base 25 | 26 | ENV HUGGINGFACE_HUB_CACHE=/data \ 27 | PORT=80 \ 28 | RAYON_NUM_THREADS=8 \ 29 | LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH 30 | 31 | # Run the script to create symlinks in /usr/local/cuda/lib64 32 | RUN set -eux; \ 33 | for lib in $(ls /usr/local/cuda/lib64); do \ 34 | base=$(echo $lib | sed -r 's/(.+)\.so\..+/\1.so/'); \ 35 | if [ "$lib" != "$base" ]; then \ 36 | ln -sf "/usr/local/cuda/lib64/$lib" "/usr/local/cuda/lib64/$base"; \ 37 | fi; \ 38 | done 39 | 40 | RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ 41 | libomp-dev \ 42 | ca-certificates \ 43 | libssl-dev \ 44 | curl \ 45 | pkg-config \ 46 | && rm -rf /var/lib/apt/lists/* 47 | 48 | FROM base 49 | 50 | COPY --from=builder /mistralrs/target/release/mistralrs-bench /usr/local/bin/mistralrs-bench 51 | RUN chmod +x /usr/local/bin/mistralrs-bench 52 | COPY --from=builder /mistralrs/target/release/mistralrs-server /usr/local/bin/mistralrs-server 53 | RUN chmod +x /usr/local/bin/mistralrs-server 54 | ENTRYPOINT ["mistralrs-server", "--port", "80", "--token-source", "env:HUGGING_FACE_HUB_TOKEN"] 55 | -------------------------------------------------------------------------------- /mistralrs-quant/src/dummy/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::{QuantMethod, QuantizedSerde}; 2 | 3 | #[derive(Debug)] 4 | pub struct DummyLayer; 5 | 6 | impl QuantMethod for DummyLayer { 7 | fn new(_method: crate::QuantMethodConfig) -> candle_core::Result 8 | where 9 | Self: Sized, 10 | { 11 | Ok(Self) 12 | } 13 | fn add_delta_w( 14 | &self, 15 | _delta: &candle_core::Tensor, 16 | ) -> candle_core::Result> { 17 | candle_core::bail!("DummyLayer should not ever be present in forward pass!") 18 | } 19 | fn apply_isq( 20 | self: std::sync::Arc, 21 | _dtype: Option, 22 | _device: candle_core::Device, 23 | _n_quantized: &std::sync::atomic::AtomicUsize, 24 | ) -> candle_core::Result> { 25 | candle_core::bail!("DummyLayer should not ever be present in forward pass!") 26 | } 27 | fn dtype_and_device(&self) -> (candle_core::DType, candle_core::Device) { 28 | (candle_core::DType::F64, candle_core::Device::Cpu) 29 | } 30 | fn forward(&self, _a: &candle_core::Tensor) -> candle_core::Result { 31 | candle_core::bail!("DummyLayer should not ever be present in forward pass!") 32 | } 33 | fn forward_via_half( 34 | &self, 35 | _a: &candle_core::Tensor, 36 | ) -> candle_core::Result { 37 | candle_core::bail!("DummyLayer should not ever be present in forward pass!") 38 | } 39 | fn get_bias_mut(&mut self) -> Option<&mut candle_core::Tensor> { 40 | None 41 | } 42 | fn get_max_isq_cpu_threads(&self, _dtype: crate::IsqType) -> Option { 43 | None 44 | } 45 | fn quantized_act_type(&self) -> Option { 46 | None 47 | } 48 | } 49 | 50 | impl QuantizedSerde for DummyLayer { 51 | fn name(&self) -> &'static str { 52 | "dummy" 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /examples/server/chat.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | import httpx 3 | import textwrap 4 | import json 5 | 6 | 7 | def 
log_response(response: httpx.Response): 8 | request = response.request 9 | print(f"Request: {request.method} {request.url}") 10 | print(" Headers:") 11 | for key, value in request.headers.items(): 12 | if key.lower() == "authorization": 13 | value = "[...]" 14 | if key.lower() == "cookie": 15 | value = value.split("=")[0] + "=..." 16 | print(f" {key}: {value}") 17 | print(" Body:") 18 | try: 19 | request_body = json.loads(request.content) 20 | print(textwrap.indent(json.dumps(request_body, indent=2), " ")) 21 | except json.JSONDecodeError: 22 | print(textwrap.indent(request.content.decode(), " ")) 23 | print(f"Response: status_code={response.status_code}") 24 | print(" Headers:") 25 | for key, value in response.headers.items(): 26 | if key.lower() == "set-cookie": 27 | value = value.split("=")[0] + "=..." 28 | print(f" {key}: {value}") 29 | 30 | 31 | client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/") 32 | 33 | # Enable this to log requests and responses 34 | # client._client = httpx.Client( 35 | # event_hooks={"request": [print], "response": [log_response]} 36 | # ) 37 | 38 | messages = [] 39 | prompt = input("Enter system prompt >>> ") 40 | if len(prompt) > 0: 41 | messages.append({"role": "system", "content": prompt}) 42 | 43 | 44 | while True: 45 | prompt = input(">>> ") 46 | messages.append({"role": "user", "content": prompt}) 47 | completion = client.chat.completions.create( 48 | model="mistral", 49 | messages=messages, 50 | max_tokens=256, 51 | frequency_penalty=1.0, 52 | top_p=0.1, 53 | temperature=0, 54 | ) 55 | resp = completion.choices[0].message.content 56 | print(resp) 57 | messages.append({"role": "assistant", "content": resp}) 58 | -------------------------------------------------------------------------------- /mistralrs-core/src/vision_models/preprocessor_config.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | 3 | use candle_core::Result; 4 | use image::imageops::FilterType; 5 | use serde::Deserialize; 6 | 7 | #[derive(Deserialize, Debug, Clone)] 8 | #[allow(dead_code)] 9 | pub struct PreProcessorConfig { 10 | pub(crate) do_convert_rgb: Option, 11 | pub(crate) do_image_splitting: Option, 12 | pub(crate) do_normalize: Option, 13 | pub(crate) do_pad: Option, 14 | pub(crate) do_rescale: Option, 15 | pub(crate) do_resize: Option, 16 | pub(crate) do_center_crop: Option, 17 | pub(crate) image_mean: Option<[f64; 3]>, 18 | pub(crate) image_std: Option<[f64; 3]>, 19 | pub(crate) rescale_factor: Option, 20 | pub(crate) resampling: Option, 21 | pub(crate) size: Option>, 22 | pub(crate) crop_size: Option>, 23 | pub(crate) num_img_tokens: Option, 24 | pub(crate) num_crops: Option, 25 | pub(crate) max_image_tiles: Option, 26 | } 27 | 28 | #[allow(dead_code)] 29 | pub(crate) trait ToFilter { 30 | fn to_filter(self) -> Result; 31 | } 32 | 33 | impl ToFilter for Option { 34 | // https://github.com/python-pillow/Pillow/blob/4b68563e8a818fb9c528fa159ddf3f4eaefa35e6/src/PIL/Image.py#L164-L170 35 | // Default: https://github.com/huggingface/transformers/blob/0df888ffb72ea370555efdef45985378d3cc7b2b/src/transformers/models/idefics2/image_processing_idefics2.py#L226 36 | fn to_filter(self) -> Result { 37 | match self { 38 | Some(0) => Ok(FilterType::Nearest), 39 | Some(1) => Ok(FilterType::Lanczos3), 40 | Some(2) | None => Ok(FilterType::Triangle), // BiLinear 41 | Some(3) => Ok(FilterType::CatmullRom), // BiCubic 42 | Some(4) => Ok(FilterType::Nearest), 43 | Some(x) => candle_core::bail!("Filter 
number {x} not supported"), 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /mistralrs-pyo3/src/stream.rs: -------------------------------------------------------------------------------- 1 | use tokio::sync::mpsc::Receiver; 2 | 3 | use mistralrs_core::{ChatCompletionChunkResponse, Response}; 4 | use pyo3::{exceptions::PyValueError, pyclass, pymethods, PyRef, PyRefMut, PyResult}; 5 | 6 | #[pyclass] 7 | pub struct ChatCompletionStreamer { 8 | rx: Receiver, 9 | is_done: bool, 10 | } 11 | 12 | impl ChatCompletionStreamer { 13 | pub fn from_rx(rx: Receiver) -> Self { 14 | Self { rx, is_done: false } 15 | } 16 | } 17 | 18 | #[pymethods] 19 | impl ChatCompletionStreamer { 20 | fn __iter__(this: PyRef<'_, Self>) -> PyRef<'_, Self> { 21 | this 22 | } 23 | fn __next__(mut this: PyRefMut<'_, Self>) -> Option> { 24 | if this.is_done { 25 | return None; 26 | } 27 | match this.rx.blocking_recv() { 28 | Some(resp) => match resp { 29 | Response::ModelError(msg, _) => Some(Err(PyValueError::new_err(msg.to_string()))), 30 | Response::ValidationError(e) => Some(Err(PyValueError::new_err(e.to_string()))), 31 | Response::InternalError(e) => Some(Err(PyValueError::new_err(e.to_string()))), 32 | Response::Chunk(response) => { 33 | if response.choices.iter().all(|x| x.finish_reason.is_some()) { 34 | this.is_done = true; 35 | } 36 | Some(Ok(response)) 37 | } 38 | Response::Done(_) => unreachable!(), 39 | Response::CompletionDone(_) => unreachable!(), 40 | Response::CompletionModelError(_, _) => unreachable!(), 41 | Response::CompletionChunk(_) => unreachable!(), 42 | Response::ImageGeneration(_) => unreachable!(), 43 | }, 44 | None => Some(Err(PyValueError::new_err( 45 | "Received none in ChatCompletionStreamer".to_string(), 46 | ))), 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /mistralrs-paged-attn/src/attention/attention_generic.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h 3 | * Copyright (c) 2023, The vLLM team. 4 | * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | #pragma once 19 | 20 | #include 21 | 22 | namespace vllm { 23 | 24 | // A vector type to store Q, K, V elements. 25 | template 26 | struct Vec {}; 27 | 28 | // A vector type to store FP32 accumulators. 29 | template 30 | struct FloatVec {}; 31 | 32 | // Template vector operations. 
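// (The concrete Vec/FloatVec specializations and the mul/sum overloads are expected to live in
// the per-dtype headers pulled in through attention_dtypes.h; `dot` below is just sum(mul(a, b)),
// with the FP32-accumulator variant used to avoid accumulating in half precision.)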
33 | template 34 | inline __device__ Acc mul(A a, B b); 35 | 36 | template 37 | inline __device__ float sum(T v); 38 | 39 | template 40 | inline __device__ float dot(T a, T b) { 41 | return sum(mul(a, b)); 42 | } 43 | 44 | template 45 | inline __device__ float dot(T a, T b) { 46 | return sum(mul(a, b)); 47 | } 48 | 49 | template 50 | inline __device__ void zero(T& dst) { 51 | constexpr int WORDS = sizeof(T) / 4; 52 | union { 53 | T raw; 54 | uint32_t words[WORDS]; 55 | } tmp; 56 | 57 | #pragma unroll 58 | for (int ii = 0; ii < WORDS; ++ii) { 59 | tmp.words[ii] = 0u; 60 | } 61 | dst = tmp.raw; 62 | } 63 | 64 | } // namespace vllm 65 | -------------------------------------------------------------------------------- /mistralrs/examples/lora_activation/main.rs: -------------------------------------------------------------------------------- 1 | use std::fs::File; 2 | 3 | use anyhow::Result; 4 | use mistralrs::{ 5 | LoraModelBuilder, RequestBuilder, TextMessageRole, TextMessages, TextModelBuilder, 6 | }; 7 | 8 | #[tokio::main] 9 | async fn main() -> Result<()> { 10 | let model = 11 | LoraModelBuilder::from_text_model_builder( 12 | TextModelBuilder::new("HuggingFaceH4/zephyr-7b-beta").with_logging(), 13 | "lamm-mit/x-lora", 14 | serde_json::from_reader(File::open("my-ordering-file.json").unwrap_or_else(|_| { 15 | panic!("Could not load ordering file at my-ordering-file.json") 16 | }))?, 17 | ) 18 | .build() 19 | .await?; 20 | 21 | // First example: activate adapters per-request 22 | let messages = RequestBuilder::new() 23 | .set_adapters(vec!["adapter_2".to_string()]) 24 | .add_message( 25 | TextMessageRole::User, 26 | "Hello! How are you? Please write generic binary search function in Rust.", 27 | ); 28 | 29 | let response = model.send_chat_request(messages).await?; 30 | 31 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 32 | dbg!( 33 | response.usage.avg_prompt_tok_per_sec, 34 | response.usage.avg_compl_tok_per_sec 35 | ); 36 | 37 | // Second example: activate adapters for the whole model, used for all subsequent requests 38 | model 39 | .activate_adapters(vec!["adapter_1".to_string()]) 40 | .await?; 41 | 42 | let messages = TextMessages::new().add_message( 43 | TextMessageRole::User, 44 | "Hello! How are you? Please write generic binary search function in Rust.", 45 | ); 46 | 47 | let response = model.send_chat_request(messages).await?; 48 | 49 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 50 | dbg!( 51 | response.usage.avg_prompt_tok_per_sec, 52 | response.usage.avg_compl_tok_per_sec 53 | ); 54 | 55 | Ok(()) 56 | } 57 | -------------------------------------------------------------------------------- /.github/workflows/analysis.yaml: -------------------------------------------------------------------------------- 1 | name: Analysis 2 | on: 3 | pull_request_target 4 | 5 | jobs: 6 | comment: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - name: Checkout code 10 | uses: actions/checkout@v4 11 | 12 | - name: Install Rust and Cargo 13 | run: | 14 | curl -sSf https://sh.rustup.rs | sh -s -- -y 15 | source $HOME/.cargo/env 16 | 17 | - name: Install Tokei 18 | run: cargo install tokei 19 | 20 | - name: Run Tokei and get the lines of code 21 | run: tokei . > tokei_output.txt 22 | 23 | - name: Comment or Update PR 24 | uses: actions/github-script@v7 25 | with: 26 | script: | 27 | const fs = require('fs'); 28 | const tokeiOutput = fs.readFileSync('tokei_output.txt', 'utf8'); 29 | const uniqueIdentifier = 'Code Metrics Report'; 30 | const codeReport = ` 31 |
32 | <details><summary>${uniqueIdentifier}</summary> 33 | <pre>
34 |               ${tokeiOutput}
35 |               </pre>
36 | </details>
37 | `; 38 | 39 | const issue_number = context.issue.number; 40 | const { owner, repo } = context.repo; 41 | 42 | const comments = await github.rest.issues.listComments({ 43 | issue_number, 44 | owner, 45 | repo 46 | }); 47 | 48 | const existingComment = comments.data.find(comment => comment.body.includes(uniqueIdentifier)); 49 | 50 | if (existingComment) { 51 | await github.rest.issues.updateComment({ 52 | owner, 53 | repo, 54 | comment_id: existingComment.id, 55 | body: codeReport 56 | }); 57 | } else { 58 | await github.rest.issues.createComment({ 59 | issue_number, 60 | owner, 61 | repo, 62 | body: codeReport 63 | }); 64 | } 65 | -------------------------------------------------------------------------------- /mistralrs-paged-attn/src/ffi.rs: -------------------------------------------------------------------------------- 1 | use core::ffi::{c_int, c_long, c_void}; 2 | 3 | extern "C" { 4 | pub fn reshape_and_cache( 5 | key: *const c_void, 6 | value: *const c_void, 7 | key_cache: *const c_void, 8 | value_cache: *const c_void, 9 | slot_mapping: *const c_long, 10 | 11 | num_tokens: c_int, 12 | num_heads: c_int, 13 | head_size: c_int, 14 | block_size: c_int, 15 | x: c_int, 16 | key_stride: c_int, 17 | value_stride: c_int, 18 | 19 | dtype: u32, 20 | ); 21 | 22 | pub fn paged_attention_v1( 23 | out: *const c_void, 24 | query: *const c_void, 25 | key_cache: *const c_void, 26 | value_cache: *const c_void, 27 | num_kv_heads: c_int, 28 | scale: f32, 29 | softcapping: f32, 30 | block_tables: *const c_int, 31 | context_lens: *const c_int, 32 | block_size: c_int, 33 | max_context_len: c_int, 34 | 35 | num_seqs: c_int, 36 | num_heads: c_int, 37 | head_size: c_int, 38 | max_num_blocks_per_seq: c_int, 39 | q_stride: c_int, 40 | kv_block_stride: c_int, 41 | kv_head_stride: c_int, 42 | 43 | dtype: u32, 44 | ); 45 | 46 | pub fn paged_attention_v2( 47 | out: *const c_void, 48 | exp_sums: *const f32, 49 | max_logits: *const f32, 50 | tmp_out: *const c_void, 51 | query: *const c_void, 52 | key_cache: *const c_void, 53 | value_cache: *const c_void, 54 | num_kv_heads: c_int, 55 | scale: f32, 56 | softcapping: f32, 57 | block_tables: *const c_int, 58 | context_lens: *const c_int, 59 | block_size: c_int, 60 | max_context_len: c_int, 61 | 62 | num_seqs: c_int, 63 | num_heads: c_int, 64 | head_size: c_int, 65 | max_num_blocks_per_seq: c_int, 66 | q_stride: c_int, 67 | kv_block_stride: c_int, 68 | kv_head_stride: c_int, 69 | 70 | dtype: u32, 71 | ); 72 | } 73 | -------------------------------------------------------------------------------- /mistralrs/examples/anymoe/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use mistralrs::{ 3 | AnyMoeConfig, AnyMoeExpertType, AnyMoeModelBuilder, IsqType, PagedAttentionMetaBuilder, 4 | TextMessageRole, TextMessages, TextModelBuilder, 5 | }; 6 | 7 | #[tokio::main] 8 | async fn main() -> Result<()> { 9 | let text_builder = TextModelBuilder::new("mistralai/Mistral-7B-Instruct-v0.1") 10 | .with_isq(IsqType::Q8_0) 11 | .with_logging() 12 | .with_paged_attn(|| PagedAttentionMetaBuilder::default().build())?; 13 | 14 | let model = AnyMoeModelBuilder::from_text_builder( 15 | text_builder, 16 | AnyMoeConfig { 17 | hidden_size: 4096, 18 | lr: 1e-3, 19 | epochs: 100, 20 | batch_size: 4, 21 | expert_type: AnyMoeExpertType::LoraAdapter { 22 | rank: 64, 23 | alpha: 16., 24 | target_modules: vec!["gate_proj".to_string()], 25 | }, 26 | gate_model_id: None, // Set this to Some("path/to/model/id") for the pretrained gating 
model id 27 | training: true, 28 | loss_csv_path: None, 29 | }, 30 | "model.layers", 31 | "mlp", 32 | "examples/amoe.json", 33 | vec!["HuggingFaceH4/zephyr-7b-beta"], 34 | vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], 35 | ) 36 | .build() 37 | .await?; 38 | 39 | let messages = TextMessages::new() 40 | .add_message( 41 | TextMessageRole::System, 42 | "You are an AI agent with a specialty in programming.", 43 | ) 44 | .add_message( 45 | TextMessageRole::User, 46 | "Hello! How are you? Please write generic binary search function in Rust.", 47 | ); 48 | 49 | let response = model.send_chat_request(messages).await?; 50 | 51 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 52 | dbg!( 53 | response.usage.avg_prompt_tok_per_sec, 54 | response.usage.avg_compl_tok_per_sec 55 | ); 56 | 57 | Ok(()) 58 | } 59 | -------------------------------------------------------------------------------- /examples/server/adapter_chat.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | import httpx 3 | import textwrap 4 | import json 5 | 6 | 7 | def log_response(response: httpx.Response): 8 | request = response.request 9 | print(f"Request: {request.method} {request.url}") 10 | print(" Headers:") 11 | for key, value in request.headers.items(): 12 | if key.lower() == "authorization": 13 | value = "[...]" 14 | if key.lower() == "cookie": 15 | value = value.split("=")[0] + "=..." 16 | print(f" {key}: {value}") 17 | print(" Body:") 18 | try: 19 | request_body = json.loads(request.content) 20 | print(textwrap.indent(json.dumps(request_body, indent=2), " ")) 21 | except json.JSONDecodeError: 22 | print(textwrap.indent(request.content.decode(), " ")) 23 | print(f"Response: status_code={response.status_code}") 24 | print(" Headers:") 25 | for key, value in response.headers.items(): 26 | if key.lower() == "set-cookie": 27 | value = value.split("=")[0] + "=..." 
28 | print(f" {key}: {value}") 29 | 30 | 31 | client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/") 32 | 33 | # Enable this to log requests and responses 34 | # client._client = httpx.Client( 35 | # event_hooks={"request": [print], "response": [log_response]} 36 | # ) 37 | 38 | messages = [] 39 | prompt = input("Enter system prompt >>> ") 40 | if len(prompt) > 0: 41 | messages.append({"role": "system", "content": prompt}) 42 | 43 | 44 | while True: 45 | prompt = input(">>> ") 46 | adapter = input("Active adapter >>> ") 47 | messages.append({"role": "user", "content": prompt}) 48 | completion = client.chat.completions.create( 49 | model="mistral", 50 | messages=messages, 51 | max_tokens=256, 52 | frequency_penalty=1.0, 53 | top_p=0.1, 54 | temperature=0, 55 | extra_body={"adapters": [adapter]}, 56 | ) 57 | resp = completion.choices[0].message.content 58 | print(resp) 59 | messages.append({"role": "assistant", "content": resp}) 60 | -------------------------------------------------------------------------------- /mistralrs/examples/batching/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use mistralrs::{ 3 | ChatCompletionResponse, IsqType, PagedAttentionMetaBuilder, TextMessageRole, TextMessages, 4 | TextModelBuilder, Usage, 5 | }; 6 | 7 | const N_REQUESTS: usize = 10; 8 | 9 | #[tokio::main] 10 | async fn main() -> Result<()> { 11 | let model = TextModelBuilder::new("microsoft/Phi-3.5-mini-instruct") 12 | .with_isq(IsqType::Q8_0) 13 | .with_logging() 14 | .with_paged_attn(|| PagedAttentionMetaBuilder::default().build())? 15 | .build() 16 | .await?; 17 | 18 | let messages = TextMessages::new() 19 | .add_message( 20 | TextMessageRole::System, 21 | "You are an AI agent with a specialty in programming.", 22 | ) 23 | .add_message( 24 | TextMessageRole::User, 25 | "Hello! How are you? Please write generic binary search function in Rust.", 26 | ); 27 | 28 | let mut handles = Vec::new(); 29 | for _ in 0..N_REQUESTS { 30 | handles.push(model.send_chat_request(messages.clone())); 31 | } 32 | let responses = futures::future::join_all(handles) 33 | .await 34 | .into_iter() 35 | .collect::>>()?; 36 | 37 | let mut max_prompt = f32::MIN; 38 | let mut max_completion = f32::MIN; 39 | 40 | for response in responses { 41 | let ChatCompletionResponse { 42 | usage: 43 | Usage { 44 | avg_compl_tok_per_sec, 45 | avg_prompt_tok_per_sec, 46 | .. 47 | }, 48 | .. 
49 | } = response; 50 | dbg!(avg_compl_tok_per_sec, avg_prompt_tok_per_sec); 51 | if avg_compl_tok_per_sec > max_prompt { 52 | max_prompt = avg_prompt_tok_per_sec; 53 | } 54 | if avg_compl_tok_per_sec > max_completion { 55 | max_completion = avg_compl_tok_per_sec; 56 | } 57 | } 58 | println!("Individual sequence stats: {max_prompt} max PP T/s, {max_completion} max TG T/s"); 59 | 60 | Ok(()) 61 | } 62 | -------------------------------------------------------------------------------- /mistralrs-core/src/dummy_paged_attention/cache_engine.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | collections::HashMap, 3 | sync::{Arc, Mutex, MutexGuard}, 4 | }; 5 | 6 | use candle_core::{DType, Device, Result, Tensor}; 7 | 8 | use super::config::ModelConfigLike; 9 | 10 | #[derive(Clone, Debug)] 11 | pub struct CacheConfig { 12 | pub block_size: usize, 13 | pub num_gpu_blocks: usize, 14 | pub num_cpu_blocks: usize, 15 | } 16 | 17 | pub type KVCache = (Tensor, Tensor); 18 | 19 | pub struct CacheEngine { 20 | dummy_cache: Arc>>, 21 | } 22 | 23 | impl CacheEngine { 24 | pub fn new( 25 | _model_config: &dyn ModelConfigLike, 26 | _cache_config: &CacheConfig, 27 | _dtype: DType, 28 | _device: &Device, 29 | ) -> Result { 30 | Ok(Self { 31 | dummy_cache: Arc::new(Mutex::new(Vec::new())), 32 | }) 33 | } 34 | 35 | pub fn get_kv_cache(&self) -> MutexGuard<'_, Vec> { 36 | loop { 37 | if let Ok(v) = self.dummy_cache.try_lock() { 38 | return v; 39 | } 40 | } 41 | } 42 | } 43 | 44 | impl CacheEngine { 45 | pub fn execute_scheduler_ops( 46 | &self, 47 | blocks_to_swap_in: HashMap, 48 | blocks_to_swap_out: HashMap, 49 | blocks_to_copy: HashMap>, 50 | ) -> Result<()> { 51 | if !blocks_to_swap_in.is_empty() { 52 | self.swap_in(blocks_to_swap_in)?; 53 | } 54 | if !blocks_to_swap_out.is_empty() { 55 | self.swap_out(blocks_to_swap_out)?; 56 | } 57 | if !blocks_to_copy.is_empty() { 58 | self.copy(blocks_to_copy)?; 59 | } 60 | Ok(()) 61 | } 62 | 63 | pub fn swap_in(&self, _src_to_dst: HashMap) -> Result<()> { 64 | Ok(()) 65 | } 66 | 67 | pub fn swap_out(&self, _src_to_dst: HashMap) -> Result<()> { 68 | Ok(()) 69 | } 70 | 71 | pub fn copy(&self, _src_to_dst: HashMap>) -> Result<()> { 72 | Ok(()) 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /docs/ISQ.md: -------------------------------------------------------------------------------- 1 | # In situ quantization 2 | 3 | In situ quantization works by quantizing non GGUF or GGML models in-place. This allows you to take advantage of flash attention, and reduces memory footprint when running the model. Currently, all layers which would be `Linear` are able to be quantized. 4 | 5 | An API is exposed on the Python and Rust APIs which provide the ability to dynamically re-ISQ models at runtime. 6 | 7 | To set the ISQ type for individual layers, use a model [`topology`](TOPOLOGY.md). 8 | 9 | ## ISQ quantization types 10 | - Q4_0 11 | - Q4_1 12 | - Q5_0 13 | - Q5_1 14 | - Q8_0 15 | - Q8_1 (*not available on CUDA*) 16 | - Q2K 17 | - Q3K 18 | - Q4K 19 | - Q5K 20 | - Q6K 21 | - Q8K (*not available on CUDA*) 22 | - HQQ4 23 | - HQQ8 24 | - FP8 25 | 26 | When using ISQ, it will automatically load ISQ-able weights into CPU memory before applying ISQ. The ISQ application process moves the weights to device memory. This process is implemented to avoid memory spikes from loading the model in full precision. 
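To make the runtime re-ISQ API mentioned above concrete, here is a minimal, hedged sketch using the Python bindings. It assumes the `Runner` exposes a `send_re_isq` method taking an ISQ type name; treat that method name (and the GGUF-based construction, borrowed from the Python example below) as assumptions to verify against the current Python API.

```python
# Minimal sketch of dynamic re-ISQ at runtime via the Python API.
# Assumption: the pyo3 bindings expose `Runner.send_re_isq(<ISQ type name>)`;
# verify the exact method name and accepted values before relying on this.
from mistralrs import Runner, Which

runner = Runner(
    which=Which.GGUF(
        tok_model_id="mistralai/Mistral-7B-Instruct-v0.1",
        quantized_model_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF",
        quantized_filename="mistral-7b-instruct-v0.1.Q4_K_M.gguf",
    ),
    in_situ_quant="Q4K",  # ISQ type applied at load time
)

# Later, re-quantize the already-loaded model in place with a different ISQ type.
runner.send_re_isq("Q2K")
```

The intent is to change the quantization level of an already-loaded model without reloading it from scratch.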
27 | 28 | For Mixture of Expert models, a method called [MoQE](https://arxiv.org/abs/2310.02410) can be applied to only quantize MoE layers. This is configured via the ISQ organization parameter in all APIs. 29 | 30 | ## Python Example 31 | ```python 32 | runner = Runner( 33 | which=Which.GGUF( 34 | tok_model_id="mistralai/Mistral-7B-Instruct-v0.1", 35 | quantized_model_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF", 36 | quantized_filename="mistral-7b-instruct-v0.1.Q4_K_M.gguf", 37 | ), 38 | in_situ_quant="Q4K", 39 | ) 40 | ``` 41 | 42 | ## Rust Example 43 | You can find this example [here](../mistralrs/examples/isq/main.rs). 44 | 45 | ```rust 46 | let model = TextModelBuilder::new("microsoft/Phi-3.5-mini-instruct") 47 | .with_isq(IsqType::Q8_0) 48 | .with_logging() 49 | .with_paged_attn(|| PagedAttentionMetaBuilder::default().build())? 50 | .build() 51 | .await?; 52 | ``` 53 | 54 | ## Server example 55 | ``` 56 | cargo run --release --features "cuda flash-attn" -- --port 1234 --log output.txt --isq Q2K plain -m mistralai/Mistral-7B-Instruct-v0.1 -a mistral 57 | ``` -------------------------------------------------------------------------------- /mistralrs-quant/src/hqq/ffi.rs: -------------------------------------------------------------------------------- 1 | macro_rules! dequant_kernel { 2 | ($wq:ty, $scalar:ty, $postfix:tt) => { 3 | paste! { 4 | pub(crate) fn [< dequantize_ $postfix >]( 5 | wq_packed: *const $wq, 6 | scale: *const $scalar, 7 | zero: *const $scalar, 8 | out: *const $scalar, 9 | h: i32, 10 | w: i32 11 | ); 12 | } 13 | }; 14 | } 15 | 16 | pub mod eight_bit { 17 | use half::{bf16, f16}; 18 | use paste::paste; 19 | 20 | #[allow(dead_code)] 21 | extern "C" { 22 | dequant_kernel!(u8, f32, 8bit_u8_kernel_f32); 23 | dequant_kernel!(u8, f16, 8bit_u8_kernel_f16); 24 | dequant_kernel!(u8, bf16, 8bit_u8_kernel_bf16); 25 | } 26 | } 27 | 28 | pub mod four_bit { 29 | use half::{bf16, f16}; 30 | use paste::paste; 31 | 32 | #[allow(dead_code)] 33 | extern "C" { 34 | dequant_kernel!(u8, f32, 4bit_u8_kernel_f32); 35 | dequant_kernel!(u8, f16, 4bit_u8_kernel_f16); 36 | dequant_kernel!(u8, bf16, 4bit_u8_kernel_bf16); 37 | } 38 | } 39 | 40 | pub mod three_bit { 41 | use half::{bf16, f16}; 42 | use paste::paste; 43 | 44 | #[allow(dead_code)] 45 | extern "C" { 46 | dequant_kernel!(i32, f32, 3bit_32_kernel_f32); 47 | dequant_kernel!(i32, f16, 3bit_32_kernel_f16); 48 | dequant_kernel!(i32, bf16, 3bit_32_kernel_bf16); 49 | } 50 | } 51 | 52 | pub mod two_bit { 53 | use half::{bf16, f16}; 54 | use paste::paste; 55 | 56 | #[allow(dead_code)] 57 | extern "C" { 58 | dequant_kernel!(u8, f32, 2bit_u8_kernel_f32); 59 | dequant_kernel!(u8, f16, 2bit_u8_kernel_f16); 60 | dequant_kernel!(u8, bf16, 2bit_u8_kernel_bf16); 61 | } 62 | } 63 | 64 | pub mod one_bit { 65 | use half::{bf16, f16}; 66 | use paste::paste; 67 | 68 | #[allow(dead_code)] 69 | extern "C" { 70 | dequant_kernel!(u8, f32, 1bit_u8_kernel_f32); 71 | dequant_kernel!(u8, f16, 1bit_u8_kernel_f16); 72 | dequant_kernel!(u8, bf16, 1bit_u8_kernel_bf16); 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /mistralrs-paged-attn/src/attention/attention_utils.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp 3 | * Copyright (c) 2023, The vLLM team. 
4 | * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | #pragma once 19 | 20 | #include "../cuda_compat.h" 21 | #include "attention_dtypes.h" 22 | 23 | #include 24 | #include 25 | 26 | namespace vllm { 27 | 28 | // Q*K^T operation. 29 | template 30 | inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) { 31 | using A_vec = typename FloatVec::Type; 32 | // Compute the parallel products for Q*K^T (treat vector lanes separately). 33 | A_vec qk_vec = mul(q[0], k[0]); 34 | #pragma unroll 35 | for (int ii = 1; ii < N; ++ii) { 36 | qk_vec = fma(q[ii], k[ii], qk_vec); 37 | } 38 | 39 | // Finalize the reduction across lanes. 40 | float qk = sum(qk_vec); 41 | #pragma unroll 42 | for (int mask = THREAD_GROUP_SIZE / 2; mask >= 1; mask /= 2) { 43 | qk += VLLM_SHFL_XOR_SYNC(qk, mask); 44 | } 45 | return qk; 46 | } 47 | 48 | template 49 | struct Qk_dot { 50 | template 51 | static inline __device__ float dot(const Vec (&q)[N], const Vec (&k)[N]) { 52 | return qk_dot_(q, k); 53 | } 54 | }; 55 | 56 | } // namespace vllm 57 | -------------------------------------------------------------------------------- /mistralrs/examples/gguf_locally/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use mistralrs::{ 3 | GgufModelBuilder, PagedAttentionMetaBuilder, RequestBuilder, TextMessageRole, TextMessages, 4 | }; 5 | 6 | #[tokio::main] 7 | async fn main() -> Result<()> { 8 | // We do not use any files from remote servers here, and instead load the 9 | // chat template from the specified file, and the tokenizer and model from a 10 | // local GGUF file at the path specified. 11 | let model = GgufModelBuilder::new( 12 | "gguf_models/mistral_v0.1/", 13 | vec!["mistral-7b-instruct-v0.1.Q4_K_M.gguf"], 14 | ) 15 | .with_chat_template("chat_templates/mistral.json") 16 | .with_logging() 17 | .with_paged_attn(|| PagedAttentionMetaBuilder::default().build())? 18 | .build() 19 | .await?; 20 | 21 | let messages = TextMessages::new() 22 | .add_message( 23 | TextMessageRole::System, 24 | "You are an AI agent with a specialty in programming.", 25 | ) 26 | .add_message( 27 | TextMessageRole::User, 28 | "Hello! How are you? Please write generic binary search function in Rust.", 29 | ); 30 | 31 | let response = model.send_chat_request(messages).await?; 32 | 33 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 34 | dbg!( 35 | response.usage.avg_prompt_tok_per_sec, 36 | response.usage.avg_compl_tok_per_sec 37 | ); 38 | 39 | // Next example: Return some logprobs with the `RequestBuilder`, which enables higher configurability. 
40 | let request = RequestBuilder::new().return_logprobs(true).add_message( 41 | TextMessageRole::User, 42 | "Please write a mathematical equation where a few numbers are added.", 43 | ); 44 | 45 | let response = model.send_chat_request(request).await?; 46 | 47 | println!( 48 | "Logprobs: {:?}", 49 | &response.choices[0] 50 | .logprobs 51 | .as_ref() 52 | .unwrap() 53 | .content 54 | .as_ref() 55 | .unwrap()[0..3] 56 | ); 57 | 58 | Ok(()) 59 | } 60 | -------------------------------------------------------------------------------- /mistralrs-quant/src/gptq/gptq_cpu.rs: -------------------------------------------------------------------------------- 1 | use crate::{IsqType, QuantMethod, QuantMethodConfig, QuantizedSerde}; 2 | use candle_core::{DType, Device, Result, Tensor}; 3 | use std::{ 4 | num::NonZeroUsize, 5 | sync::{atomic::AtomicUsize, Arc}, 6 | }; 7 | 8 | #[derive(Debug)] 9 | pub struct GptqLayer; 10 | 11 | impl QuantMethod for GptqLayer { 12 | fn new(method: QuantMethodConfig) -> Result 13 | where 14 | Self: Sized, 15 | { 16 | match method { 17 | QuantMethodConfig::Gptq { 18 | bits: _, 19 | use_exllama: _, 20 | q_weight: _, 21 | gptq_qzeros: _, 22 | gptq_scales: _, 23 | g_idx: _, 24 | bias: _, 25 | } => candle_core::bail!("GPTQ is only supported on CUDA."), 26 | QuantMethodConfig::Gguf { .. } 27 | | QuantMethodConfig::Unquantized(_) 28 | | QuantMethodConfig::Hqq { .. } 29 | | QuantMethodConfig::Dummy 30 | | QuantMethodConfig::FP8 { .. } => { 31 | unreachable!() 32 | } 33 | } 34 | } 35 | 36 | fn forward(&self, _a: &Tensor) -> Result { 37 | todo!() 38 | } 39 | 40 | fn quantized_act_type(&self) -> Option { 41 | todo!() 42 | } 43 | 44 | fn add_delta_w(&self, _delta: &Tensor) -> Result> { 45 | todo!() 46 | } 47 | 48 | fn dtype_and_device(&self) -> (DType, candle_core::Device) { 49 | todo!() 50 | } 51 | 52 | fn get_bias_mut(&mut self) -> Option<&mut Tensor> { 53 | todo!() 54 | } 55 | 56 | fn apply_isq( 57 | self: Arc, 58 | _dtype: Option, 59 | _device: Device, 60 | _n_quantized: &AtomicUsize, 61 | ) -> Result> { 62 | todo!() 63 | } 64 | 65 | fn get_max_isq_cpu_threads(&self, _dtype: IsqType) -> Option { 66 | todo!() 67 | } 68 | } 69 | 70 | impl QuantizedSerde for GptqLayer { 71 | fn name(&self) -> &'static str { 72 | "gptq" 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /examples/server/idefics2.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | import httpx 3 | import textwrap 4 | import json 5 | 6 | 7 | def log_response(response: httpx.Response): 8 | request = response.request 9 | print(f"Request: {request.method} {request.url}") 10 | print(" Headers:") 11 | for key, value in request.headers.items(): 12 | if key.lower() == "authorization": 13 | value = "[...]" 14 | if key.lower() == "cookie": 15 | value = value.split("=")[0] + "=..." 16 | print(f" {key}: {value}") 17 | print(" Body:") 18 | try: 19 | request_body = json.loads(request.content) 20 | print(textwrap.indent(json.dumps(request_body, indent=2), " ")) 21 | except json.JSONDecodeError: 22 | print(textwrap.indent(request.content.decode(), " ")) 23 | print(f"Response: status_code={response.status_code}") 24 | print(" Headers:") 25 | for key, value in response.headers.items(): 26 | if key.lower() == "set-cookie": 27 | value = value.split("=")[0] + "=..." 
28 | print(f" {key}: {value}") 29 | 30 | 31 | client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/") 32 | 33 | # Enable this to log requests and responses 34 | # client._client = httpx.Client( 35 | # event_hooks={"request": [print], "response": [log_response]} 36 | # ) 37 | 38 | completion = client.chat.completions.create( 39 | model="idefics2", 40 | messages=[ 41 | { 42 | "role": "user", 43 | "content": [ 44 | { 45 | "type": "image_url", 46 | "image_url": { 47 | "url": "https://d2r55xnwy6nx47.cloudfront.net/uploads/2018/02/Ants_Lede1300.jpg" 48 | }, 49 | }, 50 | { 51 | "type": "text", 52 | "text": "What is shown in this image?", 53 | }, 54 | ], 55 | }, 56 | ], 57 | max_tokens=256, 58 | frequency_penalty=1.0, 59 | top_p=0.1, 60 | temperature=0, 61 | ) 62 | resp = completion.choices[0].message.content 63 | print(resp) 64 | -------------------------------------------------------------------------------- /examples/server/phi3v_local_img.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import httpx 3 | import textwrap 4 | import json 5 | 6 | 7 | def log_response(response: httpx.Response): 8 | request = response.request 9 | print(f"Request: {request.method} {request.url}") 10 | print(" Headers:") 11 | for key, value in request.headers.items(): 12 | if key.lower() == "authorization": 13 | value = "[...]" 14 | if key.lower() == "cookie": 15 | value = value.split("=")[0] + "=..." 16 | print(f" {key}: {value}") 17 | print(" Body:") 18 | try: 19 | request_body = json.loads(request.content) 20 | print(textwrap.indent(json.dumps(request_body, indent=2), " ")) 21 | except json.JSONDecodeError: 22 | print(textwrap.indent(request.content.decode(), " ")) 23 | print(f"Response: status_code={response.status_code}") 24 | print(" Headers:") 25 | for key, value in response.headers.items(): 26 | if key.lower() == "set-cookie": 27 | value = value.split("=")[0] + "=..." 28 | print(f" {key}: {value}") 29 | 30 | 31 | BASE_URL = "http://localhost:1234/v1" 32 | 33 | # Enable this to log requests and responses 34 | # openai.http_client = httpx.Client( 35 | # event_hooks={"request": [print], "response": [log_response]} 36 | # ) 37 | 38 | FILENAME = "picture.jpg" 39 | 40 | headers = { 41 | "Content-Type": "application/json", 42 | } 43 | 44 | payload = { 45 | "model": "phi3v", 46 | "messages": [ 47 | { 48 | "role": "user", 49 | "content": [ 50 | { 51 | "type": "image_url", 52 | "image_url": { 53 | "url": FILENAME, 54 | }, 55 | }, 56 | { 57 | "type": "text", 58 | "text": "<|image_1|>\nWhat is shown in this image? Write a detailed response analyzing the scene.", 59 | }, 60 | ], 61 | } 62 | ], 63 | "max_tokens": 300, 64 | } 65 | 66 | response = requests.post(f"{BASE_URL}/chat/completions", headers=headers, json=payload) 67 | print(response.json()) 68 | -------------------------------------------------------------------------------- /examples/server/llava.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | import httpx 3 | import textwrap 4 | import json 5 | 6 | 7 | def log_response(response: httpx.Response): 8 | request = response.request 9 | print(f"Request: {request.method} {request.url}") 10 | print(" Headers:") 11 | for key, value in request.headers.items(): 12 | if key.lower() == "authorization": 13 | value = "[...]" 14 | if key.lower() == "cookie": 15 | value = value.split("=")[0] + "=..." 
16 | print(f" {key}: {value}") 17 | print(" Body:") 18 | try: 19 | request_body = json.loads(request.content) 20 | print(textwrap.indent(json.dumps(request_body, indent=2), " ")) 21 | except json.JSONDecodeError: 22 | print(textwrap.indent(request.content.decode(), " ")) 23 | print(f"Response: status_code={response.status_code}") 24 | print(" Headers:") 25 | for key, value in response.headers.items(): 26 | if key.lower() == "set-cookie": 27 | value = value.split("=")[0] + "=..." 28 | print(f" {key}: {value}") 29 | 30 | 31 | client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/") 32 | 33 | # Enable this to log requests and responses 34 | # client._client = httpx.Client( 35 | # event_hooks={"request": [print], "response": [log_response]} 36 | # ) 37 | 38 | completion = client.chat.completions.create( 39 | model="llava", 40 | messages=[ 41 | { 42 | "role": "user", 43 | "content": [ 44 | { 45 | "type": "image_url", 46 | "image_url": { 47 | "url": "https://www.nhmagazine.com/content/uploads/2019/05/mtwashingtonFranconia-2-19-18-108-Edit-Edit.jpg" 48 | }, 49 | }, 50 | { 51 | "type": "text", 52 | "text": "What is shown in this image? Write a detailed response analyzing the scene.", 53 | }, 54 | ], 55 | }, 56 | ], 57 | max_tokens=256, 58 | frequency_penalty=1.0, 59 | top_p=0.1, 60 | temperature=0, 61 | ) 62 | resp = completion.choices[0].message.content 63 | print(resp) 64 | -------------------------------------------------------------------------------- /mistralrs-core/src/utils/memory_usage.rs: -------------------------------------------------------------------------------- 1 | use candle_core::{Device, Result}; 2 | use sysinfo::System; 3 | 4 | const KB_TO_BYTES: usize = 1024; 5 | 6 | pub struct MemoryUsage; 7 | 8 | impl MemoryUsage { 9 | /// Amount of available memory in bytes. 10 | pub fn get_memory_available(&self, device: &Device) -> Result { 11 | match device { 12 | Device::Cpu => { 13 | let mut sys = System::new_all(); 14 | sys.refresh_cpu(); 15 | Ok(usize::try_from(sys.free_memory())? * KB_TO_BYTES) 16 | } 17 | #[cfg(feature = "cuda")] 18 | Device::Cuda(_) => { 19 | use candle_core::cuda_backend::WrapErr; 20 | Ok(candle_core::cuda::cudarc::driver::result::mem_get_info() 21 | .w()? 22 | .0) 23 | } 24 | #[cfg(not(feature = "cuda"))] 25 | Device::Cuda(_) => { 26 | candle_core::bail!("Cannot get memory available for CUDA device") 27 | } 28 | Device::Metal(_) => { 29 | candle_core::bail!("Cannot get memory available for Metal device") 30 | } 31 | } 32 | } 33 | 34 | /// Amount of total memory in bytes. 35 | pub fn get_total_memory(&self, device: &Device) -> Result { 36 | match device { 37 | Device::Cpu => { 38 | let mut sys = System::new_all(); 39 | sys.refresh_cpu(); 40 | Ok(usize::try_from(sys.total_memory())? * KB_TO_BYTES) 41 | } 42 | #[cfg(feature = "cuda")] 43 | Device::Cuda(_) => { 44 | use candle_core::cuda_backend::WrapErr; 45 | Ok(candle_core::cuda::cudarc::driver::result::mem_get_info() 46 | .w()? 
47 | .1) 48 | } 49 | #[cfg(not(feature = "cuda"))] 50 | Device::Cuda(_) => { 51 | candle_core::bail!("Cannot get total memory for CUDA device") 52 | } 53 | Device::Metal(_) => { 54 | candle_core::bail!("Cannot get total memory for Metal device") 55 | } 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /examples/server/phi3v.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | import httpx 3 | import textwrap 4 | import json 5 | 6 | 7 | def log_response(response: httpx.Response): 8 | request = response.request 9 | print(f"Request: {request.method} {request.url}") 10 | print(" Headers:") 11 | for key, value in request.headers.items(): 12 | if key.lower() == "authorization": 13 | value = "[...]" 14 | if key.lower() == "cookie": 15 | value = value.split("=")[0] + "=..." 16 | print(f" {key}: {value}") 17 | print(" Body:") 18 | try: 19 | request_body = json.loads(request.content) 20 | print(textwrap.indent(json.dumps(request_body, indent=2), " ")) 21 | except json.JSONDecodeError: 22 | print(textwrap.indent(request.content.decode(), " ")) 23 | print(f"Response: status_code={response.status_code}") 24 | print(" Headers:") 25 | for key, value in response.headers.items(): 26 | if key.lower() == "set-cookie": 27 | value = value.split("=")[0] + "=..." 28 | print(f" {key}: {value}") 29 | 30 | 31 | client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/") 32 | 33 | # Enable this to log requests and responses 34 | # client._client = httpx.Client( 35 | # event_hooks={"request": [print], "response": [log_response]} 36 | # ) 37 | 38 | completion = client.chat.completions.create( 39 | model="phi3v", 40 | messages=[ 41 | { 42 | "role": "user", 43 | "content": [ 44 | { 45 | "type": "image_url", 46 | "image_url": { 47 | "url": "https://www.nhmagazine.com/content/uploads/2019/05/mtwashingtonFranconia-2-19-18-108-Edit-Edit.jpg" 48 | }, 49 | }, 50 | { 51 | "type": "text", 52 | "text": "<|image_1|>\nWhat is shown in this image? Write a detailed response analyzing the scene.", 53 | }, 54 | ], 55 | }, 56 | ], 57 | max_tokens=256, 58 | frequency_penalty=1.0, 59 | top_p=0.1, 60 | temperature=0, 61 | ) 62 | resp = completion.choices[0].message.content 63 | print(resp) 64 | -------------------------------------------------------------------------------- /examples/server/llama_vision.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | import httpx 3 | import textwrap 4 | import json 5 | 6 | 7 | def log_response(response: httpx.Response): 8 | request = response.request 9 | print(f"Request: {request.method} {request.url}") 10 | print(" Headers:") 11 | for key, value in request.headers.items(): 12 | if key.lower() == "authorization": 13 | value = "[...]" 14 | if key.lower() == "cookie": 15 | value = value.split("=")[0] + "=..." 16 | print(f" {key}: {value}") 17 | print(" Body:") 18 | try: 19 | request_body = json.loads(request.content) 20 | print(textwrap.indent(json.dumps(request_body, indent=2), " ")) 21 | except json.JSONDecodeError: 22 | print(textwrap.indent(request.content.decode(), " ")) 23 | print(f"Response: status_code={response.status_code}") 24 | print(" Headers:") 25 | for key, value in response.headers.items(): 26 | if key.lower() == "set-cookie": 27 | value = value.split("=")[0] + "=..." 
28 | print(f" {key}: {value}") 29 | 30 | 31 | client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/") 32 | 33 | # Enable this to log requests and responses 34 | # client._client = httpx.Client( 35 | # event_hooks={"request": [print], "response": [log_response]} 36 | # ) 37 | 38 | completion = client.chat.completions.create( 39 | model="llama-vision", 40 | messages=[ 41 | { 42 | "role": "user", 43 | "content": [ 44 | { 45 | "type": "image_url", 46 | "image_url": { 47 | "url": "https://www.nhmagazine.com/content/uploads/2019/05/mtwashingtonFranconia-2-19-18-108-Edit-Edit.jpg" 48 | }, 49 | }, 50 | { 51 | "type": "text", 52 | "text": "What is shown in this image? Write a detailed response analyzing the scene.", 53 | }, 54 | ], 55 | }, 56 | ], 57 | max_tokens=256, 58 | frequency_penalty=1.0, 59 | top_p=0.1, 60 | temperature=0, 61 | ) 62 | resp = completion.choices[0].message.content 63 | print(resp) 64 | -------------------------------------------------------------------------------- /examples/server/llava_next.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | import httpx 3 | import textwrap 4 | import json 5 | 6 | 7 | def log_response(response: httpx.Response): 8 | request = response.request 9 | print(f"Request: {request.method} {request.url}") 10 | print(" Headers:") 11 | for key, value in request.headers.items(): 12 | if key.lower() == "authorization": 13 | value = "[...]" 14 | if key.lower() == "cookie": 15 | value = value.split("=")[0] + "=..." 16 | print(f" {key}: {value}") 17 | print(" Body:") 18 | try: 19 | request_body = json.loads(request.content) 20 | print(textwrap.indent(json.dumps(request_body, indent=2), " ")) 21 | except json.JSONDecodeError: 22 | print(textwrap.indent(request.content.decode(), " ")) 23 | print(f"Response: status_code={response.status_code}") 24 | print(" Headers:") 25 | for key, value in response.headers.items(): 26 | if key.lower() == "set-cookie": 27 | value = value.split("=")[0] + "=..." 28 | print(f" {key}: {value}") 29 | 30 | 31 | client = OpenAI(api_key="foobar", base_url="http://localhost:1234/v1/") 32 | 33 | # Enable this to log requests and responses 34 | # client._client = httpx.Client( 35 | # event_hooks={"request": [print], "response": [log_response]} 36 | # ) 37 | 38 | completion = client.chat.completions.create( 39 | model="llava_next", 40 | messages=[ 41 | { 42 | "role": "user", 43 | "content": [ 44 | { 45 | "type": "image_url", 46 | "image_url": { 47 | "url": "https://www.nhmagazine.com/content/uploads/2019/05/mtwashingtonFranconia-2-19-18-108-Edit-Edit.jpg" 48 | }, 49 | }, 50 | { 51 | "type": "text", 52 | "text": "What is shown in this image? 
Write a detailed response analyzing the scene.", 53 | }, 54 | ], 55 | }, 56 | ], 57 | max_tokens=256, 58 | frequency_penalty=1.0, 59 | top_p=0.1, 60 | temperature=0, 61 | ) 62 | resp = completion.choices[0].message.content 63 | print(resp) 64 | -------------------------------------------------------------------------------- /mistralrs/examples/topology/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use mistralrs::{ 3 | IsqType, LayerTopology, PagedAttentionMetaBuilder, TextMessageRole, TextMessages, 4 | TextModelBuilder, Topology, 5 | }; 6 | 7 | #[tokio::main] 8 | async fn main() -> Result<()> { 9 | let model = TextModelBuilder::new("microsoft/Phi-3.5-mini-instruct") 10 | .with_isq(IsqType::Q8_0) 11 | .with_topology( 12 | Topology::empty() 13 | .with_range( 14 | 0..8, 15 | LayerTopology { 16 | isq: Some(IsqType::Q3K), 17 | device: None, 18 | }, 19 | ) 20 | .with_range( 21 | 8..16, 22 | LayerTopology { 23 | isq: Some(IsqType::Q4K), 24 | device: None, 25 | }, 26 | ) 27 | .with_range( 28 | 16..24, 29 | LayerTopology { 30 | isq: Some(IsqType::Q6K), 31 | device: None, 32 | }, 33 | ) 34 | .with_range( 35 | 24..32, 36 | LayerTopology { 37 | isq: Some(IsqType::Q8_0), 38 | device: None, 39 | }, 40 | ), 41 | ) 42 | .with_logging() 43 | .with_paged_attn(|| PagedAttentionMetaBuilder::default().build())? 44 | .build() 45 | .await?; 46 | 47 | let messages = TextMessages::new() 48 | .add_message( 49 | TextMessageRole::System, 50 | "You are an AI agent with a specialty in programming.", 51 | ) 52 | .add_message( 53 | TextMessageRole::User, 54 | "Hello! How are you? Please write generic binary search function in Rust.", 55 | ); 56 | 57 | let response = model.send_chat_request(messages).await?; 58 | 59 | println!("{}", response.choices[0].message.content.as_ref().unwrap()); 60 | dbg!( 61 | response.usage.avg_prompt_tok_per_sec, 62 | response.usage.avg_compl_tok_per_sec 63 | ); 64 | 65 | Ok(()) 66 | } 67 | -------------------------------------------------------------------------------- /examples/server/phi3v_base64.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import httpx 3 | import textwrap 4 | import json 5 | import base64 6 | 7 | 8 | def log_response(response: httpx.Response): 9 | request = response.request 10 | print(f"Request: {request.method} {request.url}") 11 | print(" Headers:") 12 | for key, value in request.headers.items(): 13 | if key.lower() == "authorization": 14 | value = "[...]" 15 | if key.lower() == "cookie": 16 | value = value.split("=")[0] + "=..." 17 | print(f" {key}: {value}") 18 | print(" Body:") 19 | try: 20 | request_body = json.loads(request.content) 21 | print(textwrap.indent(json.dumps(request_body, indent=2), " ")) 22 | except json.JSONDecodeError: 23 | print(textwrap.indent(request.content.decode(), " ")) 24 | print(f"Response: status_code={response.status_code}") 25 | print(" Headers:") 26 | for key, value in response.headers.items(): 27 | if key.lower() == "set-cookie": 28 | value = value.split("=")[0] + "=..." 
29 | print(f" {key}: {value}") 30 | 31 | 32 | BASE_URL = "http://localhost:1234/v1" 33 | 34 | # Enable this to log requests and responses 35 | # openai.http_client = httpx.Client( 36 | # event_hooks={"request": [print], "response": [log_response]} 37 | # ) 38 | 39 | FILENAME = "picture.jpg" 40 | with open(FILENAME, "rb") as image_file: 41 | encoded_string = base64.b64encode(image_file.read()).decode("utf-8") 42 | 43 | headers = { 44 | "Content-Type": "application/json", 45 | } 46 | 47 | payload = { 48 | "model": "phi3v", 49 | "messages": [ 50 | { 51 | "role": "user", 52 | "content": [ 53 | { 54 | "type": "image_url", 55 | "image_url": { 56 | "url": str(encoded_string), 57 | }, 58 | }, 59 | { 60 | "type": "text", 61 | "text": "<|image_1|>\nWhat is shown in this image? Write a detailed response analyzing the scene.", 62 | }, 63 | ], 64 | } 65 | ], 66 | "max_tokens": 300, 67 | } 68 | 69 | response = requests.post(f"{BASE_URL}/chat/completions", headers=headers, json=payload) 70 | print(response.json()) 71 | -------------------------------------------------------------------------------- /mistralrs-paged-attn/src/backend/mod.rs: -------------------------------------------------------------------------------- 1 | mod cache; 2 | mod paged_attention; 3 | 4 | use std::{ 5 | marker::PhantomData, 6 | ptr::{addr_of, NonNull}, 7 | }; 8 | 9 | use candle_core::{ 10 | cuda::cudarc::driver::DeviceRepr, cuda_backend::cudarc::driver::CudaFunction, CudaDevice, 11 | DType, Result, 12 | }; 13 | 14 | pub use cache::{copy_blocks, swap_blocks}; 15 | pub use paged_attention::{paged_attention, reshape_and_cache}; 16 | 17 | const COPY_BLOCKS_KERNEL_NAME: &str = "copy_blocks_kernel"; 18 | 19 | pub fn get_or_load_func( 20 | ptx_file: &'static str, 21 | kernel_base: &str, 22 | dtype: DType, 23 | suffix: Option<&str>, 24 | device: &CudaDevice, 25 | ) -> Result { 26 | let spec = match dtype { 27 | DType::U8 => "_u8", 28 | DType::U32 => "_u32", 29 | DType::I16 => "_i16", 30 | DType::I32 => "_i32", 31 | DType::I64 => "_i64", 32 | DType::BF16 => "_bf16", 33 | DType::F16 => "_f16", 34 | DType::F32 => "_f32", 35 | DType::F64 => "_f64", 36 | DType::F8E4M3 => "_f8_e4m3", 37 | }; 38 | let spec = if let Some(suffix) = suffix { 39 | spec.to_owned() + suffix 40 | } else { 41 | spec.to_owned() 42 | }; 43 | let kernel = kernel_base.to_owned() + &spec; 44 | device.get_or_load_func(&kernel, ptx_file) 45 | } 46 | 47 | #[repr(transparent)] 48 | struct Conjoined<'a, T, R> { 49 | raw: *mut T, 50 | _ref: PhantomData<&'a mut R>, 51 | } 52 | 53 | impl<'a, T, R> Conjoined<'a, T, R> { 54 | fn new(raw: NonNull, _ref: &'a mut R) -> Self { 55 | Self { 56 | raw: raw.as_ptr(), 57 | _ref: PhantomData, 58 | } 59 | } 60 | } 61 | 62 | /// According to the docs: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1gb8f3dc3031b40da29d5f9a7139e52e15 63 | /// Each of the kernel params (*mut c_void) "must point to a region of memory from which the actual kernel parameter will be copied". 64 | /// This means that we must return a pointer to our pointer. 65 | /// 66 | /// ## Safety 67 | /// - The returned pointer **must not** outlive the &self reference. Otherwise, a dangling pointer is created. 
68 | unsafe impl<'a, T, R> DeviceRepr for Conjoined<'a, T, R> { 69 | fn as_kernel_param(&self) -> *mut std::ffi::c_void { 70 | addr_of!(self.raw) as *mut _ 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /docs/CHAT_TOK.md: -------------------------------------------------------------------------------- 1 | # Chat templates and tokenizer customization 2 | 3 | ## Chat templates 4 | Mistral.rs attempts to automatically load a chat template from the `tokenizer_config.json` file. This enables high flexibility across instruction-tuned models and ensures accurate chat templating. However, if the `chat_template` field is missing, then a JINJA chat template should be provided. The JINJA chat template may use `messages`, `add_generation_prompt`, `bos_token`, `eos_token`, and `unk_token` as inputs. 5 | 6 | We provide some chat templates [here](../chat_templates/), and it is easy to modify or create others to customize chat template behavior. 7 | 8 | For example, to use the `chatml` template, specify `--chat-template` *before* the model architecture: 9 | 10 | ```bash 11 | ./mistralrs-server --port 1234 --log output.log --chat-template ./chat_templates/chatml.json llama 12 | ``` 13 | 14 | > Note: For GGUF models, the chat template may be loaded directly from the GGUF file by omitting any other chat template sources. 15 | 16 | ## Tokenizer 17 | 18 | Some models do not provide a `tokenizer.json` file although mistral.rs expects one. To solve this, please run [this](../scripts/get_tokenizers_json.py) script. It will output the `tokenizer.json` file for your specific model. Pass the resulting file with the `--tokenizer-json` flag *after* the model architecture. For example: 19 | 20 | ```bash 21 | $ python3 scripts/get_tokenizers_json.py 22 | Enter model ID: microsoft/Orca-2-13b 23 | $ ./mistralrs-server --port 1234 --log output.log plain -m microsoft/Orca-2-13b --tokenizer-json tokenizer.json 24 | ``` 25 | 26 | Putting it all together, to run, for example, an [Orca](https://huggingface.co/microsoft/Orca-2-13b) model (which does not come with a `tokenizer.json` or chat template): 27 | 1) Generate the `tokenizer.json` by running the script at `scripts/get_tokenizers_json.py`. This will output some files, including `tokenizer.json`, in the working directory. 28 | 2) Find and copy the correct chat template from `chat_templates` to the working directory (e.g., `cp chat_templates/chatml.json .`). 29 | 3) Run `mistralrs-server`, specifying the tokenizer and chat template: `cargo run --release --features cuda -- --port 1234 --log output.txt --chat-template chatml.json plain -m microsoft/Orca-2-13b -t tokenizer.json -a llama` 30 | 31 | > Note: For GGUF models, the tokenizer may be loaded directly from the GGUF file by omitting the tokenizer model ID. 32 | --------------------------------------------------------------------------------