'"
12 | raise ValueError(msg)
13 |
14 | try:
15 | scheme = json.loads(sys.argv[1])
16 | llm_kwargs = json.loads(sys.argv[2])
17 | prompts = json.loads(sys.argv[3])
18 | except json.JSONDecodeError as e:
19 | raise ValueError(f"Invalid JSON input: {e}")
20 |
21 | if "W4A16_2of4" in scheme:
22 | # required by the kernel
23 | llm_kwargs["dtype"] = torch.float16
24 |
25 | return llm_kwargs, prompts
26 |
27 |
28 | def run_vllm(llm_kwargs: dict, prompts: list[str]) -> None:
29 | """Run vLLM with given kwargs and prompts, then print outputs."""
30 | sampling_params = SamplingParams(temperature=0.80, top_p=0.95)
31 |
32 | llm = LLM(**llm_kwargs)
33 | outputs = llm.generate(prompts, sampling_params)
34 |
35 | print("================= vLLM GENERATION =================")
36 | for output in outputs:
37 | if not output or not output.outputs:
38 | print("[Warning] Empty output for prompt:", output.prompt)
39 | continue
40 |
41 | print(f"\nPROMPT:\n{output.prompt}")
42 | print(f"GENERATED TEXT:\n{output.outputs[0].text}")
43 |
44 |
45 | def main():
46 | llm_kwargs, prompts = parse_args()
47 | run_vllm(llm_kwargs, prompts)
48 |
49 |
50 | if __name__ == "__main__":
51 | main()
52 |
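
A minimal invocation sketch for the script above (the script filename and model path are placeholders, not taken from the source): the script expects three positional arguments, each a JSON-encoded string, giving the quantization scheme, the LLM constructor kwargs, and the prompt list.

# Hypothetical invocation; "run_vllm_smoke.py" and the model path are placeholders.
import json
import subprocess

scheme = "W4A16_2of4"
llm_kwargs = {"model": "path/to/quantized-model", "max_model_len": 2048}
prompts = ["The capital of France is", "Large language models are"]

subprocess.run(
    [
        "python",
        "run_vllm_smoke.py",  # placeholder path for the script shown above
        json.dumps(scheme),
        json.dumps(llm_kwargs),
        json.dumps(prompts),
    ],
    check=True,
)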
--------------------------------------------------------------------------------
/llm-compressor/examples/quantization_w8a8_fp8/llava1.5_example.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoProcessor, LlavaForConditionalGeneration
2 |
3 | from llmcompressor import oneshot
4 | from llmcompressor.modifiers.quantization import QuantizationModifier
5 | from llmcompressor.utils import dispatch_for_generation
6 |
7 | MODEL_ID = "llava-hf/llava-1.5-7b-hf"
8 |
9 | # Load model.
10 | model = LlavaForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype="auto")
11 | processor = AutoProcessor.from_pretrained(MODEL_ID)
12 |
13 | # Configure the quantization algorithm and scheme.
14 | # In this case, we:
15 | # * quantize the weights to fp8 with per-channel scales via PTQ
16 | # * quantize the activations to fp8 with dynamic per-token scales
17 | recipe = QuantizationModifier(
18 | targets="Linear",
19 | scheme="FP8_DYNAMIC",
20 | ignore=["re:.*lm_head", "re:.*multi_modal_projector.*", "re:.*vision_tower.*"],
21 | )
22 |
23 | # Apply quantization and save to disk in compressed-tensors format.
24 | oneshot(model=model, recipe=recipe)
25 |
26 | # Confirm generations of the quantized model look sane.
27 | print("========== SAMPLE GENERATION ==============")
28 | dispatch_for_generation(model)
29 | input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to(
30 | model.device
31 | )
32 | output = model.generate(input_ids, max_new_tokens=20)
33 | print(processor.decode(output[0]))
34 | print("==========================================")
35 |
36 | # Save to disk in compressed-tensors format.
37 | SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-Dynamic"
38 | model.save_pretrained(SAVE_DIR, save_compressed=True)
39 | processor.save_pretrained(SAVE_DIR)
40 |
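
Once saved, the compressed-tensors checkpoint can typically be loaded straight into vLLM. A minimal sketch, assuming vLLM is installed and the "llava-1.5-7b-hf-FP8-Dynamic" directory produced above exists:

# Sketch only: generate from the FP8-Dynamic checkpoint saved by the example above.
from vllm import LLM, SamplingParams

llm = LLM(model="llava-1.5-7b-hf-FP8-Dynamic")
outputs = llm.generate(["Hello my name is"], SamplingParams(temperature=0.8, max_tokens=20))
print(outputs[0].outputs[0].text)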
--------------------------------------------------------------------------------
/llm-compressor/src/llmcompressor/recipe/metadata.py:
--------------------------------------------------------------------------------
1 | """
2 | Metadata classes for recipe and model information tracking.
3 |
4 | This module defines Pydantic models for capturing and validating metadata about
5 | datasets, parameters, layers, and models used in compression recipes. Provides
6 | structured data containers for recipe configuration and execution tracking.
7 | """
8 |
9 | from typing import Any, Dict, List, Optional
10 |
11 | from pydantic import BaseModel, Field
12 |
13 | __all__ = [
14 | "DatasetMetaData",
15 | "ParamMetaData",
16 | "LayerMetaData",
17 | "ModelMetaData",
18 | ]
19 |
20 |
21 | class DatasetMetaData(BaseModel):
22 | name: Optional[str] = None
23 | version: Optional[str] = None
24 | hash: Optional[str] = None
25 | shape: List[int] = Field(default_factory=list)
26 | num_classes: Optional[int] = None
27 | num_train_samples: Optional[int] = None
28 | num_val_samples: Optional[int] = None
29 | num_test_samples: Optional[int] = None
30 |
31 |
32 | class ParamMetaData(BaseModel):
33 | name: Optional[str] = None
34 | shape: Optional[List[int]] = None
35 | weight_hash: Optional[str] = None
36 |
37 |
38 | class LayerMetaData(BaseModel):
39 | name: Optional[str] = None
40 | type: Optional[str] = None
41 | index: Optional[int] = None
42 | attributes: Optional[Dict[str, Any]] = None
43 | input_shapes: Optional[List[List[int]]] = None
44 | output_shapes: Optional[List[List[int]]] = None
45 | params: Optional[Dict[str, ParamMetaData]] = None
46 |
47 |
48 | class ModelMetaData(BaseModel):
49 | architecture: Optional[str] = None
50 | sub_architecture: Optional[str] = None
51 | input_shapes: Optional[List[List[int]]] = None
52 | output_shapes: Optional[List[List[int]]] = None
53 | layers: List[LayerMetaData] = Field(default_factory=list)
54 | layer_prefix: Optional[str] = None
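
For orientation, a small usage sketch of these containers; the layer name, shapes, and architecture below are illustrative values, not from the source.

# Illustrative only: describe one linear layer and attach it to a model record.
from llmcompressor.recipe.metadata import LayerMetaData, ModelMetaData, ParamMetaData

layer = LayerMetaData(
    name="model.layers.0.self_attn.q_proj",
    type="Linear",
    index=0,
    params={"weight": ParamMetaData(name="weight", shape=[4096, 4096])},
)
model_meta = ModelMetaData(architecture="LlamaForCausalLM", layers=[layer])
print(model_meta.model_dump()["architecture"])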
55 |
--------------------------------------------------------------------------------
/llm-compressor/examples/quantization_w8a8_fp8/llama3.2_vision_example.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoProcessor, MllamaForConditionalGeneration
2 |
3 | from llmcompressor import oneshot
4 | from llmcompressor.modifiers.quantization import QuantizationModifier
5 | from llmcompressor.utils import dispatch_for_generation
6 |
7 | MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct"
8 |
9 | # Load model.
10 | model = MllamaForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype="auto")
11 | processor = AutoProcessor.from_pretrained(MODEL_ID)
12 |
13 | # Configure the quantization algorithm and scheme.
14 | # In this case, we:
15 | # * quantize the weights to fp8 with per-channel scales via PTQ
16 | # * quantize the activations to fp8 with dynamic per-token scales
17 | recipe = QuantizationModifier(
18 | targets="Linear",
19 | scheme="FP8_DYNAMIC",
20 | ignore=["re:.*lm_head", "re:.*multi_modal_projector.*", "re:.*vision_model.*"],
21 | )
22 |
23 | # Apply quantization and save to disk in compressed-tensors format.
24 | oneshot(model=model, recipe=recipe)
25 |
26 | # Confirm generations of the quantized model look sane.
27 | print("========== SAMPLE GENERATION ==============")
28 | dispatch_for_generation(model)
29 | input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to(
30 | model.device
31 | )
32 | output = model.generate(input_ids, max_new_tokens=20)
33 | print(processor.decode(output[0]))
34 | print("==========================================")
35 |
36 | # Save to disk in compressed-tensors format.
37 | SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-Dynamic"
38 | model.save_pretrained(SAVE_DIR, save_compressed=True)
39 | processor.save_pretrained(SAVE_DIR)
40 |
--------------------------------------------------------------------------------
/setup_env.sh:
--------------------------------------------------------------------------------
1 | GIT_LFS_SKIP_SMUDGE=1 pip install -e ".[dev]"
2 | pip install torch==2.7.1
3 | pip install torchaudio==2.7.1
4 | pip install flash-attn==2.7.4.post1 --no-build-isolation
5 | pip install trl==0.21.0
6 | pip install vllm==0.10.1
7 | # replace vllm/vllm/lora/models.py with vllm_replacement/models.py
8 | site_pkg_path=$(python -c 'import site; print(site.getsitepackages()[0])')
9 | cp -rv replacement/vllm_replacement/models.py $site_pkg_path/vllm/lora/models.py
10 | # replace vllm/vllm/lora/worker_manager.py with vllm_replacement/worker_manager.py
11 | cp -rv replacement/vllm_replacement/worker_manager.py $site_pkg_path/vllm/lora/worker_manager.py
12 | # make empty folders to satisfy asserts in vllm lora requests
13 | mkdir -p simon_lora_path simon_stub_path
14 |
15 | pip install peft
16 |
17 | git clone --branch 0.11.0 --depth 1 https://github.com/neuralmagic/compressed-tensors.git
18 | cd compressed-tensors
19 | pip install -e . --no-deps
20 | cd ..
21 | # replace compressed-tensors/src/compressed_tensors/linear/compressed_linear.py with compressed-tensors_replacement/compressed_linear.py
22 | cp replacement/compressed-tensors_replacement/compressed_linear.py compressed-tensors/src/compressed_tensors/linear/compressed_linear.py
23 | # replace compressed-tensors/src/compressed_tensors/quantization/lifecycle/forward.py with compressed-tensors_replacement/forward.py
24 | cp replacement/compressed-tensors_replacement/forward.py compressed-tensors/src/compressed_tensors/quantization/lifecycle/forward.py
25 |
26 | pip install accelerate==1.10.1 --no-deps
27 |
28 | site_pkg_path=$(python -c 'import site; print(site.getsitepackages()[0])')
29 | cp -rv replacement/trainer.py $site_pkg_path/transformers/trainer.py
30 |
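
A quick post-install sanity check, assuming the pins above completed successfully; the expected versions are the ones installed by this script.

# Sketch: confirm the pinned packages import and report their versions.
import torch
import vllm

print("torch", torch.__version__)  # expected 2.7.1
print("vllm", vllm.__version__)    # expected 0.10.1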
--------------------------------------------------------------------------------
/llm-compressor/tests/unit/core/events/test_event.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from llmcompressor.core import Event, EventType
4 |
5 |
6 | @pytest.mark.smoke
7 | def test_event_epoch_based():
8 | event = Event(steps_per_epoch=10)
9 | assert event.epoch_based is True
10 |
11 |
12 | @pytest.mark.smoke
13 | def test_event_epoch():
14 | event = Event(steps_per_epoch=10, global_step=25)
15 | assert event.epoch == 2
16 |
17 |
18 | @pytest.mark.smoke
19 | def test_event_epoch_full():
20 | event = Event(steps_per_epoch=10, global_step=25)
21 | assert event.epoch_full == 2.5
22 |
23 |
24 | @pytest.mark.smoke
25 | def test_event_epoch_step():
26 | event = Event(steps_per_epoch=10, global_step=25)
27 | assert event.epoch_step == 5
28 |
29 |
30 | @pytest.mark.smoke
31 | def test_event_epoch_batch():
32 | event = Event(
33 | steps_per_epoch=10, global_step=25, batches_per_step=2, global_batch=50
34 | )
35 | assert event.epoch_batch == 10
36 |
37 |
38 | @pytest.mark.smoke
39 | def test_event_current_index():
40 | event = Event(steps_per_epoch=10, global_step=25)
41 | assert event.current_index == 2.5
42 |
43 |
44 | @pytest.mark.smoke
45 | def test_event_should_update():
46 | event = Event(steps_per_epoch=10, global_step=25)
47 | assert event.should_update(start=0, end=30, update=2.5) is True
48 | assert event.should_update(start=0, end=20, update=5) is False
49 | assert event.should_update(start=0, end=30, update=0) is True
50 |
51 |
52 | @pytest.mark.smoke
53 | def test_event_new_instance():
54 | event = Event(type_=EventType.INITIALIZE, global_step=25)
55 | new_event = event.new_instance(global_step=30)
56 | assert new_event.global_step == 30
57 | assert new_event.type_ == EventType.INITIALIZE
58 |
--------------------------------------------------------------------------------
/llm-compressor/docs/developer/index.md:
--------------------------------------------------------------------------------
1 | ---
2 | weight: -3
3 | ---
4 |
5 | # Developer
6 |
7 | Welcome to the Developer section of LLM Compressor! This area provides essential resources for developers who want to contribute to or extend LLM Compressor. Whether you're interested in fixing bugs, adding new features, improving documentation, or understanding the project's governance, you'll find comprehensive guides to help you get started.
8 |
9 | LLM Compressor is an open-source project that values community contributions. We maintain high standards for code quality, documentation, and community interactions to ensure that LLM Compressor remains a robust, reliable, and user-friendly tool for compressing large language models.
10 |
11 | ## Developer Resources
12 |
13 |
14 |
15 | - :material-handshake:{ .lg .middle } Code of Conduct
16 |
17 | ---
18 |
19 | Our community guidelines ensure that participation in the LLM Compressor project is a positive, inclusive, and respectful experience for everyone.
20 |
21 | [:octicons-arrow-right-24: Code of Conduct](code-of-conduct.md)
22 |
23 | - :material-source-pull:{ .lg .middle } Contributing Guide
24 |
25 | ---
26 |
27 | Learn how to effectively contribute to LLM Compressor, including reporting bugs, suggesting features, improving documentation, and submitting code.
28 |
29 | [:octicons-arrow-right-24: Contributing Guide](contributing.md)
30 |
31 | - :material-tools:{ .lg .middle } Development Guide
32 |
33 | ---
34 |
35 | Detailed instructions for setting up your development environment, implementing changes, and adhering to the project's coding standards and best practices.
36 |
37 | [:octicons-arrow-right-24: Development Guide](developing.md)
38 |
39 |
40 |
--------------------------------------------------------------------------------
/src/open_r1/utils/model_utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizer
3 |
4 | from trl import ModelConfig, get_kbit_device_map, get_quantization_config
5 |
6 | from ..configs import GRPOConfig, SFTConfig
7 |
8 |
9 | def get_tokenizer(model_args: ModelConfig, training_args: SFTConfig | GRPOConfig) -> PreTrainedTokenizer:
10 | """Get the tokenizer for the model."""
11 | tokenizer = AutoTokenizer.from_pretrained(
12 | model_args.model_name_or_path,
13 | revision=model_args.model_revision,
14 | trust_remote_code=model_args.trust_remote_code,
15 | )
16 |
17 | if training_args.chat_template is not None:
18 | tokenizer.chat_template = training_args.chat_template
19 |
20 | return tokenizer
21 |
22 |
23 | def get_model(model_args: ModelConfig, training_args: SFTConfig | GRPOConfig) -> AutoModelForCausalLM:
24 | """Get the model"""
25 | torch_dtype = (
26 | model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype)
27 | )
28 | quantization_config = get_quantization_config(model_args)
29 | model_kwargs = dict(
30 | revision=model_args.model_revision,
31 | trust_remote_code=model_args.trust_remote_code,
32 | attn_implementation=model_args.attn_implementation,
33 | torch_dtype=torch_dtype,
34 | use_cache=False if training_args.gradient_checkpointing else True,
35 | device_map=get_kbit_device_map() if quantization_config is not None else None,
36 | quantization_config=quantization_config,
37 | )
38 | model = AutoModelForCausalLM.from_pretrained(
39 | model_args.model_name_or_path,
40 | **model_kwargs,
41 | )
42 | return model
43 |
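
A usage sketch of these helpers, assuming the repository's src layout makes them importable as open_r1.utils.model_utils; the model name, dtype, and output_dir below are placeholder values, not from the source.

# Illustrative wiring of get_tokenizer/get_model; config values are placeholders.
from trl import ModelConfig

from open_r1.configs import SFTConfig
from open_r1.utils.model_utils import get_model, get_tokenizer

model_args = ModelConfig(model_name_or_path="Qwen/Qwen2.5-0.5B-Instruct", torch_dtype="bfloat16")
training_args = SFTConfig(output_dir="./checkpoints", gradient_checkpointing=True)

tokenizer = get_tokenizer(model_args, training_args)
model = get_model(model_args, training_args)
print(type(model).__name__, len(tokenizer))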
--------------------------------------------------------------------------------
/llm-compressor/tests/llmcompressor/transformers/sparsegpt/test_sparsegpt_lm_head.py:
--------------------------------------------------------------------------------
1 | from unittest.mock import MagicMock
2 |
3 | import pytest
4 | import torch
5 | from transformers import AutoModelForCausalLM
6 |
7 | from llmcompressor.core.state import State
8 | from llmcompressor.modifiers.pruning.sparsegpt import SparseGPTModifier
9 |
10 |
11 | @pytest.fixture
12 | def model():
13 | device = "cuda:0" if torch.cuda.is_available() else "cpu"
14 | return AutoModelForCausalLM.from_pretrained(
15 | "nm-testing/tinysmokellama-3.2", device_map=device
16 | )
17 |
18 |
19 | @pytest.fixture
20 | def dataloader():
21 | dataset = MagicMock()
22 | dataset.column_names = []
23 | dataloader = MagicMock()
24 | dataloader.dataset = dataset
25 | dataloader.__iter__.return_value = iter([])
26 | return dataloader
27 |
28 |
29 | @pytest.mark.integration
30 | @pytest.mark.parametrize("extra_targets,expected", [([], 0), (["lm_head"], 1)])
31 | def test_lm_head(extra_targets, expected, model, dataloader):
32 | kwargs = {
33 | "sparsity": 0.5,
34 | "block_size": 128,
35 | "targets": [
36 | "model.layers.0",
37 | "model.layers.1",
38 | "model.layers.2",
39 | "model.layers.3",
40 | "model.layers.4",
41 | "model.layers.5",
42 | ]
43 | + extra_targets,
44 | }
45 | device = "cuda:0" if torch.cuda.is_available() else "cpu"
46 |
47 | modifier = SparseGPTModifier(**kwargs)
48 |
49 | state = State()
50 | state.update(model=model, device=device, calib_data=dataloader)
51 | modifier.initialize(state)
52 | modifier.on_start(state, None)
53 |
54 | assert len(model.lm_head._forward_hooks) == expected
55 |
56 | modifier.finalize(state)
57 |
--------------------------------------------------------------------------------