'"
12 | raise ValueError(msg)
13 |
14 | try:
15 | scheme = json.loads(sys.argv[1])
16 | llm_kwargs = json.loads(sys.argv[2])
17 | prompts = json.loads(sys.argv[3])
18 | except json.JSONDecodeError as e:
19 | raise ValueError(f"Invalid JSON input: {e}")
20 |
21 | if "W4A16_2of4" in scheme:
22 | # required by the kernel
23 | llm_kwargs["dtype"] = torch.float16
24 |
25 | return llm_kwargs, prompts
26 |
27 |
28 | def run_vllm(llm_kwargs: dict, prompts: list[str]) -> None:
29 | """Run vLLM with given kwargs and prompts, then print outputs."""
30 | sampling_params = SamplingParams(temperature=0.80, top_p=0.95)
31 |
32 | llm = LLM(**llm_kwargs)
33 | outputs = llm.generate(prompts, sampling_params)
34 |
35 | print("================= vLLM GENERATION =================")
36 | for output in outputs:
37 | if not output or not output.outputs:
38 | print("[Warning] Empty output for prompt:", output.prompt)
39 | continue
40 |
41 | print(f"\nPROMPT:\n{output.prompt}")
42 | print(f"GENERATED TEXT:\n{output.outputs[0].text}")
43 |
44 |
45 | def main():
46 | llm_kwargs, prompts = parse_args()
47 | run_vllm(llm_kwargs, prompts)
48 |
49 |
50 | if __name__ == "__main__":
51 | main()
52 |
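
A minimal invocation sketch for the script above (the script filename and model path are placeholders, not taken from the source): the script expects three positional arguments, each a JSON-encoded string, giving the quantization scheme, the LLM constructor kwargs, and the prompt list.

# Hypothetical invocation; "run_vllm_smoke.py" and the model path are placeholders.
import json
import subprocess

scheme = "W4A16_2of4"
llm_kwargs = {"model": "path/to/quantized-model", "max_model_len": 2048}
prompts = ["The capital of France is", "Large language models are"]

subprocess.run(
    [
        "python",
        "run_vllm_smoke.py",  # placeholder path for the script shown above
        json.dumps(scheme),
        json.dumps(llm_kwargs),
        json.dumps(prompts),
    ],
    check=True,
)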
--------------------------------------------------------------------------------
/llm-compressor/examples/quantization_w8a8_fp8/llava1.5_example.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoProcessor, LlavaForConditionalGeneration
2 |
3 | from llmcompressor import oneshot
4 | from llmcompressor.modifiers.quantization import QuantizationModifier
5 | from llmcompressor.utils import dispatch_for_generation
6 |
7 | MODEL_ID = "llava-hf/llava-1.5-7b-hf"
8 |
9 | # Load model.
10 | model = LlavaForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype="auto")
11 | processor = AutoProcessor.from_pretrained(MODEL_ID)
12 |
13 | # Configure the quantization algorithm and scheme.
14 | # In this case, we:
15 | # * quantize the weights to fp8 with per-channel scales via PTQ
16 | # * quantize the activations to fp8 with dynamic per-token scales
17 | recipe = QuantizationModifier(
18 | targets="Linear",
19 | scheme="FP8_DYNAMIC",
20 | ignore=["re:.*lm_head", "re:.*multi_modal_projector.*", "re:.*vision_tower.*"],
21 | )
22 |
23 | # Apply quantization and save to disk in compressed-tensors format.
24 | oneshot(model=model, recipe=recipe)
25 |
26 | # Confirm generations of the quantized model look sane.
27 | print("========== SAMPLE GENERATION ==============")
28 | dispatch_for_generation(model)
29 | input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to(
30 | model.device
31 | )
32 | output = model.generate(input_ids, max_new_tokens=20)
33 | print(processor.decode(output[0]))
34 | print("==========================================")
35 |
36 | # Save to disk in compressed-tensors format.
37 | SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-Dynamic"
38 | model.save_pretrained(SAVE_DIR, save_compressed=True)
39 | processor.save_pretrained(SAVE_DIR)
40 |
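
Once saved, the compressed-tensors checkpoint can typically be loaded straight into vLLM. A minimal sketch, assuming vLLM is installed and the "llava-1.5-7b-hf-FP8-Dynamic" directory produced above exists:

# Sketch only: generate from the FP8-Dynamic checkpoint saved by the example above.
from vllm import LLM, SamplingParams

llm = LLM(model="llava-1.5-7b-hf-FP8-Dynamic")
outputs = llm.generate(["Hello my name is"], SamplingParams(temperature=0.8, max_tokens=20))
print(outputs[0].outputs[0].text)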
--------------------------------------------------------------------------------
/llm-compressor/src/llmcompressor/recipe/metadata.py:
--------------------------------------------------------------------------------
1 | """
2 | Metadata classes for recipe and model information tracking.
3 |
4 | This module defines Pydantic models for capturing and validating metadata about
5 | datasets, parameters, layers, and models used in compression recipes. Provides
6 | structured data containers for recipe configuration and execution tracking.
7 | """
8 |
9 | from typing import Any, Dict, List, Optional
10 |
11 | from pydantic import BaseModel, Field
12 |
13 | __all__ = [
14 | "DatasetMetaData",
15 | "ParamMetaData",
16 | "LayerMetaData",
17 | "ModelMetaData",
18 | ]
19 |
20 |
21 | class DatasetMetaData(BaseModel):
22 | name: Optional[str] = None
23 | version: Optional[str] = None
24 | hash: Optional[str] = None
25 | shape: List[int] = Field(default_factory=list)
26 | num_classes: Optional[int] = None
27 | num_train_samples: Optional[int] = None
28 | num_val_samples: Optional[int] = None
29 | num_test_samples: Optional[int] = None
30 |
31 |
32 | class ParamMetaData(BaseModel):
33 | name: Optional[str] = None
34 | shape: Optional[List[int]] = None
35 | weight_hash: Optional[str] = None
36 |
37 |
38 | class LayerMetaData(BaseModel):
39 | name: Optional[str] = None
40 | type: Optional[str] = None
41 | index: Optional[int] = None
42 | attributes: Optional[Dict[str, Any]] = None
43 | input_shapes: Optional[List[List[int]]] = None
44 | output_shapes: Optional[List[List[int]]] = None
45 | params: Optional[Dict[str, ParamMetaData]] = None
46 |
47 |
48 | class ModelMetaData(BaseModel):
49 | architecture: Optional[str] = None
50 | sub_architecture: Optional[str] = None
51 | input_shapes: Optional[List[List[int]]] = None
52 | output_shapes: Optional[List[List[int]]] = None
53 | layers: List[LayerMetaData] = Field(default_factory=list)
54 | layer_prefix: Optional[str] = None
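
For orientation, a small usage sketch of these containers; the layer name, shapes, and architecture below are illustrative values, not from the source.

# Illustrative only: describe one linear layer and attach it to a model record.
from llmcompressor.recipe.metadata import LayerMetaData, ModelMetaData, ParamMetaData

layer = LayerMetaData(
    name="model.layers.0.self_attn.q_proj",
    type="Linear",
    index=0,
    params={"weight": ParamMetaData(name="weight", shape=[4096, 4096])},
)
model_meta = ModelMetaData(architecture="LlamaForCausalLM", layers=[layer])
print(model_meta.model_dump()["architecture"])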
55 |
--------------------------------------------------------------------------------
/llm-compressor/examples/quantization_w8a8_fp8/llama3.2_vision_example.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoProcessor, MllamaForConditionalGeneration
2 |
3 | from llmcompressor import oneshot
4 | from llmcompressor.modifiers.quantization import QuantizationModifier
5 | from llmcompressor.utils import dispatch_for_generation
6 |
7 | MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct"
8 |
9 | # Load model.
10 | model = MllamaForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype="auto")
11 | processor = AutoProcessor.from_pretrained(MODEL_ID)
12 |
13 | # Configure the quantization algorithm and scheme.
14 | # In this case, we:
15 | # * quantize the weights to fp8 with per-channel scales via PTQ
16 | # * quantize the activations to fp8 with dynamic per-token scales
17 | recipe = QuantizationModifier(
18 | targets="Linear",
19 | scheme="FP8_DYNAMIC",
20 | ignore=["re:.*lm_head", "re:.*multi_modal_projector.*", "re:.*vision_model.*"],
21 | )
22 |
23 | # Apply quantization and save to disk in compressed-tensors format.
24 | oneshot(model=model, recipe=recipe)
25 |
26 | # Confirm generations of the quantized model look sane.
27 | print("========== SAMPLE GENERATION ==============")
28 | dispatch_for_generation(model)
29 | input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to(
30 | model.device
31 | )
32 | output = model.generate(input_ids, max_new_tokens=20)
33 | print(processor.decode(output[0]))
34 | print("==========================================")
35 |
36 | # Save to disk in compressed-tensors format.
37 | SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-Dynamic"
38 | model.save_pretrained(SAVE_DIR, save_compressed=True)
39 | processor.save_pretrained(SAVE_DIR)
40 |
--------------------------------------------------------------------------------
/setup_env.sh:
--------------------------------------------------------------------------------
1 | GIT_LFS_SKIP_SMUDGE=1 pip install -e ".[dev]"
2 | pip install torch==2.7.1
3 | pip install torchaudio==2.7.1
4 | pip install flash-attn==2.7.4.post1 --no-build-isolation
5 | pip install trl==0.21.0
6 | pip install vllm==0.10.1
7 | # replace vllm/vllm/lora/models.py with vllm_replacement/models.py
8 | site_pkg_path=$(python -c 'import site; print(site.getsitepackages()[0])')
9 | cp -rv replacement/vllm_replacement/models.py $site_pkg_path/vllm/lora/models.py
10 | # replace vllm/vllm/lora/worker_manager.py with vllm_replacement/worker_manager.py
11 | cp -rv replacement/vllm_replacement/worker_manager.py $site_pkg_path/vllm/lora/worker_manager.py
12 | # make empty folders to satisfy asserts in vllm lora requests
13 | mkdir -p simon_lora_path simon_stub_path
14 |
15 | pip install peft
16 |
17 | git clone --branch 0.11.0 --depth 1 https://github.com/neuralmagic/compressed-tensors.git
18 | cd compressed-tensors
19 | pip install -e . --no-deps
20 | cd ..
21 | # replace compressed-tensors/src/compressed_tensors/linear/compressed_linear.py with compressed-tensors_replacement/compressed_linear.py
22 | cp replacement/compressed-tensors_replacement/compressed_linear.py compressed-tensors/src/compressed_tensors/linear/compressed_linear.py
23 | # replace compressed-tensors/src/compressed_tensors/quantization/lifecycle/forward.py with compressed-tensors_replacement/forward.py
24 | cp replacement/compressed-tensors_replacement/forward.py compressed-tensors/src/compressed_tensors/quantization/lifecycle/forward.py
25 |
26 | pip install accelerate==1.10.1 --no-deps
27 |
28 | site_pkg_path=$(python -c 'import site; print(site.getsitepackages()[0])')
29 | cp -rv replacement/trainer.py $site_pkg_path/transformers/trainer.py
30 |
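
A quick post-install sanity check, assuming the pins above completed successfully; the expected versions are the ones installed by this script.

# Sketch: confirm the pinned packages import and report their versions.
import torch
import vllm

print("torch", torch.__version__)  # expected 2.7.1
print("vllm", vllm.__version__)    # expected 0.10.1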
--------------------------------------------------------------------------------
/llm-compressor/tests/unit/core/events/test_event.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from llmcompressor.core import Event, EventType
4 |
5 |
6 | @pytest.mark.smoke
7 | def test_event_epoch_based():
8 | event = Event(steps_per_epoch=10)
9 | assert event.epoch_based is True
10 |
11 |
12 | @pytest.mark.smoke
13 | def test_event_epoch():
14 | event = Event(steps_per_epoch=10, global_step=25)
15 | assert event.epoch == 2
16 |
17 |
18 | @pytest.mark.smoke
19 | def test_event_epoch_full():
20 | event = Event(steps_per_epoch=10, global_step=25)
21 | assert event.epoch_full == 2.5
22 |
23 |
24 | @pytest.mark.smoke
25 | def test_event_epoch_step():
26 | event = Event(steps_per_epoch=10, global_step=25)
27 | assert event.epoch_step == 5
28 |
29 |
30 | @pytest.mark.smoke
31 | def test_event_epoch_batch():
32 | event = Event(
33 | steps_per_epoch=10, global_step=25, batches_per_step=2, global_batch=50
34 | )
35 | assert event.epoch_batch == 10
36 |
37 |
38 | @pytest.mark.smoke
39 | def test_event_current_index():
40 | event = Event(steps_per_epoch=10, global_step=25)
41 | assert event.current_index == 2.5
42 |
43 |
44 | @pytest.mark.smoke
45 | def test_event_should_update():
46 | event = Event(steps_per_epoch=10, global_step=25)
47 | assert event.should_update(start=0, end=30, update=2.5) is True
48 | assert event.should_update(start=0, end=20, update=5) is False
49 | assert event.should_update(start=0, end=30, update=0) is True
50 |
51 |
52 | @pytest.mark.smoke
53 | def test_event_new_instance():
54 | event = Event(type_=EventType.INITIALIZE, global_step=25)
55 | new_event = event.new_instance(global_step=30)
56 | assert new_event.global_step == 30
57 | assert new_event.type_ == EventType.INITIALIZE
58 |
--------------------------------------------------------------------------------
/llm-compressor/docs/developer/index.md:
--------------------------------------------------------------------------------
1 | ---
2 | weight: -3
3 | ---
4 |
5 | # Developer
6 |
7 | Welcome to the Developer section of LLM Compressor! This area provides essential resources for developers who want to contribute to or extend LLM Compressor. Whether you're interested in fixing bugs, adding new features, improving documentation, or understanding the project's governance, you'll find comprehensive guides to help you get started.
8 |
9 | LLM Compressor is an open-source project that values community contributions. We maintain high standards for code quality, documentation, and community interactions to ensure that LLM Compressor remains a robust, reliable, and user-friendly tool for compressing large language models.
10 |
11 | ## Developer Resources
12 |
13 |
14 |
15 | - :material-handshake:{ .lg .middle } Code of Conduct
16 |
17 | ---
18 |
19 | Our community guidelines ensure that participation in the LLM Compressor project is a positive, inclusive, and respectful experience for everyone.
20 |
21 | [:octicons-arrow-right-24: Code of Conduct](code-of-conduct.md)
22 |
23 | - :material-source-pull:{ .lg .middle } Contributing Guide
24 |
25 | ---
26 |
27 | Learn how to effectively contribute to LLM Compressor, including reporting bugs, suggesting features, improving documentation, and submitting code.
28 |
29 | [:octicons-arrow-right-24: Contributing Guide](contributing.md)
30 |
31 | - :material-tools:{ .lg .middle } Development Guide
32 |
33 | ---
34 |
35 | Detailed instructions for setting up your development environment, implementing changes, and adhering to the project's coding standards and best practices.
36 |
37 | [:octicons-arrow-right-24: Development Guide](developing.md)
38 |
39 |
40 |
--------------------------------------------------------------------------------
/src/open_r1/utils/model_utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizer
3 |
4 | from trl import ModelConfig, get_kbit_device_map, get_quantization_config
5 |
6 | from ..configs import GRPOConfig, SFTConfig
7 |
8 |
9 | def get_tokenizer(model_args: ModelConfig, training_args: SFTConfig | GRPOConfig) -> PreTrainedTokenizer:
10 | """Get the tokenizer for the model."""
11 | tokenizer = AutoTokenizer.from_pretrained(
12 | model_args.model_name_or_path,
13 | revision=model_args.model_revision,
14 | trust_remote_code=model_args.trust_remote_code,
15 | )
16 |
17 | if training_args.chat_template is not None:
18 | tokenizer.chat_template = training_args.chat_template
19 |
20 | return tokenizer
21 |
22 |
23 | def get_model(model_args: ModelConfig, training_args: SFTConfig | GRPOConfig) -> AutoModelForCausalLM:
24 | """Get the model"""
25 | torch_dtype = (
26 | model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype)
27 | )
28 | quantization_config = get_quantization_config(model_args)
29 | model_kwargs = dict(
30 | revision=model_args.model_revision,
31 | trust_remote_code=model_args.trust_remote_code,
32 | attn_implementation=model_args.attn_implementation,
33 | torch_dtype=torch_dtype,
34 | use_cache=False if training_args.gradient_checkpointing else True,
35 | device_map=get_kbit_device_map() if quantization_config is not None else None,
36 | quantization_config=quantization_config,
37 | )
38 | model = AutoModelForCausalLM.from_pretrained(
39 | model_args.model_name_or_path,
40 | **model_kwargs,
41 | )
42 | return model
43 |
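
A usage sketch of these helpers, assuming the repository's src layout makes them importable as open_r1.utils.model_utils; the model name, dtype, and output_dir below are placeholder values, not from the source.

# Illustrative wiring of get_tokenizer/get_model; config values are placeholders.
from trl import ModelConfig

from open_r1.configs import SFTConfig
from open_r1.utils.model_utils import get_model, get_tokenizer

model_args = ModelConfig(model_name_or_path="Qwen/Qwen2.5-0.5B-Instruct", torch_dtype="bfloat16")
training_args = SFTConfig(output_dir="./checkpoints", gradient_checkpointing=True)

tokenizer = get_tokenizer(model_args, training_args)
model = get_model(model_args, training_args)
print(type(model).__name__, len(tokenizer))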
--------------------------------------------------------------------------------
/llm-compressor/tests/llmcompressor/transformers/sparsegpt/test_sparsegpt_lm_head.py:
--------------------------------------------------------------------------------
1 | from unittest.mock import MagicMock
2 |
3 | import pytest
4 | import torch
5 | from transformers import AutoModelForCausalLM
6 |
7 | from llmcompressor.core.state import State
8 | from llmcompressor.modifiers.pruning.sparsegpt import SparseGPTModifier
9 |
10 |
11 | @pytest.fixture
12 | def model():
13 | device = "cuda:0" if torch.cuda.is_available() else "cpu"
14 | return AutoModelForCausalLM.from_pretrained(
15 | "nm-testing/tinysmokellama-3.2", device_map=device
16 | )
17 |
18 |
19 | @pytest.fixture
20 | def dataloader():
21 | dataset = MagicMock()
22 | dataset.column_names = []
23 | dataloader = MagicMock()
24 | dataloader.dataset = dataset
25 | dataloader.__iter__.return_value = iter([])
26 | return dataloader
27 |
28 |
29 | @pytest.mark.integration
30 | @pytest.mark.parametrize("extra_targets,expected", [([], 0), (["lm_head"], 1)])
31 | def test_lm_head(extra_targets, expected, model, dataloader):
32 | kwargs = {
33 | "sparsity": 0.5,
34 | "block_size": 128,
35 | "targets": [
36 | "model.layers.0",
37 | "model.layers.1",
38 | "model.layers.2",
39 | "model.layers.3",
40 | "model.layers.4",
41 | "model.layers.5",
42 | ]
43 | + extra_targets,
44 | }
45 | device = "cuda:0" if torch.cuda.is_available() else "cpu"
46 |
47 | modifier = SparseGPTModifier(**kwargs)
48 |
49 | state = State()
50 | state.update(model=model, device=device, calib_data=dataloader)
51 | modifier.initialize(state)
52 | modifier.on_start(state, None)
53 |
54 | assert len(model.lm_head._forward_hooks) == expected
55 |
56 | modifier.finalize(state)
57 |
--------------------------------------------------------------------------------