├── assets ├── logo ├── aqn.png ├── lora.png ├── qerl.png ├── curve.png ├── da_gr.png ├── logo4.png ├── performance.png └── rank_speed.png ├── eval └── __init__.py ├── trl_trainer ├── __init__.py └── noise_scheduler.py ├── llm-compressor ├── tests │ ├── __init__.py │ ├── e2e │ │ ├── __init__.py │ │ └── vLLM │ │ │ ├── __init__.py │ │ │ ├── configs │ │ │ ├── fp4_nvfp4a16.yaml │ │ │ ├── fp8_dynamic_per_token_qwen.yaml │ │ │ ├── fp8_dynamic_per_token.yaml │ │ │ ├── fp8_static_per_tensor.yaml │ │ │ ├── int8_dynamic_per_token.yaml │ │ │ ├── fp8_weight_only_channel.yaml │ │ │ ├── fp8_weight_only_tensor.yaml │ │ │ ├── fp4_nvfp4.yaml │ │ │ ├── w4a16_grouped_quant.yaml │ │ │ ├── w8a16_grouped_quant.yaml │ │ │ ├── kv_cache_tinyllama.yaml │ │ │ ├── w4a16_channel_quant.yaml │ │ │ ├── w4a16_channel_quant_qwen.yaml │ │ │ ├── w8a16_channel_quant.yaml │ │ │ ├── kv_cache_gptq_tinyllama.yaml │ │ │ ├── sparse2of4_fp8_dynamic_qwen.yaml │ │ │ ├── w4a16_2of4_channel_quant.yaml │ │ │ ├── w4a16_2of4_grouped_quant.yaml │ │ │ ├── kv_cache_phi3.yaml │ │ │ ├── sparse_24.yaml │ │ │ ├── sparse2of4_fp8_dynamic.yaml │ │ │ ├── w4a16_actorder_none_qwen.yaml │ │ │ ├── w4a16_actorder_group_qwen.yaml │ │ │ ├── int8_tensor_weight_static_per_tensor_act_qwen.yaml │ │ │ ├── w4a16_actorder_weight_qwen.yaml │ │ │ ├── w4a16_grouped_quant_asym_awq.yaml │ │ │ ├── int8_channel_weight_static_per_tensor_act.yaml │ │ │ ├── w4a16_actorder_group.yaml │ │ │ ├── w4a16_actorder_none.yaml │ │ │ ├── w4a16_grouped_quant_sym_awq.yaml │ │ │ ├── w8a8_static_asym.yaml │ │ │ ├── int8_tensor_weight_static_per_tensor_act.yaml │ │ │ ├── w4a16_actorder_weight.yaml │ │ │ └── w8a8_dynamic_asym.yaml │ │ │ ├── recipes │ │ │ ├── kv_cache │ │ │ │ ├── default.yaml │ │ │ │ └── gptq.yaml │ │ │ ├── Sparse_2of4 │ │ │ │ ├── recipe_sparse_2of4.yaml │ │ │ │ └── recipe_sparse_2of4_fp8_dynamic.yaml │ │ │ ├── WNA16 │ │ │ │ ├── recipe_w4a16_awq_asym.yaml │ │ │ │ ├── recipe_w4a16_channel_quant.yaml │ │ │ │ ├── recipe_w8a16_channel_quant.yaml │ │ │ │ └── recipe_w4a16_awq_sym.yaml │ │ │ ├── FP8 │ │ │ │ ├── recipe_fp8_weight_only_channel.yaml │ │ │ │ ├── recipe_fp8_weight_only_per_tensor.yaml │ │ │ │ └── recipe_fp8_dynamic.yaml │ │ │ ├── actorder │ │ │ │ ├── recipe_w4a16_actorder_none.yaml │ │ │ │ ├── recipe_w4a16_actorder_group.yaml │ │ │ │ └── recipe_w4a16_actorder_weight.yaml │ │ │ ├── INT8 │ │ │ │ ├── recipe_int8_tensor_weight_static_per_tensor_act.yaml │ │ │ │ ├── recipe_int8_channel_weight_static_per_tensor_act.yaml │ │ │ │ ├── recipe_w8a8_static_asym.yaml │ │ │ │ ├── recipe_int8_channel_weight_dynamic_per_token.yaml │ │ │ │ └── recipe_w8a8_dynamic_asym.yaml │ │ │ └── WNA16_2of4 │ │ │ │ ├── 2of4_w4a16_recipe.yaml │ │ │ │ └── 2of4_w4a16_group-128_recipe.yaml │ │ │ ├── run_tests.sh │ │ │ └── run_vllm.py │ ├── unit │ │ ├── __init__.py │ │ └── core │ │ │ ├── __init__.py │ │ │ └── events │ │ │ ├── __init__.py │ │ │ └── test_event.py │ ├── examples │ │ └── __init__.py │ ├── lmeval │ │ ├── __init__.py │ │ └── configs │ │ │ ├── fp8_dynamic_per_token.yaml │ │ │ ├── fp8_static_per_tensor.yaml │ │ │ ├── w4a16_grouped_quant.yaml │ │ │ ├── w4a16_awq_sym.yaml │ │ │ ├── w4a16_actorder_none.yaml │ │ │ ├── w4a16_actorder_group.yaml │ │ │ ├── w4a16_actorder_weight.yaml │ │ │ ├── int8_w8a8_dynamic_per_token.yaml │ │ │ ├── vl_fp8_dynamic_per_token.yaml │ │ │ ├── w4a4_nvfp4.yaml │ │ │ ├── vl_w4a16_actorder_weight.yaml │ │ │ └── vl_int8_w8a8_dynamic_per_token.yaml │ ├── llmcompressor │ │ ├── __init__.py │ │ ├── recipe │ │ │ └── __init__.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ └── pytorch 
│ │ │ │ ├── __init__.py │ │ │ │ └── test_module.py │ │ ├── metrics │ │ │ ├── __init__.py │ │ │ └── utils │ │ │ │ └── __init__.py │ │ ├── modifiers │ │ │ ├── __init__.py │ │ │ ├── awq │ │ │ │ └── __init__.py │ │ │ ├── calibration │ │ │ │ └── __init__.py │ │ │ ├── pruning │ │ │ │ ├── __init__.py │ │ │ │ ├── wanda │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── test_base.py │ │ │ │ └── sparsegpt │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── test_base.py │ │ │ ├── smoothquant │ │ │ │ ├── __init__.py │ │ │ │ ├── test_base.py │ │ │ │ └── test_utils.py │ │ │ ├── quantization │ │ │ │ └── __init__.py │ │ │ ├── logarithmic_equalization │ │ │ │ ├── __init__.py │ │ │ │ └── test_base.py │ │ │ ├── transform │ │ │ │ └── test_serialization.py │ │ │ └── conf.py │ │ ├── transformers │ │ │ ├── __init__.py │ │ │ ├── finetune │ │ │ │ ├── __init__.py │ │ │ │ ├── data │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── conftest.py │ │ │ │ │ └── test_dataset_helpers.py │ │ │ │ ├── finetune_generic │ │ │ │ │ └── config1.yaml │ │ │ │ ├── finetune_custom │ │ │ │ │ ├── config1.yaml │ │ │ │ │ ├── config2.yaml │ │ │ │ │ └── gpu │ │ │ │ │ │ └── gpu_config.yaml │ │ │ │ ├── finetune_tokenizer │ │ │ │ │ └── config1.yaml │ │ │ │ ├── finetune_oneshot_configs │ │ │ │ │ ├── gpu │ │ │ │ │ │ └── gpu_config.yaml │ │ │ │ │ └── config.yaml │ │ │ │ ├── test_finetune_recipe.yaml │ │ │ │ ├── test_alternate_recipe.yaml │ │ │ │ ├── test_finetune_without_recipe.py │ │ │ │ ├── test_quantization.yaml │ │ │ │ └── test_safetensors.py │ │ │ ├── oneshot │ │ │ │ ├── __init__.py │ │ │ │ └── oneshot_configs │ │ │ │ │ ├── recipes │ │ │ │ │ └── recipe.yaml │ │ │ │ │ ├── tiny_smoke_conf6.yaml │ │ │ │ │ ├── tiny_smoke_conf5.yaml │ │ │ │ │ ├── tiny_smoke_conf2.yaml │ │ │ │ │ ├── tiny_smoke_conf3.yaml │ │ │ │ │ ├── tiny_smoke_conf1.yaml │ │ │ │ │ └── tiny_smoke_conf4.yaml │ │ │ ├── compression │ │ │ │ ├── __init__.py │ │ │ │ ├── configs │ │ │ │ │ ├── fp8_smoke.yaml │ │ │ │ │ ├── inputs_smoke.yaml │ │ │ │ │ ├── channelwise_smoke.yaml │ │ │ │ │ ├── weights_only_smoke.yaml │ │ │ │ │ ├── weights_only_1.1b.yaml │ │ │ │ │ ├── fp8_1.1b.yaml │ │ │ │ │ ├── group_1.1b.yaml │ │ │ │ │ ├── inputs_1.1b.yaml │ │ │ │ │ ├── channelwise_1.1b.yaml │ │ │ │ │ ├── actorder_group_1.1b.yaml │ │ │ │ │ └── actorder_weight_1.1b.yaml │ │ │ │ ├── decompression_configs │ │ │ │ │ ├── w8a8.yaml │ │ │ │ │ ├── w4a16.yaml │ │ │ │ │ ├── w8a16_dense.yaml │ │ │ │ │ └── fp8_dynamic.yaml │ │ │ │ ├── recipes │ │ │ │ │ ├── sparse_24.yaml │ │ │ │ │ ├── new_quant_fp8.yaml │ │ │ │ │ ├── smoothquant_gptq_w8a8.yaml │ │ │ │ │ ├── new_quant_channel.yaml │ │ │ │ │ ├── new_quant_weight.yaml │ │ │ │ │ ├── new_quant_group.yaml │ │ │ │ │ ├── new_quant_actorder_group.yaml │ │ │ │ │ ├── new_quant_actorder_weight.yaml │ │ │ │ │ ├── new_quant_full.yaml │ │ │ │ │ ├── sparse_24_fp8.yaml │ │ │ │ │ └── new_quant_simple.yaml │ │ │ │ ├── run_compressed_configs │ │ │ │ │ ├── w4a16.yaml │ │ │ │ │ ├── w8a16.yaml │ │ │ │ │ ├── fp8_dynamic.yaml │ │ │ │ │ └── w8a8.yaml │ │ │ │ └── test_has_gpu.py │ │ │ ├── sparsegpt │ │ │ │ ├── sparsegpt_configs │ │ │ │ │ ├── sparsity_generic │ │ │ │ │ │ └── config.yaml │ │ │ │ │ ├── sparse │ │ │ │ │ │ ├── tiny_llama_sparse.yaml │ │ │ │ │ │ └── gpu │ │ │ │ │ │ │ └── llama_7b_sparse.yaml │ │ │ │ │ ├── completion │ │ │ │ │ │ ├── tiny_llama_quant.yaml │ │ │ │ │ │ ├── gpu │ │ │ │ │ │ │ ├── llama_7b_quant.yaml │ │ │ │ │ │ │ ├── llama_7b_sparse.yml │ │ │ │ │ │ │ └── llama_7b_quant_and_sparse.yaml │ │ │ │ │ │ └── tiny_llama_quant_and_sparse.yaml │ │ │ │ │ ├── consec_runs │ │ │ │ │ │ ├── tiny_llama_consec_runs.yaml │ │ │ 
│ │ │ └── gpu │ │ │ │ │ │ │ └── llama_consec_runs.yaml │ │ │ │ │ └── mask_structure │ │ │ │ │ │ └── tiny_llama_mask_structure_preservation.yaml │ │ │ │ ├── recipes │ │ │ │ │ ├── sparse.yaml │ │ │ │ │ ├── additional_sparsity.yaml │ │ │ │ │ ├── sparse_with_mask_structure.yaml │ │ │ │ │ ├── quant.yaml │ │ │ │ │ ├── test_tiny2.yaml │ │ │ │ │ ├── quant_and_sparse.yaml │ │ │ │ │ └── additional_sparsity_with_quant.yaml │ │ │ │ ├── test_sparsegpt_infer_targets.py │ │ │ │ ├── __init__.py │ │ │ │ ├── test_oneshot_with_modifier.py │ │ │ │ ├── test_sparsegpt_owl.py │ │ │ │ └── test_sparsegpt_lm_head.py │ │ │ └── conftest.py │ │ ├── pytorch │ │ │ ├── modifiers │ │ │ │ ├── __init__.py │ │ │ │ ├── pruning │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── constant │ │ │ │ │ │ └── __init__.py │ │ │ │ │ ├── sparsegpt │ │ │ │ │ │ └── __init__.py │ │ │ │ │ └── wanda │ │ │ │ │ │ └── test_pytorch.py │ │ │ │ ├── smoothquant │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── test_pytorch.py │ │ │ │ ├── logarithmic_equalization │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── test_pytorch.py │ │ │ │ └── conftest.py │ │ │ ├── __init__.py │ │ │ └── utils │ │ │ │ └── __init__.py │ │ ├── test_sentinel.py │ │ ├── observers │ │ │ └── __init__.py │ │ ├── pipelines │ │ │ └── sequential │ │ │ │ └── test_helpers.py │ │ └── modeling │ │ │ └── test_fuse.py │ └── test_timer │ │ ├── __init__.py │ │ └── timer_utils.py ├── docs │ ├── scripts │ │ ├── __init__.py │ │ └── mathjax.js │ ├── stylesheets │ │ └── style.css │ ├── assets │ │ ├── llmcompressor-icon.png │ │ ├── llmcompressor-icon-white.png │ │ └── llmcompressor-user-flows.png │ ├── README.md │ ├── examples │ │ └── index.md │ ├── Makefile │ ├── guides │ │ ├── index.md │ │ └── compression_formats.md │ ├── getting-started │ │ └── index.md │ └── developer │ │ └── index.md ├── .coveragerc ├── src │ └── llmcompressor │ │ ├── pytorch │ │ ├── model_load │ │ │ └── __init__.py │ │ ├── utils │ │ │ ├── sparsification_info │ │ │ │ └── __init__.py │ │ │ └── __init__.py │ │ └── __init__.py │ │ ├── modifiers │ │ ├── experimental │ │ │ └── __init__.py │ │ ├── pruning │ │ │ ├── utils │ │ │ │ ├── __init__.py │ │ │ │ └── pytorch │ │ │ │ │ └── __init__.py │ │ │ ├── wanda │ │ │ │ └── __init__.py │ │ │ ├── constant │ │ │ │ └── __init__.py │ │ │ ├── magnitude │ │ │ │ └── __init__.py │ │ │ ├── sparsegpt │ │ │ │ └── __init__.py │ │ │ └── __init__.py │ │ ├── distillation │ │ │ ├── utils │ │ │ │ ├── __init__.py │ │ │ │ └── pytorch │ │ │ │ │ └── __init__.py │ │ │ ├── output │ │ │ │ └── __init__.py │ │ │ └── __init__.py │ │ ├── obcq │ │ │ ├── __init__.py │ │ │ └── sgpt_base.py │ │ ├── smoothquant │ │ │ └── __init__.py │ │ ├── quantization │ │ │ ├── gptq │ │ │ │ └── __init__.py │ │ │ ├── quantization │ │ │ │ └── __init__.py │ │ │ └── __init__.py │ │ ├── transform │ │ │ ├── quip │ │ │ │ └── __init__.py │ │ │ ├── spinquant │ │ │ │ └── __init__.py │ │ │ └── __init__.py │ │ ├── logarithmic_equalization │ │ │ └── __init__.py │ │ ├── awq │ │ │ └── __init__.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ └── constants.py │ │ └── __init__.py │ │ ├── transformers │ │ ├── compression │ │ │ └── __init__.py │ │ ├── tracing │ │ │ └── __init__.py │ │ ├── finetune │ │ │ ├── __init__.py │ │ │ ├── data │ │ │ │ ├── __init__.py │ │ │ │ ├── custom.py │ │ │ │ ├── c4.py │ │ │ │ ├── wikitext.py │ │ │ │ ├── cnn_dailymail.py │ │ │ │ └── gsm8k.py │ │ │ └── trainer.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ └── preprocessing_functions.py │ │ └── __init__.py │ │ ├── utils │ │ ├── fsdp │ │ │ └── __init__.py │ │ ├── pytorch │ │ │ ├── __init__.py │ │ │ └── utils.py 
│ │ └── __init__.py │ │ ├── pipelines │ │ ├── basic │ │ │ └── __init__.py │ │ ├── data_free │ │ │ ├── __init__.py │ │ │ └── pipeline.py │ │ ├── independent │ │ │ └── __init__.py │ │ ├── sequential │ │ │ ├── __init__.py │ │ │ └── README.md │ │ ├── layer_sequential │ │ │ └── __init__.py │ │ └── __init__.py │ │ ├── metrics │ │ ├── utils │ │ │ └── __init__.py │ │ └── __init__.py │ │ ├── entrypoints │ │ └── __init__.py │ │ ├── datasets │ │ └── __init__.py │ │ ├── modeling │ │ └── __init__.py │ │ ├── args │ │ ├── __init__.py │ │ ├── recipe_arguments.py │ │ └── training_arguments.py │ │ ├── core │ │ ├── events │ │ │ └── __init__.py │ │ ├── model_layer.py │ │ └── __init__.py │ │ ├── observers │ │ ├── __init__.py │ │ └── helpers.py │ │ ├── version.py │ │ ├── typing.py │ │ ├── recipe │ │ ├── __init__.py │ │ └── metadata.py │ │ └── __init__.py ├── MANIFEST.in ├── examples │ ├── big_models_with_sequential_onloading │ │ └── assets │ │ │ └── sequential_onloading.png │ ├── finetuning │ │ ├── example_single_gpu_config.yaml │ │ ├── configure_fsdp.md │ │ ├── example_fsdp_config.yaml │ │ └── example_alternating_recipe.yaml │ ├── trl_mixin │ │ ├── sft_trainer.py │ │ └── README.md │ ├── quantization_2of4_sparse_w4a16 │ │ ├── 2of4_w4a16_recipe.yaml │ │ └── 2of4_w4a16_group-128_recipe.yaml │ ├── quantization_w4a16_fp4 │ │ ├── llama3_example.py │ │ └── qwen3_example.py │ ├── compressed_inference │ │ └── fp8_compressed_inference.py │ └── quantization_w8a8_fp8 │ │ ├── llama3_example.py │ │ ├── fp8_block_example.py │ │ ├── qwen3_vl_moe_fp8_example.py │ │ ├── qwen3_next_example.py │ │ ├── qwen2vl_example.py │ │ ├── qwen_2_5_vl_example.py │ │ ├── llava1.5_example.py │ │ └── llama3.2_vision_example.py ├── CITATION.cff ├── .MAINTAINERS ├── .readthedocs.yaml └── pyproject.toml ├── .gitignore ├── src └── open_r1 │ ├── utils │ ├── __init__.py │ ├── competitive_programming │ │ ├── utils.py │ │ ├── __init__.py │ │ └── ioi_utils.py │ ├── wandb_logging.py │ ├── import_utils.py │ └── model_utils.py │ └── __init__.py ├── utils ├── competitive_programming │ ├── utils.py │ ├── __init__.py │ └── ioi_utils.py └── import_utils.py ├── recipes ├── accelerate_configs │ ├── ddp.yaml │ ├── zero2.yaml │ ├── zero3.yaml │ ├── zero3_offload.yaml │ └── fsdp.yaml └── dataset_filtering │ ├── filter_dapo.yaml │ └── filter_python.yaml ├── openr1_tool ├── pass_rate_filtering │ ├── launch_filtering.sh │ └── README.md └── get_tensor_parallel_size.py ├── __init__.py ├── setup.cfg └── setup_env.sh /assets/logo: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /eval/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /trl_trainer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/tests/unit/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/docs/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/docs/stylesheets/style.css: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/tests/examples/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/tests/lmeval/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/tests/unit/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/recipe/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/tests/unit/core/events/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/modifiers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | patch = subprocess 3 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/pytorch/model_load/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/metrics/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/modifiers/awq/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/utils/pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/modifiers/experimental/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/modifiers/pruning/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/modifiers/calibration/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/modifiers/pruning/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/modifiers/smoothquant/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/pytorch/modifiers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/finetune/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/oneshot/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/modifiers/distillation/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/transformers/compression/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/utils/fsdp/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/modifiers/pruning/wanda/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/modifiers/quantization/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/pytorch/modifiers/pruning/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/compression/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/finetune/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /assets/aqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/QeRL/HEAD/assets/aqn.png -------------------------------------------------------------------------------- /assets/lora.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/QeRL/HEAD/assets/lora.png -------------------------------------------------------------------------------- /assets/qerl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/QeRL/HEAD/assets/qerl.png -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/pytorch/utils/sparsification_info/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/modifiers/pruning/sparsegpt/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/pytorch/modifiers/smoothquant/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/pytorch/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | -------------------------------------------------------------------------------- /assets/curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/QeRL/HEAD/assets/curve.png -------------------------------------------------------------------------------- /assets/da_gr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/QeRL/HEAD/assets/da_gr.png -------------------------------------------------------------------------------- /assets/logo4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/QeRL/HEAD/assets/logo4.png 
-------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/modifiers/logarithmic_equalization/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/pytorch/modifiers/pruning/constant/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /assets/performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/QeRL/HEAD/assets/performance.png -------------------------------------------------------------------------------- /assets/rank_speed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/QeRL/HEAD/assets/rank_speed.png -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/pytorch/modifiers/logarithmic_equalization/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-compressor/tests/test_timer/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | from .timer import Timer 4 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/pipelines/basic/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | from .pipeline import * 3 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/utils/pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | from .module import * 4 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/modifiers/obcq/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | from .sgpt_base import * 4 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/modifiers/smoothquant/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | from .base import * 4 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/pipelines/data_free/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | from .pipeline import * 3 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/pipelines/independent/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | from .pipeline import * 3 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/pipelines/sequential/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | from .pipeline import * 3 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | compressed-tensors/ 2 | 3 | vllm/ 4 | 5 | wandb/ 6 | 7 | **/__pycache__/ 8 | 9 | ckpt 10 | -------------------------------------------------------------------------------- /llm-compressor/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | recursive-exclude src *.png *.jpg *.jpeg *.gif *.svg *.bmp *.webp 3 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/metrics/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | from .frequency_manager import * 4 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/modifiers/pruning/wanda/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | from .base import * 4 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/modifiers/quantization/gptq/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | from .base import * 4 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/modifiers/transform/quip/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | from .base import * 4 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/pipelines/layer_sequential/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | from .pipeline import * 3 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/modifiers/distillation/output/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | from .base import * 4 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/modifiers/transform/spinquant/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | from .base import * 4 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/transformers/tracing/__init__.py: -------------------------------------------------------------------------------- 1 | from .debug import trace 2 | 3 | __all__ = ["trace"] 4 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/modifiers/logarithmic_equalization/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | from .base import * 4 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/modifiers/awq/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | from .base import * 4 | from .mappings import * 5 | -------------------------------------------------------------------------------- 
/llm-compressor/src/llmcompressor/modifiers/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | from .constants import * 4 | from .helpers import * 5 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/modifiers/pruning/constant/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | from .base import ConstantPruningModifier 4 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/modifiers/pruning/magnitude/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | from .base import MagnitudePruningModifier 4 | -------------------------------------------------------------------------------- /llm-compressor/docs/assets/llmcompressor-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/QeRL/HEAD/llm-compressor/docs/assets/llmcompressor-icon.png -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/modifiers/quantization/quantization/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | from .base import * 4 | from .mixin import * 5 | -------------------------------------------------------------------------------- /llm-compressor/docs/assets/llmcompressor-icon-white.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/QeRL/HEAD/llm-compressor/docs/assets/llmcompressor-icon-white.png -------------------------------------------------------------------------------- /llm-compressor/docs/assets/llmcompressor-user-flows.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/QeRL/HEAD/llm-compressor/docs/assets/llmcompressor-user-flows.png -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/modifiers/pruning/utils/pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | from .layer_mask import * 4 | from .mask_factory import * 5 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/modifiers/transform/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | from .quip import QuIPModifier 4 | from .spinquant import SpinQuantModifier 5 | -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/configs/fp4_nvfp4a16.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | scheme: NVFP4A16 -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/configs/fp8_dynamic_per_token_qwen.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: Qwen/Qwen2.5-0.5B 4 | scheme: FP8_DYNAMIC -------------------------------------------------------------------------------- 
/llm-compressor/src/llmcompressor/modifiers/quantization/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | from .cache import * 4 | from .gptq import * 5 | from .quantization import * 6 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/modifiers/pruning/sparsegpt/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | from .base import SparseGPTModifier 4 | 5 | __all__ = ["SparseGPTModifier"] 6 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/transformers/finetune/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | from .data import TextGenerationDataset 4 | from .session_mixin import SessionManagerMixIn 5 | -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/configs/fp8_dynamic_per_token.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | scheme: FP8_DYNAMIC -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/modifiers/distillation/utils/pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | from .kd_factory import * 4 | from .kd_wrapper import * 5 | from .model_wrapper import * 6 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/modifiers/pruning/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | from .constant import * 4 | from .magnitude import * 5 | from .wanda import * 6 | from .sparsegpt import * 7 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/finetune/finetune_generic/config1.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: "nm-testing/tinysmokellama-3.2" 4 | dataset: open_platypus -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | General utility functions used throughout LLM Compressor. 
3 | """ 4 | 5 | # ruff: noqa 6 | 7 | from .dev import * 8 | from .helpers import * 9 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/transformers/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for applying sparsification algorithms to Hugging Face transformers flows 3 | """ 4 | 5 | # ruff: noqa 6 | from .helpers import * 7 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/finetune/finetune_custom/config1.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "sanity" 3 | model: "nm-testing/tinysmokellama-3.2" 4 | file_extension: json 5 | num_train_epochs: 1 -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/finetune/finetune_custom/config2.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "sanity" 3 | model: "nm-testing/tinysmokellama-3.2" 4 | file_extension: csv 5 | num_train_epochs: 1 -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/pytorch/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generic code used as utilities and helpers for PyTorch 3 | """ 4 | 5 | # ruff: noqa 6 | 7 | from .helpers import * 8 | from .sparsification import * 9 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/sparsegpt/sparsegpt_configs/sparsity_generic/config.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: "nm-testing/tinysmokellama-3.2" 4 | dataset: open_platypus -------------------------------------------------------------------------------- /llm-compressor/examples/big_models_with_sequential_onloading/assets/sequential_onloading.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/QeRL/HEAD/llm-compressor/examples/big_models_with_sequential_onloading/assets/sequential_onloading.png -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/recipes/kv_cache/default.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | QuantizationModifier: 4 | kv_cache_scheme: 5 | {num_bits: 8, type: float, symmetric: true, strategy: tensor} 6 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/finetune/finetune_custom/gpu/gpu_config.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: "neuralmagic/Llama-2-7b-ultrachat200k" 4 | file_extension: json 5 | num_train_epochs: 0.5 -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/configs/fp8_static_per_tensor.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | scheme: FP8 
5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/configs/int8_dynamic_per_token.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | scheme: W8A8 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/finetune/finetune_tokenizer/config1.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: "TinyLlama/TinyLlama-1.1B-Chat-v1.0" 4 | dataset_config_name: wikitext-2-raw-v1 5 | dataset: wikitext -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/configs/fp8_weight_only_channel.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | recipe: tests/e2e/vLLM/recipes/FP8/recipe_fp8_weight_only_channel.yaml 5 | scheme: FP8A16_channel -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/configs/fp8_weight_only_tensor.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | recipe: tests/e2e/vLLM/recipes/FP8/recipe_fp8_weight_only_per_tensor.yaml 5 | scheme: FP8A16_tensor -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/configs/fp4_nvfp4.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | scheme: NVFP4 5 | num_calibration_samples: 20 6 | dataset_id: HuggingFaceH4/ultrachat_200k 7 | dataset_split: train_sft -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4.yaml: -------------------------------------------------------------------------------- 1 | sparsity_stage: 2 | sparsity_modifiers: 3 | SparseGPTModifier: 4 | sparsity: 0.5 5 | mask_structure: "2:4" 6 | targets: ["Linear"] 7 | ignore: ["re:.*lm_head"] 8 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/compression/configs/fp8_smoke.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "regression" 3 | model_stub: "nm-testing/tinysmokellama-3.2" 4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_fp8.yaml" -------------------------------------------------------------------------------- /llm-compressor/tests/lmeval/configs/fp8_dynamic_per_token.yaml: -------------------------------------------------------------------------------- 1 | cadence: "weekly" 2 | model: meta-llama/Meta-Llama-3-8B-Instruct 3 | scheme: FP8_DYNAMIC 4 | lmeval: 5 | metrics: 6 | exact_match,flexible-extract: 0.75 7 | exact_match,strict-match: 0.75 8 | -------------------------------------------------------------------------------- 
/llm-compressor/tests/e2e/vLLM/configs/w4a16_grouped_quant.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | scheme: W4A16 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | quant_type: "GPTQ" -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/configs/w8a16_grouped_quant.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | scheme: W8A16 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | quant_type: "GPTQ" -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/compression/configs/inputs_smoke.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "regression" 3 | model_stub: "nm-testing/tinysmokellama-3.2" 4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_full.yaml" -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/test_sentinel.py: -------------------------------------------------------------------------------- 1 | from llmcompressor.sentinel import Sentinel 2 | 3 | 4 | def test_sentinel(): 5 | assert Sentinel("MISSING") == Sentinel("MISSING") 6 | assert Sentinel("MISSING", "module_one") != Sentinel("MISSING", "module_two") 7 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/compression/configs/channelwise_smoke.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "regression" 3 | model_stub: "nm-testing/tinysmokellama-3.2" 4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_channel.yaml" -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/compression/configs/weights_only_smoke.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "regression" 3 | model_stub: "nm-testing/tinysmokellama-3.2" 4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_weight.yaml" -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/compression/decompression_configs/w8a8.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "regression" 3 | compressed_model_stub: "nm-testing/tinyllama-w8a8-compressed" 4 | skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/pytorch/modifiers/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from llmcompressor.core import State 4 | from tests.llmcompressor.pytorch.helpers import LinearNet 5 | 6 | 7 | @pytest.fixture 8 | def state(): 9 | return State(model=LinearNet()) 10 | -------------------------------------------------------------------------------- 
/llm-compressor/tests/llmcompressor/transformers/compression/decompression_configs/w4a16.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | compressed_model_stub: "nm-testing/tinyllama-w4a16-compressed" 4 | skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/compression/decompression_configs/w8a16_dense.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | compressed_model_stub: "nm-testing/tinyllama-w8a16-dense" 4 | skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/oneshot/oneshot_configs/recipes/recipe.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | obcq_modifiers: 3 | SparseGPTModifier: 4 | sparsity: 0.5 5 | block_size: 128 6 | targets: [ 7 | 're:model.layers.3.mlp.gate_proj.weight' 8 | ] -------------------------------------------------------------------------------- /llm-compressor/CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 3 | authors: 4 | - name: Red Hat AI 5 | - name: vLLM Project 6 | title: "LLM Compressor" 7 | date-released: 2024-08-08 8 | url: https://github.com/vllm-project/llm-compressor 9 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/transformers/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tools for integrating LLM Compressor with transformers training flows. 
3 | """ 4 | 5 | # ruff: noqa 6 | 7 | # (import order matters for circular import avoidance) 8 | from .utils import * 9 | from .finetune import * 10 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/compression/decompression_configs/fp8_dynamic.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "regression" 3 | compressed_model_stub: "nm-testing/tinyllama-fp8-dynamic-compressed" 4 | skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/compression/recipes/sparse_24.yaml: -------------------------------------------------------------------------------- 1 | pruning_stage: 2 | obcq_modifiers: 3 | SparseGPTModifier: 4 | sparsity: 0.5 5 | mask_structure: "2:4" 6 | targets: ["Linear"] 7 | ignore: ["re:.*lm_head"] -------------------------------------------------------------------------------- /src/open_r1/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .data import get_dataset 2 | from .import_utils import is_e2b_available, is_morph_available 3 | from .model_utils import get_model, get_tokenizer 4 | 5 | 6 | __all__ = ["get_tokenizer", "is_e2b_available", "is_morph_available", "get_model", "get_dataset"] 7 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/compression/configs/weights_only_1.1b.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" 4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_weight.yaml" -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_smoke_conf6.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "smoke" 3 | tokenize: True 4 | model: "nm-testing/tinysmokellama-3.2" 5 | dataset: "gsm8k" 6 | recipe: "tests/llmcompressor/transformers/oneshot/oneshot_configs/recipes/recipe.yaml" -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/sparsegpt/recipes/sparse.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | obcq_modifiers: 3 | SparseGPTModifier: 4 | sparsity: 0.3 5 | block_size: 128 6 | dampening_frac: 0.01 7 | targets: ["model.layers.0", "model.layers.1"] 8 | mask_structure: "0:0" -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/compression/configs/fp8_1.1b.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" 4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_fp8.yaml" 5 | ppl_threshold: 20 -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/compression/configs/group_1.1b.yaml: 
-------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" 4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_group.yaml" 5 | ppl_threshold: 20 -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/compression/configs/inputs_1.1b.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" 4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_full.yaml" 5 | ppl_threshold: 20 -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-compressed 4 | uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-uncompressed -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "regression" 3 | compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A16-G128-compressed 4 | uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A16-G128-uncompressed -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_smoke_conf5.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "smoke" 3 | tokenize: True 4 | model: "nm-testing/tinysmokellama-3.2" 5 | dataset: open_platypus 6 | recipe: "tests/llmcompressor/transformers/oneshot/oneshot_configs/recipes/recipe.yaml" -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/sparsegpt/sparsegpt_configs/sparse/tiny_llama_sparse.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "sanity" 3 | model: "nm-testing/tinysmokellama-3.2" 4 | dataset: open_platypus 5 | recipe: "tests/llmcompressor/transformers/sparsegpt/recipes/sparse.yaml" 6 | sparsity: 0.3 -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_smoke_conf2.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "smoke" 3 | tokenize: False 4 | model: "nm-testing/tinysmokellama-3.2" 5 | dataset: open_platypus 6 | recipe: "tests/llmcompressor/transformers/oneshot/oneshot_configs/recipes/recipe.yaml" -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/configs/kv_cache_tinyllama.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | recipe: 
tests/e2e/vLLM/recipes/kv_cache/default.yaml 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | scheme: kv_cache_default_tinyllama -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/compression/configs/channelwise_1.1b.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" 4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_channel.yaml" 5 | ppl_threshold: 20 -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "regression" 3 | compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-Dynamic-compressed 4 | uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-Dynamic-uncompressed -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/configs/w4a16_channel_quant.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | scheme: W4A16_channel 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_channel_quant.yaml -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/configs/w4a16_channel_quant_qwen.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: Qwen/Qwen2.5-0.5B 4 | scheme: W4A16_channel 5 | dataset_id: Open-Orca/slimorca-deduped-cleaned-corrected 6 | dataset_split: train 7 | recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_channel_quant.yaml -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/configs/w8a16_channel_quant.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | scheme: W8A16_channel 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w8a16_channel_quant.yaml -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/configs/kv_cache_gptq_tinyllama.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | recipe: tests/e2e/vLLM/recipes/kv_cache/gptq.yaml 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | scheme: kv_cache_default_gptq_tinyllama -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/configs/sparse2of4_fp8_dynamic_qwen.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: Qwen/Qwen2.5-0.5B 4 | recipe: tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4_fp8_dynamic.yaml 5 | scheme: 
sparse2of4_fp8_dynamic 6 | dataset_id: garage-bAInd/Open-Platypus 7 | dataset_split: train -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/configs/w4a16_2of4_channel_quant.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | scheme: W4A16_2of4_channel 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | recipe: tests/e2e/vLLM/recipes/WNA16_2of4/2of4_w4a16_recipe.yaml -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/compression/configs/actorder_group_1.1b.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" 4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_actorder_group.yaml" 5 | ppl_threshold: 20 -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/compression/configs/actorder_weight_1.1b.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" 4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_actorder_weight.yaml" 5 | ppl_threshold: 20 -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/configs/w4a16_2of4_grouped_quant.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | scheme: W4A16_2of4 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | recipe: tests/e2e/vLLM/recipes/WNA16_2of4/2of4_w4a16_group-128_recipe.yaml -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Per-Token-compressed 4 | uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Per-Token-uncompressed -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/sparsegpt/sparsegpt_configs/completion/tiny_llama_quant.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "sanity" 3 | model: "nm-testing/tinysmokellama-3.2" 4 | dataset: open_platypus 5 | recipe: "tests/llmcompressor/transformers/sparsegpt/recipes/quant.yaml" 6 | num_samples: 32 7 | perplexity: 5000 -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/sparsegpt/sparsegpt_configs/sparse/gpu/llama_7b_sparse.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: "meta-llama/Llama-2-7b-hf" 4 | dataset: open_platypus 5 | recipe: 
"tests/llmcompressor/transformers/sparsegpt/recipes/sparse.yaml" 6 | sparsity: 0.3 7 | device: "cuda:0" -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/configs/kv_cache_phi3.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: microsoft/Phi-3-mini-4k-instruct 4 | recipe: tests/e2e/vLLM/recipes/kv_cache/default.yaml 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | scheme: kv_cache_default_phi3 8 | gpu_memory_utilization: 0.8 -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/configs/sparse_24.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | recipe: tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4.yaml 5 | scheme: sparse2of4_only 6 | dataset_id: HuggingFaceH4/ultrachat_200k 7 | dataset_split: train_sft 8 | save_compressed: True -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_awq_asym.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | AWQModifier: 4 | ignore: [lm_head] 5 | config_groups: 6 | group_0: 7 | weights: {num_bits: 4, type: int, symmetric: false, strategy: "group", group_size: 128} 8 | targets: [Linear] 9 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_smoke_conf3.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "smoke" 3 | tokenize: False 4 | model: "nm-testing/tinysmokellama-3.2" 5 | dataset: "gsm8k" 6 | dataset_config_name: "main" 7 | recipe: "tests/llmcompressor/transformers/oneshot/oneshot_configs/recipes/recipe.yaml" -------------------------------------------------------------------------------- /llm-compressor/tests/lmeval/configs/fp8_static_per_tensor.yaml: -------------------------------------------------------------------------------- 1 | cadence: "weekly" 2 | model: meta-llama/Meta-Llama-3-8B-Instruct 3 | scheme: FP8 4 | dataset_id: HuggingFaceH4/ultrachat_200k 5 | dataset_split: train_sft 6 | lmeval: 7 | metrics: 8 | exact_match,flexible-extract: 0.75 9 | exact_match,strict-match: 0.75 10 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/modifiers/distillation/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | """ 4 | Provides model distillation functionality, specifically importing output-based 5 | distillation modifiers for transferring knowledge from teacher to student 6 | models during compression. 
7 | """ 8 | 9 | from .output import * 10 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/sparsegpt/recipes/additional_sparsity.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | obcq_modifiers: 3 | SparseGPTModifier: 4 | sparsity: 0.7 5 | block_size: 128 6 | dampening_frac: 0.01 7 | mask_structure: "0:0" 8 | targets: ["re:.*model.layers.0$"] 9 | preserve_sparsity_mask: True -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/pipelines/sequential/README.md: -------------------------------------------------------------------------------- 1 | # Sequential Pipeline # 2 | The sequential pipeline is a data pipeline, primarily used for compressing models with the 3 | [GPTQModifier](/src/llmcompressor/modifiers/quantization/gptq/base.py) or the 4 | [SparseGPTModifier](/src/llmcompressor/modifiers/pruning/sparsegpt/base.py). 5 | -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/configs/sparse2of4_fp8_dynamic.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | recipe: tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4_fp8_dynamic.yaml 5 | scheme: sparse2of4_fp8_dynamic 6 | dataset_id: HuggingFaceH4/ultrachat_200k 7 | dataset_split: train_sft -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/sparsegpt/recipes/sparse_with_mask_structure.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | obcq_modifiers: 3 | SparseGPTModifier: 4 | sparsity: 0.5 5 | block_size: 128 6 | dampening_frac: 0.01 7 | mask_structure: "2:4" 8 | targets: [ 9 | "re:.*model.layers.0$", 10 | ] -------------------------------------------------------------------------------- /llm-compressor/.MAINTAINERS: -------------------------------------------------------------------------------- 1 | # list of active maintainers 2 | # uncommented maintainers will be included in code review triage 3 | 4 | markurtz 5 | dsikka 6 | rahul-tuli 7 | horheynm 8 | brian-dellabetta 9 | kylesayrs 10 | 11 | # mgoin 12 | # anmarques 13 | # eldarkurtic 14 | # chibukach 15 | # shubhra 16 | # abhinavnmagic 17 | # eiofinov 18 | -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/recipes/FP8/recipe_fp8_weight_only_channel.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | QuantizationModifier: 4 | ignore: [lm_head] 5 | config_groups: 6 | group_0: 7 | weights: {num_bits: 8, type: float, symmetric: true, strategy: channel, dynamic: false} 8 | targets: [Linear] 9 | -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/recipes/FP8/recipe_fp8_weight_only_per_tensor.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | QuantizationModifier: 4 | ignore: [lm_head] 5 | config_groups: 6 | group_0: 7 | weights: {num_bits: 8, type: float, symmetric: true, strategy: tensor, dynamic: false} 8 | targets: [Linear] 9 | 
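The two FP8 weight-only recipes above are self-contained `QuantizationModifier` configs. A minimal sketch of applying one of them through the `oneshot` entrypoint (re-exported from `llmcompressor.entrypoints`, per the `__init__.py` shown later in this tree) follows; the keyword names mirror the fields used by the e2e configs and are assumptions about the `oneshot` signature rather than a verified call.

```python
# Illustrative sketch only: keyword names (model, recipe, output_dir) mirror the
# fields used by the e2e configs above and are assumed, not a verified signature.
from llmcompressor.entrypoints import oneshot

oneshot(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    recipe="tests/e2e/vLLM/recipes/FP8/recipe_fp8_weight_only_per_tensor.yaml",
    output_dir="TinyLlama-1.1B-Chat-v1.0-FP8-weight-only",
)
```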
-------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | 6 | @pytest.fixture(autouse=True) 7 | def run_before_and_after_tests(tmp_path): 8 | os.environ["TRANSFORMERS_CACHE"] = str(tmp_path / "transformers") 9 | os.environ["HF_DATASETS_CACHE"] = str(tmp_path / "datasets") 10 | yield 11 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/sparsegpt/sparsegpt_configs/completion/gpu/llama_7b_quant.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: "meta-llama/Llama-2-7b-hf" 4 | dataset: open_platypus 5 | recipe: "tests/llmcompressor/transformers/sparsegpt/recipes/quant.yaml" 6 | device: "cuda:0" 7 | num_samples: 512 8 | perplexity: 20 9 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/sparsegpt/sparsegpt_configs/completion/tiny_llama_quant_and_sparse.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "sanity" 3 | model: "nm-testing/tinysmokellama-3.2" 4 | dataset: open_platypus 5 | recipe: "tests/llmcompressor/transformers/sparsegpt/recipes/quant_and_sparse.yaml" 6 | num_samples: 32 7 | perplexity: 5000 -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/finetune/finetune_oneshot_configs/gpu/gpu_config.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: "neuralmagic/Llama-2-7b-ultrachat200k" 4 | dataset: "ultrachat-200k" 5 | recipe: "tests/llmcompressor/transformers/finetune/test_alternate_recipe.yaml" 6 | num_train_epochs: 0.05 7 | concat_txt: False 8 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/sparsegpt/sparsegpt_configs/completion/gpu/llama_7b_sparse.yml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: "meta-llama/Llama-2-7b-hf" 4 | dataset: open_platypus 5 | recipe: "tests/llmcompressor/transformers/sparsegpt/recipes/sparse.yaml" 6 | device: "cuda:0" 7 | num_samples: 512 8 | perplexity: 20 9 | -------------------------------------------------------------------------------- /llm-compressor/tests/lmeval/configs/w4a16_grouped_quant.yaml: -------------------------------------------------------------------------------- 1 | cadence: "weekly" 2 | model: meta-llama/Meta-Llama-3-8B-Instruct 3 | scheme: W4A16 4 | dataset_id: HuggingFaceH4/ultrachat_200k 5 | dataset_split: train_sft 6 | quant_type: "GPTQ" 7 | lmeval: 8 | metrics: 9 | exact_match,flexible-extract: 0.72 10 | exact_match,strict-match: 0.72 11 | -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/configs/w4a16_actorder_none_qwen.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: Qwen/Qwen2.5-0.5B 4 | recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_none.yaml 5 | dataset_id: 
neuralmagic/LLM_compression_calibration 6 | dataset_split: train 7 | scheme: W4A16_actorder_none 8 | save_dir: Qwen2.5-0.5B-actorder-none -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/configs/w4a16_actorder_group_qwen.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: Qwen/Qwen2.5-0.5B 4 | recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_group.yaml 5 | dataset_id: neuralmagic/LLM_compression_calibration 6 | dataset_split: train 7 | scheme: W4A16_actorder_group 8 | save_dir: Qwen2.5-0.5B-actorder-group -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_channel_quant.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | GPTQModifier: 4 | ignore: [lm_head] 5 | actorder: null 6 | config_groups: 7 | group_0: 8 | weights: {num_bits: 4, type: int, symmetric: true, strategy: channel, dynamic: false} 9 | targets: [Linear] 10 | -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/recipes/WNA16/recipe_w8a16_channel_quant.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | GPTQModifier: 4 | ignore: [lm_head] 5 | actorder: null 6 | config_groups: 7 | group_0: 8 | weights: {num_bits: 8, type: int, symmetric: true, strategy: channel, dynamic: false} 9 | targets: [Linear] 10 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/finetune/finetune_oneshot_configs/config.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "sanity" 3 | model: "nm-testing/tinysmokellama-3.2" 4 | dataset: wikitext 5 | dataset_config_name: "wikitext-2-raw-v1" 6 | recipe: "tests/llmcompressor/transformers/finetune/test_alternate_recipe.yaml" 7 | num_train_epochs: 0.25 8 | concat_txt: False -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/configs/int8_tensor_weight_static_per_tensor_act_qwen.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: Qwen/Qwen2.5-0.5B 4 | recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_tensor_weight_static_per_tensor_act.yaml 5 | dataset_id: garage-bAInd/Open-Platypus 6 | dataset_split: train 7 | scheme: W8A8_tensor_weight_static_per_tensor_act 8 | -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/configs/w4a16_actorder_weight_qwen.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: Qwen/Qwen2.5-0.5B 4 | recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml 5 | dataset_id: Open-Orca/slimorca-deduped-cleaned-corrected 6 | dataset_split: train 7 | scheme: W4A16_actorder_weight 8 | save_dir: Qwen2.5-0.5B-actorder-weight -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/sparsegpt/sparsegpt_configs/completion/gpu/llama_7b_quant_and_sparse.yaml: 
-------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: "meta-llama/Llama-2-7b-hf" 4 | dataset: open_platypus 5 | recipe: "tests/llmcompressor/transformers/sparsegpt/recipes/quant_and_sparse.yaml" 6 | device: "cuda:0" 7 | num_samples: 512 8 | perplexity: 20 9 | -------------------------------------------------------------------------------- /utils/competitive_programming/utils.py: -------------------------------------------------------------------------------- 1 | from itertools import islice 2 | 3 | 4 | def batched(iterable, n): 5 | "Batch data into lists of length n. The last batch may be shorter." 6 | # batched('ABCDEFG', 3) --> ABC DEF G 7 | if n < 1: 8 | raise ValueError("n must be at least one") 9 | it = iter(iterable) 10 | while batch := list(islice(it, n)): 11 | yield batch 12 | -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/configs/w4a16_grouped_quant_asym_awq.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_awq_asym.yaml 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | scheme: W4A16_weight_asym_awq 8 | save_dir: TinyLlama-1.1B-Chat-v1.0-w4a16-asym-awq -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/configs/int8_channel_weight_static_per_tensor_act.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_static_per_tensor_act.yaml 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | scheme: W8A8_channel_weight_static_per_tensor -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/configs/w4a16_actorder_group.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_group.yaml 5 | dataset_id: openai/gsm8k 6 | dataset_config: main 7 | dataset_split: train 8 | scheme: W4A16_actorder_group 9 | save_dir: TinyLlama-1.1B-Chat-v1.0-actorder-group -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/configs/w4a16_actorder_none.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_none.yaml 5 | dataset_id: openai/gsm8k 6 | dataset_config: main 7 | dataset_split: train 8 | scheme: W4A16_actorder_none 9 | save_dir: TinyLlama-1.1B-Chat-v1.0-actorder-none -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/configs/w4a16_grouped_quant_sym_awq.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_awq_sym.yaml 5 | dataset_id:
HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | scheme: W4A16_weight_sym_awq 8 | save_dir: TinyLlama-1.1B-Chat-v1.0-w4a16-sym-awq 9 | -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/configs/w8a8_static_asym.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | dataset_id: HuggingFaceH4/ultrachat_200k 5 | dataset_split: train_sft 6 | scheme: W8A8_static_asym_activations 7 | recipe: tests/e2e/vLLM/recipes/INT8/recipe_w8a8_static_asym.yaml 8 | save_dir: TinyLlama-1.1B-Chat-v1.0-W8A8-Static-Asym 9 | -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/configs/int8_tensor_weight_static_per_tensor_act.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_tensor_weight_static_per_tensor_act.yaml 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | scheme: W8A8_tensor_weight_static_per_tensor_act 8 | -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/configs/w4a16_actorder_weight.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml 5 | dataset_id: openai/gsm8k 6 | dataset_config: main 7 | dataset_split: train 8 | scheme: W4A16_actorder_weight 9 | save_dir: TinyLlama-1.1B-Chat-v1.0-actorder-weight -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/configs/w8a8_dynamic_asym.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | dataset_id: HuggingFaceH4/ultrachat_200k 5 | dataset_split: train_sft 6 | scheme: W8A8_dynamic_asym_activations 7 | recipe: tests/e2e/vLLM/recipes/INT8/recipe_w8a8_dynamic_asym.yaml 8 | save_dir: TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Asym 9 | -------------------------------------------------------------------------------- /llm-compressor/tests/lmeval/configs/w4a16_awq_sym.yaml: -------------------------------------------------------------------------------- 1 | cadence: "weekly" 2 | model: meta-llama/Meta-Llama-3-8B-Instruct 3 | scheme: W4A16 4 | recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_awq_sym.yaml 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | lmeval: 8 | metrics: 9 | exact_match,flexible-extract: 0.70 10 | exact_match,strict-match: 0.70 11 | -------------------------------------------------------------------------------- /src/open_r1/utils/competitive_programming/utils.py: -------------------------------------------------------------------------------- 1 | from itertools import islice 2 | 3 | 4 | def batched(iterable, n): 5 | "Batch data into lists of length n. The last batch may be shorter." 
6 | # batched('ABCDEFG', 3) --> ABC DEF G 7 | if n < 1: 8 | raise ValueError("n must be at least one") 9 | it = iter(iterable) 10 | while batch := list(islice(it, n)): 11 | yield batch 12 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/entrypoints/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | """ 4 | Provides entry points for model compression workflows. 5 | 6 | Includes oneshot compression, training, and pre and post-processing utilities 7 | for model optimization tasks. 8 | """ 9 | 10 | from .oneshot import Oneshot, oneshot 11 | from .train import train 12 | from .utils import post_process, pre_process 13 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | PyTorch-specific utilities and tools for model compression workflows. 3 | 4 | Provides PyTorch-specific functionality including model loading, 5 | sparsification utilities, and PyTorch tensor operations optimized for 6 | compression workflows. Includes utilities for handling PyTorch models 7 | and tensors during compression operations. 8 | """ 9 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/sparsegpt/sparsegpt_configs/consec_runs/tiny_llama_consec_runs.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "sanity" 3 | model: "nm-testing/tinysmokellama-3.2" 4 | dataset: open_platypus 5 | first_recipe: "tests/llmcompressor/transformers/sparsegpt/recipes/quant_and_sparse.yaml" 6 | second_recipe: "tests/llmcompressor/transformers/sparsegpt/recipes/additional_sparsity.yaml" -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/compression/recipes/new_quant_fp8.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | QuantizationModifier: 4 | ignore: ["lm_head"] 5 | config_groups: 6 | group_0: 7 | weights: 8 | num_bits: 8 9 | type: "float" 10 | symmetric: true 11 | strategy: channel 12 | targets: ["Linear"] -------------------------------------------------------------------------------- /llm-compressor/examples/finetuning/example_single_gpu_config.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: 'NO' 4 | enable_cpu_affinity: false 5 | gpu_ids: 0 6 | machine_rank: 0 7 | main_training_function: main 8 | num_machines: 1 9 | num_processes: 1 10 | rdzv_backend: static 11 | same_network: true 12 | tpu_env: [] 13 | tpu_use_cluster: false 14 | tpu_use_sudo: false 15 | use_cpu: false -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/modifiers/utils/constants.py: -------------------------------------------------------------------------------- 1 | """ 2 | Constants for modifier operations and compression thresholds. 3 | 4 | This module defines global constants used throughout the compression 5 | framework for determining sparsity thresholds, pruning criteria, and 6 | other modifier-specific parameters.
7 | """ 8 | 9 | __all__ = ["SPARSITY_THRESHOLD"] 10 | 11 | SPARSITY_THRESHOLD: float = 0.05 12 | -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_awq_sym.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | AWQModifier: 4 | ignore: ["lm_head"] 5 | config_groups: 6 | group_0: 7 | weights: 8 | num_bits: 4 9 | type: "int" 10 | symmetric: true 11 | strategy: "group" 12 | group_size: 128 13 | targets: ["Linear"] 14 | -------------------------------------------------------------------------------- /llm-compressor/tests/lmeval/configs/w4a16_actorder_none.yaml: -------------------------------------------------------------------------------- 1 | cadence: "weekly" 2 | model: meta-llama/Meta-Llama-3-8B-Instruct 3 | scheme: W4A16_actorder_none 4 | recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_none.yaml 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | lmeval: 8 | metrics: 9 | exact_match,flexible-extract: 0.72 10 | exact_match,strict-match: 0.72 11 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/sparsegpt/sparsegpt_configs/consec_runs/gpu/llama_consec_runs.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: "meta-llama/Llama-2-7b-hf" 4 | dataset: open_platypus 5 | first_recipe: "tests/llmcompressor/transformers/sparsegpt/recipes/quant_and_sparse.yaml" 6 | second_recipe: "tests/llmcompressor/transformers/sparsegpt/recipes/additional_sparsity.yaml" 7 | device: "cuda:0" -------------------------------------------------------------------------------- /llm-compressor/tests/lmeval/configs/w4a16_actorder_group.yaml: -------------------------------------------------------------------------------- 1 | cadence: "weekly" 2 | model: meta-llama/Meta-Llama-3-8B-Instruct 3 | scheme: W4A16_actorder_group 4 | recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_group.yaml 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | lmeval: 8 | metrics: 9 | exact_match,flexible-extract: 0.72 10 | exact_match,strict-match: 0.72 11 | -------------------------------------------------------------------------------- /llm-compressor/tests/lmeval/configs/w4a16_actorder_weight.yaml: -------------------------------------------------------------------------------- 1 | cadence: "weekly" 2 | model: meta-llama/Meta-Llama-3-8B-Instruct 3 | scheme: W4A16_actorder_weight 4 | recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | lmeval: 8 | metrics: 9 | exact_match,flexible-extract: 0.72 10 | exact_match,strict-match: 0.72 11 | -------------------------------------------------------------------------------- /recipes/accelerate_configs/ddp.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: MULTI_GPU 4 | downcast_bf16: 'no' 5 | gpu_ids: all 6 | machine_rank: 0 7 | main_training_function: main 8 | mixed_precision: bf16 9 | num_machines: 1 10 | num_processes: 8 11 | rdzv_backend: static 12 | same_network: true 13 | tpu_env: [] 14 | tpu_use_cluster: false 15 | tpu_use_sudo: false 16 | use_cpu: false 17 | 
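The `SPARSITY_THRESHOLD` constant above (`src/llmcompressor/modifiers/utils/constants.py`) is described as a global cutoff for deciding when something counts as sparse. The sketch below makes that role concrete with a hypothetical `is_sparse` helper; the helper and its logic are illustrative only, not the library's actual use of the constant.

```python
# Hypothetical illustration: treat a tensor as "sparse" once its fraction of
# zero entries exceeds SPARSITY_THRESHOLD. Not llm-compressor's real logic.
import torch

SPARSITY_THRESHOLD: float = 0.05  # value from modifiers/utils/constants.py


def is_sparse(weight: torch.Tensor, threshold: float = SPARSITY_THRESHOLD) -> bool:
    """Return True when the fraction of zero entries exceeds the threshold."""
    zero_fraction = (weight == 0).float().mean().item()
    return zero_fraction > threshold


# A 2:4-pruned tensor (50% zeros) is far above a 5% threshold.
w = torch.tensor([[0.0, 1.2, 0.0, -0.7], [0.3, 0.0, -1.1, 0.0]])
print(is_sparse(w))  # True
```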
-------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/compression/test_has_gpu.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | import torch 5 | 6 | 7 | @pytest.mark.skipif(os.getenv("GITHUB_ACTIONS") != "true", reason="Only run for GHA") 8 | def test_has_gpu(): 9 | """ 10 | This test exists purely to raise an error if 11 | a runner performs transformers tests without a GPU 12 | """ 13 | assert torch.cuda.is_available() 14 | -------------------------------------------------------------------------------- /llm-compressor/tests/lmeval/configs/int8_w8a8_dynamic_per_token.yaml: -------------------------------------------------------------------------------- 1 | cadence: "weekly" 2 | model: meta-llama/Meta-Llama-3-8B-Instruct 3 | scheme: INT8_dyn_per_token 4 | recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_dynamic_per_token.yaml 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | lmeval: 8 | metrics: 9 | exact_match,flexible-extract: 0.77 10 | exact_match,strict-match: 0.76 -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/sparsegpt/recipes/quant.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | obcq_modifiers: 3 | SmoothQuantModifier: 4 | smoothing_strength: 0.6 5 | GPTQModifier: 6 | block_size: 128 7 | dampening_frac: 0.01 8 | config_groups: 9 | group_0: 10 | weights: 11 | num_bits: 8 12 | input_activations: 13 | num_bits: 8 14 | targets: ["Linear"] -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_none.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | GPTQModifier: 4 | ignore: ["lm_head"] 5 | actorder: null 6 | config_groups: 7 | group_0: 8 | weights: 9 | num_bits: 4 10 | type: "int" 11 | symmetric: true 12 | strategy: "group" 13 | group_size: 128 14 | targets: ["Linear"] 15 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_smoke_conf1.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "smoke" 3 | tokenize: False 4 | model: "nm-testing/tinysmokellama-3.2" 5 | dataset: open_platypus 6 | recipe: | 7 | test_stage: 8 | obcq_modifiers: 9 | SparseGPTModifier: 10 | sparsity: 0.5 11 | block_size: 128 12 | targets: [ 13 | 're:model.layers.3.mlp.gate_proj.weight' 14 | ] -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_group.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | GPTQModifier: 4 | ignore: ["lm_head"] 5 | actorder: "group" 6 | config_groups: 7 | group_0: 8 | weights: 9 | num_bits: 4 10 | type: "int" 11 | symmetric: true 12 | strategy: "group" 13 | group_size: 128 14 | targets: ["Linear"] 15 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | """ 
4 | Metrics logging and monitoring framework for compression workflows. 5 | 6 | Provides comprehensive metrics collection, logging, and monitoring 7 | capabilities for model compression operations. Includes base loggers, 8 | frequency management, and specialized metrics tracking for training and 9 | inference performance during compression. 10 | """ 11 | 12 | from .logger import * 13 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/sparsegpt/recipes/test_tiny2.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | obcq_modifiers: 3 | SparseGPTModifier: 4 | sparsity: 0.5 5 | block_size: 128 6 | dampening_frac: 0.01 7 | mask_structure: "0:0" 8 | targets: [ 9 | "model.layers.0", 10 | "model.layers.1", 11 | "model.layers.2", 12 | "model.layers.3", 13 | "model.layers.4", 14 | "model.layers.5" 15 | ] -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | """ 4 | Provides dataset utilities for model calibration and processing. 5 | 6 | Includes functions to format calibration data, create dataloaders, 7 | process datasets, and split datasets for quantization workflows. 8 | """ 9 | 10 | from .utils import ( 11 | format_calibration_data, 12 | get_calibration_dataloader, 13 | get_processed_dataset, 14 | make_dataset_splits, 15 | ) 16 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | """ 4 | Model preparation and fusion utilities for compression workflows. 5 | 6 | Provides tools for preparing models for compression including 7 | layer fusion, module preparation, and model structure optimization. 8 | Handles pre-compression transformations and architectural modifications 9 | needed for efficient compression. 10 | """ 11 | 12 | from .fuse import * 13 | from .prepare import * 14 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_smoke_conf4.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "smoke" 3 | tokenize: False 4 | model: "nm-testing/tinysmokellama-3.2" 5 | dataset: "gsm8k" 6 | dataset_config_name: "main" 7 | recipe: | 8 | test_stage: 9 | obcq_modifiers: 10 | SparseGPTModifier: 11 | sparsity: 0.5 12 | block_size: 128 13 | targets: [ 14 | 're:model.layers.3.mlp.gate_proj.weight' 15 | ] -------------------------------------------------------------------------------- /llm-compressor/docs/README.md: -------------------------------------------------------------------------------- 1 | # Getting started with LLM Compressor docs 2 | 3 | ```bash 4 | cd docs 5 | ``` 6 | 7 | - Install the dependencies: 8 | 9 | ```bash 10 | make install 11 | ``` 12 | 13 | - Clean the previous build (optional but recommended): 14 | 15 | ```bash 16 | make clean 17 | ``` 18 | 19 | - Serve the docs: 20 | 21 | ```bash 22 | make serve 23 | ``` 24 | 25 | This will start a local server at http://localhost:8000. You can now open your browser and view the documentation. 
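The `llmcompressor.datasets` package above exposes calibration helpers such as `get_calibration_dataloader`, and the e2e and lmeval configs throughout this tree point them at a `dataset_id`/`dataset_split` pair (most often `HuggingFaceH4/ultrachat_200k` with `train_sft`). The sketch below shows only the generic Hugging Face `datasets` side of that step, loading and subsampling such a split; the sample count is arbitrary and the snippet is not the project's own calibration code.

```python
# Illustrative only: load the calibration split referenced by many configs in
# this tree and subsample it. Not llm-compressor's get_calibration_dataloader.
from datasets import load_dataset

NUM_CALIBRATION_SAMPLES = 512  # arbitrary; configs in this tree use similar counts

ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
print(ds)
```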
-------------------------------------------------------------------------------- /openr1_tool/pass_rate_filtering/launch_filtering.sh: -------------------------------------------------------------------------------- 1 | 2 | 3 | # a bash for loop from 0 to 17,398 in chunks of 200 4 | 5 | for i in {0..17000..200} 6 | do 7 | START=$i 8 | END=$((i + 200)) 9 | echo "Processing chunk from $START to $END" 10 | 11 | # Submit the job to SLURM 12 | sbatch slurm/compute_pass_rate.slurm recipes/dataset_filtering/filter_dapo.yaml $START $END 13 | done 14 | 15 | sbatch slurm/compute_pass_rate.slurm recipes/dataset_filtering/filter_dapo.yaml 17200 17398 16 | -------------------------------------------------------------------------------- /llm-compressor/docs/scripts/mathjax.js: -------------------------------------------------------------------------------- 1 | window.MathJax = { 2 | tex: { 3 | inlineMath: [["\\(", "\\)"]], 4 | displayMath: [["\\[", "\\]"]], 5 | processEscapes: true, 6 | processEnvironments: true 7 | }, 8 | options: { 9 | ignoreHtmlClass: ".*|", 10 | processHtmlClass: "arithmatex" 11 | } 12 | }; 13 | 14 | document$.subscribe(() => { 15 | MathJax.startup.output.clearCache() 16 | MathJax.typesetClear() 17 | MathJax.texReset() 18 | MathJax.typesetPromise() 19 | }) 20 | -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/recipes/INT8/recipe_int8_tensor_weight_static_per_tensor_act.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | SmoothQuantModifier: 4 | smoothing_strength: 0.8 5 | QuantizationModifier: 6 | ignore: [lm_head] 7 | config_groups: 8 | group_0: 9 | weights: {num_bits: 8, type: int, symmetric: true, strategy: tensor} 10 | input_activations: {num_bits: 8, type: int, symmetric: true, strategy: tensor} 11 | targets: [Linear] -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/args/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | """ 4 | Arguments package for LLM Compressor. 5 | 6 | Defines structured argument classes for datasets, models, training, and 7 | recipes, along with utilities for parsing them. 8 | """ 9 | 10 | from .dataset_arguments import DatasetArguments 11 | from .model_arguments import ModelArguments 12 | from .recipe_arguments import RecipeArguments 13 | from .training_arguments import TrainingArguments 14 | from .utils import parse_args 15 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/core/events/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | LLM Compressor Core Events Package 3 | 4 | This package provides the core components and lifecycle management for events 5 | used in the LLM Compressor framework. It includes definitions for various 6 | event types and lifecycles that are critical for managing the state and 7 | execution flow of the model compression and training processes.
8 | """ 9 | 10 | from .event import Event, EventType 11 | 12 | __all__ = ["Event", "EventType"] 13 | -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4_fp8_dynamic.yaml: -------------------------------------------------------------------------------- 1 | sparsity_stage: 2 | run_type: oneshot 3 | sparsity_modifiers: 4 | SparseGPTModifier: 5 | sparsity: 0.5 6 | mask_structure: "2:4" 7 | targets: ["Linear"] 8 | ignore: ["re:.*lm_head"] 9 | quantization_stage: 10 | run_type: oneshot 11 | quantization_modifiers: 12 | QuantizationModifier: 13 | targets: ["Linear"] 14 | ignore: ["lm_head"] 15 | scheme: "FP8_DYNAMIC" 16 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/compression/recipes/smoothquant_gptq_w8a8.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | SmoothQuantModifier: 4 | smoothing_strength: 0.8 5 | mappings: 6 | - - ['re:.*q_proj', 're:.*k_proj', 're:.*v_proj'] 7 | - re:.*input_layernorm 8 | - - ['re:.*gate_proj', 're:.*up_proj'] 9 | - re:.*post_attention_layernorm 10 | GPTQModifier: 11 | targets: ["Linear"] 12 | ignore: [lm_head] 13 | scheme: W8A8 14 | -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_static_per_tensor_act.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | SmoothQuantModifier: 4 | smoothing_strength: 0.8 5 | GPTQModifier: 6 | ignore: [lm_head] 7 | actorder: null 8 | config_groups: 9 | group_0: 10 | weights: {num_bits: 8, type: int, symmetric: true, strategy: channel} 11 | input_activations: {num_bits: 8, type: int, symmetric: true, strategy: tensor} 12 | targets: [Linear] -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/recipes/INT8/recipe_w8a8_static_asym.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | SmoothQuantModifier: 4 | smoothing_strength: 0.8 5 | GPTQModifier: 6 | ignore: [lm_head] 7 | actorder: null 8 | config_groups: 9 | group_0: 10 | weights: {num_bits: 8, type: int, symmetric: true, strategy: channel} 11 | input_activations: {num_bits: 8, symmetric: false, dynamic: false, strategy: tensor, type: int} 12 | targets: [Linear] 13 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/observers/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | """ 4 | Framework for monitoring and analyzing model behavior during compression. 5 | 6 | Provides observers for tracking tensor statistics, activation 7 | ranges, and model behavior during compression workflows. Includes 8 | min-max observers, MSE observers, and helper utilities for quantization 9 | and other compression techniques. 
10 | """ 11 | 12 | from .helpers import * 13 | from .base import * 14 | from .min_max import * 15 | from .mse import * 16 | -------------------------------------------------------------------------------- /llm-compressor/.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | # Required 5 | version: 2 6 | 7 | # Set the OS, Python version, and other tools you might need 8 | build: 9 | os: ubuntu-24.04 10 | tools: 11 | python: "3.12" 12 | 13 | # Build documentation with Mkdocs 14 | mkdocs: 15 | configuration: mkdocs.yml 16 | 17 | python: 18 | install: 19 | - method: pip 20 | path: . 21 | extra_requirements: 22 | - dev 23 | -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | GPTQModifier: 4 | ignore: ["lm_head", "re:.*vision_tower.*", "re:.*multi_modal_projector.*", "re:.*visual.*", "re:.*vision_model.*"] 5 | actorder: "weight" 6 | config_groups: 7 | group_0: 8 | weights: 9 | num_bits: 4 10 | type: "int" 11 | symmetric: true 12 | strategy: "group" 13 | group_size: 128 14 | targets: ["Linear"] 15 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/sparsegpt/recipes/quant_and_sparse.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | obcq_modifiers: 3 | GPTQModifier: 4 | ignore: [lm_head] 5 | config_groups: 6 | group_0: 7 | weights: 8 | num_bits: 8 9 | type: "int" 10 | strategy: "channel" 11 | targets: [Linear] 12 | SparseGPTModifier: 13 | sparsity: 0.5 14 | block_size: 128 15 | dampening_frac: 0.01 16 | mask_structure: "0:0" 17 | targets: ["re:.*model.layers.0$"] -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/sparsegpt/sparsegpt_configs/mask_structure/tiny_llama_mask_structure_preservation.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "sanity" 3 | model: "nm-testing/tinysmokellama-3.2" 4 | dataset: open_platypus 5 | initial_pruning_only_recipe: "tests/llmcompressor/transformers/sparsegpt/recipes/sparse_with_mask_structure.yaml" 6 | initial_sparsity: 0.5 7 | recipe_mask_structure: "2:4" 8 | subsequent_prune_and_quant_recipe: "tests/llmcompressor/transformers/sparsegpt/recipes/additional_sparsity_with_quant.yaml" 9 | final_sparsity: 0.7 -------------------------------------------------------------------------------- /src/open_r1/utils/wandb_logging.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def init_wandb_training(training_args): 5 | """ 6 | Helper function for setting up Weights & Biases logging tools. 
7 | """ 8 | if training_args.wandb_entity is not None: 9 | os.environ["WANDB_ENTITY"] = training_args.wandb_entity 10 | if training_args.wandb_project is not None: 11 | os.environ["WANDB_PROJECT"] = training_args.wandb_project 12 | if training_args.wandb_run_group is not None: 13 | os.environ["WANDB_RUN_GROUP"] = training_args.wandb_run_group 14 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/transformers/finetune/data/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | from .base import TextGenerationDataset 4 | from .c4 import C4Dataset 5 | from .cnn_dailymail import CNNDailyMailDataset 6 | from .custom import CustomDataset 7 | from .evolcodealpaca import EvolCodeAlpacaDataset 8 | from .flickr_30k import Flickr30K 9 | from .gsm8k import GSM8KDataset 10 | from .open_platypus import OpenPlatypusDataset 11 | from .peoples_speech import PeoplesSpeech 12 | from .ultrachat_200k import UltraChatDataset 13 | from .wikitext import WikiTextDataset 14 | -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/recipes/kv_cache/gptq.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | GPTQModifier: 4 | ignore: ["lm_head"] 5 | actorder: null 6 | config_groups: 7 | group_0: 8 | weights: 9 | num_bits: 4 10 | type: "int" 11 | symmetric: true 12 | strategy: "channel" 13 | actorder: False 14 | targets: ["Linear"] 15 | kv_cache_scheme: 16 | {num_bits: 8, type: float, symmetric: true, strategy: tensor} -------------------------------------------------------------------------------- /llm-compressor/docs/examples/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | weight: -4 3 | --- 4 | 5 | # LLM Compressor examples 6 | 7 | This section provides practical demonstrations showing how to use LLM Compressor to optimize large language models for faster and more efficient deployment with vLLM. These examples will help you understand the various compression techniques and functionalities available in LLM Compressor, making it easier to apply them to your own models. 8 | 9 | Each example is designed to be self-contained, with clear instructions and code snippets that you can run directly. 
10 | -------------------------------------------------------------------------------- /recipes/accelerate_configs/zero2.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: false 8 | zero_stage: 2 9 | distributed_type: DEEPSPEED 10 | downcast_bf16: 'no' 11 | machine_rank: 0 12 | main_training_function: main 13 | mixed_precision: bf16 14 | num_machines: 1 15 | num_processes: 8 16 | rdzv_backend: static 17 | same_network: true 18 | tpu_env: [] 19 | tpu_use_cluster: false 20 | tpu_use_sudo: false 21 | use_cpu: false -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/modifiers/transform/test_serialization.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from llmcompressor.modifiers.transform import QuIPModifier, SpinQuantModifier 4 | 5 | 6 | @pytest.mark.parametrize("modifier", [SpinQuantModifier, QuIPModifier]) 7 | @pytest.mark.parametrize("transform_block_size", [16, 32]) 8 | def test_reload(modifier, transform_block_size): 9 | instance = modifier( 10 | transform_type="hadamard", transform_block_size=transform_block_size 11 | ) 12 | dump = instance.model_dump() 13 | assert modifier.model_validate(dump) == instance 14 | -------------------------------------------------------------------------------- /llm-compressor/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml: -------------------------------------------------------------------------------- 1 | cadence: weekly 2 | model: Qwen/Qwen2.5-VL-7B-Instruct 3 | model_class: Qwen2_5_VLForConditionalGeneration 4 | scheme: FP8_DYNAMIC 5 | recipe: tests/e2e/vLLM/recipes/FP8/recipe_fp8_dynamic.yaml 6 | lmeval: 7 | model: "hf-multimodal" 8 | model_args: 9 | dtype: bfloat16 10 | add_bos_token: True 11 | convert_img_format: True 12 | task: mmmu_val_literature 13 | num_fewshot: 0 14 | batch_size: 8 15 | # dense model achieves accuracy of 0.9 +/ 0.0557 16 | metrics: 17 | acc,none: 0.8333 18 | acc_stderr,none: 0.0557 19 | -------------------------------------------------------------------------------- /recipes/accelerate_configs/zero3.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: true 8 | zero3_save_16bit_model: true 9 | zero_stage: 3 10 | distributed_type: DEEPSPEED 11 | downcast_bf16: 'no' 12 | machine_rank: 0 13 | main_training_function: main 14 | mixed_precision: bf16 15 | num_machines: 1 16 | num_processes: 8 17 | rdzv_backend: static 18 | same_network: true 19 | tpu_env: [] 20 | tpu_use_cluster: false 21 | tpu_use_sudo: false 22 | use_cpu: false 23 | -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_dynamic_per_token.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | SmoothQuantModifier: 4 | smoothing_strength: 0.8 5 | GPTQModifier: 6 | ignore: ["lm_head", "re:.*vision_tower.*", "re:.*multi_modal_projector.*", "re:.*visual.*", "re:.*vision_model.*"] 7 | actorder: null 
8 | config_groups: 9 | group_0: 10 | weights: {num_bits: 8, type: int, symmetric: true, strategy: channel} 11 | input_activations: {num_bits: 8, type: int, symmetric: true, strategy: token, dynamic: true} 12 | targets: [Linear] 13 | -------------------------------------------------------------------------------- /recipes/accelerate_configs/zero3_offload.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | deepspeed_multinode_launcher: standard 5 | offload_optimizer_device: cpu 6 | offload_param_device: cpu 7 | zero3_init_flag: true 8 | zero3_save_16bit_model: true 9 | zero_stage: 3 10 | distributed_type: DEEPSPEED 11 | downcast_bf16: 'no' 12 | machine_rank: 0 13 | main_training_function: main 14 | mixed_precision: bf16 15 | num_machines: 1 16 | num_processes: 8 17 | rdzv_backend: static 18 | same_network: true 19 | tpu_env: [] 20 | tpu_use_cluster: false 21 | tpu_use_sudo: false 22 | use_cpu: false 23 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/finetune/test_finetune_recipe.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | pruning_modifiers: 3 | ConstantPruningModifier: 4 | targets: [ 5 | "re:.*self_attn.q_proj", 6 | "re:.*self_attn.k_proj", 7 | "re:.*self_attn.v_proj", 8 | "re:.*self_attn.o_proj", 9 | "re:.*mlp.gate_proj", 10 | "re:.*mlp.up_proj" 11 | ] 12 | start: 0 13 | distillation_modifiers: 14 | OutputDistillationModifier: 15 | targets: ["re:model.layers.\\d+$"] 16 | comparison: "square_head" 17 | start: 0 18 | orig_scale: 1.0 19 | distill_scale: 1.0 -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/sparsegpt/test_sparsegpt_infer_targets.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from accelerate import init_empty_weights 3 | from transformers import AutoModelForCausalLM 4 | 5 | from llmcompressor.modifiers.pruning.sparsegpt import SparseGPTModifier 6 | 7 | 8 | @pytest.mark.integration 9 | def test_infer_targets(): 10 | modifier = SparseGPTModifier(sparsity=0.0) 11 | with init_empty_weights(): 12 | model = AutoModelForCausalLM.from_pretrained("nm-testing/tinysmokellama-3.2") 13 | 14 | inferred = modifier._infer_sequential_targets(model) 15 | assert inferred == ["LlamaDecoderLayer"] 16 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/modifiers/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Compression modifiers for applying various optimization techniques. 3 | 4 | Provides the core modifier system for applying compression techniques like 5 | quantization, pruning, distillation, and other optimization methods to neural 6 | networks. Includes base classes, factory patterns, and interfaces for 7 | extensible compression workflows. 
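A minimal usage sketch (the modifier type and keyword values below are illustrative and mirror the factory calls used by the unit tests in this repository; the factory registry is assumed to already be populated):

    from llmcompressor.modifiers.factory import ModifierFactory

    # Build a registered modifier by name; extra keyword arguments are
    # forwarded to the modifier's constructor.
    modifier = ModifierFactory.create(
        type_="SparseGPTModifier",
        allow_experimental=False,
        allow_registered=True,
        sparsity=0.5,
        targets="__ALL_PRUNABLE__",
    )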
8 | """ 9 | 10 | from .factory import ModifierFactory 11 | from .interface import ModifierInterface 12 | from .modifier import Modifier 13 | 14 | __all__ = [ 15 | "ModifierFactory", 16 | "ModifierInterface", 17 | "Modifier", 18 | ] 19 | -------------------------------------------------------------------------------- /llm-compressor/tests/lmeval/configs/w4a4_nvfp4.yaml: -------------------------------------------------------------------------------- 1 | cadence: "weekly" 2 | model: meta-llama/Llama-3.1-8B-Instruct 3 | scheme: NVFP4 4 | dataset_id: HuggingFaceH4/ultrachat_200k 5 | dataset_split: train_sft 6 | num_calibration_samples: 20 7 | lmeval: 8 | # NVFP4 (4-bit weights + 4-bit activations) has lower recovery than FP8/INT8 9 | # Observed: strict-match ~92.81%, flexible-extract ~89.59% 10 | recovery_threshold: 11 | exact_match,strict-match: 0.92 12 | exact_match,flexible-extract: 0.89 13 | # Absolute metrics for warnings only 14 | metrics: 15 | exact_match,flexible-extract: 0.70 16 | exact_match,strict-match: 0.65 17 | -------------------------------------------------------------------------------- /llm-compressor/docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal mkdocs makefile 2 | 3 | PYTHON := python3 4 | MKDOCS_CMD := mkdocs 5 | MKDOCS_CONF := ../mkdocs.yml 6 | 7 | .PHONY: help install serve build clean 8 | 9 | help: 10 | @echo "Available targets:" 11 | @echo " install Install dependencies globally" 12 | @echo " serve Serve docs locally" 13 | @echo " build Build static site" 14 | @echo " clean Remove build artifacts" 15 | 16 | install: 17 | pip install -e "../[dev]" 18 | 19 | serve: 20 | $(MKDOCS_CMD) serve --livereload -f $(MKDOCS_CONF) 21 | 22 | build: 23 | $(MKDOCS_CMD) build -f $(MKDOCS_CONF) 24 | 25 | clean: 26 | rm -rf site/ .cache/ -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/sparsegpt/recipes/additional_sparsity_with_quant.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | obcq_modifiers: 3 | SparseGPTModifier: 4 | sparsity: 0.7 5 | block_size: 128 6 | dampening_frac: 0.01 7 | mask_structure: "0:0" 8 | targets: [ 9 | "re:.*model.layers.0$", 10 | ] 11 | preserve_sparsity_mask: True 12 | GPTQModifier: 13 | config_groups: 14 | group_0: 15 | weights: 16 | num_bits: 8 17 | type: "int" 18 | strategy: "channel" 19 | targets: [ 20 | "re:.*model.layers.0.self_attn.q_proj", 21 | ] -------------------------------------------------------------------------------- /llm-compressor/examples/finetuning/configure_fsdp.md: -------------------------------------------------------------------------------- 1 | # Configuring FSDP for Sparse Finetuning 2 | 3 | An example FSDP configuration file, `example_fsdp_config.yaml`, is provided in this 4 | folder. It can be used out of the box by editing the `num_processes` parameter to 5 | fit the number of GPUs on your machine. 6 | 7 | You can also customize your own config file by running the following prompt 8 | ``` 9 | accelerate config 10 | ``` 11 | 12 | An FSDP config file can be passed to the LLM Compressor finetuning script like this: 13 | ``` 14 | accelerate launch --config_file example_fsdp_config.yaml --no_python llmcompressor.transformers.text_generation.finetune 15 | ``` 16 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | """ 4 | Compression pipelines for orchestrating different compression strategies. 5 | 6 | Provides various compression pipelines including basic, sequential, 7 | independent, layer-sequential, and data-free approaches. Each pipeline 8 | coordinates different compression techniques and workflows for optimal 9 | model optimization based on specific requirements and constraints. 10 | """ 11 | 12 | # populate registry 13 | from .basic import * 14 | from .data_free import * 15 | from .independent import * 16 | from .layer_sequential import * 17 | from .registry import * 18 | from .sequential import * 19 | -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/recipes/WNA16_2of4/2of4_w4a16_recipe.yaml: -------------------------------------------------------------------------------- 1 | sparsity_stage: 2 | run_type: oneshot 3 | sparsity_modifiers: 4 | SparseGPTModifier: 5 | sparsity: 0.5 6 | mask_structure: "2:4" 7 | targets: ["Linear"] 8 | ignore: ["re:.*lm_head"] 9 | quantization_stage: 10 | run_type: oneshot 11 | quantization_modifiers: 12 | GPTQModifier: 13 | ignore: ["lm_head"] 14 | actorder: null 15 | config_groups: 16 | group_0: 17 | weights: 18 | num_bits: 4 19 | type: "int" 20 | symmetric: true 21 | strategy: "channel" 22 | targets: ["Linear"] 23 | -------------------------------------------------------------------------------- /src/open_r1/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /utils/competitive_programming/__init__.py: -------------------------------------------------------------------------------- 1 | from .cf_scoring import score_submission 2 | from .code_patcher import patch_code 3 | from .ioi_scoring import SubtaskResult, score_subtask, score_subtasks 4 | from .ioi_utils import add_includes 5 | from .morph_client import get_morph_client_from_env 6 | from .piston_client import get_piston_client_from_env, get_slurm_piston_endpoints 7 | 8 | 9 | __all__ = [ 10 | "get_piston_client_from_env", 11 | "get_slurm_piston_endpoints", 12 | "get_morph_client_from_env", 13 | "patch_code", 14 | "score_submission", 15 | "score_subtask", 16 | "score_subtasks", 17 | "add_includes", 18 | "SubtaskResult", 19 | ] 20 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/transformers/finetune/trainer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Enhanced trainer class for fine-tuning with compression support. 3 | 4 | This module provides a Trainer class that extends HuggingFace's Trainer with 5 | LLM compression session management capabilities. Integrates compression 6 | workflows into the standard training loop for seamless model optimization 7 | during fine-tuning. 
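A rough usage sketch (`model`, `training_args`, and `train_dataset` are placeholders supplied by the caller, and the `recipe` keyword is an assumption about the session mixin's signature rather than a confirmed argument name):

    from llmcompressor.transformers.finetune.trainer import Trainer

    # Drop-in replacement for transformers.Trainer; compression session
    # management is layered in by SessionManagerMixIn.
    trainer = Trainer(
        model=model,                  # placeholder: a loaded transformers model
        args=training_args,           # placeholder: transformers.TrainingArguments
        train_dataset=train_dataset,  # placeholder: a tokenized dataset
        recipe="recipe.yaml",         # assumed keyword; verify against SessionManagerMixIn
    )
    trainer.train()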
8 | """ 9 | 10 | from transformers import Trainer as HFTransformersTrainer 11 | 12 | from llmcompressor.transformers.finetune.session_mixin import SessionManagerMixIn 13 | 14 | __all__ = ["Trainer"] 15 | 16 | 17 | class Trainer(SessionManagerMixIn, HFTransformersTrainer): 18 | pass 19 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/compression/recipes/new_quant_channel.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | quant_modifiers: 3 | QuantizationModifier: 4 | ignore: ["lm_head", "model.layers.0.mlp.down_proj"] 5 | config_groups: 6 | group_0: 7 | weights: 8 | num_bits: 4 9 | type: "int" 10 | symmetric: False 11 | strategy: "channel" 12 | input_activations: null 13 | output_activations: null 14 | targets: ["Linear"] 15 | GPTQModifier: 16 | block_size: 128 -------------------------------------------------------------------------------- /src/open_r1/utils/competitive_programming/__init__.py: -------------------------------------------------------------------------------- 1 | from .cf_scoring import score_submission 2 | from .code_patcher import patch_code 3 | from .ioi_scoring import SubtaskResult, score_subtask, score_subtasks 4 | from .ioi_utils import add_includes 5 | from .morph_client import get_morph_client_from_env 6 | from .piston_client import get_piston_client_from_env, get_slurm_piston_endpoints 7 | 8 | 9 | __all__ = [ 10 | "get_piston_client_from_env", 11 | "get_slurm_piston_endpoints", 12 | "get_morph_client_from_env", 13 | "patch_code", 14 | "score_submission", 15 | "score_subtask", 16 | "score_subtasks", 17 | "add_includes", 18 | "SubtaskResult", 19 | ] 20 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/version.py: -------------------------------------------------------------------------------- 1 | # file generated by setuptools-scm 2 | # don't change, don't track in version control 3 | 4 | __all__ = ["__version__", "__version_tuple__", "version", "version_tuple"] 5 | 6 | TYPE_CHECKING = False 7 | if TYPE_CHECKING: 8 | from typing import Tuple 9 | from typing import Union 10 | 11 | VERSION_TUPLE = Tuple[Union[int, str], ...] 
12 | else: 13 | VERSION_TUPLE = object 14 | 15 | version: str 16 | __version__: str 17 | __version_tuple__: VERSION_TUPLE 18 | version_tuple: VERSION_TUPLE 19 | 20 | __version__ = version = '0.8.1.dev0+g33ef5f49.d20251006' 21 | __version_tuple__ = version_tuple = (0, 8, 1, 'dev0', 'g33ef5f49.d20251006') 22 | -------------------------------------------------------------------------------- /llm-compressor/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml: -------------------------------------------------------------------------------- 1 | cadence: "weekly" 2 | model: Qwen/Qwen2.5-VL-7B-Instruct 3 | model_class: Qwen2_5_VLForConditionalGeneration 4 | scheme: W4A16_actorder_weight 5 | recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml 6 | dataset_id: lmms-lab/flickr30k 7 | dataset_split: "test[:512]" 8 | lmeval: 9 | model: "hf-multimodal" 10 | model_args: 11 | dtype: bfloat16 12 | add_bos_token: True 13 | convert_img_format: True 14 | task: mmmu_val_literature 15 | num_fewshot: 0 16 | batch_size: 8 17 | # dense model achieves accuracy of 0.9 +/ 0.0557 18 | metrics: 19 | acc,none: 0.8333 20 | acc_stderr,none: 0.0557 -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/recipes/FP8/recipe_fp8_dynamic.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | QuantizationModifier: 4 | ignore: ["lm_head", "re:vision_tower.*", "re:.*multi_modal_projector.*", "re:.*visual.*", "re:.*vision_model.*"] 5 | config_groups: 6 | group_0: 7 | weights: 8 | num_bits: 8 9 | type: "float" 10 | symmetric: true 11 | strategy: "channel" 12 | dynamic: false 13 | input_activations: 14 | num_bits: 8 15 | type: "float" 16 | symmetric: true 17 | strategy: "token" 18 | dynamic: true 19 | targets: ["Linear"] 20 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/compression/recipes/new_quant_weight.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | quant_modifiers: 3 | QuantizationModifier: 4 | ignore: ["lm_head", "model.layers.0.mlp.down_proj"] 5 | config_groups: 6 | group_0: 7 | weights: 8 | num_bits: 8 9 | type: "int" 10 | symmetric: true 11 | strategy: "tensor" 12 | input_activations: null 13 | output_activations: null 14 | targets: ["Linear", "Embedding"] 15 | GPTQModifier: 16 | block_size: 128 17 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/typing.py: -------------------------------------------------------------------------------- 1 | """ 2 | Defines type aliases for the llm-compressor library. 3 | """ 4 | 5 | from typing import Union 6 | 7 | from datasets import Dataset, DatasetDict, IterableDataset 8 | from transformers import ( 9 | BaseImageProcessor, 10 | FeatureExtractionMixin, 11 | PreTrainedTokenizer, 12 | ProcessorMixin, 13 | ) 14 | 15 | # Tokenizer or Processor. 
Processors do not inherit from a unified base class 16 | Processor = Union[ 17 | PreTrainedTokenizer, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin 18 | ] 19 | 20 | # Supported dataset types, IterableDataset is a streamed dataset 21 | DatasetType = Union[Dataset, DatasetDict, IterableDataset] 22 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/finetune/test_alternate_recipe.yaml: -------------------------------------------------------------------------------- 1 | test_oneshot_stage: 2 | obcq_modifiers: 3 | SparseGPTModifier: 4 | sparsity: 0.7 5 | block_size: 128 6 | dampening_frac: 0.01 7 | mask_structure: "0:0" 8 | targets: ["Linear"] 9 | ignore: ["re:.*lm_head"] 10 | test_train_stage: 11 | pruning_modifiers: 12 | ConstantPruningModifier: 13 | targets: [ 14 | "re:.*self_attn.q_proj", 15 | "re:.*self_attn.k_proj", 16 | "re:.*self_attn.v_proj", 17 | "re:.*self_attn.o_proj", 18 | "re:.*mlp.down_proj", 19 | "re:.*mlp.gate_proj", 20 | "re:.*mlp.up_proj" 21 | ] 22 | start: 0 -------------------------------------------------------------------------------- /llm-compressor/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml: -------------------------------------------------------------------------------- 1 | cadence: "weekly" 2 | model: Qwen/Qwen2.5-VL-7B-Instruct 3 | model_class: Qwen2_5_VLForConditionalGeneration 4 | scheme: INT8_dyn_per_token 5 | recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_dynamic_per_token.yaml 6 | dataset_id: lmms-lab/flickr30k 7 | dataset_split: "test[:512]" 8 | lmeval: 9 | model: "hf-multimodal" 10 | model_args: 11 | dtype: bfloat16 12 | add_bos_token: True 13 | convert_img_format: True 14 | task: mmmu_val_literature 15 | num_fewshot: 0 16 | batch_size: 8 17 | # dense model achieves accuracy of 0.9 +/ 0.0557 18 | metrics: 19 | acc,none: 0.833 20 | acc_stderr,none: 0.0557 -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/recipes/WNA16_2of4/2of4_w4a16_group-128_recipe.yaml: -------------------------------------------------------------------------------- 1 | sparsity_stage: 2 | run_type: oneshot 3 | sparsity_modifiers: 4 | SparseGPTModifier: 5 | sparsity: 0.5 6 | mask_structure: "2:4" 7 | targets: ["Linear"] 8 | ignore: ["re:.*lm_head"] 9 | quantization_stage: 10 | run_type: oneshot 11 | quantization_modifiers: 12 | GPTQModifier: 13 | ignore: ["lm_head"] 14 | actorder: null 15 | config_groups: 16 | group_0: 17 | weights: 18 | num_bits: 4 19 | type: "int" 20 | symmetric: true 21 | strategy: "group" 22 | group_size: 128 23 | targets: ["Linear"] 24 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/observers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, 10 | # software distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/recipes/INT8/recipe_w8a8_dynamic_asym.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | SmoothQuantModifier: 4 | smoothing_strength: 0.8 5 | mappings: 6 | - - ['re:.*q_proj', 're:.*k_proj', 're:.*v_proj'] 7 | - re:.*input_layernorm 8 | - - ['re:.*gate_proj', 're:.*up_proj'] 9 | - re:.*post_attention_layernorm 10 | GPTQModifier: 11 | ignore: [lm_head] 12 | actorder: null 13 | config_groups: 14 | group_0: 15 | weights: {num_bits: 8, type: int, symmetric: true, strategy: channel} 16 | input_activations: {num_bits: 8, symmetric: false, dynamic: true, strategy: token, type: int} 17 | targets: [Linear] 18 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/compression/recipes/new_quant_group.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | quant_modifiers: 3 | QuantizationModifier: 4 | ignore: ["lm_head", "model.layers.0.mlp.down_proj"] 5 | config_groups: 6 | group_0: 7 | weights: 8 | num_bits: 4 9 | type: "int" 10 | symmetric: False 11 | strategy: "group" 12 | group_size: 128 13 | input_activations: null 14 | output_activations: null 15 | targets: ["Linear"] 16 | GPTQModifier: 17 | block_size: 128 -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/sparsegpt/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, 10 | # software distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/pytorch/modifiers/pruning/sparsegpt/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, 10 | # software distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/modifiers/obcq/sgpt_base.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | from llmcompressor.modifiers.pruning.sparsegpt import ( 4 | SparseGPTModifier as PruningSparseGPTModifier, 5 | ) 6 | 7 | __all__ = ["SparseGPTModifier"] 8 | 9 | # Legacy shim for backwards-compat imports 10 | 11 | 12 | class SparseGPTModifier(PruningSparseGPTModifier): 13 | def __init__(self, **kwargs): 14 | warnings.warn( 15 | "SparseGPTModifier has moved. In future, please initialize it from " 16 | "`llmcompressor.modifiers.pruning.sparsegpt.SparseGPTModifier`.", 17 | DeprecationWarning, 18 | stacklevel=2, # Adjust stacklevel to point to the user's code 19 | ) 20 | super().__init__(**kwargs) 21 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/modifiers/pruning/sparsegpt/test_base.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from llmcompressor.modifiers.factory import ModifierFactory 4 | from llmcompressor.modifiers.pruning.sparsegpt import SparseGPTModifier 5 | 6 | 7 | @pytest.mark.unit 8 | @pytest.mark.usefixtures("setup_modifier_factory") 9 | def test_sparse_gpt_is_registered(): 10 | sparsity = 0.5 11 | targets = "__ALL_PRUNABLE__" 12 | type_ = ModifierFactory.create( 13 | type_="SparseGPTModifier", 14 | allow_experimental=False, 15 | allow_registered=True, 16 | sparsity=sparsity, 17 | targets=targets, 18 | ) 19 | 20 | assert isinstance( 21 | type_, SparseGPTModifier 22 | ), "PyTorch SparseGPTModifier not registered" 23 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/modifiers/pruning/wanda/test_base.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from llmcompressor.modifiers.factory import ModifierFactory 4 | from llmcompressor.modifiers.pruning.wanda.base import WandaPruningModifier 5 | 6 | 7 | @pytest.mark.unit 8 | @pytest.mark.usefixtures("setup_modifier_factory") 9 | def test_wanda_is_registered(): 10 | sparsity = 0.5 11 | targets = "__ALL_PRUNABLE__" 12 | type_ = ModifierFactory.create( 13 | type_="WandaPruningModifier", 14 | allow_experimental=False, 15 | allow_registered=True, 16 | sparsity=sparsity, 17 | targets=targets, 18 | ) 19 | 20 | assert isinstance( 21 | type_, WandaPruningModifier 22 | ), "PyTorch WandaPruningModifier not registered" 23 | -------------------------------------------------------------------------------- /llm-compressor/examples/finetuning/example_fsdp_config.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: FSDP 4 | downcast_bf16: 'no' 5 | fsdp_config: 6 | fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP 7 | fsdp_backward_prefetch_policy: BACKWARD_PRE 8 | fsdp_cpu_ram_efficient_loading: false 9 | fsdp_forward_prefetch: false 10 | fsdp_offload_params: false 11 | fsdp_sharding_strategy: 1 12 | fsdp_state_dict_type: SHARDED_STATE_DICT 13 | fsdp_sync_module_states: true 14 | fsdp_use_orig_params: false 15 | machine_rank: 0 16 | main_training_function: main 17 | num_machines: 1 18 | num_processes: 4 19 | rdzv_backend: static 20 | same_network: true 21 | tpu_env: [] 22 | tpu_use_cluster: false 23 | tpu_use_sudo: false 24 | 
use_cpu: false 25 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/compression/recipes/new_quant_actorder_group.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | quant_modifiers: 3 | QuantizationModifier: 4 | ignore: ["lm_head", "model.layers.0.mlp.down_proj"] 5 | config_groups: 6 | group_0: 7 | weights: 8 | num_bits: 4 9 | type: "int" 10 | symmetric: False 11 | strategy: "group" 12 | group_size: 128 13 | actorder: "group" 14 | input_activations: null 15 | output_activations: null 16 | targets: ["Linear"] 17 | GPTQModifier: 18 | block_size: 128 -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/compression/recipes/new_quant_actorder_weight.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | quant_modifiers: 3 | QuantizationModifier: 4 | ignore: ["lm_head", "model.layers.0.mlp.down_proj"] 5 | config_groups: 6 | group_0: 7 | weights: 8 | num_bits: 4 9 | type: "int" 10 | symmetric: False 11 | strategy: "group" 12 | group_size: 128 13 | actorder: "weight" 14 | input_activations: null 15 | output_activations: null 16 | targets: ["Linear"] 17 | GPTQModifier: 18 | block_size: 128 -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/pytorch/modifiers/smoothquant/test_pytorch.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from torch.nn import Linear 3 | 4 | from llmcompressor.modifiers.smoothquant import SmoothQuantModifier 5 | 6 | 7 | @pytest.mark.unit 8 | def test_smooth_quant_mapping(state): 9 | mappings = [(["seq.fc1"], "seq.fc2")] 10 | modifier = SmoothQuantModifier(mappings=mappings) 11 | 12 | modifier.ignore = [] 13 | modifier.resolved_mappings_ = modifier._resolve_mappings(state.model) 14 | 15 | assert len(modifier.resolved_mappings_) == len(mappings) 16 | 17 | mapping = modifier.resolved_mappings_[0] 18 | assert mapping.smooth_name == mappings[0][1] 19 | assert isinstance(mapping.smooth_layer, Linear) 20 | assert isinstance(mapping.balance_layers[0], Linear) 21 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/recipe/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Recipe system for defining and managing compression workflows. 3 | 4 | Provides the recipe framework for specifying compression 5 | configurations, including metadata tracking, recipe parsing, and 6 | workflow orchestration. Supports stage-based execution and flexible 7 | parameter management for complex compression pipelines. 
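A minimal loading sketch (this assumes the `create_instance` helper on `Recipe`; the path points at one of the recipe YAML files elsewhere in this repository):

    from llmcompressor.recipe import Recipe

    # Parse a YAML recipe file (or a raw YAML string) into a Recipe object
    # whose stages and modifiers can then be applied by a compression session.
    recipe = Recipe.create_instance(
        "tests/e2e/vLLM/recipes/FP8/recipe_fp8_dynamic.yaml"
    )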
8 | """ 9 | 10 | from .metadata import DatasetMetaData, LayerMetaData, ModelMetaData, ParamMetaData 11 | from .recipe import Recipe, RecipeArgsInput, RecipeInput, RecipeStageInput 12 | 13 | __all__ = [ 14 | "DatasetMetaData", 15 | "ParamMetaData", 16 | "LayerMetaData", 17 | "ModelMetaData", 18 | "Recipe", 19 | "RecipeInput", 20 | "RecipeStageInput", 21 | "RecipeArgsInput", 22 | ] 23 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/pytorch/modifiers/pruning/wanda/test_pytorch.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from llmcompressor.modifiers.factory import ModifierFactory 4 | from llmcompressor.modifiers.pruning.wanda import WandaPruningModifier 5 | 6 | 7 | @pytest.mark.unit 8 | @pytest.mark.usefixtures("setup_modifier_factory") 9 | def test_wanda_pytorch_is_registered(): 10 | sparsity = 0.5 11 | targets = "__ALL_PRUNABLE__" 12 | 13 | type_ = ModifierFactory.create( 14 | type_="WandaPruningModifier", 15 | allow_experimental=False, 16 | allow_registered=True, 17 | sparsity=sparsity, 18 | targets=targets, 19 | ) 20 | 21 | assert isinstance( 22 | type_, WandaPruningModifier 23 | ), "PyTorch WandaPruningModifier not registered" 24 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/compression/recipes/new_quant_full.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | quant_modifiers: 3 | GPTQModifier: 4 | block_size: 128 5 | ignore: ["lm_head", "model.layers.0.mlp.down_proj"] 6 | config_groups: 7 | group_0: 8 | weights: 9 | num_bits: 8 10 | type: "int" 11 | symmetric: false 12 | strategy: "channel" 13 | input_activations: 14 | num_bits: 8 15 | type: "int" 16 | symmetric: false 17 | strategy: "tensor" 18 | output_activations: null 19 | targets: ["Linear"] -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/finetune/data/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from transformers import AutoTokenizer 3 | 4 | from llmcompressor.args import ModelArguments 5 | 6 | 7 | @pytest.fixture 8 | def tiny_llama_path(): 9 | return "nm-testing/tinysmokellama-3.2" 10 | 11 | 12 | @pytest.fixture 13 | def tiny_llama_model_args(tiny_llama_path): 14 | return ModelArguments(model=tiny_llama_path) 15 | 16 | 17 | @pytest.fixture 18 | def tiny_llama_tokenizer(tiny_llama_model_args): 19 | tokenizer = AutoTokenizer.from_pretrained( 20 | tiny_llama_model_args.model, 21 | cache_dir=tiny_llama_model_args.cache_dir, 22 | use_fast=True, 23 | revision=tiny_llama_model_args.model_revision, 24 | use_auth_token=True if tiny_llama_model_args.use_auth_token else None, 25 | ) 26 | return tokenizer 27 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/pytorch/modifiers/logarithmic_equalization/test_pytorch.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from torch.nn import Linear 3 | 4 | from llmcompressor.modifiers.logarithmic_equalization import ( 5 | LogarithmicEqualizationModifier, 6 | ) 7 | 8 | 9 | @pytest.mark.unit 10 | def test_log_equalization_mapping(state): 11 | mappings = [(["seq.fc2"], "seq.block1.fc1")] 12 | modifier = LogarithmicEqualizationModifier(mappings=mappings)
13 | 14 | modifier.ignore = [] 15 | modifier.resolved_mappings_ = modifier._resolve_mappings(state.model) 16 | 17 | assert len(modifier.resolved_mappings_) == len(mappings) 18 | 19 | mapping = modifier.resolved_mappings_[0] 20 | assert mapping.smooth_name == mappings[0][1] 21 | assert isinstance(mapping.smooth_layer, Linear) 22 | assert isinstance(mapping.balance_layers[0], Linear) 23 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | default_section = FIRSTPARTY 3 | ensure_newline_before_comments = True 4 | force_grid_wrap = 0 5 | include_trailing_comma = True 6 | known_first_party = open_r1 7 | known_third_party = 8 | transformers 9 | datasets 10 | fugashi 11 | git 12 | h5py 13 | matplotlib 14 | nltk 15 | numpy 16 | packaging 17 | pandas 18 | psutil 19 | pytest 20 | rouge_score 21 | sacrebleu 22 | seqeval 23 | sklearn 24 | streamlit 25 | torch 26 | tqdm 27 | 28 | line_length = 119 29 | lines_after_imports = 2 30 | multi_line_output = 3 31 | use_parentheses = True 32 | 33 | [flake8] 34 | ignore = E203, E501, E741, W503, W605 35 | max-line-length = 119 36 | per-file-ignores = 37 | # imported but unused 38 | __init__.py: F401 39 | 40 | [tool:pytest] 41 | doctest_optionflags=NUMBER NORMALIZE_WHITESPACE ELLIPSIS -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/core/model_layer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Model layer utility classes for LLM compression workflows. 3 | 4 | Provides dataclass containers for managing model layers and their associated 5 | parameters during compression operations. Facilitates tracking and manipulation 6 | of specific model components and their parameters. 7 | """ 8 | 9 | from dataclasses import dataclass 10 | from typing import Any 11 | 12 | __all__ = ["ModelParameterizedLayer"] 13 | 14 | 15 | @dataclass 16 | class ModelParameterizedLayer: 17 | """ 18 | A dataclass for holding a parameter and its layer 19 | 20 | :param layer_name: the name of the layer 21 | :param layer: the layer object 22 | :param param_name: the name of the parameter 23 | :param param: the parameter object 24 | """ 25 | 26 | layer_name: str 27 | layer: Any 28 | param_name: str 29 | param: Any 30 | -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/run_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SUCCESS=0 4 | 5 | while getopts "c:t:" OPT; do 6 | case ${OPT} in 7 | c ) 8 | CONFIG="$OPTARG" 9 | ;; 10 | t ) 11 | TEST="$OPTARG" 12 | ;; 13 | \? ) 14 | exit 1 15 | ;; 16 | esac 17 | done 18 | 19 | # Parse list of configs. 20 | for MODEL_CONFIG in "$CONFIG"/* 21 | do 22 | LOCAL_SUCCESS=0 23 | 24 | echo "=== RUNNING MODEL: $MODEL_CONFIG ===" 25 | 26 | export TEST_DATA_FILE="$MODEL_CONFIG" 27 | pytest \ 28 | --capture=tee-sys \ 29 | "$TEST" || LOCAL_SUCCESS=$? 
30 | 31 | if [[ $LOCAL_SUCCESS == 0 ]]; then 32 | echo "=== PASSED MODEL: $MODEL_CONFIG ===" 33 | else 34 | echo "=== FAILED MODEL: $MODEL_CONFIG ===" 35 | fi 36 | 37 | SUCCESS=$((SUCCESS + LOCAL_SUCCESS)) 38 | 39 | done 40 | 41 | exit "$SUCCESS" 42 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | LLM Compressor is a library for compressing large language models utilizing 3 | the latest techniques and research in the field for both training aware and 4 | post-training techniques. 5 | 6 | The library is designed to be flexible and easy to use on top of 7 | PyTorch and HuggingFace Transformers, allowing for quick experimentation. 8 | """ 9 | 10 | # ruff: noqa 11 | 12 | from .logger import LoggerConfig, configure_logger, logger 13 | from .version import __version__, version 14 | 15 | __all__ = [ 16 | "__version__", 17 | "version", 18 | "configure_logger", 19 | "logger", 20 | "LoggerConfig", 21 | ] 22 | 23 | from llmcompressor.core.session_functions import ( 24 | active_session, 25 | callbacks, 26 | create_session, 27 | reset_session, 28 | ) 29 | from llmcompressor.entrypoints import Oneshot, oneshot, train 30 | -------------------------------------------------------------------------------- /recipes/accelerate_configs/fsdp.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: FSDP 4 | downcast_bf16: 'no' 5 | enable_cpu_affinity: false 6 | fsdp_config: 7 | fsdp_activation_checkpointing: false # Need fix from: https://github.com/huggingface/transformers/pull/36610 8 | fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP 9 | fsdp_backward_prefetch: BACKWARD_PRE 10 | fsdp_cpu_ram_efficient_loading: true 11 | fsdp_forward_prefetch: true 12 | fsdp_offload_params: false 13 | fsdp_sharding_strategy: FULL_SHARD 14 | fsdp_state_dict_type: FULL_STATE_DICT 15 | fsdp_sync_module_states: true 16 | fsdp_use_orig_params: true 17 | machine_rank: 0 18 | main_training_function: main 19 | mixed_precision: bf16 20 | num_machines: 1 21 | num_processes: 8 22 | rdzv_backend: static 23 | same_network: true 24 | tpu_env: [] 25 | tpu_use_cluster: false 26 | tpu_use_sudo: false 27 | use_cpu: false -------------------------------------------------------------------------------- /llm-compressor/examples/trl_mixin/sft_trainer.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional 2 | 3 | from trl import SFTConfig as TRLSFTConfig 4 | from trl import SFTTrainer as TRLSFTTrainer 5 | 6 | from llmcompressor.transformers.finetune.session_mixin import SessionManagerMixIn 7 | 8 | __all__ = ["SFTTrainer"] 9 | 10 | 11 | class SFTTrainer(SessionManagerMixIn, TRLSFTTrainer): 12 | def __init__(self, trl_sft_config_args: Optional[Dict] = None, *args, **kwargs): 13 | if trl_sft_config_args is not None: 14 | kwargs["args"] = TRLSFTConfig(**trl_sft_config_args) 15 | super().__init__(*args, **kwargs) 16 | 17 | def _prepare_dataset(self, dataset, *args, **kwargs): 18 | if "input_ids" in dataset.column_names: 19 | # dataset is already tokenized, skip preprocessing 20 | return dataset 21 | 22 | return super()._prepare_dataset(dataset, *args, **kwargs) 23 | -------------------------------------------------------------------------------- /llm-compressor/examples/finetuning/example_alternating_recipe.yaml: 
-------------------------------------------------------------------------------- 1 | initial_sparsity_stage: 2 | run_type: oneshot 3 | obcq_modifiers: 4 | SparseGPTModifier: 5 | sparsity: 0.5 6 | block_size: 128 7 | dampening_frac: 0.01 8 | mask_structure: "0:0" 9 | targets: ["Linear"] 10 | ignore: ["re:.*lm_head"] 11 | initial_training_stage: 12 | run_type: train 13 | pruning_modifiers: 14 | ConstantPruningModifier: 15 | targets: '__ALL__' 16 | start: 0 17 | next_sparsity_stage: 18 | run_type: oneshot 19 | obcq_modifiers: 20 | SparseGPTModifier: 21 | sparsity: 0.7 22 | block_size: 128 23 | dampening_frac: 0.01 24 | mask_structure: "0:0" 25 | targets: ["Linear"] 26 | ignore: ["re:.*lm_head"] 27 | next_training_stage: 28 | run_type: train 29 | pruning_modifiers: 30 | ConstantPruningModifier: 31 | targets: '__ALL__' 32 | start: 0 -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/pipelines/sequential/test_helpers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from llmcompressor.pipelines.sequential.helpers import get_sequential_ancestors 4 | 5 | 6 | class DummyModel(torch.nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | self.seq = torch.nn.Sequential(torch.nn.Linear(10, 20), torch.nn.ReLU()) 10 | self.fc = torch.nn.Linear(20, 5) 11 | 12 | def forward(self, x): 13 | x = self.seq(x) 14 | return self.fc(x) 15 | 16 | 17 | def test_get_sequential_ancestors(): 18 | model = DummyModel() 19 | 20 | assert get_sequential_ancestors(model, set()) == set() 21 | assert get_sequential_ancestors(model, {model}) == set() 22 | assert get_sequential_ancestors(model, {model.fc}) == {model} 23 | assert get_sequential_ancestors(model, {model.seq[0]}) == {model, model.seq} 24 | assert get_sequential_ancestors(model, {model.seq[1]}) == {model, model.seq} 25 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/finetune/test_finetune_without_recipe.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from llmcompressor import train 4 | from tests.testing_utils import parse_params, requires_gpu 5 | 6 | CONFIGS_DIRECTORY = "tests/llmcompressor/transformers/finetune/finetune_generic" 7 | 8 | 9 | @pytest.mark.integration 10 | @requires_gpu 11 | @pytest.mark.parametrize("config", parse_params(CONFIGS_DIRECTORY)) 12 | def test_finetune_without_recipe(config, tmp_path): 13 | model = config["model"] 14 | dataset = config["dataset"] 15 | output = tmp_path / "finetune_output" 16 | 17 | recipe_str = None 18 | 19 | concatenate_data = False 20 | max_steps = 50 21 | splits = "train" 22 | 23 | train( 24 | model=model, 25 | dataset=dataset, 26 | output_dir=output, 27 | recipe=recipe_str, 28 | max_steps=max_steps, 29 | concatenate_data=concatenate_data, 30 | splits=splits, 31 | ) 32 | -------------------------------------------------------------------------------- /llm-compressor/examples/quantization_2of4_sparse_w4a16/2of4_w4a16_recipe.yaml: -------------------------------------------------------------------------------- 1 | sparsity_stage: 2 | sparsity_modifiers: 3 | SparseGPTModifier: 4 | sparsity: 0.5 5 | mask_structure: "2:4" 6 | targets: ["Linear"] 7 | ignore: ["re:.*lm_head"] 8 | finetuning_stage: 9 | finetuning_modifiers: 10 | ConstantPruningModifier: 11 | targets: [ 12 | 're:.*q_proj.weight', 13 | 're:.*k_proj.weight', 14 | 're:.*v_proj.weight', 15 | 
're:.*o_proj.weight', 16 | 're:.*gate_proj.weight', 17 | 're:.*up_proj.weight', 18 | 're:.*down_proj.weight', 19 | ] 20 | start: 0 21 | quantization_stage: 22 | quantization_modifiers: 23 | GPTQModifier: 24 | ignore: ["lm_head"] 25 | config_groups: 26 | group_0: 27 | weights: 28 | num_bits: 4 29 | type: "int" 30 | symmetric: true 31 | strategy: "channel" 32 | targets: ["Linear"] 33 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/compression/recipes/sparse_24_fp8.yaml: -------------------------------------------------------------------------------- 1 | pruning_stage: 2 | obcq_modifiers: 3 | SparseGPTModifier: 4 | sparsity: 0.5 5 | mask_structure: "2:4" 6 | targets: ["Linear"] 7 | ignore: ["re:.*lm_head"] 8 | quant_stage: 9 | quant_modifiers: 10 | QuantizationModifier: 11 | ignore: ["lm_head"] 12 | config_groups: 13 | group_0: 14 | weights: 15 | num_bits: 8 16 | type: float 17 | strategy: channel 18 | dynamic: false 19 | symmetric: true 20 | input_activations: 21 | num_bits: 8 22 | type: float 23 | strategy: token 24 | dynamic: true 25 | symmetric: true 26 | targets: ["Linear"] -------------------------------------------------------------------------------- /llm-compressor/examples/quantization_2of4_sparse_w4a16/2of4_w4a16_group-128_recipe.yaml: -------------------------------------------------------------------------------- 1 | sparsity_stage: 2 | sparsity_modifiers: 3 | SparseGPTModifier: 4 | sparsity: 0.5 5 | mask_structure: "2:4" 6 | targets: ["Linear"] 7 | ignore: ["re:.*lm_head"] 8 | finetuning_stage: 9 | finetuning_modifiers: 10 | ConstantPruningModifier: 11 | targets: [ 12 | 're:.*q_proj.weight', 13 | 're:.*k_proj.weight', 14 | 're:.*v_proj.weight', 15 | 're:.*o_proj.weight', 16 | 're:.*gate_proj.weight', 17 | 're:.*up_proj.weight', 18 | 're:.*down_proj.weight', 19 | ] 20 | start: 0 21 | quantization_stage: 22 | quantization_modifiers: 23 | GPTQModifier: 24 | ignore: ["lm_head"] 25 | config_groups: 26 | group_0: 27 | weights: 28 | num_bits: 4 29 | type: "int" 30 | symmetric: true 31 | strategy: "group" 32 | group_size: 128 33 | targets: ["Linear"] 34 | -------------------------------------------------------------------------------- /recipes/dataset_filtering/filter_dapo.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: open-r1/R1-Distill-Qwen-Math-7B 3 | model_revision: v03.00-step-000008190 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | # We edit the DeepSeek chat template to ensure (a) the reasoning block within <think> and </think> is included in the completion and (b) the <think> tag is not part of the prefill so that the format reward works 9 | dataset_name: open-r1/DAPO-Math-17k-Processed 10 | dataset_config: all 11 | dataset_split: train 12 | 13 | # Generation arguments 14 | max_completion_length: 32000 15 | num_generations: 8 16 | temperature: 1.0 17 | 18 | # Reward func arguments 19 | reward_funcs: 20 | - accuracy 21 | reward_weights: 22 | - 1.0 23 | 24 | # Filtering arguments. 
Samples with mean reward outside of low / high will be filtered 25 | pass_rate_min: 0.1 26 | pass_rate_max: 0.6 27 | 28 | output_dataset_name: open-r1/DAPO-Math-17k-Processed-R1-Distill-Qwen-Math-7B-v03.00-step-000008190-filter 29 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/transformers/finetune/data/custom.py: -------------------------------------------------------------------------------- 1 | """ 2 | Custom dataset implementation for JSON and CSV data sources. 3 | 4 | This module provides a CustomDataset class for loading and processing 5 | local JSON and CSV files for text generation fine-tuning. Supports 6 | flexible data formats and custom preprocessing pipelines for 7 | user-provided datasets. 8 | """ 9 | 10 | from llmcompressor.transformers.finetune.data import TextGenerationDataset 11 | 12 | 13 | @TextGenerationDataset.register(name="custom", alias=["json", "csv"]) 14 | class CustomDataset(TextGenerationDataset): 15 | """ 16 | Child text generation class for custom local datasets, supporting loading 17 | from csv and json files 18 | 19 | :param dataset_args: configuration settings for dataset loading 20 | :param split: split from dataset to load, for instance `test` or `train[:5%]` 21 | Can also be set to None to load all the splits 22 | :param processor: processor or tokenizer to use on dataset 23 | 24 | """ 25 | 26 | pass 27 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/modeling/test_fuse.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | from llmcompressor.modeling.fuse import center_embeddings, fuse_norm_linears 5 | 6 | 7 | @pytest.mark.unit 8 | def test_center_embeddings(): 9 | embedding = torch.nn.Embedding(10, 10) 10 | center_embeddings(embedding) 11 | 12 | assert torch.allclose( 13 | embedding.weight.mean(dim=1), torch.zeros(embedding.num_embeddings), atol=1e-5 14 | ) 15 | 16 | 17 | @pytest.mark.unit 18 | def test_fuse_norm_linears(): 19 | norm = torch.nn.LayerNorm((5,)) 20 | norm.weight.data = torch.rand(norm.weight.shape) 21 | linears = [ 22 | torch.nn.Linear(5, 5), 23 | torch.nn.Linear(5, 5), 24 | ] 25 | 26 | input = torch.rand((1, 5), requires_grad=False) 27 | true_output = torch.stack([linear(norm(input)) for linear in linears]) 28 | 29 | fuse_norm_linears(norm, linears) 30 | output = torch.stack([linear(norm(input)) for linear in linears]) 31 | 32 | assert torch.allclose(true_output, output) 33 | -------------------------------------------------------------------------------- /recipes/dataset_filtering/filter_python.yaml: -------------------------------------------------------------------------------- 1 | # Model arguments 2 | model_name_or_path: open-r1/R1-Distill-Qwen-Math-7B-Merges 3 | model_revision: v00.00-step-000003660_v01.00-step-000002600_weights-0.50-0.50 4 | torch_dtype: bfloat16 5 | attn_implementation: flash_attention_2 6 | 7 | # Data training arguments 8 | # We edit the DeepSeek chat template to ensure (a) the reasoning block within <think> and </think> is included in the completion and (b) the <think> tag is not part of the prefill so that the format reward works 9 | dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled 10 | dataset_prompt_column: problem 11 | 12 | # Generation arguments 13 | max_completion_length: 16000 14 | num_generations: 8 15 | temperature: 0.7 16 | 17 | # Reward func arguments 18 | reward_funcs: 19 | - binary_code
20 | reward_weights: 21 | - 1.0 22 | e2b_router_url: ip-10-53-85-92:8000 23 | 24 | # Filtering arguments. Samples with mean reward outside of low / high will be filtered 25 | pass_rate_min: 0.1 26 | pass_rate_max: 0.6 27 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/transformers/utils/preprocessing_functions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Dataset preprocessing functions for text generation tasks. 3 | 4 | This module provides a registry of preprocessing functions for various 5 | datasets used in fine-tuning workflows. Includes chat templates, 6 | instruction formatting, and dataset-specific transformations for 7 | popular training datasets. 8 | """ 9 | 10 | from typing import TYPE_CHECKING, Dict 11 | 12 | from compressed_tensors.registry import RegistryMixin 13 | 14 | if TYPE_CHECKING: 15 | from llmcompressor.transformers.finetune.data.base import TextGenerationDataset 16 | 17 | 18 | class PreprocessingFunctionRegistry(RegistryMixin): 19 | pass 20 | 21 | 22 | @PreprocessingFunctionRegistry.register() 23 | def custom_evolved_codealpaca_dataset(self: "TextGenerationDataset", data: Dict): 24 | PROMPT_DICT = """[Instruction]:\n{instruction}\n\n[Response]:""" 25 | data["prompt"] = PROMPT_DICT.format_map(data) 26 | data["text"] = data["prompt"] + data["output"] 27 | return data 28 | -------------------------------------------------------------------------------- /utils/import_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from transformers.utils.import_utils import _is_package_available 16 | 17 | 18 | # Use same as transformers.utils.import_utils 19 | _e2b_available = _is_package_available("e2b") 20 | 21 | 22 | def is_e2b_available() -> bool: 23 | return _e2b_available 24 | 25 | 26 | _morph_available = _is_package_available("morphcloud") 27 | 28 | 29 | def is_morph_available() -> bool: 30 | return _morph_available 31 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/finetune/test_quantization.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | quant_modifiers: 3 | QuantizationModifier: 4 | ignore: 5 | - model.layers.0.mlp.down_proj 6 | - model.layers.1.mlp.down_proj 7 | - model.layers.2.mlp.down_proj 8 | - model.layers.3.mlp.down_proj 9 | - model.layers.4.mlp.down_proj 10 | - model.layers.5.mlp.down_proj 11 | config_groups: 12 | group_0: 13 | weights: 14 | num_bits: 8 15 | type: "int" 16 | symmetric: False 17 | strategy: "tensor" 18 | input_activations: null 19 | output_activations: null 20 | targets: ["Linear"] 21 | pruning_modifiers: 22 | ConstantPruningModifier: 23 | targets: [ 24 | "re:.*self_attn.q_proj", 25 | "re:.*self_attn.k_proj", 26 | "re:.*self_attn.v_proj", 27 | "re:.*self_attn.o_proj", 28 | "re:.*mlp.gate_proj", 29 | "re:.*mlp.up_proj" 30 | ] 31 | start: 0 32 | -------------------------------------------------------------------------------- /llm-compressor/tests/test_timer/timer_utils.py: -------------------------------------------------------------------------------- 1 | from functools import wraps 2 | 3 | from tests.test_timer import Timer 4 | 5 | __all__ = ["log_time", "get_singleton_manager"] 6 | 7 | 8 | def get_singleton_manager(enable_logging: bool = True): 9 | """ 10 | Return the Timer. If not has not yet been initialized, initialize and 11 | return. If it has, return the existing Timer. 12 | """ 13 | if Timer._instance is None: 14 | Timer._instance = Timer(enable_logging=enable_logging) 15 | return Timer._instance 16 | 17 | 18 | def log_time(func): 19 | """ 20 | Decorator to time functions. Times for the function are stored using 21 | the class and function names. 22 | """ 23 | 24 | @wraps(func) 25 | def wrapper(*args, **kwargs): 26 | TIMER_MANAGER = get_singleton_manager() 27 | func_name = func.__name__ 28 | 29 | if not TIMER_MANAGER.enable_logging: 30 | return func(*args, **kwargs) 31 | 32 | with TIMER_MANAGER.time(func_name): 33 | return func(*args, **kwargs) 34 | 35 | return wrapper 36 | -------------------------------------------------------------------------------- /src/open_r1/utils/import_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from transformers.utils.import_utils import _is_package_available 16 | 17 | 18 | # Use same as transformers.utils.import_utils 19 | _e2b_available = _is_package_available("e2b") 20 | 21 | 22 | def is_e2b_available() -> bool: 23 | return _e2b_available 24 | 25 | 26 | _morph_available = _is_package_available("morphcloud") 27 | 28 | 29 | def is_morph_available() -> bool: 30 | return _morph_available 31 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/observers/helpers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helper functions for observer token counting and analysis. 3 | 4 | Provides utility functions for analyzing observer statistics 5 | and token counts across model modules. Used for monitoring compression 6 | effects and understanding model behavior during quantization and 7 | pruning operations. 8 | """ 9 | 10 | from collections import Counter 11 | 12 | import torch 13 | 14 | __all__ = ["get_observer_token_count"] 15 | 16 | 17 | def get_observer_token_count(module: torch.nn.Module) -> Counter: 18 | """ 19 | Parse the module and return the number of tokens observed by 20 | each module's observer. 21 | 22 | :param module: module to parse 23 | :return: counter with the number of tokens observed by each observer 24 | """ 25 | token_counts = Counter() 26 | for name, submodule in module.named_modules(): 27 | if name.endswith(".input_observer"): 28 | token_counts[name.replace(".input_observer", "")] = ( 29 | submodule._num_observed_tokens 30 | ) 31 | return token_counts 32 | -------------------------------------------------------------------------------- /llm-compressor/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel", "setuptools_scm==8.2.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.mypy] 6 | files = "src/llmcompressor" 7 | 8 | [tool.ruff] 9 | extend-exclude = ["env", "src/llmcompressor/transformers/tracing/", "src/llmcompressor/version.py"] 10 | line-length = 88 11 | lint.select = ["E", "F", "W", "I"] 12 | lint.extend-ignore = ["E203", "W605"] 13 | 14 | [tool.ruff.lint.isort] 15 | known-first-party = ["llmcompressor"] 16 | 17 | [tool.pytest.ini_options] 18 | markers = [ 19 | "smoke: quick tests to check basic functionality", 20 | "sanity: tests to ensure that new changes do not break existing functionality", 21 | "regression: detailed tests to ensure major functions work correctly", 22 | "integration: tests which integrate with a third party service such as HF", 23 | "unit: tests to ensure code correctness and regression test functionality", 24 | "example: tests for content in the 'examples' folder", 25 | "multi_gpu: tests that require multiple GPUs", 26 | ] 27 | tmp_path_retention_policy = "failed" 28 | -------------------------------------------------------------------------------- /llm-compressor/docs/guides/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | weight: -5 3 | --- 4 | 5 | # Guides 6 | 7 | Welcome to the LLM Compressor guides section! Here you'll find comprehensive documentation covering key components and concepts of LLM Compressor. These guides will help you understand the various compression options available, how to apply them effectively, and how to deploy your optimized models for maximum performance. 8 | 9 | ## Key Guides 10 | 11 |
12 | 13 | - :material-tune:{ .lg .middle } Compression Schemes 14 | 15 | --- 16 | 17 | Explore the available compression schemes for Quantization and Pruning to determine which is best for your use case. 18 | 19 | [:octicons-arrow-right-24: Compression Schemes](compression_schemes.md) 20 | 21 | - :material-content-save:{ .lg .middle } Saving Models 22 | 23 | --- 24 | 25 | Learn the enhanced ways to save your compressed models with the library's extended `save_pretrained` functionality for compatibility with vLLM deployment. 26 | 27 | [:octicons-arrow-right-24: Saving a Model](saving_a_model.md) 28 | 29 |
30 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/transformers/finetune/data/c4.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from typing import TYPE_CHECKING 3 | 4 | from llmcompressor.transformers.finetune.data import TextGenerationDataset 5 | from llmcompressor.typing import Processor 6 | 7 | if TYPE_CHECKING: 8 | from llmcompressor.args import DatasetArguments 9 | 10 | 11 | @TextGenerationDataset.register(name="c4") 12 | class C4Dataset(TextGenerationDataset): 13 | """ 14 | Child text generation class for the C4 dataset 15 | 16 | :param dataset_args: configuration settings for dataset loading 17 | :param split: split from dataset to load, for instance `test` or `train[:5%]` 18 | :param processor: processor or tokenizer to use on dataset 19 | """ 20 | 21 | def __init__( 22 | self, dataset_args: "DatasetArguments", split: str, processor: Processor 23 | ): 24 | dataset_args = deepcopy(dataset_args) 25 | dataset_args.dataset = "allenai/c4" 26 | dataset_args.text_column = "text" 27 | 28 | super().__init__(dataset_args=dataset_args, split=split, processor=processor) 29 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/compression/recipes/new_quant_simple.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | quant_modifiers: 3 | QuantizationModifier: 4 | ignore: ["lm_head"] 5 | config_groups: 6 | group_0: 7 | weights: 8 | num_bits: 8 9 | type: "int" 10 | symmetric: true 11 | strategy: "tensor" 12 | input_activations: 13 | num_bits: 8 14 | type: "int" 15 | symmetric: false 16 | strategy: "tensor" 17 | output_activations: null 18 | targets: ["Linear"] 19 | group_1: 20 | weights: 21 | num_bits: 8 22 | type: "int" 23 | symmetric: true 24 | strategy: "tensor" 25 | input_activations: null 26 | output_activations: null 27 | targets: ["Embedding"] 28 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/modifiers/logarithmic_equalization/test_base.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from llmcompressor.modifiers.factory import ModifierFactory 4 | from llmcompressor.modifiers.logarithmic_equalization.base import ( 5 | LogarithmicEqualizationModifier, 6 | ) 7 | from llmcompressor.modifiers.smoothquant.base import SmoothQuantModifier 8 | 9 | 10 | @pytest.mark.unit 11 | @pytest.mark.usefixtures("setup_modifier_factory") 12 | def test_logarithmic_equalization_is_registered(): 13 | smoothing_strength = 0.3 14 | mappings = [(["layer1", "layer2"], "layer3")] 15 | modifier = ModifierFactory.create( 16 | type_="LogarithmicEqualizationModifier", 17 | allow_experimental=False, 18 | allow_registered=True, 19 | smoothing_strength=smoothing_strength, 20 | mappings=mappings, 21 | ) 22 | 23 | assert isinstance( 24 | modifier, LogarithmicEqualizationModifier 25 | ), "PyTorch LogarithmicEqualizationModifier not registered" 26 | assert isinstance(modifier, SmoothQuantModifier) 27 | assert modifier.smoothing_strength == smoothing_strength 28 | assert modifier.mappings == mappings 29 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/sparsegpt/test_oneshot_with_modifier.py: 
-------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from llmcompressor import oneshot 4 | from llmcompressor.modifiers.pruning.sparsegpt.base import SparseGPTModifier 5 | from tests.testing_utils import parse_params, requires_gpu 6 | 7 | CONFIGS_DIRECTORY = ( 8 | "tests/llmcompressor/transformers/sparsegpt/sparsegpt_configs/sparsity_generic" 9 | ) 10 | 11 | 12 | @requires_gpu 13 | @pytest.mark.integration 14 | @pytest.mark.parametrize("config", parse_params(CONFIGS_DIRECTORY)) 15 | def test_oneshot_with_modifier_object(tmp_path, config): 16 | output_dir = tmp_path / "oneshot_out" 17 | recipe_str = [SparseGPTModifier(sparsity=0.5, targets=[r"re:model.layers.\d+$"])] 18 | 19 | concatenate_data = False 20 | num_calibration_samples = 64 21 | splits = {"calibration": "train[:10%]"} 22 | 23 | oneshot( 24 | model=config["model"], 25 | dataset=config["dataset"], 26 | output_dir=output_dir, 27 | num_calibration_samples=num_calibration_samples, 28 | recipe=recipe_str, 29 | concatenate_data=concatenate_data, 30 | splits=splits, 31 | ) 32 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/core/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Provides the core compression framework for LLM Compressor. 3 | 4 | The core API manages compression sessions, tracks state changes, handles events 5 | during compression, and provides lifecycle hooks for the compression 6 | process. 7 | """ 8 | 9 | from llmcompressor.core.events import Event, EventType 10 | from llmcompressor.core.lifecycle import CompressionLifecycle 11 | from llmcompressor.core.model_layer import ModelParameterizedLayer 12 | from llmcompressor.core.session import CompressionSession 13 | from llmcompressor.core.session_functions import ( 14 | LifecycleCallbacks, 15 | active_session, 16 | callbacks, 17 | create_session, 18 | reset_session, 19 | ) 20 | from llmcompressor.core.state import Data, Hardware, ModifiedState, State 21 | 22 | __all__ = [ 23 | "Event", 24 | "EventType", 25 | "State", 26 | "Data", 27 | "Hardware", 28 | "ModifiedState", 29 | "ModelParameterizedLayer", 30 | "CompressionLifecycle", 31 | "CompressionSession", 32 | "create_session", 33 | "active_session", 34 | "reset_session", 35 | "apply", 36 | "callbacks", 37 | "LifecycleCallbacks", 38 | ] 39 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/transformers/finetune/data/wikitext.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from typing import TYPE_CHECKING 3 | 4 | from llmcompressor.transformers.finetune.data import TextGenerationDataset 5 | from llmcompressor.typing import Processor 6 | 7 | if TYPE_CHECKING: 8 | from llmcompressor.args import DatasetArguments 9 | 10 | 11 | @TextGenerationDataset.register(name="wikitext") 12 | class WikiTextDataset(TextGenerationDataset): 13 | """ 14 | Child text generation class for the WikiText dataset 15 | 16 | :param dataset_args: configuration settings for dataset loading 17 | :param split: split from dataset to load, for instance `test` or `train[:5%]` 18 | :param processor: processor or tokenizer to use on dataset 19 | """ 20 | 21 | def __init__( 22 | self, dataset_args: "DatasetArguments", split: str, processor: Processor 23 | ): 24 | dataset_args = deepcopy(dataset_args) 25 | dataset_args.dataset = "Salesforce/wikitext"
26 | dataset_args.text_column = "text" 27 | 28 | super().__init__( 29 | dataset_args=dataset_args, 30 | split=split, 31 | processor=processor, 32 | ) 33 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/utils/pytorch/test_module.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch.nn as nn 3 | 4 | from llmcompressor.utils.pytorch import get_layer_by_name 5 | 6 | 7 | @pytest.fixture 8 | def example_nested_module() -> str: 9 | return nn.Sequential( 10 | nn.Linear(10, 20), 11 | nn.Sequential(nn.ReLU(), nn.Linear(20, 10)), 12 | nn.Sequential(nn.SiLU(), nn.Linear(20, 10)), 13 | nn.Softmax(dim=1), 14 | ) 15 | 16 | 17 | @pytest.mark.unit 18 | def test_get_layer_by_name(example_nested_module): 19 | # Test getting the parent of a nested layer 20 | layer = get_layer_by_name("0", example_nested_module) 21 | assert layer == example_nested_module[0] 22 | 23 | layer = get_layer_by_name("1.1", example_nested_module) 24 | assert layer == example_nested_module[1][1] 25 | 26 | layer = get_layer_by_name("2.0", example_nested_module) 27 | assert layer == example_nested_module[2][0] 28 | 29 | layer = get_layer_by_name("2.1", example_nested_module) 30 | assert layer == example_nested_module[2][1] 31 | 32 | # Test getting the parent of a non-existent layer 33 | with pytest.raises(AttributeError): 34 | get_layer_by_name("non_existent_layer", example_nested_module) 35 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/utils/pytorch/utils.py: -------------------------------------------------------------------------------- 1 | import gc 2 | 3 | import torch 4 | 5 | __all__ = ["measure_cuda_memory"] 6 | 7 | 8 | class measure_cuda_memory: 9 | def __init__(self, device=None): 10 | self.device = device 11 | 12 | def reset_peak_memory_stats(self): 13 | torch.cuda.reset_peak_memory_stats(self.device) 14 | 15 | def current_memory_usage(self) -> float: 16 | # Return the memory usage in bytes. 
17 | self.reset_peak_memory_stats() 18 | mem = torch.cuda.max_memory_allocated(self.device) 19 | return mem 20 | 21 | def peak_memory_usage(self) -> float: 22 | # Return the peak memory usage in bytes since the last reset 23 | mem = torch.cuda.max_memory_allocated(self.device) 24 | return mem 25 | 26 | def __enter__(self): 27 | self.initial_memory = self.current_memory_usage() 28 | # This allows us to call methods of the context manager if needed 29 | return self 30 | 31 | def __exit__(self, exc_type, exc_val, exc_tb): 32 | self.overall_peak_memory = self.peak_memory_usage() 33 | self.peak_consumed_memory = self.overall_peak_memory - self.initial_memory 34 | 35 | # Force garbage collection 36 | gc.collect() 37 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/finetune/test_safetensors.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from llmcompressor import train 6 | from tests.testing_utils import parse_params, requires_gpu 7 | 8 | CONFIGS_DIRECTORY = "tests/llmcompressor/transformers/finetune/finetune_generic" 9 | 10 | 11 | @pytest.mark.integration 12 | @requires_gpu 13 | @pytest.mark.parametrize("config", parse_params(CONFIGS_DIRECTORY)) 14 | def test_safetensors(config, tmp_path): 15 | model = config["model"] 16 | dataset = config["dataset"] 17 | output = tmp_path / "finetune_output" 18 | 19 | output_dir = output / "output1" 20 | max_steps = 10 21 | splits = {"train": "train[:10%]"} 22 | 23 | train( 24 | model=model, 25 | dataset=dataset, 26 | output_dir=output_dir, 27 | max_steps=max_steps, 28 | splits=splits, 29 | ) 30 | 31 | assert os.path.exists(output_dir / "model.safetensors") 32 | assert not os.path.exists(output_dir / "pytorch_model.bin") 33 | 34 | # test we can also load 35 | new_output_dir = output / "output2" 36 | train( 37 | model=output_dir, 38 | dataset=dataset, 39 | output_dir=new_output_dir, 40 | max_steps=max_steps, 41 | splits=splits, 42 | ) 43 | -------------------------------------------------------------------------------- /openr1_tool/get_tensor_parallel_size.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from transformers import AutoConfig 3 | from math import gcd 4 | 5 | def get_tensor_parallel_size(model_name: str, revision: str = None, default_tp: int = 8) -> int: 6 | try: 7 | config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=True) 8 | num_heads = getattr(config, 'num_attention_heads', None) 9 | 10 | if num_heads is not None and num_heads % default_tp != 0: 11 | tp = gcd(num_heads, default_tp) 12 | return max(tp, 1) 13 | else: 14 | return default_tp 15 | except Exception as e: 16 | print(f"Warning: Failed to fetch config for {model_name}@{revision}: {e}") 17 | return default_tp 18 | 19 | if __name__ == "__main__": 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument("--model_name", type=str, required=True, help="Hugging Face model name or path") 22 | parser.add_argument("--revision", type=str, default=None, help="Model revision if applicable") 23 | parser.add_argument("--default_tp", type=int, default=8, help="Default TP size (usually GPUs per node)") 24 | 25 | args = parser.parse_args() 26 | 27 | tp = get_tensor_parallel_size(args.model_name, args.revision, args.default_tp) 28 | print(tp) 29 | -------------------------------------------------------------------------------- 
/llm-compressor/src/llmcompressor/pipelines/data_free/pipeline.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING, Optional 2 | 3 | import torch 4 | from torch.utils.data.dataloader import DataLoader 5 | 6 | from llmcompressor.core.session_functions import LifecycleCallbacks 7 | from llmcompressor.pipelines.registry import CalibrationPipeline 8 | from llmcompressor.utils.dev import dispatch_for_generation 9 | 10 | if TYPE_CHECKING: 11 | from llmcompressor.args.dataset_arguments import DatasetArguments 12 | 13 | __all__ = ["DataFreePipeline"] 14 | 15 | 16 | @CalibrationPipeline.register("datafree") 17 | class DataFreePipeline(CalibrationPipeline): 18 | @staticmethod 19 | def __call__( 20 | model: torch.nn.Module, 21 | dataloader: Optional[DataLoader], 22 | dataset_args: "DatasetArguments", 23 | ): 24 | """ 25 | A pipeline for data-free calibration 26 | 27 | :param model: model being calibrated 28 | :param dataloader: loads data for calibration 29 | :param dataset_args: dataset arguments relevant to pipelines 30 | """ 31 | # some ops are still performed on the model by modifiers 32 | # we want those ops to occur on the GPU 33 | dispatch_for_generation(model) 34 | 35 | LifecycleCallbacks.calibration_epoch_start() 36 | LifecycleCallbacks.calibration_epoch_end() 37 | -------------------------------------------------------------------------------- /llm-compressor/examples/quantization_w4a16_fp4/llama3_example.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForCausalLM, AutoTokenizer 2 | 3 | from llmcompressor import oneshot 4 | from llmcompressor.modifiers.quantization import QuantizationModifier 5 | from llmcompressor.utils import dispatch_for_generation 6 | 7 | MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct" 8 | 9 | # Load model. 10 | model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") 11 | tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) 12 | 13 | # Configure the quantization algorithm and scheme. 14 | # In this case, we: 15 | # * quantize the weights to fp4 with per group 16 via ptq 16 | recipe = QuantizationModifier(targets="Linear", scheme="NVFP4A16", ignore=["lm_head"]) 17 | 18 | # Apply quantization. 19 | oneshot(model=model, recipe=recipe) 20 | 21 | print("\n\n") 22 | print("========== SAMPLE GENERATION ==============") 23 | dispatch_for_generation(model) 24 | input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to( 25 | model.device 26 | ) 27 | output = model.generate(input_ids, max_new_tokens=100) 28 | print(tokenizer.decode(output[0])) 29 | print("==========================================\n\n") 30 | 31 | 32 | # Save to disk in compressed-tensors format. 
33 | SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-NVFP4A16" 34 | model.save_pretrained(SAVE_DIR, save_compressed=True) 35 | tokenizer.save_pretrained(SAVE_DIR) 36 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/modifiers/conf.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock 2 | 3 | from torch.utils.data import DataLoader 4 | 5 | from llmcompressor.core import Event, EventType, State 6 | 7 | 8 | class LifecyleTestingHarness: 9 | def __init__( 10 | self, 11 | model=None, 12 | optimizer=None, 13 | device="cpu", 14 | start=0, 15 | ): 16 | self.state = State() 17 | self.state.update( 18 | model=model, 19 | device=device, 20 | optimizer=optimizer, 21 | start=start, 22 | steps_per_epoch=1, 23 | calib_data=DataLoader(MagicMock(__len__=lambda _: 0, column_names=[])), 24 | ) 25 | 26 | def update_modifier(self, modifier, event_type): 27 | event = Event(event_type=event_type) 28 | modifier.update_event(self.state, event=event) 29 | 30 | def get_state(self): 31 | return self.state 32 | 33 | def trigger_modifier_for_epochs(self, modifier, num_epochs): 34 | for _ in range(num_epochs): 35 | self.update_modifier(modifier, EventType.BATCH_START) 36 | self.update_modifier(modifier, EventType.LOSS_CALCULATED) 37 | self.update_modifier(modifier, EventType.OPTIM_PRE_STEP) 38 | self.update_modifier(modifier, EventType.OPTIM_POST_STEP) 39 | self.update_modifier(modifier, EventType.BATCH_END) 40 | -------------------------------------------------------------------------------- /llm-compressor/docs/getting-started/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | weight: -10 3 | --- 4 | 5 | # Getting Started 6 | 7 | Welcome to LLM Compressor! This section will guide you through the process of installing the library, compressing your first model, and deploying it with vLLM for faster, more efficient inference. 8 | 9 | LLM Compressor makes it simple to optimize large language models for deployment, offering various quantization techniques that help you find the perfect balance between model quality, performance, and resource efficiency. 10 | 11 | ## Quick Start Guides 12 | 13 | Follow the guides below to get started with LLM Compressor and optimize your models for production deployment. 14 | 15 |
16 | 17 | - :material-package-variant:{ .lg .middle } Installation 18 | 19 | --- 20 | 21 | Learn how to install LLM Compressor using pip or from source. 22 | 23 | [:octicons-arrow-right-24: Installation Guide](install.md) 24 | 25 | - :material-memory:{ .lg .middle } Compress Your Model 26 | 27 | --- 28 | 29 | Learn how to apply quantization to your models using different algorithms and formats. 30 | 31 | [:octicons-arrow-right-24: Compression Guide](compress.md) 32 | 33 | - :material-rocket-launch:{ .lg .middle } Deploy with vLLM 34 | 35 | --- 36 | 37 | Deploy your compressed model for efficient inference using vLLM. 38 | 39 | [:octicons-arrow-right-24: Deployment Guide](deploy.md) 40 | 41 |
42 | -------------------------------------------------------------------------------- /llm-compressor/examples/quantization_w4a16_fp4/qwen3_example.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForCausalLM, AutoTokenizer 2 | 3 | from llmcompressor import oneshot 4 | from llmcompressor.modifiers.quantization import QuantizationModifier 5 | from llmcompressor.utils import dispatch_for_generation 6 | 7 | # Load model. 8 | MODEL_ID = "Qwen/Qwen3-32B" 9 | model = AutoModelForCausalLM.from_pretrained( 10 | MODEL_ID, torch_dtype="auto", trust_remote_code=True 11 | ) 12 | tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True) 13 | 14 | # Configure the quantization algorithm and scheme. 15 | # In this case, we: 16 | # * quantize the weights to fp4 with per group 16 via ptq 17 | recipe = QuantizationModifier(targets="Linear", scheme="NVFP4A16", ignore=["lm_head"]) 18 | 19 | # Apply quantization. 20 | oneshot(model=model, recipe=recipe) 21 | 22 | print("\n\n========== SAMPLE GENERATION ==============") 23 | dispatch_for_generation(model) 24 | input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to( 25 | model.device 26 | ) 27 | output = model.generate(input_ids, max_new_tokens=100) 28 | print(tokenizer.decode(output[0], skip_special_tokens=True)) 29 | print("==========================================\n\n") 30 | 31 | # Save to disk in compressed-tensors format. 32 | SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-NVFP4A16" 33 | model.save_pretrained(SAVE_DIR, save_compressed=True) 34 | tokenizer.save_pretrained(SAVE_DIR) 35 | -------------------------------------------------------------------------------- /trl_trainer/noise_scheduler.py: -------------------------------------------------------------------------------- 1 | from transformers.models.qwen2.modeling_qwen2 import Qwen2RMSNorm 2 | from transformers.models.llama.modeling_llama import LlamaRMSNorm 3 | import torch 4 | 5 | def get_sigma_by_step(step, total_steps, sigma_trend): 6 | step = min(step, total_steps) 7 | 8 | num_intervals = len(sigma_trend) + 1 9 | steps_per_interval = total_steps / num_intervals 10 | 11 | interval_id = int(step // steps_per_interval) 12 | 13 | if interval_id == 0: 14 | return interval_id, 0 15 | 16 | sigma_id = interval_id - 1 17 | sigma_id = min(sigma_id, len(sigma_trend) - 1) 18 | 19 | sigma = sigma_trend[sigma_id] 20 | return sigma_id, sigma 21 | 22 | def generate_gaussian_noise(model, step, total_step, sigma_trend): 23 | for name, module in model.named_modules(): 24 | if isinstance(module, Qwen2RMSNorm) or isinstance(module, LlamaRMSNorm): 25 | weight_tensor = module.weight 26 | sigma_id, sigma = get_sigma_by_step(step, total_step, sigma_trend) 27 | print("Current step:", step, "Total steps:", total_step, "Sigma id:", sigma_id, "Sigma:", sigma) 28 | if sigma == 0: 29 | return 30 | noise = torch.normal(mean=0, std=sigma, size=weight_tensor.shape, dtype=torch.float32).to(weight_tensor.device) 31 | noise = noise.to(weight_tensor.dtype) 32 | with torch.no_grad(): 33 | module.weight.add_(noise) -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/args/recipe_arguments.py: -------------------------------------------------------------------------------- 1 | """ 2 | Recipe argument classes for LLM compression workflows. 
3 | 4 | Defines dataclass-based argument containers for configuring sparsification 5 | recipes, compression sessions, and stage-based execution parameters used in 6 | model compression and optimization workflows. 7 | """ 8 | 9 | from dataclasses import dataclass, field 10 | from typing import List, Optional 11 | 12 | 13 | @dataclass 14 | class RecipeArguments: 15 | """Recipe and session variables""" 16 | 17 | recipe: Optional[str] = field( 18 | default=None, 19 | metadata={ 20 | "help": "Path to a LLM Compressor sparsification recipe", 21 | }, 22 | ) 23 | recipe_args: Optional[List[str]] = field( 24 | default=None, 25 | metadata={ 26 | "help": ( 27 | "List of recipe arguments to evaluate, of the format key1=value1 " 28 | "key2=value2" 29 | ) 30 | }, 31 | ) 32 | clear_sparse_session: Optional[bool] = field( 33 | default=False, 34 | metadata={ 35 | "help": ( 36 | "Whether to clear CompressionSession/CompressionLifecycle " 37 | "data between runs." 38 | ) 39 | }, 40 | ) 41 | stage: Optional[str] = field( 42 | default=None, 43 | metadata={"help": "The stage of the recipe to use for oneshot / train."}, 44 | ) 45 | -------------------------------------------------------------------------------- /llm-compressor/docs/guides/compression_formats.md: -------------------------------------------------------------------------------- 1 | # Compression Formats 2 | 3 | The following table outlines the possible quantization and sparsity 4 | compression formats that are applied to a model during compression. 5 | The formats are determined according to the quantization scheme and 6 | sparsity type. For more details on the quantization schemes, see 7 | `guides/compression_schemes.md`. 8 | 9 | 10 | | Quantization | Sparsity | Quant Compressor | Sparsity Compressor | 11 | |---------------|----------|----------------------|---------------------| 12 | | W8A8 - int | None | int_quantized | Dense | 13 | | W8A8 - float | None | float_quantized | Dense | 14 | | W4A16 - float | None | nvfp4_pack_quantized | Dense | 15 | | W4A4 - float | None | nvfp4_pack_quantized | Dense | 16 | | W4A16 - int | None | pack_quantized | Dense | 17 | | W8A16 - int | None | pack_quantized | Dense | 18 | | W8A16 - float | None | naive_quantized | Dense | 19 | | W8A8 - int | 2:4 | int_quantized | Sparse24 | 20 | | W8A8 - float | 2:4 | float_quantized | Sparse24 | 21 | | W4A16 - int | 2:4 | marlin_24 | Dense | 22 | | W8A16 - int | 2:4 | marlin_24 | Dense | 23 | | W8A16 - float | 2:4 | naive_quantized | Dense | 24 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/args/training_arguments.py: -------------------------------------------------------------------------------- 1 | """ 2 | Training argument classes for LLM compression workflows. 3 | 4 | This module defines dataclass-based argument containers for configuring 5 | training and one-shot calibration workflows. Extends HuggingFace's 6 | TrainingArguments with additional parameters specific to compression and 7 | stage-based execution.
8 | """ 9 | 10 | from dataclasses import dataclass, field 11 | from typing import Optional 12 | 13 | from transformers import TrainingArguments as HFTrainingArgs 14 | 15 | __all__ = [ 16 | "TrainingArguments", 17 | ] 18 | 19 | 20 | @dataclass 21 | class TrainingArguments(HFTrainingArgs): 22 | """ 23 | Training arguments specific to LLM Compressor Transformers workflow using 24 | HFTrainingArgs as base class 25 | 26 | """ 27 | 28 | do_oneshot: Optional[bool] = field( 29 | default=False, 30 | metadata={"help": "Whether to run one-shot calibration in stages"}, 31 | ) 32 | run_stages: Optional[bool] = field( 33 | default=False, metadata={"help": "Whether to trigger recipe stage by stage"} 34 | ) 35 | output_dir: str = field( 36 | default="./output", 37 | metadata={ 38 | "help": "The output directory where the model safetensors, " 39 | "recipe, config, and optionally checkpoints will be written." 40 | }, 41 | ) 42 | 43 | @property 44 | def place_model_on_device(self): 45 | return False 46 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/transformers/finetune/data/cnn_dailymail.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from typing import TYPE_CHECKING 3 | 4 | from llmcompressor.transformers.finetune.data import TextGenerationDataset 5 | from llmcompressor.typing import Processor 6 | 7 | if TYPE_CHECKING: 8 | from llmcompressor.args import DatasetArguments 9 | 10 | 11 | @TextGenerationDataset.register(name="cnn_dailymail") 12 | class CNNDailyMailDataset(TextGenerationDataset): 13 | """ 14 | Text generation class for the CNN/DailyMail dataset 15 | 16 | :param dataset_args: configuration settings for dataset loading 17 | :param split: split from dataset to load, for instance `test` or `train[:5%]` 18 | :param processor: processor or tokenizer to use on dataset 19 | """ 20 | 21 | SAMPLE_TEMPLATE = "Article:\n{article}\n\n### Summarization:\n{highlights}\n" 22 | 23 | def __init__( 24 | self, dataset_args: "DatasetArguments", split: str, processor: Processor 25 | ): 26 | dataset_args = deepcopy(dataset_args) 27 | dataset_args.dataset = "cnn_dailymail" 28 | dataset_args.dataset_config_name = "3.0.0" 29 | 30 | super().__init__(dataset_args=dataset_args, split=split, processor=processor) 31 | 32 | def dataset_template(self, sample): 33 | return { 34 | "text": self.SAMPLE_TEMPLATE.format( 35 | article=sample["article"], highlights=sample["highlights"] 36 | ) 37 | } 38 | -------------------------------------------------------------------------------- /openr1_tool/pass_rate_filtering/README.md: -------------------------------------------------------------------------------- 1 | # Pass rate filtering 2 | 3 | We provide support for filtering datasets by generating completions and computing pass rates on verifiable tasks. 4 | 5 | See `scripts/pass_rate_filtering/compute_pass_rate.py` and `scripts/pass_rate_filtering/launch_filtering.sh` (hardcoded for DAPO at the moment) 6 | 7 | By default the script chunks the dataset; the merge can be run using the following snippet (example for DAPO): 8 | 9 | ```python 10 | from datasets import load_dataset, concatenate_datasets 11 | 12 | name = "open-r1/DAPO-Math-17k-Processed-R1-Distill-Qwen-Math-7B-Merges-v00.02-v01.02-0.3-0.7-filter" 13 | 14 | gen_datasets = [] 15 | filt_datasets = [] 16 | for start in range(0,17400,200): 17 | end = start + 200 18 | if start == 17200: 19 | end = 17398 20 | gen_config_name = f"gen-{start}-{end}" 21 | gen_dataset =
load_dataset(name, gen_config_name, revision="gen", split="train") 22 | gen_datasets.append(gen_dataset) 23 | 24 | filt_config_name = f"filt-0.1-0.6-{start}-{end}" 25 | filt_dataset = load_dataset(name, filt_config_name, revision="pass_rate", split="train") 26 | filt_datasets.append(filt_dataset) 27 | 28 | gen_dataset = concatenate_datasets(gen_datasets) 29 | gen_dataset.push_to_hub(name, config_name="gen", split="train") 30 | print(gen_dataset) 31 | 32 | filt_dataset = concatenate_datasets(filt_datasets) 33 | filt_dataset.push_to_hub(name, config_name="default", split="train") 34 | 35 | print(filt_dataset) 36 | ``` -------------------------------------------------------------------------------- /llm-compressor/examples/compressed_inference/fp8_compressed_inference.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForCausalLM, AutoTokenizer 2 | 3 | """ 4 | This example covers how to load a quantized model using AutoModelForCausalLM. 5 | 6 | During inference, each layer will be decompressed as needed before the forward pass. 7 | This saves memory as only a single layer is ever uncompressed at a time, but increases 8 | runtime as we need to decompress each layer before running the forward pass 9 | 10 | """ 11 | 12 | # any model with the "compressed-tensors" quant_method and "compressed" 13 | # quantization_status in the quantization config is supported 14 | MODEL_STUB = "nm-testing/tinyllama-fp8-dynamic-compressed" 15 | 16 | SAMPLE_INPUT = [ 17 | "I love quantization because", 18 | "What is the capital of France?", 19 | "def fibonacci(n):", 20 | ] 21 | 22 | compressed_model = AutoModelForCausalLM.from_pretrained( 23 | MODEL_STUB, 24 | torch_dtype="auto", 25 | device_map="auto", 26 | ) 27 | 28 | # tokenize the sample data 29 | tokenizer = AutoTokenizer.from_pretrained(MODEL_STUB) 30 | inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to( 31 | compressed_model.device 32 | ) 33 | 34 | # run the compressed model and decode the output 35 | output = compressed_model.generate(**inputs, max_length=50) 36 | print("========== SAMPLE GENERATION ==============") 37 | text_output = tokenizer.batch_decode(output) 38 | for sample in text_output: 39 | print(sample) 40 | -------------------------------------------------------------------------------- /llm-compressor/examples/trl_mixin/README.md: -------------------------------------------------------------------------------- 1 | # Sparse Finetuning with TRL's SFTTrainer 2 | 3 | The `SessionManagerMixIn` can be added to other Trainer classes that inherit from 4 | [Hugging Face's Trainer](https://huggingface.co/docs/transformers/en/main_classes/trainer). 5 | 6 | For example, we can add LLM Compressor support to TRL's SFTTrainer like so: 7 | 8 | Note: install `trl` using `pip install trl` 9 | 10 | ```python 11 | from trl import SFTTrainer as TRLSFTTrainer 12 | 13 | class SFTTrainer(SessionManagerMixIn, TRLSFTTrainer): 14 | ... 15 | ``` 16 | 17 | The new `SFTTrainer` class can now apply LLM Compressor recipes and modifiers during 18 | supervised finetuning, with full support for all of the original TRL features. The full 19 | class is defined in the script `sft_trainer.py` and requires very minimal 20 | additional code: just a dataset load override to support passing in tokenized datasets 21 | to the Trainer. 22 | 23 | ### Examples 24 | 25 | * Script `ex_trl_constant.py`: finetunes a 50% sparse Llama-7b model, 26 | using TRL's dataset preprocessing.
Sparsity is maintained throughout training by 27 | applying a `ConstantPruningModifier` recipe to the `SFTTrainer` 28 | 29 | * Script `ex_trl_distillation.py`: finetunes a 50% sparse Llama-7b 30 | model using knowledge distillation from a dense Llama-7b model. Sparsity is maintained 31 | throughout training with a `ConstantPruningModifier` and layer-wise knowledge 32 | distillation is handled by the `OutputDistillationModifier` -------------------------------------------------------------------------------- /llm-compressor/examples/quantization_w8a8_fp8/llama3_example.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForCausalLM, AutoTokenizer 2 | 3 | from llmcompressor import oneshot 4 | from llmcompressor.modifiers.quantization import QuantizationModifier 5 | from llmcompressor.utils import dispatch_for_generation 6 | 7 | MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" 8 | 9 | # Load model. 10 | model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") 11 | tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) 12 | 13 | # Configure the quantization algorithm and scheme. 14 | # In this case, we: 15 | # * quantize the weights to fp8 with per channel via ptq 16 | # * quantize the activations to fp8 with dynamic per token 17 | recipe = QuantizationModifier( 18 | targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"] 19 | ) 20 | 21 | # Apply quantization. 22 | oneshot(model=model, recipe=recipe) 23 | 24 | # Confirm generations of the quantized model look sane. 25 | print("========== SAMPLE GENERATION ==============") 26 | dispatch_for_generation(model) 27 | input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to( 28 | model.device 29 | ) 30 | output = model.generate(input_ids, max_new_tokens=20) 31 | print(tokenizer.decode(output[0])) 32 | print("==========================================") 33 | 34 | # Save to disk in compressed-tensors format. 35 | SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-Dynamic" 36 | model.save_pretrained(SAVE_DIR) 37 | tokenizer.save_pretrained(SAVE_DIR) 38 | -------------------------------------------------------------------------------- /llm-compressor/examples/quantization_w8a8_fp8/fp8_block_example.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForCausalLM, AutoTokenizer 2 | 3 | from llmcompressor import oneshot 4 | from llmcompressor.modifiers.quantization import QuantizationModifier 5 | from llmcompressor.utils import dispatch_for_generation 6 | 7 | MODEL_ID = "Qwen/Qwen3-30B-A3B" 8 | 9 | # Load model. 10 | model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") 11 | tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) 12 | 13 | # Configure the quantization algorithm and scheme. 14 | # In this case, we: 15 | # * quantize the weights to fp8 with per channel via ptq 16 | # * quantize the activations to fp8 with dynamic per token 17 | recipe = QuantizationModifier( 18 | targets="Linear", 19 | scheme="FP8_BLOCK", 20 | ignore=["lm_head", "re:.*mlp.gate$"], 21 | ) 22 | 23 | # Apply quantization. 24 | oneshot(model=model, recipe=recipe) 25 | 26 | # Confirm generations of the quantized model look sane. 
27 | print("========== SAMPLE GENERATION ==============") 28 | dispatch_for_generation(model) 29 | input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to( 30 | model.device 31 | ) 32 | output = model.generate(input_ids, max_new_tokens=20) 33 | print(tokenizer.decode(output[0])) 34 | print("==========================================") 35 | 36 | # Save to disk in compressed-tensors format. 37 | SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-BLOCK" 38 | model.save_pretrained(SAVE_DIR) 39 | tokenizer.save_pretrained(SAVE_DIR) 40 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/modifiers/smoothquant/test_base.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from llmcompressor.modifiers.factory import ModifierFactory 4 | from llmcompressor.modifiers.smoothquant.base import SmoothQuantModifier 5 | 6 | 7 | @pytest.mark.unit 8 | @pytest.mark.usefixtures("setup_modifier_factory") 9 | def test_smooth_quant_is_registered(): 10 | smoothing_strength = 0.3 11 | mappings = [(["layer1", "layer2"], "layer3")] 12 | modifier = ModifierFactory.create( 13 | type_="SmoothQuantModifier", 14 | allow_experimental=False, 15 | allow_registered=True, 16 | smoothing_strength=smoothing_strength, 17 | mappings=mappings, 18 | ) 19 | 20 | assert isinstance( 21 | modifier, SmoothQuantModifier 22 | ), "PyTorch SmoothQuant not registered" 23 | assert modifier.smoothing_strength == smoothing_strength 24 | assert modifier.mappings == mappings 25 | 26 | 27 | @pytest.mark.unit 28 | @pytest.mark.usefixtures("setup_modifier_factory") 29 | def test_smooth_quant_defaults(): 30 | default_sq = SmoothQuantModifier() 31 | assert default_sq.smoothing_strength == 0.5 32 | 33 | 34 | @pytest.mark.unit 35 | def test_override_defaults(): 36 | strength = 0.7 37 | dummy_map = [(["layer1", "layer2"], "layer3")] 38 | non_default_sq = SmoothQuantModifier( 39 | smoothing_strength=strength, mappings=dummy_map 40 | ) 41 | 42 | assert non_default_sq.smoothing_strength == strength 43 | assert non_default_sq.mappings == dummy_map 44 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/transformers/finetune/data/gsm8k.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from typing import TYPE_CHECKING 3 | 4 | from llmcompressor.transformers.finetune.data import TextGenerationDataset 5 | from llmcompressor.typing import Processor 6 | 7 | if TYPE_CHECKING: 8 | from llmcompressor.args import DatasetArguments 9 | 10 | 11 | @TextGenerationDataset.register(name="gsm8k") 12 | class GSM8KDataset(TextGenerationDataset): 13 | """ 14 | Child text generation class for the Grade School Math 8k dataset 15 | 16 | :param dataset_args: configuration settings for dataset loading 17 | :param split: split from dataset to load, for instance `test` or `train[:5%]` 18 | :param processor: processor or tokenizer to use on dataset 19 | """ 20 | 21 | GSM_TEMPLATE = "Question: {question}\nAnswer:" 22 | 23 | def __init__( 24 | self, dataset_args: "DatasetArguments", split: str, processor: Processor 25 | ): 26 | dataset_args = deepcopy(dataset_args) 27 | dataset_args.dataset = "gsm8k" 28 | dataset_args.text_column = "text" 29 | 30 | super().__init__(dataset_args=dataset_args, split=split, processor=processor) 31 | 32 | def dataset_template(self, sample): 33 | prompt = 
self.GSM_TEMPLATE.format(question=sample["question"]) 34 | text = prompt 35 | if "answer" in sample: 36 | text += " " + sample["answer"] 37 | 38 | return { 39 | "text": text, 40 | self.PROMPT_KEY: prompt, 41 | } 42 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/sparsegpt/test_sparsegpt_owl.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | from datasets import Dataset 4 | from transformers import AutoModelForCausalLM 5 | 6 | from llmcompressor.core.session_functions import create_session 7 | from llmcompressor.datasets import format_calibration_data 8 | from llmcompressor.modifiers.pruning.sparsegpt import SparseGPTModifier 9 | from llmcompressor.utils.pytorch.module import get_layers 10 | 11 | 12 | @pytest.mark.integration 13 | def test_infer_owl_layer_sparsity(): 14 | target_sparsity = 0.7 15 | vocab_size = 512 16 | seq_len = 2048 17 | ds_size = 16 18 | 19 | with create_session() as session: 20 | session.initialize() 21 | modifier = SparseGPTModifier( 22 | sparsity=0.7, sparsity_profile="owl", owl_m=5, owl_lmbda=0.05 23 | ) 24 | model = AutoModelForCausalLM.from_pretrained("nm-testing/tinysmokellama-3.2") 25 | 26 | dataset = Dataset.from_dict( 27 | {"input_ids": torch.randint(0, vocab_size, (ds_size, seq_len))} 28 | ) 29 | dataloader = format_calibration_data(dataset) 30 | 31 | sequential_targets = modifier._infer_sequential_targets(model) 32 | layers = get_layers(sequential_targets, model) 33 | sparsities = modifier._infer_owl_layer_sparsity(model, layers, dataloader) 34 | assert sparsities.keys() == layers.keys() 35 | 36 | for sparsity in sparsities.values(): 37 | assert sparsity == pytest.approx(target_sparsity, abs=0.1) 38 | -------------------------------------------------------------------------------- /llm-compressor/examples/quantization_w8a8_fp8/qwen3_vl_moe_fp8_example.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoProcessor, Qwen3VLMoeForConditionalGeneration 2 | 3 | from llmcompressor import oneshot 4 | from llmcompressor.modeling import replace_modules_for_calibration 5 | from llmcompressor.modifiers.quantization import QuantizationModifier 6 | 7 | # NOTE: Qwen3-VL-MoE support is not in transformers<=4.56.2 8 | # you may need to install transformers from source 9 | 10 | 11 | MODEL_ID = "Qwen/Qwen3-VL-235B-A22B-Instruct" 12 | 13 | # Load model. 14 | model = Qwen3VLMoeForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype="auto") 15 | processor = AutoProcessor.from_pretrained(MODEL_ID) 16 | model = replace_modules_for_calibration(model) 17 | 18 | # Configure the quantization algorithm and scheme. 19 | # In this case, we: 20 | # * quantize the weights to fp8 with channel-wise quantization 21 | # * quantize the activations to fp8 with dynamic token activations 22 | # NOTE: only datafree quantization is supported for Qwen3-VL-MoE currently 23 | recipe = QuantizationModifier( 24 | targets="Linear", 25 | scheme="FP8_DYNAMIC", 26 | ignore=[ 27 | "re:.*lm_head", 28 | "re:visual.*", 29 | "re:model.visual.*", 30 | "re:.*mlp.gate$", 31 | ], 32 | ) 33 | 34 | # Apply quantization. 35 | oneshot(model=model, recipe=recipe) 36 | 37 | # Save to disk in compressed-tensors format. 
38 | SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-DYNAMIC" 39 | model.save_pretrained(SAVE_DIR) 40 | processor.save_pretrained(SAVE_DIR) 41 | -------------------------------------------------------------------------------- /utils/competitive_programming/ioi_utils.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from functools import lru_cache 3 | 4 | from datasets import load_dataset 5 | 6 | 7 | def add_includes(code: str, problem_id: str) -> str: 8 | """ 9 | Fix common compilation errors for IOI problems. 10 | """ 11 | if not code: 12 | return code 13 | # has most of the useful functions 14 | code_header = "#include <bits/stdc++.h>\n" 15 | # include the problem header 16 | problem_header_include = f'#include "{problem_id}.h"' 17 | if problem_header_include not in code: 18 | code_header += problem_header_include + "\n" 19 | # use namespace std since models forget std:: often 20 | if "using namespace std;" not in code and "std::" not in code: 21 | code_header += "\nusing namespace std;\n\n" 22 | return code_header + code 23 | 24 | 25 | @lru_cache 26 | def load_ioi_tests_for_year(year: int) -> dict[str, dict[str, tuple[str, str]]]: 27 | """ 28 | Load IOI tests for a given year. 29 | """ 30 | tests_dataset = load_dataset("open-r1/ioi-test-cases", name=f"{year}", split="train") 31 | test_cases = defaultdict(dict) 32 | for test_case in tests_dataset: 33 | test_cases[test_case["problem_id"]][test_case["test_name"]] = test_case["test_input"], test_case["test_output"] 34 | return test_cases 35 | 36 | 37 | def load_ioi_tests(year: int, problem_id: str) -> dict[str, tuple[str, str]]: 38 | """ 39 | Load IOI tests for a given year and problem id. 40 | """ 41 | return load_ioi_tests_for_year(year)[problem_id] 42 | -------------------------------------------------------------------------------- /llm-compressor/examples/quantization_w8a8_fp8/qwen3_next_example.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForCausalLM, AutoTokenizer 2 | 3 | from llmcompressor import oneshot 4 | from llmcompressor.modifiers.quantization import QuantizationModifier 5 | from llmcompressor.utils import dispatch_for_generation 6 | 7 | MODEL_ID = "Qwen/Qwen3-Next-80B-A3B-Instruct" 8 | 9 | # Load model. 10 | model = AutoModelForCausalLM.from_pretrained( 11 | MODEL_ID, 12 | torch_dtype="auto", 13 | low_cpu_mem_usage=True, 14 | trust_remote_code=True, 15 | ) 16 | tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True) 17 | 18 | recipe = QuantizationModifier( 19 | targets=["Linear"], 20 | scheme="FP8_DYNAMIC", 21 | ignore=[ 22 | "lm_head", 23 | "re:.*mlp.gate$", 24 | "re:.*mlp.shared_expert_gate$", 25 | "re:.*linear_attn.*", 26 | ], 27 | ) 28 | 29 | # Apply quantization. 30 | oneshot(model=model, recipe=recipe) 31 | 32 | # Confirm generations of the quantized model look sane. 33 | print("========== SAMPLE GENERATION ==============") 34 | dispatch_for_generation(model) 35 | input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to( 36 | model.device 37 | ) 38 | output = model.generate(input_ids, max_new_tokens=20) 39 | print(tokenizer.decode(output[0])) 40 | print("==========================================") 41 | 42 | # Save to disk in compressed-tensors format.
43 | SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-Dynamic" 44 | model.save_pretrained(SAVE_DIR, save_compressed=True) 45 | tokenizer.save_pretrained(SAVE_DIR) 46 | -------------------------------------------------------------------------------- /src/open_r1/utils/competitive_programming/ioi_utils.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from functools import lru_cache 3 | 4 | from datasets import load_dataset 5 | 6 | 7 | def add_includes(code: str, problem_id: str) -> str: 8 | """ 9 | Fix common compilation errors for IOI problems. 10 | """ 11 | if not code: 12 | return code 13 | # has most of the useful functions 14 | code_header = "#include <bits/stdc++.h>\n" 15 | # include the problem header 16 | problem_header_include = f'#include "{problem_id}.h"' 17 | if problem_header_include not in code: 18 | code_header += problem_header_include + "\n" 19 | # use namespace std since models forget std:: often 20 | if "using namespace std;" not in code and "std::" not in code: 21 | code_header += "\nusing namespace std;\n\n" 22 | return code_header + code 23 | 24 | 25 | @lru_cache 26 | def load_ioi_tests_for_year(year: int) -> dict[str, dict[str, tuple[str, str]]]: 27 | """ 28 | Load IOI tests for a given year. 29 | """ 30 | tests_dataset = load_dataset("open-r1/ioi-test-cases", name=f"{year}", split="train") 31 | test_cases = defaultdict(dict) 32 | for test_case in tests_dataset: 33 | test_cases[test_case["problem_id"]][test_case["test_name"]] = test_case["test_input"], test_case["test_output"] 34 | return test_cases 35 | 36 | 37 | def load_ioi_tests(year: int, problem_id: str) -> dict[str, tuple[str, str]]: 38 | """ 39 | Load IOI tests for a given year and problem id. 40 | """ 41 | return load_ioi_tests_for_year(year)[problem_id] 42 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/modifiers/smoothquant/test_utils.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | 3 | import pytest 4 | 5 | from llmcompressor.modifiers.smoothquant.utils import ( 6 | get_layer_mappings_from_architecture, 7 | handle_mapping_resolution_errors, 8 | ) 9 | 10 | smoothquant_utils = "llmcompressor.modifiers.smoothquant.utils" 11 | 12 | 13 | @pytest.mark.unit 14 | def test_handle_mapping_resolution_errors(): 15 | README_LOCATION = ( 16 | "https://github.com/vllm-project/llm-compressor/tree/main/" 17 | "src/llmcompressor/modifiers/smoothquant" 18 | ) 19 | 20 | @handle_mapping_resolution_errors 21 | def func_that_raises_exception(): 22 | raise ValueError("An error occurred") 23 | 24 | with pytest.raises(RuntimeError) as excinfo: 25 | func_that_raises_exception() 26 | 27 | assert "Error resolving mappings for given architecture."
in str(excinfo.value) 28 | assert "Please refer to the README at" in str(excinfo.value) 29 | assert README_LOCATION in str(excinfo.value) 30 | 31 | 32 | @pytest.mark.unit 33 | @patch( 34 | f"{smoothquant_utils}.MAPPINGS_REGISTRY", {"arch1": "mapping1", "arch2": "mapping2"} 35 | ) 36 | @patch(f"{smoothquant_utils}.DEFAULT_SMOOTHQUANT_MAPPINGS", "default_mapping") 37 | def test_get_layer_mappings_from_architecture(): 38 | # Test when architecture is in MAPPINGS_REGISTRY 39 | assert get_layer_mappings_from_architecture("arch1") == "mapping1" 40 | 41 | # Test when architecture is not in MAPPINGS_REGISTRY 42 | assert get_layer_mappings_from_architecture("arch3") == "default_mapping" 43 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/finetune/data/test_dataset_helpers.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from llmcompressor.args import DatasetArguments 4 | from llmcompressor.datasets import make_dataset_splits 5 | from llmcompressor.transformers.finetune.data.data_helpers import get_raw_dataset 6 | 7 | 8 | @pytest.mark.unit 9 | def test_combined_datasets(): 10 | dataset_args = DatasetArguments( 11 | dataset="wikitext", dataset_config_name="wikitext-2-raw-v1" 12 | ) 13 | raw_wikitext2 = get_raw_dataset(dataset_args) 14 | datasets = {"all": raw_wikitext2} 15 | split_datasets = make_dataset_splits(datasets, do_train=True) 16 | assert split_datasets.get("train") is not None 17 | 18 | split_datasets = make_dataset_splits(datasets, do_train=True) 19 | assert split_datasets.get("train") is not None 20 | 21 | 22 | @pytest.mark.unit 23 | def test_separate_datasets(): 24 | splits = {"train": "train[:5%]", "validation": "train[10%:20%]"} 25 | dataset_args = DatasetArguments( 26 | dataset="wikitext", dataset_config_name="wikitext-2-raw-v1" 27 | ) 28 | datasets = {} 29 | for split_name, split_str in splits.items(): 30 | raw_wikitext2 = get_raw_dataset(dataset_args, split=split_str) 31 | datasets[split_name] = raw_wikitext2 32 | 33 | split_datasets = make_dataset_splits(datasets, do_train=True) 34 | assert split_datasets.get("train") is not None 35 | 36 | with pytest.raises(ValueError): 37 | # fails due to no test split specified 38 | 39 | datasets.pop("train") 40 | split_datasets = make_dataset_splits(datasets, do_train=True) 41 | -------------------------------------------------------------------------------- /llm-compressor/examples/quantization_w8a8_fp8/qwen2vl_example.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoProcessor, Qwen2VLForConditionalGeneration 2 | 3 | from llmcompressor import oneshot 4 | from llmcompressor.modifiers.quantization import QuantizationModifier 5 | from llmcompressor.utils import dispatch_for_generation 6 | 7 | MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct" 8 | 9 | # Load model. 10 | model = Qwen2VLForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype="auto") 11 | processor = AutoProcessor.from_pretrained(MODEL_ID) 12 | 13 | # Configure the quantization algorithm and scheme. 14 | # In this case, we: 15 | # * quantize the weights to fp8 with per channel via ptq 16 | # * quantize the activations to fp8 with dynamic per token 17 | recipe = QuantizationModifier( 18 | targets="Linear", 19 | scheme="FP8_DYNAMIC", 20 | ignore=["re:.*lm_head", "re:visual.*"], 21 | ) 22 | 23 | # Apply quantization and save to disk in compressed-tensors format. 
24 | oneshot(model=model, recipe=recipe) 25 | 26 | # Confirm generations of the quantized model look sane. 27 | print("========== SAMPLE GENERATION ==============") 28 | dispatch_for_generation(model) 29 | input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to( 30 | model.device 31 | ) 32 | output = model.generate(input_ids, max_new_tokens=20) 33 | print(processor.decode(output[0])) 34 | print("==========================================") 35 | 36 | # Save to disk in compressed-tensors format. 37 | SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-Dynamic" 38 | model.save_pretrained(SAVE_DIR, save_compressed=True) 39 | processor.save_pretrained(SAVE_DIR) 40 | -------------------------------------------------------------------------------- /llm-compressor/examples/quantization_w8a8_fp8/qwen_2_5_vl_example.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration 2 | 3 | from llmcompressor import oneshot 4 | from llmcompressor.modifiers.quantization import QuantizationModifier 5 | from llmcompressor.utils import dispatch_for_generation 6 | 7 | MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct" 8 | 9 | # Load model. 10 | model = Qwen2_5_VLForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype="auto") 11 | processor = AutoProcessor.from_pretrained(MODEL_ID) 12 | 13 | # Configure the quantization algorithm and scheme. 14 | # In this case, we: 15 | # * quantize the weights to fp8 with per channel via ptq 16 | # * quantize the activations to fp8 with dynamic per token 17 | recipe = QuantizationModifier( 18 | targets="Linear", 19 | scheme="FP8_DYNAMIC", 20 | ignore=["lm_head", "re:visual.*", "re:model.visual.*"], 21 | ) 22 | 23 | # Apply quantization and save to disk in compressed-tensors format. 24 | oneshot(model=model, recipe=recipe) 25 | 26 | # Confirm generations of the quantized model look sane. 27 | print("========== SAMPLE GENERATION ==============") 28 | dispatch_for_generation(model) 29 | input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda") 30 | output = model.generate(input_ids, max_new_tokens=20) 31 | print(processor.decode(output[0])) 32 | print("==========================================") 33 | 34 | # Save to disk in compressed-tensors format. 
35 | SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-Dynamic" 36 | model.save_pretrained(SAVE_DIR, save_compressed=True) 37 | processor.save_pretrained(SAVE_DIR) 38 | -------------------------------------------------------------------------------- /llm-compressor/tests/e2e/vLLM/run_vllm.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | 4 | import torch 5 | from vllm import LLM, SamplingParams 6 | 7 | 8 | def parse_args(): 9 | """Parse JSON arguments passed via command line.""" 10 | if len(sys.argv) < 4: 11 | msg = "Usage: python script.py '<scheme>' '<llm_kwargs>' '<prompts>'" 12 | raise ValueError(msg) 13 | 14 | try: 15 | scheme = json.loads(sys.argv[1]) 16 | llm_kwargs = json.loads(sys.argv[2]) 17 | prompts = json.loads(sys.argv[3]) 18 | except json.JSONDecodeError as e: 19 | raise ValueError(f"Invalid JSON input: {e}") 20 | 21 | if "W4A16_2of4" in scheme: 22 | # required by the kernel 23 | llm_kwargs["dtype"] = torch.float16 24 | 25 | return llm_kwargs, prompts 26 | 27 | 28 | def run_vllm(llm_kwargs: dict, prompts: list[str]) -> None: 29 | """Run vLLM with given kwargs and prompts, then print outputs.""" 30 | sampling_params = SamplingParams(temperature=0.80, top_p=0.95) 31 | 32 | llm = LLM(**llm_kwargs) 33 | outputs = llm.generate(prompts, sampling_params) 34 | 35 | print("================= vLLM GENERATION =================") 36 | for output in outputs: 37 | if not output or not output.outputs: 38 | print("[Warning] Empty output for prompt:", output.prompt) 39 | continue 40 | 41 | print(f"\nPROMPT:\n{output.prompt}") 42 | print(f"GENERATED TEXT:\n{output.outputs[0].text}") 43 | 44 | 45 | def main(): 46 | llm_kwargs, prompts = parse_args() 47 | run_vllm(llm_kwargs, prompts) 48 | 49 | 50 | if __name__ == "__main__": 51 | main() 52 | -------------------------------------------------------------------------------- /llm-compressor/examples/quantization_w8a8_fp8/llava1.5_example.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoProcessor, LlavaForConditionalGeneration 2 | 3 | from llmcompressor import oneshot 4 | from llmcompressor.modifiers.quantization import QuantizationModifier 5 | from llmcompressor.utils import dispatch_for_generation 6 | 7 | MODEL_ID = "llava-hf/llava-1.5-7b-hf" 8 | 9 | # Load model. 10 | model = LlavaForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype="auto") 11 | processor = AutoProcessor.from_pretrained(MODEL_ID) 12 | 13 | # Configure the quantization algorithm and scheme. 14 | # In this case, we: 15 | # * quantize the weights to fp8 with per channel via ptq 16 | # * quantize the activations to fp8 with dynamic per token 17 | recipe = QuantizationModifier( 18 | targets="Linear", 19 | scheme="FP8_DYNAMIC", 20 | ignore=["re:.*lm_head", "re:.*multi_modal_projector.*", "re:.*vision_tower.*"], 21 | ) 22 | 23 | # Apply quantization and save to disk in compressed-tensors format. 24 | oneshot(model=model, recipe=recipe) 25 | 26 | # Confirm generations of the quantized model look sane. 27 | print("========== SAMPLE GENERATION ==============") 28 | dispatch_for_generation(model) 29 | input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to( 30 | model.device 31 | ) 32 | output = model.generate(input_ids, max_new_tokens=20) 33 | print(processor.decode(output[0])) 34 | print("==========================================") 35 | 36 | # Save to disk in compressed-tensors format.
37 | SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-Dynamic" 38 | model.save_pretrained(SAVE_DIR, save_compressed=True) 39 | processor.save_pretrained(SAVE_DIR) 40 | -------------------------------------------------------------------------------- /llm-compressor/src/llmcompressor/recipe/metadata.py: -------------------------------------------------------------------------------- 1 | """ 2 | Metadata classes for recipe and model information tracking. 3 | 4 | This module defines Pydantic models for capturing and validating metadata about 5 | datasets, parameters, layers, and models used in compression recipes. Provides 6 | structured data containers for recipe configuration and execution tracking. 7 | """ 8 | 9 | from typing import Any, Dict, List, Optional 10 | 11 | from pydantic import BaseModel, Field 12 | 13 | __all__ = [ 14 | "DatasetMetaData", 15 | "ParamMetaData", 16 | "LayerMetaData", 17 | "ModelMetaData", 18 | ] 19 | 20 | 21 | class DatasetMetaData(BaseModel): 22 | name: str = None 23 | version: str = None 24 | hash: str = None 25 | shape: List[int] = Field(default_factory=list) 26 | num_classes: int = None 27 | num_train_samples: int = None 28 | num_val_samples: int = None 29 | num_test_samples: int = None 30 | 31 | 32 | class ParamMetaData(BaseModel): 33 | name: str = None 34 | shape: List[int] = None 35 | weight_hash: str = None 36 | 37 | 38 | class LayerMetaData(BaseModel): 39 | name: str = None 40 | type: str = None 41 | index: int = None 42 | attributes: Dict[str, Any] = None 43 | input_shapes: List[List[int]] = None 44 | output_shapes: List[List[int]] = None 45 | params: Dict[str, ParamMetaData] = None 46 | 47 | 48 | class ModelMetaData(BaseModel): 49 | architecture: str = None 50 | sub_architecture: str = None 51 | input_shapes: List[List[int]] = None 52 | output_shapes: List[List[int]] = None 53 | layers: List[LayerMetaData] = Field(default_factory=list) 54 | layer_prefix: Optional[str] = None 55 | -------------------------------------------------------------------------------- /llm-compressor/examples/quantization_w8a8_fp8/llama3.2_vision_example.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoProcessor, MllamaForConditionalGeneration 2 | 3 | from llmcompressor import oneshot 4 | from llmcompressor.modifiers.quantization import QuantizationModifier 5 | from llmcompressor.utils import dispatch_for_generation 6 | 7 | MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct" 8 | 9 | # Load model. 10 | model = MllamaForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype="auto") 11 | processor = AutoProcessor.from_pretrained(MODEL_ID) 12 | 13 | # Configure the quantization algorithm and scheme. 14 | # In this case, we: 15 | # * quantize the weights to fp8 with per-channel scales via PTQ 16 | # * quantize the activations to fp8 with dynamic per-token scales 17 | recipe = QuantizationModifier( 18 | targets="Linear", 19 | scheme="FP8_DYNAMIC", 20 | ignore=["re:.*lm_head", "re:.*multi_modal_projector.*", "re:.*vision_model.*"], 21 | ) 22 | 23 | # Apply quantization. 24 | oneshot(model=model, recipe=recipe) 25 | 26 | # Confirm generations of the quantized model look sane.
27 | print("========== SAMPLE GENERATION ==============") 28 | dispatch_for_generation(model) 29 | input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to( 30 | model.device 31 | ) 32 | output = model.generate(input_ids, max_new_tokens=20) 33 | print(processor.decode(output[0])) 34 | print("==========================================") 35 | 36 | # Save to disk in compressed-tensors format. 37 | SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-Dynamic" 38 | model.save_pretrained(SAVE_DIR, save_compressed=True) 39 | processor.save_pretrained(SAVE_DIR) 40 | -------------------------------------------------------------------------------- /setup_env.sh: -------------------------------------------------------------------------------- 1 | GIT_LFS_SKIP_SMUDGE=1 pip install -e ".[dev]" 2 | pip install torch==2.7.1 3 | pip install torchaudio==2.7.1 4 | pip install flash-attn==2.7.4.post1 --no-build-isolation 5 | pip install trl==0.21.0 6 | pip install vllm==0.10.1 7 | # replace vllm/vllm/lora/models.py with vllm_replacement/models.py 8 | site_pkg_path=$(python -c 'import site; print(site.getsitepackages()[0])') 9 | cp -rv replacement/vllm_replacement/models.py $site_pkg_path/vllm/lora/models.py 10 | # replace vllm/vllm/lora/worker_manager.py with vllm_replacement/worker_manager.py 11 | cp -rv replacement/vllm_replacement/worker_manager.py $site_pkg_path/vllm/lora/worker_manager.py 12 | # make an empty folder to pass asserts in vllm lora requests 13 | mkdir -p simon_lora_path simon_stub_path 14 | 15 | pip install peft 16 | 17 | git clone --branch 0.11.0 --depth 1 https://github.com/neuralmagic/compressed-tensors.git 18 | cd compressed-tensors 19 | pip install -e . --no-deps 20 | cd .. 21 | # replace compressed-tensors/src/compressed_tensors/linear/compressed_linear.py with compressed-tensors_replacement/compressed_linear.py 22 | cp replacement/compressed-tensors_replacement/compressed_linear.py compressed-tensors/src/compressed_tensors/linear/compressed_linear.py 23 | # replace compressed-tensors/src/compressed_tensors/quantization/lifecycle/forward.py with compressed-tensors_replacement/forward.py 24 | cp replacement/compressed-tensors_replacement/forward.py compressed-tensors/src/compressed_tensors/quantization/lifecycle/forward.py 25 | 26 | pip install accelerate==1.10.1 --no-deps 27 | 28 | site_pkg_path=$(python -c 'import site; print(site.getsitepackages()[0])') 29 | cp -rv replacement/trainer.py $site_pkg_path/transformers/trainer.py 30 | -------------------------------------------------------------------------------- /llm-compressor/tests/unit/core/events/test_event.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from llmcompressor.core import Event, EventType 4 | 5 | 6 | @pytest.mark.smoke 7 | def test_event_epoch_based(): 8 | event = Event(steps_per_epoch=10) 9 | assert event.epoch_based is True 10 | 11 | 12 | @pytest.mark.smoke 13 | def test_event_epoch(): 14 | event = Event(steps_per_epoch=10, global_step=25) 15 | assert event.epoch == 2 16 | 17 | 18 | @pytest.mark.smoke 19 | def test_event_epoch_full(): 20 | event = Event(steps_per_epoch=10, global_step=25) 21 | assert event.epoch_full == 2.5 22 | 23 | 24 | @pytest.mark.smoke 25 | def test_event_epoch_step(): 26 | event = Event(steps_per_epoch=10, global_step=25) 27 | assert event.epoch_step == 5 28 | 29 | 30 | @pytest.mark.smoke 31 | def test_event_epoch_batch(): 32 | event = Event( 33 | steps_per_epoch=10, global_step=25, batches_per_step=2, 
global_batch=50 34 | ) 35 | assert event.epoch_batch == 10 36 | 37 | 38 | @pytest.mark.smoke 39 | def test_event_current_index(): 40 | event = Event(steps_per_epoch=10, global_step=25) 41 | assert event.current_index == 2.5 42 | 43 | 44 | @pytest.mark.smoke 45 | def test_event_should_update(): 46 | event = Event(steps_per_epoch=10, global_step=25) 47 | assert event.should_update(start=0, end=30, update=2.5) is True 48 | assert event.should_update(start=0, end=20, update=5) is False 49 | assert event.should_update(start=0, end=30, update=0) is True 50 | 51 | 52 | @pytest.mark.smoke 53 | def test_event_new_instance(): 54 | event = Event(type_=EventType.INITIALIZE, global_step=25) 55 | new_event = event.new_instance(global_step=30) 56 | assert new_event.global_step == 30 57 | assert new_event.type_ == EventType.INITIALIZE 58 | -------------------------------------------------------------------------------- /llm-compressor/docs/developer/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | weight: -3 3 | --- 4 | 5 | # Developer 6 | 7 | Welcome to the Developer section of LLM Compressor! This area provides essential resources for developers who want to contribute to or extend LLM Compressor. Whether you're interested in fixing bugs, adding new features, improving documentation, or understanding the project's governance, you'll find comprehensive guides to help you get started. 8 | 9 | LLM Compressor is an open-source project that values community contributions. We maintain high standards for code quality, documentation, and community interactions to ensure that LLM Compressor remains a robust, reliable, and user-friendly tool for compressing large language models. 10 | 11 | ## Developer Resources 12 | 13 |
14 | 15 | - :material-handshake:{ .lg .middle } Code of Conduct 16 | 17 | --- 18 | 19 | Our community guidelines ensure that participation in the LLM Compressor project is a positive, inclusive, and respectful experience for everyone. 20 | 21 | [:octicons-arrow-right-24: Code of Conduct](code-of-conduct.md) 22 | 23 | - :material-source-pull:{ .lg .middle } Contributing Guide 24 | 25 | --- 26 | 27 | Learn how to effectively contribute to LLM Compressor, including reporting bugs, suggesting features, improving documentation, and submitting code. 28 | 29 | [:octicons-arrow-right-24: Contributing Guide](contributing.md) 30 | 31 | - :material-tools:{ .lg .middle } Development Guide 32 | 33 | --- 34 | 35 | Detailed instructions for setting up your development environment, implementing changes, and adhering to the project's coding standards and best practices. 36 | 37 | [:octicons-arrow-right-24: Development Guide](developing.md) 38 | 39 |
40 | -------------------------------------------------------------------------------- /src/open_r1/utils/model_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizer 3 | 4 | from trl import ModelConfig, get_kbit_device_map, get_quantization_config 5 | 6 | from ..configs import GRPOConfig, SFTConfig 7 | 8 | 9 | def get_tokenizer(model_args: ModelConfig, training_args: SFTConfig | GRPOConfig) -> PreTrainedTokenizer: 10 | """Get the tokenizer for the model.""" 11 | tokenizer = AutoTokenizer.from_pretrained( 12 | model_args.model_name_or_path, 13 | revision=model_args.model_revision, 14 | trust_remote_code=model_args.trust_remote_code, 15 | ) 16 | 17 | if training_args.chat_template is not None: 18 | tokenizer.chat_template = training_args.chat_template 19 | 20 | return tokenizer 21 | 22 | 23 | def get_model(model_args: ModelConfig, training_args: SFTConfig | GRPOConfig) -> AutoModelForCausalLM: 24 | """Get the model""" 25 | torch_dtype = ( 26 | model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype) 27 | ) 28 | quantization_config = get_quantization_config(model_args) 29 | model_kwargs = dict( 30 | revision=model_args.model_revision, 31 | trust_remote_code=model_args.trust_remote_code, 32 | attn_implementation=model_args.attn_implementation, 33 | torch_dtype=torch_dtype, 34 | use_cache=False if training_args.gradient_checkpointing else True, 35 | device_map=get_kbit_device_map() if quantization_config is not None else None, 36 | quantization_config=quantization_config, 37 | ) 38 | model = AutoModelForCausalLM.from_pretrained( 39 | model_args.model_name_or_path, 40 | **model_kwargs, 41 | ) 42 | return model 43 | -------------------------------------------------------------------------------- /llm-compressor/tests/llmcompressor/transformers/sparsegpt/test_sparsegpt_lm_head.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock 2 | 3 | import pytest 4 | import torch 5 | from transformers import AutoModelForCausalLM 6 | 7 | from llmcompressor.core.state import State 8 | from llmcompressor.modifiers.pruning.sparsegpt import SparseGPTModifier 9 | 10 | 11 | @pytest.fixture 12 | def model(): 13 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 14 | return AutoModelForCausalLM.from_pretrained( 15 | "nm-testing/tinysmokellama-3.2", device_map=device 16 | ) 17 | 18 | 19 | @pytest.fixture 20 | def dataloader(): 21 | dataset = MagicMock() 22 | dataset.column_names = [] 23 | dataloader = MagicMock() 24 | dataloader.dataset = dataset 25 | dataloader.__iter__.return_value = iter([]) 26 | return dataloader 27 | 28 | 29 | @pytest.mark.integration 30 | @pytest.mark.parametrize("extra_targets,expected", [([], 0), (["lm_head"], 1)]) 31 | def test_lm_head(extra_targets, expected, model, dataloader): 32 | kwargs = { 33 | "sparsity": 0.5, 34 | "block_size": 128, 35 | "targets": [ 36 | "model.layers.0", 37 | "model.layers.1", 38 | "model.layers.2", 39 | "model.layers.3", 40 | "model.layers.4", 41 | "model.layers.5", 42 | ] 43 | + extra_targets, 44 | } 45 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 46 | 47 | modifier = SparseGPTModifier(**kwargs) 48 | 49 | state = State() 50 | state.update(model=model, device=device, calib_data=dataloader) 51 | modifier.initialize(state) 52 | modifier.on_start(state, None) 53 | 54 | assert 
len(model.lm_head._forward_hooks) == expected 55 | 56 | modifier.finalize(state) 57 | --------------------------------------------------------------------------------
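The e2e harness passes its configuration to run_vllm.py as three positional JSON strings: the scheme name, the vLLM engine kwargs, and the prompt list. The snippet below is a minimal sketch of that calling convention, assuming it is run from the repository root; the checkpoint directory (the SAVE_DIR produced by the FP8 examples above), the max_model_len value, and the prompts are illustrative, and in the test suite these arguments are normally assembled from the e2e configs by run_tests.sh rather than by hand.

import json
import subprocess

# Illustrative inputs: a scheme name, vLLM engine kwargs pointing at a locally
# saved compressed-tensors checkpoint, and a couple of prompts.
scheme = "FP8_DYNAMIC"
llm_kwargs = {"model": "./Qwen2.5-VL-7B-Instruct-FP8-Dynamic", "max_model_len": 4096}
prompts = ["Hello my name is", "The capital of France is"]

# parse_args() calls json.loads() on each positional argument, so every argument
# must be a valid JSON document -- including the plain scheme string.
subprocess.run(
    [
        "python",
        "llm-compressor/tests/e2e/vLLM/run_vllm.py",
        json.dumps(scheme),
        json.dumps(llm_kwargs),
        json.dumps(prompts),
    ],
    check=True,
)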