├── .MAINTAINERS ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── doc-edit.md │ └── feature_request.md ├── PULL_REQUEST_TEMPLATE.md ├── TODO.txt └── workflows │ ├── build-and-publish-release-images.yaml │ ├── linkcheck.yml │ ├── linkspector │ └── linkspector.yml │ ├── quality-check.yaml │ ├── result.xml.fail │ ├── result.xml.success │ ├── set-comment.yaml │ ├── test-check-transformers.yaml │ └── test-check.yaml ├── .gitignore ├── CONTRIBUTING.md ├── DEVELOPING.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── NOTICE ├── README.md ├── docs ├── save_pretrained.md └── schemes.md ├── examples ├── awq │ ├── README.md │ ├── llama_example.py │ └── qwen3_moe_example.py ├── big_models_with_accelerate │ ├── README.md │ ├── cpu_offloading_fp8.py │ ├── mult_gpus_int8_device_map.py │ └── multi_gpu_int8.py ├── compressed_inference │ └── fp8_compressed_inference.py ├── finetuning │ ├── configure_fsdp.md │ ├── example_alternating_recipe.yaml │ ├── example_fsdp_config.yaml │ └── example_single_gpu_config.yaml ├── multimodal_audio │ ├── README.md │ └── whisper_example.py ├── multimodal_vision │ ├── README.md │ ├── gemma3_example.py │ ├── idefics3_example.py │ ├── llava_example.py │ ├── mistral3_chat_template.json │ ├── mistral3_example.py │ ├── mllama_example.py │ ├── phi3_vision_example.py │ ├── pixtral_example.py │ ├── qwen2_vl_example.py │ └── qwen_2_5_vl_example.py ├── quantization_2of4_sparse_w4a16 │ ├── 2of4_w4a16_group-128_recipe.yaml │ ├── 2of4_w4a16_recipe.yaml │ ├── README.md │ └── llama7b_sparse_w4a16.py ├── quantization_kv_cache │ ├── README.md │ ├── gemma2_fp8_kv_example.py │ ├── llama3_fp8_kv_example.py │ └── phi3.5_fp8_kv_example.py ├── quantization_w4a16 │ ├── README.md │ └── llama3_example.py ├── quantization_w4a16_fp4 │ └── llama3_example.py ├── quantization_w4a4_fp4 │ └── llama3_example.py ├── quantization_w8a8_fp8 │ ├── README.md │ ├── gemma2_example.py │ ├── llama3.2_vision_example.py │ ├── llama3_example.py │ ├── llava1.5_example.py │ ├── qwen2vl_example.py │ └── whisper_example.py ├── quantization_w8a8_int8 │ ├── README.md │ ├── gemma2_example.py │ └── llama3_example.py ├── quantizing_moe │ ├── README.md │ ├── deepseek_moe_w4a16.py │ ├── deepseek_moe_w8a8_fp8.py │ ├── deepseek_moe_w8a8_int8.py │ ├── deepseek_recipe_w4a16.yaml │ ├── mixtral_moe_w8a8_fp8.py │ └── qwen_moe_w4a16.py ├── sparse_2of4_quantization_fp8 │ ├── README.md │ └── llama3_8b_2of4.py └── trl_mixin │ ├── README.md │ ├── ex_trl_constant.py │ ├── ex_trl_distillation.py │ └── sft_trainer.py ├── pyproject.toml ├── setup.py ├── src └── llmcompressor │ ├── __init__.py │ ├── args │ ├── README.md │ ├── __init__.py │ ├── dataset_arguments.py │ ├── model_arguments.py │ ├── recipe_arguments.py │ ├── training_arguments.py │ └── utils.py │ ├── core │ ├── __init__.py │ ├── events │ │ ├── __init__.py │ │ └── event.py │ ├── helpers.py │ ├── lifecycle.py │ ├── model_layer.py │ ├── session.py │ ├── session_functions.py │ └── state.py │ ├── datasets │ ├── __init__.py │ └── utils.py │ ├── entrypoints │ ├── README.md │ ├── __init__.py │ ├── oneshot.py │ ├── train.py │ └── utils.py │ ├── logger.py │ ├── metrics │ ├── __init__.py │ ├── logger.py │ └── utils │ │ ├── __init__.py │ │ └── frequency_manager.py │ ├── modifiers │ ├── README.md │ ├── __init__.py │ ├── awq │ │ ├── __init__.py │ │ ├── base.py │ │ └── mappings.py │ ├── distillation │ │ ├── __init__.py │ │ ├── output │ │ │ ├── __init__.py │ │ │ └── base.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ └── pytorch │ │ │ ├── __init__.py │ │ │ ├── kd_factory.py │ │ │ ├── kd_wrapper.py │ 
│ │ └── model_wrapper.py │ ├── experimental │ │ └── __init__.py │ ├── factory.py │ ├── interface.py │ ├── logarithmic_equalization │ │ ├── __init__.py │ │ └── base.py │ ├── modifier.py │ ├── obcq │ │ ├── __init__.py │ │ ├── base.py │ │ ├── sgpt_base.py │ │ └── sgpt_sparsify.py │ ├── pruning │ │ ├── __init__.py │ │ ├── constant │ │ │ ├── __init__.py │ │ │ └── base.py │ │ ├── helpers.py │ │ ├── magnitude │ │ │ ├── __init__.py │ │ │ └── base.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ └── pytorch │ │ │ │ ├── __init__.py │ │ │ │ ├── layer_mask.py │ │ │ │ └── mask_factory.py │ │ └── wanda │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ └── wanda_sparsify.py │ ├── quantization │ │ ├── __init__.py │ │ ├── cache.py │ │ ├── calibration.py │ │ ├── gptq │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ └── gptq_quantize.py │ │ └── quantization │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ └── mixin.py │ ├── smoothquant │ │ ├── README.md │ │ ├── __init__.py │ │ ├── base.py │ │ └── utils.py │ ├── stage.py │ └── utils │ │ ├── __init__.py │ │ ├── constants.py │ │ ├── helpers.py │ │ ├── hooks.py │ │ └── pytorch_helpers.py │ ├── observers │ ├── __init__.py │ ├── base.py │ ├── helpers.py │ ├── min_max.py │ └── mse.py │ ├── pipelines │ ├── __init__.py │ ├── basic │ │ ├── __init__.py │ │ └── pipeline.py │ ├── cache.py │ ├── data_free │ │ ├── __init__.py │ │ └── pipeline.py │ ├── independent │ │ ├── __init__.py │ │ └── pipeline.py │ ├── layer_sequential │ │ ├── __init__.py │ │ ├── helpers.py │ │ └── pipeline.py │ ├── registry.py │ └── sequential │ │ ├── README.md │ │ ├── __init__.py │ │ ├── ast_helpers.py │ │ ├── ast_utils │ │ ├── auto_wrapper.py │ │ ├── control_flow_analyzer.py │ │ └── name_analyzer.py │ │ ├── helpers.py │ │ └── pipeline.py │ ├── pytorch │ ├── __init__.py │ ├── model_load │ │ ├── __init__.py │ │ └── helpers.py │ └── utils │ │ ├── __init__.py │ │ ├── helpers.py │ │ ├── sparsification.py │ │ └── sparsification_info │ │ ├── __init__.py │ │ ├── configs.py │ │ ├── helpers.py │ │ └── module_sparsification_info.py │ ├── recipe │ ├── __init__.py │ ├── base.py │ ├── metadata.py │ ├── modifier.py │ ├── recipe.py │ └── stage.py │ ├── sentinel.py │ ├── transformers │ ├── __init__.py │ ├── compression │ │ ├── __init__.py │ │ ├── helpers.py │ │ ├── quantization_format.py │ │ └── sparsity_metadata_config.py │ ├── finetune │ │ ├── README.md │ │ ├── __init__.py │ │ ├── callbacks.py │ │ ├── data │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── c4.py │ │ │ ├── cnn_dailymail.py │ │ │ ├── custom.py │ │ │ ├── data_helpers.py │ │ │ ├── evolcodealpaca.py │ │ │ ├── flickr_30k.py │ │ │ ├── gsm8k.py │ │ │ ├── open_platypus.py │ │ │ ├── peoples_speech.py │ │ │ ├── ptb.py │ │ │ ├── ultrachat_200k.py │ │ │ └── wikitext.py │ │ ├── session_mixin.py │ │ ├── text_generation.py │ │ └── trainer.py │ ├── sparsification │ │ ├── __init__.py │ │ ├── compressed_tensors_utils.py │ │ └── sparse_model.py │ ├── tracing │ │ ├── __init__.py │ │ └── debug.py │ └── utils │ │ ├── __init__.py │ │ ├── helpers.py │ │ └── preprocessing_functions.py │ ├── typing.py │ └── utils │ ├── __init__.py │ ├── dev.py │ ├── fsdp │ ├── __init__.py │ ├── context.py │ └── helpers.py │ ├── helpers.py │ ├── metric_logging.py │ └── pytorch │ ├── __init__.py │ ├── module.py │ └── utils.py └── tests ├── __init__.py ├── custom_test.py ├── data.py ├── e2e ├── __init__.py ├── e2e_utils.py └── vLLM │ ├── __init__.py │ ├── configs │ ├── fp8_dynamic_per_token.yaml │ ├── fp8_dynamic_per_token_qwen.yaml │ ├── fp8_static_per_tensor.yaml │ ├── fp8_weight_only_channel.yaml │ ├── 
fp8_weight_only_tensor.yaml │ ├── int8_channel_weight_static_per_tensor_act.yaml │ ├── int8_dynamic_per_token.yaml │ ├── int8_tensor_weight_static_per_tensor_act.yaml │ ├── int8_tensor_weight_static_per_tensor_act_qwen.yaml │ ├── kv_cache_gptq_tinyllama.yaml │ ├── kv_cache_phi3.yaml │ ├── kv_cache_tinyllama.yaml │ ├── sparse2of4_fp8_dynamic.yaml │ ├── sparse2of4_fp8_dynamic_qwen.yaml │ ├── sparse_24.yaml │ ├── w4a16_2of4_channel_quant.yaml │ ├── w4a16_2of4_grouped_quant.yaml │ ├── w4a16_actorder_group.yaml │ ├── w4a16_actorder_group_qwen.yaml │ ├── w4a16_actorder_weight.yaml │ ├── w4a16_actorder_weight_qwen.yaml │ ├── w4a16_channel_quant.yaml │ ├── w4a16_channel_quant_qwen.yaml │ ├── w4a16_grouped_quant.yaml │ ├── w4a16_grouped_quant_asym_awq.yaml │ ├── w8a16_channel_quant.yaml │ ├── w8a16_grouped_quant.yaml │ ├── w8a8_dynamic_asym.yaml │ └── w8a8_static_asym.yaml │ ├── recipes │ ├── FP8 │ │ ├── recipe_fp8_weight_only_channel.yaml │ │ └── recipe_fp8_weight_only_per_tensor.yaml │ ├── INT8 │ │ ├── recipe_int8_channel_weight_dynamic_per_token.yaml │ │ ├── recipe_int8_channel_weight_static_per_tensor_act.yaml │ │ ├── recipe_int8_tensor_weight_static_per_tensor_act.yaml │ │ ├── recipe_w8a8_dynamic_asym.yaml │ │ └── recipe_w8a8_static_asym.yaml │ ├── Sparse_2of4 │ │ ├── recipe_sparse_2of4.yaml │ │ └── recipe_sparse_2of4_fp8_dynamic.yaml │ ├── WNA16 │ │ ├── recipe_w4a16_channel_quant.yaml │ │ ├── recipe_w4a16_group_quant_asym_awq.yaml │ │ └── recipe_w8a16_channel_quant.yaml │ ├── WNA16_2of4 │ │ ├── 2of4_w4a16_group-128_recipe.yaml │ │ └── 2of4_w4a16_recipe.yaml │ ├── actorder │ │ ├── recipe_w4a16_actorder_group.yaml │ │ └── recipe_w4a16_actorder_weight.yaml │ └── kv_cache │ │ ├── default.yaml │ │ └── gptq.yaml │ ├── run_tests.sh │ ├── skipped_configs │ └── fp4_nvfp4a16.yaml │ └── test_vllm.py ├── examples ├── __init__.py ├── test_big_models_with_accelerate.py ├── test_compressed_inference.py ├── test_quantization_2of4_sparse_w4a16.py ├── test_quantization_kv_cache.py ├── test_quantization_w4a16.py ├── test_quantization_w8a8_fp8.py ├── test_quantization_w8a8_int8.py ├── test_quantizing_moe.py ├── test_sparse_2of4_quantization_fp8.py ├── test_trl_mixin.py └── utils.py ├── llmcompressor ├── __init__.py ├── conftest.py ├── helpers.py ├── metrics │ ├── __init__.py │ ├── test_logger.py │ └── utils │ │ ├── __init__.py │ │ └── test_frequency_manager.py ├── modifiers │ ├── __init__.py │ ├── awq │ │ ├── __init__.py │ │ └── test_base.py │ ├── calibration │ │ ├── __init__.py │ │ ├── test_cache.py │ │ ├── test_frozen.py │ │ ├── test_kv_cache.py │ │ └── test_observers.py │ ├── conf.py │ ├── logarithmic_equalization │ │ ├── __init__.py │ │ └── test_base.py │ ├── pruning │ │ ├── __init__.py │ │ ├── sparsegpt │ │ │ ├── __init__.py │ │ │ └── test_base.py │ │ └── wanda │ │ │ ├── __init__.py │ │ │ └── test_base.py │ ├── quantization │ │ ├── __init__.py │ │ └── test_base.py │ ├── smoothquant │ │ ├── __init__.py │ │ ├── test_base.py │ │ └── test_utils.py │ └── utils │ │ └── test_hooks.py ├── observers │ ├── __init__.py │ ├── test_helpers.py │ ├── test_min_max.py │ └── test_mse.py ├── pipelines │ ├── sequential │ │ ├── ast_utils.py │ │ │ └── test_auto_wrapper.py │ │ └── test_helpers.py │ └── test_cache.py ├── pytorch │ ├── __init__.py │ ├── helpers.py │ ├── modifiers │ │ ├── __init__.py │ │ ├── logarithmic_equalization │ │ │ ├── __init__.py │ │ │ └── test_pytorch.py │ │ ├── pruning │ │ │ ├── __init__.py │ │ │ ├── constant │ │ │ │ ├── __init__.py │ │ │ │ └── test_pytorch.py │ │ │ ├── sparsegpt │ │ │ │ ├── __init__.py │ 
│ │ │ └── test_pytorch.py │ │ │ └── wanda │ │ │ │ └── test_pytorch.py │ │ └── smoothquant │ │ │ ├── __init__.py │ │ │ └── test_pytorch.py │ └── utils │ │ ├── __init__.py │ │ └── test_helpers.py ├── recipe │ ├── __init__.py │ ├── recipe.yaml │ ├── test_recipe.py │ └── test_recipe_parsing.py ├── test_sentinel.py ├── transformers │ ├── __init__.py │ ├── compression │ │ ├── __init__.py │ │ ├── configs │ │ │ ├── actorder_group_1.1b.yaml │ │ │ ├── actorder_weight_1.1b.yaml │ │ │ ├── channelwise_1.1b.yaml │ │ │ ├── channelwise_15m.yaml │ │ │ ├── fp8_1.1b.yaml │ │ │ ├── fp8_15m.yaml │ │ │ ├── group_1.1b.yaml │ │ │ ├── inputs_1.1b.yaml │ │ │ ├── inputs_15m.yaml │ │ │ ├── weights_only_1.1b.yaml │ │ │ └── weights_only_15m.yaml │ │ ├── decompression_configs │ │ │ ├── fp8_dynamic.yaml │ │ │ ├── w4a16.yaml │ │ │ └── w8a16_dense.yaml │ │ ├── decompression_configs_skipped │ │ │ └── w8a8.yaml │ │ ├── recipes │ │ │ ├── new_quant_actorder_group.yaml │ │ │ ├── new_quant_actorder_weight.yaml │ │ │ ├── new_quant_channel.yaml │ │ │ ├── new_quant_fp8.yaml │ │ │ ├── new_quant_full.yaml │ │ │ ├── new_quant_group.yaml │ │ │ ├── new_quant_simple.yaml │ │ │ ├── new_quant_weight.yaml │ │ │ ├── sparse_24.yaml │ │ │ └── sparse_24_fp8.yaml │ │ ├── run_compressed_configs │ │ │ ├── fp8_dynamic.yaml │ │ │ ├── w4a16.yaml │ │ │ └── w8a16.yaml │ │ ├── run_compressed_configs_skipped │ │ │ └── w8a8.yaml │ │ ├── test_decompress.py │ │ ├── test_has_gpu.py │ │ ├── test_helpers.py │ │ ├── test_infer_quant_format.py │ │ ├── test_quantization.py │ │ ├── test_run_compressed.py │ │ └── test_sparsity_metadata_config.py │ ├── conftest.py │ ├── finetune │ │ ├── __init__.py │ │ ├── data │ │ │ ├── __init__.py │ │ │ ├── conftest.py │ │ │ ├── test_dataset_helpers.py │ │ │ ├── test_dataset_loading.py │ │ │ └── test_registry.py │ │ ├── finetune_custom │ │ │ ├── config1.yaml │ │ │ ├── config2.yaml │ │ │ └── gpu │ │ │ │ └── gpu_config.yaml │ │ ├── finetune_generic │ │ │ └── config1.yaml │ │ ├── finetune_oneshot_configs │ │ │ ├── config.yaml │ │ │ └── gpu │ │ │ │ └── gpu_config.yaml │ │ ├── finetune_tokenizer │ │ │ └── config1.yaml │ │ ├── test_alternate_recipe.yaml │ │ ├── test_finetune_no_recipe_custom_dataset.py │ │ ├── test_finetune_recipe.yaml │ │ ├── test_finetune_without_recipe.py │ │ ├── test_oneshot_and_finetune.py │ │ ├── test_oneshot_and_finetune_with_tokenizer.py │ │ ├── test_oneshot_then_finetune.py │ │ ├── test_quantization.yaml │ │ ├── test_safetensors.py │ │ └── test_session_mixin.py │ ├── gptq │ │ └── test_oneshot.py │ ├── kv_cache │ │ └── test_kv_cache.py │ ├── obcq │ │ ├── __init__.py │ │ ├── obcq_configs │ │ │ ├── completion │ │ │ │ ├── gpu │ │ │ │ │ ├── llama_7b_quant.yaml │ │ │ │ │ ├── llama_7b_quant_and_sparse.yaml │ │ │ │ │ └── llama_7b_sparse.yml │ │ │ │ ├── tiny_llama_quant.yaml │ │ │ │ └── tiny_llama_quant_and_sparse.yaml │ │ │ ├── consec_runs │ │ │ │ ├── gpu │ │ │ │ │ └── llama_consec_runs.yaml │ │ │ │ └── tiny_llama_consec_runs.yaml │ │ │ ├── mask_structure │ │ │ │ └── tiny_llama_mask_structure_preservation.yaml │ │ │ ├── sparse │ │ │ │ ├── gpu │ │ │ │ │ └── llama_7b_sparse.yaml │ │ │ │ └── tiny_llama_sparse.yaml │ │ │ └── sparsity_generic │ │ │ │ └── config.yaml │ │ ├── recipes │ │ │ ├── additional_sparsity.yaml │ │ │ ├── additional_sparsity_with_quant.yaml │ │ │ ├── quant.yaml │ │ │ ├── quant_and_sparse.yaml │ │ │ ├── sparse.yaml │ │ │ ├── sparse_with_mask_structure.yaml │ │ │ └── test_tiny2.yaml │ │ ├── test_consecutive_runs.py │ │ ├── test_mask_structure_preservation.py │ │ ├── test_obcq_completion.py │ │ ├── 
test_obcq_infer_targets.py │ │ ├── test_obcq_lm_head.py │ │ ├── test_obcq_owl.py │ │ ├── test_obcq_sparsity.py │ │ └── test_oneshot_with_modifier.py │ ├── oneshot │ │ ├── __init__.py │ │ ├── dataset_processing.py │ │ ├── oneshot_configs │ │ │ ├── recipes │ │ │ │ └── recipe.yaml │ │ │ ├── tiny_stories_conf1.yaml │ │ │ ├── tiny_stories_conf2.yaml │ │ │ ├── tiny_stories_conf3.yaml │ │ │ ├── tiny_stories_conf4.yaml │ │ │ ├── tiny_stories_conf5.yaml │ │ │ └── tiny_stories_conf6.yaml │ │ └── test_api_inputs.py │ ├── sparsification │ │ ├── __init__.py │ │ └── test_compress_tensor_utils.py │ └── tracing │ │ └── test_models.py └── utils │ ├── __init__.py │ ├── pytorch │ ├── __init__.py │ └── test_module.py │ └── test_helpers.py ├── lmeval ├── __init__.py ├── configs │ ├── fp8_dynamic_per_token.yaml │ ├── fp8_static_per_tensor.yaml │ ├── int8_w8a8_dynamic_per_token.yaml │ ├── vl_fp8_dynamic_per_token.yaml │ ├── vl_int8_w8a8_dynamic_per_token.yaml │ ├── vl_w4a16_actorder_weight.yaml │ ├── w4a16_actorder_group.yaml │ ├── w4a16_actorder_weight.yaml │ └── w4a16_grouped_quant.yaml └── test_lmeval.py ├── test_timer ├── __init__.py ├── timer.py └── timer_utils.py ├── testing_utils.py └── unit ├── __init__.py ├── core ├── __init__.py ├── events │ ├── __init__.py │ └── test_event.py └── test_state.py └── test_logger.py /.MAINTAINERS: -------------------------------------------------------------------------------- 1 | # list of active maintainers 2 | # uncommented maintainers will be included in code review triage 3 | 4 | markurtz 5 | dsikka 6 | rahul-tuli 7 | horheynm 8 | brian-dellabetta 9 | kylesayrs 10 | 11 | # mgoin 12 | # anmarques 13 | # eldarkurtic 14 | # chibukach 15 | # shubhra 16 | # abhinavnmagic 17 | # eiofinov 18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | labels: bug 5 | 6 | --- 7 | 8 | **Describe the bug** 9 | A clear and concise description of what the bug is. 10 | 11 | **Expected behavior** 12 | A clear and concise description of what you expected to happen. 13 | 14 | **Environment** 15 | Include all relevant environment information: 16 | 1. OS [e.g. Ubuntu 20.04]: 17 | 2. Python version [e.g. 3.7]: 18 | 3. LLM Compressor version or commit hash [e.g. 0.1.0, `f7245c8`]: 19 | 4. ML framework version(s) [e.g. torch 2.3.1]: 20 | 5. Other Python package versions [e.g. vLLM, compressed-tensors, numpy, ONNX]: 21 | 6. Other relevant environment information [e.g. hardware, CUDA version]: 22 | 23 | **To Reproduce** 24 | Exact steps to reproduce the behavior: 25 | 26 | 27 | **Errors** 28 | If applicable, add a full print-out of any errors or exceptions that are raised or include screenshots to help explain your problem. 29 | 30 | **Additional context** 31 | Add any other context about the problem here. Also include any relevant files. 32 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/doc-edit.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Doc edit 3 | about: Propose changes to project documentation 4 | labels: documentation 5 | 6 | --- 7 | 8 | **What is the URL, file, or UI containing proposed doc change** 9 | Where does one find the original content or where would this change go? 
10 | 11 | **What is the current content or situation in question** 12 | Copy/paste the source content or describe the gap. 13 | 14 | **What is the proposed change** 15 | Add new content. 16 | 17 | **Additional context** 18 | Add any other context about the change here. Also include any relevant files or URLs. 19 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | labels: enhancement 5 | 6 | --- 7 | 8 | **Is your feature request related to a problem? Please describe.** 9 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 10 | 11 | **Describe the solution you'd like** 12 | A clear and concise description of what you want to happen. 13 | 14 | **Describe alternatives you've considered** 15 | A clear and concise description of any alternative solutions or features you've considered. 16 | 17 | **Additional context** 18 | Add any other context or screenshots about the feature request here. 19 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | SUMMARY: 2 | "please provide a brief summary" 3 | 4 | 5 | TEST PLAN: 6 | "please outline how the changes were tested" 7 | -------------------------------------------------------------------------------- /.github/TODO.txt: -------------------------------------------------------------------------------- 1 | TODO: update for upstream push -------------------------------------------------------------------------------- /.github/workflows/linkcheck.yml: -------------------------------------------------------------------------------- 1 | name: Check Markdown links 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | # Allows you to run this workflow manually from the Actions tab 12 | workflow_dispatch: 13 | 14 | jobs: 15 | markdown-link-check: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v4 19 | - uses: umbrelladocs/action-linkspector@v1 20 | with: 21 | github_token: ${{ secrets.github_token }} 22 | reporter: github-pr-review 23 | fail_on_error: true 24 | config_file: '.github/workflows/linkspector/linkspector.yml' 25 | -------------------------------------------------------------------------------- /.github/workflows/linkspector/linkspector.yml: -------------------------------------------------------------------------------- 1 | aliveStatusCodes: 2 | - 0 3 | - 200 4 | ignorePatterns: 5 | - pattern: '.*localhost.*' 6 | - pattern: '.*127\\.0\\.0\\.1.*' 7 | - pattern: '.*0\\.0\\.0\\.0.*' 8 | dirs: 9 | - . 
10 | useGitIgnore: true -------------------------------------------------------------------------------- /.github/workflows/quality-check.yaml: -------------------------------------------------------------------------------- 1 | name: Quality Checks 2 | on: 3 | push: 4 | branches: 5 | - main 6 | - 'release/*' 7 | pull_request: 8 | branches: 9 | - main 10 | - 'release/*' 11 | jobs: 12 | quality-check: 13 | runs-on: ubuntu-22.04 14 | steps: 15 | - uses: actions/setup-python@v5 16 | with: 17 | python-version: '3.9' 18 | - uses: actions/checkout@v4 19 | - name: "⚙️ Install dependencies" 20 | run: pip3 install .[dev] 21 | - name: "🧹 Running quality checks" 22 | run: make quality 23 | -------------------------------------------------------------------------------- /.github/workflows/result.xml.fail: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /.github/workflows/result.xml.success: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.github/workflows/set-comment.yaml: -------------------------------------------------------------------------------- 1 | name: PR Reminder Comment Bot 2 | on: 3 | pull_request_target: 4 | branches: [main] 5 | types: [opened] 6 | 7 | jobs: 8 | pr_reminder: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Remind to add ready label 12 | uses: actions/github-script@v7 13 | with: 14 | script: | 15 | github.rest.issues.createComment({ 16 | owner: context.repo.owner, 17 | repo: context.repo.repo, 18 | issue_number: context.issue.number, 19 | body: '👋 Hi! Thank you for contributing to llm-compressor. Please add the ready label when the PR is ready for review.\n\n**Note:** This is required to complete the testing suite, please only add the label once the PR is code complete and local testing has been performed.' 20 | }) 21 | env: 22 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 23 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to LLM Compressor 2 | 3 | Thank you for your interest in contributing to LLM Compressor! 4 | Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. 5 | There are several ways you can contribute to the project: 6 | 7 | - Identify and report any issues or bugs. 8 | - Request or add new compression methods or research. 9 | - Suggest or implement new features. 10 | 11 | However, remember that contributions aren't just about code. 12 | We believe in the power of community support; thus, answering queries, assisting others, and enhancing the documentation are highly regarded and beneficial contributions. 13 | 14 | Finally, one of the most impactful ways to support us is by raising awareness about LLM Compressor and the vLLM community. 15 | Talk about it in your blog posts, highlighting how it's driving your incredible projects. 16 | Express your support on Twitter if vLLM aids you, or simply offer your appreciation by starring our repository. 
17 | 18 | ## Setup for development 19 | 20 | ### Install from source 21 | 22 | ```bash 23 | pip install -e ./[dev] 24 | ``` 25 | 26 | ### Code Styling and Formatting checks 27 | 28 | ```bash 29 | make style 30 | make quality 31 | ``` 32 | 33 | ### Testing 34 | 35 | ```bash 36 | make test 37 | ``` 38 | 39 | ## Contributing Guidelines 40 | 41 | ### Issue Reporting 42 | 43 | If you encounter a bug or have a feature request, please check our issues page first to see if someone else has already reported it. 44 | If not, please file a new issue, providing as much relevant information as possible. 45 | 46 | ### Pull Requests & Code Reviews 47 | 48 | Please check the PR checklist in the [PR template](.github/PULL_REQUEST_TEMPLATE.md) for detailed guide for contribution. 49 | 50 | ### Thank You 51 | 52 | Finally, thank you for taking the time to read these guidelines and for your interest in contributing to LLM Compressor. 53 | Your contributions make LLM Compressor a great tool for everyone! 54 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | recursive-exclude src *.png *.jpg *.jpeg *.gif *.svg *.bmp *.webp 3 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | BUILDDIR := $(PWD) 2 | CHECKDIRS := src tests examples setup.py 3 | DOCDIR := docs 4 | 5 | BUILD_ARGS := # set nightly to build nightly release 6 | 7 | # refer to setup.py for allowed values for BUILD_TYPE 8 | BUILD_TYPE?=dev 9 | export BUILD_TYPE 10 | 11 | TARGETS := "" # targets for running pytests: deepsparse,keras,onnx,pytorch,pytorch_models,export,pytorch_datasets,tensorflow_v1,tensorflow_v1_models,tensorflow_v1_datasets 12 | PYTEST_ARGS ?= "" 13 | ifneq ($(findstring transformers,$(TARGETS)),transformers) 14 | PYTEST_ARGS := $(PYTEST_ARGS) --ignore tests/llmcompressor/transformers 15 | endif 16 | ifneq ($(findstring pytorch,$(TARGETS)),pytorch) 17 | PYTEST_ARGS := $(PYTEST_ARGS) --ignore tests/llmcompressor/pytorch 18 | endif 19 | ifneq ($(findstring examples,$(TARGETS)),examples) 20 | PYTEST_ARGS := $(PYTEST_ARGS) --ignore tests/examples 21 | endif 22 | 23 | # run checks on all files for the repo 24 | # leaving out mypy src for now 25 | quality: 26 | @echo "Running python quality checks"; 27 | ruff check $(CHECKDIRS); 28 | isort --check-only $(CHECKDIRS); 29 | flake8 $(CHECKDIRS) --max-line-length 88 --extend-ignore E203; 30 | 31 | # style the code according to accepted standards for the repo 32 | style: 33 | @echo "Running python styling"; 34 | ruff format $(CHECKDIRS); 35 | isort $(CHECKDIRS); 36 | flake8 $(CHECKDIRS) --max-line-length 88 --extend-ignore E203; 37 | 38 | # run tests for the repo 39 | test: 40 | @echo "Running python tests"; 41 | pytest tests $(PYTEST_ARGS) 42 | 43 | # creates wheel file 44 | .PHONY: build 45 | build: 46 | python3 setup.py sdist bdist_wheel $(BUILD_ARGS) 47 | 48 | # clean package 49 | clean: 50 | rm -fr .pytest_cache; 51 | rm -fr docs/_build docs/build; 52 | find $(CHECKDIRS) | grep -E "(__pycache__|\.pyc|\.pyo)" | xargs rm -fr; 53 | -------------------------------------------------------------------------------- /examples/big_models_with_accelerate/cpu_offloading_fp8.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForCausalLM, AutoTokenizer 2 | 3 | 
from llmcompressor import oneshot 4 | from llmcompressor.modifiers.quantization import QuantizationModifier 5 | 6 | MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct" 7 | OUTPUT_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" 8 | 9 | # Load model 10 | # Note: device_map="auto" will offload to CPU if not enough space on GPU. 11 | model = AutoModelForCausalLM.from_pretrained( 12 | MODEL_ID, device_map="auto", torch_dtype="auto", trust_remote_code=True 13 | ) 14 | 15 | # Configure the quantization scheme and algorithm (PTQ + FP8_DYNAMIC). 16 | recipe = QuantizationModifier( 17 | targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"] 18 | ) 19 | 20 | # Apply quantization and save in `compressed-tensors` format. 21 | oneshot( 22 | model=model, 23 | recipe=recipe, 24 | tokenizer=AutoTokenizer.from_pretrained(MODEL_ID), 25 | output_dir=OUTPUT_DIR, 26 | ) 27 | -------------------------------------------------------------------------------- /examples/compressed_inference/fp8_compressed_inference.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForCausalLM, AutoTokenizer 2 | 3 | """ 4 | This example covers how to load a quantized model using AutoModelForCausalLM. 5 | 6 | During inference, each layer will be decompressed as needed before the forward pass. 7 | This saves memory as only a single layer is ever uncompressed at a time, but increases 8 | runtime as we need to decompress each layer before running the forward pass 9 | 10 | """ 11 | 12 | # any model with the "compressed-tensors" quant_method and "compressed" 13 | # quantization_status in the quantization config is supported 14 | MODEL_STUB = "nm-testing/tinyllama-fp8-dynamic-compressed" 15 | 16 | SAMPLE_INPUT = [ 17 | "I love quantization because", 18 | "What is the capital of France?", 19 | "def fibonacci(n):", 20 | ] 21 | 22 | compressed_model = AutoModelForCausalLM.from_pretrained( 23 | MODEL_STUB, 24 | torch_dtype="auto", 25 | device_map="cuda:0", 26 | ) 27 | 28 | # tokenize the sample data 29 | tokenizer = AutoTokenizer.from_pretrained(MODEL_STUB) 30 | inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to( 31 | compressed_model.device 32 | ) 33 | 34 | # run the compressed model and decode the output 35 | output = compressed_model.generate(**inputs, max_length=50) 36 | print("========== SAMPLE GENERATION ==============") 37 | text_output = tokenizer.batch_decode(output) 38 | for sample in text_output: 39 | print(sample) 40 | -------------------------------------------------------------------------------- /examples/finetuning/configure_fsdp.md: -------------------------------------------------------------------------------- 1 | # Configuring FSDP for Sparse Finetuning 2 | 3 | An example FSDP configuration file, `example_fsdp_config.yaml`, is provided in this 4 | folder. It can be used out of the box by editing the `num_processes` parameter to 5 | fit the number of GPUs on your machine. 
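For reference, a minimal sketch of that edit (assuming a single node with 8 GPUs; adjust the count to match your hardware):

```yaml
# example_fsdp_config.yaml (excerpt)
num_machines: 1
num_processes: 8  # one process per GPU on this machine
```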
6 | 7 | You can also customize your own config file by running the following prompt 8 | ``` 9 | accelerate config 10 | ``` 11 | 12 | An FSDP config file can be passed to the LLM Compressor finetuning script like this: 13 | ``` 14 | accelerate launch --config_file example_fsdp_config.yaml --no_python llmcompressor.transformers.text_generation.finetune 15 | ``` 16 | -------------------------------------------------------------------------------- /examples/finetuning/example_alternating_recipe.yaml: -------------------------------------------------------------------------------- 1 | initial_sparsity_stage: 2 | run_type: oneshot 3 | obcq_modifiers: 4 | SparseGPTModifier: 5 | sparsity: 0.5 6 | block_size: 128 7 | percdamp: 0.01 8 | mask_structure: "0:0" 9 | targets: ["Linear"] 10 | ignore: ["re:.*lm_head"] 11 | initial_training_stage: 12 | run_type: train 13 | pruning_modifiers: 14 | ConstantPruningModifier: 15 | targets: '__ALL__' 16 | start: 0 17 | next_sparsity_stage: 18 | run_type: oneshot 19 | obcq_modifiers: 20 | SparseGPTModifier: 21 | sparsity: 0.7 22 | block_size: 128 23 | percdamp: 0.01 24 | mask_structure: "0:0" 25 | targets: ["Linear"] 26 | ignore: ["re:.*lm_head"] 27 | next_training_stage: 28 | run_type: train 29 | pruning_modifiers: 30 | ConstantPruningModifier: 31 | targets: '__ALL__' 32 | start: 0 -------------------------------------------------------------------------------- /examples/finetuning/example_fsdp_config.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: FSDP 4 | downcast_bf16: 'no' 5 | fsdp_config: 6 | fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP 7 | fsdp_backward_prefetch_policy: BACKWARD_PRE 8 | fsdp_cpu_ram_efficient_loading: false 9 | fsdp_forward_prefetch: false 10 | fsdp_offload_params: false 11 | fsdp_sharding_strategy: 1 12 | fsdp_state_dict_type: SHARDED_STATE_DICT 13 | fsdp_sync_module_states: true 14 | fsdp_use_orig_params: false 15 | machine_rank: 0 16 | main_training_function: main 17 | num_machines: 1 18 | num_processes: 4 19 | rdzv_backend: static 20 | same_network: true 21 | tpu_env: [] 22 | tpu_use_cluster: false 23 | tpu_use_sudo: false 24 | use_cpu: false 25 | -------------------------------------------------------------------------------- /examples/finetuning/example_single_gpu_config.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: 'NO' 4 | enable_cpu_affinity: false 5 | gpu_ids: 0 6 | machine_rank: 0 7 | main_training_function: main 8 | num_machines: 1 9 | num_processes: 1 10 | rdzv_backend: static 11 | same_network: true 12 | tpu_env: [] 13 | tpu_use_cluster: false 14 | tpu_use_sudo: false 15 | use_cpu: false -------------------------------------------------------------------------------- /examples/multimodal_vision/mistral3_chat_template.json: -------------------------------------------------------------------------------- 1 | { 2 | "chat_template": "{%- set default_system_message = \"You are Mistral Small 3, a Large Language Model (LLM) created by Mistral AI\" %}\n\n{{- bos_token }}\n\n{%- if messages[0]['role'] == 'system' %}\n {%- if messages[0]['content'] is string %}\n {%- set system_message = messages[0]['content'] %}\n {%- else %}\n {%- set system_message = messages[0]['content'][0]['text'] %}\n {%- endif %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set system_message = 
default_system_message %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{{- '[SYSTEM_PROMPT]' + system_message + '[/SYSTEM_PROMPT]' }}\n\n{%- for message in loop_messages %}\n {%- if message['role'] == 'user' %}\n {%- if message['content'] is string %}\n {{- '[INST]' + message['content'] + '[/INST]' }}\n {%- else %}\n {{- '[INST]' }}\n {%- for block in message['content'] %}\n {%- if block['type'] == 'text' %}\n {{- block['text'] }}\n {%- elif block['type'] in ['image', 'image_url'] %}\n {{- '[IMG]' }}\n {%- else %}\n {{- raise_exception('Only text and image blocks are supported in message content!') }}\n {%- endif %}\n {%- endfor %}\n {{- '[/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'system' %}\n {%- if message['content'] is string %}\n {{- '[SYSTEM_PROMPT]' + message['content'] + '[/SYSTEM_PROMPT]' }}\n {%- else %}\n {{- '[SYSTEM_PROMPT]' + message['content'][0]['text'] + '[/SYSTEM_PROMPT]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {%- if message['content'] is string %}\n {{- message['content'] + eos_token }}\n {%- else %}\n {{- message['content'][0]['text'] + eos_token }}\n {%- endif %}\n {%- else %}\n {{- raise_exception('Only user, system and assistant roles are supported!') }}\n {%- endif %}\n{%- endfor %}" 3 | } -------------------------------------------------------------------------------- /examples/quantization_2of4_sparse_w4a16/2of4_w4a16_group-128_recipe.yaml: -------------------------------------------------------------------------------- 1 | sparsity_stage: 2 | sparsity_modifiers: 3 | SparseGPTModifier: 4 | sparsity: 0.5 5 | mask_structure: "2:4" 6 | targets: ["Linear"] 7 | ignore: ["re:.*lm_head"] 8 | finetuning_stage: 9 | finetuning_modifiers: 10 | ConstantPruningModifier: 11 | targets: [ 12 | 're:.*q_proj.weight', 13 | 're:.*k_proj.weight', 14 | 're:.*v_proj.weight', 15 | 're:.*o_proj.weight', 16 | 're:.*gate_proj.weight', 17 | 're:.*up_proj.weight', 18 | 're:.*down_proj.weight', 19 | ] 20 | start: 0 21 | quantization_stage: 22 | quantization_modifiers: 23 | GPTQModifier: 24 | ignore: ["lm_head"] 25 | config_groups: 26 | group_0: 27 | weights: 28 | num_bits: 4 29 | type: "int" 30 | symmetric: true 31 | strategy: "group" 32 | group_size: 128 33 | targets: ["Linear"] 34 | -------------------------------------------------------------------------------- /examples/quantization_2of4_sparse_w4a16/2of4_w4a16_recipe.yaml: -------------------------------------------------------------------------------- 1 | sparsity_stage: 2 | sparsity_modifiers: 3 | SparseGPTModifier: 4 | sparsity: 0.5 5 | mask_structure: "2:4" 6 | targets: ["Linear"] 7 | ignore: ["re:.*lm_head"] 8 | finetuning_stage: 9 | finetuning_modifiers: 10 | ConstantPruningModifier: 11 | targets: [ 12 | 're:.*q_proj.weight', 13 | 're:.*k_proj.weight', 14 | 're:.*v_proj.weight', 15 | 're:.*o_proj.weight', 16 | 're:.*gate_proj.weight', 17 | 're:.*up_proj.weight', 18 | 're:.*down_proj.weight', 19 | ] 20 | start: 0 21 | quantization_stage: 22 | quantization_modifiers: 23 | GPTQModifier: 24 | ignore: ["lm_head"] 25 | config_groups: 26 | group_0: 27 | weights: 28 | num_bits: 4 29 | type: "int" 30 | symmetric: true 31 | strategy: "channel" 32 | targets: ["Linear"] 33 | -------------------------------------------------------------------------------- /examples/quantization_w4a16_fp4/llama3_example.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForCausalLM, AutoTokenizer 2 | 3 | from llmcompressor import oneshot 4 | 
from llmcompressor.modifiers.quantization import QuantizationModifier 5 | 6 | MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct" 7 | 8 | # Load model. 9 | model = AutoModelForCausalLM.from_pretrained( 10 | MODEL_ID, device_map="auto", torch_dtype="auto" 11 | ) 12 | tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) 13 | 14 | # Configure the quantization algorithm and scheme. 15 | # In this case, we: 16 | # * quantize the weights to fp4 with per group 16 via ptq 17 | recipe = QuantizationModifier(targets="Linear", scheme="NVFP4A16", ignore=["lm_head"]) 18 | 19 | # Apply quantization. 20 | oneshot(model=model, recipe=recipe) 21 | 22 | print("\n\n") 23 | print("========== SAMPLE GENERATION ==============") 24 | input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") 25 | output = model.generate(input_ids, max_new_tokens=100) 26 | print(tokenizer.decode(output[0])) 27 | print("==========================================\n\n") 28 | 29 | 30 | # Save to disk in compressed-tensors format. 31 | SAVE_DIR = MODEL_ID.split("/")[1] + "-NVFP4A16" 32 | model.save_pretrained(SAVE_DIR, save_compressed=True) 33 | tokenizer.save_pretrained(SAVE_DIR) 34 | -------------------------------------------------------------------------------- /examples/quantization_w8a8_fp8/gemma2_example.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForCausalLM, AutoTokenizer 2 | 3 | from llmcompressor import oneshot 4 | from llmcompressor.modifiers.quantization import QuantizationModifier 5 | 6 | MODEL_ID = "google/gemma-2-27b-it" 7 | 8 | # 1) Load model. 9 | model = AutoModelForCausalLM.from_pretrained( 10 | MODEL_ID, device_map="auto", torch_dtype="auto" 11 | ) 12 | tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) 13 | 14 | # 2) Configure the quantization algorithm and scheme. 15 | # In this case, we: 16 | # * quantize the weights to fp8 with per channel via ptq 17 | # * quantize the activations to fp8 with dynamic per token 18 | recipe = QuantizationModifier( 19 | targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"] 20 | ) 21 | 22 | # 3) Apply quantization and save in compressed-tensors format. 23 | OUTPUT_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" 24 | oneshot( 25 | model=model, 26 | recipe=recipe, 27 | tokenizer=tokenizer, 28 | output_dir=OUTPUT_DIR, 29 | ) 30 | 31 | # Confirm generations of the quantized model look sane. 32 | # NOTE: transformers 4.49.0 results in a generation error with gemma2. 33 | # Consider either downgrading your transformers version to a previous version 34 | # or use vLLM for sample generation. 35 | print("========== SAMPLE GENERATION ==============") 36 | input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") 37 | output = model.generate(input_ids, max_new_tokens=20) 38 | print(tokenizer.decode(output[0])) 39 | print("==========================================") 40 | -------------------------------------------------------------------------------- /examples/quantization_w8a8_fp8/llama3.2_vision_example.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoProcessor, MllamaForConditionalGeneration 2 | 3 | from llmcompressor import oneshot 4 | from llmcompressor.modifiers.quantization import QuantizationModifier 5 | 6 | MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct" 7 | 8 | # Load model. 
9 | model = MllamaForConditionalGeneration.from_pretrained( 10 | MODEL_ID, device_map="auto", torch_dtype="auto" 11 | ) 12 | processor = AutoProcessor.from_pretrained(MODEL_ID) 13 | 14 | # Configure the quantization algorithm and scheme. 15 | # In this case, we: 16 | # * quantize the weights to fp8 with per channel via ptq 17 | # * quantize the activations to fp8 with dynamic per token 18 | recipe = QuantizationModifier( 19 | targets="Linear", 20 | scheme="FP8_DYNAMIC", 21 | ignore=["re:.*lm_head", "re:multi_modal_projector.*", "re:vision_model.*"], 22 | ) 23 | 24 | # Apply quantization and save to disk in compressed-tensors format. 25 | SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" 26 | oneshot( 27 | model=model, 28 | recipe=recipe, 29 | output_dir=SAVE_DIR, 30 | ) 31 | processor.save_pretrained(SAVE_DIR) 32 | 33 | # Confirm generations of the quantized model look sane. 34 | print("========== SAMPLE GENERATION ==============") 35 | input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda") 36 | output = model.generate(input_ids, max_new_tokens=20) 37 | print(processor.decode(output[0])) 38 | print("==========================================") 39 | -------------------------------------------------------------------------------- /examples/quantization_w8a8_fp8/llama3_example.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForCausalLM, AutoTokenizer 2 | 3 | from llmcompressor import oneshot 4 | from llmcompressor.modifiers.quantization import QuantizationModifier 5 | 6 | MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" 7 | 8 | # Load model. 9 | model = AutoModelForCausalLM.from_pretrained( 10 | MODEL_ID, device_map="auto", torch_dtype="auto" 11 | ) 12 | tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) 13 | 14 | # Configure the quantization algorithm and scheme. 15 | # In this case, we: 16 | # * quantize the weights to fp8 with per channel via ptq 17 | # * quantize the activations to fp8 with dynamic per token 18 | recipe = QuantizationModifier( 19 | targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"] 20 | ) 21 | 22 | # Apply quantization. 23 | oneshot(model=model, recipe=recipe) 24 | 25 | # Confirm generations of the quantized model look sane. 26 | print("========== SAMPLE GENERATION ==============") 27 | input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") 28 | output = model.generate(input_ids, max_new_tokens=20) 29 | print(tokenizer.decode(output[0])) 30 | print("==========================================") 31 | 32 | # Save to disk in compressed-tensors format. 33 | SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" 34 | model.save_pretrained(SAVE_DIR) 35 | tokenizer.save_pretrained(SAVE_DIR) 36 | -------------------------------------------------------------------------------- /examples/quantization_w8a8_fp8/llava1.5_example.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoProcessor, LlavaForConditionalGeneration 2 | 3 | from llmcompressor import oneshot 4 | from llmcompressor.modifiers.quantization import QuantizationModifier 5 | 6 | MODEL_ID = "llava-hf/llava-1.5-7b-hf" 7 | 8 | # Load model. 9 | model = LlavaForConditionalGeneration.from_pretrained( 10 | MODEL_ID, device_map="auto", torch_dtype="auto" 11 | ) 12 | processor = AutoProcessor.from_pretrained(MODEL_ID) 13 | 14 | # Configure the quantization algorithm and scheme. 
15 | # In this case, we: 16 | # * quantize the weights to fp8 with per channel via ptq 17 | # * quantize the activations to fp8 with dynamic per token 18 | recipe = QuantizationModifier( 19 | targets="Linear", 20 | scheme="FP8_DYNAMIC", 21 | ignore=["re:.*lm_head", "re:multi_modal_projector.*", "re:vision_tower.*"], 22 | ) 23 | 24 | # Apply quantization and save to disk in compressed-tensors format. 25 | SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" 26 | oneshot(model=model, recipe=recipe, output_dir=SAVE_DIR) 27 | processor.save_pretrained(SAVE_DIR) 28 | 29 | # Confirm generations of the quantized model look sane. 30 | print("========== SAMPLE GENERATION ==============") 31 | input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda") 32 | output = model.generate(input_ids, max_new_tokens=20) 33 | print(processor.decode(output[0])) 34 | print("==========================================") 35 | -------------------------------------------------------------------------------- /examples/quantization_w8a8_fp8/qwen2vl_example.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoProcessor, Qwen2VLForConditionalGeneration 2 | 3 | from llmcompressor import oneshot 4 | from llmcompressor.modifiers.quantization import QuantizationModifier 5 | 6 | MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct" 7 | 8 | # Load model. 9 | model = Qwen2VLForConditionalGeneration.from_pretrained( 10 | MODEL_ID, device_map="auto", torch_dtype="auto" 11 | ) 12 | processor = AutoProcessor.from_pretrained(MODEL_ID) 13 | 14 | # Configure the quantization algorithm and scheme. 15 | # In this case, we: 16 | # * quantize the weights to fp8 with per channel via ptq 17 | # * quantize the activations to fp8 with dynamic per token 18 | recipe = QuantizationModifier( 19 | targets="Linear", 20 | scheme="FP8_DYNAMIC", 21 | ignore=["re:.*lm_head", "re:visual.*"], 22 | ) 23 | 24 | # Apply quantization and save to disk in compressed-tensors format. 25 | SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" 26 | oneshot(model=model, recipe=recipe, output_dir=SAVE_DIR) 27 | processor.save_pretrained(SAVE_DIR) 28 | 29 | # Confirm generations of the quantized model look sane. 30 | print("========== SAMPLE GENERATION ==============") 31 | input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda") 32 | output = model.generate(input_ids, max_new_tokens=20) 33 | print(processor.decode(output[0])) 34 | print("==========================================") 35 | -------------------------------------------------------------------------------- /examples/quantization_w8a8_fp8/whisper_example.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | from transformers import AutoProcessor, WhisperForConditionalGeneration 3 | 4 | from llmcompressor import oneshot 5 | from llmcompressor.modifiers.quantization import QuantizationModifier 6 | 7 | MODEL_ID = "openai/whisper-large-v2" 8 | 9 | # Load model. 10 | model = WhisperForConditionalGeneration.from_pretrained( 11 | MODEL_ID, device_map="auto", torch_dtype="auto" 12 | ) 13 | model.config.forced_decoder_ids = None 14 | processor = AutoProcessor.from_pretrained(MODEL_ID) 15 | processor.tokenizer.set_prefix_tokens(language="en", task="transcribe") 16 | 17 | # Configure the quantization algorithm and scheme. 
18 | # In this case, we: 19 | # * quantize the weights to fp8 with per channel via ptq 20 | # * quantize the activations to fp8 with dynamic per token 21 | recipe = QuantizationModifier( 22 | targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"] 23 | ) 24 | 25 | # Apply quantization. 26 | oneshot(model=model, recipe=recipe) 27 | 28 | # Confirm generations of the quantized model look sane. 29 | print("========== SAMPLE GENERATION ==============") 30 | ds = load_dataset( 31 | "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]" 32 | ) 33 | sample = ds[0]["audio"] 34 | input_features = processor( 35 | sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt" 36 | ).input_features 37 | input_features = input_features.to(model.device) 38 | output_ids = model.generate(input_features, language="en", forced_decoder_ids=None) 39 | print(processor.batch_decode(output_ids, skip_special_tokens=False)[0]) 40 | # Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel 41 | print("==========================================") 42 | 43 | # Save to disk in compressed-tensors format. 44 | SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" 45 | model.save_pretrained(SAVE_DIR, save_compressed=True) 46 | processor.save_pretrained(SAVE_DIR) 47 | -------------------------------------------------------------------------------- /examples/quantizing_moe/deepseek_recipe_w4a16.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | GPTQModifier: 4 | ignore: [lm_head, "re:.*mlp.gate$"] 5 | config_groups: 6 | group_0: 7 | weights: {num_bits: 4, type: int, symmetric: true, strategy: channel, dynamic: false} 8 | targets: [Linear] 9 | -------------------------------------------------------------------------------- /examples/trl_mixin/README.md: -------------------------------------------------------------------------------- 1 | # Sparse Finetuning with TRL's SFTTrainer 2 | 3 | The `SessionManagerMixIn` can be added to other Trainer classes that inherit from 4 | [Hugging Face's Trainer](https://huggingface.co/docs/transformers/en/main_classes/trainer). 5 | 6 | For example, we can add LLM Compressor support to TRL's SFTTrainer like so: 7 | 8 | Note: install `trl` using `pip install trl` 9 | 10 | ```python 11 | from trl import SFTTrainer as TRLSFTTrainer 12 | 13 | class SFTTrainer(SessionManagerMixIn, TRLSFTTrainer): 14 | ... 15 | ``` 16 | 17 | The new `SFTTrainer` class can now apply LLM Compressor recipes and modifiers during 18 | supervised finetuning, with full support for all of the original TRL features. The full 19 | class is defined in the script `sft_trainer.py` and requires very minimal 20 | additional code: just a dataset load override to support passing in tokenized datasets 21 | to the Trainer. 22 | 23 | ### Examples 24 | 25 | * Script `ex_trl_constant.py`: finetunes a 50% sparse Llama-7b model, 26 | using TRL's dataset preprocessing. Sparsity is maintained throughout training by 27 | applying a `ConstantPruningModifier` recipe to the `SFTTrainer` 28 | 29 | * Script `ex_trl_distillation.py`: finetunes a 50% sparse Llama-7b 30 | model using knowledge distillation from a dense Llama-7b model.
Sparsity is maintained 31 | throughout training with a `ConstantPruningModifier` and layer-wise knowledge 32 | distillation is handled by the `OutputDistillationModifier` -------------------------------------------------------------------------------- /examples/trl_mixin/ex_trl_constant.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | from sft_trainer import SFTTrainer 3 | from transformers import AutoModelForCausalLM, AutoTokenizer 4 | from trl import DataCollatorForCompletionOnlyLM 5 | 6 | from llmcompressor.args import ModelArguments 7 | 8 | model_path = "neuralmagic/Llama-2-7b-pruned50-retrained" 9 | output_dir = "./output_trl_sft_test_7b_gsm8k_sft_data" 10 | model = AutoModelForCausalLM.from_pretrained( 11 | model_path, torch_dtype="auto", device_map="auto" 12 | ) 13 | tokenizer = AutoTokenizer.from_pretrained(model_path) 14 | tokenizer.pad_token = tokenizer.eos_token 15 | 16 | # recipe for maintaining model sparsity during finetuning 17 | recipe = """ 18 | test_stage: 19 | pruning_modifiers: 20 | ConstantPruningModifier: 21 | targets: ['re:.*q_proj.weight', 're:.*k_proj.weight', 're:.*v_proj.weight', 22 | 're:.*o_proj.weight','re:.*gate_proj.weight', 're:.*up_proj.weight', 23 | 're:.*down_proj.weight'] 24 | start: 0 25 | """ 26 | 27 | # Load gsm8k using TRL dataset tools 28 | dataset = load_dataset("gsm8k", "main", split="train") 29 | 30 | 31 | def formatting_prompts_func(example): 32 | output_texts = [] 33 | for i in range(len(example["question"])): 34 | text = f"Question: {example['question'][i]}\n Answer: {example['answer'][i]}" 35 | output_texts.append(text) 36 | return output_texts 37 | 38 | 39 | response_template = "Answer:" 40 | collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer) 41 | 42 | trl_sft_config_args = dict( 43 | output_dir=output_dir, 44 | num_train_epochs=0.6, 45 | logging_steps=50, 46 | gradient_checkpointing=True, 47 | max_seq_length=512, 48 | ) 49 | model_args = ModelArguments(model=model) 50 | 51 | trainer = SFTTrainer( 52 | model=model, 53 | processing_class=tokenizer, 54 | recipe=recipe, 55 | train_dataset=dataset, 56 | formatting_func=formatting_prompts_func, 57 | data_collator=collator, 58 | trl_sft_config_args=trl_sft_config_args, 59 | model_args=model_args, 60 | ) 61 | trainer.train() 62 | -------------------------------------------------------------------------------- /examples/trl_mixin/sft_trainer.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional 2 | 3 | from trl import SFTConfig as TRLSFTConfig 4 | from trl import SFTTrainer as TRLSFTTrainer 5 | 6 | from llmcompressor.transformers.finetune.session_mixin import SessionManagerMixIn 7 | 8 | __all__ = ["SFTTrainer"] 9 | 10 | 11 | class SFTTrainer(SessionManagerMixIn, TRLSFTTrainer): 12 | def __init__(self, trl_sft_config_args: Optional[Dict] = None, *args, **kwargs): 13 | if trl_sft_config_args is not None: 14 | kwargs["args"] = TRLSFTConfig(**trl_sft_config_args) 15 | super().__init__(*args, **kwargs) 16 | 17 | def _prepare_dataset(self, dataset, *args, **kwargs): 18 | if "input_ids" in dataset.column_names: 19 | # dataset is already tokenized, skip preprocessing 20 | return dataset 21 | 22 | return super()._prepare_dataset(dataset, *args, **kwargs) 23 | -------------------------------------------------------------------------------- /pyproject.toml: 
-------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel", "setuptools_scm==8.2.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.black] 6 | line-length = 88 7 | target-version = ['py38'] 8 | 9 | [tool.isort] 10 | profile = "black" 11 | skip = ["src/llmcompressor/transformers/tracing/", "src/llmcompressor/version.py"] 12 | 13 | [tool.mypy] 14 | files = "src/llmcompressor" 15 | 16 | [tool.ruff] 17 | exclude = ["build", "dist", "env", ".venv", "src/llmcompressor/transformers/tracing/"] 18 | lint.select = ["E", "F", "W"] 19 | 20 | [tool.flake8] 21 | max-line-length = 88 22 | extend-ignore = 'E203' 23 | 24 | [tool.pytest.ini_options] 25 | markers = [ 26 | "smoke: quick tests to check basic functionality", 27 | "sanity: tests to ensure that new changes do not break existing functionality", 28 | "regression: detailed tests to ensure major functions work correctly", 29 | "integration: tests which integrate with a third party service such as HF", 30 | "unit: tests to ensure code correctness and regression test functionality", 31 | "example: tests for content in the 'examples' folder", 32 | "multi_gpu: tests that require multiple GPUs", 33 | ] 34 | tmp_path_retention_policy = "failed" 35 | -------------------------------------------------------------------------------- /src/llmcompressor/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | A library for compressing large language models utilizing the latest techniques and 3 | research in the field for both training-aware and post-training techniques. 4 | 5 | The library is designed to be flexible and easy to use on top of 6 | PyTorch and HuggingFace Transformers, allowing for quick experimentation. 
7 | """ 8 | 9 | # flake8: noqa 10 | 11 | from .logger import LoggerConfig, configure_logger, logger 12 | from .version import __version__, version 13 | 14 | __all__ = [ 15 | "__version__", 16 | "version", 17 | "configure_logger", 18 | "logger", 19 | "LoggerConfig", 20 | ] 21 | 22 | from llmcompressor.core.session_functions import ( 23 | active_session, 24 | callbacks, 25 | create_session, 26 | reset_session, 27 | ) 28 | from llmcompressor.entrypoints import Oneshot, oneshot, train 29 | -------------------------------------------------------------------------------- /src/llmcompressor/args/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .dataset_arguments import DatasetArguments 4 | from .model_arguments import ModelArguments 5 | from .recipe_arguments import RecipeArguments 6 | from .training_arguments import TrainingArguments 7 | from .utils import parse_args 8 | -------------------------------------------------------------------------------- /src/llmcompressor/args/recipe_arguments.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import List, Optional 3 | 4 | 5 | @dataclass 6 | class RecipeArguments: 7 | """Recipe and session variables""" 8 | 9 | recipe: Optional[str] = field( 10 | default=None, 11 | metadata={ 12 | "help": "Path to a LLM Compressor sparsification recipe", 13 | }, 14 | ) 15 | recipe_args: Optional[List[str]] = field( 16 | default=None, 17 | metadata={ 18 | "help": ( 19 | "List of recipe arguments to evaluate, of the format key1=value1 " 20 | "key2=value2" 21 | ) 22 | }, 23 | ) 24 | clear_sparse_session: Optional[bool] = field( 25 | default=False, 26 | metadata={ 27 | "help": ( 28 | "Whether to clear CompressionSession/CompressionLifecycle ", 29 | "data between runs.", 30 | ) 31 | }, 32 | ) 33 | stage: Optional[str] = field( 34 | default=None, 35 | metadata={"help": ("The stage of the recipe to use for oneshot / train.",)}, 36 | ) 37 | -------------------------------------------------------------------------------- /src/llmcompressor/args/training_arguments.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Optional 3 | 4 | from transformers import TrainingArguments as HFTrainingArgs 5 | 6 | __all__ = [ 7 | "TrainingArguments", 8 | ] 9 | 10 | 11 | @dataclass 12 | class TrainingArguments(HFTrainingArgs): 13 | """ 14 | Training arguments specific to LLM Compressor Transformers workflow using 15 | HFTrainingArgs as base class 16 | 17 | """ 18 | 19 | do_oneshot: Optional[bool] = field( 20 | default=False, 21 | metadata={"help": "Whether to run one-shot calibration in stages"}, 22 | ) 23 | run_stages: Optional[bool] = field( 24 | default=False, metadata={"help": "Whether to trigger recipe stage by stage"} 25 | ) 26 | output_dir: str = field( 27 | default="./output", 28 | metadata={ 29 | "help": "The output directory where the model safetensors, " 30 | "recipe, config, and optionally checkpoints will be written." 
31 | }, 32 | ) 33 | 34 | @property 35 | def place_model_on_device(self): 36 | return False 37 | -------------------------------------------------------------------------------- /src/llmcompressor/core/__init__.py: -------------------------------------------------------------------------------- 1 | from llmcompressor.core.events import Event, EventType 2 | from llmcompressor.core.lifecycle import CompressionLifecycle 3 | from llmcompressor.core.model_layer import ModelParameterizedLayer 4 | from llmcompressor.core.session import CompressionSession 5 | from llmcompressor.core.session_functions import ( 6 | LifecycleCallbacks, 7 | active_session, 8 | callbacks, 9 | create_session, 10 | reset_session, 11 | ) 12 | from llmcompressor.core.state import Data, Hardware, ModifiedState, State 13 | 14 | __all__ = [ 15 | "Event", 16 | "EventType", 17 | "State", 18 | "Data", 19 | "Hardware", 20 | "ModifiedState", 21 | "ModelParameterizedLayer", 22 | "CompressionLifecycle", 23 | "CompressionSession", 24 | "create_session", 25 | "active_session", 26 | "reset_session", 27 | "apply", 28 | "callbacks", 29 | "LifecycleCallbacks", 30 | ] 31 | -------------------------------------------------------------------------------- /src/llmcompressor/core/events/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | LLM Compressor Core Events Package 3 | 4 | This package provides the core components and lifecycle management for events 5 | used in the LLM Compressor framework. It includes definitions for various 6 | event types and lifecycles that are critical for managing the state and 7 | execution flow of the model compression and training processes. 8 | """ 9 | 10 | from .event import Event, EventType 11 | 12 | __all__ = ["Event", "EventType"] 13 | -------------------------------------------------------------------------------- /src/llmcompressor/core/model_layer.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Any 3 | 4 | __all__ = ["ModelParameterizedLayer"] 5 | 6 | 7 | @dataclass 8 | class ModelParameterizedLayer: 9 | """ 10 | A dataclass for holding a parameter and its layer 11 | 12 | :param layer_name: the name of the layer 13 | :param layer: the layer object 14 | :param param_name: the name of the parameter 15 | :param param: the parameter object 16 | """ 17 | 18 | layer_name: str 19 | layer: Any 20 | param_name: str 21 | param: Any 22 | -------------------------------------------------------------------------------- /src/llmcompressor/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .utils import ( 4 | format_calibration_data, 5 | get_calibration_dataloader, 6 | get_processed_dataset, 7 | make_dataset_splits, 8 | ) 9 | -------------------------------------------------------------------------------- /src/llmcompressor/entrypoints/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from .oneshot import Oneshot, oneshot 3 | from .train import train 4 | from .utils import post_process, pre_process 5 | -------------------------------------------------------------------------------- /src/llmcompressor/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .logger import * 4 | 
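The `llmcompressor.core` package above re-exports the compression-session helpers; a rough
sketch of how they compose follows (hedged -- it assumes `create_session` acts as a context
manager scoping a fresh `CompressionSession`, which this listing does not show):

    from llmcompressor.core import active_session, create_session, reset_session

    # Assumed usage: scope one compression run to its own session.
    with create_session():
        session = active_session()
        # ... initialize a recipe and drive calibration through `session` / `callbacks` ...

    # Clear any lingering lifecycle state before an independent follow-up run.
    reset_session()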
-------------------------------------------------------------------------------- /src/llmcompressor/metrics/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .frequency_manager import * 4 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/__init__.py: -------------------------------------------------------------------------------- 1 | from .factory import ModifierFactory 2 | from .interface import ModifierInterface 3 | from .modifier import Modifier 4 | from .stage import StageModifiers 5 | 6 | __all__ = [ 7 | "ModifierFactory", 8 | "ModifierInterface", 9 | "Modifier", 10 | "StageModifiers", 11 | ] 12 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/awq/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .base import * 4 | from .mappings import * 5 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/distillation/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .output import * 4 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/distillation/output/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .base import * 4 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/distillation/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/src/llmcompressor/modifiers/distillation/utils/__init__.py -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/distillation/utils/pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .kd_factory import * 4 | from .kd_wrapper import * 5 | from .model_wrapper import * 6 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/experimental/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/src/llmcompressor/modifiers/experimental/__init__.py -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/interface.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | from llmcompressor.core.events import Event 4 | from llmcompressor.core.state import State 5 | 6 | __all__ = ["ModifierInterface"] 7 | 8 | 9 | class ModifierInterface(ABC): 10 | """ 11 | Defines the contract that all modifiers must implement 12 | """ 13 | 14 | @property 15 | @abstractmethod 16 | def initialized(self) -> bool: 17 | """ 18 | :return: True if the modifier has been initialized 19 | """ 20 | raise NotImplementedError() 21 | 22 | @property 23 | @abstractmethod 24 | def finalized(self) -> bool: 25 | """ 26 | :return: True if the modifier has been finalized 27 | """ 28 | raise 
NotImplementedError() 29 | 30 | @abstractmethod 31 | def initialize(self, state: State, **kwargs): 32 | """ 33 | Initialize the modifier 34 | 35 | :param state: The current state of the model 36 | :param kwargs: Additional keyword arguments 37 | for modifier initialization 38 | """ 39 | raise NotImplementedError() 40 | 41 | @abstractmethod 42 | def finalize(self, state: State, **kwargs): 43 | """ 44 | Finalize the modifier 45 | 46 | :param state: The current state of the model 47 | :param kwargs: Additional keyword arguments for 48 | modifier finalization 49 | """ 50 | raise NotImplementedError() 51 | 52 | @abstractmethod 53 | def update_event(self, state: State, event: Event, **kwargs): 54 | """ 55 | Update the modifier based on the event 56 | 57 | :param state: The current state of the model 58 | :param event: The event to update the modifier with 59 | :param kwargs: Additional keyword arguments for 60 | modifier update 61 | """ 62 | raise NotImplementedError() 63 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/logarithmic_equalization/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .base import * 4 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/obcq/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .base import * 4 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/pruning/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .constant import * 4 | from .magnitude import * 5 | from .wanda import * 6 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/pruning/constant/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .base import ConstantPruningModifier 4 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/pruning/magnitude/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .base import MagnitudePruningModifier 4 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/pruning/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/src/llmcompressor/modifiers/pruning/utils/__init__.py -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/pruning/utils/pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .layer_mask import * 4 | from .mask_factory import * 5 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/pruning/wanda/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .base import * 4 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/quantization/__init__.py: 
-------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .cache import * 4 | from .gptq import * 5 | from .quantization import * 6 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/quantization/gptq/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .base import * 4 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/quantization/quantization/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .base import * 4 | from .mixin import * 5 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/smoothquant/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .base import * 4 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .constants import * 4 | from .helpers import * 5 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/utils/constants.py: -------------------------------------------------------------------------------- 1 | __all__ = ["SPARSITY_THRESHOLD"] 2 | 3 | SPARSITY_THRESHOLD: float = 0.05 4 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/utils/pytorch_helpers.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | import torch 4 | from torch.nn import Module 5 | 6 | __all__ = [ 7 | "apply_pad_mask_to_batch", 8 | "is_moe_model", 9 | ] 10 | 11 | 12 | def apply_pad_mask_to_batch(batch: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: 13 | """ 14 | Apply a mask to the input ids of a batch. 
This is used to zero out 15 | padding tokens so they do not contribute to the hessian calculation in the 16 | GPTQ and SparseGPT algorithms 17 | 18 | Assumes that `attention_mask` only contains zeros and ones 19 | 20 | :param batch: batch to apply padding to if it exists 21 | :return: batch with padding zeroed out in the input_ids 22 | """ 23 | if "attention_mask" in batch: 24 | for key in ("input_ids", "decoder_input_ids"): 25 | if key in batch: 26 | batch[key] = batch[key] * batch["attention_mask"] 27 | 28 | return batch 29 | 30 | 31 | def is_moe_model(model: Module) -> bool: 32 | """ 33 | Check if the model is a mixture of experts model 34 | 35 | :param model: the model to check 36 | :return: True if the model is a mixture of experts model 37 | """ 38 | 39 | # Check for MoE components 40 | for _, module in model.named_modules(): 41 | module_name = module.__class__.__name__ 42 | if "MoE" in module_name or "Expert" in module_name: 43 | return True 44 | 45 | # Check config for MoE attributes 46 | if hasattr(model, "config"): 47 | if any( 48 | "moe" in attr.lower() or "expert" in attr.lower() 49 | for attr in dir(model.config) 50 | ): 51 | return True 52 | 53 | return False 54 | -------------------------------------------------------------------------------- /src/llmcompressor/observers/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # isort: skip_file 3 | 4 | from .helpers import * 5 | from .base import * 6 | from .min_max import * 7 | from .mse import * 8 | -------------------------------------------------------------------------------- /src/llmcompressor/observers/helpers.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | import torch 4 | 5 | __all__ = ["get_observer_token_count"] 6 | 7 | 8 | def get_observer_token_count(module: torch.nn.Module) -> Counter: 9 | """ 10 | Parse the module and return the number of tokens observed by 11 | each module's observer. 
12 | 13 | :param module: module to parse 14 | :return: counter with the number of tokens observed by each observer 15 | """ 16 | token_counts = Counter() 17 | for name, module in module.named_modules(): 18 | if name.endswith(".input_observer"): 19 | token_counts[name.replace(".input_observer", "")] = ( 20 | module._num_observed_tokens 21 | ) 22 | return token_counts 23 | -------------------------------------------------------------------------------- /src/llmcompressor/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # populate registry 3 | from .basic import * 4 | from .data_free import * 5 | from .independent import * 6 | from .layer_sequential import * 7 | from .registry import * 8 | from .sequential import * 9 | -------------------------------------------------------------------------------- /src/llmcompressor/pipelines/basic/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from .pipeline import * 3 | -------------------------------------------------------------------------------- /src/llmcompressor/pipelines/basic/pipeline.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING, Union 2 | 3 | import torch 4 | import tqdm 5 | from compressed_tensors.utils import get_execution_device 6 | from torch.utils.data.dataloader import DataLoader 7 | 8 | from llmcompressor.core import LifecycleCallbacks 9 | from llmcompressor.modifiers.utils.pytorch_helpers import apply_pad_mask_to_batch 10 | from llmcompressor.pipelines.registry import CalibrationPipeline 11 | from llmcompressor.pytorch.utils.helpers import tensors_to_device 12 | from llmcompressor.utils.helpers import calibration_forward_context 13 | 14 | if TYPE_CHECKING: 15 | from llmcompressor.args.dataset_arguments import DatasetArguments 16 | 17 | __all__ = ["BasicPipeline", "run_calibration"] 18 | 19 | 20 | @CalibrationPipeline.register("basic") 21 | class BasicPipeline(CalibrationPipeline): 22 | @staticmethod 23 | def __call__( 24 | model: torch.nn.Module, 25 | dataloader: DataLoader, 26 | dataset_args: Union["DatasetArguments", None], 27 | ): 28 | """ 29 | Run a basic data pipeline. 30 | 31 | Batches are fetched from the data loader and are used to perform forward passes 32 | through the model. 
This pipeline is typically used for basic model calibration 33 | and, unlike the sequential pipelines, does not propagate compression error when 34 | used to calibrate model compression 35 | 36 | :param model: model being calibrated 37 | :param dataloader: loads data for calibration 38 | :param dataset_args: dataset arguments relevant to pipelines 39 | """ 40 | model_device = get_execution_device(model) 41 | 42 | LifecycleCallbacks.calibration_epoch_start() 43 | 44 | with calibration_forward_context(model): 45 | for batch in tqdm.tqdm(dataloader, desc="Calibrating"): 46 | batch = apply_pad_mask_to_batch(batch) 47 | batch = tensors_to_device(batch, model_device) 48 | model(**batch) 49 | 50 | LifecycleCallbacks.calibration_epoch_end() 51 | 52 | 53 | def run_calibration(model: torch.nn.Module, dataloader: DataLoader): 54 | pipeline = BasicPipeline() 55 | pipeline(model, dataloader, None) 56 | -------------------------------------------------------------------------------- /src/llmcompressor/pipelines/data_free/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from .pipeline import * 3 | -------------------------------------------------------------------------------- /src/llmcompressor/pipelines/data_free/pipeline.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING, Optional 2 | 3 | import torch 4 | from torch.utils.data.dataloader import DataLoader 5 | 6 | from llmcompressor.core.session_functions import LifecycleCallbacks 7 | from llmcompressor.pipelines.registry import CalibrationPipeline 8 | 9 | if TYPE_CHECKING: 10 | from llmcompressor.args.dataset_arguments import DatasetArguments 11 | 12 | __all__ = ["DataFreePipeline"] 13 | 14 | 15 | @CalibrationPipeline.register("datafree") 16 | class DataFreePipeline(CalibrationPipeline): 17 | @staticmethod 18 | def __call__( 19 | model: torch.nn.Module, 20 | dataloader: Optional[DataLoader], 21 | dataset_args: "DatasetArguments", 22 | ): 23 | """ 24 | A pipeline for data-free calibration 25 | 26 | :param model: model being calibrated 27 | :param dataloader: loads data for calibration 28 | :param dataset_args: dataset arguments relevant to pipelines 29 | """ 30 | LifecycleCallbacks.calibration_epoch_start() 31 | LifecycleCallbacks.calibration_epoch_end() 32 | -------------------------------------------------------------------------------- /src/llmcompressor/pipelines/independent/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from .pipeline import * 3 | -------------------------------------------------------------------------------- /src/llmcompressor/pipelines/independent/pipeline.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | import torch 4 | from loguru import logger 5 | from torch.utils.data.dataloader import DataLoader 6 | 7 | from llmcompressor.core import active_session 8 | from llmcompressor.modifiers.stage import StageModifiers 9 | from llmcompressor.pipelines.registry import CalibrationPipeline 10 | from llmcompressor.utils.helpers import patch_attr 11 | 12 | if TYPE_CHECKING: 13 | from llmcompressor.args.dataset_arguments import DatasetArguments 14 | 15 | __all__ = ["IndependentPipeline"] 16 | 17 | 18 | @CalibrationPipeline.register("independent") 19 | class IndependentPipeline(CalibrationPipeline): 20 | @staticmethod 21 | def __call__( 22 | model: 
torch.nn.Module, 23 | dataloader: DataLoader, 24 | dataset_args: "DatasetArguments", 25 | ): 26 | """ 27 | Data pipeline where each modifier is assigned its own calibration epoch and data 28 | pipeline 29 | 30 | :param model: model being calibrated 31 | :param dataloader: loads data for calibration 32 | :param dataset_args: dataset arguments relevant to pipelines 33 | """ 34 | _logger = logger.patch(lambda r: r.update(function="IndependentPipeline")) 35 | 36 | session = active_session() 37 | modifiers = session.get_modifiers() 38 | with patch_attr(session.lifecycle, "modifiers", None): 39 | for index, modifier in enumerate(modifiers): 40 | mod_type = str(type(modifier).__name__) 41 | session.lifecycle.modifiers = [ 42 | StageModifiers(modifiers=[modifier], group=mod_type, index=index) 43 | ] 44 | 45 | pipeline = CalibrationPipeline.from_modifiers([modifier]) 46 | pipeline_name = pipeline.__class__.__name__ 47 | _logger.info(f"Inferred `{pipeline_name}` for `{mod_type}`") 48 | 49 | pipeline(model, dataloader, dataset_args) 50 | 51 | # restore modifiers on exit so model can be compressed based on recipe 52 | -------------------------------------------------------------------------------- /src/llmcompressor/pipelines/layer_sequential/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from .pipeline import * 3 | -------------------------------------------------------------------------------- /src/llmcompressor/pipelines/sequential/README.md: -------------------------------------------------------------------------------- 1 | # Sequential Pipeline # 2 | The sequential pipeline is a data pipeline, primarily used for compressing models with the 3 | [GPTQModifier](/src/llmcompressor/modifiers/quantization/gptq/base.py) or the 4 | [SparseGPTModifier](/src/llmcompressor/modifiers/obcq/base.py). 5 | -------------------------------------------------------------------------------- /src/llmcompressor/pipelines/sequential/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from .helpers import get_targets_from_modifiers 3 | from .pipeline import * 4 | -------------------------------------------------------------------------------- /src/llmcompressor/pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functionality for working with and sparsifying Models in the PyTorch framework 3 | """ 4 | 5 | import os 6 | import warnings 7 | 8 | from packaging import version 9 | 10 | try: 11 | import torch 12 | 13 | _PARSED_TORCH_VERSION = version.parse(torch.__version__) 14 | 15 | if _PARSED_TORCH_VERSION.major >= 2: 16 | torch_compile_func = torch.compile 17 | 18 | def raise_torch_compile_warning(*args, **kwargs): 19 | warnings.warn( 20 | "torch.compile is not supported by llmcompressor for torch 2.0.x" 21 | ) 22 | return torch_compile_func(*args, **kwargs) 23 | 24 | torch.compile = raise_torch_compile_warning 25 | 26 | _BYPASS = bool(int(os.environ.get("NM_BYPASS_TORCH_VERSION", "0"))) 27 | if _PARSED_TORCH_VERSION.major == 1 and _PARSED_TORCH_VERSION.minor in [10, 11]: 28 | if not _BYPASS: 29 | raise RuntimeError( 30 | "llmcompressor does not support torch==1.10.* or 1.11.*. " 31 | f"Found torch version {torch.__version__}.\n\n" 32 | "To bypass this error, set environment variable " 33 | "`NM_BYPASS_TORCH_VERSION` to '1'.\n\n" 34 | "Bypassing may result in errors or " 35 | "incorrect behavior, so set at your own risk." 
36 | ) 37 | else: 38 | warnings.warn( 39 | "llmcompressor quantized onnx export does not work " 40 | "with torch==1.10.* or 1.11.*" 41 | ) 42 | except ImportError: 43 | pass 44 | 45 | # flake8: noqa 46 | -------------------------------------------------------------------------------- /src/llmcompressor/pytorch/model_load/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/src/llmcompressor/pytorch/model_load/__init__.py -------------------------------------------------------------------------------- /src/llmcompressor/pytorch/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generic code used as utilities and helpers for PyTorch 3 | """ 4 | 5 | # flake8: noqa 6 | 7 | from .helpers import * 8 | from .sparsification import * 9 | -------------------------------------------------------------------------------- /src/llmcompressor/pytorch/utils/sparsification_info/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/src/llmcompressor/pytorch/utils/sparsification_info/__init__.py -------------------------------------------------------------------------------- /src/llmcompressor/recipe/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import RecipeBase 2 | from .metadata import DatasetMetaData, LayerMetaData, ModelMetaData, ParamMetaData 3 | from .modifier import RecipeModifier 4 | from .recipe import Recipe, RecipeArgsInput, RecipeInput, RecipeStageInput 5 | from .stage import RecipeStage 6 | 7 | __all__ = [ 8 | "DatasetMetaData", 9 | "ParamMetaData", 10 | "LayerMetaData", 11 | "ModelMetaData", 12 | "RecipeBase", 13 | "RecipeModifier", 14 | "RecipeStage", 15 | "Recipe", 16 | "RecipeInput", 17 | "RecipeStageInput", 18 | "RecipeArgsInput", 19 | ] 20 | -------------------------------------------------------------------------------- /src/llmcompressor/recipe/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Any 3 | 4 | from pydantic import BaseModel, ConfigDict 5 | 6 | __all__ = ["RecipeBase"] 7 | 8 | 9 | class RecipeBase(BaseModel, ABC): 10 | """ 11 | Defines the contract that `Recipe` and its components 12 | such as `RecipeModifier` and `RecipeStage` must follow. 
13 | 14 | All inheritors of this class must implement the following methods: 15 | - calculate_start 16 | - calculate_end 17 | - evaluate 18 | - create_modifier 19 | """ 20 | 21 | model_config = ConfigDict(arbitrary_types_allowed=True) 22 | 23 | @abstractmethod 24 | def create_modifier(self) -> Any: 25 | raise NotImplementedError() 26 | -------------------------------------------------------------------------------- /src/llmcompressor/recipe/metadata.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List, Optional 2 | 3 | from pydantic import BaseModel, Field 4 | 5 | __all__ = [ 6 | "DatasetMetaData", 7 | "ParamMetaData", 8 | "LayerMetaData", 9 | "ModelMetaData", 10 | ] 11 | 12 | 13 | class DatasetMetaData(BaseModel): 14 | name: str = None 15 | version: str = None 16 | hash: str = None 17 | shape: List[int] = Field(default_factory=list) 18 | num_classes: int = None 19 | num_train_samples: int = None 20 | num_val_samples: int = None 21 | num_test_samples: int = None 22 | 23 | 24 | class ParamMetaData(BaseModel): 25 | name: str = None 26 | shape: List[int] = None 27 | weight_hash: str = None 28 | 29 | 30 | class LayerMetaData(BaseModel): 31 | name: str = None 32 | type: str = None 33 | index: int = None 34 | attributes: Dict[str, Any] = None 35 | input_shapes: List[List[int]] = None 36 | output_shapes: List[List[int]] = None 37 | params: Dict[str, ParamMetaData] = None 38 | 39 | 40 | class ModelMetaData(BaseModel): 41 | architecture: str = None 42 | sub_architecture: str = None 43 | input_shapes: List[List[int]] = None 44 | output_shapes: List[List[int]] = None 45 | layers: List[LayerMetaData] = Field(default_factory=list) 46 | layer_prefix: Optional[str] = None 47 | -------------------------------------------------------------------------------- /src/llmcompressor/sentinel.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | 3 | from pydantic_core import core_schema 4 | 5 | _registry = {} 6 | 7 | 8 | class Sentinel: 9 | """ 10 | Unique sentinel values. 
Implements https://peps.python.org/pep-0661/ 11 | with dummy pydantic validation 12 | """ 13 | 14 | def __new__(cls, name, module_name=None): 15 | name = str(name) 16 | 17 | if module_name is None: 18 | module_name = inspect.currentframe().f_globals.get("__file__") 19 | if module_name is None: 20 | module_name = __name__ 21 | 22 | registry_key = f"{module_name}-{name}" 23 | 24 | sentinel = _registry.get(registry_key, None) 25 | if sentinel is not None: 26 | return sentinel 27 | 28 | sentinel = super().__new__(cls) 29 | sentinel._name = name 30 | sentinel._module_name = module_name 31 | 32 | return _registry.setdefault(registry_key, sentinel) 33 | 34 | def __repr__(self): 35 | return self._name 36 | 37 | def __reduce__(self): 38 | return ( 39 | self.__class__, 40 | ( 41 | self._name, 42 | self._module_name, 43 | ), 44 | ) 45 | 46 | @classmethod 47 | def __get_pydantic_core_schema__(cls, _source_type, _handler): 48 | return core_schema.no_info_plain_validator_function(cls.validate) 49 | 50 | @classmethod 51 | def validate(cls, value: "Sentinel") -> "Sentinel": 52 | return value 53 | -------------------------------------------------------------------------------- /src/llmcompressor/transformers/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tools for integrating LLM Compressor with transformers training flows 3 | """ 4 | 5 | # flake8: noqa 6 | 7 | # isort: skip_file 8 | # (import order matters for circular import avoidance) 9 | from .utils import * 10 | 11 | from .sparsification import ( 12 | SparseAutoModelForCausalLM, 13 | ) 14 | from .finetune import * 15 | -------------------------------------------------------------------------------- /src/llmcompressor/transformers/compression/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/src/llmcompressor/transformers/compression/__init__.py -------------------------------------------------------------------------------- /src/llmcompressor/transformers/finetune/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .data import TextGenerationDataset 4 | from .session_mixin import SessionManagerMixIn 5 | from .text_generation import apply, oneshot, train 6 | -------------------------------------------------------------------------------- /src/llmcompressor/transformers/finetune/data/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .base import TextGenerationDataset 4 | from .c4 import C4Dataset 5 | from .cnn_dailymail import CNNDailyMailDataset 6 | from .custom import CustomDataset 7 | from .evolcodealpaca import EvolCodeAlpacaDataset 8 | from .flickr_30k import Flickr30K 9 | from .gsm8k import GSM8KDataset 10 | from .open_platypus import OpenPlatypusDataset 11 | from .peoples_speech import PeoplesSpeech 12 | from .ptb import PtbDataset 13 | from .ultrachat_200k import UltraChatDataset 14 | from .wikitext import WikiTextDataset 15 | -------------------------------------------------------------------------------- /src/llmcompressor/transformers/finetune/data/c4.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from typing import TYPE_CHECKING 3 | 4 | from llmcompressor.transformers.finetune.data import TextGenerationDataset 5 | from 
llmcompressor.typing import Processor 6 | 7 | if TYPE_CHECKING: 8 | from llmcompressor.args import DatasetArguments 9 | 10 | 11 | @TextGenerationDataset.register(name="c4") 12 | class C4Dataset(TextGenerationDataset): 13 | """ 14 | Child text generation class for the C4 dataset 15 | 16 | :param dataset_args: configuration settings for dataset loading 17 | :param split: split from dataset to load, for instance `test` or `train[:5%]` 18 | :param processor: processor or tokenizer to use on dataset 19 | """ 20 | 21 | def __init__( 22 | self, dataset_args: "DatasetArguments", split: str, processor: Processor 23 | ): 24 | dataset_args = deepcopy(dataset_args) 25 | dataset_args.dataset = "allenai/c4" 26 | dataset_args.text_column = "text" 27 | 28 | super().__init__(dataset_args=dataset_args, split=split, processor=processor) 29 | -------------------------------------------------------------------------------- /src/llmcompressor/transformers/finetune/data/cnn_dailymail.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from typing import TYPE_CHECKING 3 | 4 | from llmcompressor.transformers.finetune.data import TextGenerationDataset 5 | from llmcompressor.typing import Processor 6 | 7 | if TYPE_CHECKING: 8 | from llmcompressor.args import DatasetArguments 9 | 10 | 11 | @TextGenerationDataset.register(name="cnn_dailymail") 12 | class CNNDailyMailDataset(TextGenerationDataset): 13 | """ 14 | Text generation class for the CNN/DailyMail dataset 15 | 16 | :param dataset_args: configuration settings for dataset loading 17 | :param split: split from dataset to load, for instance `test` or `train[:5%]` 18 | :param processor: processor or tokenizer to use on dataset 19 | """ 20 | 21 | SAMPLE_TEMPLATE = "Article:\n{article}\n\n### Summarization:\n{highlights}\n" 22 | 23 | def __init__( 24 | self, dataset_args: "DatasetArguments", split: str, processor: Processor 25 | ): 26 | dataset_args = deepcopy(dataset_args) 27 | dataset_args.dataset = "cnn_dailymail" 28 | dataset_args.dataset_config_name = "3.0.0" 29 | 30 | super().__init__(dataset_args=dataset_args, split=split, processor=processor) 31 | 32 | def dataset_template(self, sample): 33 | return { 34 | "text": self.SAMPLE_TEMPLATE.format( 35 | article=sample["article"], highlights=sample["highlights"] 36 | ) 37 | } 38 | -------------------------------------------------------------------------------- /src/llmcompressor/transformers/finetune/data/custom.py: -------------------------------------------------------------------------------- 1 | from llmcompressor.transformers.finetune.data import TextGenerationDataset 2 | 3 | 4 | @TextGenerationDataset.register(name="custom", alias=["json", "csv"]) 5 | class CustomDataset(TextGenerationDataset): 6 | """ 7 | Child text generation class for custom local dataset supporting load 8 | for csv and json 9 | 10 | :param dataset_args: configuration settings for dataset loading 11 | :param split: split from dataset to load, for instance `test` or `train[:5%]` 12 | Can also be set to None to load all the splits 13 | :param processor: processor or tokenizer to use on dataset 14 | 15 | """ 16 | 17 | pass 18 | -------------------------------------------------------------------------------- /src/llmcompressor/transformers/finetune/data/evolcodealpaca.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from typing import TYPE_CHECKING 3 | 4 | from llmcompressor.transformers.finetune.data import 
TextGenerationDataset 5 | from llmcompressor.typing import Processor 6 | 7 | if TYPE_CHECKING: 8 | from llmcompressor.args import DatasetArguments 9 | 10 | 11 | @TextGenerationDataset.register(name="evolcodealpaca") 12 | class EvolCodeAlpacaDataset(TextGenerationDataset): 13 | """ 14 | Child text generation class for the Evol Code Alpaca dataset 15 | 16 | :param dataset_args: configuration settings for dataset loading 17 | :param split: split from dataset to load, for instance `test` or `train[:5%]` 18 | :param processor: processor or tokenizer to use on dataset 19 | """ 20 | 21 | EVOL_ALPACA_TEMPLATE = ( 22 | "Below is an instruction that describes a " 23 | "programming task. Write a program that appropriately " 24 | "completes the request.\n\n### Instruction:\n{instruction}" 25 | "\n\n### Response:\n" 26 | ) 27 | 28 | def __init__( 29 | self, dataset_args: "DatasetArguments", split: str, processor: Processor 30 | ): 31 | dataset_args = deepcopy(dataset_args) 32 | dataset_args.dataset = "theblackcat102/evol-codealpaca-v1" 33 | dataset_args.text_column = "text" 34 | 35 | super().__init__(dataset_args, split=split, processor=processor) 36 | 37 | def dataset_template(self, sample): 38 | prompt = self.EVOL_ALPACA_TEMPLATE.format(instruction=sample["instruction"]) 39 | text = prompt 40 | if "output" in text: 41 | text += sample["output"] 42 | 43 | return { 44 | "text": text, 45 | self.PROMPT_KEY: prompt, 46 | } 47 | -------------------------------------------------------------------------------- /src/llmcompressor/transformers/finetune/data/gsm8k.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from typing import TYPE_CHECKING 3 | 4 | from llmcompressor.transformers.finetune.data import TextGenerationDataset 5 | from llmcompressor.typing import Processor 6 | 7 | if TYPE_CHECKING: 8 | from llmcompressor.args import DatasetArguments 9 | 10 | 11 | @TextGenerationDataset.register(name="gsm8k") 12 | class GSM8KDataset(TextGenerationDataset): 13 | """ 14 | Child text generation class for the Grade School Math 8k dataset 15 | 16 | :param dataset_args: configuration settings for dataset loading 17 | :param split: split from dataset to load, for instance `test` or `train[:5%]` 18 | :param processor: processor or tokenizer to use on dataset 19 | """ 20 | 21 | GSM_TEMPLATE = "Question: {question}\nAnswer:" 22 | 23 | def __init__( 24 | self, dataset_args: "DatasetArguments", split: str, processor: Processor 25 | ): 26 | dataset_args = deepcopy(dataset_args) 27 | dataset_args.dataset = "gsm8k" 28 | dataset_args.text_column = "text" 29 | 30 | super().__init__(dataset_args=dataset_args, split=split, processor=processor) 31 | 32 | def dataset_template(self, sample): 33 | prompt = self.GSM_TEMPLATE.format(question=sample["question"]) 34 | text = prompt 35 | if "answer" in sample: 36 | text += " " + sample["answer"] 37 | 38 | return { 39 | "text": text, 40 | self.PROMPT_KEY: prompt, 41 | } 42 | -------------------------------------------------------------------------------- /src/llmcompressor/transformers/finetune/data/ptb.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from typing import TYPE_CHECKING 3 | 4 | from llmcompressor.transformers.finetune.data import TextGenerationDataset 5 | from llmcompressor.typing import Processor 6 | 7 | if TYPE_CHECKING: 8 | from llmcompressor.args import DatasetArguments 9 | 10 | 11 | @TextGenerationDataset.register(name="ptb") 12 | 
class PtbDataset(TextGenerationDataset): 13 | """ 14 | Child text generation class for the PTB dataset 15 | 16 | :param dataset_args: configuration settings for dataset loading 17 | :param split: split from dataset to load, for instance `test` or `train[:5%]` 18 | :param processor: processor or tokenizer to use on dataset 19 | """ 20 | 21 | def __init__( 22 | self, dataset_args: "DatasetArguments", split: str, processor: Processor 23 | ): 24 | dataset_args = deepcopy(dataset_args) 25 | dataset_args.dataset = "ptb_text_only" 26 | dataset_args.text_column = "sentence" 27 | 28 | super().__init__( 29 | dataset_args=dataset_args, 30 | split=split, 31 | processor=processor, 32 | ) 33 | -------------------------------------------------------------------------------- /src/llmcompressor/transformers/finetune/data/wikitext.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from typing import TYPE_CHECKING 3 | 4 | from llmcompressor.transformers.finetune.data import TextGenerationDataset 5 | from llmcompressor.typing import Processor 6 | 7 | if TYPE_CHECKING: 8 | from llmcompressor.args import DatasetArguments 9 | 10 | 11 | @TextGenerationDataset.register(name="wikitext") 12 | class WikiTextDataset(TextGenerationDataset): 13 | """ 14 | Child text generation class for the Open Platypus dataset 15 | 16 | :param dataset_args: configuration settings for dataset loading 17 | :param split: split from dataset to load, for instance `test` or `train[:5%]` 18 | :param processor: processor or tokenizer to use on dataset 19 | """ 20 | 21 | def __init__( 22 | self, dataset_args: "DatasetArguments", split: str, processor: Processor 23 | ): 24 | dataset_args = deepcopy(dataset_args) 25 | dataset_args.dataset = "Salesforce/wikitext" 26 | dataset_args.text_column = "text" 27 | 28 | super().__init__( 29 | dataset_args=dataset_args, 30 | split=split, 31 | processor=processor, 32 | ) 33 | -------------------------------------------------------------------------------- /src/llmcompressor/transformers/finetune/text_generation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2020 The HuggingFace Inc. team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # Adapted from https://github.com/huggingface/transformers 18 | # vllm-project: no copyright 19 | 20 | 21 | from compressed_tensors.utils.helpers import deprecated 22 | 23 | 24 | @deprecated( 25 | message=( 26 | "`from llmcompressor.transformers import oneshot` is deprecated, " 27 | "please use `from llmcompressor import oneshot`." 28 | ) 29 | ) 30 | def oneshot(**kwargs) -> None: 31 | from llmcompressor import oneshot 32 | 33 | oneshot(**kwargs) 34 | 35 | 36 | @deprecated( 37 | message=( 38 | "`from llmcompressor import train` is deprecated, " 39 | "please use `from llmcompressor import train`." 
40 | ) 41 | ) 42 | def train(**kwargs): 43 | from llmcompressor import train 44 | 45 | train(**kwargs) 46 | 47 | 48 | def apply(**kwargs): 49 | message = ( 50 | "`from llmcompressor.transformers import apply, compress` is deprecated, " 51 | "please use `from llmcompressor import oneshot, train` " 52 | "for sequential stages." 53 | ) 54 | raise ValueError(message) 55 | -------------------------------------------------------------------------------- /src/llmcompressor/transformers/finetune/trainer.py: -------------------------------------------------------------------------------- 1 | from transformers import Trainer as HFTransformersTrainer 2 | 3 | from llmcompressor.transformers.finetune.session_mixin import SessionManagerMixIn 4 | 5 | __all__ = ["Trainer"] 6 | 7 | 8 | class Trainer(SessionManagerMixIn, HFTransformersTrainer): 9 | pass 10 | -------------------------------------------------------------------------------- /src/llmcompressor/transformers/sparsification/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Objects, classes, and methods for applying sparsification algorithms to 3 | Hugging Face transformers flows 4 | """ 5 | 6 | # flake8: noqa 7 | from .sparse_model import * 8 | -------------------------------------------------------------------------------- /src/llmcompressor/transformers/sparsification/sparse_model.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | from typing import Optional 3 | 4 | from loguru import logger 5 | from torch.nn import Module 6 | from transformers import AutoModelForCausalLM 7 | 8 | __all__ = [ 9 | "SparseAutoModelForCausalLM", 10 | "get_processor_name_from_model", 11 | ] 12 | 13 | 14 | class SparseAutoModelForCausalLM: 15 | def from_pretrained(*args, **kwargs): 16 | logger.warning( 17 | "SparseAutoModelForCausalLM is deprecated, " 18 | "please use AutoModelForCausalLM" 19 | ) 20 | return AutoModelForCausalLM.from_pretrained(*args, **kwargs) 21 | 22 | 23 | def get_processor_name_from_model(student: Module, teacher: Optional[Module]) -> str: 24 | """ 25 | Get a processor/tokenizer source used for both student and teacher, assuming 26 | that they could be shared 27 | 28 | :param student: the student model 29 | :param teacher: the teacher model 30 | :return: the source for the processor/tokenizer shared between teacher and model 31 | """ 32 | 33 | if teacher is not None and teacher not in ("disable", "self"): 34 | student_forward_params = list( 35 | inspect.signature(student.forward).parameters.keys() 36 | ) 37 | teacher_forward_params = list( 38 | inspect.signature(teacher.forward).parameters.keys() 39 | ) 40 | diff = [p for p in student_forward_params if p not in teacher_forward_params] 41 | if diff: 42 | raise RuntimeError( 43 | "Teacher tokenizer cannot be used for student " 44 | f"due to missing args: {diff}" 45 | ) 46 | src_model = teacher 47 | else: 48 | src_model = student 49 | return src_model.config._name_or_path 50 | -------------------------------------------------------------------------------- /src/llmcompressor/transformers/tracing/__init__.py: -------------------------------------------------------------------------------- 1 | from .debug import trace 2 | 3 | __all__ = ["trace"] 4 | -------------------------------------------------------------------------------- /src/llmcompressor/transformers/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for 
applying sparsification algorithms to Hugging Face transformers flows 3 | """ 4 | 5 | # flake8: noqa 6 | from .helpers import * 7 | -------------------------------------------------------------------------------- /src/llmcompressor/transformers/utils/preprocessing_functions.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING, Dict 2 | 3 | from compressed_tensors.registry import RegistryMixin 4 | 5 | if TYPE_CHECKING: 6 | from llmcompressor.transformers.finetune.data.base import TextGenerationDataset 7 | 8 | 9 | class PreprocessingFunctionRegistry(RegistryMixin): 10 | pass 11 | 12 | 13 | @PreprocessingFunctionRegistry.register() 14 | def custom_evolved_codealpaca_dataset(self: "TextGenerationDataset", data: Dict): 15 | PROMPT_DICT = """[Instruction]:\n{instruction}\n\n[Response]:""" 16 | data["prompt"] = PROMPT_DICT.format_map(data) 17 | data["text"] = data["prompt"] + data["output"] 18 | return data 19 | -------------------------------------------------------------------------------- /src/llmcompressor/typing.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | from datasets import Dataset, DatasetDict, IterableDataset 4 | from transformers import ( 5 | BaseImageProcessor, 6 | FeatureExtractionMixin, 7 | PreTrainedTokenizer, 8 | ProcessorMixin, 9 | ) 10 | 11 | # Tokenizer or Processor. Processors do not inherit from a unified base class 12 | Processor = Union[ 13 | PreTrainedTokenizer, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin 14 | ] 15 | 16 | # Supported dataset types, IterableDataset is a streamed dataset 17 | DatasetType = Union[Dataset, DatasetDict, IterableDataset] 18 | -------------------------------------------------------------------------------- /src/llmcompressor/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | General utility functions used throughout llmcompressor 3 | """ 4 | 5 | # flake8: noqa 6 | 7 | from .helpers import * 8 | -------------------------------------------------------------------------------- /src/llmcompressor/utils/fsdp/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | -------------------------------------------------------------------------------- /src/llmcompressor/utils/fsdp/context.py: -------------------------------------------------------------------------------- 1 | try: 2 | from accelerate import Accelerator 3 | except ImportError: 4 | Accelerator = None 5 | 6 | try: 7 | from torch.distributed.fsdp import FullyShardedDataParallel 8 | from torch.distributed.fsdp._common_utils import FSDP_WRAPPED_MODULE, TrainingState 9 | except ImportError: 10 | FullyShardedDataParallel = None 11 | 12 | from contextlib import nullcontext 13 | 14 | __all__ = [ 15 | "summon_full_params_context", 16 | "main_process_first_context", 17 | "fix_fsdp_module_name", 18 | ] 19 | 20 | 21 | def summon_full_params_context(model, offload_to_cpu: bool = False): 22 | if FullyShardedDataParallel is not None: 23 | # avoid nested summon_full_param context 24 | if ( 25 | hasattr(model, "training_state") 26 | and model.training_state is TrainingState.SUMMON_FULL_PARAMS 27 | ): 28 | return nullcontext() 29 | return FullyShardedDataParallel.summon_full_params( 30 | model, offload_to_cpu=offload_to_cpu 31 | ) 32 | 33 | return nullcontext() 34 | 35 | 36 | def main_process_first_context(): 37 | """ 38 | Creates a 
context manager where the main process runs the block before all other 39 | processes. Returns a nullcontext when called from a single process application. 40 | """ 41 | if Accelerator is None: 42 | return nullcontext() 43 | 44 | return Accelerator().main_process_first() 45 | 46 | 47 | def fix_fsdp_module_name(name: str) -> str: 48 | """ 49 | Remove FSDP wrapper prefixes from a module name. 50 | Accounts for scenario where FSDP_WRAPPED_MODULE is 51 | at the end of the name, as well as in the middle. 52 | 53 | :param name: name to strip 54 | :return: stripped name 55 | """ 56 | if FullyShardedDataParallel is None: 57 | return name 58 | 59 | return name.replace(FSDP_WRAPPED_MODULE + ".", "").replace( 60 | "." + FSDP_WRAPPED_MODULE, "" 61 | ) 62 | -------------------------------------------------------------------------------- /src/llmcompressor/utils/pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .module import * 4 | -------------------------------------------------------------------------------- /src/llmcompressor/utils/pytorch/utils.py: -------------------------------------------------------------------------------- 1 | import gc 2 | 3 | import torch 4 | 5 | __all__ = ["measure_cuda_memory"] 6 | 7 | 8 | class measure_cuda_memory: 9 | def __init__(self, device=None): 10 | self.device = device 11 | 12 | def reset_peak_memory_stats(self): 13 | torch.cuda.reset_peak_memory_stats(self.device) 14 | 15 | def current_memory_usage(self) -> float: 16 | # Return the memory usage in bytes. 17 | self.reset_peak_memory_stats() 18 | mem = torch.cuda.max_memory_allocated(self.device) 19 | return mem 20 | 21 | def peak_memory_usage(self) -> float: 22 | # Return the peak memory usage in bytes since the last reset 23 | mem = torch.cuda.max_memory_allocated(self.device) 24 | return mem 25 | 26 | def __enter__(self): 27 | self.initial_memory = self.current_memory_usage() 28 | # This allows us to call methods of the context manager if needed 29 | return self 30 | 31 | def __exit__(self, exc_type, exc_val, exc_tb): 32 | self.overall_peak_memory = self.peak_memory_usage() 33 | self.peak_consumed_memory = self.overall_peak_memory - self.initial_memory 34 | 35 | # Force garbage collection 36 | gc.collect() 37 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/__init__.py -------------------------------------------------------------------------------- /tests/data.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from enum import Enum 3 | 4 | 5 | # TODO: maybe test type as decorators? 
6 | class TestType(Enum): 7 | SANITY = "sanity" 8 | REGRESSION = "regression" 9 | SMOKE = "smoke" 10 | 11 | 12 | class Cadence(Enum): 13 | COMMIT = "commit" 14 | WEEKLY = "weekly" 15 | NIGHTLY = "nightly" 16 | 17 | 18 | @dataclass 19 | class TestConfig: 20 | test_type: TestType 21 | cadence: Cadence 22 | 23 | 24 | @dataclass 25 | class CustomTestConfig(TestConfig): 26 | script_path: str 27 | -------------------------------------------------------------------------------- /tests/e2e/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/e2e/__init__.py -------------------------------------------------------------------------------- /tests/e2e/vLLM/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/e2e/vLLM/__init__.py -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/fp8_dynamic_per_token.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | scheme: FP8_DYNAMIC -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/fp8_dynamic_per_token_qwen.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: Qwen/Qwen2.5-0.5B 4 | scheme: FP8_DYNAMIC -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/fp8_static_per_tensor.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | scheme: FP8 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/fp8_weight_only_channel.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | recipe: tests/e2e/vLLM/recipes/FP8/recipe_fp8_weight_only_channel.yaml 5 | scheme: FP8A16_channel -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/fp8_weight_only_tensor.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | recipe: tests/e2e/vLLM/recipes/FP8/recipe_fp8_weight_only_per_tensor.yaml 5 | scheme: FP8A16_tensor -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/int8_channel_weight_static_per_tensor_act.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_static_per_tensor_act.yaml 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | scheme: W8A8_channel_weight_static_per_tensor -------------------------------------------------------------------------------- 
/tests/e2e/vLLM/configs/int8_dynamic_per_token.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | scheme: W8A8 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/int8_tensor_weight_static_per_tensor_act.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_tensor_weight_static_per_tensor_act.yaml 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | scheme: W8A8_tensor_weight_static_per_tensor_act 8 | -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/int8_tensor_weight_static_per_tensor_act_qwen.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: Qwen/Qwen2.5-0.5B 4 | recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_tensor_weight_static_per_tensor_act.yaml 5 | dataset_id: garage-bAInd/Open-Platypus 6 | dataset_split: train 7 | scheme: W8A8_tensor_weight_static_per_tensor_act 8 | -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/kv_cache_gptq_tinyllama.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | recipe: tests/e2e/vLLM/recipes/kv_cache/gptq.yaml 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | scheme: kv_cache_default_gptq_tinyllama -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/kv_cache_phi3.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: microsoft/Phi-3-mini-4k-instruct 4 | recipe: tests/e2e/vLLM/recipes/kv_cache/default.yaml 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | scheme: kv_cache_default_phi3 -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/kv_cache_tinyllama.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | recipe: tests/e2e/vLLM/recipes/kv_cache/default.yaml 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | scheme: kv_cache_default_tinyllama -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/sparse2of4_fp8_dynamic.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | recipe: tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4_fp8_dynamic.yaml 5 | scheme: sparse2of4_fp8_dynamic 6 | dataset_id: HuggingFaceH4/ultrachat_200k 7 | dataset_split: train_sft -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/sparse2of4_fp8_dynamic_qwen.yaml: 
-------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: Qwen/Qwen2.5-0.5B 4 | recipe: tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4_fp8_dynamic.yaml 5 | scheme: sparse2of4_fp8_dynamic 6 | dataset_id: garage-bAInd/Open-Platypus 7 | dataset_split: train -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/sparse_24.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | recipe: tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4.yaml 5 | scheme: sparse2of4_only 6 | dataset_id: HuggingFaceH4/ultrachat_200k 7 | dataset_split: train_sft 8 | save_compressed: True -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/w4a16_2of4_channel_quant.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | scheme: W4A16_2of4_channel 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | recipe: tests/e2e/vLLM/recipes/WNA16_2of4/2of4_w4a16_recipe.yaml -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/w4a16_2of4_grouped_quant.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | scheme: W4A16_2of4 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | recipe: tests/e2e/vLLM/recipes/WNA16_2of4/2of4_w4a16_group-128_recipe.yaml -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/w4a16_actorder_group.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_group.yaml 5 | dataset_id: openai/gsm8k 6 | dataset_config: main 7 | dataset_split: train 8 | scheme: W4A16_actorder_group 9 | save_dir: TinyLlama-1.1B-Chat-v1.0-actorder-group -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/w4a16_actorder_group_qwen.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: Qwen/Qwen2.5-0.5B 4 | recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_group.yaml 5 | dataset_id: neuralmagic/LLM_compression_calibration 6 | dataset_split: train 7 | scheme: W4A16_actorder_group 8 | save_dir: Qwen2.5-0.5B-actorder-group -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/w4a16_actorder_weight.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml 5 | dataset_id: openai/gsm8k 6 | dataset_config: main 7 | dataset_split: train 8 | scheme: W4A16_actorder_weight 9 | save_dir: TinyLlama-1.1B-Chat-v1.0-actorder-weight -------------------------------------------------------------------------------- 
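Note: each config above pairs a model with either a preset scheme or an explicit recipe plus a calibration dataset. As a hedged sketch of how the pairing in w4a16_actorder_weight.yaml could be reproduced outside the test harness (the oneshot entrypoint lives under src/llmcompressor/entrypoints/; values below that are not in the config, such as the sequence length and sample count, are assumptions, and "gsm8k" is assumed to be a registered calibration dataset name):

from llmcompressor import oneshot

oneshot(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    dataset="gsm8k",  # config uses openai/gsm8k, config "main", split "train"
    recipe="tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml",
    max_seq_length=2048,           # assumption: not specified in the config
    num_calibration_samples=512,   # assumption: not specified in the config
    output_dir="TinyLlama-1.1B-Chat-v1.0-actorder-weight",  # mirrors save_dir
)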
/tests/e2e/vLLM/configs/w4a16_actorder_weight_qwen.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: Qwen/Qwen2.5-0.5B 4 | recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml 5 | dataset_id: Open-Orca/slimorca-deduped-cleaned-corrected 6 | dataset_split: train 7 | scheme: W4A16_actorder_weight 8 | save_dir: Qwen2.5-0.5B-actorder-weight -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/w4a16_channel_quant.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | scheme: W4A16_channel 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_channel_quant.yaml -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/w4a16_channel_quant_qwen.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: Qwen/Qwen2.5-0.5B 4 | scheme: W4A16_channel 5 | dataset_id: Open-Orca/slimorca-deduped-cleaned-corrected 6 | dataset_split: train 7 | recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_channel_quant.yaml -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/w4a16_grouped_quant.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | scheme: W4A16 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | quant_type: "GPTQ" -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/w4a16_grouped_quant_asym_awq.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_asym_awq.yaml 5 | dataset_id: "mit-han-lab/pile-val-backup" 6 | dataset_split: validation 7 | num_calibration_samples: 2000 8 | scheme: W4A16_weight_asym_awq 9 | save_dir: TinyLlama-1.1B-Chat-v1.0-w4a16-asym-awq -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/w8a16_channel_quant.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | scheme: W8A16_channel 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w8a16_channel_quant.yaml -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/w8a16_grouped_quant.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | scheme: W8A16 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | quant_type: "GPTQ" -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/w8a8_dynamic_asym.yaml: 
-------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | dataset_id: HuggingFaceH4/ultrachat_200k 5 | dataset_split: train_sft 6 | scheme: W8A8_dynamic_asym_activations 7 | recipe: tests/e2e/vLLM/recipes/INT8/recipe_w8a8_dynamic_asym.yaml 8 | save_dir: TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Asym 9 | -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/w8a8_static_asym.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | dataset_id: HuggingFaceH4/ultrachat_200k 5 | dataset_split: train_sft 6 | scheme: W8A8_static_asym_activations 7 | recipe: tests/e2e/vLLM/recipes/INT8/recipe_w8a8_static_asym.yaml 8 | save_dir: TinyLlama-1.1B-Chat-v1.0-W8A8-Static-Asym 9 | -------------------------------------------------------------------------------- /tests/e2e/vLLM/recipes/FP8/recipe_fp8_weight_only_channel.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | QuantizationModifier: 4 | ignore: [lm_head] 5 | config_groups: 6 | group_0: 7 | weights: {num_bits: 8, type: float, symmetric: true, strategy: channel, dynamic: false} 8 | targets: [Linear] 9 | -------------------------------------------------------------------------------- /tests/e2e/vLLM/recipes/FP8/recipe_fp8_weight_only_per_tensor.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | QuantizationModifier: 4 | ignore: [lm_head] 5 | config_groups: 6 | group_0: 7 | weights: {num_bits: 8, type: float, symmetric: true, strategy: tensor, dynamic: false} 8 | targets: [Linear] 9 | -------------------------------------------------------------------------------- /tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_dynamic_per_token.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | SmoothQuantModifier: 4 | smoothing_strength: 0.8 5 | GPTQModifier: 6 | ignore: ["lm_head", "re:vision_tower.*", "re:multi_modal_projector.*", "re:visual.*", "re:vision_model.*"] 7 | config_groups: 8 | group_0: 9 | weights: {num_bits: 8, type: int, symmetric: true, strategy: channel} 10 | input_activations: {num_bits: 8, type: int, symmetric: true, strategy: token, dynamic: true} 11 | targets: [Linear] 12 | -------------------------------------------------------------------------------- /tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_static_per_tensor_act.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | SmoothQuantModifier: 4 | smoothing_strength: 0.8 5 | GPTQModifier: 6 | ignore: [lm_head] 7 | config_groups: 8 | group_0: 9 | weights: {num_bits: 8, type: int, symmetric: true, strategy: channel} 10 | input_activations: {num_bits: 8, type: int, symmetric: true, strategy: tensor} 11 | targets: [Linear] -------------------------------------------------------------------------------- /tests/e2e/vLLM/recipes/INT8/recipe_int8_tensor_weight_static_per_tensor_act.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | SmoothQuantModifier: 4 | smoothing_strength: 0.8 5 | QuantizationModifier: 6 | 
ignore: [lm_head] 7 | config_groups: 8 | group_0: 9 | weights: {num_bits: 8, type: int, symmetric: true, strategy: tensor} 10 | input_activations: {num_bits: 8, type: int, symmetric: true, strategy: tensor} 11 | targets: [Linear] -------------------------------------------------------------------------------- /tests/e2e/vLLM/recipes/INT8/recipe_w8a8_dynamic_asym.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | SmoothQuantModifier: 4 | smoothing_strength: 0.8 5 | mappings: 6 | - - ['re:.*q_proj', 're:.*k_proj', 're:.*v_proj'] 7 | - re:.*input_layernorm 8 | - - ['re:.*gate_proj', 're:.*up_proj'] 9 | - re:.*post_attention_layernorm 10 | GPTQModifier: 11 | ignore: [lm_head] 12 | config_groups: 13 | group_0: 14 | weights: {num_bits: 8, type: int, symmetric: true, strategy: channel} 15 | input_activations: {num_bits: 8, symmetric: false, dynamic: true, strategy: token, type: int} 16 | targets: [Linear] 17 | -------------------------------------------------------------------------------- /tests/e2e/vLLM/recipes/INT8/recipe_w8a8_static_asym.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | SmoothQuantModifier: 4 | smoothing_strength: 0.8 5 | GPTQModifier: 6 | ignore: [lm_head] 7 | config_groups: 8 | group_0: 9 | weights: {num_bits: 8, type: int, symmetric: true, strategy: channel} 10 | input_activations: {num_bits: 8, symmetric: false, dynamic: false, strategy: tensor, type: int} 11 | targets: [Linear] 12 | -------------------------------------------------------------------------------- /tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4.yaml: -------------------------------------------------------------------------------- 1 | sparsity_stage: 2 | sparsity_modifiers: 3 | SparseGPTModifier: 4 | sparsity: 0.5 5 | mask_structure: "2:4" 6 | targets: ["Linear"] 7 | ignore: ["re:.*lm_head"] 8 | -------------------------------------------------------------------------------- /tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4_fp8_dynamic.yaml: -------------------------------------------------------------------------------- 1 | sparsity_stage: 2 | run_type: oneshot 3 | sparsity_modifiers: 4 | SparseGPTModifier: 5 | sparsity: 0.5 6 | mask_structure: "2:4" 7 | targets: ["Linear"] 8 | ignore: ["re:.*lm_head"] 9 | quantization_stage: 10 | run_type: oneshot 11 | quantization_modifiers: 12 | QuantizationModifier: 13 | targets: ["Linear"] 14 | ignore: ["lm_head"] 15 | scheme: "FP8_DYNAMIC" 16 | -------------------------------------------------------------------------------- /tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_channel_quant.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | GPTQModifier: 4 | ignore: [lm_head] 5 | config_groups: 6 | group_0: 7 | weights: {num_bits: 4, type: int, symmetric: true, strategy: channel, dynamic: false} 8 | targets: [Linear] 9 | -------------------------------------------------------------------------------- /tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_asym_awq.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | AWQModifier: 4 | ignore: [lm_head] 5 | config_groups: 6 | group_0: 7 | weights: {num_bits: 4, type: int, symmetric: false, strategy: "group", group_size: 128} 8 | targets: [Linear] 9 | 
-------------------------------------------------------------------------------- /tests/e2e/vLLM/recipes/WNA16/recipe_w8a16_channel_quant.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | GPTQModifier: 4 | ignore: [lm_head] 5 | config_groups: 6 | group_0: 7 | weights: {num_bits: 8, type: int, symmetric: true, strategy: channel, dynamic: false} 8 | targets: [Linear] 9 | -------------------------------------------------------------------------------- /tests/e2e/vLLM/recipes/WNA16_2of4/2of4_w4a16_group-128_recipe.yaml: -------------------------------------------------------------------------------- 1 | sparsity_stage: 2 | run_type: oneshot 3 | sparsity_modifiers: 4 | SparseGPTModifier: 5 | sparsity: 0.5 6 | mask_structure: "2:4" 7 | targets: ["Linear"] 8 | ignore: ["re:.*lm_head"] 9 | quantization_stage: 10 | run_type: oneshot 11 | quantization_modifiers: 12 | GPTQModifier: 13 | ignore: ["lm_head"] 14 | config_groups: 15 | group_0: 16 | weights: 17 | num_bits: 4 18 | type: "int" 19 | symmetric: true 20 | strategy: "group" 21 | group_size: 128 22 | targets: ["Linear"] 23 | -------------------------------------------------------------------------------- /tests/e2e/vLLM/recipes/WNA16_2of4/2of4_w4a16_recipe.yaml: -------------------------------------------------------------------------------- 1 | sparsity_stage: 2 | run_type: oneshot 3 | sparsity_modifiers: 4 | SparseGPTModifier: 5 | sparsity: 0.5 6 | mask_structure: "2:4" 7 | targets: ["Linear"] 8 | ignore: ["re:.*lm_head"] 9 | quantization_stage: 10 | run_type: oneshot 11 | quantization_modifiers: 12 | GPTQModifier: 13 | ignore: ["lm_head"] 14 | config_groups: 15 | group_0: 16 | weights: 17 | num_bits: 4 18 | type: "int" 19 | symmetric: true 20 | strategy: "channel" 21 | targets: ["Linear"] 22 | -------------------------------------------------------------------------------- /tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_group.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | GPTQModifier: 4 | ignore: ["lm_head"] 5 | config_groups: 6 | group_0: 7 | weights: 8 | num_bits: 4 9 | type: "int" 10 | symmetric: true 11 | strategy: "group" 12 | group_size: 128 13 | actorder: "group" 14 | targets: ["Linear"] 15 | -------------------------------------------------------------------------------- /tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | GPTQModifier: 4 | ignore: ["lm_head", "re:vision_tower.*", "re:multi_modal_projector.*", "re:visual.*", "re:vision_model.*"] 5 | config_groups: 6 | group_0: 7 | weights: 8 | num_bits: 4 9 | type: "int" 10 | symmetric: true 11 | strategy: "group" 12 | group_size: 128 13 | actorder: "weight" 14 | targets: ["Linear"] 15 | -------------------------------------------------------------------------------- /tests/e2e/vLLM/recipes/kv_cache/default.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | QuantizationModifier: 4 | kv_cache_scheme: 5 | {num_bits: 8, type: float, symmetric: true, strategy: tensor} 6 | -------------------------------------------------------------------------------- /tests/e2e/vLLM/recipes/kv_cache/gptq.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 
3 | GPTQModifier: 4 | sequential_update: false 5 | ignore: ["lm_head"] 6 | config_groups: 7 | group_0: 8 | weights: 9 | num_bits: 4 10 | type: "int" 11 | symmetric: true 12 | strategy: "channel" 13 | actorder: False 14 | targets: ["Linear"] 15 | kv_cache_scheme: 16 | {num_bits: 8, type: float, symmetric: true, strategy: tensor} -------------------------------------------------------------------------------- /tests/e2e/vLLM/run_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SUCCESS=0 4 | 5 | while getopts "c:t:" OPT; do 6 | case ${OPT} in 7 | c ) 8 | CONFIG="$OPTARG" 9 | ;; 10 | t ) 11 | TEST="$OPTARG" 12 | ;; 13 | \? ) 14 | exit 1 15 | ;; 16 | esac 17 | done 18 | 19 | # Parse list of configs. 20 | for MODEL_CONFIG in "$CONFIG"/* 21 | do 22 | LOCAL_SUCCESS=0 23 | 24 | echo "=== RUNNING MODEL: $MODEL_CONFIG ===" 25 | 26 | export TEST_DATA_FILE="$MODEL_CONFIG" 27 | pytest \ 28 | --capture=tee-sys \ 29 | "$TEST" || LOCAL_SUCCESS=$? 30 | 31 | if [[ $LOCAL_SUCCESS == 0 ]]; then 32 | echo "=== PASSED MODEL: $MODEL_CONFIG ===" 33 | else 34 | echo "=== FAILED MODEL: $MODEL_CONFIG ===" 35 | fi 36 | 37 | SUCCESS=$((SUCCESS + LOCAL_SUCCESS)) 38 | 39 | done 40 | 41 | exit "$SUCCESS" 42 | -------------------------------------------------------------------------------- /tests/e2e/vLLM/skipped_configs/fp4_nvfp4a16.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | scheme: NVFP4A16 -------------------------------------------------------------------------------- /tests/examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/examples/__init__.py -------------------------------------------------------------------------------- /tests/examples/test_compressed_inference.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | from tests.examples.utils import ( 6 | copy_and_run_script, 7 | gen_cmd_fail_message, 8 | requires_gpu_count, 9 | ) 10 | 11 | 12 | @pytest.fixture 13 | def example_dir() -> str: 14 | return "examples/compressed_inference" 15 | 16 | 17 | @pytest.mark.example 18 | @requires_gpu_count(1) 19 | class TestCompressedInference: 20 | """ 21 | Tests for examples in the "compressed_inference" example folder. 22 | """ 23 | 24 | def test_fp8_example_script(self, example_dir: str, tmp_path: Path): 25 | """ 26 | Test for the "fp8_compressed_inference.py" script in the folder. 
27 | """ 28 | script_filename = "fp8_compressed_inference.py" 29 | command, result = copy_and_run_script(tmp_path, example_dir, script_filename) 30 | 31 | assert result.returncode == 0, gen_cmd_fail_message(command, result) 32 | -------------------------------------------------------------------------------- /tests/examples/test_quantization_kv_cache.py: -------------------------------------------------------------------------------- 1 | import shlex 2 | from pathlib import Path 3 | 4 | import pytest 5 | 6 | from tests.examples.utils import ( 7 | ReadMe, 8 | copy_and_run_command, 9 | gen_cmd_fail_message, 10 | requires_gpu_count, 11 | ) 12 | 13 | 14 | @pytest.fixture 15 | def example_dir() -> str: 16 | return "examples/quantization_kv_cache" 17 | 18 | 19 | @pytest.mark.example 20 | @requires_gpu_count(1) 21 | class TestQuantizationKVCache: 22 | """ 23 | Tests for examples in the "quantization_kv_cache" example folder. 24 | """ 25 | 26 | def test_doc_example_command(self, example_dir: str, tmp_path: Path): 27 | """ 28 | Test for the example command in the README. 29 | """ 30 | readme_path = Path.cwd() / example_dir / "README.md" 31 | readme = ReadMe(readme_path) 32 | 33 | command = readme.get_code_block_content(position=2, lang="shell") 34 | assert command.startswith("python") 35 | 36 | command = shlex.split(command) 37 | result = copy_and_run_command(tmp_path, example_dir, command) 38 | 39 | assert result.returncode == 0, gen_cmd_fail_message(command, result) 40 | -------------------------------------------------------------------------------- /tests/examples/test_quantization_w4a16.py: -------------------------------------------------------------------------------- 1 | import shlex 2 | from pathlib import Path 3 | 4 | import pytest 5 | 6 | from tests.examples.utils import ( 7 | ReadMe, 8 | copy_and_run_command, 9 | gen_cmd_fail_message, 10 | requires_gpu_count, 11 | ) 12 | 13 | 14 | @pytest.fixture 15 | def example_dir() -> str: 16 | return "examples/quantization_w4a16" 17 | 18 | 19 | @pytest.mark.example 20 | @requires_gpu_count(1) 21 | class TestQuantizationW4A16: 22 | """ 23 | Tests for examples in the "quantization_w4a16" example folder. 24 | """ 25 | 26 | def test_doc_example_command(self, example_dir: str, tmp_path: Path): 27 | """ 28 | Test for the example command in the README. 29 | """ 30 | readme_path = Path.cwd() / example_dir / "README.md" 31 | readme = ReadMe(readme_path) 32 | 33 | command = readme.get_code_block_content(position=2, lang="shell") 34 | assert command.startswith("python") 35 | 36 | command = shlex.split(command) 37 | result = copy_and_run_command(tmp_path, example_dir, command) 38 | 39 | assert result.returncode == 0, gen_cmd_fail_message(command, result) 40 | -------------------------------------------------------------------------------- /tests/examples/test_quantization_w8a8_fp8.py: -------------------------------------------------------------------------------- 1 | import shlex 2 | from pathlib import Path 3 | 4 | import pytest 5 | 6 | from tests.examples.utils import ( 7 | ReadMe, 8 | copy_and_run_command, 9 | copy_and_run_script, 10 | gen_cmd_fail_message, 11 | requires_gpu_count, 12 | ) 13 | 14 | 15 | @pytest.fixture 16 | def example_dir() -> str: 17 | return "examples/quantization_w8a8_fp8" 18 | 19 | 20 | @pytest.mark.example 21 | @requires_gpu_count(1) 22 | class TestQuantizationW8A8_FP8: 23 | """ 24 | Tests for examples in the "quantization_w8a8_fp8" example folder. 
25 | """ 26 | 27 | def test_doc_example_command(self, example_dir: str, tmp_path: Path): 28 | """ 29 | Test for the example command in the README. 30 | """ 31 | readme_path = Path.cwd() / example_dir / "README.md" 32 | readme = ReadMe(readme_path) 33 | 34 | command = readme.get_code_block_content(position=2, lang="shell") 35 | assert command.startswith("python") 36 | 37 | command = shlex.split(command) 38 | result = copy_and_run_command(tmp_path, example_dir, command) 39 | 40 | assert result.returncode == 0, gen_cmd_fail_message(command, result) 41 | 42 | def test_gemma2_example_script(self, example_dir: str, tmp_path: Path): 43 | """ 44 | Test for the "gemma2_example.py" script in the folder. 45 | """ 46 | script_filename = "gemma2_example.py" 47 | command, result = copy_and_run_script(tmp_path, example_dir, script_filename) 48 | 49 | assert result.returncode == 0, gen_cmd_fail_message(command, result) 50 | -------------------------------------------------------------------------------- /tests/examples/test_quantization_w8a8_int8.py: -------------------------------------------------------------------------------- 1 | import shlex 2 | from pathlib import Path 3 | 4 | import pytest 5 | 6 | from tests.examples.utils import ( 7 | ReadMe, 8 | copy_and_run_command, 9 | copy_and_run_script, 10 | gen_cmd_fail_message, 11 | requires_gpu_count, 12 | ) 13 | 14 | 15 | @pytest.fixture 16 | def example_dir() -> str: 17 | return "examples/quantization_w8a8_int8" 18 | 19 | 20 | @pytest.mark.example 21 | @requires_gpu_count(1) 22 | class TestQuantizationW8A8_Int8: 23 | """ 24 | Tests for examples in the "quantization_w8a8_int8" example folder. 25 | """ 26 | 27 | def test_doc_example_command(self, example_dir: str, tmp_path: Path): 28 | """ 29 | Test for the example command in the README. 30 | """ 31 | readme_path = Path.cwd() / example_dir / "README.md" 32 | readme = ReadMe(readme_path) 33 | 34 | command = readme.get_code_block_content(position=2, lang="shell") 35 | assert command.startswith("python") 36 | 37 | command = shlex.split(command) 38 | result = copy_and_run_command(tmp_path, example_dir, command) 39 | 40 | assert result.returncode == 0, gen_cmd_fail_message(command, result) 41 | 42 | def test_gemma2_example_script(self, example_dir: str, tmp_path: Path): 43 | """ 44 | Test for the "gemma2_example.py" script in the folder. 45 | """ 46 | script_filename = "gemma2_example.py" 47 | command, result = copy_and_run_script(tmp_path, example_dir, script_filename) 48 | 49 | assert result.returncode == 0, gen_cmd_fail_message(command, result) 50 | -------------------------------------------------------------------------------- /tests/examples/test_quantizing_moe.py: -------------------------------------------------------------------------------- 1 | import shlex 2 | from pathlib import Path 3 | 4 | import pytest 5 | 6 | from tests.examples.utils import ( 7 | ReadMe, 8 | copy_and_run_command, 9 | copy_and_run_script, 10 | gen_cmd_fail_message, 11 | requires_gpu_count, 12 | ) 13 | 14 | 15 | @pytest.fixture 16 | def example_dir() -> str: 17 | return "examples/quantizing_moe" 18 | 19 | 20 | @pytest.mark.example 21 | class TestQuantizingMOE: 22 | """ 23 | Tests for examples in the "quantizing_moe" example folder. 24 | """ 25 | 26 | @pytest.mark.multi_gpu 27 | @requires_gpu_count(2) 28 | def test_doc_example_command(self, example_dir: str, tmp_path: Path): 29 | """ 30 | Test for the example command in the README. 
31 | """ 32 | readme_path = Path.cwd() / example_dir / "README.md" 33 | readme = ReadMe(readme_path) 34 | 35 | command = readme.get_code_block_content(position=2, lang="shell") 36 | assert command.startswith("python") 37 | 38 | command = shlex.split(command) 39 | result = copy_and_run_command(tmp_path, example_dir, command) 40 | 41 | assert result.returncode == 0, gen_cmd_fail_message(command, result) 42 | 43 | @pytest.mark.parametrize( 44 | "script_filename", 45 | [ 46 | pytest.param( 47 | "deepseek_moe_w4a16.py", 48 | marks=[ 49 | pytest.mark.multi_gpu, 50 | pytest.mark.skip(reason="exceptionally long run time"), 51 | ], 52 | ), 53 | pytest.param("deepseek_moe_w8a8_fp8.py"), 54 | pytest.param("deepseek_moe_w8a8_int8.py", marks=pytest.mark.multi_gpu), 55 | ], 56 | ) 57 | def test_deepseek_example_script( 58 | self, script_filename: str, example_dir: str, tmp_path: Path 59 | ): 60 | """ 61 | Test for the other example scripts in the folder. 62 | """ 63 | command, result = copy_and_run_script(tmp_path, example_dir, script_filename) 64 | 65 | assert result.returncode == 0, gen_cmd_fail_message(command, result) 66 | -------------------------------------------------------------------------------- /tests/examples/test_sparse_2of4_quantization_fp8.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | from tests.examples.utils import ( 6 | copy_and_run_script, 7 | gen_cmd_fail_message, 8 | requires_gpu_count, 9 | ) 10 | 11 | 12 | @pytest.fixture 13 | def example_dir() -> str: 14 | return "examples/sparse_2of4_quantization_fp8" 15 | 16 | 17 | @requires_gpu_count(1) 18 | class TestSparse2of4QuantizationFP8: 19 | """ 20 | Tests for examples in the "sparse_2of4_quantization_fp8" example folder. 21 | """ 22 | 23 | @pytest.mark.parametrize(("flags"), [[], ["--fp8"]]) 24 | def test_2of4_example_script( 25 | self, example_dir: str, tmp_path: Path, flags: list[str] 26 | ): 27 | """ 28 | Tests for the "llama3_8b_2of4.py" example script. 29 | """ 30 | script_filename = "llama3_8b_2of4.py" 31 | command, result = copy_and_run_script( 32 | tmp_path, example_dir, script_filename, flags=flags 33 | ) 34 | 35 | assert result.returncode == 0, gen_cmd_fail_message(command, result) 36 | -------------------------------------------------------------------------------- /tests/examples/test_trl_mixin.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | from tests.examples.utils import ( 6 | copy_and_run_script, 7 | gen_cmd_fail_message, 8 | requires_gpu_count, 9 | ) 10 | 11 | 12 | @pytest.fixture 13 | def example_dir() -> str: 14 | return "examples/trl_mixin" 15 | 16 | 17 | @pytest.mark.example 18 | @requires_gpu_count(1) 19 | class TestTRLMixin: 20 | """ 21 | Tests for examples in the "trl_mixin" example folder. 22 | """ 23 | 24 | @pytest.mark.parametrize( 25 | "script_filename", 26 | [ 27 | "ex_trl_constant.py", 28 | # ex_trl_distillation.py hits CUDA OOM on 1x H100 (80 GiB VRAM) 29 | pytest.param("ex_trl_distillation.py", marks=pytest.mark.multi_gpu), 30 | ], 31 | ) 32 | def test_example_scripts( 33 | self, example_dir: str, script_filename: str, tmp_path: Path 34 | ): 35 | """ 36 | Test for the example scripts in the folder. 
37 | """ 38 | command, result = copy_and_run_script(tmp_path, example_dir, script_filename) 39 | 40 | assert result.returncode == 0, gen_cmd_fail_message(command, result) 41 | -------------------------------------------------------------------------------- /tests/llmcompressor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/metrics/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/metrics/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/metrics/utils/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/modifiers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/modifiers/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/modifiers/awq/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/modifiers/awq/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/modifiers/calibration/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/modifiers/calibration/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/modifiers/calibration/test_frozen.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, 10 | # software distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from compressed_tensors.quantization.lifecycle.initialize import ( 16 | initialize_module_for_quantization, 17 | ) 18 | from compressed_tensors.quantization.quant_args import QuantizationArgs 19 | from compressed_tensors.quantization.quant_config import QuantizationStatus 20 | from compressed_tensors.quantization.quant_scheme import QuantizationScheme 21 | from torch.nn import Linear 22 | 23 | from llmcompressor.modifiers.quantization.calibration import ( 24 | freeze_module_quantization, 25 | initialize_observer, 26 | ) 27 | 28 | 29 | def test_set_module_for_calibration(): 30 | num_bits = 8 31 | quantization_scheme = QuantizationScheme( 32 | targets=["*"], 33 | weights=QuantizationArgs(num_bits=num_bits, symmetric=True), 34 | input_activations=QuantizationArgs(num_bits=num_bits, symmetric=False), 35 | ) 36 | 37 | layer = Linear(4, 4) 38 | 39 | initialize_module_for_quantization(layer, quantization_scheme) 40 | layer.quantization_status = QuantizationStatus("calibration") 41 | initialize_observer(layer, "weight") 42 | 43 | # should have a weight observer after initializing 44 | assert hasattr(layer, "weight_observer") 45 | 46 | # observers should get deleted after freezing 47 | freeze_module_quantization(layer) 48 | assert not hasattr(layer, "input_observer") 49 | assert not hasattr(layer, "weight_observer") 50 | 51 | assert layer.quantization_status == QuantizationStatus("frozen") 52 | -------------------------------------------------------------------------------- /tests/llmcompressor/modifiers/calibration/test_observers.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | from compressed_tensors.quantization import ( 4 | QuantizationArgs, 5 | QuantizationScheme, 6 | initialize_module_for_quantization, 7 | ) 8 | 9 | from llmcompressor.modifiers.quantization.calibration import initialize_observer 10 | 11 | 12 | @pytest.mark.parametrize( 13 | "shape,group_size,actorder", 14 | [ 15 | ((1, 1), None, False), 16 | ((1, 1), 128, False), 17 | ((1, 1), 128, True), 18 | ((64, 64), None, False), 19 | ((64, 64), 128, False), 20 | ((64, 64), 128, True), 21 | ((1792, 4096), None, False), 22 | ((1792, 4096), 128, False), 23 | ((1792, 4096), 128, True), 24 | ((3420, 64), None, False), 25 | ((3420, 64), 128, False), 26 | ((3420, 64), 128, True), 27 | ], 28 | ) 29 | def test_observers_update(shape, group_size, actorder): 30 | module = torch.nn.Linear(*shape) 31 | scheme = QuantizationScheme( 32 | targets=["Linear"], 33 | weights=QuantizationArgs(group_size=group_size, actorder=actorder), 34 | input_activations=QuantizationArgs(), 35 | output_activations=QuantizationArgs(), 36 | ) 37 | 38 | input = torch.empty(module.in_features, dtype=module.weight.dtype) 39 | output = torch.empty(module.out_features, dtype=module.weight.dtype) 40 | 41 | initialize_module_for_quantization(module, scheme) 42 | initialize_observer(module, "weight") 43 | initialize_observer(module, "input") 44 | initialize_observer(module, "output") 45 | 46 | for location, value in ( 47 | ("weight", module.weight), 48 | ("input", input), 49 | ("output", output), 50 | ): 51 | observer = getattr(module, f"{location}_observer") 52 | g_idx = getattr(module, "g_idx", None) 53 | updated_scale, updated_zero_point = observer(value, g_idx=g_idx) 54 | 55 | assert_alike(updated_scale, getattr(module, f"{location}_scale")) 56 | assert_alike(updated_zero_point, getattr(module, f"{location}_zero_point")) 57 | 58 | 59 | def assert_alike(a, b): 60 | assert a.dtype ==
b.dtype 61 | assert a.shape == b.shape 62 | -------------------------------------------------------------------------------- /tests/llmcompressor/modifiers/conf.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock 2 | 3 | from torch.utils.data import DataLoader 4 | 5 | from llmcompressor.core import Event, EventType, State 6 | from llmcompressor.modifiers.factory import ModifierFactory 7 | 8 | 9 | def setup_modifier_factory(): 10 | ModifierFactory.refresh() 11 | assert ModifierFactory._loaded, "ModifierFactory not loaded" 12 | 13 | 14 | class LifecyleTestingHarness: 15 | def __init__( 16 | self, 17 | model=None, 18 | optimizer=None, 19 | device="cpu", 20 | start=0, 21 | ): 22 | self.state = State() 23 | self.state.update( 24 | model=model, 25 | device=device, 26 | optimizer=optimizer, 27 | start=start, 28 | steps_per_epoch=1, 29 | calib_data=DataLoader(MagicMock(__len__=lambda _: 0, column_names=[])), 30 | ) 31 | 32 | def update_modifier(self, modifier, event_type): 33 | event = Event(event_type=event_type) 34 | modifier.update_event(self.state, event=event) 35 | 36 | def get_state(self): 37 | return self.state 38 | 39 | def trigger_modifier_for_epochs(self, modifier, num_epochs): 40 | for _ in range(num_epochs): 41 | self.update_modifier(modifier, EventType.BATCH_START) 42 | self.update_modifier(modifier, EventType.LOSS_CALCULATED) 43 | self.update_modifier(modifier, EventType.OPTIM_PRE_STEP) 44 | self.update_modifier(modifier, EventType.OPTIM_POST_STEP) 45 | self.update_modifier(modifier, EventType.BATCH_END) 46 | -------------------------------------------------------------------------------- /tests/llmcompressor/modifiers/logarithmic_equalization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/modifiers/logarithmic_equalization/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/modifiers/logarithmic_equalization/test_base.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pytest 4 | 5 | from llmcompressor.modifiers.factory import ModifierFactory 6 | from llmcompressor.modifiers.logarithmic_equalization.base import ( 7 | LogarithmicEqualizationModifier, 8 | ) 9 | from llmcompressor.modifiers.smoothquant.base import SmoothQuantModifier 10 | from tests.llmcompressor.modifiers.conf import setup_modifier_factory 11 | 12 | 13 | @pytest.mark.unit 14 | class TestLogarithmicEqualizationIsRegistered(unittest.TestCase): 15 | def setUp(self): 16 | self.kwargs = dict( 17 | smoothing_strength=0.3, 18 | mappings=[(["layer1", "layer2"], "layer3")], 19 | ) 20 | setup_modifier_factory() 21 | 22 | def test_log_equalization_is_registered(self): 23 | modifier = ModifierFactory.create( 24 | type_="LogarithmicEqualizationModifier", 25 | allow_experimental=False, 26 | allow_registered=True, 27 | **self.kwargs, 28 | ) 29 | 30 | self.assertIsInstance( 31 | modifier, 32 | LogarithmicEqualizationModifier, 33 | "PyTorch LogarithmicEqualizationModifier not registered", 34 | ) 35 | 36 | self.assertIsInstance(modifier, SmoothQuantModifier) 37 | self.assertEqual(modifier.smoothing_strength, self.kwargs["smoothing_strength"]) 38 | self.assertEqual(modifier.mappings, self.kwargs["mappings"]) 39 | 
-------------------------------------------------------------------------------- /tests/llmcompressor/modifiers/pruning/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/modifiers/pruning/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/modifiers/pruning/sparsegpt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/modifiers/pruning/sparsegpt/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/modifiers/pruning/sparsegpt/test_base.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pytest 4 | 5 | from llmcompressor.modifiers.factory import ModifierFactory 6 | from llmcompressor.modifiers.obcq.base import SparseGPTModifier 7 | from tests.llmcompressor.modifiers.conf import setup_modifier_factory 8 | 9 | 10 | @pytest.mark.unit 11 | class TestSparseGPTIsRegistered(unittest.TestCase): 12 | def setUp(self): 13 | self.kwargs = dict( 14 | sparsity=0.5, 15 | targets="__ALL_PRUNABLE__", 16 | ) 17 | setup_modifier_factory() 18 | 19 | def test_sparsegpt_is_registered(self): 20 | type_ = ModifierFactory.create( 21 | type_="SparseGPTModifier", 22 | allow_experimental=False, 23 | allow_registered=True, 24 | **self.kwargs, 25 | ) 26 | 27 | self.assertIsInstance( 28 | type_, 29 | SparseGPTModifier, 30 | "PyTorch SparseGPTModifier not registered", 31 | ) 32 | -------------------------------------------------------------------------------- /tests/llmcompressor/modifiers/pruning/wanda/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/modifiers/pruning/wanda/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/modifiers/pruning/wanda/test_base.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pytest 4 | 5 | from llmcompressor.modifiers.factory import ModifierFactory 6 | from llmcompressor.modifiers.pruning.wanda.base import WandaPruningModifier 7 | from tests.llmcompressor.modifiers.conf import setup_modifier_factory 8 | 9 | 10 | @pytest.mark.unit 11 | class TestWandaIsRegistered(unittest.TestCase): 12 | def setUp(self): 13 | self.kwargs = dict( 14 | sparsity=0.5, 15 | targets="__ALL_PRUNABLE__", 16 | ) 17 | setup_modifier_factory() 18 | 19 | def test_wanda_is_registered(self): 20 | type_ = ModifierFactory.create( 21 | type_="WandaPruningModifier", 22 | allow_experimental=False, 23 | allow_registered=True, 24 | **self.kwargs, 25 | ) 26 | 27 | self.assertIsInstance( 28 | type_, 29 | WandaPruningModifier, 30 | "PyTorch WandaPruningModifier not registered", 31 | ) 32 | -------------------------------------------------------------------------------- /tests/llmcompressor/modifiers/quantization/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/modifiers/quantization/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/modifiers/smoothquant/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/modifiers/smoothquant/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/modifiers/smoothquant/test_base.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pytest 4 | 5 | from llmcompressor.modifiers.factory import ModifierFactory 6 | from llmcompressor.modifiers.smoothquant.base import SmoothQuantModifier 7 | from tests.llmcompressor.modifiers.conf import setup_modifier_factory 8 | 9 | 10 | @pytest.mark.unit 11 | class TestSmoothQuantIsRegistered(unittest.TestCase): 12 | def setUp(self): 13 | self.kwargs = dict( 14 | smoothing_strength=0.3, 15 | mappings=[(["layer1", "layer2"], "layer3")], 16 | ) 17 | setup_modifier_factory() 18 | 19 | def test_smooth_quant_is_registered(self): 20 | modifier = ModifierFactory.create( 21 | type_="SmoothQuantModifier", 22 | allow_experimental=False, 23 | allow_registered=True, 24 | **self.kwargs, 25 | ) 26 | 27 | self.assertIsInstance( 28 | modifier, 29 | SmoothQuantModifier, 30 | "PyTorch SmoothQuant not registered", 31 | ) 32 | 33 | self.assertEqual(modifier.smoothing_strength, self.kwargs["smoothing_strength"]) 34 | self.assertEqual(modifier.mappings, self.kwargs["mappings"]) 35 | 36 | 37 | @pytest.mark.unit 38 | class TestSmoothQuantDefaults(unittest.TestCase): 39 | def setUp(self): 40 | setup_modifier_factory() 41 | 42 | def test_defaults(self): 43 | default_sq = SmoothQuantModifier() 44 | assert default_sq.smoothing_strength == 0.5 45 | 46 | def test_override_defaults(self): 47 | strength = 0.7 48 | dummy_map = [(["layer1", "layer2"], "layer3")] 49 | non_default_sq = SmoothQuantModifier( 50 | smoothing_strength=strength, mappings=dummy_map 51 | ) 52 | 53 | assert non_default_sq.smoothing_strength == strength 54 | assert non_default_sq.mappings == dummy_map 55 | -------------------------------------------------------------------------------- /tests/llmcompressor/modifiers/smoothquant/test_utils.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | 3 | import pytest 4 | 5 | from llmcompressor.modifiers.smoothquant.utils import ( 6 | get_layer_mappings_from_architecture, 7 | handle_mapping_resolution_errors, 8 | ) 9 | 10 | smoothquant_utils = "llmcompressor.modifiers.smoothquant.utils" 11 | 12 | 13 | @pytest.mark.unit 14 | def test_handle_mapping_resolution_errors(): 15 | README_LOCATION = ( 16 | "https://github.com/vllm-project/llm-compressor/tree/main/" 17 | "src/llmcompressor/modifiers/smoothquant" 18 | ) 19 | 20 | @handle_mapping_resolution_errors 21 | def func_that_raises_exception(): 22 | raise ValueError("An error occurred") 23 | 24 | with pytest.raises(RuntimeError) as excinfo: 25 | func_that_raises_exception() 26 | 27 | assert "Error resolving mappings for given architecture." 
in str(excinfo.value) 28 | assert "Please refer to the README at" in str(excinfo.value) 29 | assert README_LOCATION in str(excinfo.value) 30 | 31 | 32 | @pytest.mark.unit 33 | @patch( 34 | f"{smoothquant_utils}.MAPPINGS_REGISTRY", {"arch1": "mapping1", "arch2": "mapping2"} 35 | ) 36 | @patch(f"{smoothquant_utils}.DEFAULT_SMOOTHQUANT_MAPPINGS", "default_mapping") 37 | def test_get_layer_mappings_from_architecture(): 38 | # Test when architecture is in MAPPINGS_REGISTRY 39 | assert get_layer_mappings_from_architecture("arch1") == "mapping1" 40 | 41 | # Test when architecture is not in MAPPINGS_REGISTRY 42 | assert get_layer_mappings_from_architecture("arch3") == "default_mapping" 43 | -------------------------------------------------------------------------------- /tests/llmcompressor/observers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, 10 | # software distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tests/llmcompressor/observers/test_mse.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, 10 | # software distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | import pytest 17 | import torch 18 | from compressed_tensors.quantization.quant_args import QuantizationArgs 19 | 20 | from llmcompressor.observers import MovingAverageMSEObserver, Observer 21 | 22 | 23 | @pytest.mark.parametrize( 24 | "symmetric,expected_scale,expected_zero_point", 25 | [ 26 | (True, 0.0078, 0), 27 | (False, 0.0039, -128), 28 | ], 29 | ) 30 | def test_mse_observer(symmetric, expected_scale, expected_zero_point): 31 | tensor = torch.tensor([1.0, 1.0, 1.0, 1.0, 1.0]) 32 | num_bits = 8 33 | weights = QuantizationArgs(num_bits=num_bits, symmetric=symmetric, observer="mse") 34 | 35 | observer = weights.observer 36 | observer = Observer.load_from_registry(observer, quantization_args=weights) 37 | scale, zero_point = observer(tensor) 38 | 39 | assert isinstance(observer, MovingAverageMSEObserver) 40 | assert round(scale.item(), 4) == expected_scale 41 | assert round(zero_point.item(), 4) == expected_zero_point 42 | 43 | 44 | def test_mse_observer_symmetric_scale_range(): 45 | tensor = torch.rand(4, 4) 46 | tensor *= 127 47 | 48 | num_bits = 8 49 | weights = QuantizationArgs(num_bits=num_bits, symmetric=True, observer="mse") 50 | 51 | observer = weights.observer 52 | observer = Observer.load_from_registry(observer, quantization_args=weights) 53 | scale, zero_point = observer(tensor) 54 | 55 | # if symmetric, max symmetric_range = abs(-128) / 255 56 | assert round(scale.item(), 4) <= 1.0039 57 | assert round(zero_point.item(), 4) == 0 58 | -------------------------------------------------------------------------------- /tests/llmcompressor/pipelines/sequential/test_helpers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from llmcompressor.pipelines.sequential.helpers import get_sequential_ancestors 4 | 5 | 6 | class DummyModel(torch.nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | self.seq = torch.nn.Sequential(torch.nn.Linear(10, 20), torch.nn.ReLU()) 10 | self.fc = torch.nn.Linear(20, 5) 11 | 12 | def forward(self, x): 13 | x = self.seq(x) 14 | return self.fc(x) 15 | 16 | 17 | def test_get_sequential_ancestors(): 18 | model = DummyModel() 19 | 20 | assert get_sequential_ancestors(model, set()) == set() 21 | assert get_sequential_ancestors(model, {model}) == set() 22 | assert get_sequential_ancestors(model, {model.fc}) == {model} 23 | assert get_sequential_ancestors(model, {model.seq[0]}) == {model, model.seq} 24 | assert get_sequential_ancestors(model, {model.seq[1]}) == {model, model.seq} 25 | -------------------------------------------------------------------------------- /tests/llmcompressor/pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | -------------------------------------------------------------------------------- /tests/llmcompressor/pytorch/modifiers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/pytorch/modifiers/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/pytorch/modifiers/logarithmic_equalization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/pytorch/modifiers/logarithmic_equalization/__init__.py 
-------------------------------------------------------------------------------- /tests/llmcompressor/pytorch/modifiers/logarithmic_equalization/test_pytorch.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pytest 4 | from torch.nn import Linear 5 | 6 | from llmcompressor.core import State 7 | from llmcompressor.modifiers.logarithmic_equalization import ( 8 | LogarithmicEqualizationModifier, 9 | ) 10 | from tests.llmcompressor.pytorch.helpers import LinearNet 11 | 12 | 13 | @pytest.mark.unit 14 | class TestLogEqualizationMapping(unittest.TestCase): 15 | def setUp(self): 16 | self.model = LinearNet() 17 | self.state = State(model=self.model) 18 | 19 | def test_successful_map(self): 20 | mappings = [(["seq.fc2"], "seq.block1.fc1")] 21 | modifier = LogarithmicEqualizationModifier(mappings=mappings) 22 | 23 | modifier.ignore = [] 24 | modifier.resolved_mappings_ = modifier._resolve_mappings(self.state.model) 25 | 26 | self.assertEqual(len(modifier.resolved_mappings_), len(mappings)) 27 | 28 | mapping = modifier.resolved_mappings_[0] 29 | self.assertEqual(mapping.smooth_name, mappings[0][1]) 30 | self.assertIsInstance(mapping.smooth_layer, Linear) 31 | self.assertIsInstance(mapping.balance_layers[0], Linear) 32 | -------------------------------------------------------------------------------- /tests/llmcompressor/pytorch/modifiers/pruning/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/pytorch/modifiers/pruning/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/pytorch/modifiers/pruning/constant/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/pytorch/modifiers/pruning/constant/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/pytorch/modifiers/pruning/sparsegpt/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, 10 | # software distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tests/llmcompressor/pytorch/modifiers/pruning/wanda/test_pytorch.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pytest 4 | 5 | from llmcompressor.modifiers.factory import ModifierFactory 6 | from tests.llmcompressor.modifiers.conf import setup_modifier_factory 7 | 8 | 9 | @pytest.mark.unit 10 | class TestWandaPytorchIsRegistered(unittest.TestCase): 11 | def setUp(self): 12 | self.kwargs = dict( 13 | sparsity=0.5, 14 | targets="__ALL_PRUNABLE__", 15 | ) 16 | setup_modifier_factory() 17 | 18 | def test_wanda_pytorch_is_registered(self): 19 | from llmcompressor.modifiers.pruning.wanda import WandaPruningModifier 20 | 21 | type_ = ModifierFactory.create( 22 | type_="WandaPruningModifier", 23 | allow_experimental=False, 24 | allow_registered=True, 25 | **self.kwargs, 26 | ) 27 | 28 | self.assertIsInstance( 29 | type_, 30 | WandaPruningModifier, 31 | "PyTorch WandaPruningModifier not registered", 32 | ) 33 | -------------------------------------------------------------------------------- /tests/llmcompressor/pytorch/modifiers/smoothquant/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/pytorch/modifiers/smoothquant/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/pytorch/modifiers/smoothquant/test_pytorch.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pytest 4 | from torch.nn import Linear 5 | 6 | from llmcompressor.core import State 7 | from llmcompressor.modifiers.smoothquant import SmoothQuantModifier 8 | from tests.llmcompressor.pytorch.helpers import LinearNet 9 | 10 | 11 | @pytest.mark.unit 12 | class TestSmoothQuantMapping(unittest.TestCase): 13 | def setUp(self): 14 | self.model = LinearNet() 15 | self.state = State(model=self.model) 16 | 17 | def test_successful_map(self): 18 | mappings = [(["seq.fc1"], "seq.fc2")] 19 | modifier = SmoothQuantModifier(mappings=mappings) 20 | 21 | modifier.ignore = [] 22 | modifier.resolved_mappings_ = modifier._resolve_mappings(self.state.model) 23 | 24 | self.assertEqual(len(modifier.resolved_mappings_), len(mappings)) 25 | 26 | mapping = modifier.resolved_mappings_[0] 27 | self.assertEqual(mapping.smooth_name, mappings[0][1]) 28 | self.assertIsInstance(mapping.smooth_layer, Linear) 29 | self.assertIsInstance(mapping.balance_layers[0], Linear) 30 | -------------------------------------------------------------------------------- /tests/llmcompressor/pytorch/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | -------------------------------------------------------------------------------- /tests/llmcompressor/recipe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/recipe/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/recipe/recipe.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | SmoothQuantModifier: 4 |
smoothing_strength: 0.8 5 | mappings: 6 | - - ['re:.*q_proj', 're:.*k_proj', 're:.*v_proj'] 7 | - re:.*input_layernorm 8 | - - ['re:.*gate_proj', 're:.*up_proj'] 9 | - re:.*post_attention_layernorm 10 | GPTQModifier: 11 | targets: ["Linear"] 12 | ignore: [lm_head] 13 | scheme: W8A8 14 | -------------------------------------------------------------------------------- /tests/llmcompressor/test_sentinel.py: -------------------------------------------------------------------------------- 1 | from llmcompressor.sentinel import Sentinel 2 | 3 | 4 | def test_sentinel(): 5 | assert Sentinel("MISSING") == Sentinel("MISSING") 6 | assert Sentinel("MISSING", "module_one") != Sentinel("MISSING", "module_two") 7 | -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/transformers/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/transformers/compression/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/configs/actorder_group_1.1b.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" 4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_actorder_group.yaml" 5 | ppl_threshold: 20 -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/configs/actorder_weight_1.1b.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" 4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_actorder_weight.yaml" 5 | ppl_threshold: 20 -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/configs/channelwise_1.1b.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" 4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_channel.yaml" 5 | ppl_threshold: 20 -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/configs/channelwise_15m.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "regression" 3 | model_stub: "nm-testing/llama2.c-stories15M" 4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_channel.yaml" -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/configs/fp8_1.1b.yaml: 
-------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" 4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_fp8.yaml" 5 | ppl_threshold: 20 -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/configs/fp8_15m.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "regression" 3 | model_stub: "nm-testing/llama2.c-stories15M" 4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_fp8.yaml" -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/configs/group_1.1b.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" 4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_group.yaml" 5 | ppl_threshold: 20 -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/configs/inputs_1.1b.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" 4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_full.yaml" 5 | ppl_threshold: 20 -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/configs/inputs_15m.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "regression" 3 | model_stub: "nm-testing/llama2.c-stories15M" 4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_full.yaml" -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/configs/weights_only_1.1b.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" 4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_weight.yaml" -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/configs/weights_only_15m.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "regression" 3 | model_stub: "nm-testing/llama2.c-stories15M" 4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_weight.yaml" -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/decompression_configs/fp8_dynamic.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "regression" 3 | compressed_model_stub: "nm-testing/tinyllama-fp8-dynamic-compressed" 4 | skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" -------------------------------------------------------------------------------- 
/tests/llmcompressor/transformers/compression/decompression_configs/w4a16.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | compressed_model_stub: "nm-testing/tinyllama-w4a16-compressed" 4 | skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/decompression_configs/w8a16_dense.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | compressed_model_stub: "nm-testing/tinyllama-w8a16-dense" 4 | skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/decompression_configs_skipped/w8a8.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "regression" 3 | compressed_model_stub: "nm-testing/tinyllama-w8a8-compressed" 4 | skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/recipes/new_quant_actorder_group.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | quant_modifiers: 3 | QuantizationModifier: 4 | ignore: ["lm_head", "model.layers.0.mlp.down_proj"] 5 | config_groups: 6 | group_0: 7 | weights: 8 | num_bits: 4 9 | type: "int" 10 | symmetric: False 11 | strategy: "group" 12 | group_size: 128 13 | actorder: "group" 14 | input_activations: null 15 | output_activations: null 16 | targets: ["Linear"] 17 | GPTQModifier: 18 | block_size: 128 -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/recipes/new_quant_actorder_weight.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | quant_modifiers: 3 | QuantizationModifier: 4 | ignore: ["lm_head", "model.layers.0.mlp.down_proj"] 5 | config_groups: 6 | group_0: 7 | weights: 8 | num_bits: 4 9 | type: "int" 10 | symmetric: False 11 | strategy: "group" 12 | group_size: 128 13 | actorder: "weight" 14 | input_activations: null 15 | output_activations: null 16 | targets: ["Linear"] 17 | GPTQModifier: 18 | block_size: 128 -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/recipes/new_quant_channel.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | quant_modifiers: 3 | QuantizationModifier: 4 | ignore: ["lm_head", "model.layers.0.mlp.down_proj"] 5 | config_groups: 6 | group_0: 7 | weights: 8 | num_bits: 4 9 | type: "int" 10 | symmetric: False 11 | strategy: "channel" 12 | input_activations: null 13 | output_activations: null 14 | targets: ["Linear"] 15 | GPTQModifier: 16 | block_size: 128 -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/recipes/new_quant_fp8.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | QuantizationModifier: 4 | ignore: ["lm_head"] 5 | config_groups: 6 | group_0: 7 | weights: 8 | num_bits: 8 9 | 
type: "float" 10 | symmetric: true 11 | strategy: channel 12 | targets: ["Linear"] -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/recipes/new_quant_full.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | quant_modifiers: 3 | GPTQModifier: 4 | block_size: 128 5 | ignore: ["lm_head", "model.layers.0.mlp.down_proj"] 6 | config_groups: 7 | group_0: 8 | weights: 9 | num_bits: 8 10 | type: "int" 11 | symmetric: false 12 | strategy: "channel" 13 | input_activations: 14 | num_bits: 8 15 | type: "int" 16 | symmetric: false 17 | strategy: "tensor" 18 | output_activations: null 19 | targets: ["Linear"] -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/recipes/new_quant_group.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | quant_modifiers: 3 | QuantizationModifier: 4 | ignore: ["lm_head", "model.layers.0.mlp.down_proj"] 5 | config_groups: 6 | group_0: 7 | weights: 8 | num_bits: 4 9 | type: "int" 10 | symmetric: False 11 | strategy: "group" 12 | group_size: 128 13 | input_activations: null 14 | output_activations: null 15 | targets: ["Linear"] 16 | GPTQModifier: 17 | block_size: 128 -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/recipes/new_quant_simple.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | quant_modifiers: 3 | QuantizationModifier: 4 | ignore: ["lm_head"] 5 | config_groups: 6 | group_0: 7 | weights: 8 | num_bits: 8 9 | type: "int" 10 | symmetric: true 11 | strategy: "tensor" 12 | input_activations: 13 | num_bits: 8 14 | type: "int" 15 | symmetric: false 16 | strategy: "tensor" 17 | output_activations: null 18 | targets: ["Linear"] 19 | group_1: 20 | weights: 21 | num_bits: 8 22 | type: "int" 23 | symmetric: true 24 | strategy: "tensor" 25 | input_activations: null 26 | output_activations: null 27 | targets: ["Embedding"] 28 | -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/recipes/new_quant_weight.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | quant_modifiers: 3 | QuantizationModifier: 4 | ignore: ["lm_head", "model.layers.0.mlp.down_proj"] 5 | config_groups: 6 | group_0: 7 | weights: 8 | num_bits: 8 9 | type: "int" 10 | symmetric: true 11 | strategy: "tensor" 12 | input_activations: null 13 | output_activations: null 14 | targets: ["Linear", "Embedding"] 15 | GPTQModifier: 16 | block_size: 128 17 | targets: ["re:model.layers.\\d+$"] -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/recipes/sparse_24.yaml: -------------------------------------------------------------------------------- 1 | pruning_stage: 2 | obcq_modifiers: 3 | SparseGPTModifier: 4 | sparsity: 0.5 5 | mask_structure: "2:4" 6 | targets: ["Linear"] 7 | ignore: ["re:.*lm_head"] -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/recipes/sparse_24_fp8.yaml: -------------------------------------------------------------------------------- 1 | pruning_stage: 2 | obcq_modifiers: 3 | SparseGPTModifier: 4 | sparsity: 0.5 5 | mask_structure: "2:4" 
6 | targets: ["Linear"] 7 | ignore: ["re:.*lm_head"] 8 | quant_stage: 9 | quant_modifiers: 10 | QuantizationModifier: 11 | ignore: ["lm_head"] 12 | config_groups: 13 | group_0: 14 | weights: 15 | num_bits: 8 16 | type: float 17 | strategy: channel 18 | dynamic: false 19 | symmetric: true 20 | input_activations: 21 | num_bits: 8 22 | type: float 23 | strategy: token 24 | dynamic: true 25 | symmetric: true 26 | targets: ["Linear"] -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "regression" 3 | compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-Dynamic-compressed 4 | uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-Dynamic-uncompressed -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-compressed 4 | uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-uncompressed -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "regression" 3 | compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A16-G128-compressed 4 | uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A16-G128-uncompressed -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/run_compressed_configs_skipped/w8a8.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Per-Token-compressed 4 | uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Per-Token-uncompressed -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/test_has_gpu.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | import torch 5 | 6 | 7 | @pytest.mark.skipif(os.getenv("GITHUB_ACTIONS") != "true", reason="Only run for GHA") 8 | def test_has_gpu(): 9 | """ 10 | This test exists purely to raise an error if 11 | a runner performs transformers tests without a GPU 12 | """ 13 | assert torch.cuda.is_available() 14 | -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/test_infer_quant_format.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from compressed_tensors.quantization import preset_name_to_scheme 3 | 4 | from llmcompressor.transformers.compression.quantization_format import ( 5 | infer_quantization_format, 6 | ) 7 | from tests.llmcompressor.pytorch.helpers import LinearNet 8 | 9 | 10 | @pytest.mark.parametrize( 11 | "preset,sparsity_structure,expected_format", 12 | [ 13 | ["W8A8", "unstructured", 
"int-quantized"], 14 | ["W8A16", "unstructured", "pack-quantized"], 15 | ["W8A16", "2:4", "marlin-24"], 16 | ["W4A16", "unstructured", "pack-quantized"], 17 | ["W4A16", "2:4", "marlin-24"], 18 | ["FP8", "unstructured", "float-quantized"], 19 | ], 20 | ) 21 | def test_infer_quant_format(preset, sparsity_structure, expected_format): 22 | quant_scheme = preset_name_to_scheme(preset, targets=["Linear"]) 23 | 24 | dummy_model = LinearNet() 25 | for _, module in dummy_model.named_modules(): 26 | module.quantization_scheme = quant_scheme 27 | 28 | inferred_format = infer_quantization_format( 29 | dummy_model, save_compressed=True, sparsity_structure=sparsity_structure 30 | ) 31 | assert inferred_format.value == expected_format 32 | -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | 6 | @pytest.fixture(autouse=True) 7 | def run_before_and_after_tests(tmp_path): 8 | os.environ["TRANSFORMERS_CACHE"] = str(tmp_path / "transformers") 9 | os.environ["HF_DATASETS_CACHE"] = str(tmp_path / "datasets") 10 | yield 11 | -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/finetune/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/transformers/finetune/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/finetune/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/transformers/finetune/data/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/finetune/data/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from transformers import AutoTokenizer 3 | 4 | from llmcompressor.args import ModelArguments 5 | 6 | 7 | @pytest.fixture 8 | def tiny_llama_path(): 9 | return "nm-testing/llama2.c-stories15M" 10 | 11 | 12 | @pytest.fixture 13 | def tiny_llama_model_args(tiny_llama_path): 14 | return ModelArguments(model=tiny_llama_path) 15 | 16 | 17 | @pytest.fixture 18 | def tiny_llama_tokenizer(tiny_llama_model_args): 19 | tokenizer = AutoTokenizer.from_pretrained( 20 | tiny_llama_model_args.model, 21 | cache_dir=tiny_llama_model_args.cache_dir, 22 | use_fast=True, 23 | revision=tiny_llama_model_args.model_revision, 24 | use_auth_token=True if tiny_llama_model_args.use_auth_token else None, 25 | ) 26 | return tokenizer 27 | -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/finetune/data/test_dataset_helpers.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from llmcompressor.args import DatasetArguments 4 | from llmcompressor.datasets import make_dataset_splits 5 | from llmcompressor.transformers.finetune.data.data_helpers import get_raw_dataset 6 | 7 | 8 | @pytest.mark.unit 9 | def test_combined_datasets(): 10 | dataset_args = DatasetArguments( 11 | dataset="wikitext", 
dataset_config_name="wikitext-2-raw-v1" 12 | ) 13 | raw_wikitext2 = get_raw_dataset(dataset_args) 14 | datasets = {"all": raw_wikitext2} 15 | split_datasets = make_dataset_splits(datasets, do_train=True) 16 | assert split_datasets.get("train") is not None 17 | 18 | split_datasets = make_dataset_splits(datasets, do_train=True) 19 | assert split_datasets.get("train") is not None 20 | 21 | 22 | @pytest.mark.unit 23 | def test_separate_datasets(): 24 | splits = {"train": "train[:5%]", "validation": "train[10%:20%]"} 25 | dataset_args = DatasetArguments( 26 | dataset="wikitext", dataset_config_name="wikitext-2-raw-v1" 27 | ) 28 | datasets = {} 29 | for split_name, split_str in splits.items(): 30 | raw_wikitext2 = get_raw_dataset(dataset_args, split=split_str) 31 | datasets[split_name] = raw_wikitext2 32 | 33 | split_datasets = make_dataset_splits(datasets, do_train=True) 34 | assert split_datasets.get("train") is not None 35 | 36 | with pytest.raises(ValueError): 37 | # fails due to no train split specified after it is popped 38 | 39 | datasets.pop("train") 40 | split_datasets = make_dataset_splits(datasets, do_train=True) 41 | -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/finetune/finetune_custom/config1.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "sanity" 3 | model: "nm-testing/llama2.c-stories15M" 4 | file_extension: json 5 | num_train_epochs: 1 -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/finetune/finetune_custom/config2.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "sanity" 3 | model: "nm-testing/llama2.c-stories15M" 4 | file_extension: csv 5 | num_train_epochs: 1 -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/finetune/finetune_custom/gpu/gpu_config.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: "neuralmagic/Llama-2-7b-ultrachat200k" 4 | file_extension: json 5 | num_train_epochs: 0.5 -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/finetune/finetune_generic/config1.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: "nm-testing/llama2.c-stories15M" 4 | dataset: open_platypus -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/finetune/finetune_oneshot_configs/config.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "sanity" 3 | model: "nm-testing/llama2.c-stories15M" 4 | dataset: wikitext 5 | dataset_config_name: "wikitext-2-raw-v1" 6 | recipe: "tests/llmcompressor/transformers/finetune/test_alternate_recipe.yaml" 7 | num_train_epochs: 0.25 8 | concat_txt: False -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/finetune/finetune_oneshot_configs/gpu/gpu_config.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: "neuralmagic/Llama-2-7b-ultrachat200k" 4 | dataset: "ultrachat-200k" 5 | recipe:
"tests/llmcompressor/transformers/finetune/test_alternate_recipe.yaml" 6 | num_train_epochs: 0.05 7 | concat_txt: False 8 | -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/finetune/finetune_tokenizer/config1.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: "TinyLlama/TinyLlama-1.1B-Chat-v1.0" 4 | dataset_config_name: wikitext-2-raw-v1 5 | dataset: wikitext -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/finetune/test_alternate_recipe.yaml: -------------------------------------------------------------------------------- 1 | test_oneshot_stage: 2 | obcq_modifiers: 3 | SparseGPTModifier: 4 | sparsity: 0.7 5 | block_size: 128 6 | percdamp: 0.01 7 | mask_structure: "0:0" 8 | targets: ["Linear"] 9 | ignore: ["re:.*lm_head"] 10 | test_train_stage: 11 | pruning_modifiers: 12 | ConstantPruningModifier: 13 | targets: [ 14 | "re:.*self_attn.q_proj", 15 | "re:.*self_attn.k_proj", 16 | "re:.*self_attn.v_proj", 17 | "re:.*self_attn.o_proj", 18 | "re:.*mlp.down_proj", 19 | "re:.*mlp.gate_proj", 20 | "re:.*mlp.up_proj" 21 | ] 22 | start: 0 -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/finetune/test_finetune_recipe.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | pruning_modifiers: 3 | ConstantPruningModifier: 4 | targets: [ 5 | "re:.*self_attn.q_proj", 6 | "re:.*self_attn.k_proj", 7 | "re:.*self_attn.v_proj", 8 | "re:.*self_attn.o_proj", 9 | "re:.*mlp.gate_proj", 10 | "re:.*mlp.up_proj" 11 | ] 12 | start: 0 13 | distillation_modifiers: 14 | OutputDistillationModifier: 15 | targets: ["re:model.layers.\\d+$"] 16 | comparison: "square_head" 17 | start: 0 18 | orig_scale: 1.0 19 | distill_scale: 1.0 -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/finetune/test_finetune_without_recipe.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import unittest 4 | 5 | import pytest 6 | from parameterized import parameterized_class 7 | 8 | from tests.testing_utils import parse_params, requires_gpu 9 | 10 | CONFIGS_DIRECTORY = "tests/llmcompressor/transformers/finetune/finetune_generic" 11 | 12 | 13 | @pytest.mark.integration 14 | @requires_gpu 15 | @parameterized_class(parse_params(CONFIGS_DIRECTORY)) 16 | class TestFinetuneWithoutRecipe(unittest.TestCase): 17 | model = None 18 | dataset = None 19 | 20 | def setUp(self): 21 | self.output = "./finetune_output" 22 | 23 | def test_finetune_without_recipe(self): 24 | from llmcompressor import train 25 | 26 | recipe_str = None 27 | device = "cuda:0" 28 | 29 | concatenate_data = False 30 | max_steps = 50 31 | splits = "train" 32 | 33 | train( 34 | model=self.model, 35 | dataset=self.dataset, 36 | output_dir=self.output, 37 | recipe=recipe_str, 38 | max_steps=max_steps, 39 | concatenate_data=concatenate_data, 40 | splits=splits, 41 | oneshot_device=device, 42 | ) 43 | 44 | def tearDown(self): 45 | if os.path.isdir(self.output): 46 | shutil.rmtree(self.output) 47 | -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/finetune/test_quantization.yaml: -------------------------------------------------------------------------------- 
1 | test_stage: 2 | quant_modifiers: 3 | QuantizationModifier: 4 | ignore: 5 | - model.layers.0.mlp.down_proj 6 | - model.layers.1.mlp.down_proj 7 | - model.layers.2.mlp.down_proj 8 | - model.layers.3.mlp.down_proj 9 | - model.layers.4.mlp.down_proj 10 | - model.layers.5.mlp.down_proj 11 | config_groups: 12 | group_0: 13 | weights: 14 | num_bits: 8 15 | type: "int" 16 | symmetric: False 17 | strategy: "tensor" 18 | input_activations: null 19 | output_activations: null 20 | targets: ["Linear"] 21 | pruning_modifiers: 22 | ConstantPruningModifier: 23 | targets: [ 24 | "re:.*self_attn.q_proj", 25 | "re:.*self_attn.k_proj", 26 | "re:.*self_attn.v_proj", 27 | "re:.*self_attn.o_proj", 28 | "re:.*mlp.gate_proj", 29 | "re:.*mlp.up_proj" 30 | ] 31 | start: 0 32 | -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/finetune/test_safetensors.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import unittest 4 | from pathlib import Path 5 | 6 | import pytest 7 | from parameterized import parameterized_class 8 | 9 | from tests.testing_utils import parse_params, requires_gpu 10 | 11 | CONFIGS_DIRECTORY = "tests/llmcompressor/transformers/finetune/finetune_generic" 12 | 13 | 14 | @pytest.mark.integration 15 | @requires_gpu 16 | @parameterized_class(parse_params(CONFIGS_DIRECTORY)) 17 | class TestSafetensors(unittest.TestCase): 18 | model = None 19 | dataset = None 20 | 21 | def setUp(self): 22 | self.output = Path("./finetune_output") 23 | 24 | def test_safetensors(self): 25 | from llmcompressor import train 26 | 27 | device = "cuda:0" 28 | output_dir = self.output / "output1" 29 | max_steps = 10 30 | splits = {"train": "train[:10%]"} 31 | 32 | train( 33 | model=self.model, 34 | dataset=self.dataset, 35 | output_dir=output_dir, 36 | max_steps=max_steps, 37 | splits=splits, 38 | oneshot_device=device, 39 | ) 40 | 41 | assert os.path.exists(output_dir / "model.safetensors") 42 | assert not os.path.exists(output_dir / "pytorch_model.bin") 43 | 44 | # test we can also load 45 | new_output_dir = self.output / "output2" 46 | train( 47 | model=output_dir, 48 | dataset=self.dataset, 49 | output_dir=new_output_dir, 50 | max_steps=max_steps, 51 | splits=splits, 52 | oneshot_device=device, 53 | ) 54 | 55 | def tearDown(self): 56 | if os.path.isdir(self.output): 57 | shutil.rmtree(self.output) 58 | -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, 10 | # software distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/obcq_configs/completion/gpu/llama_7b_quant.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: "meta-llama/Llama-2-7b-hf" 4 | dataset: open_platypus 5 | recipe: "tests/llmcompressor/transformers/obcq/recipes/quant.yaml" 6 | device: "cuda:0" 7 | num_samples: 512 8 | perplexity: 20 9 | -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/obcq_configs/completion/gpu/llama_7b_quant_and_sparse.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: "meta-llama/Llama-2-7b-hf" 4 | dataset: open_platypus 5 | recipe: "tests/llmcompressor/transformers/obcq/recipes/quant_and_sparse.yaml" 6 | device: "cuda:0" 7 | num_samples: 512 8 | perplexity: 20 9 | -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/obcq_configs/completion/gpu/llama_7b_sparse.yml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: "meta-llama/Llama-2-7b-hf" 4 | dataset: open_platypus 5 | recipe: "tests/llmcompressor/transformers/obcq/recipes/sparse.yaml" 6 | device: "cuda:0" 7 | num_samples: 512 8 | perplexity: 20 9 | -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/obcq_configs/completion/tiny_llama_quant.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "sanity" 3 | model: "nm-testing/llama2.c-stories15M" 4 | dataset: open_platypus 5 | recipe: "tests/llmcompressor/transformers/obcq/recipes/quant.yaml" 6 | num_samples: 32 7 | perplexity: 5000 -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/obcq_configs/completion/tiny_llama_quant_and_sparse.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "sanity" 3 | model: "nm-testing/llama2.c-stories15M" 4 | dataset: open_platypus 5 | recipe: "tests/llmcompressor/transformers/obcq/recipes/quant_and_sparse.yaml" 6 | num_samples: 32 7 | perplexity: 5000 -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/obcq_configs/consec_runs/gpu/llama_consec_runs.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: "meta-llama/Llama-2-7b-hf" 4 | dataset: open_platypus 5 | first_recipe: "tests/llmcompressor/transformers/obcq/recipes/quant_and_sparse.yaml" 6 | second_recipe: "tests/llmcompressor/transformers/obcq/recipes/additional_sparsity.yaml" 7 | device: "cuda:0" -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/obcq_configs/consec_runs/tiny_llama_consec_runs.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "sanity" 3 | model: "nm-testing/llama2.c-stories15M" 4 | dataset: open_platypus 5 | first_recipe: "tests/llmcompressor/transformers/obcq/recipes/quant_and_sparse.yaml" 6 | 
second_recipe: "tests/llmcompressor/transformers/obcq/recipes/additional_sparsity.yaml" -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/obcq_configs/mask_structure/tiny_llama_mask_structure_preservation.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "sanity" 3 | model: "nm-testing/llama2.c-stories15M" 4 | dataset: open_platypus 5 | initial_pruning_only_recipe: "tests/llmcompressor/transformers/obcq/recipes/sparse_with_mask_structure.yaml" 6 | initial_sparsity: 0.5 7 | recipe_mask_structure: "2:4" 8 | subsequent_prune_and_quant_recipe: "tests/llmcompressor/transformers/obcq/recipes/additional_sparsity_with_quant.yaml" 9 | final_sparsity: 0.7 -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/obcq_configs/sparse/gpu/llama_7b_sparse.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: "meta-llama/Llama-2-7b-hf" 4 | dataset: open_platypus 5 | recipe: "tests/llmcompressor/transformers/obcq/recipes/sparse.yaml" 6 | sparsity: 0.3 7 | device: "cuda:0" -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/obcq_configs/sparse/tiny_llama_sparse.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "sanity" 3 | model: "nm-testing/llama2.c-stories15M" 4 | dataset: open_platypus 5 | recipe: "tests/llmcompressor/transformers/obcq/recipes/sparse.yaml" 6 | sparsity: 0.3 -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/obcq_configs/sparsity_generic/config.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: "nm-testing/llama2.c-stories15M" 4 | dataset: open_platypus -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/recipes/additional_sparsity.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | obcq_modifiers: 3 | SparseGPTModifier: 4 | sparsity: 0.7 5 | block_size: 128 6 | percdamp: 0.01 7 | mask_structure: "0:0" 8 | targets: ["re:.*model.layers.0$"] 9 | preserve_sparsity_mask: True -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/recipes/additional_sparsity_with_quant.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | obcq_modifiers: 3 | SparseGPTModifier: 4 | sparsity: 0.7 5 | block_size: 128 6 | percdamp: 0.01 7 | mask_structure: "0:0" 8 | targets: [ 9 | "re:.*model.layers.0$", 10 | ] 11 | preserve_sparsity_mask: True 12 | GPTQModifier: 13 | config_groups: 14 | group_0: 15 | weights: 16 | num_bits: 8 17 | type: "int" 18 | strategy: "channel" 19 | targets: [ 20 | "re:.*model.layers.0.self_attn.q_proj", 21 | ] -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/recipes/quant.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | obcq_modifiers: 3 | SmoothQuantModifier: 4 | smoothing_strength: 0.6 5 | GPTQModifier: 6 | block_size: 128 7 
| percdamp: 0.01 8 | config_groups: 9 | group_0: 10 | weights: 11 | num_bits: 8 12 | input_activations: 13 | num_bits: 8 14 | targets: ["Linear"] -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/recipes/quant_and_sparse.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | obcq_modifiers: 3 | GPTQModifier: 4 | ignore: [lm_head] 5 | config_groups: 6 | group_0: 7 | weights: 8 | num_bits: 8 9 | type: "int" 10 | strategy: "channel" 11 | targets: [Linear] 12 | SparseGPTModifier: 13 | sparsity: 0.5 14 | block_size: 128 15 | percdamp: 0.01 16 | mask_structure: "0:0" 17 | targets: ["re:.*model.layers.0$"] -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/recipes/sparse.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | obcq_modifiers: 3 | SparseGPTModifier: 4 | sparsity: 0.3 5 | block_size: 128 6 | percdamp: 0.01 7 | targets: ["model.layers.0", "model.layers.1"] 8 | mask_structure: "0:0" -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/recipes/sparse_with_mask_structure.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | obcq_modifiers: 3 | SparseGPTModifier: 4 | sparsity: 0.5 5 | block_size: 128 6 | percdamp: 0.01 7 | mask_structure: "2:4" 8 | targets: [ 9 | "re:.*model.layers.0$", 10 | ] -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/recipes/test_tiny2.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | obcq_modifiers: 3 | SparseGPTModifier: 4 | sparsity: 0.5 5 | block_size: 128 6 | percdamp: 0.01 7 | mask_structure: "0:0" 8 | targets: [ 9 | "model.layers.0", 10 | "model.layers.1", 11 | "model.layers.2", 12 | "model.layers.3", 13 | "model.layers.4", 14 | "model.layers.5" 15 | ] -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/test_obcq_infer_targets.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from accelerate import init_empty_weights 3 | from transformers import AutoModelForCausalLM 4 | 5 | from llmcompressor.modifiers.obcq import SparseGPTModifier 6 | 7 | 8 | @pytest.mark.integration 9 | def test_infer_targets(): 10 | modifier = SparseGPTModifier(sparsity=0.0) 11 | with init_empty_weights(): 12 | model = AutoModelForCausalLM.from_pretrained("nm-testing/llama2.c-stories15M") 13 | 14 | inferred = modifier._infer_sequential_targets(model) 15 | assert inferred == ["LlamaDecoderLayer"] 16 | -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/test_obcq_lm_head.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import MagicMock 3 | 4 | import pytest 5 | 6 | from llmcompressor.core.state import State 7 | from llmcompressor.modifiers.obcq import SparseGPTModifier 8 | 9 | 10 | @pytest.mark.integration 11 | class TestLMHead(unittest.TestCase): 12 | def setUp(self): 13 | import torch 14 | from transformers import AutoModelForCausalLM 15 | 16 | self.device = "cuda:0" if torch.cuda.is_available() else "cpu" 17 | 18 | self.model = 
AutoModelForCausalLM.from_pretrained( 19 | "nm-testing/llama2.c-stories15M", device_map=self.device 20 | ) 21 | 22 | self.kwargs = { 23 | "sparsity": 0.5, 24 | "block_size": 128, 25 | "quantize": False, 26 | "targets": [ 27 | "model.layers.0", 28 | "model.layers.1", 29 | "model.layers.2", 30 | "model.layers.3", 31 | "model.layers.4", 32 | "model.layers.5", 33 | ], 34 | } 35 | 36 | dataset = MagicMock() 37 | dataset.column_names = [] 38 | self.dataloader = MagicMock() 39 | self.dataloader.dataset = dataset 40 | self.dataloader.__iter__.return_value = iter([]) 41 | 42 | def test_no_lm_head_target(self): 43 | modifier = SparseGPTModifier(**self.kwargs) 44 | 45 | state = State() 46 | state.update(model=self.model, device=self.device, calib_data=self.dataloader) 47 | modifier.initialize(state) 48 | modifier.on_start(state, None) 49 | 50 | assert len(self.model.lm_head._forward_hooks) <= 0 51 | 52 | modifier.finalize(state) 53 | 54 | def test_lm_head_target(self): 55 | self.kwargs["targets"].append("lm_head") 56 | modifier = SparseGPTModifier(**self.kwargs) 57 | 58 | state = State() 59 | state.update(model=self.model, device=self.device, calib_data=self.dataloader) 60 | modifier.initialize(state) 61 | modifier.on_start(state, None) 62 | 63 | assert len(self.model.lm_head._forward_hooks) == 1 64 | 65 | modifier.finalize(state) 66 | -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/test_obcq_owl.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | from datasets import Dataset 4 | from transformers import AutoModelForCausalLM 5 | 6 | from llmcompressor.core.session_functions import create_session 7 | from llmcompressor.datasets import format_calibration_data 8 | from llmcompressor.modifiers.obcq import SparseGPTModifier 9 | from llmcompressor.utils.pytorch.module import get_layers 10 | 11 | 12 | @pytest.mark.integration 13 | def test_infer_owl_layer_sparsity(): 14 | target_sparsity = 0.7 15 | vocab_size = 512 16 | seq_len = 2048 17 | ds_size = 16 18 | 19 | with create_session() as session: 20 | session.initialize() 21 | modifier = SparseGPTModifier( 22 | sparsity=0.7, sparsity_profile="owl", owl_m=5, owl_lmbda=0.05 23 | ) 24 | model = AutoModelForCausalLM.from_pretrained("nm-testing/llama2.c-stories15M") 25 | 26 | dataset = Dataset.from_dict( 27 | {"input_ids": torch.randint(0, vocab_size, (ds_size, seq_len))} 28 | ) 29 | dataloader = format_calibration_data(dataset) 30 | 31 | sequential_targets = modifier._infer_sequential_targets(model) 32 | layers = get_layers(sequential_targets, model) 33 | sparsities = modifier._infer_owl_layer_sparsity(model, layers, dataloader) 34 | assert sparsities.keys() == layers.keys() 35 | 36 | for sparsity in sparsities.values(): 37 | assert sparsity == pytest.approx(target_sparsity, abs=0.1) 38 | -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/test_oneshot_with_modifier.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import unittest 4 | from pathlib import Path 5 | 6 | import pytest 7 | from parameterized import parameterized_class 8 | 9 | from tests.testing_utils import parse_params, requires_gpu 10 | 11 | CONFIGS_DIRECTORY = ( 12 | "tests/llmcompressor/transformers/obcq/obcq_configs/sparsity_generic" 13 | ) 14 | 15 | 16 | @pytest.mark.integration 17 | @requires_gpu 18 | 
@parameterized_class(parse_params(CONFIGS_DIRECTORY)) 19 | class TestOneshotWithModifierObject(unittest.TestCase): 20 | model = None 21 | dataset = None 22 | 23 | def setUp(self): 24 | self.output = Path("./finetune_output") 25 | 26 | def test_oneshot_with_modifier_object(self): 27 | from llmcompressor import oneshot 28 | from llmcompressor.modifiers.obcq.base import SparseGPTModifier 29 | 30 | recipe_str = [ 31 | SparseGPTModifier(sparsity=0.5, targets=[r"re:model.layers.\d+$"]) 32 | ] 33 | 34 | device = "cuda:0" 35 | concatenate_data = False 36 | num_calibration_samples = 64 37 | output_dir = self.output / "oneshot_out" 38 | splits = {"calibration": "train[:10%]"} 39 | 40 | oneshot( 41 | model=self.model, 42 | dataset=self.dataset, 43 | output_dir=output_dir, 44 | num_calibration_samples=num_calibration_samples, 45 | recipe=recipe_str, 46 | concatenate_data=concatenate_data, 47 | splits=splits, 48 | oneshot_device=device, 49 | ) 50 | 51 | def tearDown(self): 52 | if os.path.isdir(self.output): 53 | shutil.rmtree(self.output) 54 | -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/oneshot/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/transformers/oneshot/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/oneshot/oneshot_configs/recipes/recipe.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | obcq_modifiers: 3 | SparseGPTModifier: 4 | sparsity: 0.5 5 | block_size: 128 6 | targets: [ 7 | 're:model.layers.3.mlp.gate_proj.weight' 8 | ] -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_stories_conf1.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "smoke" 3 | tokenize: False 4 | model: "nm-testing/llama2.c-stories15M" 5 | dataset: open_platypus 6 | recipe: | 7 | test_stage: 8 | obcq_modifiers: 9 | SparseGPTModifier: 10 | sparsity: 0.5 11 | block_size: 128 12 | targets: [ 13 | 're:model.layers.3.mlp.gate_proj.weight' 14 | ] -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_stories_conf2.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "smoke" 3 | tokenize: False 4 | model: "nm-testing/llama2.c-stories15M" 5 | dataset: open_platypus 6 | recipe: "tests/llmcompressor/transformers/oneshot/oneshot_configs/recipes/recipe.yaml" -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_stories_conf3.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "smoke" 3 | tokenize: False 4 | model: "nm-testing/llama2.c-stories15M" 5 | dataset: "gsm8k" 6 | dataset_config_name: "main" 7 | recipe: "tests/llmcompressor/transformers/oneshot/oneshot_configs/recipes/recipe.yaml" -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_stories_conf4.yaml: 
-------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "smoke" 3 | tokenize: False 4 | model: "nm-testing/llama2.c-stories15M" 5 | dataset: "gsm8k" 6 | dataset_config_name: "main" 7 | recipe: | 8 | test_stage: 9 | obcq_modifiers: 10 | SparseGPTModifier: 11 | sparsity: 0.5 12 | block_size: 128 13 | targets: [ 14 | 're:model.layers.3.mlp.gate_proj.weight' 15 | ] -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_stories_conf5.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "smoke" 3 | tokenize: True 4 | model: "nm-testing/llama2.c-stories15M" 5 | dataset: open_platypus 6 | recipe: "tests/llmcompressor/transformers/oneshot/oneshot_configs/recipes/recipe.yaml" -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_stories_conf6.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "smoke" 3 | tokenize: True 4 | model: "nm-testing/llama2.c-stories15M" 5 | dataset: "gsm8k" 6 | recipe: "tests/llmcompressor/transformers/oneshot/oneshot_configs/recipes/recipe.yaml" -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/sparsification/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, 10 | # software distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tests/llmcompressor/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/utils/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/utils/pytorch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/utils/pytorch/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/utils/pytorch/test_module.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch.nn as nn 3 | 4 | from llmcompressor.utils.pytorch import get_layer_by_name 5 | 6 | 7 | @pytest.fixture 8 | def example_nested_module() -> nn.Module: 9 | return nn.Sequential( 10 | nn.Linear(10, 20), 11 | nn.Sequential(nn.ReLU(), nn.Linear(20, 10)), 12 | nn.Sequential(nn.SiLU(), nn.Linear(20, 10)), 13 | nn.Softmax(dim=1), 14 | ) 15 | 16 | 17 | @pytest.mark.unit 18 | def test_get_layer_by_name(example_nested_module): 19 | # Test getting nested layers by their dotted names 20 | layer = get_layer_by_name("0", example_nested_module) 21 | assert layer == example_nested_module[0] 22 | 23 | layer = get_layer_by_name("1.1", example_nested_module) 24 | assert layer == example_nested_module[1][1] 25 | 26 | layer = get_layer_by_name("2.0", example_nested_module) 27 | assert layer == example_nested_module[2][0] 28 | 29 | layer = get_layer_by_name("2.1", example_nested_module) 30 | assert layer == example_nested_module[2][1] 31 | 32 | # Test that requesting a non-existent layer raises 33 | with pytest.raises(AttributeError): 34 | get_layer_by_name("non_existent_layer", example_nested_module) 35 | -------------------------------------------------------------------------------- /tests/lmeval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/lmeval/__init__.py -------------------------------------------------------------------------------- /tests/lmeval/configs/fp8_dynamic_per_token.yaml: -------------------------------------------------------------------------------- 1 | cadence: "weekly" 2 | model: meta-llama/Meta-Llama-3-8B-Instruct 3 | scheme: FP8_DYNAMIC 4 | lmeval: 5 | metrics: 6 | exact_match,flexible-extract: 0.75 7 | exact_match,strict-match: 0.75 8 | -------------------------------------------------------------------------------- /tests/lmeval/configs/fp8_static_per_tensor.yaml: -------------------------------------------------------------------------------- 1 | cadence: "weekly" 2 | model: meta-llama/Meta-Llama-3-8B-Instruct 3 | scheme: FP8 4 | dataset_id: HuggingFaceH4/ultrachat_200k 5 | dataset_split: train_sft 6 | lmeval: 7 | metrics: 8 | exact_match,flexible-extract: 0.75 9 | exact_match,strict-match: 0.75 10 | -------------------------------------------------------------------------------- /tests/lmeval/configs/int8_w8a8_dynamic_per_token.yaml: -------------------------------------------------------------------------------- 1 | cadence: "weekly" 2 | model: meta-llama/Meta-Llama-3-8B-Instruct 3 | scheme:
INT8_dyn_per_token 4 | recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_dynamic_per_token.yaml 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | lmeval: 8 | metrics: 9 | exact_match,flexible-extract: 0.77 10 | exact_match,strict-match: 0.76 -------------------------------------------------------------------------------- /tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml: -------------------------------------------------------------------------------- 1 | cadence: weekly 2 | model: Qwen/Qwen2.5-VL-7B-Instruct 3 | model_class: Qwen2_5_VLForConditionalGeneration 4 | scheme: FP8_DYNAMIC 5 | lmeval: 6 | model: "hf-multimodal" 7 | model_args: 8 | dtype: bfloat16 9 | add_bos_token: True 10 | convert_img_format: True 11 | task: mmmu_val_literature 12 | num_fewshot: 0 13 | batch_size: 8 14 | # dense model achieves accuracy of 0.9 +/- 0.0557 15 | metrics: 16 | acc,none: 0.8667 17 | acc_stderr,none: 0.0557 18 | -------------------------------------------------------------------------------- /tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml: -------------------------------------------------------------------------------- 1 | cadence: "weekly" 2 | model: Qwen/Qwen2.5-VL-7B-Instruct 3 | model_class: Qwen2_5_VLForConditionalGeneration 4 | scheme: INT8_dyn_per_token 5 | recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_dynamic_per_token.yaml 6 | dataset_id: lmms-lab/flickr30k 7 | dataset_split: "test[:512]" 8 | lmeval: 9 | model: "hf-multimodal" 10 | model_args: 11 | dtype: bfloat16 12 | add_bos_token: True 13 | convert_img_format: True 14 | task: mmmu_val_literature 15 | num_fewshot: 0 16 | batch_size: 8 17 | # dense model achieves accuracy of 0.9 +/- 0.0557 18 | metrics: 19 | acc,none: 0.833 20 | acc_stderr,none: 0.0557 -------------------------------------------------------------------------------- /tests/lmeval/configs/vl_w4a16_actorder_weight.yaml: -------------------------------------------------------------------------------- 1 | cadence: "weekly" 2 | model: Qwen/Qwen2.5-VL-7B-Instruct 3 | model_class: Qwen2_5_VLForConditionalGeneration 4 | scheme: W4A16_actorder_weight 5 | recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml 6 | dataset_id: lmms-lab/flickr30k 7 | dataset_split: "test[:512]" 8 | lmeval: 9 | model: "hf-multimodal" 10 | model_args: 11 | dtype: bfloat16 12 | add_bos_token: True 13 | convert_img_format: True 14 | task: mmmu_val_literature 15 | num_fewshot: 0 16 | batch_size: 8 17 | # dense model achieves accuracy of 0.9 +/- 0.0557 18 | metrics: 19 | acc,none: 0.8333 20 | acc_stderr,none: 0.0557 -------------------------------------------------------------------------------- /tests/lmeval/configs/w4a16_actorder_group.yaml: -------------------------------------------------------------------------------- 1 | cadence: "weekly" 2 | model: meta-llama/Meta-Llama-3-8B-Instruct 3 | scheme: W4A16_actorder_group 4 | recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_group.yaml 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | lmeval: 8 | metrics: 9 | exact_match,flexible-extract: 0.72 10 | exact_match,strict-match: 0.72 11 | -------------------------------------------------------------------------------- /tests/lmeval/configs/w4a16_actorder_weight.yaml: -------------------------------------------------------------------------------- 1 | cadence: "weekly" 2 | model: meta-llama/Meta-Llama-3-8B-Instruct 3 | scheme: W4A16_actorder_weight 4 | recipe:
--------------------------------------------------------------------------------
/tests/lmeval/configs/w4a16_actorder_weight.yaml:
--------------------------------------------------------------------------------
 1 | cadence: "weekly"
 2 | model: meta-llama/Meta-Llama-3-8B-Instruct
 3 | scheme: W4A16_actorder_weight
 4 | recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml
 5 | dataset_id: HuggingFaceH4/ultrachat_200k
 6 | dataset_split: train_sft
 7 | lmeval:
 8 |   metrics:
 9 |     exact_match,flexible-extract: 0.72
10 |     exact_match,strict-match: 0.72
11 | 
--------------------------------------------------------------------------------
/tests/lmeval/configs/w4a16_grouped_quant.yaml:
--------------------------------------------------------------------------------
 1 | cadence: "weekly"
 2 | model: meta-llama/Meta-Llama-3-8B-Instruct
 3 | scheme: W4A16
 4 | dataset_id: HuggingFaceH4/ultrachat_200k
 5 | dataset_split: train_sft
 6 | quant_type: "GPTQ"
 7 | lmeval:
 8 |   metrics:
 9 |     exact_match,flexible-extract: 0.72
10 |     exact_match,strict-match: 0.72
11 | 
--------------------------------------------------------------------------------
/tests/test_timer/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | 
3 | from .timer import Timer
4 | 
--------------------------------------------------------------------------------
/tests/test_timer/timer_utils.py:
--------------------------------------------------------------------------------
 1 | from functools import wraps
 2 | 
 3 | from tests.test_timer import Timer
 4 | 
 5 | __all__ = ["log_time", "get_singleton_manager"]
 6 | 
 7 | 
 8 | def get_singleton_manager(enable_logging: bool = True):
 9 |     """
10 |     Return the Timer. If it has not yet been initialized, initialize and
11 |     return it. If it has, return the existing Timer.
12 |     """
13 |     if Timer._instance is None:
14 |         Timer._instance = Timer(enable_logging=enable_logging)
15 |     return Timer._instance
16 | 
17 | 
18 | def log_time(func):
19 |     """
20 |     Decorator to time functions. Times for the function are stored using
21 |     the class and function names.
22 |     """
23 | 
24 |     @wraps(func)
25 |     def wrapper(*args, **kwargs):
26 |         TIMER_MANAGER = get_singleton_manager()
27 |         func_name = func.__name__
28 | 
29 |         if not TIMER_MANAGER.enable_logging:
30 |             return func(*args, **kwargs)
31 | 
32 |         with TIMER_MANAGER.time(func_name):
33 |             return func(*args, **kwargs)
34 | 
35 |     return wrapper
36 | 
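
A short usage sketch for the decorator defined above, assuming Timer.time(name) is a context manager that records elapsed wall-clock time under the given name (as the wrapper implies) and that logging is enabled; the decorated function is illustrative:

    from tests.test_timer.timer_utils import get_singleton_manager, log_time

    @log_time
    def run_calibration():
        # stand-in for real work; its duration is recorded under "run_calibration"
        return sum(i * i for i in range(10_000))

    run_calibration()
    timer = get_singleton_manager()  # the same singleton the decorator used to record the timing
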
--------------------------------------------------------------------------------
/tests/unit/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/unit/__init__.py
--------------------------------------------------------------------------------
/tests/unit/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/unit/core/__init__.py
--------------------------------------------------------------------------------
/tests/unit/core/events/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/unit/core/events/__init__.py
--------------------------------------------------------------------------------
/tests/unit/core/events/test_event.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from llmcompressor.core import Event, EventType
 4 | 
 5 | 
 6 | @pytest.mark.smoke
 7 | def test_event_epoch_based():
 8 |     event = Event(steps_per_epoch=10)
 9 |     assert event.epoch_based is True
10 | 
11 | 
12 | @pytest.mark.smoke
13 | def test_event_epoch():
14 |     event = Event(steps_per_epoch=10, global_step=25)
15 |     assert event.epoch == 2
16 | 
17 | 
18 | @pytest.mark.smoke
19 | def test_event_epoch_full():
20 |     event = Event(steps_per_epoch=10, global_step=25)
21 |     assert event.epoch_full == 2.5
22 | 
23 | 
24 | @pytest.mark.smoke
25 | def test_event_epoch_step():
26 |     event = Event(steps_per_epoch=10, global_step=25)
27 |     assert event.epoch_step == 5
28 | 
29 | 
30 | @pytest.mark.smoke
31 | def test_event_epoch_batch():
32 |     event = Event(
33 |         steps_per_epoch=10, global_step=25, batches_per_step=2, global_batch=50
34 |     )
35 |     assert event.epoch_batch == 10
36 | 
37 | 
38 | @pytest.mark.smoke
39 | def test_event_current_index():
40 |     event = Event(steps_per_epoch=10, global_step=25)
41 |     assert event.current_index == 2.5
42 | 
43 | 
44 | @pytest.mark.smoke
45 | def test_event_should_update():
46 |     event = Event(steps_per_epoch=10, global_step=25)
47 |     assert event.should_update(start=0, end=30, update=2.5) is True
48 |     assert event.should_update(start=0, end=20, update=5) is False
49 |     assert event.should_update(start=0, end=30, update=0) is True
50 | 
51 | 
52 | @pytest.mark.smoke
53 | def test_event_new_instance():
54 |     event = Event(type_=EventType.INITIALIZE, global_step=25)
55 |     new_event = event.new_instance(global_step=30)
56 |     assert new_event.global_step == 30
57 |     assert new_event.type_ == EventType.INITIALIZE
58 | 
--------------------------------------------------------------------------------
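
The assertions above pin down the epoch arithmetic on Event: with steps_per_epoch=10 and global_step=25, epoch is the integer quotient, epoch_full the real-valued quotient, and epoch_step the remainder; epoch_batch follows the same pattern over batches. A small sketch reproducing those numbers directly, on the assumption that the properties are computed with exactly this arithmetic:

    steps_per_epoch, global_step = 10, 25
    batches_per_step, global_batch = 2, 50

    epoch = global_step // steps_per_epoch                             # 2
    epoch_full = global_step / steps_per_epoch                         # 2.5
    epoch_step = global_step % steps_per_epoch                         # 5
    epoch_batch = global_batch % (steps_per_epoch * batches_per_step)  # 10

    assert (epoch, epoch_full, epoch_step, epoch_batch) == (2, 2.5, 5, 10)
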