├── .MAINTAINERS
├── .github
├── ISSUE_TEMPLATE
│ ├── bug_report.md
│ ├── doc-edit.md
│ └── feature_request.md
├── PULL_REQUEST_TEMPLATE.md
├── TODO.txt
└── workflows
│ ├── build-and-publish-release-images.yaml
│ ├── linkcheck.yml
│ ├── linkspector
│ │ └── linkspector.yml
│ ├── quality-check.yaml
│ ├── result.xml.fail
│ ├── result.xml.success
│ ├── set-comment.yaml
│ ├── test-check-transformers.yaml
│ └── test-check.yaml
├── .gitignore
├── CONTRIBUTING.md
├── DEVELOPING.md
├── LICENSE
├── MANIFEST.in
├── Makefile
├── NOTICE
├── README.md
├── docs
├── save_pretrained.md
└── schemes.md
├── examples
├── awq
│ ├── README.md
│ ├── llama_example.py
│ └── qwen3_moe_example.py
├── big_models_with_accelerate
│ ├── README.md
│ ├── cpu_offloading_fp8.py
│ ├── mult_gpus_int8_device_map.py
│ └── multi_gpu_int8.py
├── compressed_inference
│ └── fp8_compressed_inference.py
├── finetuning
│ ├── configure_fsdp.md
│ ├── example_alternating_recipe.yaml
│ ├── example_fsdp_config.yaml
│ └── example_single_gpu_config.yaml
├── multimodal_audio
│ ├── README.md
│ └── whisper_example.py
├── multimodal_vision
│ ├── README.md
│ ├── gemma3_example.py
│ ├── idefics3_example.py
│ ├── llava_example.py
│ ├── mistral3_chat_template.json
│ ├── mistral3_example.py
│ ├── mllama_example.py
│ ├── phi3_vision_example.py
│ ├── pixtral_example.py
│ ├── qwen2_vl_example.py
│ └── qwen_2_5_vl_example.py
├── quantization_2of4_sparse_w4a16
│ ├── 2of4_w4a16_group-128_recipe.yaml
│ ├── 2of4_w4a16_recipe.yaml
│ ├── README.md
│ └── llama7b_sparse_w4a16.py
├── quantization_kv_cache
│ ├── README.md
│ ├── gemma2_fp8_kv_example.py
│ ├── llama3_fp8_kv_example.py
│ └── phi3.5_fp8_kv_example.py
├── quantization_w4a16
│ ├── README.md
│ └── llama3_example.py
├── quantization_w4a16_fp4
│ └── llama3_example.py
├── quantization_w4a4_fp4
│ └── llama3_example.py
├── quantization_w8a8_fp8
│ ├── README.md
│ ├── gemma2_example.py
│ ├── llama3.2_vision_example.py
│ ├── llama3_example.py
│ ├── llava1.5_example.py
│ ├── qwen2vl_example.py
│ └── whisper_example.py
├── quantization_w8a8_int8
│ ├── README.md
│ ├── gemma2_example.py
│ └── llama3_example.py
├── quantizing_moe
│ ├── README.md
│ ├── deepseek_moe_w4a16.py
│ ├── deepseek_moe_w8a8_fp8.py
│ ├── deepseek_moe_w8a8_int8.py
│ ├── deepseek_recipe_w4a16.yaml
│ ├── mixtral_moe_w8a8_fp8.py
│ └── qwen_moe_w4a16.py
├── sparse_2of4_quantization_fp8
│ ├── README.md
│ └── llama3_8b_2of4.py
└── trl_mixin
│ ├── README.md
│ ├── ex_trl_constant.py
│ ├── ex_trl_distillation.py
│ └── sft_trainer.py
├── pyproject.toml
├── setup.py
├── src
└── llmcompressor
│ ├── __init__.py
│ ├── args
│ ├── README.md
│ ├── __init__.py
│ ├── dataset_arguments.py
│ ├── model_arguments.py
│ ├── recipe_arguments.py
│ ├── training_arguments.py
│ └── utils.py
│ ├── core
│ ├── __init__.py
│ ├── events
│ │ ├── __init__.py
│ │ └── event.py
│ ├── helpers.py
│ ├── lifecycle.py
│ ├── model_layer.py
│ ├── session.py
│ ├── session_functions.py
│ └── state.py
│ ├── datasets
│ ├── __init__.py
│ └── utils.py
│ ├── entrypoints
│ ├── README.md
│ ├── __init__.py
│ ├── oneshot.py
│ ├── train.py
│ └── utils.py
│ ├── logger.py
│ ├── metrics
│ ├── __init__.py
│ ├── logger.py
│ └── utils
│ │ ├── __init__.py
│ │ └── frequency_manager.py
│ ├── modifiers
│ ├── README.md
│ ├── __init__.py
│ ├── awq
│ │ ├── __init__.py
│ │ ├── base.py
│ │ └── mappings.py
│ ├── distillation
│ │ ├── __init__.py
│ │ ├── output
│ │ │ ├── __init__.py
│ │ │ └── base.py
│ │ └── utils
│ │ │ ├── __init__.py
│ │ │ └── pytorch
│ │ │ ├── __init__.py
│ │ │ ├── kd_factory.py
│ │ │ ├── kd_wrapper.py
│ │ │ └── model_wrapper.py
│ ├── experimental
│ │ └── __init__.py
│ ├── factory.py
│ ├── interface.py
│ ├── logarithmic_equalization
│ │ ├── __init__.py
│ │ └── base.py
│ ├── modifier.py
│ ├── obcq
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── sgpt_base.py
│ │ └── sgpt_sparsify.py
│ ├── pruning
│ │ ├── __init__.py
│ │ ├── constant
│ │ │ ├── __init__.py
│ │ │ └── base.py
│ │ ├── helpers.py
│ │ ├── magnitude
│ │ │ ├── __init__.py
│ │ │ └── base.py
│ │ ├── utils
│ │ │ ├── __init__.py
│ │ │ └── pytorch
│ │ │ │ ├── __init__.py
│ │ │ │ ├── layer_mask.py
│ │ │ │ └── mask_factory.py
│ │ └── wanda
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ └── wanda_sparsify.py
│ ├── quantization
│ │ ├── __init__.py
│ │ ├── cache.py
│ │ ├── calibration.py
│ │ ├── gptq
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ └── gptq_quantize.py
│ │ └── quantization
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ └── mixin.py
│ ├── smoothquant
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── base.py
│ │ └── utils.py
│ ├── stage.py
│ └── utils
│ │ ├── __init__.py
│ │ ├── constants.py
│ │ ├── helpers.py
│ │ ├── hooks.py
│ │ └── pytorch_helpers.py
│ ├── observers
│ ├── __init__.py
│ ├── base.py
│ ├── helpers.py
│ ├── min_max.py
│ └── mse.py
│ ├── pipelines
│ ├── __init__.py
│ ├── basic
│ │ ├── __init__.py
│ │ └── pipeline.py
│ ├── cache.py
│ ├── data_free
│ │ ├── __init__.py
│ │ └── pipeline.py
│ ├── independent
│ │ ├── __init__.py
│ │ └── pipeline.py
│ ├── layer_sequential
│ │ ├── __init__.py
│ │ ├── helpers.py
│ │ └── pipeline.py
│ ├── registry.py
│ └── sequential
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── ast_helpers.py
│ │ ├── ast_utils
│ │ ├── auto_wrapper.py
│ │ ├── control_flow_analyzer.py
│ │ └── name_analyzer.py
│ │ ├── helpers.py
│ │ └── pipeline.py
│ ├── pytorch
│ ├── __init__.py
│ ├── model_load
│ │ ├── __init__.py
│ │ └── helpers.py
│ └── utils
│ │ ├── __init__.py
│ │ ├── helpers.py
│ │ ├── sparsification.py
│ │ └── sparsification_info
│ │ ├── __init__.py
│ │ ├── configs.py
│ │ ├── helpers.py
│ │ └── module_sparsification_info.py
│ ├── recipe
│ ├── __init__.py
│ ├── base.py
│ ├── metadata.py
│ ├── modifier.py
│ ├── recipe.py
│ └── stage.py
│ ├── sentinel.py
│ ├── transformers
│ ├── __init__.py
│ ├── compression
│ │ ├── __init__.py
│ │ ├── helpers.py
│ │ ├── quantization_format.py
│ │ └── sparsity_metadata_config.py
│ ├── finetune
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── callbacks.py
│ │ ├── data
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── c4.py
│ │ │ ├── cnn_dailymail.py
│ │ │ ├── custom.py
│ │ │ ├── data_helpers.py
│ │ │ ├── evolcodealpaca.py
│ │ │ ├── flickr_30k.py
│ │ │ ├── gsm8k.py
│ │ │ ├── open_platypus.py
│ │ │ ├── peoples_speech.py
│ │ │ ├── ptb.py
│ │ │ ├── ultrachat_200k.py
│ │ │ └── wikitext.py
│ │ ├── session_mixin.py
│ │ ├── text_generation.py
│ │ └── trainer.py
│ ├── sparsification
│ │ ├── __init__.py
│ │ ├── compressed_tensors_utils.py
│ │ └── sparse_model.py
│ ├── tracing
│ │ ├── __init__.py
│ │ └── debug.py
│ └── utils
│ │ ├── __init__.py
│ │ ├── helpers.py
│ │ └── preprocessing_functions.py
│ ├── typing.py
│ └── utils
│ ├── __init__.py
│ ├── dev.py
│ ├── fsdp
│ ├── __init__.py
│ ├── context.py
│ └── helpers.py
│ ├── helpers.py
│ ├── metric_logging.py
│ └── pytorch
│ ├── __init__.py
│ ├── module.py
│ └── utils.py
└── tests
├── __init__.py
├── custom_test.py
├── data.py
├── e2e
├── __init__.py
├── e2e_utils.py
└── vLLM
│ ├── __init__.py
│ ├── configs
│ ├── fp8_dynamic_per_token.yaml
│ ├── fp8_dynamic_per_token_qwen.yaml
│ ├── fp8_static_per_tensor.yaml
│ ├── fp8_weight_only_channel.yaml
│ ├── fp8_weight_only_tensor.yaml
│ ├── int8_channel_weight_static_per_tensor_act.yaml
│ ├── int8_dynamic_per_token.yaml
│ ├── int8_tensor_weight_static_per_tensor_act.yaml
│ ├── int8_tensor_weight_static_per_tensor_act_qwen.yaml
│ ├── kv_cache_gptq_tinyllama.yaml
│ ├── kv_cache_phi3.yaml
│ ├── kv_cache_tinyllama.yaml
│ ├── sparse2of4_fp8_dynamic.yaml
│ ├── sparse2of4_fp8_dynamic_qwen.yaml
│ ├── sparse_24.yaml
│ ├── w4a16_2of4_channel_quant.yaml
│ ├── w4a16_2of4_grouped_quant.yaml
│ ├── w4a16_actorder_group.yaml
│ ├── w4a16_actorder_group_qwen.yaml
│ ├── w4a16_actorder_weight.yaml
│ ├── w4a16_actorder_weight_qwen.yaml
│ ├── w4a16_channel_quant.yaml
│ ├── w4a16_channel_quant_qwen.yaml
│ ├── w4a16_grouped_quant.yaml
│ ├── w4a16_grouped_quant_asym_awq.yaml
│ ├── w8a16_channel_quant.yaml
│ ├── w8a16_grouped_quant.yaml
│ ├── w8a8_dynamic_asym.yaml
│ └── w8a8_static_asym.yaml
│ ├── recipes
│ ├── FP8
│ │ ├── recipe_fp8_weight_only_channel.yaml
│ │ └── recipe_fp8_weight_only_per_tensor.yaml
│ ├── INT8
│ │ ├── recipe_int8_channel_weight_dynamic_per_token.yaml
│ │ ├── recipe_int8_channel_weight_static_per_tensor_act.yaml
│ │ ├── recipe_int8_tensor_weight_static_per_tensor_act.yaml
│ │ ├── recipe_w8a8_dynamic_asym.yaml
│ │ └── recipe_w8a8_static_asym.yaml
│ ├── Sparse_2of4
│ │ ├── recipe_sparse_2of4.yaml
│ │ └── recipe_sparse_2of4_fp8_dynamic.yaml
│ ├── WNA16
│ │ ├── recipe_w4a16_channel_quant.yaml
│ │ ├── recipe_w4a16_group_quant_asym_awq.yaml
│ │ └── recipe_w8a16_channel_quant.yaml
│ ├── WNA16_2of4
│ │ ├── 2of4_w4a16_group-128_recipe.yaml
│ │ └── 2of4_w4a16_recipe.yaml
│ ├── actorder
│ │ ├── recipe_w4a16_actorder_group.yaml
│ │ └── recipe_w4a16_actorder_weight.yaml
│ └── kv_cache
│ │ ├── default.yaml
│ │ └── gptq.yaml
│ ├── run_tests.sh
│ ├── skipped_configs
│ └── fp4_nvfp4a16.yaml
│ └── test_vllm.py
├── examples
├── __init__.py
├── test_big_models_with_accelerate.py
├── test_compressed_inference.py
├── test_quantization_2of4_sparse_w4a16.py
├── test_quantization_kv_cache.py
├── test_quantization_w4a16.py
├── test_quantization_w8a8_fp8.py
├── test_quantization_w8a8_int8.py
├── test_quantizing_moe.py
├── test_sparse_2of4_quantization_fp8.py
├── test_trl_mixin.py
└── utils.py
├── llmcompressor
├── __init__.py
├── conftest.py
├── helpers.py
├── metrics
│ ├── __init__.py
│ ├── test_logger.py
│ └── utils
│ │ ├── __init__.py
│ │ └── test_frequency_manager.py
├── modifiers
│ ├── __init__.py
│ ├── awq
│ │ ├── __init__.py
│ │ └── test_base.py
│ ├── calibration
│ │ ├── __init__.py
│ │ ├── test_cache.py
│ │ ├── test_frozen.py
│ │ ├── test_kv_cache.py
│ │ └── test_observers.py
│ ├── conf.py
│ ├── logarithmic_equalization
│ │ ├── __init__.py
│ │ └── test_base.py
│ ├── pruning
│ │ ├── __init__.py
│ │ ├── sparsegpt
│ │ │ ├── __init__.py
│ │ │ └── test_base.py
│ │ └── wanda
│ │ │ ├── __init__.py
│ │ │ └── test_base.py
│ ├── quantization
│ │ ├── __init__.py
│ │ └── test_base.py
│ ├── smoothquant
│ │ ├── __init__.py
│ │ ├── test_base.py
│ │ └── test_utils.py
│ └── utils
│ │ └── test_hooks.py
├── observers
│ ├── __init__.py
│ ├── test_helpers.py
│ ├── test_min_max.py
│ └── test_mse.py
├── pipelines
│ ├── sequential
│ │ ├── ast_utils.py
│ │ │ └── test_auto_wrapper.py
│ │ └── test_helpers.py
│ └── test_cache.py
├── pytorch
│ ├── __init__.py
│ ├── helpers.py
│ ├── modifiers
│ │ ├── __init__.py
│ │ ├── logarithmic_equalization
│ │ │ ├── __init__.py
│ │ │ └── test_pytorch.py
│ │ ├── pruning
│ │ │ ├── __init__.py
│ │ │ ├── constant
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_pytorch.py
│ │ │ ├── sparsegpt
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_pytorch.py
│ │ │ └── wanda
│ │ │ │ └── test_pytorch.py
│ │ └── smoothquant
│ │ │ ├── __init__.py
│ │ │ └── test_pytorch.py
│ └── utils
│ │ ├── __init__.py
│ │ └── test_helpers.py
├── recipe
│ ├── __init__.py
│ ├── recipe.yaml
│ ├── test_recipe.py
│ └── test_recipe_parsing.py
├── test_sentinel.py
├── transformers
│ ├── __init__.py
│ ├── compression
│ │ ├── __init__.py
│ │ ├── configs
│ │ │ ├── actorder_group_1.1b.yaml
│ │ │ ├── actorder_weight_1.1b.yaml
│ │ │ ├── channelwise_1.1b.yaml
│ │ │ ├── channelwise_15m.yaml
│ │ │ ├── fp8_1.1b.yaml
│ │ │ ├── fp8_15m.yaml
│ │ │ ├── group_1.1b.yaml
│ │ │ ├── inputs_1.1b.yaml
│ │ │ ├── inputs_15m.yaml
│ │ │ ├── weights_only_1.1b.yaml
│ │ │ └── weights_only_15m.yaml
│ │ ├── decompression_configs
│ │ │ ├── fp8_dynamic.yaml
│ │ │ ├── w4a16.yaml
│ │ │ └── w8a16_dense.yaml
│ │ ├── decompression_configs_skipped
│ │ │ └── w8a8.yaml
│ │ ├── recipes
│ │ │ ├── new_quant_actorder_group.yaml
│ │ │ ├── new_quant_actorder_weight.yaml
│ │ │ ├── new_quant_channel.yaml
│ │ │ ├── new_quant_fp8.yaml
│ │ │ ├── new_quant_full.yaml
│ │ │ ├── new_quant_group.yaml
│ │ │ ├── new_quant_simple.yaml
│ │ │ ├── new_quant_weight.yaml
│ │ │ ├── sparse_24.yaml
│ │ │ └── sparse_24_fp8.yaml
│ │ ├── run_compressed_configs
│ │ │ ├── fp8_dynamic.yaml
│ │ │ ├── w4a16.yaml
│ │ │ └── w8a16.yaml
│ │ ├── run_compressed_configs_skipped
│ │ │ └── w8a8.yaml
│ │ ├── test_decompress.py
│ │ ├── test_has_gpu.py
│ │ ├── test_helpers.py
│ │ ├── test_infer_quant_format.py
│ │ ├── test_quantization.py
│ │ ├── test_run_compressed.py
│ │ └── test_sparsity_metadata_config.py
│ ├── conftest.py
│ ├── finetune
│ │ ├── __init__.py
│ │ ├── data
│ │ │ ├── __init__.py
│ │ │ ├── conftest.py
│ │ │ ├── test_dataset_helpers.py
│ │ │ ├── test_dataset_loading.py
│ │ │ └── test_registry.py
│ │ ├── finetune_custom
│ │ │ ├── config1.yaml
│ │ │ ├── config2.yaml
│ │ │ └── gpu
│ │ │ │ └── gpu_config.yaml
│ │ ├── finetune_generic
│ │ │ └── config1.yaml
│ │ ├── finetune_oneshot_configs
│ │ │ ├── config.yaml
│ │ │ └── gpu
│ │ │ │ └── gpu_config.yaml
│ │ ├── finetune_tokenizer
│ │ │ └── config1.yaml
│ │ ├── test_alternate_recipe.yaml
│ │ ├── test_finetune_no_recipe_custom_dataset.py
│ │ ├── test_finetune_recipe.yaml
│ │ ├── test_finetune_without_recipe.py
│ │ ├── test_oneshot_and_finetune.py
│ │ ├── test_oneshot_and_finetune_with_tokenizer.py
│ │ ├── test_oneshot_then_finetune.py
│ │ ├── test_quantization.yaml
│ │ ├── test_safetensors.py
│ │ └── test_session_mixin.py
│ ├── gptq
│ │ └── test_oneshot.py
│ ├── kv_cache
│ │ └── test_kv_cache.py
│ ├── obcq
│ │ ├── __init__.py
│ │ ├── obcq_configs
│ │ │ ├── completion
│ │ │ │ ├── gpu
│ │ │ │ │ ├── llama_7b_quant.yaml
│ │ │ │ │ ├── llama_7b_quant_and_sparse.yaml
│ │ │ │ │ └── llama_7b_sparse.yml
│ │ │ │ ├── tiny_llama_quant.yaml
│ │ │ │ └── tiny_llama_quant_and_sparse.yaml
│ │ │ ├── consec_runs
│ │ │ │ ├── gpu
│ │ │ │ │ └── llama_consec_runs.yaml
│ │ │ │ └── tiny_llama_consec_runs.yaml
│ │ │ ├── mask_structure
│ │ │ │ └── tiny_llama_mask_structure_preservation.yaml
│ │ │ ├── sparse
│ │ │ │ ├── gpu
│ │ │ │ │ └── llama_7b_sparse.yaml
│ │ │ │ └── tiny_llama_sparse.yaml
│ │ │ └── sparsity_generic
│ │ │ │ └── config.yaml
│ │ ├── recipes
│ │ │ ├── additional_sparsity.yaml
│ │ │ ├── additional_sparsity_with_quant.yaml
│ │ │ ├── quant.yaml
│ │ │ ├── quant_and_sparse.yaml
│ │ │ ├── sparse.yaml
│ │ │ ├── sparse_with_mask_structure.yaml
│ │ │ └── test_tiny2.yaml
│ │ ├── test_consecutive_runs.py
│ │ ├── test_mask_structure_preservation.py
│ │ ├── test_obcq_completion.py
│ │ ├── test_obcq_infer_targets.py
│ │ ├── test_obcq_lm_head.py
│ │ ├── test_obcq_owl.py
│ │ ├── test_obcq_sparsity.py
│ │ └── test_oneshot_with_modifier.py
│ ├── oneshot
│ │ ├── __init__.py
│ │ ├── dataset_processing.py
│ │ ├── oneshot_configs
│ │ │ ├── recipes
│ │ │ │ └── recipe.yaml
│ │ │ ├── tiny_stories_conf1.yaml
│ │ │ ├── tiny_stories_conf2.yaml
│ │ │ ├── tiny_stories_conf3.yaml
│ │ │ ├── tiny_stories_conf4.yaml
│ │ │ ├── tiny_stories_conf5.yaml
│ │ │ └── tiny_stories_conf6.yaml
│ │ └── test_api_inputs.py
│ ├── sparsification
│ │ ├── __init__.py
│ │ └── test_compress_tensor_utils.py
│ └── tracing
│ │ └── test_models.py
└── utils
│ ├── __init__.py
│ ├── pytorch
│ ├── __init__.py
│ └── test_module.py
│ └── test_helpers.py
├── lmeval
├── __init__.py
├── configs
│ ├── fp8_dynamic_per_token.yaml
│ ├── fp8_static_per_tensor.yaml
│ ├── int8_w8a8_dynamic_per_token.yaml
│ ├── vl_fp8_dynamic_per_token.yaml
│ ├── vl_int8_w8a8_dynamic_per_token.yaml
│ ├── vl_w4a16_actorder_weight.yaml
│ ├── w4a16_actorder_group.yaml
│ ├── w4a16_actorder_weight.yaml
│ └── w4a16_grouped_quant.yaml
└── test_lmeval.py
├── test_timer
├── __init__.py
├── timer.py
└── timer_utils.py
├── testing_utils.py
└── unit
├── __init__.py
├── core
├── __init__.py
├── events
│ ├── __init__.py
│ └── test_event.py
└── test_state.py
└── test_logger.py
/.MAINTAINERS:
--------------------------------------------------------------------------------
1 | # list of active maintainers
2 | # uncommented maintainers will be included in code review triage
3 |
4 | markurtz
5 | dsikka
6 | rahul-tuli
7 | horheynm
8 | brian-dellabetta
9 | kylesayrs
10 |
11 | # mgoin
12 | # anmarques
13 | # eldarkurtic
14 | # chibukach
15 | # shubhra
16 | # abhinavnmagic
17 | # eiofinov
18 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | labels: bug
5 |
6 | ---
7 |
8 | **Describe the bug**
9 | A clear and concise description of what the bug is.
10 |
11 | **Expected behavior**
12 | A clear and concise description of what you expected to happen.
13 |
14 | **Environment**
15 | Include all relevant environment information:
16 | 1. OS [e.g. Ubuntu 20.04]:
17 | 2. Python version [e.g. 3.7]:
18 | 3. LLM Compressor version or commit hash [e.g. 0.1.0, `f7245c8`]:
19 | 4. ML framework version(s) [e.g. torch 2.3.1]:
20 | 5. Other Python package versions [e.g. vLLM, compressed-tensors, numpy, ONNX]:
21 | 6. Other relevant environment information [e.g. hardware, CUDA version]:
22 |
23 | **To Reproduce**
24 | Exact steps to reproduce the behavior:
25 |
26 |
27 | **Errors**
28 | If applicable, add a full print-out of any errors or exceptions that are raised or include screenshots to help explain your problem.
29 |
30 | **Additional context**
31 | Add any other context about the problem here. Also include any relevant files.
32 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/doc-edit.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Doc edit
3 | about: Propose changes to project documentation
4 | labels: documentation
5 |
6 | ---
7 |
8 | **What is the URL, file, or UI containing the proposed doc change**
9 | Where does one find the original content or where would this change go?
10 |
11 | **What is the current content or situation in question**
12 | Copy/paste the source content or describe the gap.
13 |
14 | **What is the proposed change**
15 | Add new content.
16 |
17 | **Additional context**
18 | Add any other context about the change here. Also include any relevant files or URLs.
19 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | labels: enhancement
5 |
6 | ---
7 |
8 | **Is your feature request related to a problem? Please describe.**
9 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
10 |
11 | **Describe the solution you'd like**
12 | A clear and concise description of what you want to happen.
13 |
14 | **Describe alternatives you've considered**
15 | A clear and concise description of any alternative solutions or features you've considered.
16 |
17 | **Additional context**
18 | Add any other context or screenshots about the feature request here.
19 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | SUMMARY:
2 | "please provide a brief summary"
3 |
4 |
5 | TEST PLAN:
6 | "please outline how the changes were tested"
7 |
--------------------------------------------------------------------------------
/.github/TODO.txt:
--------------------------------------------------------------------------------
1 | TODO: update for upstream push
--------------------------------------------------------------------------------
/.github/workflows/linkcheck.yml:
--------------------------------------------------------------------------------
1 | name: Check Markdown links
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | pull_request:
8 | branches:
9 | - main
10 |
11 | # Allows you to run this workflow manually from the Actions tab
12 | workflow_dispatch:
13 |
14 | jobs:
15 | markdown-link-check:
16 | runs-on: ubuntu-latest
17 | steps:
18 | - uses: actions/checkout@v4
19 | - uses: umbrelladocs/action-linkspector@v1
20 | with:
21 | github_token: ${{ secrets.github_token }}
22 | reporter: github-pr-review
23 | fail_on_error: true
24 | config_file: '.github/workflows/linkspector/linkspector.yml'
25 |
--------------------------------------------------------------------------------
/.github/workflows/linkspector/linkspector.yml:
--------------------------------------------------------------------------------
1 | aliveStatusCodes:
2 | - 0
3 | - 200
4 | ignorePatterns:
5 | - pattern: '.*localhost.*'
6 | - pattern: '.*127\\.0\\.0\\.1.*'
7 | - pattern: '.*0\\.0\\.0\\.0.*'
8 | dirs:
9 | - .
10 | useGitIgnore: true
--------------------------------------------------------------------------------
/.github/workflows/quality-check.yaml:
--------------------------------------------------------------------------------
1 | name: Quality Checks
2 | on:
3 | push:
4 | branches:
5 | - main
6 | - 'release/*'
7 | pull_request:
8 | branches:
9 | - main
10 | - 'release/*'
11 | jobs:
12 | quality-check:
13 | runs-on: ubuntu-22.04
14 | steps:
15 | - uses: actions/setup-python@v5
16 | with:
17 | python-version: '3.9'
18 | - uses: actions/checkout@v4
19 | - name: "⚙️ Install dependencies"
20 | run: pip3 install .[dev]
21 | - name: "🧹 Running quality checks"
22 | run: make quality
23 |
--------------------------------------------------------------------------------
/.github/workflows/result.xml.fail:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/.github/workflows/result.xml.success:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.github/workflows/set-comment.yaml:
--------------------------------------------------------------------------------
1 | name: PR Reminder Comment Bot
2 | on:
3 | pull_request_target:
4 | branches: [main]
5 | types: [opened]
6 |
7 | jobs:
8 | pr_reminder:
9 | runs-on: ubuntu-latest
10 | steps:
11 | - name: Remind to add ready label
12 | uses: actions/github-script@v7
13 | with:
14 | script: |
15 | github.rest.issues.createComment({
16 | owner: context.repo.owner,
17 | repo: context.repo.repo,
18 | issue_number: context.issue.number,
19 | body: '👋 Hi! Thank you for contributing to llm-compressor. Please add the ready label when the PR is ready for review.\n\n**Note:** This is required to complete the testing suite, please only add the label once the PR is code complete and local testing has been performed.'
20 | })
21 | env:
22 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
23 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to LLM Compressor
2 |
3 | Thank you for your interest in contributing to LLM Compressor!
4 | Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large.
5 | There are several ways you can contribute to the project:
6 |
7 | - Identify and report any issues or bugs.
8 | - Request or add new compression methods or research.
9 | - Suggest or implement new features.
10 |
11 | However, remember that contributions aren't just about code.
12 | We believe in the power of community support; thus, answering queries, assisting others, and enhancing the documentation are highly regarded and beneficial contributions.
13 |
14 | Finally, one of the most impactful ways to support us is by raising awareness about LLM Compressor and the vLLM community.
15 | Talk about it in your blog posts, highlighting how it's driving your incredible projects.
16 | Express your support on Twitter if vLLM aids you, or simply offer your appreciation by starring our repository.
17 |
18 | ## Setup for development
19 |
20 | ### Install from source
21 |
22 | ```bash
23 | pip install -e ./[dev]
24 | ```
25 |
26 | ### Code Styling and Formatting checks
27 |
28 | ```bash
29 | make style
30 | make quality
31 | ```
32 |
33 | ### Testing
34 |
35 | ```bash
36 | make test
37 | ```
38 |
39 | ## Contributing Guidelines
40 |
41 | ### Issue Reporting
42 |
43 | If you encounter a bug or have a feature request, please check our issues page first to see if someone else has already reported it.
44 | If not, please file a new issue, providing as much relevant information as possible.
45 |
46 | ### Pull Requests & Code Reviews
47 |
48 | Please check the PR checklist in the [PR template](.github/PULL_REQUEST_TEMPLATE.md) for a detailed guide on contributing.
49 |
50 | ### Thank You
51 |
52 | Finally, thank you for taking the time to read these guidelines and for your interest in contributing to LLM Compressor.
53 | Your contributions make LLM Compressor a great tool for everyone!
54 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | recursive-exclude src *.png *.jpg *.jpeg *.gif *.svg *.bmp *.webp
3 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | BUILDDIR := $(PWD)
2 | CHECKDIRS := src tests examples setup.py
3 | DOCDIR := docs
4 |
5 | BUILD_ARGS := # set nightly to build nightly release
6 |
7 | # refer to setup.py for allowed values for BUILD_TYPE
8 | BUILD_TYPE?=dev
9 | export BUILD_TYPE
10 |
11 | TARGETS := "" # comma-separated targets for running pytests: transformers,pytorch,examples (see the ifneq blocks below)
12 | PYTEST_ARGS ?= ""
13 | ifneq ($(findstring transformers,$(TARGETS)),transformers)
14 | PYTEST_ARGS := $(PYTEST_ARGS) --ignore tests/llmcompressor/transformers
15 | endif
16 | ifneq ($(findstring pytorch,$(TARGETS)),pytorch)
17 | PYTEST_ARGS := $(PYTEST_ARGS) --ignore tests/llmcompressor/pytorch
18 | endif
19 | ifneq ($(findstring examples,$(TARGETS)),examples)
20 | PYTEST_ARGS := $(PYTEST_ARGS) --ignore tests/examples
21 | endif
22 |
23 | # run checks on all files for the repo
24 | # leaving out mypy src for now
25 | quality:
26 | @echo "Running python quality checks";
27 | ruff check $(CHECKDIRS);
28 | isort --check-only $(CHECKDIRS);
29 | flake8 $(CHECKDIRS) --max-line-length 88 --extend-ignore E203;
30 |
31 | # style the code according to accepted standards for the repo
32 | style:
33 | @echo "Running python styling";
34 | ruff format $(CHECKDIRS);
35 | isort $(CHECKDIRS);
36 | flake8 $(CHECKDIRS) --max-line-length 88 --extend-ignore E203;
37 |
38 | # run tests for the repo
39 | test:
40 | @echo "Running python tests";
41 | pytest tests $(PYTEST_ARGS)
42 |
43 | # creates wheel file
44 | .PHONY: build
45 | build:
46 | python3 setup.py sdist bdist_wheel $(BUILD_ARGS)
47 |
48 | # clean package
49 | clean:
50 | rm -fr .pytest_cache;
51 | rm -fr docs/_build docs/build;
52 | find $(CHECKDIRS) | grep -E "(__pycache__|\.pyc|\.pyo)" | xargs rm -fr;
53 |
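54 | # example invocations (assumes a dev install, e.g. `pip install -e ./[dev]`):
55 | #   make quality                     # run lint checks only
56 | #   make test TARGETS=transformers   # also include the transformers test suite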
--------------------------------------------------------------------------------
/examples/big_models_with_accelerate/cpu_offloading_fp8.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoModelForCausalLM, AutoTokenizer
2 |
3 | from llmcompressor import oneshot
4 | from llmcompressor.modifiers.quantization import QuantizationModifier
5 |
6 | MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"
7 | OUTPUT_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
8 |
9 | # Load model
10 | # Note: device_map="auto" will offload to CPU if not enough space on GPU.
11 | model = AutoModelForCausalLM.from_pretrained(
12 | MODEL_ID, device_map="auto", torch_dtype="auto", trust_remote_code=True
13 | )
14 |
15 | # Configure the quantization scheme and algorithm (PTQ + FP8_DYNAMIC).
16 | recipe = QuantizationModifier(
17 | targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]
18 | )
19 |
20 | # Apply quantization and save in `compressed-tensors` format.
21 | oneshot(
22 | model=model,
23 | recipe=recipe,
24 | tokenizer=AutoTokenizer.from_pretrained(MODEL_ID),
25 | output_dir=OUTPUT_DIR,
26 | )
27 |
--------------------------------------------------------------------------------
/examples/compressed_inference/fp8_compressed_inference.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoModelForCausalLM, AutoTokenizer
2 |
3 | """
4 | This example covers how to load a quantized model using AutoModelForCausalLM.
5 |
6 | During inference, each layer will be decompressed as needed before the forward pass.
7 | This saves memory as only a single layer is ever uncompressed at a time, but increases
8 | runtime as we need to decompress each layer before running the forward pass.
9 |
10 | """
11 |
12 | # any model with the "compressed-tensors" quant_method and "compressed"
13 | # quantization_status in the quantization config is supported
14 | MODEL_STUB = "nm-testing/tinyllama-fp8-dynamic-compressed"
15 |
16 | SAMPLE_INPUT = [
17 | "I love quantization because",
18 | "What is the capital of France?",
19 | "def fibonacci(n):",
20 | ]
21 |
22 | compressed_model = AutoModelForCausalLM.from_pretrained(
23 | MODEL_STUB,
24 | torch_dtype="auto",
25 | device_map="cuda:0",
26 | )
27 |
28 | # tokenize the sample data
29 | tokenizer = AutoTokenizer.from_pretrained(MODEL_STUB)
30 | inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(
31 | compressed_model.device
32 | )
33 |
34 | # run the compressed model and decode the output
35 | output = compressed_model.generate(**inputs, max_length=50)
36 | print("========== SAMPLE GENERATION ==============")
37 | text_output = tokenizer.batch_decode(output)
38 | for sample in text_output:
39 | print(sample)
40 |
--------------------------------------------------------------------------------
/examples/finetuning/configure_fsdp.md:
--------------------------------------------------------------------------------
1 | # Configuring FSDP for Sparse Finetuning
2 |
3 | An example FSDP configuration file, `example_fsdp_config.yaml`, is provided in this
4 | folder. It can be used out of the box by editing the `num_processes` parameter to
5 | fit the number of GPUs on your machine.
6 |
7 | You can also customize your own config file by running the following command:
8 | ```
9 | accelerate config
10 | ```
11 |
12 | An FSDP config file can be passed to the LLM Compressor finetuning script like this:
13 | ```
14 | accelerate launch --config_file example_fsdp_config.yaml --no_python llmcompressor.transformers.text_generation.finetune
15 | ```
16 |
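17 | For example, on a machine with 8 GPUs (adjust this to your own hardware), the only
18 | edit needed in `example_fsdp_config.yaml` is the process count:
19 | ```
20 | num_processes: 8
21 | ```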
--------------------------------------------------------------------------------
/examples/finetuning/example_alternating_recipe.yaml:
--------------------------------------------------------------------------------
1 | initial_sparsity_stage:
2 | run_type: oneshot
3 | obcq_modifiers:
4 | SparseGPTModifier:
5 | sparsity: 0.5
6 | block_size: 128
7 | percdamp: 0.01
8 | mask_structure: "0:0"
9 | targets: ["Linear"]
10 | ignore: ["re:.*lm_head"]
11 | initial_training_stage:
12 | run_type: train
13 | pruning_modifiers:
14 | ConstantPruningModifier:
15 | targets: '__ALL__'
16 | start: 0
17 | next_sparsity_stage:
18 | run_type: oneshot
19 | obcq_modifiers:
20 | SparseGPTModifier:
21 | sparsity: 0.7
22 | block_size: 128
23 | percdamp: 0.01
24 | mask_structure: "0:0"
25 | targets: ["Linear"]
26 | ignore: ["re:.*lm_head"]
27 | next_training_stage:
28 | run_type: train
29 | pruning_modifiers:
30 | ConstantPruningModifier:
31 | targets: '__ALL__'
32 | start: 0
--------------------------------------------------------------------------------
/examples/finetuning/example_fsdp_config.yaml:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | debug: false
3 | distributed_type: FSDP
4 | downcast_bf16: 'no'
5 | fsdp_config:
6 | fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
7 | fsdp_backward_prefetch_policy: BACKWARD_PRE
8 | fsdp_cpu_ram_efficient_loading: false
9 | fsdp_forward_prefetch: false
10 | fsdp_offload_params: false
11 | fsdp_sharding_strategy: 1
12 | fsdp_state_dict_type: SHARDED_STATE_DICT
13 | fsdp_sync_module_states: true
14 | fsdp_use_orig_params: false
15 | machine_rank: 0
16 | main_training_function: main
17 | num_machines: 1
18 | num_processes: 4
19 | rdzv_backend: static
20 | same_network: true
21 | tpu_env: []
22 | tpu_use_cluster: false
23 | tpu_use_sudo: false
24 | use_cpu: false
25 |
--------------------------------------------------------------------------------
/examples/finetuning/example_single_gpu_config.yaml:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | debug: false
3 | distributed_type: 'NO'
4 | enable_cpu_affinity: false
5 | gpu_ids: 0
6 | machine_rank: 0
7 | main_training_function: main
8 | num_machines: 1
9 | num_processes: 1
10 | rdzv_backend: static
11 | same_network: true
12 | tpu_env: []
13 | tpu_use_cluster: false
14 | tpu_use_sudo: false
15 | use_cpu: false
--------------------------------------------------------------------------------
/examples/multimodal_vision/mistral3_chat_template.json:
--------------------------------------------------------------------------------
1 | {
2 | "chat_template": "{%- set default_system_message = \"You are Mistral Small 3, a Large Language Model (LLM) created by Mistral AI\" %}\n\n{{- bos_token }}\n\n{%- if messages[0]['role'] == 'system' %}\n {%- if messages[0]['content'] is string %}\n {%- set system_message = messages[0]['content'] %}\n {%- else %}\n {%- set system_message = messages[0]['content'][0]['text'] %}\n {%- endif %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set system_message = default_system_message %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{{- '[SYSTEM_PROMPT]' + system_message + '[/SYSTEM_PROMPT]' }}\n\n{%- for message in loop_messages %}\n {%- if message['role'] == 'user' %}\n {%- if message['content'] is string %}\n {{- '[INST]' + message['content'] + '[/INST]' }}\n {%- else %}\n {{- '[INST]' }}\n {%- for block in message['content'] %}\n {%- if block['type'] == 'text' %}\n {{- block['text'] }}\n {%- elif block['type'] in ['image', 'image_url'] %}\n {{- '[IMG]' }}\n {%- else %}\n {{- raise_exception('Only text and image blocks are supported in message content!') }}\n {%- endif %}\n {%- endfor %}\n {{- '[/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'system' %}\n {%- if message['content'] is string %}\n {{- '[SYSTEM_PROMPT]' + message['content'] + '[/SYSTEM_PROMPT]' }}\n {%- else %}\n {{- '[SYSTEM_PROMPT]' + message['content'][0]['text'] + '[/SYSTEM_PROMPT]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {%- if message['content'] is string %}\n {{- message['content'] + eos_token }}\n {%- else %}\n {{- message['content'][0]['text'] + eos_token }}\n {%- endif %}\n {%- else %}\n {{- raise_exception('Only user, system and assistant roles are supported!') }}\n {%- endif %}\n{%- endfor %}"
3 | }
--------------------------------------------------------------------------------
/examples/quantization_2of4_sparse_w4a16/2of4_w4a16_group-128_recipe.yaml:
--------------------------------------------------------------------------------
1 | sparsity_stage:
2 | sparsity_modifiers:
3 | SparseGPTModifier:
4 | sparsity: 0.5
5 | mask_structure: "2:4"
6 | targets: ["Linear"]
7 | ignore: ["re:.*lm_head"]
8 | finetuning_stage:
9 | finetuning_modifiers:
10 | ConstantPruningModifier:
11 | targets: [
12 | 're:.*q_proj.weight',
13 | 're:.*k_proj.weight',
14 | 're:.*v_proj.weight',
15 | 're:.*o_proj.weight',
16 | 're:.*gate_proj.weight',
17 | 're:.*up_proj.weight',
18 | 're:.*down_proj.weight',
19 | ]
20 | start: 0
21 | quantization_stage:
22 | quantization_modifiers:
23 | GPTQModifier:
24 | ignore: ["lm_head"]
25 | config_groups:
26 | group_0:
27 | weights:
28 | num_bits: 4
29 | type: "int"
30 | symmetric: true
31 | strategy: "group"
32 | group_size: 128
33 | targets: ["Linear"]
34 |
--------------------------------------------------------------------------------
/examples/quantization_2of4_sparse_w4a16/2of4_w4a16_recipe.yaml:
--------------------------------------------------------------------------------
1 | sparsity_stage:
2 | sparsity_modifiers:
3 | SparseGPTModifier:
4 | sparsity: 0.5
5 | mask_structure: "2:4"
6 | targets: ["Linear"]
7 | ignore: ["re:.*lm_head"]
8 | finetuning_stage:
9 | finetuning_modifiers:
10 | ConstantPruningModifier:
11 | targets: [
12 | 're:.*q_proj.weight',
13 | 're:.*k_proj.weight',
14 | 're:.*v_proj.weight',
15 | 're:.*o_proj.weight',
16 | 're:.*gate_proj.weight',
17 | 're:.*up_proj.weight',
18 | 're:.*down_proj.weight',
19 | ]
20 | start: 0
21 | quantization_stage:
22 | quantization_modifiers:
23 | GPTQModifier:
24 | ignore: ["lm_head"]
25 | config_groups:
26 | group_0:
27 | weights:
28 | num_bits: 4
29 | type: "int"
30 | symmetric: true
31 | strategy: "channel"
32 | targets: ["Linear"]
33 |
--------------------------------------------------------------------------------
/examples/quantization_w4a16_fp4/llama3_example.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoModelForCausalLM, AutoTokenizer
2 |
3 | from llmcompressor import oneshot
4 | from llmcompressor.modifiers.quantization import QuantizationModifier
5 |
6 | MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
7 |
8 | # Load model.
9 | model = AutoModelForCausalLM.from_pretrained(
10 | MODEL_ID, device_map="auto", torch_dtype="auto"
11 | )
12 | tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
13 |
14 | # Configure the quantization algorithm and scheme.
15 | # In this case, we:
16 | # * quantize the weights to fp4 with a group size of 16 via PTQ
17 | recipe = QuantizationModifier(targets="Linear", scheme="NVFP4A16", ignore=["lm_head"])
18 |
19 | # Apply quantization.
20 | oneshot(model=model, recipe=recipe)
21 |
22 | print("\n\n")
23 | print("========== SAMPLE GENERATION ==============")
24 | input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
25 | output = model.generate(input_ids, max_new_tokens=100)
26 | print(tokenizer.decode(output[0]))
27 | print("==========================================\n\n")
28 |
29 |
30 | # Save to disk in compressed-tensors format.
31 | SAVE_DIR = MODEL_ID.split("/")[1] + "-NVFP4A16"
32 | model.save_pretrained(SAVE_DIR, save_compressed=True)
33 | tokenizer.save_pretrained(SAVE_DIR)
34 |
--------------------------------------------------------------------------------
/examples/quantization_w8a8_fp8/gemma2_example.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoModelForCausalLM, AutoTokenizer
2 |
3 | from llmcompressor import oneshot
4 | from llmcompressor.modifiers.quantization import QuantizationModifier
5 |
6 | MODEL_ID = "google/gemma-2-27b-it"
7 |
8 | # 1) Load model.
9 | model = AutoModelForCausalLM.from_pretrained(
10 | MODEL_ID, device_map="auto", torch_dtype="auto"
11 | )
12 | tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
13 |
14 | # 2) Configure the quantization algorithm and scheme.
15 | # In this case, we:
16 | # * quantize the weights to fp8 with per channel via ptq
17 | # * quantize the activations to fp8 with dynamic per token
18 | recipe = QuantizationModifier(
19 | targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]
20 | )
21 |
22 | # 3) Apply quantization and save in compressed-tensors format.
23 | OUTPUT_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
24 | oneshot(
25 | model=model,
26 | recipe=recipe,
27 | tokenizer=tokenizer,
28 | output_dir=OUTPUT_DIR,
29 | )
30 |
31 | # Confirm generations of the quantized model look sane.
32 | # NOTE: transformers 4.49.0 results in a generation error with gemma2.
33 | # Consider either downgrading to an earlier transformers version
34 | # or using vLLM for sample generation.
35 | print("========== SAMPLE GENERATION ==============")
36 | input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
37 | output = model.generate(input_ids, max_new_tokens=20)
38 | print(tokenizer.decode(output[0]))
39 | print("==========================================")
40 |
--------------------------------------------------------------------------------
/examples/quantization_w8a8_fp8/llama3.2_vision_example.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoProcessor, MllamaForConditionalGeneration
2 |
3 | from llmcompressor import oneshot
4 | from llmcompressor.modifiers.quantization import QuantizationModifier
5 |
6 | MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct"
7 |
8 | # Load model.
9 | model = MllamaForConditionalGeneration.from_pretrained(
10 | MODEL_ID, device_map="auto", torch_dtype="auto"
11 | )
12 | processor = AutoProcessor.from_pretrained(MODEL_ID)
13 |
14 | # Configure the quantization algorithm and scheme.
15 | # In this case, we:
16 | # * quantize the weights to fp8 with per channel via ptq
17 | # * quantize the activations to fp8 with dynamic per token
18 | recipe = QuantizationModifier(
19 | targets="Linear",
20 | scheme="FP8_DYNAMIC",
21 | ignore=["re:.*lm_head", "re:multi_modal_projector.*", "re:vision_model.*"],
22 | )
23 |
24 | # Apply quantization and save to disk in compressed-tensors format.
25 | SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
26 | oneshot(
27 | model=model,
28 | recipe=recipe,
29 | output_dir=SAVE_DIR,
30 | )
31 | processor.save_pretrained(SAVE_DIR)
32 |
33 | # Confirm generations of the quantized model look sane.
34 | print("========== SAMPLE GENERATION ==============")
35 | input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda")
36 | output = model.generate(input_ids, max_new_tokens=20)
37 | print(processor.decode(output[0]))
38 | print("==========================================")
39 |
--------------------------------------------------------------------------------
/examples/quantization_w8a8_fp8/llama3_example.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoModelForCausalLM, AutoTokenizer
2 |
3 | from llmcompressor import oneshot
4 | from llmcompressor.modifiers.quantization import QuantizationModifier
5 |
6 | MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
7 |
8 | # Load model.
9 | model = AutoModelForCausalLM.from_pretrained(
10 | MODEL_ID, device_map="auto", torch_dtype="auto"
11 | )
12 | tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
13 |
14 | # Configure the quantization algorithm and scheme.
15 | # In this case, we:
16 | # * quantize the weights to fp8 with per channel via ptq
17 | # * quantize the activations to fp8 with dynamic per token
18 | recipe = QuantizationModifier(
19 | targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]
20 | )
21 |
22 | # Apply quantization.
23 | oneshot(model=model, recipe=recipe)
24 |
25 | # Confirm generations of the quantized model look sane.
26 | print("========== SAMPLE GENERATION ==============")
27 | input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
28 | output = model.generate(input_ids, max_new_tokens=20)
29 | print(tokenizer.decode(output[0]))
30 | print("==========================================")
31 |
32 | # Save to disk in compressed-tensors format.
33 | SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
34 | model.save_pretrained(SAVE_DIR)
35 | tokenizer.save_pretrained(SAVE_DIR)
36 |
--------------------------------------------------------------------------------
/examples/quantization_w8a8_fp8/llava1.5_example.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoProcessor, LlavaForConditionalGeneration
2 |
3 | from llmcompressor import oneshot
4 | from llmcompressor.modifiers.quantization import QuantizationModifier
5 |
6 | MODEL_ID = "llava-hf/llava-1.5-7b-hf"
7 |
8 | # Load model.
9 | model = LlavaForConditionalGeneration.from_pretrained(
10 | MODEL_ID, device_map="auto", torch_dtype="auto"
11 | )
12 | processor = AutoProcessor.from_pretrained(MODEL_ID)
13 |
14 | # Configure the quantization algorithm and scheme.
15 | # In this case, we:
16 | # * quantize the weights to fp8 with per channel via ptq
17 | # * quantize the activations to fp8 with dynamic per token
18 | recipe = QuantizationModifier(
19 | targets="Linear",
20 | scheme="FP8_DYNAMIC",
21 | ignore=["re:.*lm_head", "re:multi_modal_projector.*", "re:vision_tower.*"],
22 | )
23 |
24 | # Apply quantization and save to disk in compressed-tensors format.
25 | SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
26 | oneshot(model=model, recipe=recipe, output_dir=SAVE_DIR)
27 | processor.save_pretrained(SAVE_DIR)
28 |
29 | # Confirm generations of the quantized model look sane.
30 | print("========== SAMPLE GENERATION ==============")
31 | input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda")
32 | output = model.generate(input_ids, max_new_tokens=20)
33 | print(processor.decode(output[0]))
34 | print("==========================================")
35 |
--------------------------------------------------------------------------------
/examples/quantization_w8a8_fp8/qwen2vl_example.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
2 |
3 | from llmcompressor import oneshot
4 | from llmcompressor.modifiers.quantization import QuantizationModifier
5 |
6 | MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct"
7 |
8 | # Load model.
9 | model = Qwen2VLForConditionalGeneration.from_pretrained(
10 | MODEL_ID, device_map="auto", torch_dtype="auto"
11 | )
12 | processor = AutoProcessor.from_pretrained(MODEL_ID)
13 |
14 | # Configure the quantization algorithm and scheme.
15 | # In this case, we:
16 | # * quantize the weights to fp8 with per channel via ptq
17 | # * quantize the activations to fp8 with dynamic per token
18 | recipe = QuantizationModifier(
19 | targets="Linear",
20 | scheme="FP8_DYNAMIC",
21 | ignore=["re:.*lm_head", "re:visual.*"],
22 | )
23 |
24 | # Apply quantization and save to disk in compressed-tensors format.
25 | SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
26 | oneshot(model=model, recipe=recipe, output_dir=SAVE_DIR)
27 | processor.save_pretrained(SAVE_DIR)
28 |
29 | # Confirm generations of the quantized model look sane.
30 | print("========== SAMPLE GENERATION ==============")
31 | input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda")
32 | output = model.generate(input_ids, max_new_tokens=20)
33 | print(processor.decode(output[0]))
34 | print("==========================================")
35 |
--------------------------------------------------------------------------------
/examples/quantization_w8a8_fp8/whisper_example.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset
2 | from transformers import AutoProcessor, WhisperForConditionalGeneration
3 |
4 | from llmcompressor import oneshot
5 | from llmcompressor.modifiers.quantization import QuantizationModifier
6 |
7 | MODEL_ID = "openai/whisper-large-v2"
8 |
9 | # Load model.
10 | model = WhisperForConditionalGeneration.from_pretrained(
11 | MODEL_ID, device_map="auto", torch_dtype="auto"
12 | )
13 | model.config.forced_decoder_ids = None
14 | processor = AutoProcessor.from_pretrained(MODEL_ID)
15 | processor.tokenizer.set_prefix_tokens(language="en", task="transcribe")
16 |
17 | # Configure the quantization algorithm and scheme.
18 | # In this case, we:
19 | # * quantize the weights to fp8 with per channel via ptq
20 | # * quantize the activations to fp8 with dynamic per token
21 | recipe = QuantizationModifier(
22 | targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]
23 | )
24 |
25 | # Apply quantization.
26 | oneshot(model=model, recipe=recipe)
27 |
28 | # Confirm generations of the quantized model look sane.
29 | print("========== SAMPLE GENERATION ==============")
30 | ds = load_dataset(
31 | "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]"
32 | )
33 | sample = ds[0]["audio"]
34 | input_features = processor(
35 | sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt"
36 | ).input_features
37 | input_features = input_features.to(model.device)
38 | output_ids = model.generate(input_features, language="en", forced_decoder_ids=None)
39 | print(processor.batch_decode(output_ids, skip_special_tokens=False)[0])
40 | # Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel
41 | print("==========================================")
42 |
43 | # Save to disk in compressed-tensors format.
44 | SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
45 | model.save_pretrained(SAVE_DIR, save_compressed=True)
46 | processor.save_pretrained(SAVE_DIR)
47 |
--------------------------------------------------------------------------------
/examples/quantizing_moe/deepseek_recipe_w4a16.yaml:
--------------------------------------------------------------------------------
1 | quant_stage:
2 | quant_modifiers:
3 | GPTQModifier:
4 | ignore: [lm_head, "re:.*mlp.gate$"]
5 | config_groups:
6 | group_0:
7 | weights: {num_bits: 4, type: int, symmetric: true, strategy: channel, dynamic: false}
8 | targets: [Linear]
9 |
--------------------------------------------------------------------------------
/examples/trl_mixin/README.md:
--------------------------------------------------------------------------------
1 | # Sparse Finetuning with TRL's SFTTrainer
2 |
3 | The `SessionManagerMixIn` can be added to other Trainer classes that inherit from
4 | [Hugging Face's Trainer](https://huggingface.co/docs/transformers/en/main_classes/trainer).
5 |
6 | For example, we can add LLM Compressor support to TRL's SFTTrainer like so:
7 |
8 | Note: install `trl` using `pip install trl`
9 |
10 | ```python
11 | from trl import SFTTrainer as TRLSFTTrainer
12 |
13 | class SFTTrainer(SessionManagerMixIn, TRLSFTTrainer):
14 | ...
15 | ```
16 |
17 | The new `SFTTrainer` class can now apply LLM Compressor recipes and modifiers during
18 | supervised finetuning, with full support for all of the original TRL features. The full
19 | class is defined in the script `sft_trainer.py` and requires very minimal
20 | additional code: just a dataset load override to support passing in tokenized datasets
21 | to the Trainer.
22 |
23 | ### Examples
24 |
25 | * Script `ex_trl_constant.py`: finetunes a 50% sparse Llama-7b model,
26 | using TRL's dataset preprocessing. Sparsity is maintained throughout training by
27 | applying a `ConstantPruningModifier` recipe to the `SFTTrainer`
28 |
29 | * Script `ex_trl_distillation.py`: finetunes a 50% sparse Llama-7b
30 | model using knowledge distillation from a dense Llama-7b model. Sparsity is maintained
31 | throughout training with a `ConstantPruningModifier`, and layer-wise knowledge
32 | distillation is handled by the `OutputDistillationModifier`.
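33 |
34 | Both examples assume `trl` is installed and a GPU is available; for instance, the
35 | constant pruning example can be launched directly with:
36 |
37 | ```bash
38 | python ex_trl_constant.py
39 | ```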
--------------------------------------------------------------------------------
/examples/trl_mixin/ex_trl_constant.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset
2 | from sft_trainer import SFTTrainer
3 | from transformers import AutoModelForCausalLM, AutoTokenizer
4 | from trl import DataCollatorForCompletionOnlyLM
5 |
6 | from llmcompressor.args import ModelArguments
7 |
8 | model_path = "neuralmagic/Llama-2-7b-pruned50-retrained"
9 | output_dir = "./output_trl_sft_test_7b_gsm8k_sft_data"
10 | model = AutoModelForCausalLM.from_pretrained(
11 | model_path, torch_dtype="auto", device_map="auto"
12 | )
13 | tokenizer = AutoTokenizer.from_pretrained(model_path)
14 | tokenizer.pad_token = tokenizer.eos_token
15 |
16 | # recipe for maintaining model sparsity during finetuning
17 | recipe = """
18 | test_stage:
19 | pruning_modifiers:
20 | ConstantPruningModifier:
21 | targets: ['re:.*q_proj.weight', 're:.*k_proj.weight', 're:.*v_proj.weight',
22 | 're:.*o_proj.weight','re:.*gate_proj.weight', 're:.*up_proj.weight',
23 | 're:.*down_proj.weight']
24 | start: 0
25 | """
26 |
27 | # Load gsm8k using TRL dataset tools
28 | dataset = load_dataset("gsm8k", "main", split="train")
29 |
30 |
31 | def formatting_prompts_func(example):
32 | output_texts = []
33 | for i in range(len(example["question"])):
34 | text = f"Question: {example['question'][i]}\n Answer: {example['answer'][i]}"
35 | output_texts.append(text)
36 | return output_texts
37 |
38 |
39 | response_template = "Answer:"
40 | collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)
41 |
42 | trl_sft_config_args = dict(
43 | output_dir=output_dir,
44 | num_train_epochs=0.6,
45 | logging_steps=50,
46 | gradient_checkpointing=True,
47 | max_seq_length=512,
48 | )
49 | model_args = ModelArguments(model=model)
50 |
51 | trainer = SFTTrainer(
52 | model=model,
53 | processing_class=tokenizer,
54 | recipe=recipe,
55 | train_dataset=dataset,
56 | formatting_func=formatting_prompts_func,
57 | data_collator=collator,
58 | trl_sft_config_args=trl_sft_config_args,
59 | model_args=model_args,
60 | )
61 | trainer.train()
62 |
--------------------------------------------------------------------------------
/examples/trl_mixin/sft_trainer.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, Optional
2 |
3 | from trl import SFTConfig as TRLSFTConfig
4 | from trl import SFTTrainer as TRLSFTTrainer
5 |
6 | from llmcompressor.transformers.finetune.session_mixin import SessionManagerMixIn
7 |
8 | __all__ = ["SFTTrainer"]
9 |
10 |
11 | class SFTTrainer(SessionManagerMixIn, TRLSFTTrainer):
12 | def __init__(self, trl_sft_config_args: Optional[Dict] = None, *args, **kwargs):
13 | if trl_sft_config_args is not None:
14 | kwargs["args"] = TRLSFTConfig(**trl_sft_config_args)
15 | super().__init__(*args, **kwargs)
16 |
17 | def _prepare_dataset(self, dataset, *args, **kwargs):
18 | if "input_ids" in dataset.column_names:
19 | # dataset is already tokenized, skip preprocessing
20 | return dataset
21 |
22 | return super()._prepare_dataset(dataset, *args, **kwargs)
23 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools", "wheel", "setuptools_scm==8.2.0"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [tool.black]
6 | line-length = 88
7 | target-version = ['py38']
8 |
9 | [tool.isort]
10 | profile = "black"
11 | skip = ["src/llmcompressor/transformers/tracing/", "src/llmcompressor/version.py"]
12 |
13 | [tool.mypy]
14 | files = "src/llmcompressor"
15 |
16 | [tool.ruff]
17 | exclude = ["build", "dist", "env", ".venv", "src/llmcompressor/transformers/tracing/"]
18 | lint.select = ["E", "F", "W"]
19 |
20 | [tool.flake8]
21 | max-line-length = 88
22 | extend-ignore = 'E203'
23 |
24 | [tool.pytest.ini_options]
25 | markers = [
26 | "smoke: quick tests to check basic functionality",
27 | "sanity: tests to ensure that new changes do not break existing functionality",
28 | "regression: detailed tests to ensure major functions work correctly",
29 | "integration: tests which integrate with a third party service such as HF",
30 | "unit: tests to ensure code correctness and regression test functionality",
31 | "example: tests for content in the 'examples' folder",
32 | "multi_gpu: tests that require multiple GPUs",
33 | ]
34 | tmp_path_retention_policy = "failed"
35 |
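36 | # the markers above can be selected at the command line, e.g. `pytest tests -m smoke`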
--------------------------------------------------------------------------------
/src/llmcompressor/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | A library for compressing large language models utilizing the latest techniques and
3 | research in the field for both training aware and post training techniques.
4 |
5 | The library is designed to be flexible and easy to use on top of
6 | PyTorch and HuggingFace Transformers, allowing for quick experimentation.
7 | """
8 |
9 | # flake8: noqa
10 |
11 | from .logger import LoggerConfig, configure_logger, logger
12 | from .version import __version__, version
13 |
14 | __all__ = [
15 | "__version__",
16 | "version",
17 | "configure_logger",
18 | "logger",
19 | "LoggerConfig",
20 | ]
21 |
22 | from llmcompressor.core.session_functions import (
23 | active_session,
24 | callbacks,
25 | create_session,
26 | reset_session,
27 | )
28 | from llmcompressor.entrypoints import Oneshot, oneshot, train
29 |
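The imports above define the package's public entrypoints. As a rough usage sketch of that API (the model id and quantization scheme below are illustrative assumptions, not taken from this file):

```python
# Sketch only: model id and scheme are assumptions for illustration.
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# FP8 dynamic quantization needs no calibration data
recipe = QuantizationModifier(targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
oneshot(model=model, recipe=recipe)

model.save_pretrained("TinyLlama-1.1B-Chat-v1.0-FP8-Dynamic")
tokenizer.save_pretrained("TinyLlama-1.1B-Chat-v1.0-FP8-Dynamic")
```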
--------------------------------------------------------------------------------
/src/llmcompressor/args/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 |
3 | from .dataset_arguments import DatasetArguments
4 | from .model_arguments import ModelArguments
5 | from .recipe_arguments import RecipeArguments
6 | from .training_arguments import TrainingArguments
7 | from .utils import parse_args
8 |
--------------------------------------------------------------------------------
/src/llmcompressor/args/recipe_arguments.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass, field
2 | from typing import List, Optional
3 |
4 |
5 | @dataclass
6 | class RecipeArguments:
7 | """Recipe and session variables"""
8 |
9 | recipe: Optional[str] = field(
10 | default=None,
11 | metadata={
12 | "help": "Path to a LLM Compressor sparsification recipe",
13 | },
14 | )
15 | recipe_args: Optional[List[str]] = field(
16 | default=None,
17 | metadata={
18 | "help": (
19 | "List of recipe arguments to evaluate, of the format key1=value1 "
20 | "key2=value2"
21 | )
22 | },
23 | )
24 | clear_sparse_session: Optional[bool] = field(
25 | default=False,
26 | metadata={
27 | "help": (
28 | "Whether to clear CompressionSession/CompressionLifecycle "
29 | "data between runs."
30 | )
31 | },
32 | )
33 | stage: Optional[str] = field(
34 | default=None,
35 | metadata={"help": "The stage of the recipe to use for oneshot / train."},
36 | )
37 |
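For illustration, the dataclass above might be populated like this (the recipe path and the override key are assumptions, not defaults of the library):

```python
# Illustrative values only; the recipe path and override key are assumptions.
from llmcompressor.args import RecipeArguments

args = RecipeArguments(
    recipe="recipe.yaml",                # path to a sparsification/quantization recipe
    recipe_args=["dampening_frac=0.1"],  # key=value overrides evaluated against the recipe
)
```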
--------------------------------------------------------------------------------
/src/llmcompressor/args/training_arguments.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass, field
2 | from typing import Optional
3 |
4 | from transformers import TrainingArguments as HFTrainingArgs
5 |
6 | __all__ = [
7 | "TrainingArguments",
8 | ]
9 |
10 |
11 | @dataclass
12 | class TrainingArguments(HFTrainingArgs):
13 | """
14 | Training arguments specific to the LLM Compressor Transformers workflow, using
15 | HFTrainingArgs as the base class
16 |
17 | """
18 |
19 | do_oneshot: Optional[bool] = field(
20 | default=False,
21 | metadata={"help": "Whether to run one-shot calibration in stages"},
22 | )
23 | run_stages: Optional[bool] = field(
24 | default=False, metadata={"help": "Whether to trigger recipe stage by stage"}
25 | )
26 | output_dir: str = field(
27 | default="./output",
28 | metadata={
29 | "help": "The output directory where the model safetensors, "
30 | "recipe, config, and optionally checkpoints will be written."
31 | },
32 | )
33 |
34 | @property
35 | def place_model_on_device(self):
36 | return False
37 |
--------------------------------------------------------------------------------
/src/llmcompressor/core/__init__.py:
--------------------------------------------------------------------------------
1 | from llmcompressor.core.events import Event, EventType
2 | from llmcompressor.core.lifecycle import CompressionLifecycle
3 | from llmcompressor.core.model_layer import ModelParameterizedLayer
4 | from llmcompressor.core.session import CompressionSession
5 | from llmcompressor.core.session_functions import (
6 | LifecycleCallbacks,
7 | active_session,
8 | callbacks,
9 | create_session,
10 | reset_session,
11 | )
12 | from llmcompressor.core.state import Data, Hardware, ModifiedState, State
13 |
14 | __all__ = [
15 | "Event",
16 | "EventType",
17 | "State",
18 | "Data",
19 | "Hardware",
20 | "ModifiedState",
21 | "ModelParameterizedLayer",
22 | "CompressionLifecycle",
23 | "CompressionSession",
24 | "create_session",
25 | "active_session",
26 | "reset_session",
27 | "apply",
28 | "callbacks",
29 | "LifecycleCallbacks",
30 | ]
31 |
--------------------------------------------------------------------------------
/src/llmcompressor/core/events/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | LLM Compressor Core Events Package
3 |
4 | This package provides the core components and lifecycle management for events
5 | used in the LLM Compressor framework. It includes definitions for various
6 | event types and lifecycles that are critical for managing the state and
7 | execution flow of the model compression and training processes.
8 | """
9 |
10 | from .event import Event, EventType
11 |
12 | __all__ = ["Event", "EventType"]
13 |
--------------------------------------------------------------------------------
/src/llmcompressor/core/model_layer.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from typing import Any
3 |
4 | __all__ = ["ModelParameterizedLayer"]
5 |
6 |
7 | @dataclass
8 | class ModelParameterizedLayer:
9 | """
10 | A dataclass for holding a parameter and its layer
11 |
12 | :param layer_name: the name of the layer
13 | :param layer: the layer object
14 | :param param_name: the name of the parameter
15 | :param param: the parameter object
16 | """
17 |
18 | layer_name: str
19 | layer: Any
20 | param_name: str
21 | param: Any
22 |
--------------------------------------------------------------------------------
/src/llmcompressor/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 |
3 | from .utils import (
4 | format_calibration_data,
5 | get_calibration_dataloader,
6 | get_processed_dataset,
7 | make_dataset_splits,
8 | )
9 |
--------------------------------------------------------------------------------
/src/llmcompressor/entrypoints/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | from .oneshot import Oneshot, oneshot
3 | from .train import train
4 | from .utils import post_process, pre_process
5 |
--------------------------------------------------------------------------------
/src/llmcompressor/metrics/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 |
3 | from .logger import *
4 |
--------------------------------------------------------------------------------
/src/llmcompressor/metrics/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 |
3 | from .frequency_manager import *
4 |
--------------------------------------------------------------------------------
/src/llmcompressor/modifiers/__init__.py:
--------------------------------------------------------------------------------
1 | from .factory import ModifierFactory
2 | from .interface import ModifierInterface
3 | from .modifier import Modifier
4 | from .stage import StageModifiers
5 |
6 | __all__ = [
7 | "ModifierFactory",
8 | "ModifierInterface",
9 | "Modifier",
10 | "StageModifiers",
11 | ]
12 |
--------------------------------------------------------------------------------
/src/llmcompressor/modifiers/awq/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 |
3 | from .base import *
4 | from .mappings import *
5 |
--------------------------------------------------------------------------------
/src/llmcompressor/modifiers/distillation/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 |
3 | from .output import *
4 |
--------------------------------------------------------------------------------
/src/llmcompressor/modifiers/distillation/output/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 |
3 | from .base import *
4 |
--------------------------------------------------------------------------------
/src/llmcompressor/modifiers/distillation/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/src/llmcompressor/modifiers/distillation/utils/__init__.py
--------------------------------------------------------------------------------
/src/llmcompressor/modifiers/distillation/utils/pytorch/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 |
3 | from .kd_factory import *
4 | from .kd_wrapper import *
5 | from .model_wrapper import *
6 |
--------------------------------------------------------------------------------
/src/llmcompressor/modifiers/experimental/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/src/llmcompressor/modifiers/experimental/__init__.py
--------------------------------------------------------------------------------
/src/llmcompressor/modifiers/interface.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 |
3 | from llmcompressor.core.events import Event
4 | from llmcompressor.core.state import State
5 |
6 | __all__ = ["ModifierInterface"]
7 |
8 |
9 | class ModifierInterface(ABC):
10 | """
11 | Defines the contract that all modifiers must implement
12 | """
13 |
14 | @property
15 | @abstractmethod
16 | def initialized(self) -> bool:
17 | """
18 | :return: True if the modifier has been initialized
19 | """
20 | raise NotImplementedError()
21 |
22 | @property
23 | @abstractmethod
24 | def finalized(self) -> bool:
25 | """
26 | :return: True if the modifier has been finalized
27 | """
28 | raise NotImplementedError()
29 |
30 | @abstractmethod
31 | def initialize(self, state: State, **kwargs):
32 | """
33 | Initialize the modifier
34 |
35 | :param state: The current state of the model
36 | :param kwargs: Additional keyword arguments
37 | for modifier initialization
38 | """
39 | raise NotImplementedError()
40 |
41 | @abstractmethod
42 | def finalize(self, state: State, **kwargs):
43 | """
44 | Finalize the modifier
45 |
46 | :param state: The current state of the model
47 | :param kwargs: Additional keyword arguments for
48 | modifier finalization
49 | """
50 | raise NotImplementedError()
51 |
52 | @abstractmethod
53 | def update_event(self, state: State, event: Event, **kwargs):
54 | """
55 | Update the modifier based on the event
56 |
57 | :param state: The current state of the model
58 | :param event: The event to update the modifier with
59 | :param kwargs: Additional keyword arguments for
60 | modifier update
61 | """
62 | raise NotImplementedError()
63 |
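For concreteness, a minimal no-op modifier satisfying this contract might look as follows (purely illustrative, not part of the library):

```python
# Purely illustrative no-op modifier; not part of the library.
from llmcompressor.core.events import Event
from llmcompressor.core.state import State
from llmcompressor.modifiers.interface import ModifierInterface


class NoOpModifier(ModifierInterface):
    def __init__(self):
        self._initialized = False
        self._finalized = False

    @property
    def initialized(self) -> bool:
        return self._initialized

    @property
    def finalized(self) -> bool:
        return self._finalized

    def initialize(self, state: State, **kwargs):
        # record that setup ran; a real modifier would hook into the model here
        self._initialized = True

    def finalize(self, state: State, **kwargs):
        # clean up hooks/state; nothing to do for a no-op
        self._finalized = True

    def update_event(self, state: State, event: Event, **kwargs):
        # react to lifecycle events (e.g. calibration epoch start/end)
        pass
```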
--------------------------------------------------------------------------------
/src/llmcompressor/modifiers/logarithmic_equalization/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 |
3 | from .base import *
4 |
--------------------------------------------------------------------------------
/src/llmcompressor/modifiers/obcq/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 |
3 | from .base import *
4 |
--------------------------------------------------------------------------------
/src/llmcompressor/modifiers/pruning/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 |
3 | from .constant import *
4 | from .magnitude import *
5 | from .wanda import *
6 |
--------------------------------------------------------------------------------
/src/llmcompressor/modifiers/pruning/constant/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 |
3 | from .base import ConstantPruningModifier
4 |
--------------------------------------------------------------------------------
/src/llmcompressor/modifiers/pruning/magnitude/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 |
3 | from .base import MagnitudePruningModifier
4 |
--------------------------------------------------------------------------------
/src/llmcompressor/modifiers/pruning/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/src/llmcompressor/modifiers/pruning/utils/__init__.py
--------------------------------------------------------------------------------
/src/llmcompressor/modifiers/pruning/utils/pytorch/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 |
3 | from .layer_mask import *
4 | from .mask_factory import *
5 |
--------------------------------------------------------------------------------
/src/llmcompressor/modifiers/pruning/wanda/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 |
3 | from .base import *
4 |
--------------------------------------------------------------------------------
/src/llmcompressor/modifiers/quantization/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 |
3 | from .cache import *
4 | from .gptq import *
5 | from .quantization import *
6 |
--------------------------------------------------------------------------------
/src/llmcompressor/modifiers/quantization/gptq/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 |
3 | from .base import *
4 |
--------------------------------------------------------------------------------
/src/llmcompressor/modifiers/quantization/quantization/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 |
3 | from .base import *
4 | from .mixin import *
5 |
--------------------------------------------------------------------------------
/src/llmcompressor/modifiers/smoothquant/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 |
3 | from .base import *
4 |
--------------------------------------------------------------------------------
/src/llmcompressor/modifiers/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 |
3 | from .constants import *
4 | from .helpers import *
5 |
--------------------------------------------------------------------------------
/src/llmcompressor/modifiers/utils/constants.py:
--------------------------------------------------------------------------------
1 | __all__ = ["SPARSITY_THRESHOLD"]
2 |
3 | SPARSITY_THRESHOLD: float = 0.05
4 |
--------------------------------------------------------------------------------
/src/llmcompressor/modifiers/utils/pytorch_helpers.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 |
3 | import torch
4 | from torch.nn import Module
5 |
6 | __all__ = [
7 | "apply_pad_mask_to_batch",
8 | "is_moe_model",
9 | ]
10 |
11 |
12 | def apply_pad_mask_to_batch(batch: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
13 | """
14 | Apply a mask to the input ids of a batch. This is used to zero out
15 | padding tokens so they do not contribute to the hessian calculation in the
16 | GPTQ and SparseGPT algorithms
17 |
18 | Assumes that `attention_mask` only contains zeros and ones
19 |
20 | :param batch: batch to apply padding to if it exists
21 | :return: batch with padding zeroed out in the input_ids
22 | """
23 | if "attention_mask" in batch:
24 | for key in ("input_ids", "decoder_input_ids"):
25 | if key in batch:
26 | batch[key] = batch[key] * batch["attention_mask"]
27 |
28 | return batch
29 |
30 |
31 | def is_moe_model(model: Module) -> bool:
32 | """
33 | Check if the model is a mixture of experts model
34 |
35 | :param model: the model to check
36 | :return: True if the model is a mixture of experts model
37 | """
38 |
39 | # Check for MoE components
40 | for _, module in model.named_modules():
41 | module_name = module.__class__.__name__
42 | if "MoE" in module_name or "Expert" in module_name:
43 | return True
44 |
45 | # Check config for MoE attributes
46 | if hasattr(model, "config"):
47 | if any(
48 | "moe" in attr.lower() or "expert" in attr.lower()
49 | for attr in dir(model.config)
50 | ):
51 | return True
52 |
53 | return False
54 |
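A small worked example of the padding-mask behaviour (the token ids are made up; `2` plays the role of a pad token id):

```python
# Toy batch; token ids are made up for illustration (2 plays the role of a pad id).
import torch

from llmcompressor.modifiers.utils.pytorch_helpers import apply_pad_mask_to_batch

batch = {
    "input_ids": torch.tensor([[101, 7592, 102, 2, 2]]),
    "attention_mask": torch.tensor([[1, 1, 1, 0, 0]]),
}
masked = apply_pad_mask_to_batch(batch)

# padded positions are zeroed so they do not contribute to the Hessian
assert masked["input_ids"].tolist() == [[101, 7592, 102, 0, 0]]
```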
--------------------------------------------------------------------------------
/src/llmcompressor/observers/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | # isort: skip_file
3 |
4 | from .helpers import *
5 | from .base import *
6 | from .min_max import *
7 | from .mse import *
8 |
--------------------------------------------------------------------------------
/src/llmcompressor/observers/helpers.py:
--------------------------------------------------------------------------------
1 | from collections import Counter
2 |
3 | import torch
4 |
5 | __all__ = ["get_observer_token_count"]
6 |
7 |
8 | def get_observer_token_count(module: torch.nn.Module) -> Counter:
9 | """
10 | Parse the module and return the number of tokens observed by
11 | each module's observer.
12 |
13 | :param module: module to parse
14 | :return: counter with the number of tokens observed by each observer
15 | """
16 | token_counts = Counter()
17 | for name, submodule in module.named_modules():
18 | if name.endswith(".input_observer"):
19 | token_counts[name.replace(".input_observer", "")] = (
20 | submodule._num_observed_tokens
21 | )
22 | return token_counts
23 |
--------------------------------------------------------------------------------
/src/llmcompressor/pipelines/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | # populate registry
3 | from .basic import *
4 | from .data_free import *
5 | from .independent import *
6 | from .layer_sequential import *
7 | from .registry import *
8 | from .sequential import *
9 |
--------------------------------------------------------------------------------
/src/llmcompressor/pipelines/basic/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | from .pipeline import *
3 |
--------------------------------------------------------------------------------
/src/llmcompressor/pipelines/basic/pipeline.py:
--------------------------------------------------------------------------------
1 | from typing import TYPE_CHECKING, Union
2 |
3 | import torch
4 | import tqdm
5 | from compressed_tensors.utils import get_execution_device
6 | from torch.utils.data.dataloader import DataLoader
7 |
8 | from llmcompressor.core import LifecycleCallbacks
9 | from llmcompressor.modifiers.utils.pytorch_helpers import apply_pad_mask_to_batch
10 | from llmcompressor.pipelines.registry import CalibrationPipeline
11 | from llmcompressor.pytorch.utils.helpers import tensors_to_device
12 | from llmcompressor.utils.helpers import calibration_forward_context
13 |
14 | if TYPE_CHECKING:
15 | from llmcompressor.args.dataset_arguments import DatasetArguments
16 |
17 | __all__ = ["BasicPipeline", "run_calibration"]
18 |
19 |
20 | @CalibrationPipeline.register("basic")
21 | class BasicPipeline(CalibrationPipeline):
22 | @staticmethod
23 | def __call__(
24 | model: torch.nn.Module,
25 | dataloader: DataLoader,
26 | dataset_args: Union["DatasetArguments", None],
27 | ):
28 | """
29 | Run a basic data pipeline.
30 |
31 | Batches are fetched from the data loader and are used to perform forward passes
32 | through the model. This pipeline is typically used for basic model calibration
33 | and, unlike the sequential pipelines, does not propagate compression error when
34 | used to calibrate model compression
35 |
36 | :param model: model being calibrated
37 | :param dataloader: loads data for calibration
38 | :param dataset_args: dataset arguments relevant to pipelines
39 | """
40 | model_device = get_execution_device(model)
41 |
42 | LifecycleCallbacks.calibration_epoch_start()
43 |
44 | with calibration_forward_context(model):
45 | for batch in tqdm.tqdm(dataloader, desc="Calibrating"):
46 | batch = apply_pad_mask_to_batch(batch)
47 | batch = tensors_to_device(batch, model_device)
48 | model(**batch)
49 |
50 | LifecycleCallbacks.calibration_epoch_end()
51 |
52 |
53 | def run_calibration(model: torch.nn.Module, dataloader: DataLoader):
54 | pipeline = BasicPipeline()
55 | pipeline(model, dataloader, None)
56 |
--------------------------------------------------------------------------------
/src/llmcompressor/pipelines/data_free/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | from .pipeline import *
3 |
--------------------------------------------------------------------------------
/src/llmcompressor/pipelines/data_free/pipeline.py:
--------------------------------------------------------------------------------
1 | from typing import TYPE_CHECKING, Optional
2 |
3 | import torch
4 | from torch.utils.data.dataloader import DataLoader
5 |
6 | from llmcompressor.core.session_functions import LifecycleCallbacks
7 | from llmcompressor.pipelines.registry import CalibrationPipeline
8 |
9 | if TYPE_CHECKING:
10 | from llmcompressor.args.dataset_arguments import DatasetArguments
11 |
12 | __all__ = ["DataFreePipeline"]
13 |
14 |
15 | @CalibrationPipeline.register("datafree")
16 | class DataFreePipeline(CalibrationPipeline):
17 | @staticmethod
18 | def __call__(
19 | model: torch.nn.Module,
20 | dataloader: Optional[DataLoader],
21 | dataset_args: "DatasetArguments",
22 | ):
23 | """
24 | A pipeline for data-free calibration
25 |
26 | :param model: model being calibrated
27 | :param dataloader: loads data for calibration
28 | :param dataset_args: dataset arguments relevant to pipelines
29 | """
30 | LifecycleCallbacks.calibration_epoch_start()
31 | LifecycleCallbacks.calibration_epoch_end()
32 |
--------------------------------------------------------------------------------
/src/llmcompressor/pipelines/independent/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | from .pipeline import *
3 |
--------------------------------------------------------------------------------
/src/llmcompressor/pipelines/independent/pipeline.py:
--------------------------------------------------------------------------------
1 | from typing import TYPE_CHECKING
2 |
3 | import torch
4 | from loguru import logger
5 | from torch.utils.data.dataloader import DataLoader
6 |
7 | from llmcompressor.core import active_session
8 | from llmcompressor.modifiers.stage import StageModifiers
9 | from llmcompressor.pipelines.registry import CalibrationPipeline
10 | from llmcompressor.utils.helpers import patch_attr
11 |
12 | if TYPE_CHECKING:
13 | from llmcompressor.args.dataset_arguments import DatasetArguments
14 |
15 | __all__ = ["IndependentPipeline"]
16 |
17 |
18 | @CalibrationPipeline.register("independent")
19 | class IndependentPipeline(CalibrationPipeline):
20 | @staticmethod
21 | def __call__(
22 | model: torch.nn.Module,
23 | dataloader: DataLoader,
24 | dataset_args: "DatasetArguments",
25 | ):
26 | """
27 | Data pipeline where each modifier is assigned its own calibration epoch and data
28 | pipeline
29 |
30 | :param model: model being calibrated
31 | :param dataloader: loads data for calibration
32 | :param dataset_args: dataset arguments relevant to pipelines
33 | """
34 | _logger = logger.patch(lambda r: r.update(function="IndependentPipeline"))
35 |
36 | session = active_session()
37 | modifiers = session.get_modifiers()
38 | with patch_attr(session.lifecycle, "modifiers", None):
39 | for index, modifier in enumerate(modifiers):
40 | mod_type = str(type(modifier).__name__)
41 | session.lifecycle.modifiers = [
42 | StageModifiers(modifiers=[modifier], group=mod_type, index=index)
43 | ]
44 |
45 | pipeline = CalibrationPipeline.from_modifiers([modifier])
46 | pipeline_name = pipeline.__class__.__name__
47 | _logger.info(f"Inferred `{pipeline_name}` for `{mod_type}`")
48 |
49 | pipeline(model, dataloader, dataset_args)
50 |
51 | # restore modifiers on exit so model can be compressed based on recipe
52 |
--------------------------------------------------------------------------------
/src/llmcompressor/pipelines/layer_sequential/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | from .pipeline import *
3 |
--------------------------------------------------------------------------------
/src/llmcompressor/pipelines/sequential/README.md:
--------------------------------------------------------------------------------
1 | # Sequential Pipeline #
2 | The sequential pipeline is a data pipeline, primarily used for compressing models with the
3 | [GPTQModifier](/src/llmcompressor/modifiers/quantization/gptq/base.py) or the
4 | [SparseGPTModifier](/src/llmcompressor/modifiers/obcq/base.py).
5 |
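As a rough sketch (the model id, dataset, and sample counts below are illustrative, not prescriptive), a GPTQ run that exercises this pipeline looks like:

```python
# Sketch only: model id, dataset, and sample counts are illustrative assumptions.
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier

recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])

oneshot(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    dataset="open_platypus",
    recipe=recipe,
    max_seq_length=2048,
    num_calibration_samples=512,
)
```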
--------------------------------------------------------------------------------
/src/llmcompressor/pipelines/sequential/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | from .helpers import get_targets_from_modifiers
3 | from .pipeline import *
4 |
--------------------------------------------------------------------------------
/src/llmcompressor/pytorch/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Functionality for working with and sparsifying Models in the PyTorch framework
3 | """
4 |
5 | import os
6 | import warnings
7 |
8 | from packaging import version
9 |
10 | try:
11 | import torch
12 |
13 | _PARSED_TORCH_VERSION = version.parse(torch.__version__)
14 |
15 | if _PARSED_TORCH_VERSION.major >= 2:
16 | torch_compile_func = torch.compile
17 |
18 | def raise_torch_compile_warning(*args, **kwargs):
19 | warnings.warn(
20 | "torch.compile is not supported by llmcompressor for torch 2.0.x"
21 | )
22 | return torch_compile_func(*args, **kwargs)
23 |
24 | torch.compile = raise_torch_compile_warning
25 |
26 | _BYPASS = bool(int(os.environ.get("NM_BYPASS_TORCH_VERSION", "0")))
27 | if _PARSED_TORCH_VERSION.major == 1 and _PARSED_TORCH_VERSION.minor in [10, 11]:
28 | if not _BYPASS:
29 | raise RuntimeError(
30 | "llmcompressor does not support torch==1.10.* or 1.11.*. "
31 | f"Found torch version {torch.__version__}.\n\n"
32 | "To bypass this error, set environment variable "
33 | "`NM_BYPASS_TORCH_VERSION` to '1'.\n\n"
34 | "Bypassing may result in errors or "
35 | "incorrect behavior, so set at your own risk."
36 | )
37 | else:
38 | warnings.warn(
39 | "llmcompressor quantized onnx export does not work "
40 | "with torch==1.10.* or 1.11.*"
41 | )
42 | except ImportError:
43 | pass
44 |
45 | # flake8: noqa
46 |
--------------------------------------------------------------------------------
/src/llmcompressor/pytorch/model_load/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/src/llmcompressor/pytorch/model_load/__init__.py
--------------------------------------------------------------------------------
/src/llmcompressor/pytorch/utils/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Generic code used as utilities and helpers for PyTorch
3 | """
4 |
5 | # flake8: noqa
6 |
7 | from .helpers import *
8 | from .sparsification import *
9 |
--------------------------------------------------------------------------------
/src/llmcompressor/pytorch/utils/sparsification_info/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/src/llmcompressor/pytorch/utils/sparsification_info/__init__.py
--------------------------------------------------------------------------------
/src/llmcompressor/recipe/__init__.py:
--------------------------------------------------------------------------------
1 | from .base import RecipeBase
2 | from .metadata import DatasetMetaData, LayerMetaData, ModelMetaData, ParamMetaData
3 | from .modifier import RecipeModifier
4 | from .recipe import Recipe, RecipeArgsInput, RecipeInput, RecipeStageInput
5 | from .stage import RecipeStage
6 |
7 | __all__ = [
8 | "DatasetMetaData",
9 | "ParamMetaData",
10 | "LayerMetaData",
11 | "ModelMetaData",
12 | "RecipeBase",
13 | "RecipeModifier",
14 | "RecipeStage",
15 | "Recipe",
16 | "RecipeInput",
17 | "RecipeStageInput",
18 | "RecipeArgsInput",
19 | ]
20 |
--------------------------------------------------------------------------------
/src/llmcompressor/recipe/base.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from typing import Any
3 |
4 | from pydantic import BaseModel, ConfigDict
5 |
6 | __all__ = ["RecipeBase"]
7 |
8 |
9 | class RecipeBase(BaseModel, ABC):
10 | """
11 | Defines the contract that `Recipe` and its components
12 | such as `RecipeModifier` and `RecipeStage` must follow.
13 |
14 | All inheritors of this class must implement the following methods:
15 | - calculate_start
16 | - calculate_end
17 | - evaluate
18 | - create_modifier
19 | """
20 |
21 | model_config = ConfigDict(arbitrary_types_allowed=True)
22 |
23 | @abstractmethod
24 | def create_modifier(self) -> Any:
25 | raise NotImplementedError()
26 |
--------------------------------------------------------------------------------
/src/llmcompressor/recipe/metadata.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Dict, List, Optional
2 |
3 | from pydantic import BaseModel, Field
4 |
5 | __all__ = [
6 | "DatasetMetaData",
7 | "ParamMetaData",
8 | "LayerMetaData",
9 | "ModelMetaData",
10 | ]
11 |
12 |
13 | class DatasetMetaData(BaseModel):
14 | name: Optional[str] = None
15 | version: Optional[str] = None
16 | hash: Optional[str] = None
17 | shape: List[int] = Field(default_factory=list)
18 | num_classes: Optional[int] = None
19 | num_train_samples: Optional[int] = None
20 | num_val_samples: Optional[int] = None
21 | num_test_samples: Optional[int] = None
22 |
23 |
24 | class ParamMetaData(BaseModel):
25 | name: Optional[str] = None
26 | shape: Optional[List[int]] = None
27 | weight_hash: Optional[str] = None
28 |
29 |
30 | class LayerMetaData(BaseModel):
31 | name: Optional[str] = None
32 | type: Optional[str] = None
33 | index: Optional[int] = None
34 | attributes: Optional[Dict[str, Any]] = None
35 | input_shapes: Optional[List[List[int]]] = None
36 | output_shapes: Optional[List[List[int]]] = None
37 | params: Optional[Dict[str, ParamMetaData]] = None
38 |
39 |
40 | class ModelMetaData(BaseModel):
41 | architecture: Optional[str] = None
42 | sub_architecture: Optional[str] = None
43 | input_shapes: Optional[List[List[int]]] = None
44 | output_shapes: Optional[List[List[int]]] = None
45 | layers: List[LayerMetaData] = Field(default_factory=list)
46 | layer_prefix: Optional[str] = None
47 |
--------------------------------------------------------------------------------
/src/llmcompressor/sentinel.py:
--------------------------------------------------------------------------------
1 | import inspect
2 |
3 | from pydantic_core import core_schema
4 |
5 | _registry = {}
6 |
7 |
8 | class Sentinel:
9 | """
10 | Unique sentinel values. Implements https://peps.python.org/pep-0661/
11 | with dummy pydantic validation
12 | """
13 |
14 | def __new__(cls, name, module_name=None):
15 | name = str(name)
16 |
17 | if module_name is None:
18 | module_name = inspect.currentframe().f_back.f_globals.get("__name__")
19 | if module_name is None:
20 | module_name = __name__
21 |
22 | registry_key = f"{module_name}-{name}"
23 |
24 | sentinel = _registry.get(registry_key, None)
25 | if sentinel is not None:
26 | return sentinel
27 |
28 | sentinel = super().__new__(cls)
29 | sentinel._name = name
30 | sentinel._module_name = module_name
31 |
32 | return _registry.setdefault(registry_key, sentinel)
33 |
34 | def __repr__(self):
35 | return self._name
36 |
37 | def __reduce__(self):
38 | return (
39 | self.__class__,
40 | (
41 | self._name,
42 | self._module_name,
43 | ),
44 | )
45 |
46 | @classmethod
47 | def __get_pydantic_core_schema__(cls, _source_type, _handler):
48 | return core_schema.no_info_plain_validator_function(cls.validate)
49 |
50 | @classmethod
51 | def validate(cls, value: "Sentinel") -> "Sentinel":
52 | return value
53 |
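A brief usage sketch of the sentinel registry (the sentinel name here is arbitrary):

```python
# Illustrative usage; the sentinel name is arbitrary.
from llmcompressor.sentinel import Sentinel

MISSING = Sentinel("MISSING")

# the registry returns the identical object for the same name within a module
assert Sentinel("MISSING") is MISSING


def resolve(value=MISSING):
    # an identity check distinguishes "not provided" from a legitimate None
    return "default" if value is MISSING else value


assert resolve() == "default"
assert resolve(None) is None
```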
--------------------------------------------------------------------------------
/src/llmcompressor/transformers/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Tools for integrating LLM Compressor with transformers training flows
3 | """
4 |
5 | # flake8: noqa
6 |
7 | # isort: skip_file
8 | # (import order matters for circular import avoidance)
9 | from .utils import *
10 |
11 | from .sparsification import (
12 | SparseAutoModelForCausalLM,
13 | )
14 | from .finetune import *
15 |
--------------------------------------------------------------------------------
/src/llmcompressor/transformers/compression/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/src/llmcompressor/transformers/compression/__init__.py
--------------------------------------------------------------------------------
/src/llmcompressor/transformers/finetune/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 |
3 | from .data import TextGenerationDataset
4 | from .session_mixin import SessionManagerMixIn
5 | from .text_generation import apply, oneshot, train
6 |
--------------------------------------------------------------------------------
/src/llmcompressor/transformers/finetune/data/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 |
3 | from .base import TextGenerationDataset
4 | from .c4 import C4Dataset
5 | from .cnn_dailymail import CNNDailyMailDataset
6 | from .custom import CustomDataset
7 | from .evolcodealpaca import EvolCodeAlpacaDataset
8 | from .flickr_30k import Flickr30K
9 | from .gsm8k import GSM8KDataset
10 | from .open_platypus import OpenPlatypusDataset
11 | from .peoples_speech import PeoplesSpeech
12 | from .ptb import PtbDataset
13 | from .ultrachat_200k import UltraChatDataset
14 | from .wikitext import WikiTextDataset
15 |
--------------------------------------------------------------------------------
/src/llmcompressor/transformers/finetune/data/c4.py:
--------------------------------------------------------------------------------
1 | from copy import deepcopy
2 | from typing import TYPE_CHECKING
3 |
4 | from llmcompressor.transformers.finetune.data import TextGenerationDataset
5 | from llmcompressor.typing import Processor
6 |
7 | if TYPE_CHECKING:
8 | from llmcompressor.args import DatasetArguments
9 |
10 |
11 | @TextGenerationDataset.register(name="c4")
12 | class C4Dataset(TextGenerationDataset):
13 | """
14 | Child text generation class for the C4 dataset
15 |
16 | :param dataset_args: configuration settings for dataset loading
17 | :param split: split from dataset to load, for instance `test` or `train[:5%]`
18 | :param processor: processor or tokenizer to use on dataset
19 | """
20 |
21 | def __init__(
22 | self, dataset_args: "DatasetArguments", split: str, processor: Processor
23 | ):
24 | dataset_args = deepcopy(dataset_args)
25 | dataset_args.dataset = "allenai/c4"
26 | dataset_args.text_column = "text"
27 |
28 | super().__init__(dataset_args=dataset_args, split=split, processor=processor)
29 |
--------------------------------------------------------------------------------
/src/llmcompressor/transformers/finetune/data/cnn_dailymail.py:
--------------------------------------------------------------------------------
1 | from copy import deepcopy
2 | from typing import TYPE_CHECKING
3 |
4 | from llmcompressor.transformers.finetune.data import TextGenerationDataset
5 | from llmcompressor.typing import Processor
6 |
7 | if TYPE_CHECKING:
8 | from llmcompressor.args import DatasetArguments
9 |
10 |
11 | @TextGenerationDataset.register(name="cnn_dailymail")
12 | class CNNDailyMailDataset(TextGenerationDataset):
13 | """
14 | Text generation class for the CNN/DailyMail dataset
15 |
16 | :param dataset_args: configuration settings for dataset loading
17 | :param split: split from dataset to load, for instance `test` or `train[:5%]`
18 | :param processor: processor or tokenizer to use on dataset
19 | """
20 |
21 | SAMPLE_TEMPLATE = "Article:\n{article}\n\n### Summarization:\n{highlights}\n"
22 |
23 | def __init__(
24 | self, dataset_args: "DatasetArguments", split: str, processor: Processor
25 | ):
26 | dataset_args = deepcopy(dataset_args)
27 | dataset_args.dataset = "cnn_dailymail"
28 | dataset_args.dataset_config_name = "3.0.0"
29 |
30 | super().__init__(dataset_args=dataset_args, split=split, processor=processor)
31 |
32 | def dataset_template(self, sample):
33 | return {
34 | "text": self.SAMPLE_TEMPLATE.format(
35 | article=sample["article"], highlights=sample["highlights"]
36 | )
37 | }
38 |
--------------------------------------------------------------------------------
/src/llmcompressor/transformers/finetune/data/custom.py:
--------------------------------------------------------------------------------
1 | from llmcompressor.transformers.finetune.data import TextGenerationDataset
2 |
3 |
4 | @TextGenerationDataset.register(name="custom", alias=["json", "csv"])
5 | class CustomDataset(TextGenerationDataset):
6 | """
7 | Child text generation class for custom local datasets, supporting loading
8 | from CSV and JSON files
9 |
10 | :param dataset_args: configuration settings for dataset loading
11 | :param split: split from dataset to load, for instance `test` or `train[:5%]`
12 | Can also be set to None to load all the splits
13 | :param processor: processor or tokenizer to use on dataset
14 |
15 | """
16 |
17 | pass
18 |
--------------------------------------------------------------------------------
/src/llmcompressor/transformers/finetune/data/evolcodealpaca.py:
--------------------------------------------------------------------------------
1 | from copy import deepcopy
2 | from typing import TYPE_CHECKING
3 |
4 | from llmcompressor.transformers.finetune.data import TextGenerationDataset
5 | from llmcompressor.typing import Processor
6 |
7 | if TYPE_CHECKING:
8 | from llmcompressor.args import DatasetArguments
9 |
10 |
11 | @TextGenerationDataset.register(name="evolcodealpaca")
12 | class EvolCodeAlpacaDataset(TextGenerationDataset):
13 | """
14 | Child text generation class for the Evol Code Alpaca dataset
15 |
16 | :param dataset_args: configuration settings for dataset loading
17 | :param split: split from dataset to load, for instance `test` or `train[:5%]`
18 | :param processor: processor or tokenizer to use on dataset
19 | """
20 |
21 | EVOL_ALPACA_TEMPLATE = (
22 | "Below is an instruction that describes a "
23 | "programming task. Write a program that appropriately "
24 | "completes the request.\n\n### Instruction:\n{instruction}"
25 | "\n\n### Response:\n"
26 | )
27 |
28 | def __init__(
29 | self, dataset_args: "DatasetArguments", split: str, processor: Processor
30 | ):
31 | dataset_args = deepcopy(dataset_args)
32 | dataset_args.dataset = "theblackcat102/evol-codealpaca-v1"
33 | dataset_args.text_column = "text"
34 |
35 | super().__init__(dataset_args, split=split, processor=processor)
36 |
37 | def dataset_template(self, sample):
38 | prompt = self.EVOL_ALPACA_TEMPLATE.format(instruction=sample["instruction"])
39 | text = prompt
40 | if "output" in text:
41 | text += sample["output"]
42 |
43 | return {
44 | "text": text,
45 | self.PROMPT_KEY: prompt,
46 | }
47 |
--------------------------------------------------------------------------------
/src/llmcompressor/transformers/finetune/data/gsm8k.py:
--------------------------------------------------------------------------------
1 | from copy import deepcopy
2 | from typing import TYPE_CHECKING
3 |
4 | from llmcompressor.transformers.finetune.data import TextGenerationDataset
5 | from llmcompressor.typing import Processor
6 |
7 | if TYPE_CHECKING:
8 | from llmcompressor.args import DatasetArguments
9 |
10 |
11 | @TextGenerationDataset.register(name="gsm8k")
12 | class GSM8KDataset(TextGenerationDataset):
13 | """
14 | Child text generation class for the Grade School Math 8k dataset
15 |
16 | :param dataset_args: configuration settings for dataset loading
17 | :param split: split from dataset to load, for instance `test` or `train[:5%]`
18 | :param processor: processor or tokenizer to use on dataset
19 | """
20 |
21 | GSM_TEMPLATE = "Question: {question}\nAnswer:"
22 |
23 | def __init__(
24 | self, dataset_args: "DatasetArguments", split: str, processor: Processor
25 | ):
26 | dataset_args = deepcopy(dataset_args)
27 | dataset_args.dataset = "gsm8k"
28 | dataset_args.text_column = "text"
29 |
30 | super().__init__(dataset_args=dataset_args, split=split, processor=processor)
31 |
32 | def dataset_template(self, sample):
33 | prompt = self.GSM_TEMPLATE.format(question=sample["question"])
34 | text = prompt
35 | if "answer" in sample:
36 | text += " " + sample["answer"]
37 |
38 | return {
39 | "text": text,
40 | self.PROMPT_KEY: prompt,
41 | }
42 |
--------------------------------------------------------------------------------
/src/llmcompressor/transformers/finetune/data/ptb.py:
--------------------------------------------------------------------------------
1 | from copy import deepcopy
2 | from typing import TYPE_CHECKING
3 |
4 | from llmcompressor.transformers.finetune.data import TextGenerationDataset
5 | from llmcompressor.typing import Processor
6 |
7 | if TYPE_CHECKING:
8 | from llmcompressor.args import DatasetArguments
9 |
10 |
11 | @TextGenerationDataset.register(name="ptb")
12 | class PtbDataset(TextGenerationDataset):
13 | """
14 | Child text generation class for the PTB dataset
15 |
16 | :param dataset_args: configuration settings for dataset loading
17 | :param split: split from dataset to load, for instance `test` or `train[:5%]`
18 | :param processor: processor or tokenizer to use on dataset
19 | """
20 |
21 | def __init__(
22 | self, dataset_args: "DatasetArguments", split: str, processor: Processor
23 | ):
24 | dataset_args = deepcopy(dataset_args)
25 | dataset_args.dataset = "ptb_text_only"
26 | dataset_args.text_column = "sentence"
27 |
28 | super().__init__(
29 | dataset_args=dataset_args,
30 | split=split,
31 | processor=processor,
32 | )
33 |
--------------------------------------------------------------------------------
/src/llmcompressor/transformers/finetune/data/wikitext.py:
--------------------------------------------------------------------------------
1 | from copy import deepcopy
2 | from typing import TYPE_CHECKING
3 |
4 | from llmcompressor.transformers.finetune.data import TextGenerationDataset
5 | from llmcompressor.typing import Processor
6 |
7 | if TYPE_CHECKING:
8 | from llmcompressor.args import DatasetArguments
9 |
10 |
11 | @TextGenerationDataset.register(name="wikitext")
12 | class WikiTextDataset(TextGenerationDataset):
13 | """
14 | Child text generation class for the WikiText dataset
15 |
16 | :param dataset_args: configuration settings for dataset loading
17 | :param split: split from dataset to load, for instance `test` or `train[:5%]`
18 | :param processor: processor or tokenizer to use on dataset
19 | """
20 |
21 | def __init__(
22 | self, dataset_args: "DatasetArguments", split: str, processor: Processor
23 | ):
24 | dataset_args = deepcopy(dataset_args)
25 | dataset_args.dataset = "Salesforce/wikitext"
26 | dataset_args.text_column = "text"
27 |
28 | super().__init__(
29 | dataset_args=dataset_args,
30 | split=split,
31 | processor=processor,
32 | )
33 |
--------------------------------------------------------------------------------
/src/llmcompressor/transformers/finetune/text_generation.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | # Copyright 2020 The HuggingFace Inc. team. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | # Adapted from https://github.com/huggingface/transformers
18 | # vllm-project: no copyright
19 |
20 |
21 | from compressed_tensors.utils.helpers import deprecated
22 |
23 |
24 | @deprecated(
25 | message=(
26 | "`from llmcompressor.transformers import oneshot` is deprecated, "
27 | "please use `from llmcompressor import oneshot`."
28 | )
29 | )
30 | def oneshot(**kwargs) -> None:
31 | from llmcompressor import oneshot
32 |
33 | oneshot(**kwargs)
34 |
35 |
36 | @deprecated(
37 | message=(
38 | "`from llmcompressor import train` is deprecated, "
39 | "please use `from llmcompressor import train`."
40 | )
41 | )
42 | def train(**kwargs):
43 | from llmcompressor import train
44 |
45 | train(**kwargs)
46 |
47 |
48 | def apply(**kwargs):
49 | message = (
50 | "`from llmcompressor.transformers import apply, compress` is deprecated, "
51 | "please use `from llmcompressor import oneshot, train` "
52 | "for sequential stages."
53 | )
54 | raise ValueError(message)
55 |
--------------------------------------------------------------------------------
/src/llmcompressor/transformers/finetune/trainer.py:
--------------------------------------------------------------------------------
1 | from transformers import Trainer as HFTransformersTrainer
2 |
3 | from llmcompressor.transformers.finetune.session_mixin import SessionManagerMixIn
4 |
5 | __all__ = ["Trainer"]
6 |
7 |
8 | class Trainer(SessionManagerMixIn, HFTransformersTrainer):
9 | pass
10 |
--------------------------------------------------------------------------------
/src/llmcompressor/transformers/sparsification/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Objects, classes, and methods for applying sparsification algorithms to
3 | Hugging Face transformers flows
4 | """
5 |
6 | # flake8: noqa
7 | from .sparse_model import *
8 |
--------------------------------------------------------------------------------
/src/llmcompressor/transformers/sparsification/sparse_model.py:
--------------------------------------------------------------------------------
1 | import inspect
2 | from typing import Optional
3 |
4 | from loguru import logger
5 | from torch.nn import Module
6 | from transformers import AutoModelForCausalLM
7 |
8 | __all__ = [
9 | "SparseAutoModelForCausalLM",
10 | "get_processor_name_from_model",
11 | ]
12 |
13 |
14 | class SparseAutoModelForCausalLM:
15 | def from_pretrained(*args, **kwargs):
16 | logger.warning(
17 | "SparseAutoModelForCausalLM is deprecated, "
18 | "please use AutoModelForCausalLM"
19 | )
20 | return AutoModelForCausalLM.from_pretrained(*args, **kwargs)
21 |
22 |
23 | def get_processor_name_from_model(student: Module, teacher: Optional[Module]) -> str:
24 | """
25 | Get a processor/tokenizer source used for both student and teacher, assuming
26 | that they could be shared
27 |
28 | :param student: the student model
29 | :param teacher: the teacher model
30 | :return: the source for the processor/tokenizer shared between teacher and model
31 | """
32 |
33 | if teacher is not None and teacher not in ("disable", "self"):
34 | student_forward_params = list(
35 | inspect.signature(student.forward).parameters.keys()
36 | )
37 | teacher_forward_params = list(
38 | inspect.signature(teacher.forward).parameters.keys()
39 | )
40 | diff = [p for p in student_forward_params if p not in teacher_forward_params]
41 | if diff:
42 | raise RuntimeError(
43 | "Teacher tokenizer cannot be used for student "
44 | f"due to missing args: {diff}"
45 | )
46 | src_model = teacher
47 | else:
48 | src_model = student
49 | return src_model.config._name_or_path
50 |
--------------------------------------------------------------------------------
/src/llmcompressor/transformers/tracing/__init__.py:
--------------------------------------------------------------------------------
1 | from .debug import trace
2 |
3 | __all__ = ["trace"]
4 |
--------------------------------------------------------------------------------
/src/llmcompressor/transformers/utils/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Utilities for applying sparsification algorithms to Hugging Face transformers flows
3 | """
4 |
5 | # flake8: noqa
6 | from .helpers import *
7 |
--------------------------------------------------------------------------------
/src/llmcompressor/transformers/utils/preprocessing_functions.py:
--------------------------------------------------------------------------------
1 | from typing import TYPE_CHECKING, Dict
2 |
3 | from compressed_tensors.registry import RegistryMixin
4 |
5 | if TYPE_CHECKING:
6 | from llmcompressor.transformers.finetune.data.base import TextGenerationDataset
7 |
8 |
9 | class PreprocessingFunctionRegistry(RegistryMixin):
10 | pass
11 |
12 |
13 | @PreprocessingFunctionRegistry.register()
14 | def custom_evolved_codealpaca_dataset(self: "TextGenerationDataset", data: Dict):
15 | PROMPT_TEMPLATE = """[Instruction]:\n{instruction}\n\n[Response]:"""
16 | data["prompt"] = PROMPT_TEMPLATE.format_map(data)
17 | data["text"] = data["prompt"] + data["output"]
18 | return data
19 |
--------------------------------------------------------------------------------
/src/llmcompressor/typing.py:
--------------------------------------------------------------------------------
1 | from typing import Union
2 |
3 | from datasets import Dataset, DatasetDict, IterableDataset
4 | from transformers import (
5 | BaseImageProcessor,
6 | FeatureExtractionMixin,
7 | PreTrainedTokenizer,
8 | ProcessorMixin,
9 | )
10 |
11 | # Tokenizer or Processor. Processors do not inherit from a unified base class
12 | Processor = Union[
13 | PreTrainedTokenizer, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin
14 | ]
15 |
16 | # Supported dataset types, IterableDataset is a streamed dataset
17 | DatasetType = Union[Dataset, DatasetDict, IterableDataset]
18 |
--------------------------------------------------------------------------------
/src/llmcompressor/utils/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | General utility functions used throughout llmcompressor
3 | """
4 |
5 | # flake8: noqa
6 |
7 | from .helpers import *
8 |
--------------------------------------------------------------------------------
/src/llmcompressor/utils/fsdp/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 |
--------------------------------------------------------------------------------
/src/llmcompressor/utils/fsdp/context.py:
--------------------------------------------------------------------------------
1 | try:
2 | from accelerate import Accelerator
3 | except ImportError:
4 | Accelerator = None
5 |
6 | try:
7 | from torch.distributed.fsdp import FullyShardedDataParallel
8 | from torch.distributed.fsdp._common_utils import FSDP_WRAPPED_MODULE, TrainingState
9 | except ImportError:
10 | FullyShardedDataParallel = None
11 |
12 | from contextlib import nullcontext
13 |
14 | __all__ = [
15 | "summon_full_params_context",
16 | "main_process_first_context",
17 | "fix_fsdp_module_name",
18 | ]
19 |
20 |
21 | def summon_full_params_context(model, offload_to_cpu: bool = False):
22 | if FullyShardedDataParallel is not None:
23 | # avoid nested summon_full_param context
24 | if (
25 | hasattr(model, "training_state")
26 | and model.training_state is TrainingState.SUMMON_FULL_PARAMS
27 | ):
28 | return nullcontext()
29 | return FullyShardedDataParallel.summon_full_params(
30 | model, offload_to_cpu=offload_to_cpu
31 | )
32 |
33 | return nullcontext()
34 |
35 |
36 | def main_process_first_context():
37 | """
38 | Creates a context manager where the main process runs the block before all other
39 | processes. Returns a nullcontext when called from a single process application.
40 | """
41 | if Accelerator is None:
42 | return nullcontext()
43 |
44 | return Accelerator().main_process_first()
45 |
46 |
47 | def fix_fsdp_module_name(name: str) -> str:
48 | """
49 | Remove FSDP wrapper prefixes from a module name.
50 | Accounts for scenario where FSDP_WRAPPED_MODULE is
51 | at the end of the name, as well as in the middle.
52 |
53 | :param name: name to strip
54 | :return: stripped name
55 | """
56 | if FullyShardedDataParallel is None:
57 | return name
58 |
59 | return name.replace(FSDP_WRAPPED_MODULE + ".", "").replace(
60 | "." + FSDP_WRAPPED_MODULE, ""
61 | )
62 |
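For illustration, assuming FSDP is importable and that torch's `FSDP_WRAPPED_MODULE` constant equals `"_fsdp_wrapped_module"`, the stripping behaves roughly like this:

```python
# Illustrative only; assumes FSDP is importable and that
# FSDP_WRAPPED_MODULE == "_fsdp_wrapped_module".
from llmcompressor.utils.fsdp.context import fix_fsdp_module_name

wrapped = "model.layers.0._fsdp_wrapped_module.self_attn.q_proj"
print(fix_fsdp_module_name(wrapped))  # -> model.layers.0.self_attn.q_proj
```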
--------------------------------------------------------------------------------
/src/llmcompressor/utils/pytorch/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 |
3 | from .module import *
4 |
--------------------------------------------------------------------------------
/src/llmcompressor/utils/pytorch/utils.py:
--------------------------------------------------------------------------------
1 | import gc
2 |
3 | import torch
4 |
5 | __all__ = ["measure_cuda_memory"]
6 |
7 |
8 | class measure_cuda_memory:
9 | def __init__(self, device=None):
10 | self.device = device
11 |
12 | def reset_peak_memory_stats(self):
13 | torch.cuda.reset_peak_memory_stats(self.device)
14 |
15 | def current_memory_usage(self) -> float:
16 | # Return the memory usage in bytes.
17 | self.reset_peak_memory_stats()
18 | mem = torch.cuda.max_memory_allocated(self.device)
19 | return mem
20 |
21 | def peak_memory_usage(self) -> float:
22 | # Return the peak memory usage in bytes since the last reset
23 | mem = torch.cuda.max_memory_allocated(self.device)
24 | return mem
25 |
26 | def __enter__(self):
27 | self.initial_memory = self.current_memory_usage()
28 | # This allows us to call methods of the context manager if needed
29 | return self
30 |
31 | def __exit__(self, exc_type, exc_val, exc_tb):
32 | self.overall_peak_memory = self.peak_memory_usage()
33 | self.peak_consumed_memory = self.overall_peak_memory - self.initial_memory
34 |
35 | # Force garbage collection
36 | gc.collect()
37 |
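A short usage sketch of the context manager (requires a CUDA device; the tensor size is arbitrary):

```python
# Requires a CUDA device; the tensor size is arbitrary for illustration.
import torch

from llmcompressor.utils.pytorch.utils import measure_cuda_memory

with measure_cuda_memory() as mem:
    x = torch.empty(1024, 1024, device="cuda")  # allocate roughly 4 MiB on the GPU

print(f"Peak memory consumed inside the block: {mem.peak_consumed_memory} bytes")
```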
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/__init__.py
--------------------------------------------------------------------------------
/tests/data.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from enum import Enum
3 |
4 |
5 | # TODO: maybe test type as decorators?
6 | class TestType(Enum):
7 | SANITY = "sanity"
8 | REGRESSION = "regression"
9 | SMOKE = "smoke"
10 |
11 |
12 | class Cadence(Enum):
13 | COMMIT = "commit"
14 | WEEKLY = "weekly"
15 | NIGHTLY = "nightly"
16 |
17 |
18 | @dataclass
19 | class TestConfig:
20 | test_type: TestType
21 | cadence: Cadence
22 |
23 |
24 | @dataclass
25 | class CustomTestConfig(TestConfig):
26 | script_path: str
27 |
--------------------------------------------------------------------------------
/tests/e2e/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/e2e/__init__.py
--------------------------------------------------------------------------------
/tests/e2e/vLLM/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/e2e/vLLM/__init__.py
--------------------------------------------------------------------------------
/tests/e2e/vLLM/configs/fp8_dynamic_per_token.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
4 | scheme: FP8_DYNAMIC
--------------------------------------------------------------------------------
/tests/e2e/vLLM/configs/fp8_dynamic_per_token_qwen.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: Qwen/Qwen2.5-0.5B
4 | scheme: FP8_DYNAMIC
--------------------------------------------------------------------------------
/tests/e2e/vLLM/configs/fp8_static_per_tensor.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
4 | scheme: FP8
5 | dataset_id: HuggingFaceH4/ultrachat_200k
6 | dataset_split: train_sft
--------------------------------------------------------------------------------
/tests/e2e/vLLM/configs/fp8_weight_only_channel.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
4 | recipe: tests/e2e/vLLM/recipes/FP8/recipe_fp8_weight_only_channel.yaml
5 | scheme: FP8A16_channel
--------------------------------------------------------------------------------
/tests/e2e/vLLM/configs/fp8_weight_only_tensor.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
4 | recipe: tests/e2e/vLLM/recipes/FP8/recipe_fp8_weight_only_per_tensor.yaml
5 | scheme: FP8A16_tensor
--------------------------------------------------------------------------------
/tests/e2e/vLLM/configs/int8_channel_weight_static_per_tensor_act.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
4 | recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_static_per_tensor_act.yaml
5 | dataset_id: HuggingFaceH4/ultrachat_200k
6 | dataset_split: train_sft
7 | scheme: W8A8_channel_weight_static_per_tensor
--------------------------------------------------------------------------------
/tests/e2e/vLLM/configs/int8_dynamic_per_token.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
4 | scheme: W8A8
5 | dataset_id: HuggingFaceH4/ultrachat_200k
6 | dataset_split: train_sft
--------------------------------------------------------------------------------
/tests/e2e/vLLM/configs/int8_tensor_weight_static_per_tensor_act.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
4 | recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_tensor_weight_static_per_tensor_act.yaml
5 | dataset_id: HuggingFaceH4/ultrachat_200k
6 | dataset_split: train_sft
7 | scheme: W8A8_tensor_weight_static_per_tensor_act
8 |
--------------------------------------------------------------------------------
/tests/e2e/vLLM/configs/int8_tensor_weight_static_per_tensor_act_qwen.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: Qwen/Qwen2.5-0.5B
4 | recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_tensor_weight_static_per_tensor_act.yaml
5 | dataset_id: garage-bAInd/Open-Platypus
6 | dataset_split: train
7 | scheme: W8A8_tensor_weight_static_per_tensor_act
8 |
--------------------------------------------------------------------------------
/tests/e2e/vLLM/configs/kv_cache_gptq_tinyllama.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
4 | recipe: tests/e2e/vLLM/recipes/kv_cache/gptq.yaml
5 | dataset_id: HuggingFaceH4/ultrachat_200k
6 | dataset_split: train_sft
7 | scheme: kv_cache_default_gptq_tinyllama
--------------------------------------------------------------------------------
/tests/e2e/vLLM/configs/kv_cache_phi3.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: microsoft/Phi-3-mini-4k-instruct
4 | recipe: tests/e2e/vLLM/recipes/kv_cache/default.yaml
5 | dataset_id: HuggingFaceH4/ultrachat_200k
6 | dataset_split: train_sft
7 | scheme: kv_cache_default_phi3
--------------------------------------------------------------------------------
/tests/e2e/vLLM/configs/kv_cache_tinyllama.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
4 | recipe: tests/e2e/vLLM/recipes/kv_cache/default.yaml
5 | dataset_id: HuggingFaceH4/ultrachat_200k
6 | dataset_split: train_sft
7 | scheme: kv_cache_default_tinyllama
--------------------------------------------------------------------------------
/tests/e2e/vLLM/configs/sparse2of4_fp8_dynamic.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
4 | recipe: tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4_fp8_dynamic.yaml
5 | scheme: sparse2of4_fp8_dynamic
6 | dataset_id: HuggingFaceH4/ultrachat_200k
7 | dataset_split: train_sft
--------------------------------------------------------------------------------
/tests/e2e/vLLM/configs/sparse2of4_fp8_dynamic_qwen.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: Qwen/Qwen2.5-0.5B
4 | recipe: tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4_fp8_dynamic.yaml
5 | scheme: sparse2of4_fp8_dynamic
6 | dataset_id: garage-bAInd/Open-Platypus
7 | dataset_split: train
--------------------------------------------------------------------------------
/tests/e2e/vLLM/configs/sparse_24.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
4 | recipe: tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4.yaml
5 | scheme: sparse2of4_only
6 | dataset_id: HuggingFaceH4/ultrachat_200k
7 | dataset_split: train_sft
8 | save_compressed: True
--------------------------------------------------------------------------------
/tests/e2e/vLLM/configs/w4a16_2of4_channel_quant.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
4 | scheme: W4A16_2of4_channel
5 | dataset_id: HuggingFaceH4/ultrachat_200k
6 | dataset_split: train_sft
7 | recipe: tests/e2e/vLLM/recipes/WNA16_2of4/2of4_w4a16_recipe.yaml
--------------------------------------------------------------------------------
/tests/e2e/vLLM/configs/w4a16_2of4_grouped_quant.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
4 | scheme: W4A16_2of4
5 | dataset_id: HuggingFaceH4/ultrachat_200k
6 | dataset_split: train_sft
7 | recipe: tests/e2e/vLLM/recipes/WNA16_2of4/2of4_w4a16_group-128_recipe.yaml
--------------------------------------------------------------------------------
/tests/e2e/vLLM/configs/w4a16_actorder_group.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
4 | recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_group.yaml
5 | dataset_id: openai/gsm8k
6 | dataset_config: main
7 | dataset_split: train
8 | scheme: W4A16_actorder_group
9 | save_dir: TinyLlama-1.1B-Chat-v1.0-actorder-group
--------------------------------------------------------------------------------
/tests/e2e/vLLM/configs/w4a16_actorder_group_qwen.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: Qwen/Qwen2.5-0.5B
4 | recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_group.yaml
5 | dataset_id: neuralmagic/LLM_compression_calibration
6 | dataset_split: train
7 | scheme: W4A16_actorder_group
8 | save_dir: Qwen2.5-0.5B-actorder-group
--------------------------------------------------------------------------------
/tests/e2e/vLLM/configs/w4a16_actorder_weight.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
4 | recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml
5 | dataset_id: openai/gsm8k
6 | dataset_config: main
7 | dataset_split: train
8 | scheme: W4A16_actorder_weight
9 | save_dir: TinyLlama-1.1B-Chat-v1.0-actorder-weight
--------------------------------------------------------------------------------
/tests/e2e/vLLM/configs/w4a16_actorder_weight_qwen.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: Qwen/Qwen2.5-0.5B
4 | recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml
5 | dataset_id: Open-Orca/slimorca-deduped-cleaned-corrected
6 | dataset_split: train
7 | scheme: W4A16_actorder_weight
8 | save_dir: Qwen2.5-0.5B-actorder-weight
--------------------------------------------------------------------------------
/tests/e2e/vLLM/configs/w4a16_channel_quant.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
4 | scheme: W4A16_channel
5 | dataset_id: HuggingFaceH4/ultrachat_200k
6 | dataset_split: train_sft
7 | recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_channel_quant.yaml
--------------------------------------------------------------------------------
/tests/e2e/vLLM/configs/w4a16_channel_quant_qwen.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: Qwen/Qwen2.5-0.5B
4 | scheme: W4A16_channel
5 | dataset_id: Open-Orca/slimorca-deduped-cleaned-corrected
6 | dataset_split: train
7 | recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_channel_quant.yaml
--------------------------------------------------------------------------------
/tests/e2e/vLLM/configs/w4a16_grouped_quant.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
4 | scheme: W4A16
5 | dataset_id: HuggingFaceH4/ultrachat_200k
6 | dataset_split: train_sft
7 | quant_type: "GPTQ"
--------------------------------------------------------------------------------
/tests/e2e/vLLM/configs/w4a16_grouped_quant_asym_awq.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
4 | recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_asym_awq.yaml
5 | dataset_id: "mit-han-lab/pile-val-backup"
6 | dataset_split: validation
7 | num_calibration_samples: 2000
8 | scheme: W4A16_weight_asym_awq
9 | save_dir: TinyLlama-1.1B-Chat-v1.0-w4a16-asym-awq
--------------------------------------------------------------------------------
/tests/e2e/vLLM/configs/w8a16_channel_quant.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
4 | scheme: W8A16_channel
5 | dataset_id: HuggingFaceH4/ultrachat_200k
6 | dataset_split: train_sft
7 | recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w8a16_channel_quant.yaml
--------------------------------------------------------------------------------
/tests/e2e/vLLM/configs/w8a16_grouped_quant.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
4 | scheme: W8A16
5 | dataset_id: HuggingFaceH4/ultrachat_200k
6 | dataset_split: train_sft
7 | quant_type: "GPTQ"
--------------------------------------------------------------------------------
/tests/e2e/vLLM/configs/w8a8_dynamic_asym.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
4 | dataset_id: HuggingFaceH4/ultrachat_200k
5 | dataset_split: train_sft
6 | scheme: W8A8_dynamic_asym_activations
7 | recipe: tests/e2e/vLLM/recipes/INT8/recipe_w8a8_dynamic_asym.yaml
8 | save_dir: TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Asym
9 |
--------------------------------------------------------------------------------
/tests/e2e/vLLM/configs/w8a8_static_asym.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
4 | dataset_id: HuggingFaceH4/ultrachat_200k
5 | dataset_split: train_sft
6 | scheme: W8A8_static_asym_activations
7 | recipe: tests/e2e/vLLM/recipes/INT8/recipe_w8a8_static_asym.yaml
8 | save_dir: TinyLlama-1.1B-Chat-v1.0-W8A8-Static-Asym
9 |
--------------------------------------------------------------------------------
/tests/e2e/vLLM/recipes/FP8/recipe_fp8_weight_only_channel.yaml:
--------------------------------------------------------------------------------
1 | quant_stage:
2 | quant_modifiers:
3 | QuantizationModifier:
4 | ignore: [lm_head]
5 | config_groups:
6 | group_0:
7 | weights: {num_bits: 8, type: float, symmetric: true, strategy: channel, dynamic: false}
8 | targets: [Linear]
9 |
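
A sketch of applying a recipe file such as the one above through the oneshot entrypoint; the import path, argument names, and output directory are assumptions based on this repository's layout rather than a verbatim example from its docs. A weight-only scheme needs no calibration data, so no dataset is passed:

from llmcompressor import oneshot  # assumption: top-level re-export of the oneshot entrypoint

oneshot(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    recipe="tests/e2e/vLLM/recipes/FP8/recipe_fp8_weight_only_channel.yaml",
    output_dir="TinyLlama-1.1B-Chat-v1.0-fp8-weight-only",  # hypothetical save location
)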
--------------------------------------------------------------------------------
/tests/e2e/vLLM/recipes/FP8/recipe_fp8_weight_only_per_tensor.yaml:
--------------------------------------------------------------------------------
1 | quant_stage:
2 | quant_modifiers:
3 | QuantizationModifier:
4 | ignore: [lm_head]
5 | config_groups:
6 | group_0:
7 | weights: {num_bits: 8, type: float, symmetric: true, strategy: tensor, dynamic: false}
8 | targets: [Linear]
9 |
--------------------------------------------------------------------------------
/tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_dynamic_per_token.yaml:
--------------------------------------------------------------------------------
1 | quant_stage:
2 | quant_modifiers:
3 | SmoothQuantModifier:
4 | smoothing_strength: 0.8
5 | GPTQModifier:
6 | ignore: ["lm_head", "re:vision_tower.*", "re:multi_modal_projector.*", "re:visual.*", "re:vision_model.*"]
7 | config_groups:
8 | group_0:
9 | weights: {num_bits: 8, type: int, symmetric: true, strategy: channel}
10 | input_activations: {num_bits: 8, type: int, symmetric: true, strategy: token, dynamic: true}
11 | targets: [Linear]
12 |
--------------------------------------------------------------------------------
/tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_static_per_tensor_act.yaml:
--------------------------------------------------------------------------------
1 | quant_stage:
2 | quant_modifiers:
3 | SmoothQuantModifier:
4 | smoothing_strength: 0.8
5 | GPTQModifier:
6 | ignore: [lm_head]
7 | config_groups:
8 | group_0:
9 | weights: {num_bits: 8, type: int, symmetric: true, strategy: channel}
10 | input_activations: {num_bits: 8, type: int, symmetric: true, strategy: tensor}
11 | targets: [Linear]
--------------------------------------------------------------------------------
/tests/e2e/vLLM/recipes/INT8/recipe_int8_tensor_weight_static_per_tensor_act.yaml:
--------------------------------------------------------------------------------
1 | quant_stage:
2 | quant_modifiers:
3 | SmoothQuantModifier:
4 | smoothing_strength: 0.8
5 | QuantizationModifier:
6 | ignore: [lm_head]
7 | config_groups:
8 | group_0:
9 | weights: {num_bits: 8, type: int, symmetric: true, strategy: tensor}
10 | input_activations: {num_bits: 8, type: int, symmetric: true, strategy: tensor}
11 | targets: [Linear]
--------------------------------------------------------------------------------
/tests/e2e/vLLM/recipes/INT8/recipe_w8a8_dynamic_asym.yaml:
--------------------------------------------------------------------------------
1 | quant_stage:
2 | quant_modifiers:
3 | SmoothQuantModifier:
4 | smoothing_strength: 0.8
5 | mappings:
6 | - - ['re:.*q_proj', 're:.*k_proj', 're:.*v_proj']
7 | - re:.*input_layernorm
8 | - - ['re:.*gate_proj', 're:.*up_proj']
9 | - re:.*post_attention_layernorm
10 | GPTQModifier:
11 | ignore: [lm_head]
12 | config_groups:
13 | group_0:
14 | weights: {num_bits: 8, type: int, symmetric: true, strategy: channel}
15 | input_activations: {num_bits: 8, symmetric: false, dynamic: true, strategy: token, type: int}
16 | targets: [Linear]
17 |
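
The mappings block above can also be written directly in Python when constructing the modifier; a sketch that mirrors the (balance_layers, smooth_layer) pair format used by the SmoothQuant tests later in this dump:

from llmcompressor.modifiers.smoothquant.base import SmoothQuantModifier

modifier = SmoothQuantModifier(
    smoothing_strength=0.8,
    mappings=[
        (["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"], "re:.*input_layernorm"),
        (["re:.*gate_proj", "re:.*up_proj"], "re:.*post_attention_layernorm"),
    ],
)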
--------------------------------------------------------------------------------
/tests/e2e/vLLM/recipes/INT8/recipe_w8a8_static_asym.yaml:
--------------------------------------------------------------------------------
1 | quant_stage:
2 | quant_modifiers:
3 | SmoothQuantModifier:
4 | smoothing_strength: 0.8
5 | GPTQModifier:
6 | ignore: [lm_head]
7 | config_groups:
8 | group_0:
9 | weights: {num_bits: 8, type: int, symmetric: true, strategy: channel}
10 | input_activations: {num_bits: 8, symmetric: false, dynamic: false, strategy: tensor, type: int}
11 | targets: [Linear]
12 |
--------------------------------------------------------------------------------
/tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4.yaml:
--------------------------------------------------------------------------------
1 | sparsity_stage:
2 | sparsity_modifiers:
3 | SparseGPTModifier:
4 | sparsity: 0.5
5 | mask_structure: "2:4"
6 | targets: ["Linear"]
7 | ignore: ["re:.*lm_head"]
8 |
--------------------------------------------------------------------------------
/tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4_fp8_dynamic.yaml:
--------------------------------------------------------------------------------
1 | sparsity_stage:
2 | run_type: oneshot
3 | sparsity_modifiers:
4 | SparseGPTModifier:
5 | sparsity: 0.5
6 | mask_structure: "2:4"
7 | targets: ["Linear"]
8 | ignore: ["re:.*lm_head"]
9 | quantization_stage:
10 | run_type: oneshot
11 | quantization_modifiers:
12 | QuantizationModifier:
13 | targets: ["Linear"]
14 | ignore: ["lm_head"]
15 | scheme: "FP8_DYNAMIC"
16 |
--------------------------------------------------------------------------------
/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_channel_quant.yaml:
--------------------------------------------------------------------------------
1 | quant_stage:
2 | quant_modifiers:
3 | GPTQModifier:
4 | ignore: [lm_head]
5 | config_groups:
6 | group_0:
7 | weights: {num_bits: 4, type: int, symmetric: true, strategy: channel, dynamic: false}
8 | targets: [Linear]
9 |
--------------------------------------------------------------------------------
/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_asym_awq.yaml:
--------------------------------------------------------------------------------
1 | quant_stage:
2 | quant_modifiers:
3 | AWQModifier:
4 | ignore: [lm_head]
5 | config_groups:
6 | group_0:
7 | weights: {num_bits: 4, type: int, symmetric: false, strategy: "group", group_size: 128}
8 | targets: [Linear]
9 |
--------------------------------------------------------------------------------
/tests/e2e/vLLM/recipes/WNA16/recipe_w8a16_channel_quant.yaml:
--------------------------------------------------------------------------------
1 | quant_stage:
2 | quant_modifiers:
3 | GPTQModifier:
4 | ignore: [lm_head]
5 | config_groups:
6 | group_0:
7 | weights: {num_bits: 8, type: int, symmetric: true, strategy: channel, dynamic: false}
8 | targets: [Linear]
9 |
--------------------------------------------------------------------------------
/tests/e2e/vLLM/recipes/WNA16_2of4/2of4_w4a16_group-128_recipe.yaml:
--------------------------------------------------------------------------------
1 | sparsity_stage:
2 | run_type: oneshot
3 | sparsity_modifiers:
4 | SparseGPTModifier:
5 | sparsity: 0.5
6 | mask_structure: "2:4"
7 | targets: ["Linear"]
8 | ignore: ["re:.*lm_head"]
9 | quantization_stage:
10 | run_type: oneshot
11 | quantization_modifiers:
12 | GPTQModifier:
13 | ignore: ["lm_head"]
14 | config_groups:
15 | group_0:
16 | weights:
17 | num_bits: 4
18 | type: "int"
19 | symmetric: true
20 | strategy: "group"
21 | group_size: 128
22 | targets: ["Linear"]
23 |
--------------------------------------------------------------------------------
/tests/e2e/vLLM/recipes/WNA16_2of4/2of4_w4a16_recipe.yaml:
--------------------------------------------------------------------------------
1 | sparsity_stage:
2 | run_type: oneshot
3 | sparsity_modifiers:
4 | SparseGPTModifier:
5 | sparsity: 0.5
6 | mask_structure: "2:4"
7 | targets: ["Linear"]
8 | ignore: ["re:.*lm_head"]
9 | quantization_stage:
10 | run_type: oneshot
11 | quantization_modifiers:
12 | GPTQModifier:
13 | ignore: ["lm_head"]
14 | config_groups:
15 | group_0:
16 | weights:
17 | num_bits: 4
18 | type: "int"
19 | symmetric: true
20 | strategy: "channel"
21 | targets: ["Linear"]
22 |
--------------------------------------------------------------------------------
/tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_group.yaml:
--------------------------------------------------------------------------------
1 | quant_stage:
2 | quant_modifiers:
3 | GPTQModifier:
4 | ignore: ["lm_head"]
5 | config_groups:
6 | group_0:
7 | weights:
8 | num_bits: 4
9 | type: "int"
10 | symmetric: true
11 | strategy: "group"
12 | group_size: 128
13 | actorder: "group"
14 | targets: ["Linear"]
15 |
--------------------------------------------------------------------------------
/tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml:
--------------------------------------------------------------------------------
1 | quant_stage:
2 | quant_modifiers:
3 | GPTQModifier:
4 | ignore: ["lm_head", "re:vision_tower.*", "re:multi_modal_projector.*", "re:visual.*", "re:vision_model.*"]
5 | config_groups:
6 | group_0:
7 | weights:
8 | num_bits: 4
9 | type: "int"
10 | symmetric: true
11 | strategy: "group"
12 | group_size: 128
13 | actorder: "weight"
14 | targets: ["Linear"]
15 |
--------------------------------------------------------------------------------
/tests/e2e/vLLM/recipes/kv_cache/default.yaml:
--------------------------------------------------------------------------------
1 | quant_stage:
2 | quant_modifiers:
3 | QuantizationModifier:
4 | kv_cache_scheme:
5 | {num_bits: 8, type: float, symmetric: true, strategy: tensor}
6 |
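
For reference, a sketch of an in-code equivalent of this kv-cache recipe; it assumes the QuantizationModifier exported from llmcompressor.modifiers.quantization accepts the same kv_cache_scheme field as the recipe, with the scheme built from the QuantizationArgs class used elsewhere in these tests:

from compressed_tensors.quantization import QuantizationArgs

from llmcompressor.modifiers.quantization import QuantizationModifier

modifier = QuantizationModifier(
    kv_cache_scheme=QuantizationArgs(
        num_bits=8, type="float", symmetric=True, strategy="tensor"
    )
)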
--------------------------------------------------------------------------------
/tests/e2e/vLLM/recipes/kv_cache/gptq.yaml:
--------------------------------------------------------------------------------
1 | quant_stage:
2 | quant_modifiers:
3 | GPTQModifier:
4 | sequential_update: false
5 | ignore: ["lm_head"]
6 | config_groups:
7 | group_0:
8 | weights:
9 | num_bits: 4
10 | type: "int"
11 | symmetric: true
12 | strategy: "channel"
13 | actorder: False
14 | targets: ["Linear"]
15 | kv_cache_scheme:
16 | {num_bits: 8, type: float, symmetric: true, strategy: tensor}
--------------------------------------------------------------------------------
/tests/e2e/vLLM/run_tests.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | SUCCESS=0
4 |
5 | while getopts "c:t:" OPT; do
6 | case ${OPT} in
7 | c )
8 | CONFIG="$OPTARG"
9 | ;;
10 | t )
11 | TEST="$OPTARG"
12 | ;;
13 | \? )
14 | exit 1
15 | ;;
16 | esac
17 | done
18 |
19 | # Parse list of configs.
20 | for MODEL_CONFIG in "$CONFIG"/*
21 | do
22 | LOCAL_SUCCESS=0
23 |
24 | echo "=== RUNNING MODEL: $MODEL_CONFIG ==="
25 |
26 | export TEST_DATA_FILE="$MODEL_CONFIG"
27 | pytest \
28 | --capture=tee-sys \
29 | "$TEST" || LOCAL_SUCCESS=$?
30 |
31 | if [[ $LOCAL_SUCCESS == 0 ]]; then
32 | echo "=== PASSED MODEL: $MODEL_CONFIG ==="
33 | else
34 | echo "=== FAILED MODEL: $MODEL_CONFIG ==="
35 | fi
36 |
37 | SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
38 |
39 | done
40 |
41 | exit "$SUCCESS"
42 |
--------------------------------------------------------------------------------
/tests/e2e/vLLM/skipped_configs/fp4_nvfp4a16.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
4 | scheme: NVFP4A16
--------------------------------------------------------------------------------
/tests/examples/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/examples/__init__.py
--------------------------------------------------------------------------------
/tests/examples/test_compressed_inference.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | import pytest
4 |
5 | from tests.examples.utils import (
6 | copy_and_run_script,
7 | gen_cmd_fail_message,
8 | requires_gpu_count,
9 | )
10 |
11 |
12 | @pytest.fixture
13 | def example_dir() -> str:
14 | return "examples/compressed_inference"
15 |
16 |
17 | @pytest.mark.example
18 | @requires_gpu_count(1)
19 | class TestCompressedInference:
20 | """
21 | Tests for examples in the "compressed_inference" example folder.
22 | """
23 |
24 | def test_fp8_example_script(self, example_dir: str, tmp_path: Path):
25 | """
26 | Test for the "fp8_compressed_inference.py" script in the folder.
27 | """
28 | script_filename = "fp8_compressed_inference.py"
29 | command, result = copy_and_run_script(tmp_path, example_dir, script_filename)
30 |
31 | assert result.returncode == 0, gen_cmd_fail_message(command, result)
32 |
--------------------------------------------------------------------------------
/tests/examples/test_quantization_kv_cache.py:
--------------------------------------------------------------------------------
1 | import shlex
2 | from pathlib import Path
3 |
4 | import pytest
5 |
6 | from tests.examples.utils import (
7 | ReadMe,
8 | copy_and_run_command,
9 | gen_cmd_fail_message,
10 | requires_gpu_count,
11 | )
12 |
13 |
14 | @pytest.fixture
15 | def example_dir() -> str:
16 | return "examples/quantization_kv_cache"
17 |
18 |
19 | @pytest.mark.example
20 | @requires_gpu_count(1)
21 | class TestQuantizationKVCache:
22 | """
23 | Tests for examples in the "quantization_kv_cache" example folder.
24 | """
25 |
26 | def test_doc_example_command(self, example_dir: str, tmp_path: Path):
27 | """
28 | Test for the example command in the README.
29 | """
30 | readme_path = Path.cwd() / example_dir / "README.md"
31 | readme = ReadMe(readme_path)
32 |
33 | command = readme.get_code_block_content(position=2, lang="shell")
34 | assert command.startswith("python")
35 |
36 | command = shlex.split(command)
37 | result = copy_and_run_command(tmp_path, example_dir, command)
38 |
39 | assert result.returncode == 0, gen_cmd_fail_message(command, result)
40 |
--------------------------------------------------------------------------------
/tests/examples/test_quantization_w4a16.py:
--------------------------------------------------------------------------------
1 | import shlex
2 | from pathlib import Path
3 |
4 | import pytest
5 |
6 | from tests.examples.utils import (
7 | ReadMe,
8 | copy_and_run_command,
9 | gen_cmd_fail_message,
10 | requires_gpu_count,
11 | )
12 |
13 |
14 | @pytest.fixture
15 | def example_dir() -> str:
16 | return "examples/quantization_w4a16"
17 |
18 |
19 | @pytest.mark.example
20 | @requires_gpu_count(1)
21 | class TestQuantizationW4A16:
22 | """
23 | Tests for examples in the "quantization_w4a16" example folder.
24 | """
25 |
26 | def test_doc_example_command(self, example_dir: str, tmp_path: Path):
27 | """
28 | Test for the example command in the README.
29 | """
30 | readme_path = Path.cwd() / example_dir / "README.md"
31 | readme = ReadMe(readme_path)
32 |
33 | command = readme.get_code_block_content(position=2, lang="shell")
34 | assert command.startswith("python")
35 |
36 | command = shlex.split(command)
37 | result = copy_and_run_command(tmp_path, example_dir, command)
38 |
39 | assert result.returncode == 0, gen_cmd_fail_message(command, result)
40 |
--------------------------------------------------------------------------------
/tests/examples/test_quantization_w8a8_fp8.py:
--------------------------------------------------------------------------------
1 | import shlex
2 | from pathlib import Path
3 |
4 | import pytest
5 |
6 | from tests.examples.utils import (
7 | ReadMe,
8 | copy_and_run_command,
9 | copy_and_run_script,
10 | gen_cmd_fail_message,
11 | requires_gpu_count,
12 | )
13 |
14 |
15 | @pytest.fixture
16 | def example_dir() -> str:
17 | return "examples/quantization_w8a8_fp8"
18 |
19 |
20 | @pytest.mark.example
21 | @requires_gpu_count(1)
22 | class TestQuantizationW8A8_FP8:
23 | """
24 | Tests for examples in the "quantization_w8a8_fp8" example folder.
25 | """
26 |
27 | def test_doc_example_command(self, example_dir: str, tmp_path: Path):
28 | """
29 | Test for the example command in the README.
30 | """
31 | readme_path = Path.cwd() / example_dir / "README.md"
32 | readme = ReadMe(readme_path)
33 |
34 | command = readme.get_code_block_content(position=2, lang="shell")
35 | assert command.startswith("python")
36 |
37 | command = shlex.split(command)
38 | result = copy_and_run_command(tmp_path, example_dir, command)
39 |
40 | assert result.returncode == 0, gen_cmd_fail_message(command, result)
41 |
42 | def test_gemma2_example_script(self, example_dir: str, tmp_path: Path):
43 | """
44 | Test for the "gemma2_example.py" script in the folder.
45 | """
46 | script_filename = "gemma2_example.py"
47 | command, result = copy_and_run_script(tmp_path, example_dir, script_filename)
48 |
49 | assert result.returncode == 0, gen_cmd_fail_message(command, result)
50 |
--------------------------------------------------------------------------------
/tests/examples/test_quantization_w8a8_int8.py:
--------------------------------------------------------------------------------
1 | import shlex
2 | from pathlib import Path
3 |
4 | import pytest
5 |
6 | from tests.examples.utils import (
7 | ReadMe,
8 | copy_and_run_command,
9 | copy_and_run_script,
10 | gen_cmd_fail_message,
11 | requires_gpu_count,
12 | )
13 |
14 |
15 | @pytest.fixture
16 | def example_dir() -> str:
17 | return "examples/quantization_w8a8_int8"
18 |
19 |
20 | @pytest.mark.example
21 | @requires_gpu_count(1)
22 | class TestQuantizationW8A8_Int8:
23 | """
24 | Tests for examples in the "quantization_w8a8_int8" example folder.
25 | """
26 |
27 | def test_doc_example_command(self, example_dir: str, tmp_path: Path):
28 | """
29 | Test for the example command in the README.
30 | """
31 | readme_path = Path.cwd() / example_dir / "README.md"
32 | readme = ReadMe(readme_path)
33 |
34 | command = readme.get_code_block_content(position=2, lang="shell")
35 | assert command.startswith("python")
36 |
37 | command = shlex.split(command)
38 | result = copy_and_run_command(tmp_path, example_dir, command)
39 |
40 | assert result.returncode == 0, gen_cmd_fail_message(command, result)
41 |
42 | def test_gemma2_example_script(self, example_dir: str, tmp_path: Path):
43 | """
44 | Test for the "gemma2_example.py" script in the folder.
45 | """
46 | script_filename = "gemma2_example.py"
47 | command, result = copy_and_run_script(tmp_path, example_dir, script_filename)
48 |
49 | assert result.returncode == 0, gen_cmd_fail_message(command, result)
50 |
--------------------------------------------------------------------------------
/tests/examples/test_quantizing_moe.py:
--------------------------------------------------------------------------------
1 | import shlex
2 | from pathlib import Path
3 |
4 | import pytest
5 |
6 | from tests.examples.utils import (
7 | ReadMe,
8 | copy_and_run_command,
9 | copy_and_run_script,
10 | gen_cmd_fail_message,
11 | requires_gpu_count,
12 | )
13 |
14 |
15 | @pytest.fixture
16 | def example_dir() -> str:
17 | return "examples/quantizing_moe"
18 |
19 |
20 | @pytest.mark.example
21 | class TestQuantizingMOE:
22 | """
23 | Tests for examples in the "quantizing_moe" example folder.
24 | """
25 |
26 | @pytest.mark.multi_gpu
27 | @requires_gpu_count(2)
28 | def test_doc_example_command(self, example_dir: str, tmp_path: Path):
29 | """
30 | Test for the example command in the README.
31 | """
32 | readme_path = Path.cwd() / example_dir / "README.md"
33 | readme = ReadMe(readme_path)
34 |
35 | command = readme.get_code_block_content(position=2, lang="shell")
36 | assert command.startswith("python")
37 |
38 | command = shlex.split(command)
39 | result = copy_and_run_command(tmp_path, example_dir, command)
40 |
41 | assert result.returncode == 0, gen_cmd_fail_message(command, result)
42 |
43 | @pytest.mark.parametrize(
44 | "script_filename",
45 | [
46 | pytest.param(
47 | "deepseek_moe_w4a16.py",
48 | marks=[
49 | pytest.mark.multi_gpu,
50 | pytest.mark.skip(reason="exceptionally long run time"),
51 | ],
52 | ),
53 | pytest.param("deepseek_moe_w8a8_fp8.py"),
54 | pytest.param("deepseek_moe_w8a8_int8.py", marks=pytest.mark.multi_gpu),
55 | ],
56 | )
57 | def test_deepseek_example_script(
58 | self, script_filename: str, example_dir: str, tmp_path: Path
59 | ):
60 | """
61 | Test for the other example scripts in the folder.
62 | """
63 | command, result = copy_and_run_script(tmp_path, example_dir, script_filename)
64 |
65 | assert result.returncode == 0, gen_cmd_fail_message(command, result)
66 |
--------------------------------------------------------------------------------
/tests/examples/test_sparse_2of4_quantization_fp8.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | import pytest
4 |
5 | from tests.examples.utils import (
6 | copy_and_run_script,
7 | gen_cmd_fail_message,
8 | requires_gpu_count,
9 | )
10 |
11 |
12 | @pytest.fixture
13 | def example_dir() -> str:
14 | return "examples/sparse_2of4_quantization_fp8"
15 |
16 |
17 | @requires_gpu_count(1)
18 | class TestSparse2of4QuantizationFP8:
19 | """
20 | Tests for examples in the "sparse_2of4_quantization_fp8" example folder.
21 | """
22 |
23 | @pytest.mark.parametrize("flags", [[], ["--fp8"]])
24 | def test_2of4_example_script(
25 | self, example_dir: str, tmp_path: Path, flags: list[str]
26 | ):
27 | """
28 | Tests for the "llama3_8b_2of4.py" example script.
29 | """
30 | script_filename = "llama3_8b_2of4.py"
31 | command, result = copy_and_run_script(
32 | tmp_path, example_dir, script_filename, flags=flags
33 | )
34 |
35 | assert result.returncode == 0, gen_cmd_fail_message(command, result)
36 |
--------------------------------------------------------------------------------
/tests/examples/test_trl_mixin.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | import pytest
4 |
5 | from tests.examples.utils import (
6 | copy_and_run_script,
7 | gen_cmd_fail_message,
8 | requires_gpu_count,
9 | )
10 |
11 |
12 | @pytest.fixture
13 | def example_dir() -> str:
14 | return "examples/trl_mixin"
15 |
16 |
17 | @pytest.mark.example
18 | @requires_gpu_count(1)
19 | class TestTRLMixin:
20 | """
21 | Tests for examples in the "trl_mixin" example folder.
22 | """
23 |
24 | @pytest.mark.parametrize(
25 | "script_filename",
26 | [
27 | "ex_trl_constant.py",
28 | # ex_trl_distillation.py hits CUDA OOM on 1x H100 (80 GiB VRAM)
29 | pytest.param("ex_trl_distillation.py", marks=pytest.mark.multi_gpu),
30 | ],
31 | )
32 | def test_example_scripts(
33 | self, example_dir: str, script_filename: str, tmp_path: Path
34 | ):
35 | """
36 | Test for the example scripts in the folder.
37 | """
38 | command, result = copy_and_run_script(tmp_path, example_dir, script_filename)
39 |
40 | assert result.returncode == 0, gen_cmd_fail_message(command, result)
41 |
--------------------------------------------------------------------------------
/tests/llmcompressor/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/__init__.py
--------------------------------------------------------------------------------
/tests/llmcompressor/metrics/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/metrics/__init__.py
--------------------------------------------------------------------------------
/tests/llmcompressor/metrics/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/metrics/utils/__init__.py
--------------------------------------------------------------------------------
/tests/llmcompressor/modifiers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/modifiers/__init__.py
--------------------------------------------------------------------------------
/tests/llmcompressor/modifiers/awq/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/modifiers/awq/__init__.py
--------------------------------------------------------------------------------
/tests/llmcompressor/modifiers/calibration/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/modifiers/calibration/__init__.py
--------------------------------------------------------------------------------
/tests/llmcompressor/modifiers/calibration/test_frozen.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing,
10 | # software distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from compressed_tensors.quantization.lifecycle.initialize import (
16 | initialize_module_for_quantization,
17 | )
18 | from compressed_tensors.quantization.quant_args import QuantizationArgs
19 | from compressed_tensors.quantization.quant_config import QuantizationStatus
20 | from compressed_tensors.quantization.quant_scheme import QuantizationScheme
21 | from torch.nn import Linear
22 |
23 | from llmcompressor.modifiers.quantization.calibration import (
24 | freeze_module_quantization,
25 | initialize_observer,
26 | )
27 |
28 |
29 | def test_set_module_for_calibration():
30 | num_bits = 8
31 | quantization_scheme = QuantizationScheme(
32 | targets=["*"],
33 | weights=QuantizationArgs(num_bits=num_bits, symmetric=True),
34 | input_activations=QuantizationArgs(num_bits=num_bits, symmetric=False),
35 | )
36 |
37 | layer = Linear(4, 4)
38 |
39 | initialize_module_for_quantization(layer, quantization_scheme)
40 | layer.quantization_status = QuantizationStatus("calibration")
41 | initialize_observer(layer, "weight")
42 |
43 | # the weight observer should be attached after initializing
44 | assert hasattr(layer, "weight_observer")
45 |
46 | # observers should get deleted after freezing
47 | freeze_module_quantization(layer)
48 | assert not hasattr(layer, "input_observer")
49 | assert not hasattr(layer, "weight_observer")
50 |
51 | assert layer.quantization_status == QuantizationStatus("frozen")
52 |
--------------------------------------------------------------------------------
/tests/llmcompressor/modifiers/calibration/test_observers.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import torch
3 | from compressed_tensors.quantization import (
4 | QuantizationArgs,
5 | QuantizationScheme,
6 | initialize_module_for_quantization,
7 | )
8 |
9 | from llmcompressor.modifiers.quantization.calibration import initialize_observer
10 |
11 |
12 | @pytest.mark.parametrize(
13 | "shape,group_size,actorder",
14 | [
15 | ((1, 1), None, False),
16 | ((1, 1), 128, False),
17 | ((1, 1), 128, True),
18 | ((64, 64), None, False),
19 | ((64, 64), 128, False),
20 | ((64, 64), 128, True),
21 | ((1792, 4096), None, False),
22 | ((1792, 4096), 128, False),
23 | ((1792, 4096), 128, True),
24 | ((3420, 64), None, False),
25 | ((3420, 64), 128, False),
26 | ((3420, 64), 128, True),
27 | ],
28 | )
29 | def test_observers_update(shape, group_size, actorder):
30 | module = torch.nn.Linear(*shape)
31 | scheme = QuantizationScheme(
32 | targets=["Linear"],
33 | weights=QuantizationArgs(group_size=group_size, actorder=actorder),
34 | input_activations=QuantizationArgs(),
35 | output_activations=QuantizationArgs(),
36 | )
37 |
38 | input = torch.empty(module.in_features, dtype=module.weight.dtype)
39 | output = torch.empty(module.out_features, dtype=module.weight.dtype)
40 |
41 | initialize_module_for_quantization(module, scheme)
42 | initialize_observer(module, "weight")
43 | initialize_observer(module, "input")
44 | initialize_observer(module, "output")
45 |
46 | for location, value in (
47 | ("weight", module.weight),
48 | ("input", input),
49 | ("output", output),
50 | ):
51 | observer = getattr(module, f"{location}_observer")
52 | g_idx = getattr(module, "g_idx", None)
53 | updated_scale, updated_zero_point = observer(value, g_idx=g_idx)
54 |
55 | assert_alike(updated_scale, getattr(module, f"{location}_scale"))
56 | assert_alike(updated_zero_point, getattr(module, f"{location}_zero_point"))
57 |
58 |
59 | def assert_alike(a, b):
60 | assert a.dtype == b.dtype
61 | assert a.shape == b.shape
62 |
--------------------------------------------------------------------------------
/tests/llmcompressor/modifiers/conf.py:
--------------------------------------------------------------------------------
1 | from unittest.mock import MagicMock
2 |
3 | from torch.utils.data import DataLoader
4 |
5 | from llmcompressor.core import Event, EventType, State
6 | from llmcompressor.modifiers.factory import ModifierFactory
7 |
8 |
9 | def setup_modifier_factory():
10 | ModifierFactory.refresh()
11 | assert ModifierFactory._loaded, "ModifierFactory not loaded"
12 |
13 |
14 | class LifecyleTestingHarness:
15 | def __init__(
16 | self,
17 | model=None,
18 | optimizer=None,
19 | device="cpu",
20 | start=0,
21 | ):
22 | self.state = State()
23 | self.state.update(
24 | model=model,
25 | device=device,
26 | optimizer=optimizer,
27 | start=start,
28 | steps_per_epoch=1,
29 | calib_data=DataLoader(MagicMock(__len__=lambda _: 0, column_names=[])),
30 | )
31 |
32 | def update_modifier(self, modifier, event_type):
33 | event = Event(event_type=event_type)
34 | modifier.update_event(self.state, event=event)
35 |
36 | def get_state(self):
37 | return self.state
38 |
39 | def trigger_modifier_for_epochs(self, modifier, num_epochs):
40 | for _ in range(num_epochs):
41 | self.update_modifier(modifier, EventType.BATCH_START)
42 | self.update_modifier(modifier, EventType.LOSS_CALCULATED)
43 | self.update_modifier(modifier, EventType.OPTIM_PRE_STEP)
44 | self.update_modifier(modifier, EventType.OPTIM_POST_STEP)
45 | self.update_modifier(modifier, EventType.BATCH_END)
46 |
--------------------------------------------------------------------------------
/tests/llmcompressor/modifiers/logarithmic_equalization/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/modifiers/logarithmic_equalization/__init__.py
--------------------------------------------------------------------------------
/tests/llmcompressor/modifiers/logarithmic_equalization/test_base.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import pytest
4 |
5 | from llmcompressor.modifiers.factory import ModifierFactory
6 | from llmcompressor.modifiers.logarithmic_equalization.base import (
7 | LogarithmicEqualizationModifier,
8 | )
9 | from llmcompressor.modifiers.smoothquant.base import SmoothQuantModifier
10 | from tests.llmcompressor.modifiers.conf import setup_modifier_factory
11 |
12 |
13 | @pytest.mark.unit
14 | class TestLogarithmicEqualizationIsRegistered(unittest.TestCase):
15 | def setUp(self):
16 | self.kwargs = dict(
17 | smoothing_strength=0.3,
18 | mappings=[(["layer1", "layer2"], "layer3")],
19 | )
20 | setup_modifier_factory()
21 |
22 | def test_log_equalization_is_registered(self):
23 | modifier = ModifierFactory.create(
24 | type_="LogarithmicEqualizationModifier",
25 | allow_experimental=False,
26 | allow_registered=True,
27 | **self.kwargs,
28 | )
29 |
30 | self.assertIsInstance(
31 | modifier,
32 | LogarithmicEqualizationModifier,
33 | "PyTorch LogarithmicEqualizationModifier not registered",
34 | )
35 |
36 | self.assertIsInstance(modifier, SmoothQuantModifier)
37 | self.assertEqual(modifier.smoothing_strength, self.kwargs["smoothing_strength"])
38 | self.assertEqual(modifier.mappings, self.kwargs["mappings"])
39 |
--------------------------------------------------------------------------------
/tests/llmcompressor/modifiers/pruning/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/modifiers/pruning/__init__.py
--------------------------------------------------------------------------------
/tests/llmcompressor/modifiers/pruning/sparsegpt/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/modifiers/pruning/sparsegpt/__init__.py
--------------------------------------------------------------------------------
/tests/llmcompressor/modifiers/pruning/sparsegpt/test_base.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import pytest
4 |
5 | from llmcompressor.modifiers.factory import ModifierFactory
6 | from llmcompressor.modifiers.obcq.base import SparseGPTModifier
7 | from tests.llmcompressor.modifiers.conf import setup_modifier_factory
8 |
9 |
10 | @pytest.mark.unit
11 | class TestSparseGPTIsRegistered(unittest.TestCase):
12 | def setUp(self):
13 | self.kwargs = dict(
14 | sparsity=0.5,
15 | targets="__ALL_PRUNABLE__",
16 | )
17 | setup_modifier_factory()
18 |
19 | def test_sparsegpt_is_registered(self):
20 | type_ = ModifierFactory.create(
21 | type_="SparseGPTModifier",
22 | allow_experimental=False,
23 | allow_registered=True,
24 | **self.kwargs,
25 | )
26 |
27 | self.assertIsInstance(
28 | type_,
29 | SparseGPTModifier,
30 | "PyTorch SparseGPTModifier not registered",
31 | )
32 |
--------------------------------------------------------------------------------
/tests/llmcompressor/modifiers/pruning/wanda/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/modifiers/pruning/wanda/__init__.py
--------------------------------------------------------------------------------
/tests/llmcompressor/modifiers/pruning/wanda/test_base.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import pytest
4 |
5 | from llmcompressor.modifiers.factory import ModifierFactory
6 | from llmcompressor.modifiers.pruning.wanda.base import WandaPruningModifier
7 | from tests.llmcompressor.modifiers.conf import setup_modifier_factory
8 |
9 |
10 | @pytest.mark.unit
11 | class TestWandaIsRegistered(unittest.TestCase):
12 | def setUp(self):
13 | self.kwargs = dict(
14 | sparsity=0.5,
15 | targets="__ALL_PRUNABLE__",
16 | )
17 | setup_modifier_factory()
18 |
19 | def test_wanda_is_registered(self):
20 | type_ = ModifierFactory.create(
21 | type_="WandaPruningModifier",
22 | allow_experimental=False,
23 | allow_registered=True,
24 | **self.kwargs,
25 | )
26 |
27 | self.assertIsInstance(
28 | type_,
29 | WandaPruningModifier,
30 | "PyTorch WandaPruningModifier not registered",
31 | )
32 |
--------------------------------------------------------------------------------
/tests/llmcompressor/modifiers/quantization/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/modifiers/quantization/__init__.py
--------------------------------------------------------------------------------
/tests/llmcompressor/modifiers/smoothquant/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/modifiers/smoothquant/__init__.py
--------------------------------------------------------------------------------
/tests/llmcompressor/modifiers/smoothquant/test_base.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import pytest
4 |
5 | from llmcompressor.modifiers.factory import ModifierFactory
6 | from llmcompressor.modifiers.smoothquant.base import SmoothQuantModifier
7 | from tests.llmcompressor.modifiers.conf import setup_modifier_factory
8 |
9 |
10 | @pytest.mark.unit
11 | class TestSmoothQuantIsRegistered(unittest.TestCase):
12 | def setUp(self):
13 | self.kwargs = dict(
14 | smoothing_strength=0.3,
15 | mappings=[(["layer1", "layer2"], "layer3")],
16 | )
17 | setup_modifier_factory()
18 |
19 | def test_smooth_quant_is_registered(self):
20 | modifier = ModifierFactory.create(
21 | type_="SmoothQuantModifier",
22 | allow_experimental=False,
23 | allow_registered=True,
24 | **self.kwargs,
25 | )
26 |
27 | self.assertIsInstance(
28 | modifier,
29 | SmoothQuantModifier,
30 | "PyTorch SmoothQuant not registered",
31 | )
32 |
33 | self.assertEqual(modifier.smoothing_strength, self.kwargs["smoothing_strength"])
34 | self.assertEqual(modifier.mappings, self.kwargs["mappings"])
35 |
36 |
37 | @pytest.mark.unit
38 | class TestSmoothQuantDefaults(unittest.TestCase):
39 | def setUp(self):
40 | setup_modifier_factory()
41 |
42 | def test_defaults(self):
43 | default_sq = SmoothQuantModifier()
44 | assert default_sq.smoothing_strength == 0.5
45 |
46 | def test_override_defaults(self):
47 | strength = 0.7
48 | dummy_map = [(["layer1", "layer2"], "layer3")]
49 | non_default_sq = SmoothQuantModifier(
50 | smoothing_strength=strength, mappings=dummy_map
51 | )
52 |
53 | assert non_default_sq.smoothing_strength == strength
54 | assert non_default_sq.mappings == dummy_map
55 |
--------------------------------------------------------------------------------
/tests/llmcompressor/modifiers/smoothquant/test_utils.py:
--------------------------------------------------------------------------------
1 | from unittest.mock import patch
2 |
3 | import pytest
4 |
5 | from llmcompressor.modifiers.smoothquant.utils import (
6 | get_layer_mappings_from_architecture,
7 | handle_mapping_resolution_errors,
8 | )
9 |
10 | smoothquant_utils = "llmcompressor.modifiers.smoothquant.utils"
11 |
12 |
13 | @pytest.mark.unit
14 | def test_handle_mapping_resolution_errors():
15 | README_LOCATION = (
16 | "https://github.com/vllm-project/llm-compressor/tree/main/"
17 | "src/llmcompressor/modifiers/smoothquant"
18 | )
19 |
20 | @handle_mapping_resolution_errors
21 | def func_that_raises_exception():
22 | raise ValueError("An error occurred")
23 |
24 | with pytest.raises(RuntimeError) as excinfo:
25 | func_that_raises_exception()
26 |
27 | assert "Error resolving mappings for given architecture." in str(excinfo.value)
28 | assert "Please refer to the README at" in str(excinfo.value)
29 | assert README_LOCATION in str(excinfo.value)
30 |
31 |
32 | @pytest.mark.unit
33 | @patch(
34 | f"{smoothquant_utils}.MAPPINGS_REGISTRY", {"arch1": "mapping1", "arch2": "mapping2"}
35 | )
36 | @patch(f"{smoothquant_utils}.DEFAULT_SMOOTHQUANT_MAPPINGS", "default_mapping")
37 | def test_get_layer_mappings_from_architecture():
38 | # Test when architecture is in MAPPINGS_REGISTRY
39 | assert get_layer_mappings_from_architecture("arch1") == "mapping1"
40 |
41 | # Test when architecture is not in MAPPINGS_REGISTRY
42 | assert get_layer_mappings_from_architecture("arch3") == "default_mapping"
43 |
--------------------------------------------------------------------------------
/tests/llmcompressor/observers/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing,
10 | # software distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/tests/llmcompressor/observers/test_mse.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing,
10 | # software distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | import pytest
17 | import torch
18 | from compressed_tensors.quantization.quant_args import QuantizationArgs
19 |
20 | from llmcompressor.observers import MovingAverageMSEObserver, Observer
21 |
22 |
23 | @pytest.mark.parametrize(
24 | "symmetric,expected_scale,expected_zero_point",
25 | [
26 | (True, 0.0078, 0),
27 | (False, 0.0039, -128),
28 | ],
29 | )
30 | def test_mse_observer(symmetric, expected_scale, expected_zero_point):
31 | tensor = torch.tensor([1.0, 1.0, 1.0, 1.0, 1.0])
32 | num_bits = 8
33 | weights = QuantizationArgs(num_bits=num_bits, symmetric=symmetric, observer="mse")
34 |
35 | observer = weights.observer
36 | observer = Observer.load_from_registry(observer, quantization_args=weights)
37 | scale, zero_point = observer(tensor)
38 |
39 | assert isinstance(observer, MovingAverageMSEObserver)
40 | assert round(scale.item(), 4) == expected_scale
41 | assert round(zero_point.item(), 4) == expected_zero_point
42 |
43 |
44 | def test_mse_observer_symmetric_scale_range():
45 | tensor = torch.rand(4, 4)
46 | tensor *= 127
47 |
48 | num_bits = 8
49 | weights = QuantizationArgs(num_bits=num_bits, symmetric=True, observer="mse")
50 |
51 | observer = weights.observer
52 | observer = Observer.load_from_registry(observer, quantization_args=weights)
53 | scale, zero_point = observer(tensor)
54 |
55 | # if symmetric, the scale is bounded by 2 * abs(-128) / 255 = 256 / 255 ≈ 1.0039
56 | assert round(scale.item(), 4) <= 1.0039
57 | assert round(zero_point.item(), 4) == 0
58 |
--------------------------------------------------------------------------------
/tests/llmcompressor/pipelines/sequential/test_helpers.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from llmcompressor.pipelines.sequential.helpers import get_sequential_ancestors
4 |
5 |
6 | class DummyModel(torch.nn.Module):
7 | def __init__(self):
8 | super().__init__()
9 | self.seq = torch.nn.Sequential(torch.nn.Linear(10, 20), torch.nn.ReLU())
10 | self.fc = torch.nn.Linear(20, 5)
11 |
12 | def forward(self, x):
13 | x = self.seq(x)
14 | return self.fc(x)
15 |
16 |
17 | def test_get_sequential_ancestors():
18 | model = DummyModel()
19 |
20 | assert get_sequential_ancestors(model, set()) == set()
21 | assert get_sequential_ancestors(model, {model}) == set()
22 | assert get_sequential_ancestors(model, {model.fc}) == {model}
23 | assert get_sequential_ancestors(model, {model.seq[0]}) == {model, model.seq}
24 | assert get_sequential_ancestors(model, {model.seq[1]}) == {model, model.seq}
25 |
--------------------------------------------------------------------------------
/tests/llmcompressor/pytorch/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 |
--------------------------------------------------------------------------------
/tests/llmcompressor/pytorch/modifiers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/pytorch/modifiers/__init__.py
--------------------------------------------------------------------------------
/tests/llmcompressor/pytorch/modifiers/logarithmic_equalization/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/pytorch/modifiers/logarithmic_equalization/__init__.py
--------------------------------------------------------------------------------
/tests/llmcompressor/pytorch/modifiers/logarithmic_equalization/test_pytorch.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import pytest
4 | from torch.nn import Linear
5 |
6 | from llmcompressor.core import State
7 | from llmcompressor.modifiers.logarithmic_equalization import (
8 | LogarithmicEqualizationModifier,
9 | )
10 | from tests.llmcompressor.pytorch.helpers import LinearNet
11 |
12 |
13 | @pytest.mark.unit
14 | class TestLogEqualizationMapping(unittest.TestCase):
15 | def setUp(self):
16 | self.model = LinearNet()
17 | self.state = State(model=self.model)
18 |
19 | def test_successful_map(self):
20 | mappings = [(["seq.fc2"], "seq.block1.fc1")]
21 | modifier = LogarithmicEqualizationModifier(mappings=mappings)
22 |
23 | modifier.ignore = []
24 | modifier.resolved_mappings_ = modifier._resolve_mappings(self.state.model)
25 |
26 | self.assertEqual(len(modifier.resolved_mappings_), len(mappings))
27 |
28 | mapping = modifier.resolved_mappings_[0]
29 | self.assertEqual(mapping.smooth_name, mappings[0][1])
30 | self.assertIsInstance(mapping.smooth_layer, Linear)
31 | self.assertIsInstance(mapping.balance_layers[0], Linear)
32 |
--------------------------------------------------------------------------------
/tests/llmcompressor/pytorch/modifiers/pruning/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/pytorch/modifiers/pruning/__init__.py
--------------------------------------------------------------------------------
/tests/llmcompressor/pytorch/modifiers/pruning/constant/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/pytorch/modifiers/pruning/constant/__init__.py
--------------------------------------------------------------------------------
/tests/llmcompressor/pytorch/modifiers/pruning/sparsegpt/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing,
10 | # software distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/tests/llmcompressor/pytorch/modifiers/pruning/wanda/test_pytorch.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import pytest
4 |
5 | from llmcompressor.modifiers.factory import ModifierFactory
6 | from tests.llmcompressor.modifiers.conf import setup_modifier_factory
7 |
8 |
9 | @pytest.mark.unit
10 | class TestWandaPytorchIsRegistered(unittest.TestCase):
11 | def setUp(self):
12 | self.kwargs = dict(
13 | sparsity=0.5,
14 | targets="__ALL_PRUNABLE__",
15 | )
16 | setup_modifier_factory()
17 |
18 | def test_wanda_pytorch_is_registered(self):
19 | from llmcompressor.modifiers.pruning.wanda import WandaPruningModifier
20 |
21 | type_ = ModifierFactory.create(
22 | type_="WandaPruningModifier",
23 | allow_experimental=False,
24 | allow_registered=True,
25 | **self.kwargs,
26 | )
27 |
28 | self.assertIsInstance(
29 | type_,
30 | WandaPruningModifier,
31 | "PyTorch ConstantPruningModifier not registered",
32 | )
33 |
--------------------------------------------------------------------------------
/tests/llmcompressor/pytorch/modifiers/smoothquant/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/pytorch/modifiers/smoothquant/__init__.py
--------------------------------------------------------------------------------
/tests/llmcompressor/pytorch/modifiers/smoothquant/test_pytorch.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import pytest
4 | from torch.nn import Linear
5 |
6 | from llmcompressor.core import State
7 | from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
8 | from tests.llmcompressor.pytorch.helpers import LinearNet
9 |
10 |
11 | @pytest.mark.unit
12 | class TestSmoothQuantMapping(unittest.TestCase):
13 | def setUp(self):
14 | self.model = LinearNet()
15 | self.state = State(model=self.model)
16 |
17 | def test_successful_map(self):
18 | mappings = [(["seq.fc1"], "seq.fc2")]
19 | modifier = SmoothQuantModifier(mappings=mappings)
20 |
21 | modifier.ignore = []
22 | modifier.resolved_mappings_ = modifier._resolve_mappings(self.state.model)
23 |
24 | self.assertEqual(len(modifier.resolved_mappings_), len(mappings))
25 |
26 | mapping = modifier.resolved_mappings_[0]
27 | self.assertEqual(mapping.smooth_name, mappings[0][1])
28 | self.assertIsInstance(mapping.smooth_layer, Linear)
29 | self.assertIsInstance(mapping.balance_layers[0], Linear)
30 |
--------------------------------------------------------------------------------
/tests/llmcompressor/pytorch/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 |
--------------------------------------------------------------------------------
/tests/llmcompressor/recipe/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/recipe/__init__.py
--------------------------------------------------------------------------------
/tests/llmcompressor/recipe/recipe.yaml:
--------------------------------------------------------------------------------
1 | quant_stage:
2 | quant_modifiers:
3 | SmoothQuantModifier:
4 | smoothing_strength: 0.8
5 | mappings:
6 | - - ['re:.*q_proj', 're:.*k_proj', 're:.*v_proj']
7 | - re:.*input_layernorm
8 | - - ['re:.*gate_proj', 're:.*up_proj']
9 | - re:.*post_attention_layernorm
10 | GPTQModifier:
11 | targets: ["Linear"]
12 | ignore: [lm_head]
13 | scheme: W8A8
14 |
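For orientation, a recipe file like the one above is consumed through the oneshot entrypoint rather than run on its own. The sketch below is illustrative only and is not a file from this repository; it assumes the small test stub and calibration dataset used elsewhere in these tests (nm-testing/llama2.c-stories15M, open_platypus) and mirrors the oneshot call that appears in the obcq tests further down.

from llmcompressor import oneshot

# Minimal sketch: apply the SmoothQuant + GPTQ W8A8 recipe above in one-shot mode.
# The model stub, dataset, sample count, and output directory are placeholder choices.
oneshot(
    model="nm-testing/llama2.c-stories15M",
    dataset="open_platypus",
    recipe="tests/llmcompressor/recipe/recipe.yaml",
    num_calibration_samples=32,
    output_dir="./oneshot_out",
)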
--------------------------------------------------------------------------------
/tests/llmcompressor/test_sentinel.py:
--------------------------------------------------------------------------------
1 | from llmcompressor.sentinel import Sentinel
2 |
3 |
4 | def test_sentinel():
5 | assert Sentinel("MISSING") == Sentinel("MISSING")
6 | assert Sentinel("MISSING", "module_one") != Sentinel("MISSING", "module_two")
7 |
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/transformers/__init__.py
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/compression/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/transformers/compression/__init__.py
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/compression/configs/actorder_group_1.1b.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_actorder_group.yaml"
5 | ppl_threshold: 20
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/compression/configs/actorder_weight_1.1b.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_actorder_weight.yaml"
5 | ppl_threshold: 20
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/compression/configs/channelwise_1.1b.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_channel.yaml"
5 | ppl_threshold: 20
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/compression/configs/channelwise_15m.yaml:
--------------------------------------------------------------------------------
1 | cadence: "commit"
2 | test_type: "regression"
3 | model_stub: "nm-testing/llama2.c-stories15M"
4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_channel.yaml"
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/compression/configs/fp8_1.1b.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_fp8.yaml"
5 | ppl_threshold: 20
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/compression/configs/fp8_15m.yaml:
--------------------------------------------------------------------------------
1 | cadence: "commit"
2 | test_type: "regression"
3 | model_stub: "nm-testing/llama2.c-stories15M"
4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_fp8.yaml"
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/compression/configs/group_1.1b.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_group.yaml"
5 | ppl_threshold: 20
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/compression/configs/inputs_1.1b.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_full.yaml"
5 | ppl_threshold: 20
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/compression/configs/inputs_15m.yaml:
--------------------------------------------------------------------------------
1 | cadence: "commit"
2 | test_type: "regression"
3 | model_stub: "nm-testing/llama2.c-stories15M"
4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_full.yaml"
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/compression/configs/weights_only_1.1b.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_weight.yaml"
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/compression/configs/weights_only_15m.yaml:
--------------------------------------------------------------------------------
1 | cadence: "commit"
2 | test_type: "regression"
3 | model_stub: "nm-testing/llama2.c-stories15M"
4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_weight.yaml"
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/compression/decompression_configs/fp8_dynamic.yaml:
--------------------------------------------------------------------------------
1 | cadence: "commit"
2 | test_type: "regression"
3 | compressed_model_stub: "nm-testing/tinyllama-fp8-dynamic-compressed"
4 | skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/compression/decompression_configs/w4a16.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | compressed_model_stub: "nm-testing/tinyllama-w4a16-compressed"
4 | skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/compression/decompression_configs/w8a16_dense.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | compressed_model_stub: "nm-testing/tinyllama-w8a16-dense"
4 | skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/compression/decompression_configs_skipped/w8a8.yaml:
--------------------------------------------------------------------------------
1 | cadence: "commit"
2 | test_type: "regression"
3 | compressed_model_stub: "nm-testing/tinyllama-w8a8-compressed"
4 | skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/compression/recipes/new_quant_actorder_group.yaml:
--------------------------------------------------------------------------------
1 | test_stage:
2 | quant_modifiers:
3 | QuantizationModifier:
4 | ignore: ["lm_head", "model.layers.0.mlp.down_proj"]
5 | config_groups:
6 | group_0:
7 | weights:
8 | num_bits: 4
9 | type: "int"
10 | symmetric: False
11 | strategy: "group"
12 | group_size: 128
13 | actorder: "group"
14 | input_activations: null
15 | output_activations: null
16 | targets: ["Linear"]
17 | GPTQModifier:
18 | block_size: 128
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/compression/recipes/new_quant_actorder_weight.yaml:
--------------------------------------------------------------------------------
1 | test_stage:
2 | quant_modifiers:
3 | QuantizationModifier:
4 | ignore: ["lm_head", "model.layers.0.mlp.down_proj"]
5 | config_groups:
6 | group_0:
7 | weights:
8 | num_bits: 4
9 | type: "int"
10 | symmetric: False
11 | strategy: "group"
12 | group_size: 128
13 | actorder: "weight"
14 | input_activations: null
15 | output_activations: null
16 | targets: ["Linear"]
17 | GPTQModifier:
18 | block_size: 128
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/compression/recipes/new_quant_channel.yaml:
--------------------------------------------------------------------------------
1 | test_stage:
2 | quant_modifiers:
3 | QuantizationModifier:
4 | ignore: ["lm_head", "model.layers.0.mlp.down_proj"]
5 | config_groups:
6 | group_0:
7 | weights:
8 | num_bits: 4
9 | type: "int"
10 | symmetric: False
11 | strategy: "channel"
12 | input_activations: null
13 | output_activations: null
14 | targets: ["Linear"]
15 | GPTQModifier:
16 | block_size: 128
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/compression/recipes/new_quant_fp8.yaml:
--------------------------------------------------------------------------------
1 | quant_stage:
2 | quant_modifiers:
3 | QuantizationModifier:
4 | ignore: ["lm_head"]
5 | config_groups:
6 | group_0:
7 | weights:
8 | num_bits: 8
9 | type: "float"
10 | symmetric: true
11 | strategy: channel
12 | targets: ["Linear"]
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/compression/recipes/new_quant_full.yaml:
--------------------------------------------------------------------------------
1 | test_stage:
2 | quant_modifiers:
3 | GPTQModifier:
4 | block_size: 128
5 | ignore: ["lm_head", "model.layers.0.mlp.down_proj"]
6 | config_groups:
7 | group_0:
8 | weights:
9 | num_bits: 8
10 | type: "int"
11 | symmetric: false
12 | strategy: "channel"
13 | input_activations:
14 | num_bits: 8
15 | type: "int"
16 | symmetric: false
17 | strategy: "tensor"
18 | output_activations: null
19 | targets: ["Linear"]
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/compression/recipes/new_quant_group.yaml:
--------------------------------------------------------------------------------
1 | test_stage:
2 | quant_modifiers:
3 | QuantizationModifier:
4 | ignore: ["lm_head", "model.layers.0.mlp.down_proj"]
5 | config_groups:
6 | group_0:
7 | weights:
8 | num_bits: 4
9 | type: "int"
10 | symmetric: False
11 | strategy: "group"
12 | group_size: 128
13 | input_activations: null
14 | output_activations: null
15 | targets: ["Linear"]
16 | GPTQModifier:
17 | block_size: 128
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/compression/recipes/new_quant_simple.yaml:
--------------------------------------------------------------------------------
1 | test_stage:
2 | quant_modifiers:
3 | QuantizationModifier:
4 | ignore: ["lm_head"]
5 | config_groups:
6 | group_0:
7 | weights:
8 | num_bits: 8
9 | type: "int"
10 | symmetric: true
11 | strategy: "tensor"
12 | input_activations:
13 | num_bits: 8
14 | type: "int"
15 | symmetric: false
16 | strategy: "tensor"
17 | output_activations: null
18 | targets: ["Linear"]
19 | group_1:
20 | weights:
21 | num_bits: 8
22 | type: "int"
23 | symmetric: true
24 | strategy: "tensor"
25 | input_activations: null
26 | output_activations: null
27 | targets: ["Embedding"]
28 |
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/compression/recipes/new_quant_weight.yaml:
--------------------------------------------------------------------------------
1 | test_stage:
2 | quant_modifiers:
3 | QuantizationModifier:
4 | ignore: ["lm_head", "model.layers.0.mlp.down_proj"]
5 | config_groups:
6 | group_0:
7 | weights:
8 | num_bits: 8
9 | type: "int"
10 | symmetric: true
11 | strategy: "tensor"
12 | input_activations: null
13 | output_activations: null
14 | targets: ["Linear", "Embedding"]
15 | GPTQModifier:
16 | block_size: 128
17 | targets: ["re:model.layers.\\d+$"]
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/compression/recipes/sparse_24.yaml:
--------------------------------------------------------------------------------
1 | pruning_stage:
2 | obcq_modifiers:
3 | SparseGPTModifier:
4 | sparsity: 0.5
5 | mask_structure: "2:4"
6 | targets: ["Linear"]
7 | ignore: ["re:.*lm_head"]
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/compression/recipes/sparse_24_fp8.yaml:
--------------------------------------------------------------------------------
1 | pruning_stage:
2 | obcq_modifiers:
3 | SparseGPTModifier:
4 | sparsity: 0.5
5 | mask_structure: "2:4"
6 | targets: ["Linear"]
7 | ignore: ["re:.*lm_head"]
8 | quant_stage:
9 | quant_modifiers:
10 | QuantizationModifier:
11 | ignore: ["lm_head"]
12 | config_groups:
13 | group_0:
14 | weights:
15 | num_bits: 8
16 | type: float
17 | strategy: channel
18 | dynamic: false
19 | symmetric: true
20 | input_activations:
21 | num_bits: 8
22 | type: float
23 | strategy: token
24 | dynamic: true
25 | symmetric: true
26 | targets: ["Linear"]
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml:
--------------------------------------------------------------------------------
1 | cadence: "commit"
2 | test_type: "regression"
3 | compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-Dynamic-compressed
4 | uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-Dynamic-uncompressed
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-compressed
4 | uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-uncompressed
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16.yaml:
--------------------------------------------------------------------------------
1 | cadence: "commit"
2 | test_type: "regression"
3 | compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A16-G128-compressed
4 | uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A16-G128-uncompressed
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/compression/run_compressed_configs_skipped/w8a8.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Per-Token-compressed
4 | uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Per-Token-uncompressed
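The run_compressed configs above pair each compressed checkpoint stub with its uncompressed counterpart. As a hedged sketch of how such a pair might be loaded for comparison (not code taken from this test suite), a recent transformers release with compressed-tensors support can decompress the compressed stub at load time; the import path and the CompressedTensorsConfig option shown here are assumptions, not guarantees.

from transformers import AutoModelForCausalLM
from transformers.utils.quantization_config import CompressedTensorsConfig

# Assumed loading path: decompress the compressed stub on load so its weights
# can be compared against the uncompressed reference checkpoint.
compressed = AutoModelForCausalLM.from_pretrained(
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A16-G128-compressed",
    torch_dtype="auto",
    quantization_config=CompressedTensorsConfig(run_compressed=False),
)
uncompressed = AutoModelForCausalLM.from_pretrained(
    "nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A16-G128-uncompressed",
    torch_dtype="auto",
)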
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/compression/test_has_gpu.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import pytest
4 | import torch
5 |
6 |
7 | @pytest.mark.skipif(os.getenv("GITHUB_ACTIONS") != "true", reason="Only run for GHA")
8 | def test_has_gpu():
9 | """
10 | This test exists purely to raise an error if
11 | a runner performs transformers tests without a GPU
12 | """
13 | assert torch.cuda.is_available()
14 |
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/compression/test_infer_quant_format.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from compressed_tensors.quantization import preset_name_to_scheme
3 |
4 | from llmcompressor.transformers.compression.quantization_format import (
5 | infer_quantization_format,
6 | )
7 | from tests.llmcompressor.pytorch.helpers import LinearNet
8 |
9 |
10 | @pytest.mark.parametrize(
11 | "preset,sparsity_structure,expected_format",
12 | [
13 | ["W8A8", "unstructured", "int-quantized"],
14 | ["W8A16", "unstructured", "pack-quantized"],
15 | ["W8A16", "2:4", "marlin-24"],
16 | ["W4A16", "unstructured", "pack-quantized"],
17 | ["W4A16", "2:4", "marlin-24"],
18 | ["FP8", "unstructured", "float-quantized"],
19 | ],
20 | )
21 | def test_infer_quant_format(preset, sparsity_structure, expected_format):
22 | quant_scheme = preset_name_to_scheme(preset, targets=["Linear"])
23 |
24 | dummy_model = LinearNet()
25 | for _, module in dummy_model.named_modules():
26 | module.quantization_scheme = quant_scheme
27 |
28 | inferred_format = infer_quantization_format(
29 | dummy_model, save_compressed=True, sparsity_structure=sparsity_structure
30 | )
31 | assert inferred_format.value == expected_format
32 |
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/conftest.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import pytest
4 |
5 |
6 | @pytest.fixture(autouse=True)
7 | def run_before_and_after_tests(tmp_path):
8 | os.environ["TRANSFORMERS_CACHE"] = str(tmp_path / "transformers")
9 | os.environ["HF_DATASETS_CACHE"] = str(tmp_path / "datasets")
10 | yield
11 |
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/finetune/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/transformers/finetune/__init__.py
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/finetune/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/transformers/finetune/data/__init__.py
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/finetune/data/conftest.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from transformers import AutoTokenizer
3 |
4 | from llmcompressor.args import ModelArguments
5 |
6 |
7 | @pytest.fixture
8 | def tiny_llama_path():
9 | return "nm-testing/llama2.c-stories15M"
10 |
11 |
12 | @pytest.fixture
13 | def tiny_llama_model_args(tiny_llama_path):
14 | return ModelArguments(model=tiny_llama_path)
15 |
16 |
17 | @pytest.fixture
18 | def tiny_llama_tokenizer(tiny_llama_model_args):
19 | tokenizer = AutoTokenizer.from_pretrained(
20 | tiny_llama_model_args.model,
21 | cache_dir=tiny_llama_model_args.cache_dir,
22 | use_fast=True,
23 | revision=tiny_llama_model_args.model_revision,
24 | use_auth_token=True if tiny_llama_model_args.use_auth_token else None,
25 | )
26 | return tokenizer
27 |
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/finetune/data/test_dataset_helpers.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from llmcompressor.args import DatasetArguments
4 | from llmcompressor.datasets import make_dataset_splits
5 | from llmcompressor.transformers.finetune.data.data_helpers import get_raw_dataset
6 |
7 |
8 | @pytest.mark.unit
9 | def test_combined_datasets():
10 | dataset_args = DatasetArguments(
11 | dataset="wikitext", dataset_config_name="wikitext-2-raw-v1"
12 | )
13 | raw_wikitext2 = get_raw_dataset(dataset_args)
14 | datasets = {"all": raw_wikitext2}
15 | split_datasets = make_dataset_splits(datasets, do_train=True)
16 | assert split_datasets.get("train") is not None
17 |
18 | split_datasets = make_dataset_splits(datasets, do_train=True)
19 | assert split_datasets.get("train") is not None
20 |
21 |
22 | @pytest.mark.unit
23 | def test_separate_datasets():
24 | splits = {"train": "train[:5%]", "validation": "train[10%:20%]"}
25 | dataset_args = DatasetArguments(
26 | dataset="wikitext", dataset_config_name="wikitext-2-raw-v1"
27 | )
28 | datasets = {}
29 | for split_name, split_str in splits.items():
30 | raw_wikitext2 = get_raw_dataset(dataset_args, split=split_str)
31 | datasets[split_name] = raw_wikitext2
32 |
33 | split_datasets = make_dataset_splits(datasets, do_train=True)
34 | assert split_datasets.get("train") is not None
35 |
36 | with pytest.raises(ValueError):
37 | # fails because the train split was removed while do_train=True
38 |
39 | datasets.pop("train")
40 | split_datasets = make_dataset_splits(datasets, do_train=True)
41 |
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/finetune/finetune_custom/config1.yaml:
--------------------------------------------------------------------------------
1 | cadence: "commit"
2 | test_type: "sanity"
3 | model: "nm-testing/llama2.c-stories15M"
4 | file_extension: json
5 | num_train_epochs: 1
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/finetune/finetune_custom/config2.yaml:
--------------------------------------------------------------------------------
1 | cadence: "commit"
2 | test_type: "sanity"
3 | model: "nm-testing/llama2.c-stories15M"
4 | file_extension: csv
5 | num_train_epochs: 1
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/finetune/finetune_custom/gpu/gpu_config.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: "neuralmagic/Llama-2-7b-ultrachat200k"
4 | file_extension: json
5 | num_train_epochs: 0.5
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/finetune/finetune_generic/config1.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: "nm-testing/llama2.c-stories15M"
4 | dataset: open_platypus
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/finetune/finetune_oneshot_configs/config.yaml:
--------------------------------------------------------------------------------
1 | cadence: "commit"
2 | test_type: "sanity"
3 | model: "nm-testing/llama2.c-stories15M"
4 | dataset: wikitext
5 | dataset_config_name: "wikitext-2-raw-v1"
6 | recipe: "tests/llmcompressor/transformers/finetune/test_alternate_recipe.yaml"
7 | num_train_epochs: 0.25
8 | concat_txt: False
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/finetune/finetune_oneshot_configs/gpu/gpu_config.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: "neuralmagic/Llama-2-7b-ultrachat200k"
4 | dataset: "ultrachat-200k"
5 | recipe: "tests/llmcompressor/transformers/finetune/test_alternate_recipe.yaml"
6 | num_train_epochs: 0.05
7 | concat_txt: False
8 |
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/finetune/finetune_tokenizer/config1.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
4 | dataset_config_name: wikitext-2-raw-v1
5 | dataset: wikitext
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/finetune/test_alternate_recipe.yaml:
--------------------------------------------------------------------------------
1 | test_oneshot_stage:
2 | obcq_modifiers:
3 | SparseGPTModifier:
4 | sparsity: 0.7
5 | block_size: 128
6 | percdamp: 0.01
7 | mask_structure: "0:0"
8 | targets: ["Linear"]
9 | ignore: ["re:.*lm_head"]
10 | test_train_stage:
11 | pruning_modifiers:
12 | ConstantPruningModifier:
13 | targets: [
14 | "re:.*self_attn.q_proj",
15 | "re:.*self_attn.k_proj",
16 | "re:.*self_attn.v_proj",
17 | "re:.*self_attn.o_proj",
18 | "re:.*mlp.down_proj",
19 | "re:.*mlp.gate_proj",
20 | "re:.*mlp.up_proj"
21 | ]
22 | start: 0
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/finetune/test_finetune_recipe.yaml:
--------------------------------------------------------------------------------
1 | test_stage:
2 | pruning_modifiers:
3 | ConstantPruningModifier:
4 | targets: [
5 | "re:.*self_attn.q_proj",
6 | "re:.*self_attn.k_proj",
7 | "re:.*self_attn.v_proj",
8 | "re:.*self_attn.o_proj",
9 | "re:.*mlp.gate_proj",
10 | "re:.*mlp.up_proj"
11 | ]
12 | start: 0
13 | distillation_modifiers:
14 | OutputDistillationModifier:
15 | targets: ["re:model.layers.\\d+$"]
16 | comparison: "square_head"
17 | start: 0
18 | orig_scale: 1.0
19 | distill_scale: 1.0
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/finetune/test_finetune_without_recipe.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 | import unittest
4 |
5 | import pytest
6 | from parameterized import parameterized_class
7 |
8 | from tests.testing_utils import parse_params, requires_gpu
9 |
10 | CONFIGS_DIRECTORY = "tests/llmcompressor/transformers/finetune/finetune_generic"
11 |
12 |
13 | @pytest.mark.integration
14 | @requires_gpu
15 | @parameterized_class(parse_params(CONFIGS_DIRECTORY))
16 | class TestFinetuneWithoutRecipe(unittest.TestCase):
17 | model = None
18 | dataset = None
19 |
20 | def setUp(self):
21 | self.output = "./finetune_output"
22 |
23 | def test_finetune_without_recipe(self):
24 | from llmcompressor import train
25 |
26 | recipe_str = None
27 | device = "cuda:0"
28 |
29 | concatenate_data = False
30 | max_steps = 50
31 | splits = "train"
32 |
33 | train(
34 | model=self.model,
35 | dataset=self.dataset,
36 | output_dir=self.output,
37 | recipe=recipe_str,
38 | max_steps=max_steps,
39 | concatenate_data=concatenate_data,
40 | splits=splits,
41 | oneshot_device=device,
42 | )
43 |
44 | def tearDown(self):
45 | if os.path.isdir(self.output):
46 | shutil.rmtree(self.output)
47 |
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/finetune/test_quantization.yaml:
--------------------------------------------------------------------------------
1 | test_stage:
2 | quant_modifiers:
3 | QuantizationModifier:
4 | ignore:
5 | - model.layers.0.mlp.down_proj
6 | - model.layers.1.mlp.down_proj
7 | - model.layers.2.mlp.down_proj
8 | - model.layers.3.mlp.down_proj
9 | - model.layers.4.mlp.down_proj
10 | - model.layers.5.mlp.down_proj
11 | config_groups:
12 | group_0:
13 | weights:
14 | num_bits: 8
15 | type: "int"
16 | symmetric: False
17 | strategy: "tensor"
18 | input_activations: null
19 | output_activations: null
20 | targets: ["Linear"]
21 | pruning_modifiers:
22 | ConstantPruningModifier:
23 | targets: [
24 | "re:.*self_attn.q_proj",
25 | "re:.*self_attn.k_proj",
26 | "re:.*self_attn.v_proj",
27 | "re:.*self_attn.o_proj",
28 | "re:.*mlp.gate_proj",
29 | "re:.*mlp.up_proj"
30 | ]
31 | start: 0
32 |
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/finetune/test_safetensors.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 | import unittest
4 | from pathlib import Path
5 |
6 | import pytest
7 | from parameterized import parameterized_class
8 |
9 | from tests.testing_utils import parse_params, requires_gpu
10 |
11 | CONFIGS_DIRECTORY = "tests/llmcompressor/transformers/finetune/finetune_generic"
12 |
13 |
14 | @pytest.mark.integration
15 | @requires_gpu
16 | @parameterized_class(parse_params(CONFIGS_DIRECTORY))
17 | class TestSafetensors(unittest.TestCase):
18 | model = None
19 | dataset = None
20 |
21 | def setUp(self):
22 | self.output = Path("./finetune_output")
23 |
24 | def test_safetensors(self):
25 | from llmcompressor import train
26 |
27 | device = "cuda:0"
28 | output_dir = self.output / "output1"
29 | max_steps = 10
30 | splits = {"train": "train[:10%]"}
31 |
32 | train(
33 | model=self.model,
34 | dataset=self.dataset,
35 | output_dir=output_dir,
36 | max_steps=max_steps,
37 | splits=splits,
38 | oneshot_device=device,
39 | )
40 |
41 | assert os.path.exists(output_dir / "model.safetensors")
42 | assert not os.path.exists(output_dir / "pytorch_model.bin")
43 |
44 | # test we can also load
45 | new_output_dir = self.output / "output2"
46 | train(
47 | model=output_dir,
48 | dataset=self.dataset,
49 | output_dir=new_output_dir,
50 | max_steps=max_steps,
51 | splits=splits,
52 | oneshot_device=device,
53 | )
54 |
55 | def tearDown(self):
56 | if os.path.isdir(self.output):
57 | shutil.rmtree(self.output)
58 |
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/obcq/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing,
10 | # software distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/obcq/obcq_configs/completion/gpu/llama_7b_quant.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: "meta-llama/Llama-2-7b-hf"
4 | dataset: open_platypus
5 | recipe: "tests/llmcompressor/transformers/obcq/recipes/quant.yaml"
6 | device: "cuda:0"
7 | num_samples: 512
8 | perplexity: 20
9 |
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/obcq/obcq_configs/completion/gpu/llama_7b_quant_and_sparse.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: "meta-llama/Llama-2-7b-hf"
4 | dataset: open_platypus
5 | recipe: "tests/llmcompressor/transformers/obcq/recipes/quant_and_sparse.yaml"
6 | device: "cuda:0"
7 | num_samples: 512
8 | perplexity: 20
9 |
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/obcq/obcq_configs/completion/gpu/llama_7b_sparse.yml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: "meta-llama/Llama-2-7b-hf"
4 | dataset: open_platypus
5 | recipe: "tests/llmcompressor/transformers/obcq/recipes/sparse.yaml"
6 | device: "cuda:0"
7 | num_samples: 512
8 | perplexity: 20
9 |
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/obcq/obcq_configs/completion/tiny_llama_quant.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "sanity"
3 | model: "nm-testing/llama2.c-stories15M"
4 | dataset: open_platypus
5 | recipe: "tests/llmcompressor/transformers/obcq/recipes/quant.yaml"
6 | num_samples: 32
7 | perplexity: 5000
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/obcq/obcq_configs/completion/tiny_llama_quant_and_sparse.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "sanity"
3 | model: "nm-testing/llama2.c-stories15M"
4 | dataset: open_platypus
5 | recipe: "tests/llmcompressor/transformers/obcq/recipes/quant_and_sparse.yaml"
6 | num_samples: 32
7 | perplexity: 5000
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/obcq/obcq_configs/consec_runs/gpu/llama_consec_runs.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: "meta-llama/Llama-2-7b-hf"
4 | dataset: open_platypus
5 | first_recipe: "tests/llmcompressor/transformers/obcq/recipes/quant_and_sparse.yaml"
6 | second_recipe: "tests/llmcompressor/transformers/obcq/recipes/additional_sparsity.yaml"
7 | device: "cuda:0"
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/obcq/obcq_configs/consec_runs/tiny_llama_consec_runs.yaml:
--------------------------------------------------------------------------------
1 | cadence: "commit"
2 | test_type: "sanity"
3 | model: "nm-testing/llama2.c-stories15M"
4 | dataset: open_platypus
5 | first_recipe: "tests/llmcompressor/transformers/obcq/recipes/quant_and_sparse.yaml"
6 | second_recipe: "tests/llmcompressor/transformers/obcq/recipes/additional_sparsity.yaml"
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/obcq/obcq_configs/mask_structure/tiny_llama_mask_structure_preservation.yaml:
--------------------------------------------------------------------------------
1 | cadence: "commit"
2 | test_type: "sanity"
3 | model: "nm-testing/llama2.c-stories15M"
4 | dataset: open_platypus
5 | initial_pruning_only_recipe: "tests/llmcompressor/transformers/obcq/recipes/sparse_with_mask_structure.yaml"
6 | initial_sparsity: 0.5
7 | recipe_mask_structure: "2:4"
8 | subsequent_prune_and_quant_recipe: "tests/llmcompressor/transformers/obcq/recipes/additional_sparsity_with_quant.yaml"
9 | final_sparsity: 0.7
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/obcq/obcq_configs/sparse/gpu/llama_7b_sparse.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: "meta-llama/Llama-2-7b-hf"
4 | dataset: open_platypus
5 | recipe: "tests/llmcompressor/transformers/obcq/recipes/sparse.yaml"
6 | sparsity: 0.3
7 | device: "cuda:0"
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/obcq/obcq_configs/sparse/tiny_llama_sparse.yaml:
--------------------------------------------------------------------------------
1 | cadence: "commit"
2 | test_type: "sanity"
3 | model: "nm-testing/llama2.c-stories15M"
4 | dataset: open_platypus
5 | recipe: "tests/llmcompressor/transformers/obcq/recipes/sparse.yaml"
6 | sparsity: 0.3
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/obcq/obcq_configs/sparsity_generic/config.yaml:
--------------------------------------------------------------------------------
1 | cadence: "nightly"
2 | test_type: "regression"
3 | model: "nm-testing/llama2.c-stories15M"
4 | dataset: open_platypus
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/obcq/recipes/additional_sparsity.yaml:
--------------------------------------------------------------------------------
1 | test_stage:
2 | obcq_modifiers:
3 | SparseGPTModifier:
4 | sparsity: 0.7
5 | block_size: 128
6 | percdamp: 0.01
7 | mask_structure: "0:0"
8 | targets: ["re:.*model.layers.0$"]
9 | preserve_sparsity_mask: True
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/obcq/recipes/additional_sparsity_with_quant.yaml:
--------------------------------------------------------------------------------
1 | test_stage:
2 | obcq_modifiers:
3 | SparseGPTModifier:
4 | sparsity: 0.7
5 | block_size: 128
6 | percdamp: 0.01
7 | mask_structure: "0:0"
8 | targets: [
9 | "re:.*model.layers.0$",
10 | ]
11 | preserve_sparsity_mask: True
12 | GPTQModifier:
13 | config_groups:
14 | group_0:
15 | weights:
16 | num_bits: 8
17 | type: "int"
18 | strategy: "channel"
19 | targets: [
20 | "re:.*model.layers.0.self_attn.q_proj",
21 | ]
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/obcq/recipes/quant.yaml:
--------------------------------------------------------------------------------
1 | test_stage:
2 | obcq_modifiers:
3 | SmoothQuantModifier:
4 | smoothing_strength: 0.6
5 | GPTQModifier:
6 | block_size: 128
7 | percdamp: 0.01
8 | config_groups:
9 | group_0:
10 | weights:
11 | num_bits: 8
12 | input_activations:
13 | num_bits: 8
14 | targets: ["Linear"]
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/obcq/recipes/quant_and_sparse.yaml:
--------------------------------------------------------------------------------
1 | test_stage:
2 | obcq_modifiers:
3 | GPTQModifier:
4 | ignore: [lm_head]
5 | config_groups:
6 | group_0:
7 | weights:
8 | num_bits: 8
9 | type: "int"
10 | strategy: "channel"
11 | targets: [Linear]
12 | SparseGPTModifier:
13 | sparsity: 0.5
14 | block_size: 128
15 | percdamp: 0.01
16 | mask_structure: "0:0"
17 | targets: ["re:.*model.layers.0$"]
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/obcq/recipes/sparse.yaml:
--------------------------------------------------------------------------------
1 | test_stage:
2 | obcq_modifiers:
3 | SparseGPTModifier:
4 | sparsity: 0.3
5 | block_size: 128
6 | percdamp: 0.01
7 | targets: ["model.layers.0", "model.layers.1"]
8 | mask_structure: "0:0"
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/obcq/recipes/sparse_with_mask_structure.yaml:
--------------------------------------------------------------------------------
1 | test_stage:
2 | obcq_modifiers:
3 | SparseGPTModifier:
4 | sparsity: 0.5
5 | block_size: 128
6 | percdamp: 0.01
7 | mask_structure: "2:4"
8 | targets: [
9 | "re:.*model.layers.0$",
10 | ]
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/obcq/recipes/test_tiny2.yaml:
--------------------------------------------------------------------------------
1 | test_stage:
2 | obcq_modifiers:
3 | SparseGPTModifier:
4 | sparsity: 0.5
5 | block_size: 128
6 | percdamp: 0.01
7 | mask_structure: "0:0"
8 | targets: [
9 | "model.layers.0",
10 | "model.layers.1",
11 | "model.layers.2",
12 | "model.layers.3",
13 | "model.layers.4",
14 | "model.layers.5"
15 | ]
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/obcq/test_obcq_infer_targets.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from accelerate import init_empty_weights
3 | from transformers import AutoModelForCausalLM
4 |
5 | from llmcompressor.modifiers.obcq import SparseGPTModifier
6 |
7 |
8 | @pytest.mark.integration
9 | def test_infer_targets():
10 | modifier = SparseGPTModifier(sparsity=0.0)
11 | with init_empty_weights():
12 | model = AutoModelForCausalLM.from_pretrained("nm-testing/llama2.c-stories15M")
13 |
14 | inferred = modifier._infer_sequential_targets(model)
15 | assert inferred == ["LlamaDecoderLayer"]
16 |
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/obcq/test_obcq_lm_head.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from unittest.mock import MagicMock
3 |
4 | import pytest
5 |
6 | from llmcompressor.core.state import State
7 | from llmcompressor.modifiers.obcq import SparseGPTModifier
8 |
9 |
10 | @pytest.mark.integration
11 | class TestLMHead(unittest.TestCase):
12 | def setUp(self):
13 | import torch
14 | from transformers import AutoModelForCausalLM
15 |
16 | self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
17 |
18 | self.model = AutoModelForCausalLM.from_pretrained(
19 | "nm-testing/llama2.c-stories15M", device_map=self.device
20 | )
21 |
22 | self.kwargs = {
23 | "sparsity": 0.5,
24 | "block_size": 128,
25 | "quantize": False,
26 | "targets": [
27 | "model.layers.0",
28 | "model.layers.1",
29 | "model.layers.2",
30 | "model.layers.3",
31 | "model.layers.4",
32 | "model.layers.5",
33 | ],
34 | }
35 |
36 | dataset = MagicMock()
37 | dataset.column_names = []
38 | self.dataloader = MagicMock()
39 | self.dataloader.dataset = dataset
40 | self.dataloader.__iter__.return_value = iter([])
41 |
42 | def test_no_lm_head_target(self):
43 | modifier = SparseGPTModifier(**self.kwargs)
44 |
45 | state = State()
46 | state.update(model=self.model, device=self.device, calib_data=self.dataloader)
47 | modifier.initialize(state)
48 | modifier.on_start(state, None)
49 |
50 | assert len(self.model.lm_head._forward_hooks) <= 0
51 |
52 | modifier.finalize(state)
53 |
54 | def test_lm_head_target(self):
55 | self.kwargs["targets"].append("lm_head")
56 | modifier = SparseGPTModifier(**self.kwargs)
57 |
58 | state = State()
59 | state.update(model=self.model, device=self.device, calib_data=self.dataloader)
60 | modifier.initialize(state)
61 | modifier.on_start(state, None)
62 |
63 | assert len(self.model.lm_head._forward_hooks) == 1
64 |
65 | modifier.finalize(state)
66 |
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/obcq/test_obcq_owl.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import torch
3 | from datasets import Dataset
4 | from transformers import AutoModelForCausalLM
5 |
6 | from llmcompressor.core.session_functions import create_session
7 | from llmcompressor.datasets import format_calibration_data
8 | from llmcompressor.modifiers.obcq import SparseGPTModifier
9 | from llmcompressor.utils.pytorch.module import get_layers
10 |
11 |
12 | @pytest.mark.integration
13 | def test_infer_owl_layer_sparsity():
14 | target_sparsity = 0.7
15 | vocab_size = 512
16 | seq_len = 2048
17 | ds_size = 16
18 |
19 | with create_session() as session:
20 | session.initialize()
21 | modifier = SparseGPTModifier(
22 | sparsity=0.7, sparsity_profile="owl", owl_m=5, owl_lmbda=0.05
23 | )
24 | model = AutoModelForCausalLM.from_pretrained("nm-testing/llama2.c-stories15M")
25 |
26 | dataset = Dataset.from_dict(
27 | {"input_ids": torch.randint(0, vocab_size, (ds_size, seq_len))}
28 | )
29 | dataloader = format_calibration_data(dataset)
30 |
31 | sequential_targets = modifier._infer_sequential_targets(model)
32 | layers = get_layers(sequential_targets, model)
33 | sparsities = modifier._infer_owl_layer_sparsity(model, layers, dataloader)
34 | assert sparsities.keys() == layers.keys()
35 |
36 | for sparsity in sparsities.values():
37 | assert sparsity == pytest.approx(target_sparsity, abs=0.1)
38 |
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/obcq/test_oneshot_with_modifier.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 | import unittest
4 | from pathlib import Path
5 |
6 | import pytest
7 | from parameterized import parameterized_class
8 |
9 | from tests.testing_utils import parse_params, requires_gpu
10 |
11 | CONFIGS_DIRECTORY = (
12 | "tests/llmcompressor/transformers/obcq/obcq_configs/sparsity_generic"
13 | )
14 |
15 |
16 | @pytest.mark.integration
17 | @requires_gpu
18 | @parameterized_class(parse_params(CONFIGS_DIRECTORY))
19 | class TestOneshotWithModifierObject(unittest.TestCase):
20 | model = None
21 | dataset = None
22 |
23 | def setUp(self):
24 | self.output = Path("./finetune_output")
25 |
26 | def test_oneshot_with_modifier_object(self):
27 | from llmcompressor import oneshot
28 | from llmcompressor.modifiers.obcq.base import SparseGPTModifier
29 |
30 | recipe_str = [
31 | SparseGPTModifier(sparsity=0.5, targets=[r"re:model.layers.\d+$"])
32 | ]
33 |
34 | device = "cuda:0"
35 | concatenate_data = False
36 | num_calibration_samples = 64
37 | output_dir = self.output / "oneshot_out"
38 | splits = {"calibration": "train[:10%]"}
39 |
40 | oneshot(
41 | model=self.model,
42 | dataset=self.dataset,
43 | output_dir=output_dir,
44 | num_calibration_samples=num_calibration_samples,
45 | recipe=recipe_str,
46 | concatenate_data=concatenate_data,
47 | splits=splits,
48 | oneshot_device=device,
49 | )
50 |
51 | def tearDown(self):
52 | if os.path.isdir(self.output):
53 | shutil.rmtree(self.output)
54 |
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/oneshot/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/transformers/oneshot/__init__.py
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/oneshot/oneshot_configs/recipes/recipe.yaml:
--------------------------------------------------------------------------------
1 | test_stage:
2 | obcq_modifiers:
3 | SparseGPTModifier:
4 | sparsity: 0.5
5 | block_size: 128
6 | targets: [
7 | 're:model.layers.3.mlp.gate_proj.weight'
8 | ]
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_stories_conf1.yaml:
--------------------------------------------------------------------------------
1 | cadence: "commit"
2 | test_type: "smoke"
3 | tokenize: False
4 | model: "nm-testing/llama2.c-stories15M"
5 | dataset: open_platypus
6 | recipe: |
7 | test_stage:
8 | obcq_modifiers:
9 | SparseGPTModifier:
10 | sparsity: 0.5
11 | block_size: 128
12 | targets: [
13 | 're:model.layers.3.mlp.gate_proj.weight'
14 | ]
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_stories_conf2.yaml:
--------------------------------------------------------------------------------
1 | cadence: "commit"
2 | test_type: "smoke"
3 | tokenize: False
4 | model: "nm-testing/llama2.c-stories15M"
5 | dataset: open_platypus
6 | recipe: "tests/llmcompressor/transformers/oneshot/oneshot_configs/recipes/recipe.yaml"
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_stories_conf3.yaml:
--------------------------------------------------------------------------------
1 | cadence: "commit"
2 | test_type: "smoke"
3 | tokenize: False
4 | model: "nm-testing/llama2.c-stories15M"
5 | dataset: "gsm8k"
6 | dataset_config_name: "main"
7 | recipe: "tests/llmcompressor/transformers/oneshot/oneshot_configs/recipes/recipe.yaml"
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_stories_conf4.yaml:
--------------------------------------------------------------------------------
1 | cadence: "commit"
2 | test_type: "smoke"
3 | tokenize: False
4 | model: "nm-testing/llama2.c-stories15M"
5 | dataset: "gsm8k"
6 | dataset_config_name: "main"
7 | recipe: |
8 | test_stage:
9 | obcq_modifiers:
10 | SparseGPTModifier:
11 | sparsity: 0.5
12 | block_size: 128
13 | targets: [
14 | 're:model.layers.3.mlp.gate_proj.weight'
15 | ]
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_stories_conf5.yaml:
--------------------------------------------------------------------------------
1 | cadence: "commit"
2 | test_type: "smoke"
3 | tokenize: True
4 | model: "nm-testing/llama2.c-stories15M"
5 | dataset: open_platypus
6 | recipe: "tests/llmcompressor/transformers/oneshot/oneshot_configs/recipes/recipe.yaml"
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_stories_conf6.yaml:
--------------------------------------------------------------------------------
1 | cadence: "commit"
2 | test_type: "smoke"
3 | tokenize: True
4 | model: "nm-testing/llama2.c-stories15M"
5 | dataset: "gsm8k"
6 | recipe: "tests/llmcompressor/transformers/oneshot/oneshot_configs/recipes/recipe.yaml"
--------------------------------------------------------------------------------
/tests/llmcompressor/transformers/sparsification/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing,
10 | # software distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/tests/llmcompressor/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/utils/__init__.py
--------------------------------------------------------------------------------
/tests/llmcompressor/utils/pytorch/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/utils/pytorch/__init__.py
--------------------------------------------------------------------------------
/tests/llmcompressor/utils/pytorch/test_module.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import torch.nn as nn
3 |
4 | from llmcompressor.utils.pytorch import get_layer_by_name
5 |
6 |
7 | @pytest.fixture
8 | def example_nested_module() -> nn.Module:
9 | return nn.Sequential(
10 | nn.Linear(10, 20),
11 | nn.Sequential(nn.ReLU(), nn.Linear(20, 10)),
12 | nn.Sequential(nn.SiLU(), nn.Linear(20, 10)),
13 | nn.Softmax(dim=1),
14 | )
15 |
16 |
17 | @pytest.mark.unit
18 | def test_get_layer_by_name(example_nested_module):
19 |     # Test getting nested layers by their dotted names
20 | layer = get_layer_by_name("0", example_nested_module)
21 | assert layer == example_nested_module[0]
22 |
23 | layer = get_layer_by_name("1.1", example_nested_module)
24 | assert layer == example_nested_module[1][1]
25 |
26 | layer = get_layer_by_name("2.0", example_nested_module)
27 | assert layer == example_nested_module[2][0]
28 |
29 | layer = get_layer_by_name("2.1", example_nested_module)
30 | assert layer == example_nested_module[2][1]
31 |
32 |     # Requesting a non-existent layer should raise AttributeError
33 | with pytest.raises(AttributeError):
34 | get_layer_by_name("non_existent_layer", example_nested_module)
35 |
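The behavior the test relies on amounts to dotted-path submodule lookup that raises `AttributeError` for missing names. A minimal stand-in with the same observable behavior on `nn.Sequential` (the real `get_layer_by_name` may be implemented differently):

    from operator import attrgetter

    import torch.nn as nn

    def get_layer_by_name_sketch(name: str, module: nn.Module) -> nn.Module:
        # nn.Module exposes child modules via __getattr__, so attrgetter("1.1")
        # walks "1" then "1" and raises AttributeError if a segment is missing.
        return attrgetter(name)(module)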
--------------------------------------------------------------------------------
/tests/lmeval/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/lmeval/__init__.py
--------------------------------------------------------------------------------
/tests/lmeval/configs/fp8_dynamic_per_token.yaml:
--------------------------------------------------------------------------------
1 | cadence: "weekly"
2 | model: meta-llama/Meta-Llama-3-8B-Instruct
3 | scheme: FP8_DYNAMIC
4 | lmeval:
5 | metrics:
6 | exact_match,flexible-extract: 0.75
7 | exact_match,strict-match: 0.75
8 |
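The metric keys follow lm-evaluation-harness naming (`exact_match,flexible-extract` and `exact_match,strict-match` come from the gsm8k task). A sketch of the comparison step these lmeval configs presumably drive, assuming a `results` dict keyed the same way and a hypothetical tolerance:

    import pytest
    import yaml

    def check_lmeval_metrics(config_path: str, results: dict, abs_tol: float = 0.05):
        """Compare harness results against the expected metrics in a config."""
        with open(config_path) as fh:
            expected = yaml.safe_load(fh)["lmeval"]["metrics"]
        for metric_name, expected_value in expected.items():
            assert results[metric_name] == pytest.approx(expected_value, abs=abs_tol)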
--------------------------------------------------------------------------------
/tests/lmeval/configs/fp8_static_per_tensor.yaml:
--------------------------------------------------------------------------------
1 | cadence: "weekly"
2 | model: meta-llama/Meta-Llama-3-8B-Instruct
3 | scheme: FP8
4 | dataset_id: HuggingFaceH4/ultrachat_200k
5 | dataset_split: train_sft
6 | lmeval:
7 | metrics:
8 | exact_match,flexible-extract: 0.75
9 | exact_match,strict-match: 0.75
10 |
--------------------------------------------------------------------------------
/tests/lmeval/configs/int8_w8a8_dynamic_per_token.yaml:
--------------------------------------------------------------------------------
1 | cadence: "weekly"
2 | model: meta-llama/Meta-Llama-3-8B-Instruct
3 | scheme: INT8_dyn_per_token
4 | recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_dynamic_per_token.yaml
5 | dataset_id: HuggingFaceH4/ultrachat_200k
6 | dataset_split: train_sft
7 | lmeval:
8 | metrics:
9 | exact_match,flexible-extract: 0.77
10 | exact_match,strict-match: 0.76
--------------------------------------------------------------------------------
/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml:
--------------------------------------------------------------------------------
1 | cadence: weekly
2 | model: Qwen/Qwen2.5-VL-7B-Instruct
3 | model_class: Qwen2_5_VLForConditionalGeneration
4 | scheme: FP8_DYNAMIC
5 | lmeval:
6 | model: "hf-multimodal"
7 | model_args:
8 | dtype: bfloat16
9 | add_bos_token: True
10 | convert_img_format: True
11 | task: mmmu_val_literature
12 | num_fewshot: 0
13 | batch_size: 8
14 |   # dense model achieves accuracy of 0.9 +/- 0.0557
15 | metrics:
16 | acc,none: 0.8667
17 | acc_stderr,none: 0.0557
18 |
--------------------------------------------------------------------------------
/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml:
--------------------------------------------------------------------------------
1 | cadence: "weekly"
2 | model: Qwen/Qwen2.5-VL-7B-Instruct
3 | model_class: Qwen2_5_VLForConditionalGeneration
4 | scheme: INT8_dyn_per_token
5 | recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_dynamic_per_token.yaml
6 | dataset_id: lmms-lab/flickr30k
7 | dataset_split: "test[:512]"
8 | lmeval:
9 | model: "hf-multimodal"
10 | model_args:
11 | dtype: bfloat16
12 | add_bos_token: True
13 | convert_img_format: True
14 | task: mmmu_val_literature
15 | num_fewshot: 0
16 | batch_size: 8
17 |   # dense model achieves accuracy of 0.9 +/- 0.0557
18 | metrics:
19 | acc,none: 0.833
20 | acc_stderr,none: 0.0557
--------------------------------------------------------------------------------
/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml:
--------------------------------------------------------------------------------
1 | cadence: "weekly"
2 | model: Qwen/Qwen2.5-VL-7B-Instruct
3 | model_class: Qwen2_5_VLForConditionalGeneration
4 | scheme: W4A16_actorder_weight
5 | recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml
6 | dataset_id: lmms-lab/flickr30k
7 | dataset_split: "test[:512]"
8 | lmeval:
9 | model: "hf-multimodal"
10 | model_args:
11 | dtype: bfloat16
12 | add_bos_token: True
13 | convert_img_format: True
14 | task: mmmu_val_literature
15 | num_fewshot: 0
16 | batch_size: 8
17 |   # dense model achieves accuracy of 0.9 +/- 0.0557
18 | metrics:
19 | acc,none: 0.8333
20 | acc_stderr,none: 0.0557
--------------------------------------------------------------------------------
/tests/lmeval/configs/w4a16_actorder_group.yaml:
--------------------------------------------------------------------------------
1 | cadence: "weekly"
2 | model: meta-llama/Meta-Llama-3-8B-Instruct
3 | scheme: W4A16_actorder_group
4 | recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_group.yaml
5 | dataset_id: HuggingFaceH4/ultrachat_200k
6 | dataset_split: train_sft
7 | lmeval:
8 | metrics:
9 | exact_match,flexible-extract: 0.72
10 | exact_match,strict-match: 0.72
11 |
--------------------------------------------------------------------------------
/tests/lmeval/configs/w4a16_actorder_weight.yaml:
--------------------------------------------------------------------------------
1 | cadence: "weekly"
2 | model: meta-llama/Meta-Llama-3-8B-Instruct
3 | scheme: W4A16_actorder_weight
4 | recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml
5 | dataset_id: HuggingFaceH4/ultrachat_200k
6 | dataset_split: train_sft
7 | lmeval:
8 | metrics:
9 | exact_match,flexible-extract: 0.72
10 | exact_match,strict-match: 0.72
11 |
--------------------------------------------------------------------------------
/tests/lmeval/configs/w4a16_grouped_quant.yaml:
--------------------------------------------------------------------------------
1 | cadence: "weekly"
2 | model: meta-llama/Meta-Llama-3-8B-Instruct
3 | scheme: W4A16
4 | dataset_id: HuggingFaceH4/ultrachat_200k
5 | dataset_split: train_sft
6 | quant_type: "GPTQ"
7 | lmeval:
8 | metrics:
9 | exact_match,flexible-extract: 0.72
10 | exact_match,strict-match: 0.72
11 |
--------------------------------------------------------------------------------
/tests/test_timer/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 |
3 | from .timer import Timer
4 |
--------------------------------------------------------------------------------
/tests/test_timer/timer_utils.py:
--------------------------------------------------------------------------------
1 | from functools import wraps
2 |
3 | from tests.test_timer import Timer
4 |
5 | __all__ = ["log_time", "get_singleton_manager"]
6 |
7 |
8 | def get_singleton_manager(enable_logging: bool = True):
9 | """
10 | Return the Timer. If it has not yet been initialized, initialize and
11 | return it; otherwise, return the existing Timer.
12 | """
13 | if Timer._instance is None:
14 | Timer._instance = Timer(enable_logging=enable_logging)
15 | return Timer._instance
16 |
17 |
18 | def log_time(func):
19 | """
20 | Decorator to time functions. Times for the function are stored using
21 | the class and function names.
22 | """
23 |
24 | @wraps(func)
25 | def wrapper(*args, **kwargs):
26 | TIMER_MANAGER = get_singleton_manager()
27 | func_name = func.__name__
28 |
29 | if not TIMER_MANAGER.enable_logging:
30 | return func(*args, **kwargs)
31 |
32 | with TIMER_MANAGER.time(func_name):
33 | return func(*args, **kwargs)
34 |
35 | return wrapper
36 |
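Example usage of the decorator above (the `Timer` class itself lives in `tests/test_timer/timer.py` and is not shown here; this only exercises the interface visible in this file):

    from tests.test_timer.timer_utils import get_singleton_manager, log_time

    class Calibrator:
        @log_time
        def run(self):
            # timed under the function name "run" when logging is enabled
            ...

    Calibrator().run()
    timer = get_singleton_manager()  # returns the shared Timer singleton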
--------------------------------------------------------------------------------
/tests/unit/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/unit/__init__.py
--------------------------------------------------------------------------------
/tests/unit/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/unit/core/__init__.py
--------------------------------------------------------------------------------
/tests/unit/core/events/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/unit/core/events/__init__.py
--------------------------------------------------------------------------------
/tests/unit/core/events/test_event.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from llmcompressor.core import Event, EventType
4 |
5 |
6 | @pytest.mark.smoke
7 | def test_event_epoch_based():
8 | event = Event(steps_per_epoch=10)
9 | assert event.epoch_based is True
10 |
11 |
12 | @pytest.mark.smoke
13 | def test_event_epoch():
14 | event = Event(steps_per_epoch=10, global_step=25)
15 | assert event.epoch == 2
16 |
17 |
18 | @pytest.mark.smoke
19 | def test_event_epoch_full():
20 | event = Event(steps_per_epoch=10, global_step=25)
21 | assert event.epoch_full == 2.5
22 |
23 |
24 | @pytest.mark.smoke
25 | def test_event_epoch_step():
26 | event = Event(steps_per_epoch=10, global_step=25)
27 | assert event.epoch_step == 5
28 |
29 |
30 | @pytest.mark.smoke
31 | def test_event_epoch_batch():
32 | event = Event(
33 | steps_per_epoch=10, global_step=25, batches_per_step=2, global_batch=50
34 | )
35 | assert event.epoch_batch == 10
36 |
37 |
38 | @pytest.mark.smoke
39 | def test_event_current_index():
40 | event = Event(steps_per_epoch=10, global_step=25)
41 | assert event.current_index == 2.5
42 |
43 |
44 | @pytest.mark.smoke
45 | def test_event_should_update():
46 | event = Event(steps_per_epoch=10, global_step=25)
47 | assert event.should_update(start=0, end=30, update=2.5) is True
48 | assert event.should_update(start=0, end=20, update=5) is False
49 | assert event.should_update(start=0, end=30, update=0) is True
50 |
51 |
52 | @pytest.mark.smoke
53 | def test_event_new_instance():
54 | event = Event(type_=EventType.INITIALIZE, global_step=25)
55 | new_event = event.new_instance(global_step=30)
56 | assert new_event.global_step == 30
57 | assert new_event.type_ == EventType.INITIALIZE
58 |
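The assertions above pin down the epoch arithmetic; restated as plain expressions consistent with those expected values (the actual `Event` properties may be computed differently):

    steps_per_epoch, global_step = 10, 25
    batches_per_step, global_batch = 2, 50

    epoch = global_step // steps_per_epoch        # 2
    epoch_full = global_step / steps_per_epoch    # 2.5
    epoch_step = global_step % steps_per_epoch    # 5
    epoch_batch = epoch_step * batches_per_step   # 10
    current_index = epoch_full                    # 2.5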
--------------------------------------------------------------------------------