├── .MAINTAINERS ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── doc-edit.md │ └── feature_request.md ├── PULL_REQUEST_TEMPLATE.md ├── TODO.txt └── workflows │ ├── build-and-publish-release-images.yaml │ ├── linkcheck.yml │ ├── linkspector │ └── linkspector.yml │ ├── quality-check.yaml │ ├── result.xml.fail │ ├── result.xml.success │ ├── set-comment.yaml │ ├── test-check-transformers.yaml │ └── test-check.yaml ├── .gitignore ├── CONTRIBUTING.md ├── DEVELOPING.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── NOTICE ├── README.md ├── docs ├── save_pretrained.md └── schemes.md ├── examples ├── awq │ ├── README.md │ ├── llama_example.py │ └── qwen3_moe_example.py ├── big_models_with_accelerate │ ├── README.md │ ├── cpu_offloading_fp8.py │ ├── mult_gpus_int8_device_map.py │ └── multi_gpu_int8.py ├── compressed_inference │ └── fp8_compressed_inference.py ├── finetuning │ ├── configure_fsdp.md │ ├── example_alternating_recipe.yaml │ ├── example_fsdp_config.yaml │ └── example_single_gpu_config.yaml ├── multimodal_audio │ ├── README.md │ └── whisper_example.py ├── multimodal_vision │ ├── README.md │ ├── gemma3_example.py │ ├── idefics3_example.py │ ├── llava_example.py │ ├── mistral3_chat_template.json │ ├── mistral3_example.py │ ├── mllama_example.py │ ├── phi3_vision_example.py │ ├── pixtral_example.py │ ├── qwen2_vl_example.py │ └── qwen_2_5_vl_example.py ├── quantization_2of4_sparse_w4a16 │ ├── 2of4_w4a16_group-128_recipe.yaml │ ├── 2of4_w4a16_recipe.yaml │ ├── README.md │ └── llama7b_sparse_w4a16.py ├── quantization_kv_cache │ ├── README.md │ ├── gemma2_fp8_kv_example.py │ ├── llama3_fp8_kv_example.py │ └── phi3.5_fp8_kv_example.py ├── quantization_w4a16 │ ├── README.md │ └── llama3_example.py ├── quantization_w4a16_fp4 │ └── llama3_example.py ├── quantization_w4a4_fp4 │ └── llama3_example.py ├── quantization_w8a8_fp8 │ ├── README.md │ ├── gemma2_example.py │ ├── llama3.2_vision_example.py │ ├── llama3_example.py │ ├── llava1.5_example.py │ ├── qwen2vl_example.py │ └── whisper_example.py ├── quantization_w8a8_int8 │ ├── README.md │ ├── gemma2_example.py │ └── llama3_example.py ├── quantizing_moe │ ├── README.md │ ├── deepseek_moe_w4a16.py │ ├── deepseek_moe_w8a8_fp8.py │ ├── deepseek_moe_w8a8_int8.py │ ├── deepseek_recipe_w4a16.yaml │ ├── mixtral_moe_w8a8_fp8.py │ └── qwen_moe_w4a16.py ├── sparse_2of4_quantization_fp8 │ ├── README.md │ └── llama3_8b_2of4.py └── trl_mixin │ ├── README.md │ ├── ex_trl_constant.py │ ├── ex_trl_distillation.py │ └── sft_trainer.py ├── pyproject.toml ├── setup.py ├── src └── llmcompressor │ ├── __init__.py │ ├── args │ ├── README.md │ ├── __init__.py │ ├── dataset_arguments.py │ ├── model_arguments.py │ ├── recipe_arguments.py │ ├── training_arguments.py │ └── utils.py │ ├── core │ ├── __init__.py │ ├── events │ │ ├── __init__.py │ │ └── event.py │ ├── helpers.py │ ├── lifecycle.py │ ├── model_layer.py │ ├── session.py │ ├── session_functions.py │ └── state.py │ ├── datasets │ ├── __init__.py │ └── utils.py │ ├── entrypoints │ ├── README.md │ ├── __init__.py │ ├── oneshot.py │ ├── train.py │ └── utils.py │ ├── logger.py │ ├── metrics │ ├── __init__.py │ ├── logger.py │ └── utils │ │ ├── __init__.py │ │ └── frequency_manager.py │ ├── modifiers │ ├── README.md │ ├── __init__.py │ ├── awq │ │ ├── __init__.py │ │ ├── base.py │ │ └── mappings.py │ ├── distillation │ │ ├── __init__.py │ │ ├── output │ │ │ ├── __init__.py │ │ │ └── base.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ └── pytorch │ │ │ ├── __init__.py │ │ │ ├── kd_factory.py │ │ │ ├── kd_wrapper.py │ 
│ │ └── model_wrapper.py │ ├── experimental │ │ └── __init__.py │ ├── factory.py │ ├── interface.py │ ├── logarithmic_equalization │ │ ├── __init__.py │ │ └── base.py │ ├── modifier.py │ ├── obcq │ │ ├── __init__.py │ │ ├── base.py │ │ ├── sgpt_base.py │ │ └── sgpt_sparsify.py │ ├── pruning │ │ ├── __init__.py │ │ ├── constant │ │ │ ├── __init__.py │ │ │ └── base.py │ │ ├── helpers.py │ │ ├── magnitude │ │ │ ├── __init__.py │ │ │ └── base.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ └── pytorch │ │ │ │ ├── __init__.py │ │ │ │ ├── layer_mask.py │ │ │ │ └── mask_factory.py │ │ └── wanda │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ └── wanda_sparsify.py │ ├── quantization │ │ ├── __init__.py │ │ ├── cache.py │ │ ├── calibration.py │ │ ├── gptq │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ └── gptq_quantize.py │ │ └── quantization │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ └── mixin.py │ ├── smoothquant │ │ ├── README.md │ │ ├── __init__.py │ │ ├── base.py │ │ └── utils.py │ ├── stage.py │ └── utils │ │ ├── __init__.py │ │ ├── constants.py │ │ ├── helpers.py │ │ ├── hooks.py │ │ └── pytorch_helpers.py │ ├── observers │ ├── __init__.py │ ├── base.py │ ├── helpers.py │ ├── min_max.py │ └── mse.py │ ├── pipelines │ ├── __init__.py │ ├── basic │ │ ├── __init__.py │ │ └── pipeline.py │ ├── cache.py │ ├── data_free │ │ ├── __init__.py │ │ └── pipeline.py │ ├── independent │ │ ├── __init__.py │ │ └── pipeline.py │ ├── layer_sequential │ │ ├── __init__.py │ │ ├── helpers.py │ │ └── pipeline.py │ ├── registry.py │ └── sequential │ │ ├── README.md │ │ ├── __init__.py │ │ ├── ast_helpers.py │ │ ├── ast_utils │ │ ├── auto_wrapper.py │ │ ├── control_flow_analyzer.py │ │ └── name_analyzer.py │ │ ├── helpers.py │ │ └── pipeline.py │ ├── pytorch │ ├── __init__.py │ ├── model_load │ │ ├── __init__.py │ │ └── helpers.py │ └── utils │ │ ├── __init__.py │ │ ├── helpers.py │ │ ├── sparsification.py │ │ └── sparsification_info │ │ ├── __init__.py │ │ ├── configs.py │ │ ├── helpers.py │ │ └── module_sparsification_info.py │ ├── recipe │ ├── __init__.py │ ├── base.py │ ├── metadata.py │ ├── modifier.py │ ├── recipe.py │ └── stage.py │ ├── sentinel.py │ ├── transformers │ ├── __init__.py │ ├── compression │ │ ├── __init__.py │ │ ├── helpers.py │ │ ├── quantization_format.py │ │ └── sparsity_metadata_config.py │ ├── finetune │ │ ├── README.md │ │ ├── __init__.py │ │ ├── callbacks.py │ │ ├── data │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── c4.py │ │ │ ├── cnn_dailymail.py │ │ │ ├── custom.py │ │ │ ├── data_helpers.py │ │ │ ├── evolcodealpaca.py │ │ │ ├── flickr_30k.py │ │ │ ├── gsm8k.py │ │ │ ├── open_platypus.py │ │ │ ├── peoples_speech.py │ │ │ ├── ptb.py │ │ │ ├── ultrachat_200k.py │ │ │ └── wikitext.py │ │ ├── session_mixin.py │ │ ├── text_generation.py │ │ └── trainer.py │ ├── sparsification │ │ ├── __init__.py │ │ ├── compressed_tensors_utils.py │ │ └── sparse_model.py │ ├── tracing │ │ ├── __init__.py │ │ └── debug.py │ └── utils │ │ ├── __init__.py │ │ ├── helpers.py │ │ └── preprocessing_functions.py │ ├── typing.py │ └── utils │ ├── __init__.py │ ├── dev.py │ ├── fsdp │ ├── __init__.py │ ├── context.py │ └── helpers.py │ ├── helpers.py │ ├── metric_logging.py │ └── pytorch │ ├── __init__.py │ ├── module.py │ └── utils.py └── tests ├── __init__.py ├── custom_test.py ├── data.py ├── e2e ├── __init__.py ├── e2e_utils.py └── vLLM │ ├── __init__.py │ ├── configs │ ├── fp8_dynamic_per_token.yaml │ ├── fp8_dynamic_per_token_qwen.yaml │ ├── fp8_static_per_tensor.yaml │ ├── fp8_weight_only_channel.yaml │ ├── 
fp8_weight_only_tensor.yaml │ ├── int8_channel_weight_static_per_tensor_act.yaml │ ├── int8_dynamic_per_token.yaml │ ├── int8_tensor_weight_static_per_tensor_act.yaml │ ├── int8_tensor_weight_static_per_tensor_act_qwen.yaml │ ├── kv_cache_gptq_tinyllama.yaml │ ├── kv_cache_phi3.yaml │ ├── kv_cache_tinyllama.yaml │ ├── sparse2of4_fp8_dynamic.yaml │ ├── sparse2of4_fp8_dynamic_qwen.yaml │ ├── sparse_24.yaml │ ├── w4a16_2of4_channel_quant.yaml │ ├── w4a16_2of4_grouped_quant.yaml │ ├── w4a16_actorder_group.yaml │ ├── w4a16_actorder_group_qwen.yaml │ ├── w4a16_actorder_weight.yaml │ ├── w4a16_actorder_weight_qwen.yaml │ ├── w4a16_channel_quant.yaml │ ├── w4a16_channel_quant_qwen.yaml │ ├── w4a16_grouped_quant.yaml │ ├── w4a16_grouped_quant_asym_awq.yaml │ ├── w8a16_channel_quant.yaml │ ├── w8a16_grouped_quant.yaml │ ├── w8a8_dynamic_asym.yaml │ └── w8a8_static_asym.yaml │ ├── recipes │ ├── FP8 │ │ ├── recipe_fp8_weight_only_channel.yaml │ │ └── recipe_fp8_weight_only_per_tensor.yaml │ ├── INT8 │ │ ├── recipe_int8_channel_weight_dynamic_per_token.yaml │ │ ├── recipe_int8_channel_weight_static_per_tensor_act.yaml │ │ ├── recipe_int8_tensor_weight_static_per_tensor_act.yaml │ │ ├── recipe_w8a8_dynamic_asym.yaml │ │ └── recipe_w8a8_static_asym.yaml │ ├── Sparse_2of4 │ │ ├── recipe_sparse_2of4.yaml │ │ └── recipe_sparse_2of4_fp8_dynamic.yaml │ ├── WNA16 │ │ ├── recipe_w4a16_channel_quant.yaml │ │ ├── recipe_w4a16_group_quant_asym_awq.yaml │ │ └── recipe_w8a16_channel_quant.yaml │ ├── WNA16_2of4 │ │ ├── 2of4_w4a16_group-128_recipe.yaml │ │ └── 2of4_w4a16_recipe.yaml │ ├── actorder │ │ ├── recipe_w4a16_actorder_group.yaml │ │ └── recipe_w4a16_actorder_weight.yaml │ └── kv_cache │ │ ├── default.yaml │ │ └── gptq.yaml │ ├── run_tests.sh │ ├── skipped_configs │ └── fp4_nvfp4a16.yaml │ └── test_vllm.py ├── examples ├── __init__.py ├── test_big_models_with_accelerate.py ├── test_compressed_inference.py ├── test_quantization_2of4_sparse_w4a16.py ├── test_quantization_kv_cache.py ├── test_quantization_w4a16.py ├── test_quantization_w8a8_fp8.py ├── test_quantization_w8a8_int8.py ├── test_quantizing_moe.py ├── test_sparse_2of4_quantization_fp8.py ├── test_trl_mixin.py └── utils.py ├── llmcompressor ├── __init__.py ├── conftest.py ├── helpers.py ├── metrics │ ├── __init__.py │ ├── test_logger.py │ └── utils │ │ ├── __init__.py │ │ └── test_frequency_manager.py ├── modifiers │ ├── __init__.py │ ├── awq │ │ ├── __init__.py │ │ └── test_base.py │ ├── calibration │ │ ├── __init__.py │ │ ├── test_cache.py │ │ ├── test_frozen.py │ │ ├── test_kv_cache.py │ │ └── test_observers.py │ ├── conf.py │ ├── logarithmic_equalization │ │ ├── __init__.py │ │ └── test_base.py │ ├── pruning │ │ ├── __init__.py │ │ ├── sparsegpt │ │ │ ├── __init__.py │ │ │ └── test_base.py │ │ └── wanda │ │ │ ├── __init__.py │ │ │ └── test_base.py │ ├── quantization │ │ ├── __init__.py │ │ └── test_base.py │ ├── smoothquant │ │ ├── __init__.py │ │ ├── test_base.py │ │ └── test_utils.py │ └── utils │ │ └── test_hooks.py ├── observers │ ├── __init__.py │ ├── test_helpers.py │ ├── test_min_max.py │ └── test_mse.py ├── pipelines │ ├── sequential │ │ ├── ast_utils.py │ │ │ └── test_auto_wrapper.py │ │ └── test_helpers.py │ └── test_cache.py ├── pytorch │ ├── __init__.py │ ├── helpers.py │ ├── modifiers │ │ ├── __init__.py │ │ ├── logarithmic_equalization │ │ │ ├── __init__.py │ │ │ └── test_pytorch.py │ │ ├── pruning │ │ │ ├── __init__.py │ │ │ ├── constant │ │ │ │ ├── __init__.py │ │ │ │ └── test_pytorch.py │ │ │ ├── sparsegpt │ │ │ │ ├── __init__.py │ 
│ │ │ └── test_pytorch.py │ │ │ └── wanda │ │ │ │ └── test_pytorch.py │ │ └── smoothquant │ │ │ ├── __init__.py │ │ │ └── test_pytorch.py │ └── utils │ │ ├── __init__.py │ │ └── test_helpers.py ├── recipe │ ├── __init__.py │ ├── recipe.yaml │ ├── test_recipe.py │ └── test_recipe_parsing.py ├── test_sentinel.py ├── transformers │ ├── __init__.py │ ├── compression │ │ ├── __init__.py │ │ ├── configs │ │ │ ├── actorder_group_1.1b.yaml │ │ │ ├── actorder_weight_1.1b.yaml │ │ │ ├── channelwise_1.1b.yaml │ │ │ ├── channelwise_15m.yaml │ │ │ ├── fp8_1.1b.yaml │ │ │ ├── fp8_15m.yaml │ │ │ ├── group_1.1b.yaml │ │ │ ├── inputs_1.1b.yaml │ │ │ ├── inputs_15m.yaml │ │ │ ├── weights_only_1.1b.yaml │ │ │ └── weights_only_15m.yaml │ │ ├── decompression_configs │ │ │ ├── fp8_dynamic.yaml │ │ │ ├── w4a16.yaml │ │ │ └── w8a16_dense.yaml │ │ ├── decompression_configs_skipped │ │ │ └── w8a8.yaml │ │ ├── recipes │ │ │ ├── new_quant_actorder_group.yaml │ │ │ ├── new_quant_actorder_weight.yaml │ │ │ ├── new_quant_channel.yaml │ │ │ ├── new_quant_fp8.yaml │ │ │ ├── new_quant_full.yaml │ │ │ ├── new_quant_group.yaml │ │ │ ├── new_quant_simple.yaml │ │ │ ├── new_quant_weight.yaml │ │ │ ├── sparse_24.yaml │ │ │ └── sparse_24_fp8.yaml │ │ ├── run_compressed_configs │ │ │ ├── fp8_dynamic.yaml │ │ │ ├── w4a16.yaml │ │ │ └── w8a16.yaml │ │ ├── run_compressed_configs_skipped │ │ │ └── w8a8.yaml │ │ ├── test_decompress.py │ │ ├── test_has_gpu.py │ │ ├── test_helpers.py │ │ ├── test_infer_quant_format.py │ │ ├── test_quantization.py │ │ ├── test_run_compressed.py │ │ └── test_sparsity_metadata_config.py │ ├── conftest.py │ ├── finetune │ │ ├── __init__.py │ │ ├── data │ │ │ ├── __init__.py │ │ │ ├── conftest.py │ │ │ ├── test_dataset_helpers.py │ │ │ ├── test_dataset_loading.py │ │ │ └── test_registry.py │ │ ├── finetune_custom │ │ │ ├── config1.yaml │ │ │ ├── config2.yaml │ │ │ └── gpu │ │ │ │ └── gpu_config.yaml │ │ ├── finetune_generic │ │ │ └── config1.yaml │ │ ├── finetune_oneshot_configs │ │ │ ├── config.yaml │ │ │ └── gpu │ │ │ │ └── gpu_config.yaml │ │ ├── finetune_tokenizer │ │ │ └── config1.yaml │ │ ├── test_alternate_recipe.yaml │ │ ├── test_finetune_no_recipe_custom_dataset.py │ │ ├── test_finetune_recipe.yaml │ │ ├── test_finetune_without_recipe.py │ │ ├── test_oneshot_and_finetune.py │ │ ├── test_oneshot_and_finetune_with_tokenizer.py │ │ ├── test_oneshot_then_finetune.py │ │ ├── test_quantization.yaml │ │ ├── test_safetensors.py │ │ └── test_session_mixin.py │ ├── gptq │ │ └── test_oneshot.py │ ├── kv_cache │ │ └── test_kv_cache.py │ ├── obcq │ │ ├── __init__.py │ │ ├── obcq_configs │ │ │ ├── completion │ │ │ │ ├── gpu │ │ │ │ │ ├── llama_7b_quant.yaml │ │ │ │ │ ├── llama_7b_quant_and_sparse.yaml │ │ │ │ │ └── llama_7b_sparse.yml │ │ │ │ ├── tiny_llama_quant.yaml │ │ │ │ └── tiny_llama_quant_and_sparse.yaml │ │ │ ├── consec_runs │ │ │ │ ├── gpu │ │ │ │ │ └── llama_consec_runs.yaml │ │ │ │ └── tiny_llama_consec_runs.yaml │ │ │ ├── mask_structure │ │ │ │ └── tiny_llama_mask_structure_preservation.yaml │ │ │ ├── sparse │ │ │ │ ├── gpu │ │ │ │ │ └── llama_7b_sparse.yaml │ │ │ │ └── tiny_llama_sparse.yaml │ │ │ └── sparsity_generic │ │ │ │ └── config.yaml │ │ ├── recipes │ │ │ ├── additional_sparsity.yaml │ │ │ ├── additional_sparsity_with_quant.yaml │ │ │ ├── quant.yaml │ │ │ ├── quant_and_sparse.yaml │ │ │ ├── sparse.yaml │ │ │ ├── sparse_with_mask_structure.yaml │ │ │ └── test_tiny2.yaml │ │ ├── test_consecutive_runs.py │ │ ├── test_mask_structure_preservation.py │ │ ├── test_obcq_completion.py │ │ ├── 
test_obcq_infer_targets.py │ │ ├── test_obcq_lm_head.py │ │ ├── test_obcq_owl.py │ │ ├── test_obcq_sparsity.py │ │ └── test_oneshot_with_modifier.py │ ├── oneshot │ │ ├── __init__.py │ │ ├── dataset_processing.py │ │ ├── oneshot_configs │ │ │ ├── recipes │ │ │ │ └── recipe.yaml │ │ │ ├── tiny_stories_conf1.yaml │ │ │ ├── tiny_stories_conf2.yaml │ │ │ ├── tiny_stories_conf3.yaml │ │ │ ├── tiny_stories_conf4.yaml │ │ │ ├── tiny_stories_conf5.yaml │ │ │ └── tiny_stories_conf6.yaml │ │ └── test_api_inputs.py │ ├── sparsification │ │ ├── __init__.py │ │ └── test_compress_tensor_utils.py │ └── tracing │ │ └── test_models.py └── utils │ ├── __init__.py │ ├── pytorch │ ├── __init__.py │ └── test_module.py │ └── test_helpers.py ├── lmeval ├── __init__.py ├── configs │ ├── fp8_dynamic_per_token.yaml │ ├── fp8_static_per_tensor.yaml │ ├── int8_w8a8_dynamic_per_token.yaml │ ├── vl_fp8_dynamic_per_token.yaml │ ├── vl_int8_w8a8_dynamic_per_token.yaml │ ├── vl_w4a16_actorder_weight.yaml │ ├── w4a16_actorder_group.yaml │ ├── w4a16_actorder_weight.yaml │ └── w4a16_grouped_quant.yaml └── test_lmeval.py ├── test_timer ├── __init__.py ├── timer.py └── timer_utils.py ├── testing_utils.py └── unit ├── __init__.py ├── core ├── __init__.py ├── events │ ├── __init__.py │ └── test_event.py └── test_state.py └── test_logger.py /.MAINTAINERS: -------------------------------------------------------------------------------- 1 | # list of active maintainers 2 | # uncommented maintainers will be included in code review triage 3 | 4 | markurtz 5 | dsikka 6 | rahul-tuli 7 | horheynm 8 | brian-dellabetta 9 | kylesayrs 10 | 11 | # mgoin 12 | # anmarques 13 | # eldarkurtic 14 | # chibukach 15 | # shubhra 16 | # abhinavnmagic 17 | # eiofinov 18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | labels: bug 5 | 6 | --- 7 | 8 | **Describe the bug** 9 | A clear and concise description of what the bug is. 10 | 11 | **Expected behavior** 12 | A clear and concise description of what you expected to happen. 13 | 14 | **Environment** 15 | Include all relevant environment information: 16 | 1. OS [e.g. Ubuntu 20.04]: 17 | 2. Python version [e.g. 3.7]: 18 | 3. LLM Compressor version or commit hash [e.g. 0.1.0, `f7245c8`]: 19 | 4. ML framework version(s) [e.g. torch 2.3.1]: 20 | 5. Other Python package versions [e.g. vLLM, compressed-tensors, numpy, ONNX]: 21 | 6. Other relevant environment information [e.g. hardware, CUDA version]: 22 | 23 | **To Reproduce** 24 | Exact steps to reproduce the behavior: 25 | 26 | 27 | **Errors** 28 | If applicable, add a full print-out of any errors or exceptions that are raised or include screenshots to help explain your problem. 29 | 30 | **Additional context** 31 | Add any other context about the problem here. Also include any relevant files. 32 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/doc-edit.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Doc edit 3 | about: Propose changes to project documentation 4 | labels: documentation 5 | 6 | --- 7 | 8 | **What is the URL, file, or UI containing proposed doc change** 9 | Where does one find the original content or where would this change go? 
10 | 11 | **What is the current content or situation in question** 12 | Copy/paste the source content or describe the gap. 13 | 14 | **What is the proposed change** 15 | Add new content. 16 | 17 | **Additional context** 18 | Add any other context about the change here. Also include any relevant files or URLs. 19 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | labels: enhancement 5 | 6 | --- 7 | 8 | **Is your feature request related to a problem? Please describe.** 9 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 10 | 11 | **Describe the solution you'd like** 12 | A clear and concise description of what you want to happen. 13 | 14 | **Describe alternatives you've considered** 15 | A clear and concise description of any alternative solutions or features you've considered. 16 | 17 | **Additional context** 18 | Add any other context or screenshots about the feature request here. 19 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | SUMMARY: 2 | "please provide a brief summary" 3 | 4 | 5 | TEST PLAN: 6 | "please outline how the changes were tested" 7 | -------------------------------------------------------------------------------- /.github/TODO.txt: -------------------------------------------------------------------------------- 1 | TODO: update for upstream push -------------------------------------------------------------------------------- /.github/workflows/linkcheck.yml: -------------------------------------------------------------------------------- 1 | name: Check Markdown links 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | # Allows you to run this workflow manually from the Actions tab 12 | workflow_dispatch: 13 | 14 | jobs: 15 | markdown-link-check: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v4 19 | - uses: umbrelladocs/action-linkspector@v1 20 | with: 21 | github_token: ${{ secrets.github_token }} 22 | reporter: github-pr-review 23 | fail_on_error: true 24 | config_file: '.github/workflows/linkspector/linkspector.yml' 25 | -------------------------------------------------------------------------------- /.github/workflows/linkspector/linkspector.yml: -------------------------------------------------------------------------------- 1 | aliveStatusCodes: 2 | - 0 3 | - 200 4 | ignorePatterns: 5 | - pattern: '.*localhost.*' 6 | - pattern: '.*127\\.0\\.0\\.1.*' 7 | - pattern: '.*0\\.0\\.0\\.0.*' 8 | dirs: 9 | - . 
10 | useGitIgnore: true -------------------------------------------------------------------------------- /.github/workflows/quality-check.yaml: -------------------------------------------------------------------------------- 1 | name: Quality Checks 2 | on: 3 | push: 4 | branches: 5 | - main 6 | - 'release/*' 7 | pull_request: 8 | branches: 9 | - main 10 | - 'release/*' 11 | jobs: 12 | quality-check: 13 | runs-on: ubuntu-22.04 14 | steps: 15 | - uses: actions/setup-python@v5 16 | with: 17 | python-version: '3.9' 18 | - uses: actions/checkout@v4 19 | - name: "⚙️ Install dependencies" 20 | run: pip3 install .[dev] 21 | - name: "🧹 Running quality checks" 22 | run: make quality 23 | -------------------------------------------------------------------------------- /.github/workflows/result.xml.fail: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /.github/workflows/result.xml.success: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.github/workflows/set-comment.yaml: -------------------------------------------------------------------------------- 1 | name: PR Reminder Comment Bot 2 | on: 3 | pull_request_target: 4 | branches: [main] 5 | types: [opened] 6 | 7 | jobs: 8 | pr_reminder: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Remind to add ready label 12 | uses: actions/github-script@v7 13 | with: 14 | script: | 15 | github.rest.issues.createComment({ 16 | owner: context.repo.owner, 17 | repo: context.repo.repo, 18 | issue_number: context.issue.number, 19 | body: '👋 Hi! Thank you for contributing to llm-compressor. Please add the ready label when the PR is ready for review.\n\n**Note:** This is required to complete the testing suite, please only add the label once the PR is code complete and local testing has been performed.' 20 | }) 21 | env: 22 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 23 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to LLM Compressor 2 | 3 | Thank you for your interest in contributing to LLM Compressor! 4 | Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. 5 | There are several ways you can contribute to the project: 6 | 7 | - Identify and report any issues or bugs. 8 | - Request or add new compression methods or research. 9 | - Suggest or implement new features. 10 | 11 | However, remember that contributions aren't just about code. 12 | We believe in the power of community support; thus, answering queries, assisting others, and enhancing the documentation are highly regarded and beneficial contributions. 13 | 14 | Finally, one of the most impactful ways to support us is by raising awareness about LLM Compressor and the vLLM community. 15 | Talk about it in your blog posts, highlighting how it's driving your incredible projects. 16 | Express your support on Twitter if vLLM aids you, or simply offer your appreciation by starring our repository. 
17 | 18 | ## Setup for development 19 | 20 | ### Install from source 21 | 22 | ```bash 23 | pip install -e ./[dev] 24 | ``` 25 | 26 | ### Code Styling and Formatting checks 27 | 28 | ```bash 29 | make style 30 | make quality 31 | ``` 32 | 33 | ### Testing 34 | 35 | ```bash 36 | make test 37 | ``` 38 | 39 | ## Contributing Guidelines 40 | 41 | ### Issue Reporting 42 | 43 | If you encounter a bug or have a feature request, please check our issues page first to see if someone else has already reported it. 44 | If not, please file a new issue, providing as much relevant information as possible. 45 | 46 | ### Pull Requests & Code Reviews 47 | 48 | Please check the PR checklist in the [PR template](.github/PULL_REQUEST_TEMPLATE.md) for detailed guide for contribution. 49 | 50 | ### Thank You 51 | 52 | Finally, thank you for taking the time to read these guidelines and for your interest in contributing to LLM Compressor. 53 | Your contributions make LLM Compressor a great tool for everyone! 54 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | recursive-exclude src *.png *.jpg *.jpeg *.gif *.svg *.bmp *.webp 3 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | BUILDDIR := $(PWD) 2 | CHECKDIRS := src tests examples setup.py 3 | DOCDIR := docs 4 | 5 | BUILD_ARGS := # set nightly to build nightly release 6 | 7 | # refer to setup.py for allowed values for BUILD_TYPE 8 | BUILD_TYPE?=dev 9 | export BUILD_TYPE 10 | 11 | TARGETS := "" # targets for running pytests: deepsparse,keras,onnx,pytorch,pytorch_models,export,pytorch_datasets,tensorflow_v1,tensorflow_v1_models,tensorflow_v1_datasets 12 | PYTEST_ARGS ?= "" 13 | ifneq ($(findstring transformers,$(TARGETS)),transformers) 14 | PYTEST_ARGS := $(PYTEST_ARGS) --ignore tests/llmcompressor/transformers 15 | endif 16 | ifneq ($(findstring pytorch,$(TARGETS)),pytorch) 17 | PYTEST_ARGS := $(PYTEST_ARGS) --ignore tests/llmcompressor/pytorch 18 | endif 19 | ifneq ($(findstring examples,$(TARGETS)),examples) 20 | PYTEST_ARGS := $(PYTEST_ARGS) --ignore tests/examples 21 | endif 22 | 23 | # run checks on all files for the repo 24 | # leaving out mypy src for now 25 | quality: 26 | @echo "Running python quality checks"; 27 | ruff check $(CHECKDIRS); 28 | isort --check-only $(CHECKDIRS); 29 | flake8 $(CHECKDIRS) --max-line-length 88 --extend-ignore E203; 30 | 31 | # style the code according to accepted standards for the repo 32 | style: 33 | @echo "Running python styling"; 34 | ruff format $(CHECKDIRS); 35 | isort $(CHECKDIRS); 36 | flake8 $(CHECKDIRS) --max-line-length 88 --extend-ignore E203; 37 | 38 | # run tests for the repo 39 | test: 40 | @echo "Running python tests"; 41 | pytest tests $(PYTEST_ARGS) 42 | 43 | # creates wheel file 44 | .PHONY: build 45 | build: 46 | python3 setup.py sdist bdist_wheel $(BUILD_ARGS) 47 | 48 | # clean package 49 | clean: 50 | rm -fr .pytest_cache; 51 | rm -fr docs/_build docs/build; 52 | find $(CHECKDIRS) | grep -E "(__pycache__|\.pyc|\.pyo)" | xargs rm -fr; 53 | -------------------------------------------------------------------------------- /examples/big_models_with_accelerate/cpu_offloading_fp8.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForCausalLM, AutoTokenizer 2 | 3 | 
from llmcompressor import oneshot 4 | from llmcompressor.modifiers.quantization import QuantizationModifier 5 | 6 | MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct" 7 | OUTPUT_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" 8 | 9 | # Load model 10 | # Note: device_map="auto" will offload to CPU if not enough space on GPU. 11 | model = AutoModelForCausalLM.from_pretrained( 12 | MODEL_ID, device_map="auto", torch_dtype="auto", trust_remote_code=True 13 | ) 14 | 15 | # Configure the quantization scheme and algorithm (PTQ + FP8_DYNAMIC). 16 | recipe = QuantizationModifier( 17 | targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"] 18 | ) 19 | 20 | # Apply quantization and save in `compressed-tensors` format. 21 | oneshot( 22 | model=model, 23 | recipe=recipe, 24 | tokenizer=AutoTokenizer.from_pretrained(MODEL_ID), 25 | output_dir=OUTPUT_DIR, 26 | ) 27 | -------------------------------------------------------------------------------- /examples/compressed_inference/fp8_compressed_inference.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForCausalLM, AutoTokenizer 2 | 3 | """ 4 | This example covers how to load a quantized model using AutoModelForCausalLM. 5 | 6 | During inference, each layer will be decompressed as needed before the forward pass. 7 | This saves memory as only a single layer is ever uncompressed at a time, but increases 8 | runtime as we need to decompress each layer before running the forward pass 9 | 10 | """ 11 | 12 | # any model with the "compressed-tensors" quant_method and "compressed" 13 | # quantization_status in the quantization config is supported 14 | MODEL_STUB = "nm-testing/tinyllama-fp8-dynamic-compressed" 15 | 16 | SAMPLE_INPUT = [ 17 | "I love quantization because", 18 | "What is the capital of France?", 19 | "def fibonacci(n):", 20 | ] 21 | 22 | compressed_model = AutoModelForCausalLM.from_pretrained( 23 | MODEL_STUB, 24 | torch_dtype="auto", 25 | device_map="cuda:0", 26 | ) 27 | 28 | # tokenize the sample data 29 | tokenizer = AutoTokenizer.from_pretrained(MODEL_STUB) 30 | inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to( 31 | compressed_model.device 32 | ) 33 | 34 | # run the compressed model and decode the output 35 | output = compressed_model.generate(**inputs, max_length=50) 36 | print("========== SAMPLE GENERATION ==============") 37 | text_output = tokenizer.batch_decode(output) 38 | for sample in text_output: 39 | print(sample) 40 | -------------------------------------------------------------------------------- /examples/finetuning/configure_fsdp.md: -------------------------------------------------------------------------------- 1 | # Configuring FSDP for Sparse Finetuning 2 | 3 | An example FSDP configuration file, `example_fsdp_config.yaml`, is provided in this 4 | folder. It can be used out of the box by editing the `num_processes` parameter to 5 | fit the number of GPUs on your machine. 
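For reference, a minimal sketch of that edit (assuming a single node with 8 GPUs; adjust the count to match your hardware):

```yaml
# example_fsdp_config.yaml (excerpt)
num_machines: 1
num_processes: 8  # one process per GPU on this machine
```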
6 | 7 | You can also customize your own config file by running the following prompt 8 | ``` 9 | accelerate config 10 | ``` 11 | 12 | An FSDP config file can be passed to the LLM Compressor finetuning script like this: 13 | ``` 14 | accelerate launch --config_file example_fsdp_config.yaml --no_python llmcompressor.transformers.text_generation.finetune 15 | ``` 16 | -------------------------------------------------------------------------------- /examples/finetuning/example_alternating_recipe.yaml: -------------------------------------------------------------------------------- 1 | initial_sparsity_stage: 2 | run_type: oneshot 3 | obcq_modifiers: 4 | SparseGPTModifier: 5 | sparsity: 0.5 6 | block_size: 128 7 | percdamp: 0.01 8 | mask_structure: "0:0" 9 | targets: ["Linear"] 10 | ignore: ["re:.*lm_head"] 11 | initial_training_stage: 12 | run_type: train 13 | pruning_modifiers: 14 | ConstantPruningModifier: 15 | targets: '__ALL__' 16 | start: 0 17 | next_sparsity_stage: 18 | run_type: oneshot 19 | obcq_modifiers: 20 | SparseGPTModifier: 21 | sparsity: 0.7 22 | block_size: 128 23 | percdamp: 0.01 24 | mask_structure: "0:0" 25 | targets: ["Linear"] 26 | ignore: ["re:.*lm_head"] 27 | next_training_stage: 28 | run_type: train 29 | pruning_modifiers: 30 | ConstantPruningModifier: 31 | targets: '__ALL__' 32 | start: 0 -------------------------------------------------------------------------------- /examples/finetuning/example_fsdp_config.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: FSDP 4 | downcast_bf16: 'no' 5 | fsdp_config: 6 | fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP 7 | fsdp_backward_prefetch_policy: BACKWARD_PRE 8 | fsdp_cpu_ram_efficient_loading: false 9 | fsdp_forward_prefetch: false 10 | fsdp_offload_params: false 11 | fsdp_sharding_strategy: 1 12 | fsdp_state_dict_type: SHARDED_STATE_DICT 13 | fsdp_sync_module_states: true 14 | fsdp_use_orig_params: false 15 | machine_rank: 0 16 | main_training_function: main 17 | num_machines: 1 18 | num_processes: 4 19 | rdzv_backend: static 20 | same_network: true 21 | tpu_env: [] 22 | tpu_use_cluster: false 23 | tpu_use_sudo: false 24 | use_cpu: false 25 | -------------------------------------------------------------------------------- /examples/finetuning/example_single_gpu_config.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: 'NO' 4 | enable_cpu_affinity: false 5 | gpu_ids: 0 6 | machine_rank: 0 7 | main_training_function: main 8 | num_machines: 1 9 | num_processes: 1 10 | rdzv_backend: static 11 | same_network: true 12 | tpu_env: [] 13 | tpu_use_cluster: false 14 | tpu_use_sudo: false 15 | use_cpu: false -------------------------------------------------------------------------------- /examples/multimodal_vision/mistral3_chat_template.json: -------------------------------------------------------------------------------- 1 | { 2 | "chat_template": "{%- set default_system_message = \"You are Mistral Small 3, a Large Language Model (LLM) created by Mistral AI\" %}\n\n{{- bos_token }}\n\n{%- if messages[0]['role'] == 'system' %}\n {%- if messages[0]['content'] is string %}\n {%- set system_message = messages[0]['content'] %}\n {%- else %}\n {%- set system_message = messages[0]['content'][0]['text'] %}\n {%- endif %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set system_message = 
default_system_message %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{{- '[SYSTEM_PROMPT]' + system_message + '[/SYSTEM_PROMPT]' }}\n\n{%- for message in loop_messages %}\n {%- if message['role'] == 'user' %}\n {%- if message['content'] is string %}\n {{- '[INST]' + message['content'] + '[/INST]' }}\n {%- else %}\n {{- '[INST]' }}\n {%- for block in message['content'] %}\n {%- if block['type'] == 'text' %}\n {{- block['text'] }}\n {%- elif block['type'] in ['image', 'image_url'] %}\n {{- '[IMG]' }}\n {%- else %}\n {{- raise_exception('Only text and image blocks are supported in message content!') }}\n {%- endif %}\n {%- endfor %}\n {{- '[/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'system' %}\n {%- if message['content'] is string %}\n {{- '[SYSTEM_PROMPT]' + message['content'] + '[/SYSTEM_PROMPT]' }}\n {%- else %}\n {{- '[SYSTEM_PROMPT]' + message['content'][0]['text'] + '[/SYSTEM_PROMPT]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {%- if message['content'] is string %}\n {{- message['content'] + eos_token }}\n {%- else %}\n {{- message['content'][0]['text'] + eos_token }}\n {%- endif %}\n {%- else %}\n {{- raise_exception('Only user, system and assistant roles are supported!') }}\n {%- endif %}\n{%- endfor %}" 3 | } -------------------------------------------------------------------------------- /examples/quantization_2of4_sparse_w4a16/2of4_w4a16_group-128_recipe.yaml: -------------------------------------------------------------------------------- 1 | sparsity_stage: 2 | sparsity_modifiers: 3 | SparseGPTModifier: 4 | sparsity: 0.5 5 | mask_structure: "2:4" 6 | targets: ["Linear"] 7 | ignore: ["re:.*lm_head"] 8 | finetuning_stage: 9 | finetuning_modifiers: 10 | ConstantPruningModifier: 11 | targets: [ 12 | 're:.*q_proj.weight', 13 | 're:.*k_proj.weight', 14 | 're:.*v_proj.weight', 15 | 're:.*o_proj.weight', 16 | 're:.*gate_proj.weight', 17 | 're:.*up_proj.weight', 18 | 're:.*down_proj.weight', 19 | ] 20 | start: 0 21 | quantization_stage: 22 | quantization_modifiers: 23 | GPTQModifier: 24 | ignore: ["lm_head"] 25 | config_groups: 26 | group_0: 27 | weights: 28 | num_bits: 4 29 | type: "int" 30 | symmetric: true 31 | strategy: "group" 32 | group_size: 128 33 | targets: ["Linear"] 34 | -------------------------------------------------------------------------------- /examples/quantization_2of4_sparse_w4a16/2of4_w4a16_recipe.yaml: -------------------------------------------------------------------------------- 1 | sparsity_stage: 2 | sparsity_modifiers: 3 | SparseGPTModifier: 4 | sparsity: 0.5 5 | mask_structure: "2:4" 6 | targets: ["Linear"] 7 | ignore: ["re:.*lm_head"] 8 | finetuning_stage: 9 | finetuning_modifiers: 10 | ConstantPruningModifier: 11 | targets: [ 12 | 're:.*q_proj.weight', 13 | 're:.*k_proj.weight', 14 | 're:.*v_proj.weight', 15 | 're:.*o_proj.weight', 16 | 're:.*gate_proj.weight', 17 | 're:.*up_proj.weight', 18 | 're:.*down_proj.weight', 19 | ] 20 | start: 0 21 | quantization_stage: 22 | quantization_modifiers: 23 | GPTQModifier: 24 | ignore: ["lm_head"] 25 | config_groups: 26 | group_0: 27 | weights: 28 | num_bits: 4 29 | type: "int" 30 | symmetric: true 31 | strategy: "channel" 32 | targets: ["Linear"] 33 | -------------------------------------------------------------------------------- /examples/quantization_w4a16_fp4/llama3_example.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForCausalLM, AutoTokenizer 2 | 3 | from llmcompressor import oneshot 4 | 
from llmcompressor.modifiers.quantization import QuantizationModifier 5 | 6 | MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct" 7 | 8 | # Load model. 9 | model = AutoModelForCausalLM.from_pretrained( 10 | MODEL_ID, device_map="auto", torch_dtype="auto" 11 | ) 12 | tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) 13 | 14 | # Configure the quantization algorithm and scheme. 15 | # In this case, we: 16 | # * quantize the weights to fp4 with per group 16 via ptq 17 | recipe = QuantizationModifier(targets="Linear", scheme="NVFP4A16", ignore=["lm_head"]) 18 | 19 | # Apply quantization. 20 | oneshot(model=model, recipe=recipe) 21 | 22 | print("\n\n") 23 | print("========== SAMPLE GENERATION ==============") 24 | input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") 25 | output = model.generate(input_ids, max_new_tokens=100) 26 | print(tokenizer.decode(output[0])) 27 | print("==========================================\n\n") 28 | 29 | 30 | # Save to disk in compressed-tensors format. 31 | SAVE_DIR = MODEL_ID.split("/")[1] + "-NVFP4A16" 32 | model.save_pretrained(SAVE_DIR, save_compressed=True) 33 | tokenizer.save_pretrained(SAVE_DIR) 34 | -------------------------------------------------------------------------------- /examples/quantization_w8a8_fp8/gemma2_example.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForCausalLM, AutoTokenizer 2 | 3 | from llmcompressor import oneshot 4 | from llmcompressor.modifiers.quantization import QuantizationModifier 5 | 6 | MODEL_ID = "google/gemma-2-27b-it" 7 | 8 | # 1) Load model. 9 | model = AutoModelForCausalLM.from_pretrained( 10 | MODEL_ID, device_map="auto", torch_dtype="auto" 11 | ) 12 | tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) 13 | 14 | # 2) Configure the quantization algorithm and scheme. 15 | # In this case, we: 16 | # * quantize the weights to fp8 with per channel via ptq 17 | # * quantize the activations to fp8 with dynamic per token 18 | recipe = QuantizationModifier( 19 | targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"] 20 | ) 21 | 22 | # 3) Apply quantization and save in compressed-tensors format. 23 | OUTPUT_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" 24 | oneshot( 25 | model=model, 26 | recipe=recipe, 27 | tokenizer=tokenizer, 28 | output_dir=OUTPUT_DIR, 29 | ) 30 | 31 | # Confirm generations of the quantized model look sane. 32 | # NOTE: transformers 4.49.0 results in a generation error with gemma2. 33 | # Consider either downgrading your transformers version to a previous version 34 | # or use vLLM for sample generation. 35 | print("========== SAMPLE GENERATION ==============") 36 | input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") 37 | output = model.generate(input_ids, max_new_tokens=20) 38 | print(tokenizer.decode(output[0])) 39 | print("==========================================") 40 | -------------------------------------------------------------------------------- /examples/quantization_w8a8_fp8/llama3.2_vision_example.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoProcessor, MllamaForConditionalGeneration 2 | 3 | from llmcompressor import oneshot 4 | from llmcompressor.modifiers.quantization import QuantizationModifier 5 | 6 | MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct" 7 | 8 | # Load model. 
9 | model = MllamaForConditionalGeneration.from_pretrained( 10 | MODEL_ID, device_map="auto", torch_dtype="auto" 11 | ) 12 | processor = AutoProcessor.from_pretrained(MODEL_ID) 13 | 14 | # Configure the quantization algorithm and scheme. 15 | # In this case, we: 16 | # * quantize the weights to fp8 with per channel via ptq 17 | # * quantize the activations to fp8 with dynamic per token 18 | recipe = QuantizationModifier( 19 | targets="Linear", 20 | scheme="FP8_DYNAMIC", 21 | ignore=["re:.*lm_head", "re:multi_modal_projector.*", "re:vision_model.*"], 22 | ) 23 | 24 | # Apply quantization and save to disk in compressed-tensors format. 25 | SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" 26 | oneshot( 27 | model=model, 28 | recipe=recipe, 29 | output_dir=SAVE_DIR, 30 | ) 31 | processor.save_pretrained(SAVE_DIR) 32 | 33 | # Confirm generations of the quantized model look sane. 34 | print("========== SAMPLE GENERATION ==============") 35 | input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda") 36 | output = model.generate(input_ids, max_new_tokens=20) 37 | print(processor.decode(output[0])) 38 | print("==========================================") 39 | -------------------------------------------------------------------------------- /examples/quantization_w8a8_fp8/llama3_example.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForCausalLM, AutoTokenizer 2 | 3 | from llmcompressor import oneshot 4 | from llmcompressor.modifiers.quantization import QuantizationModifier 5 | 6 | MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" 7 | 8 | # Load model. 9 | model = AutoModelForCausalLM.from_pretrained( 10 | MODEL_ID, device_map="auto", torch_dtype="auto" 11 | ) 12 | tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) 13 | 14 | # Configure the quantization algorithm and scheme. 15 | # In this case, we: 16 | # * quantize the weights to fp8 with per channel via ptq 17 | # * quantize the activations to fp8 with dynamic per token 18 | recipe = QuantizationModifier( 19 | targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"] 20 | ) 21 | 22 | # Apply quantization. 23 | oneshot(model=model, recipe=recipe) 24 | 25 | # Confirm generations of the quantized model look sane. 26 | print("========== SAMPLE GENERATION ==============") 27 | input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") 28 | output = model.generate(input_ids, max_new_tokens=20) 29 | print(tokenizer.decode(output[0])) 30 | print("==========================================") 31 | 32 | # Save to disk in compressed-tensors format. 33 | SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" 34 | model.save_pretrained(SAVE_DIR) 35 | tokenizer.save_pretrained(SAVE_DIR) 36 | -------------------------------------------------------------------------------- /examples/quantization_w8a8_fp8/llava1.5_example.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoProcessor, LlavaForConditionalGeneration 2 | 3 | from llmcompressor import oneshot 4 | from llmcompressor.modifiers.quantization import QuantizationModifier 5 | 6 | MODEL_ID = "llava-hf/llava-1.5-7b-hf" 7 | 8 | # Load model. 9 | model = LlavaForConditionalGeneration.from_pretrained( 10 | MODEL_ID, device_map="auto", torch_dtype="auto" 11 | ) 12 | processor = AutoProcessor.from_pretrained(MODEL_ID) 13 | 14 | # Configure the quantization algorithm and scheme. 
15 | # In this case, we: 16 | # * quantize the weights to fp8 with per channel via ptq 17 | # * quantize the activations to fp8 with dynamic per token 18 | recipe = QuantizationModifier( 19 | targets="Linear", 20 | scheme="FP8_DYNAMIC", 21 | ignore=["re:.*lm_head", "re:multi_modal_projector.*", "re:vision_tower.*"], 22 | ) 23 | 24 | # Apply quantization and save to disk in compressed-tensors format. 25 | SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" 26 | oneshot(model=model, recipe=recipe, output_dir=SAVE_DIR) 27 | processor.save_pretrained(SAVE_DIR) 28 | 29 | # Confirm generations of the quantized model look sane. 30 | print("========== SAMPLE GENERATION ==============") 31 | input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda") 32 | output = model.generate(input_ids, max_new_tokens=20) 33 | print(processor.decode(output[0])) 34 | print("==========================================") 35 | -------------------------------------------------------------------------------- /examples/quantization_w8a8_fp8/qwen2vl_example.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoProcessor, Qwen2VLForConditionalGeneration 2 | 3 | from llmcompressor import oneshot 4 | from llmcompressor.modifiers.quantization import QuantizationModifier 5 | 6 | MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct" 7 | 8 | # Load model. 9 | model = Qwen2VLForConditionalGeneration.from_pretrained( 10 | MODEL_ID, device_map="auto", torch_dtype="auto" 11 | ) 12 | processor = AutoProcessor.from_pretrained(MODEL_ID) 13 | 14 | # Configure the quantization algorithm and scheme. 15 | # In this case, we: 16 | # * quantize the weights to fp8 with per channel via ptq 17 | # * quantize the activations to fp8 with dynamic per token 18 | recipe = QuantizationModifier( 19 | targets="Linear", 20 | scheme="FP8_DYNAMIC", 21 | ignore=["re:.*lm_head", "re:visual.*"], 22 | ) 23 | 24 | # Apply quantization and save to disk in compressed-tensors format. 25 | SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" 26 | oneshot(model=model, recipe=recipe, output_dir=SAVE_DIR) 27 | processor.save_pretrained(SAVE_DIR) 28 | 29 | # Confirm generations of the quantized model look sane. 30 | print("========== SAMPLE GENERATION ==============") 31 | input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda") 32 | output = model.generate(input_ids, max_new_tokens=20) 33 | print(processor.decode(output[0])) 34 | print("==========================================") 35 | -------------------------------------------------------------------------------- /examples/quantization_w8a8_fp8/whisper_example.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | from transformers import AutoProcessor, WhisperForConditionalGeneration 3 | 4 | from llmcompressor import oneshot 5 | from llmcompressor.modifiers.quantization import QuantizationModifier 6 | 7 | MODEL_ID = "openai/whisper-large-v2" 8 | 9 | # Load model. 10 | model = WhisperForConditionalGeneration.from_pretrained( 11 | MODEL_ID, device_map="auto", torch_dtype="auto" 12 | ) 13 | model.config.forced_decoder_ids = None 14 | processor = AutoProcessor.from_pretrained(MODEL_ID) 15 | processor.tokenizer.set_prefix_tokens(language="en", task="transcribe") 16 | 17 | # Configure the quantization algorithm and scheme. 
18 | # In this case, we: 19 | # * quantize the weights to fp8 with per channel via ptq 20 | # * quantize the activations to fp8 with dynamic per token 21 | recipe = QuantizationModifier( 22 | targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"] 23 | ) 24 | 25 | # Apply quantization. 26 | oneshot(model=model, recipe=recipe) 27 | 28 | # Confirm generations of the quantized model look sane. 29 | print("========== SAMPLE GENERATION ==============") 30 | ds = load_dataset( 31 | "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]" 32 | ) 33 | sample = ds[0]["audio"] 34 | input_features = processor( 35 | sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt" 36 | ).input_features 37 | input_features = input_features.to(model.device) 38 | output_ids = model.generate(input_features, language="en", forced_decoder_ids=None) 39 | print(processor.batch_decode(output_ids, skip_special_tokens=False)[0]) 40 | # Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel 41 | print("==========================================") 42 | 43 | # Save to disk in compressed-tensors format. 44 | SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" 45 | model.save_pretrained(SAVE_DIR, save_compressed=True) 46 | processor.save_pretrained(SAVE_DIR) 47 | -------------------------------------------------------------------------------- /examples/quantizing_moe/deepseek_recipe_w4a16.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | GPTQModifier: 4 | ignore: [lm_head, "re:.*mlp.gate$"] 5 | config_groups: 6 | group_0: 7 | weights: {num_bits: 4, type: int, symmetric: true, strategy: channel, dynamic: false} 8 | targets: [Linear] 9 | -------------------------------------------------------------------------------- /examples/trl_mixin/README.md: -------------------------------------------------------------------------------- 1 | # Sparse Finetuning with TRL's SFTTrainer 2 | 3 | The `SessionManagerMixIn` can be added to other Trainer classes that inherit from 4 | [Hugging Face's Trainer](https://huggingface.co/docs/transformers/en/main_classes/trainer). 5 | 6 | For example, we can add LLM Compressor support to TRL's SFTTrainer like so: 7 | 8 | Note: install `trl` using `pip install trl` 9 | 10 | ```python 11 | from trl import SFTTrainer as TRLSFTTrainer 12 | 13 | class SFTTrainer(SessionManagerMixIn, TRLSFTTrainer): 14 | ... 15 | ``` 16 | 17 | The new `SFTTrainer` class can now apply LLM Compressor recipes and modifiers during 18 | supervised finetuning, with full support for all of the original TRL features. The full 19 | class is defined in the script `sft_trainer.py` and requires very minimal 20 | additional code: just a dataset load override to support passing in tokenized datasets 21 | to the Trainer. 22 | 23 | ### Examples 24 | 25 | * Script `ex_trl_constant.py`: finetunes a 50% sparse Llama-7b model, 26 | using TRL's dataset preprocessing. Sparsity is maintained throughout training by 27 | applying a `ConstantPruningModifier` recipe to the `SFTTrainer` 28 | 29 | * Script `ex_trl_distillation.py`: finetunes a 50% sparse Llama-7b 30 | model using knowledge distillation from a dense Llama-7b model.
Sparsity is maintained 31 | throughout training with a `ConstantPruningModifier` and layer-wise knowledge 32 | distillation is handled by the `OutputDistillationModifier` -------------------------------------------------------------------------------- /examples/trl_mixin/ex_trl_constant.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | from sft_trainer import SFTTrainer 3 | from transformers import AutoModelForCausalLM, AutoTokenizer 4 | from trl import DataCollatorForCompletionOnlyLM 5 | 6 | from llmcompressor.args import ModelArguments 7 | 8 | model_path = "neuralmagic/Llama-2-7b-pruned50-retrained" 9 | output_dir = "./output_trl_sft_test_7b_gsm8k_sft_data" 10 | model = AutoModelForCausalLM.from_pretrained( 11 | model_path, torch_dtype="auto", device_map="auto" 12 | ) 13 | tokenizer = AutoTokenizer.from_pretrained(model_path) 14 | tokenizer.pad_token = tokenizer.eos_token 15 | 16 | # recipe for maintaining model sparsity during finetuning 17 | recipe = """ 18 | test_stage: 19 | pruning_modifiers: 20 | ConstantPruningModifier: 21 | targets: ['re:.*q_proj.weight', 're:.*k_proj.weight', 're:.*v_proj.weight', 22 | 're:.*o_proj.weight','re:.*gate_proj.weight', 're:.*up_proj.weight', 23 | 're:.*down_proj.weight'] 24 | start: 0 25 | """ 26 | 27 | # Load gsm8k using TRL dataset tools 28 | dataset = load_dataset("gsm8k", "main", split="train") 29 | 30 | 31 | def formatting_prompts_func(example): 32 | output_texts = [] 33 | for i in range(len(example["question"])): 34 | text = f"Question: {example['question'][i]}\n Answer: {example['answer'][i]}" 35 | output_texts.append(text) 36 | return output_texts 37 | 38 | 39 | response_template = "Answer:" 40 | collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer) 41 | 42 | trl_sft_config_args = dict( 43 | output_dir=output_dir, 44 | num_train_epochs=0.6, 45 | logging_steps=50, 46 | gradient_checkpointing=True, 47 | max_seq_length=512, 48 | ) 49 | model_args = ModelArguments(model=model) 50 | 51 | trainer = SFTTrainer( 52 | model=model, 53 | processing_class=tokenizer, 54 | recipe=recipe, 55 | train_dataset=dataset, 56 | formatting_func=formatting_prompts_func, 57 | data_collator=collator, 58 | trl_sft_config_args=trl_sft_config_args, 59 | model_args=model_args, 60 | ) 61 | trainer.train() 62 | -------------------------------------------------------------------------------- /examples/trl_mixin/sft_trainer.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional 2 | 3 | from trl import SFTConfig as TRLSFTConfig 4 | from trl import SFTTrainer as TRLSFTTrainer 5 | 6 | from llmcompressor.transformers.finetune.session_mixin import SessionManagerMixIn 7 | 8 | __all__ = ["SFTTrainer"] 9 | 10 | 11 | class SFTTrainer(SessionManagerMixIn, TRLSFTTrainer): 12 | def __init__(self, trl_sft_config_args: Optional[Dict] = None, *args, **kwargs): 13 | if trl_sft_config_args is not None: 14 | kwargs["args"] = TRLSFTConfig(**trl_sft_config_args) 15 | super().__init__(*args, **kwargs) 16 | 17 | def _prepare_dataset(self, dataset, *args, **kwargs): 18 | if "input_ids" in dataset.column_names: 19 | # dataset is already tokenized, skip preprocessing 20 | return dataset 21 | 22 | return super()._prepare_dataset(dataset, *args, **kwargs) 23 | -------------------------------------------------------------------------------- /pyproject.toml: 
-------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel", "setuptools_scm==8.2.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.black] 6 | line-length = 88 7 | target-version = ['py38'] 8 | 9 | [tool.isort] 10 | profile = "black" 11 | skip = ["src/llmcompressor/transformers/tracing/", "src/llmcompressor/version.py"] 12 | 13 | [tool.mypy] 14 | files = "src/llmcompressor" 15 | 16 | [tool.ruff] 17 | exclude = ["build", "dist", "env", ".venv", "src/llmcompressor/transformers/tracing/"] 18 | lint.select = ["E", "F", "W"] 19 | 20 | [tool.flake8] 21 | max-line-length = 88 22 | extend-ignore = 'E203' 23 | 24 | [tool.pytest.ini_options] 25 | markers = [ 26 | "smoke: quick tests to check basic functionality", 27 | "sanity: tests to ensure that new changes do not break existing functionality", 28 | "regression: detailed tests to ensure major functions work correctly", 29 | "integration: tests which integrate with a third party service such as HF", 30 | "unit: tests to ensure code correctness and regression test functionality", 31 | "example: tests for content in the 'examples' folder", 32 | "multi_gpu: tests that require multiple GPUs", 33 | ] 34 | tmp_path_retention_policy = "failed" 35 | -------------------------------------------------------------------------------- /src/llmcompressor/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | A library for compressing large language models utilizing the latest techniques and 3 | research in the field for both training-aware and post-training techniques. 4 | 5 | The library is designed to be flexible and easy to use on top of 6 | PyTorch and HuggingFace Transformers, allowing for quick experimentation. 
7 | """ 8 | 9 | # flake8: noqa 10 | 11 | from .logger import LoggerConfig, configure_logger, logger 12 | from .version import __version__, version 13 | 14 | __all__ = [ 15 | "__version__", 16 | "version", 17 | "configure_logger", 18 | "logger", 19 | "LoggerConfig", 20 | ] 21 | 22 | from llmcompressor.core.session_functions import ( 23 | active_session, 24 | callbacks, 25 | create_session, 26 | reset_session, 27 | ) 28 | from llmcompressor.entrypoints import Oneshot, oneshot, train 29 | -------------------------------------------------------------------------------- /src/llmcompressor/args/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .dataset_arguments import DatasetArguments 4 | from .model_arguments import ModelArguments 5 | from .recipe_arguments import RecipeArguments 6 | from .training_arguments import TrainingArguments 7 | from .utils import parse_args 8 | -------------------------------------------------------------------------------- /src/llmcompressor/args/recipe_arguments.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import List, Optional 3 | 4 | 5 | @dataclass 6 | class RecipeArguments: 7 | """Recipe and session variables""" 8 | 9 | recipe: Optional[str] = field( 10 | default=None, 11 | metadata={ 12 | "help": "Path to a LLM Compressor sparsification recipe", 13 | }, 14 | ) 15 | recipe_args: Optional[List[str]] = field( 16 | default=None, 17 | metadata={ 18 | "help": ( 19 | "List of recipe arguments to evaluate, of the format key1=value1 " 20 | "key2=value2" 21 | ) 22 | }, 23 | ) 24 | clear_sparse_session: Optional[bool] = field( 25 | default=False, 26 | metadata={ 27 | "help": ( 28 | "Whether to clear CompressionSession/CompressionLifecycle ", 29 | "data between runs.", 30 | ) 31 | }, 32 | ) 33 | stage: Optional[str] = field( 34 | default=None, 35 | metadata={"help": ("The stage of the recipe to use for oneshot / train.",)}, 36 | ) 37 | -------------------------------------------------------------------------------- /src/llmcompressor/args/training_arguments.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Optional 3 | 4 | from transformers import TrainingArguments as HFTrainingArgs 5 | 6 | __all__ = [ 7 | "TrainingArguments", 8 | ] 9 | 10 | 11 | @dataclass 12 | class TrainingArguments(HFTrainingArgs): 13 | """ 14 | Training arguments specific to LLM Compressor Transformers workflow using 15 | HFTrainingArgs as base class 16 | 17 | """ 18 | 19 | do_oneshot: Optional[bool] = field( 20 | default=False, 21 | metadata={"help": "Whether to run one-shot calibration in stages"}, 22 | ) 23 | run_stages: Optional[bool] = field( 24 | default=False, metadata={"help": "Whether to trigger recipe stage by stage"} 25 | ) 26 | output_dir: str = field( 27 | default="./output", 28 | metadata={ 29 | "help": "The output directory where the model safetensors, " 30 | "recipe, config, and optionally checkpoints will be written." 
31 | }, 32 | ) 33 | 34 | @property 35 | def place_model_on_device(self): 36 | return False 37 | -------------------------------------------------------------------------------- /src/llmcompressor/core/__init__.py: -------------------------------------------------------------------------------- 1 | from llmcompressor.core.events import Event, EventType 2 | from llmcompressor.core.lifecycle import CompressionLifecycle 3 | from llmcompressor.core.model_layer import ModelParameterizedLayer 4 | from llmcompressor.core.session import CompressionSession 5 | from llmcompressor.core.session_functions import ( 6 | LifecycleCallbacks, 7 | active_session, 8 | callbacks, 9 | create_session, 10 | reset_session, 11 | ) 12 | from llmcompressor.core.state import Data, Hardware, ModifiedState, State 13 | 14 | __all__ = [ 15 | "Event", 16 | "EventType", 17 | "State", 18 | "Data", 19 | "Hardware", 20 | "ModifiedState", 21 | "ModelParameterizedLayer", 22 | "CompressionLifecycle", 23 | "CompressionSession", 24 | "create_session", 25 | "active_session", 26 | "reset_session", 27 | "apply", 28 | "callbacks", 29 | "LifecycleCallbacks", 30 | ] 31 | -------------------------------------------------------------------------------- /src/llmcompressor/core/events/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | LLM Compressor Core Events Package 3 | 4 | This package provides the core components and lifecycle management for events 5 | used in the LLM Compressor framework. It includes definitions for various 6 | event types and lifecycles that are critical for managing the state and 7 | execution flow of the model compression and training processes. 8 | """ 9 | 10 | from .event import Event, EventType 11 | 12 | __all__ = ["Event", "EventType"] 13 | -------------------------------------------------------------------------------- /src/llmcompressor/core/model_layer.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Any 3 | 4 | __all__ = ["ModelParameterizedLayer"] 5 | 6 | 7 | @dataclass 8 | class ModelParameterizedLayer: 9 | """ 10 | A dataclass for holding a parameter and its layer 11 | 12 | :param layer_name: the name of the layer 13 | :param layer: the layer object 14 | :param param_name: the name of the parameter 15 | :param param: the parameter object 16 | """ 17 | 18 | layer_name: str 19 | layer: Any 20 | param_name: str 21 | param: Any 22 | -------------------------------------------------------------------------------- /src/llmcompressor/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .utils import ( 4 | format_calibration_data, 5 | get_calibration_dataloader, 6 | get_processed_dataset, 7 | make_dataset_splits, 8 | ) 9 | -------------------------------------------------------------------------------- /src/llmcompressor/entrypoints/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from .oneshot import Oneshot, oneshot 3 | from .train import train 4 | from .utils import post_process, pre_process 5 | -------------------------------------------------------------------------------- /src/llmcompressor/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .logger import * 4 | 
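The `llmcompressor.core` package above re-exports the compression-session helpers; a rough
sketch of how they compose follows (hedged -- it assumes `create_session` acts as a context
manager scoping a fresh `CompressionSession`, which this listing does not show):

    from llmcompressor.core import active_session, create_session, reset_session

    # Assumed usage: scope one compression run to its own session.
    with create_session():
        session = active_session()
        # ... initialize a recipe and drive calibration through `session` / `callbacks` ...

    # Clear any lingering lifecycle state before an independent follow-up run.
    reset_session()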
-------------------------------------------------------------------------------- /src/llmcompressor/metrics/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .frequency_manager import * 4 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/__init__.py: -------------------------------------------------------------------------------- 1 | from .factory import ModifierFactory 2 | from .interface import ModifierInterface 3 | from .modifier import Modifier 4 | from .stage import StageModifiers 5 | 6 | __all__ = [ 7 | "ModifierFactory", 8 | "ModifierInterface", 9 | "Modifier", 10 | "StageModifiers", 11 | ] 12 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/awq/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .base import * 4 | from .mappings import * 5 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/distillation/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .output import * 4 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/distillation/output/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .base import * 4 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/distillation/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/src/llmcompressor/modifiers/distillation/utils/__init__.py -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/distillation/utils/pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .kd_factory import * 4 | from .kd_wrapper import * 5 | from .model_wrapper import * 6 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/experimental/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/src/llmcompressor/modifiers/experimental/__init__.py -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/interface.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | from llmcompressor.core.events import Event 4 | from llmcompressor.core.state import State 5 | 6 | __all__ = ["ModifierInterface"] 7 | 8 | 9 | class ModifierInterface(ABC): 10 | """ 11 | Defines the contract that all modifiers must implement 12 | """ 13 | 14 | @property 15 | @abstractmethod 16 | def initialized(self) -> bool: 17 | """ 18 | :return: True if the modifier has been initialized 19 | """ 20 | raise NotImplementedError() 21 | 22 | @property 23 | @abstractmethod 24 | def finalized(self) -> bool: 25 | """ 26 | :return: True if the modifier has been finalized 27 | """ 28 | raise 
NotImplementedError() 29 | 30 | @abstractmethod 31 | def initialize(self, state: State, **kwargs): 32 | """ 33 | Initialize the modifier 34 | 35 | :param state: The current state of the model 36 | :param kwargs: Additional keyword arguments 37 | for modifier initialization 38 | """ 39 | raise NotImplementedError() 40 | 41 | @abstractmethod 42 | def finalize(self, state: State, **kwargs): 43 | """ 44 | Finalize the modifier 45 | 46 | :param state: The current state of the model 47 | :param kwargs: Additional keyword arguments for 48 | modifier finalization 49 | """ 50 | raise NotImplementedError() 51 | 52 | @abstractmethod 53 | def update_event(self, state: State, event: Event, **kwargs): 54 | """ 55 | Update the modifier based on the event 56 | 57 | :param state: The current state of the model 58 | :param event: The event to update the modifier with 59 | :param kwargs: Additional keyword arguments for 60 | modifier update 61 | """ 62 | raise NotImplementedError() 63 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/logarithmic_equalization/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .base import * 4 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/obcq/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .base import * 4 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/pruning/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .constant import * 4 | from .magnitude import * 5 | from .wanda import * 6 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/pruning/constant/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .base import ConstantPruningModifier 4 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/pruning/magnitude/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .base import MagnitudePruningModifier 4 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/pruning/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/src/llmcompressor/modifiers/pruning/utils/__init__.py -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/pruning/utils/pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .layer_mask import * 4 | from .mask_factory import * 5 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/pruning/wanda/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .base import * 4 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/quantization/__init__.py: 
-------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .cache import * 4 | from .gptq import * 5 | from .quantization import * 6 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/quantization/gptq/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .base import * 4 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/quantization/quantization/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .base import * 4 | from .mixin import * 5 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/smoothquant/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .base import * 4 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .constants import * 4 | from .helpers import * 5 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/utils/constants.py: -------------------------------------------------------------------------------- 1 | __all__ = ["SPARSITY_THRESHOLD"] 2 | 3 | SPARSITY_THRESHOLD: float = 0.05 4 | -------------------------------------------------------------------------------- /src/llmcompressor/modifiers/utils/pytorch_helpers.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | import torch 4 | from torch.nn import Module 5 | 6 | __all__ = [ 7 | "apply_pad_mask_to_batch", 8 | "is_moe_model", 9 | ] 10 | 11 | 12 | def apply_pad_mask_to_batch(batch: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: 13 | """ 14 | Apply a mask to the input ids of a batch. 
This is used to zero out 15 | padding tokens so they do not contribute to the hessian calculation in the 16 | GPTQ and SparseGPT algorithms 17 | 18 | Assumes that `attention_mask` only contains zeros and ones 19 | 20 | :param batch: batch to apply padding to if it exists 21 | :return: batch with padding zeroed out in the input_ids 22 | """ 23 | if "attention_mask" in batch: 24 | for key in ("input_ids", "decoder_input_ids"): 25 | if key in batch: 26 | batch[key] = batch[key] * batch["attention_mask"] 27 | 28 | return batch 29 | 30 | 31 | def is_moe_model(model: Module) -> bool: 32 | """ 33 | Check if the model is a mixture of experts model 34 | 35 | :param model: the model to check 36 | :return: True if the model is a mixture of experts model 37 | """ 38 | 39 | # Check for MoE components 40 | for _, module in model.named_modules(): 41 | module_name = module.__class__.__name__ 42 | if "MoE" in module_name or "Expert" in module_name: 43 | return True 44 | 45 | # Check config for MoE attributes 46 | if hasattr(model, "config"): 47 | if any( 48 | "moe" in attr.lower() or "expert" in attr.lower() 49 | for attr in dir(model.config) 50 | ): 51 | return True 52 | 53 | return False 54 | -------------------------------------------------------------------------------- /src/llmcompressor/observers/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # isort: skip_file 3 | 4 | from .helpers import * 5 | from .base import * 6 | from .min_max import * 7 | from .mse import * 8 | -------------------------------------------------------------------------------- /src/llmcompressor/observers/helpers.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | import torch 4 | 5 | __all__ = ["get_observer_token_count"] 6 | 7 | 8 | def get_observer_token_count(module: torch.nn.Module) -> Counter: 9 | """ 10 | Parse the module and return the number of tokens observed by 11 | each module's observer. 
12 | 13 | :param module: module to parse 14 | :return: counter with the number of tokens observed by each observer 15 | """ 16 | token_counts = Counter() 17 | for name, module in module.named_modules(): 18 | if name.endswith(".input_observer"): 19 | token_counts[name.replace(".input_observer", "")] = ( 20 | module._num_observed_tokens 21 | ) 22 | return token_counts 23 | -------------------------------------------------------------------------------- /src/llmcompressor/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # populate registry 3 | from .basic import * 4 | from .data_free import * 5 | from .independent import * 6 | from .layer_sequential import * 7 | from .registry import * 8 | from .sequential import * 9 | -------------------------------------------------------------------------------- /src/llmcompressor/pipelines/basic/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from .pipeline import * 3 | -------------------------------------------------------------------------------- /src/llmcompressor/pipelines/basic/pipeline.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING, Union 2 | 3 | import torch 4 | import tqdm 5 | from compressed_tensors.utils import get_execution_device 6 | from torch.utils.data.dataloader import DataLoader 7 | 8 | from llmcompressor.core import LifecycleCallbacks 9 | from llmcompressor.modifiers.utils.pytorch_helpers import apply_pad_mask_to_batch 10 | from llmcompressor.pipelines.registry import CalibrationPipeline 11 | from llmcompressor.pytorch.utils.helpers import tensors_to_device 12 | from llmcompressor.utils.helpers import calibration_forward_context 13 | 14 | if TYPE_CHECKING: 15 | from llmcompressor.args.dataset_arguments import DatasetArguments 16 | 17 | __all__ = ["BasicPipeline", "run_calibration"] 18 | 19 | 20 | @CalibrationPipeline.register("basic") 21 | class BasicPipeline(CalibrationPipeline): 22 | @staticmethod 23 | def __call__( 24 | model: torch.nn.Module, 25 | dataloader: DataLoader, 26 | dataset_args: Union["DatasetArguments", None], 27 | ): 28 | """ 29 | Run a basic data pipeline. 30 | 31 | Batches are fetched from the data loader and are used to perform forward passes 32 | through the model. 
This pipeline is typically used for basic model calibration 33 | and, unlike the sequential pipelines, does not propagate compression error when 34 | used to calibrate model compression 35 | 36 | :param model: model being calibrated 37 | :param dataloader: loads data for calibration 38 | :param dataset_args: dataset arguments relevant to pipelines 39 | """ 40 | model_device = get_execution_device(model) 41 | 42 | LifecycleCallbacks.calibration_epoch_start() 43 | 44 | with calibration_forward_context(model): 45 | for batch in tqdm.tqdm(dataloader, desc="Calibrating"): 46 | batch = apply_pad_mask_to_batch(batch) 47 | batch = tensors_to_device(batch, model_device) 48 | model(**batch) 49 | 50 | LifecycleCallbacks.calibration_epoch_end() 51 | 52 | 53 | def run_calibration(model: torch.nn.Module, dataloader: DataLoader): 54 | pipeline = BasicPipeline() 55 | pipeline(model, dataloader, None) 56 | -------------------------------------------------------------------------------- /src/llmcompressor/pipelines/data_free/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from .pipeline import * 3 | -------------------------------------------------------------------------------- /src/llmcompressor/pipelines/data_free/pipeline.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING, Optional 2 | 3 | import torch 4 | from torch.utils.data.dataloader import DataLoader 5 | 6 | from llmcompressor.core.session_functions import LifecycleCallbacks 7 | from llmcompressor.pipelines.registry import CalibrationPipeline 8 | 9 | if TYPE_CHECKING: 10 | from llmcompressor.args.dataset_arguments import DatasetArguments 11 | 12 | __all__ = ["DataFreePipeline"] 13 | 14 | 15 | @CalibrationPipeline.register("datafree") 16 | class DataFreePipeline(CalibrationPipeline): 17 | @staticmethod 18 | def __call__( 19 | model: torch.nn.Module, 20 | dataloader: Optional[DataLoader], 21 | dataset_args: "DatasetArguments", 22 | ): 23 | """ 24 | A pipeline for data-free calibration 25 | 26 | :param model: model being calibrated 27 | :param dataloader: loads data for calibration 28 | :param dataset_args: dataset arguments relevant to pipelines 29 | """ 30 | LifecycleCallbacks.calibration_epoch_start() 31 | LifecycleCallbacks.calibration_epoch_end() 32 | -------------------------------------------------------------------------------- /src/llmcompressor/pipelines/independent/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from .pipeline import * 3 | -------------------------------------------------------------------------------- /src/llmcompressor/pipelines/independent/pipeline.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | import torch 4 | from loguru import logger 5 | from torch.utils.data.dataloader import DataLoader 6 | 7 | from llmcompressor.core import active_session 8 | from llmcompressor.modifiers.stage import StageModifiers 9 | from llmcompressor.pipelines.registry import CalibrationPipeline 10 | from llmcompressor.utils.helpers import patch_attr 11 | 12 | if TYPE_CHECKING: 13 | from llmcompressor.args.dataset_arguments import DatasetArguments 14 | 15 | __all__ = ["IndependentPipeline"] 16 | 17 | 18 | @CalibrationPipeline.register("independent") 19 | class IndependentPipeline(CalibrationPipeline): 20 | @staticmethod 21 | def __call__( 22 | model: 
torch.nn.Module, 23 | dataloader: DataLoader, 24 | dataset_args: "DatasetArguments", 25 | ): 26 | """ 27 | Data pipeline where each modifier is assigned its own calibration epoch and data 28 | pipeline 29 | 30 | :param model: model being calibrated 31 | :param dataloader: loads data for calibration 32 | :param dataset_args: dataset arguments relevant to pipelines 33 | """ 34 | _logger = logger.patch(lambda r: r.update(function="IndependentPipeline")) 35 | 36 | session = active_session() 37 | modifiers = session.get_modifiers() 38 | with patch_attr(session.lifecycle, "modifiers", None): 39 | for index, modifier in enumerate(modifiers): 40 | mod_type = str(type(modifier).__name__) 41 | session.lifecycle.modifiers = [ 42 | StageModifiers(modifiers=[modifier], group=mod_type, index=index) 43 | ] 44 | 45 | pipeline = CalibrationPipeline.from_modifiers([modifier]) 46 | pipeline_name = pipeline.__class__.__name__ 47 | _logger.info(f"Inferred `{pipeline_name}` for `{mod_type}`") 48 | 49 | pipeline(model, dataloader, dataset_args) 50 | 51 | # restore modifiers on exit so model can be compressed based on recipe 52 | -------------------------------------------------------------------------------- /src/llmcompressor/pipelines/layer_sequential/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from .pipeline import * 3 | -------------------------------------------------------------------------------- /src/llmcompressor/pipelines/sequential/README.md: -------------------------------------------------------------------------------- 1 | # Sequential Pipeline # 2 | The sequential pipeline is a data pipeline, primarily used for compressing models with the 3 | [GPTQModifier](/src/llmcompressor/modifiers/quantization/gptq/base.py) or the 4 | [SparseGPTModifier](/src/llmcompressor/modifiers/obcq/base.py). 5 | -------------------------------------------------------------------------------- /src/llmcompressor/pipelines/sequential/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from .helpers import get_targets_from_modifiers 3 | from .pipeline import * 4 | -------------------------------------------------------------------------------- /src/llmcompressor/pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functionality for working with and sparsifying Models in the PyTorch framework 3 | """ 4 | 5 | import os 6 | import warnings 7 | 8 | from packaging import version 9 | 10 | try: 11 | import torch 12 | 13 | _PARSED_TORCH_VERSION = version.parse(torch.__version__) 14 | 15 | if _PARSED_TORCH_VERSION.major >= 2: 16 | torch_compile_func = torch.compile 17 | 18 | def raise_torch_compile_warning(*args, **kwargs): 19 | warnings.warn( 20 | "torch.compile is not supported by llmcompressor for torch 2.0.x" 21 | ) 22 | return torch_compile_func(*args, **kwargs) 23 | 24 | torch.compile = raise_torch_compile_warning 25 | 26 | _BYPASS = bool(int(os.environ.get("NM_BYPASS_TORCH_VERSION", "0"))) 27 | if _PARSED_TORCH_VERSION.major == 1 and _PARSED_TORCH_VERSION.minor in [10, 11]: 28 | if not _BYPASS: 29 | raise RuntimeError( 30 | "llmcompressor does not support torch==1.10.* or 1.11.*. " 31 | f"Found torch version {torch.__version__}.\n\n" 32 | "To bypass this error, set environment variable " 33 | "`NM_BYPASS_TORCH_VERSION` to '1'.\n\n" 34 | "Bypassing may result in errors or " 35 | "incorrect behavior, so set at your own risk." 
36 | ) 37 | else: 38 | warnings.warn( 39 | "llmcompressor quantized onnx export does not work " 40 | "with torch==1.10.* or 1.11.*" 41 | ) 42 | except ImportError: 43 | pass 44 | 45 | # flake8: noqa 46 | -------------------------------------------------------------------------------- /src/llmcompressor/pytorch/model_load/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/src/llmcompressor/pytorch/model_load/__init__.py -------------------------------------------------------------------------------- /src/llmcompressor/pytorch/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generic code used as utilities and helpers for PyTorch 3 | """ 4 | 5 | # flake8: noqa 6 | 7 | from .helpers import * 8 | from .sparsification import * 9 | -------------------------------------------------------------------------------- /src/llmcompressor/pytorch/utils/sparsification_info/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/src/llmcompressor/pytorch/utils/sparsification_info/__init__.py -------------------------------------------------------------------------------- /src/llmcompressor/recipe/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import RecipeBase 2 | from .metadata import DatasetMetaData, LayerMetaData, ModelMetaData, ParamMetaData 3 | from .modifier import RecipeModifier 4 | from .recipe import Recipe, RecipeArgsInput, RecipeInput, RecipeStageInput 5 | from .stage import RecipeStage 6 | 7 | __all__ = [ 8 | "DatasetMetaData", 9 | "ParamMetaData", 10 | "LayerMetaData", 11 | "ModelMetaData", 12 | "RecipeBase", 13 | "RecipeModifier", 14 | "RecipeStage", 15 | "Recipe", 16 | "RecipeInput", 17 | "RecipeStageInput", 18 | "RecipeArgsInput", 19 | ] 20 | -------------------------------------------------------------------------------- /src/llmcompressor/recipe/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Any 3 | 4 | from pydantic import BaseModel, ConfigDict 5 | 6 | __all__ = ["RecipeBase"] 7 | 8 | 9 | class RecipeBase(BaseModel, ABC): 10 | """ 11 | Defines the contract that `Recipe` and its components 12 | such as `RecipeModifier` and `RecipeStage` must follow. 
13 | 14 | All inheritors of this class must implement the following methods: 15 | - calculate_start 16 | - calculate_end 17 | - evaluate 18 | - create_modifier 19 | """ 20 | 21 | model_config = ConfigDict(arbitrary_types_allowed=True) 22 | 23 | @abstractmethod 24 | def create_modifier(self) -> Any: 25 | raise NotImplementedError() 26 | -------------------------------------------------------------------------------- /src/llmcompressor/recipe/metadata.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List, Optional 2 | 3 | from pydantic import BaseModel, Field 4 | 5 | __all__ = [ 6 | "DatasetMetaData", 7 | "ParamMetaData", 8 | "LayerMetaData", 9 | "ModelMetaData", 10 | ] 11 | 12 | 13 | class DatasetMetaData(BaseModel): 14 | name: str = None 15 | version: str = None 16 | hash: str = None 17 | shape: List[int] = Field(default_factory=list) 18 | num_classes: int = None 19 | num_train_samples: int = None 20 | num_val_samples: int = None 21 | num_test_samples: int = None 22 | 23 | 24 | class ParamMetaData(BaseModel): 25 | name: str = None 26 | shape: List[int] = None 27 | weight_hash: str = None 28 | 29 | 30 | class LayerMetaData(BaseModel): 31 | name: str = None 32 | type: str = None 33 | index: int = None 34 | attributes: Dict[str, Any] = None 35 | input_shapes: List[List[int]] = None 36 | output_shapes: List[List[int]] = None 37 | params: Dict[str, ParamMetaData] = None 38 | 39 | 40 | class ModelMetaData(BaseModel): 41 | architecture: str = None 42 | sub_architecture: str = None 43 | input_shapes: List[List[int]] = None 44 | output_shapes: List[List[int]] = None 45 | layers: List[LayerMetaData] = Field(default_factory=list) 46 | layer_prefix: Optional[str] = None 47 | -------------------------------------------------------------------------------- /src/llmcompressor/sentinel.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | 3 | from pydantic_core import core_schema 4 | 5 | _registry = {} 6 | 7 | 8 | class Sentinel: 9 | """ 10 | Unique sentinel values. 
Implements https://peps.python.org/pep-0661/ 11 | with dummy pydantic validation 12 | """ 13 | 14 | def __new__(cls, name, module_name=None): 15 | name = str(name) 16 | 17 | if module_name is None: 18 | module_name = inspect.currentframe().f_globals.get("__file__") 19 | if module_name is None: 20 | module_name = __name__ 21 | 22 | registry_key = f"{module_name}-{name}" 23 | 24 | sentinel = _registry.get(registry_key, None) 25 | if sentinel is not None: 26 | return sentinel 27 | 28 | sentinel = super().__new__(cls) 29 | sentinel._name = name 30 | sentinel._module_name = module_name 31 | 32 | return _registry.setdefault(registry_key, sentinel) 33 | 34 | def __repr__(self): 35 | return self._name 36 | 37 | def __reduce__(self): 38 | return ( 39 | self.__class__, 40 | ( 41 | self._name, 42 | self._module_name, 43 | ), 44 | ) 45 | 46 | @classmethod 47 | def __get_pydantic_core_schema__(cls, _source_type, _handler): 48 | return core_schema.no_info_plain_validator_function(cls.validate) 49 | 50 | @classmethod 51 | def validate(cls, value: "Sentinel") -> "Sentinel": 52 | return value 53 | -------------------------------------------------------------------------------- /src/llmcompressor/transformers/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tools for integrating LLM Compressor with transformers training flows 3 | """ 4 | 5 | # flake8: noqa 6 | 7 | # isort: skip_file 8 | # (import order matters for circular import avoidance) 9 | from .utils import * 10 | 11 | from .sparsification import ( 12 | SparseAutoModelForCausalLM, 13 | ) 14 | from .finetune import * 15 | -------------------------------------------------------------------------------- /src/llmcompressor/transformers/compression/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/src/llmcompressor/transformers/compression/__init__.py -------------------------------------------------------------------------------- /src/llmcompressor/transformers/finetune/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .data import TextGenerationDataset 4 | from .session_mixin import SessionManagerMixIn 5 | from .text_generation import apply, oneshot, train 6 | -------------------------------------------------------------------------------- /src/llmcompressor/transformers/finetune/data/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .base import TextGenerationDataset 4 | from .c4 import C4Dataset 5 | from .cnn_dailymail import CNNDailyMailDataset 6 | from .custom import CustomDataset 7 | from .evolcodealpaca import EvolCodeAlpacaDataset 8 | from .flickr_30k import Flickr30K 9 | from .gsm8k import GSM8KDataset 10 | from .open_platypus import OpenPlatypusDataset 11 | from .peoples_speech import PeoplesSpeech 12 | from .ptb import PtbDataset 13 | from .ultrachat_200k import UltraChatDataset 14 | from .wikitext import WikiTextDataset 15 | -------------------------------------------------------------------------------- /src/llmcompressor/transformers/finetune/data/c4.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from typing import TYPE_CHECKING 3 | 4 | from llmcompressor.transformers.finetune.data import TextGenerationDataset 5 | from 
llmcompressor.typing import Processor 6 | 7 | if TYPE_CHECKING: 8 | from llmcompressor.args import DatasetArguments 9 | 10 | 11 | @TextGenerationDataset.register(name="c4") 12 | class C4Dataset(TextGenerationDataset): 13 | """ 14 | Child text generation class for the C4 dataset 15 | 16 | :param dataset_args: configuration settings for dataset loading 17 | :param split: split from dataset to load, for instance `test` or `train[:5%]` 18 | :param processor: processor or tokenizer to use on dataset 19 | """ 20 | 21 | def __init__( 22 | self, dataset_args: "DatasetArguments", split: str, processor: Processor 23 | ): 24 | dataset_args = deepcopy(dataset_args) 25 | dataset_args.dataset = "allenai/c4" 26 | dataset_args.text_column = "text" 27 | 28 | super().__init__(dataset_args=dataset_args, split=split, processor=processor) 29 | -------------------------------------------------------------------------------- /src/llmcompressor/transformers/finetune/data/cnn_dailymail.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from typing import TYPE_CHECKING 3 | 4 | from llmcompressor.transformers.finetune.data import TextGenerationDataset 5 | from llmcompressor.typing import Processor 6 | 7 | if TYPE_CHECKING: 8 | from llmcompressor.args import DatasetArguments 9 | 10 | 11 | @TextGenerationDataset.register(name="cnn_dailymail") 12 | class CNNDailyMailDataset(TextGenerationDataset): 13 | """ 14 | Text generation class for the CNN/DailyMail dataset 15 | 16 | :param dataset_args: configuration settings for dataset loading 17 | :param split: split from dataset to load, for instance `test` or `train[:5%]` 18 | :param processor: processor or tokenizer to use on dataset 19 | """ 20 | 21 | SAMPLE_TEMPLATE = "Article:\n{article}\n\n### Summarization:\n{highlights}\n" 22 | 23 | def __init__( 24 | self, dataset_args: "DatasetArguments", split: str, processor: Processor 25 | ): 26 | dataset_args = deepcopy(dataset_args) 27 | dataset_args.dataset = "cnn_dailymail" 28 | dataset_args.dataset_config_name = "3.0.0" 29 | 30 | super().__init__(dataset_args=dataset_args, split=split, processor=processor) 31 | 32 | def dataset_template(self, sample): 33 | return { 34 | "text": self.SAMPLE_TEMPLATE.format( 35 | article=sample["article"], highlights=sample["highlights"] 36 | ) 37 | } 38 | -------------------------------------------------------------------------------- /src/llmcompressor/transformers/finetune/data/custom.py: -------------------------------------------------------------------------------- 1 | from llmcompressor.transformers.finetune.data import TextGenerationDataset 2 | 3 | 4 | @TextGenerationDataset.register(name="custom", alias=["json", "csv"]) 5 | class CustomDataset(TextGenerationDataset): 6 | """ 7 | Child text generation class for custom local dataset supporting load 8 | for csv and json 9 | 10 | :param dataset_args: configuration settings for dataset loading 11 | :param split: split from dataset to load, for instance `test` or `train[:5%]` 12 | Can also be set to None to load all the splits 13 | :param processor: processor or tokenizer to use on dataset 14 | 15 | """ 16 | 17 | pass 18 | -------------------------------------------------------------------------------- /src/llmcompressor/transformers/finetune/data/evolcodealpaca.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from typing import TYPE_CHECKING 3 | 4 | from llmcompressor.transformers.finetune.data import 
TextGenerationDataset 5 | from llmcompressor.typing import Processor 6 | 7 | if TYPE_CHECKING: 8 | from llmcompressor.args import DatasetArguments 9 | 10 | 11 | @TextGenerationDataset.register(name="evolcodealpaca") 12 | class EvolCodeAlpacaDataset(TextGenerationDataset): 13 | """ 14 | Child text generation class for the Evol Code Alpaca dataset 15 | 16 | :param dataset_args: configuration settings for dataset loading 17 | :param split: split from dataset to load, for instance `test` or `train[:5%]` 18 | :param processor: processor or tokenizer to use on dataset 19 | """ 20 | 21 | EVOL_ALPACA_TEMPLATE = ( 22 | "Below is an instruction that describes a " 23 | "programming task. Write a program that appropriately " 24 | "completes the request.\n\n### Instruction:\n{instruction}" 25 | "\n\n### Response:\n" 26 | ) 27 | 28 | def __init__( 29 | self, dataset_args: "DatasetArguments", split: str, processor: Processor 30 | ): 31 | dataset_args = deepcopy(dataset_args) 32 | dataset_args.dataset = "theblackcat102/evol-codealpaca-v1" 33 | dataset_args.text_column = "text" 34 | 35 | super().__init__(dataset_args, split=split, processor=processor) 36 | 37 | def dataset_template(self, sample): 38 | prompt = self.EVOL_ALPACA_TEMPLATE.format(instruction=sample["instruction"]) 39 | text = prompt 40 | if "output" in text: 41 | text += sample["output"] 42 | 43 | return { 44 | "text": text, 45 | self.PROMPT_KEY: prompt, 46 | } 47 | -------------------------------------------------------------------------------- /src/llmcompressor/transformers/finetune/data/gsm8k.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from typing import TYPE_CHECKING 3 | 4 | from llmcompressor.transformers.finetune.data import TextGenerationDataset 5 | from llmcompressor.typing import Processor 6 | 7 | if TYPE_CHECKING: 8 | from llmcompressor.args import DatasetArguments 9 | 10 | 11 | @TextGenerationDataset.register(name="gsm8k") 12 | class GSM8KDataset(TextGenerationDataset): 13 | """ 14 | Child text generation class for the Grade School Math 8k dataset 15 | 16 | :param dataset_args: configuration settings for dataset loading 17 | :param split: split from dataset to load, for instance `test` or `train[:5%]` 18 | :param processor: processor or tokenizer to use on dataset 19 | """ 20 | 21 | GSM_TEMPLATE = "Question: {question}\nAnswer:" 22 | 23 | def __init__( 24 | self, dataset_args: "DatasetArguments", split: str, processor: Processor 25 | ): 26 | dataset_args = deepcopy(dataset_args) 27 | dataset_args.dataset = "gsm8k" 28 | dataset_args.text_column = "text" 29 | 30 | super().__init__(dataset_args=dataset_args, split=split, processor=processor) 31 | 32 | def dataset_template(self, sample): 33 | prompt = self.GSM_TEMPLATE.format(question=sample["question"]) 34 | text = prompt 35 | if "answer" in sample: 36 | text += " " + sample["answer"] 37 | 38 | return { 39 | "text": text, 40 | self.PROMPT_KEY: prompt, 41 | } 42 | -------------------------------------------------------------------------------- /src/llmcompressor/transformers/finetune/data/ptb.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from typing import TYPE_CHECKING 3 | 4 | from llmcompressor.transformers.finetune.data import TextGenerationDataset 5 | from llmcompressor.typing import Processor 6 | 7 | if TYPE_CHECKING: 8 | from llmcompressor.args import DatasetArguments 9 | 10 | 11 | @TextGenerationDataset.register(name="ptb") 12 | 
class PtbDataset(TextGenerationDataset): 13 | """ 14 | Child text generation class for the PTB dataset 15 | 16 | :param dataset_args: configuration settings for dataset loading 17 | :param split: split from dataset to load, for instance `test` or `train[:5%]` 18 | :param processor: processor or tokenizer to use on dataset 19 | """ 20 | 21 | def __init__( 22 | self, dataset_args: "DatasetArguments", split: str, processor: Processor 23 | ): 24 | dataset_args = deepcopy(dataset_args) 25 | dataset_args.dataset = "ptb_text_only" 26 | dataset_args.text_column = "sentence" 27 | 28 | super().__init__( 29 | dataset_args=dataset_args, 30 | split=split, 31 | processor=processor, 32 | ) 33 | -------------------------------------------------------------------------------- /src/llmcompressor/transformers/finetune/data/wikitext.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from typing import TYPE_CHECKING 3 | 4 | from llmcompressor.transformers.finetune.data import TextGenerationDataset 5 | from llmcompressor.typing import Processor 6 | 7 | if TYPE_CHECKING: 8 | from llmcompressor.args import DatasetArguments 9 | 10 | 11 | @TextGenerationDataset.register(name="wikitext") 12 | class WikiTextDataset(TextGenerationDataset): 13 | """ 14 | Child text generation class for the Open Platypus dataset 15 | 16 | :param dataset_args: configuration settings for dataset loading 17 | :param split: split from dataset to load, for instance `test` or `train[:5%]` 18 | :param processor: processor or tokenizer to use on dataset 19 | """ 20 | 21 | def __init__( 22 | self, dataset_args: "DatasetArguments", split: str, processor: Processor 23 | ): 24 | dataset_args = deepcopy(dataset_args) 25 | dataset_args.dataset = "Salesforce/wikitext" 26 | dataset_args.text_column = "text" 27 | 28 | super().__init__( 29 | dataset_args=dataset_args, 30 | split=split, 31 | processor=processor, 32 | ) 33 | -------------------------------------------------------------------------------- /src/llmcompressor/transformers/finetune/text_generation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2020 The HuggingFace Inc. team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # Adapted from https://github.com/huggingface/transformers 18 | # vllm-project: no copyright 19 | 20 | 21 | from compressed_tensors.utils.helpers import deprecated 22 | 23 | 24 | @deprecated( 25 | message=( 26 | "`from llmcompressor.transformers import oneshot` is deprecated, " 27 | "please use `from llmcompressor import oneshot`." 28 | ) 29 | ) 30 | def oneshot(**kwargs) -> None: 31 | from llmcompressor import oneshot 32 | 33 | oneshot(**kwargs) 34 | 35 | 36 | @deprecated( 37 | message=( 38 | "`from llmcompressor import train` is deprecated, " 39 | "please use `from llmcompressor import train`." 
40 | ) 41 | ) 42 | def train(**kwargs): 43 | from llmcompressor import train 44 | 45 | train(**kwargs) 46 | 47 | 48 | def apply(**kwargs): 49 | message = ( 50 | "`from llmcompressor.transformers import apply, compress` is deprecated, " 51 | "please use `from llmcompressor import oneshot, train` " 52 | "for sequential stages." 53 | ) 54 | raise ValueError(message) 55 | -------------------------------------------------------------------------------- /src/llmcompressor/transformers/finetune/trainer.py: -------------------------------------------------------------------------------- 1 | from transformers import Trainer as HFTransformersTrainer 2 | 3 | from llmcompressor.transformers.finetune.session_mixin import SessionManagerMixIn 4 | 5 | __all__ = ["Trainer"] 6 | 7 | 8 | class Trainer(SessionManagerMixIn, HFTransformersTrainer): 9 | pass 10 | -------------------------------------------------------------------------------- /src/llmcompressor/transformers/sparsification/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Objects, classes, and methods for applying sparsification algorithms to 3 | Hugging Face transformers flows 4 | """ 5 | 6 | # flake8: noqa 7 | from .sparse_model import * 8 | -------------------------------------------------------------------------------- /src/llmcompressor/transformers/sparsification/sparse_model.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | from typing import Optional 3 | 4 | from loguru import logger 5 | from torch.nn import Module 6 | from transformers import AutoModelForCausalLM 7 | 8 | __all__ = [ 9 | "SparseAutoModelForCausalLM", 10 | "get_processor_name_from_model", 11 | ] 12 | 13 | 14 | class SparseAutoModelForCausalLM: 15 | def from_pretrained(*args, **kwargs): 16 | logger.warning( 17 | "SparseAutoModelForCausalLM is deprecated, " 18 | "please use AutoModelForCausalLM" 19 | ) 20 | return AutoModelForCausalLM.from_pretrained(*args, **kwargs) 21 | 22 | 23 | def get_processor_name_from_model(student: Module, teacher: Optional[Module]) -> str: 24 | """ 25 | Get a processor/tokenizer source used for both student and teacher, assuming 26 | that they could be shared 27 | 28 | :param student: the student model 29 | :param teacher: the teacher model 30 | :return: the source for the processor/tokenizer shared between teacher and model 31 | """ 32 | 33 | if teacher is not None and teacher not in ("disable", "self"): 34 | student_forward_params = list( 35 | inspect.signature(student.forward).parameters.keys() 36 | ) 37 | teacher_forward_params = list( 38 | inspect.signature(teacher.forward).parameters.keys() 39 | ) 40 | diff = [p for p in student_forward_params if p not in teacher_forward_params] 41 | if diff: 42 | raise RuntimeError( 43 | "Teacher tokenizer cannot be used for student " 44 | f"due to missing args: {diff}" 45 | ) 46 | src_model = teacher 47 | else: 48 | src_model = student 49 | return src_model.config._name_or_path 50 | -------------------------------------------------------------------------------- /src/llmcompressor/transformers/tracing/__init__.py: -------------------------------------------------------------------------------- 1 | from .debug import trace 2 | 3 | __all__ = ["trace"] 4 | -------------------------------------------------------------------------------- /src/llmcompressor/transformers/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for 
applying sparsification algorithms to Hugging Face transformers flows 3 | """ 4 | 5 | # flake8: noqa 6 | from .helpers import * 7 | -------------------------------------------------------------------------------- /src/llmcompressor/transformers/utils/preprocessing_functions.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING, Dict 2 | 3 | from compressed_tensors.registry import RegistryMixin 4 | 5 | if TYPE_CHECKING: 6 | from llmcompressor.transformers.finetune.data.base import TextGenerationDataset 7 | 8 | 9 | class PreprocessingFunctionRegistry(RegistryMixin): 10 | pass 11 | 12 | 13 | @PreprocessingFunctionRegistry.register() 14 | def custom_evolved_codealpaca_dataset(self: "TextGenerationDataset", data: Dict): 15 | PROMPT_DICT = """[Instruction]:\n{instruction}\n\n[Response]:""" 16 | data["prompt"] = PROMPT_DICT.format_map(data) 17 | data["text"] = data["prompt"] + data["output"] 18 | return data 19 | -------------------------------------------------------------------------------- /src/llmcompressor/typing.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | from datasets import Dataset, DatasetDict, IterableDataset 4 | from transformers import ( 5 | BaseImageProcessor, 6 | FeatureExtractionMixin, 7 | PreTrainedTokenizer, 8 | ProcessorMixin, 9 | ) 10 | 11 | # Tokenizer or Processor. Processors do not inherit from a unified base class 12 | Processor = Union[ 13 | PreTrainedTokenizer, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin 14 | ] 15 | 16 | # Supported dataset types, IterableDataset is a streamed dataset 17 | DatasetType = Union[Dataset, DatasetDict, IterableDataset] 18 | -------------------------------------------------------------------------------- /src/llmcompressor/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | General utility functions used throughout llmcompressor 3 | """ 4 | 5 | # flake8: noqa 6 | 7 | from .helpers import * 8 | -------------------------------------------------------------------------------- /src/llmcompressor/utils/fsdp/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | -------------------------------------------------------------------------------- /src/llmcompressor/utils/fsdp/context.py: -------------------------------------------------------------------------------- 1 | try: 2 | from accelerate import Accelerator 3 | except ImportError: 4 | Accelerator = None 5 | 6 | try: 7 | from torch.distributed.fsdp import FullyShardedDataParallel 8 | from torch.distributed.fsdp._common_utils import FSDP_WRAPPED_MODULE, TrainingState 9 | except ImportError: 10 | FullyShardedDataParallel = None 11 | 12 | from contextlib import nullcontext 13 | 14 | __all__ = [ 15 | "summon_full_params_context", 16 | "main_process_first_context", 17 | "fix_fsdp_module_name", 18 | ] 19 | 20 | 21 | def summon_full_params_context(model, offload_to_cpu: bool = False): 22 | if FullyShardedDataParallel is not None: 23 | # avoid nested summon_full_param context 24 | if ( 25 | hasattr(model, "training_state") 26 | and model.training_state is TrainingState.SUMMON_FULL_PARAMS 27 | ): 28 | return nullcontext() 29 | return FullyShardedDataParallel.summon_full_params( 30 | model, offload_to_cpu=offload_to_cpu 31 | ) 32 | 33 | return nullcontext() 34 | 35 | 36 | def main_process_first_context(): 37 | """ 38 | Creates a 
context manager where the main process runs the block before all other 39 | processes. Returns a nullcontext when called from a single process application. 40 | """ 41 | if Accelerator is None: 42 | return nullcontext() 43 | 44 | return Accelerator().main_process_first() 45 | 46 | 47 | def fix_fsdp_module_name(name: str) -> str: 48 | """ 49 | Remove FSDP wrapper prefixes from a module name. 50 | Accounts for scenario where FSDP_WRAPPED_MODULE is 51 | at the end of the name, as well as in the middle. 52 | 53 | :param name: name to strip 54 | :return: stripped name 55 | """ 56 | if FullyShardedDataParallel is None: 57 | return name 58 | 59 | return name.replace(FSDP_WRAPPED_MODULE + ".", "").replace( 60 | "." + FSDP_WRAPPED_MODULE, "" 61 | ) 62 | -------------------------------------------------------------------------------- /src/llmcompressor/utils/pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .module import * 4 | -------------------------------------------------------------------------------- /src/llmcompressor/utils/pytorch/utils.py: -------------------------------------------------------------------------------- 1 | import gc 2 | 3 | import torch 4 | 5 | __all__ = ["measure_cuda_memory"] 6 | 7 | 8 | class measure_cuda_memory: 9 | def __init__(self, device=None): 10 | self.device = device 11 | 12 | def reset_peak_memory_stats(self): 13 | torch.cuda.reset_peak_memory_stats(self.device) 14 | 15 | def current_memory_usage(self) -> float: 16 | # Return the memory usage in bytes. 17 | self.reset_peak_memory_stats() 18 | mem = torch.cuda.max_memory_allocated(self.device) 19 | return mem 20 | 21 | def peak_memory_usage(self) -> float: 22 | # Return the peak memory usage in bytes since the last reset 23 | mem = torch.cuda.max_memory_allocated(self.device) 24 | return mem 25 | 26 | def __enter__(self): 27 | self.initial_memory = self.current_memory_usage() 28 | # This allows us to call methods of the context manager if needed 29 | return self 30 | 31 | def __exit__(self, exc_type, exc_val, exc_tb): 32 | self.overall_peak_memory = self.peak_memory_usage() 33 | self.peak_consumed_memory = self.overall_peak_memory - self.initial_memory 34 | 35 | # Force garbage collection 36 | gc.collect() 37 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/__init__.py -------------------------------------------------------------------------------- /tests/data.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from enum import Enum 3 | 4 | 5 | # TODO: maybe test type as decorators? 
6 | class TestType(Enum): 7 | SANITY = "sanity" 8 | REGRESSION = "regression" 9 | SMOKE = "smoke" 10 | 11 | 12 | class Cadence(Enum): 13 | COMMIT = "commit" 14 | WEEKLY = "weekly" 15 | NIGHTLY = "nightly" 16 | 17 | 18 | @dataclass 19 | class TestConfig: 20 | test_type: TestType 21 | cadence: Cadence 22 | 23 | 24 | @dataclass 25 | class CustomTestConfig(TestConfig): 26 | script_path: str 27 | -------------------------------------------------------------------------------- /tests/e2e/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/e2e/__init__.py -------------------------------------------------------------------------------- /tests/e2e/vLLM/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/e2e/vLLM/__init__.py -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/fp8_dynamic_per_token.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | scheme: FP8_DYNAMIC -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/fp8_dynamic_per_token_qwen.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: Qwen/Qwen2.5-0.5B 4 | scheme: FP8_DYNAMIC -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/fp8_static_per_tensor.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | scheme: FP8 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/fp8_weight_only_channel.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | recipe: tests/e2e/vLLM/recipes/FP8/recipe_fp8_weight_only_channel.yaml 5 | scheme: FP8A16_channel -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/fp8_weight_only_tensor.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | recipe: tests/e2e/vLLM/recipes/FP8/recipe_fp8_weight_only_per_tensor.yaml 5 | scheme: FP8A16_tensor -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/int8_channel_weight_static_per_tensor_act.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_static_per_tensor_act.yaml 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | scheme: W8A8_channel_weight_static_per_tensor -------------------------------------------------------------------------------- 
/tests/e2e/vLLM/configs/int8_dynamic_per_token.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | scheme: W8A8 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/int8_tensor_weight_static_per_tensor_act.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_tensor_weight_static_per_tensor_act.yaml 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | scheme: W8A8_tensor_weight_static_per_tensor_act 8 | -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/int8_tensor_weight_static_per_tensor_act_qwen.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: Qwen/Qwen2.5-0.5B 4 | recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_tensor_weight_static_per_tensor_act.yaml 5 | dataset_id: garage-bAInd/Open-Platypus 6 | dataset_split: train 7 | scheme: W8A8_tensor_weight_static_per_tensor_act 8 | -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/kv_cache_gptq_tinyllama.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | recipe: tests/e2e/vLLM/recipes/kv_cache/gptq.yaml 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | scheme: kv_cache_default_gptq_tinyllama -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/kv_cache_phi3.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: microsoft/Phi-3-mini-4k-instruct 4 | recipe: tests/e2e/vLLM/recipes/kv_cache/default.yaml 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | scheme: kv_cache_default_phi3 -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/kv_cache_tinyllama.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | recipe: tests/e2e/vLLM/recipes/kv_cache/default.yaml 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | scheme: kv_cache_default_tinyllama -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/sparse2of4_fp8_dynamic.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | recipe: tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4_fp8_dynamic.yaml 5 | scheme: sparse2of4_fp8_dynamic 6 | dataset_id: HuggingFaceH4/ultrachat_200k 7 | dataset_split: train_sft -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/sparse2of4_fp8_dynamic_qwen.yaml: 
-------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: Qwen/Qwen2.5-0.5B 4 | recipe: tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4_fp8_dynamic.yaml 5 | scheme: sparse2of4_fp8_dynamic 6 | dataset_id: garage-bAInd/Open-Platypus 7 | dataset_split: train -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/sparse_24.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | recipe: tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4.yaml 5 | scheme: sparse2of4_only 6 | dataset_id: HuggingFaceH4/ultrachat_200k 7 | dataset_split: train_sft 8 | save_compressed: True -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/w4a16_2of4_channel_quant.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | scheme: W4A16_2of4_channel 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | recipe: tests/e2e/vLLM/recipes/WNA16_2of4/2of4_w4a16_recipe.yaml -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/w4a16_2of4_grouped_quant.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | scheme: W4A16_2of4 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | recipe: tests/e2e/vLLM/recipes/WNA16_2of4/2of4_w4a16_group-128_recipe.yaml -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/w4a16_actorder_group.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_group.yaml 5 | dataset_id: openai/gsm8k 6 | dataset_config: main 7 | dataset_split: train 8 | scheme: W4A16_actorder_group 9 | save_dir: TinyLlama-1.1B-Chat-v1.0-actorder-group -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/w4a16_actorder_group_qwen.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: Qwen/Qwen2.5-0.5B 4 | recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_group.yaml 5 | dataset_id: neuralmagic/LLM_compression_calibration 6 | dataset_split: train 7 | scheme: W4A16_actorder_group 8 | save_dir: Qwen2.5-0.5B-actorder-group -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/w4a16_actorder_weight.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml 5 | dataset_id: openai/gsm8k 6 | dataset_config: main 7 | dataset_split: train 8 | scheme: W4A16_actorder_weight 9 | save_dir: TinyLlama-1.1B-Chat-v1.0-actorder-weight -------------------------------------------------------------------------------- 
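Note: each config above pairs a model with either a preset scheme or an explicit recipe plus a calibration dataset. As a hedged sketch of how the pairing in w4a16_actorder_weight.yaml could be reproduced outside the test harness (the oneshot entrypoint lives under src/llmcompressor/entrypoints/; values below that are not in the config, such as the sequence length and sample count, are assumptions, and "gsm8k" is assumed to be a registered calibration dataset name):

from llmcompressor import oneshot

oneshot(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    dataset="gsm8k",  # config uses openai/gsm8k, config "main", split "train"
    recipe="tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml",
    max_seq_length=2048,           # assumption: not specified in the config
    num_calibration_samples=512,   # assumption: not specified in the config
    output_dir="TinyLlama-1.1B-Chat-v1.0-actorder-weight",  # mirrors save_dir
)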
/tests/e2e/vLLM/configs/w4a16_actorder_weight_qwen.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: Qwen/Qwen2.5-0.5B 4 | recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml 5 | dataset_id: Open-Orca/slimorca-deduped-cleaned-corrected 6 | dataset_split: train 7 | scheme: W4A16_actorder_weight 8 | save_dir: Qwen2.5-0.5B-actorder-weight -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/w4a16_channel_quant.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | scheme: W4A16_channel 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_channel_quant.yaml -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/w4a16_channel_quant_qwen.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: Qwen/Qwen2.5-0.5B 4 | scheme: W4A16_channel 5 | dataset_id: Open-Orca/slimorca-deduped-cleaned-corrected 6 | dataset_split: train 7 | recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_channel_quant.yaml -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/w4a16_grouped_quant.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | scheme: W4A16 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | quant_type: "GPTQ" -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/w4a16_grouped_quant_asym_awq.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_asym_awq.yaml 5 | dataset_id: "mit-han-lab/pile-val-backup" 6 | dataset_split: validation 7 | num_calibration_samples: 2000 8 | scheme: W4A16_weight_asym_awq 9 | save_dir: TinyLlama-1.1B-Chat-v1.0-w4a16-asym-awq -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/w8a16_channel_quant.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | scheme: W8A16_channel 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w8a16_channel_quant.yaml -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/w8a16_grouped_quant.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | scheme: W8A16 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | quant_type: "GPTQ" -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/w8a8_dynamic_asym.yaml: 
-------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | dataset_id: HuggingFaceH4/ultrachat_200k 5 | dataset_split: train_sft 6 | scheme: W8A8_dynamic_asym_activations 7 | recipe: tests/e2e/vLLM/recipes/INT8/recipe_w8a8_dynamic_asym.yaml 8 | save_dir: TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Asym 9 | -------------------------------------------------------------------------------- /tests/e2e/vLLM/configs/w8a8_static_asym.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | dataset_id: HuggingFaceH4/ultrachat_200k 5 | dataset_split: train_sft 6 | scheme: W8A8_static_asym_activations 7 | recipe: tests/e2e/vLLM/recipes/INT8/recipe_w8a8_static_asym.yaml 8 | save_dir: TinyLlama-1.1B-Chat-v1.0-W8A8-Static-Asym 9 | -------------------------------------------------------------------------------- /tests/e2e/vLLM/recipes/FP8/recipe_fp8_weight_only_channel.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | QuantizationModifier: 4 | ignore: [lm_head] 5 | config_groups: 6 | group_0: 7 | weights: {num_bits: 8, type: float, symmetric: true, strategy: channel, dynamic: false} 8 | targets: [Linear] 9 | -------------------------------------------------------------------------------- /tests/e2e/vLLM/recipes/FP8/recipe_fp8_weight_only_per_tensor.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | QuantizationModifier: 4 | ignore: [lm_head] 5 | config_groups: 6 | group_0: 7 | weights: {num_bits: 8, type: float, symmetric: true, strategy: tensor, dynamic: false} 8 | targets: [Linear] 9 | -------------------------------------------------------------------------------- /tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_dynamic_per_token.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | SmoothQuantModifier: 4 | smoothing_strength: 0.8 5 | GPTQModifier: 6 | ignore: ["lm_head", "re:vision_tower.*", "re:multi_modal_projector.*", "re:visual.*", "re:vision_model.*"] 7 | config_groups: 8 | group_0: 9 | weights: {num_bits: 8, type: int, symmetric: true, strategy: channel} 10 | input_activations: {num_bits: 8, type: int, symmetric: true, strategy: token, dynamic: true} 11 | targets: [Linear] 12 | -------------------------------------------------------------------------------- /tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_static_per_tensor_act.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | SmoothQuantModifier: 4 | smoothing_strength: 0.8 5 | GPTQModifier: 6 | ignore: [lm_head] 7 | config_groups: 8 | group_0: 9 | weights: {num_bits: 8, type: int, symmetric: true, strategy: channel} 10 | input_activations: {num_bits: 8, type: int, symmetric: true, strategy: tensor} 11 | targets: [Linear] -------------------------------------------------------------------------------- /tests/e2e/vLLM/recipes/INT8/recipe_int8_tensor_weight_static_per_tensor_act.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | SmoothQuantModifier: 4 | smoothing_strength: 0.8 5 | QuantizationModifier: 6 | 
ignore: [lm_head] 7 | config_groups: 8 | group_0: 9 | weights: {num_bits: 8, type: int, symmetric: true, strategy: tensor} 10 | input_activations: {num_bits: 8, type: int, symmetric: true, strategy: tensor} 11 | targets: [Linear] -------------------------------------------------------------------------------- /tests/e2e/vLLM/recipes/INT8/recipe_w8a8_dynamic_asym.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | SmoothQuantModifier: 4 | smoothing_strength: 0.8 5 | mappings: 6 | - - ['re:.*q_proj', 're:.*k_proj', 're:.*v_proj'] 7 | - re:.*input_layernorm 8 | - - ['re:.*gate_proj', 're:.*up_proj'] 9 | - re:.*post_attention_layernorm 10 | GPTQModifier: 11 | ignore: [lm_head] 12 | config_groups: 13 | group_0: 14 | weights: {num_bits: 8, type: int, symmetric: true, strategy: channel} 15 | input_activations: {num_bits: 8, symmetric: false, dynamic: true, strategy: token, type: int} 16 | targets: [Linear] 17 | -------------------------------------------------------------------------------- /tests/e2e/vLLM/recipes/INT8/recipe_w8a8_static_asym.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | SmoothQuantModifier: 4 | smoothing_strength: 0.8 5 | GPTQModifier: 6 | ignore: [lm_head] 7 | config_groups: 8 | group_0: 9 | weights: {num_bits: 8, type: int, symmetric: true, strategy: channel} 10 | input_activations: {num_bits: 8, symmetric: false, dynamic: false, strategy: tensor, type: int} 11 | targets: [Linear] 12 | -------------------------------------------------------------------------------- /tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4.yaml: -------------------------------------------------------------------------------- 1 | sparsity_stage: 2 | sparsity_modifiers: 3 | SparseGPTModifier: 4 | sparsity: 0.5 5 | mask_structure: "2:4" 6 | targets: ["Linear"] 7 | ignore: ["re:.*lm_head"] 8 | -------------------------------------------------------------------------------- /tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4_fp8_dynamic.yaml: -------------------------------------------------------------------------------- 1 | sparsity_stage: 2 | run_type: oneshot 3 | sparsity_modifiers: 4 | SparseGPTModifier: 5 | sparsity: 0.5 6 | mask_structure: "2:4" 7 | targets: ["Linear"] 8 | ignore: ["re:.*lm_head"] 9 | quantization_stage: 10 | run_type: oneshot 11 | quantization_modifiers: 12 | QuantizationModifier: 13 | targets: ["Linear"] 14 | ignore: ["lm_head"] 15 | scheme: "FP8_DYNAMIC" 16 | -------------------------------------------------------------------------------- /tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_channel_quant.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | GPTQModifier: 4 | ignore: [lm_head] 5 | config_groups: 6 | group_0: 7 | weights: {num_bits: 4, type: int, symmetric: true, strategy: channel, dynamic: false} 8 | targets: [Linear] 9 | -------------------------------------------------------------------------------- /tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_asym_awq.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | AWQModifier: 4 | ignore: [lm_head] 5 | config_groups: 6 | group_0: 7 | weights: {num_bits: 4, type: int, symmetric: false, strategy: "group", group_size: 128} 8 | targets: [Linear] 9 | 
-------------------------------------------------------------------------------- /tests/e2e/vLLM/recipes/WNA16/recipe_w8a16_channel_quant.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | GPTQModifier: 4 | ignore: [lm_head] 5 | config_groups: 6 | group_0: 7 | weights: {num_bits: 8, type: int, symmetric: true, strategy: channel, dynamic: false} 8 | targets: [Linear] 9 | -------------------------------------------------------------------------------- /tests/e2e/vLLM/recipes/WNA16_2of4/2of4_w4a16_group-128_recipe.yaml: -------------------------------------------------------------------------------- 1 | sparsity_stage: 2 | run_type: oneshot 3 | sparsity_modifiers: 4 | SparseGPTModifier: 5 | sparsity: 0.5 6 | mask_structure: "2:4" 7 | targets: ["Linear"] 8 | ignore: ["re:.*lm_head"] 9 | quantization_stage: 10 | run_type: oneshot 11 | quantization_modifiers: 12 | GPTQModifier: 13 | ignore: ["lm_head"] 14 | config_groups: 15 | group_0: 16 | weights: 17 | num_bits: 4 18 | type: "int" 19 | symmetric: true 20 | strategy: "group" 21 | group_size: 128 22 | targets: ["Linear"] 23 | -------------------------------------------------------------------------------- /tests/e2e/vLLM/recipes/WNA16_2of4/2of4_w4a16_recipe.yaml: -------------------------------------------------------------------------------- 1 | sparsity_stage: 2 | run_type: oneshot 3 | sparsity_modifiers: 4 | SparseGPTModifier: 5 | sparsity: 0.5 6 | mask_structure: "2:4" 7 | targets: ["Linear"] 8 | ignore: ["re:.*lm_head"] 9 | quantization_stage: 10 | run_type: oneshot 11 | quantization_modifiers: 12 | GPTQModifier: 13 | ignore: ["lm_head"] 14 | config_groups: 15 | group_0: 16 | weights: 17 | num_bits: 4 18 | type: "int" 19 | symmetric: true 20 | strategy: "channel" 21 | targets: ["Linear"] 22 | -------------------------------------------------------------------------------- /tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_group.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | GPTQModifier: 4 | ignore: ["lm_head"] 5 | config_groups: 6 | group_0: 7 | weights: 8 | num_bits: 4 9 | type: "int" 10 | symmetric: true 11 | strategy: "group" 12 | group_size: 128 13 | actorder: "group" 14 | targets: ["Linear"] 15 | -------------------------------------------------------------------------------- /tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | GPTQModifier: 4 | ignore: ["lm_head", "re:vision_tower.*", "re:multi_modal_projector.*", "re:visual.*", "re:vision_model.*"] 5 | config_groups: 6 | group_0: 7 | weights: 8 | num_bits: 4 9 | type: "int" 10 | symmetric: true 11 | strategy: "group" 12 | group_size: 128 13 | actorder: "weight" 14 | targets: ["Linear"] 15 | -------------------------------------------------------------------------------- /tests/e2e/vLLM/recipes/kv_cache/default.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | QuantizationModifier: 4 | kv_cache_scheme: 5 | {num_bits: 8, type: float, symmetric: true, strategy: tensor} 6 | -------------------------------------------------------------------------------- /tests/e2e/vLLM/recipes/kv_cache/gptq.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 
3 | GPTQModifier: 4 | sequential_update: false 5 | ignore: ["lm_head"] 6 | config_groups: 7 | group_0: 8 | weights: 9 | num_bits: 4 10 | type: "int" 11 | symmetric: true 12 | strategy: "channel" 13 | actorder: False 14 | targets: ["Linear"] 15 | kv_cache_scheme: 16 | {num_bits: 8, type: float, symmetric: true, strategy: tensor} -------------------------------------------------------------------------------- /tests/e2e/vLLM/run_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SUCCESS=0 4 | 5 | while getopts "c:t:" OPT; do 6 | case ${OPT} in 7 | c ) 8 | CONFIG="$OPTARG" 9 | ;; 10 | t ) 11 | TEST="$OPTARG" 12 | ;; 13 | \? ) 14 | exit 1 15 | ;; 16 | esac 17 | done 18 | 19 | # Parse list of configs. 20 | for MODEL_CONFIG in "$CONFIG"/* 21 | do 22 | LOCAL_SUCCESS=0 23 | 24 | echo "=== RUNNING MODEL: $MODEL_CONFIG ===" 25 | 26 | export TEST_DATA_FILE="$MODEL_CONFIG" 27 | pytest \ 28 | --capture=tee-sys \ 29 | "$TEST" || LOCAL_SUCCESS=$? 30 | 31 | if [[ $LOCAL_SUCCESS == 0 ]]; then 32 | echo "=== PASSED MODEL: $MODEL_CONFIG ===" 33 | else 34 | echo "=== FAILED MODEL: $MODEL_CONFIG ===" 35 | fi 36 | 37 | SUCCESS=$((SUCCESS + LOCAL_SUCCESS)) 38 | 39 | done 40 | 41 | exit "$SUCCESS" 42 | -------------------------------------------------------------------------------- /tests/e2e/vLLM/skipped_configs/fp4_nvfp4a16.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 4 | scheme: NVFP4A16 -------------------------------------------------------------------------------- /tests/examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/examples/__init__.py -------------------------------------------------------------------------------- /tests/examples/test_compressed_inference.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | from tests.examples.utils import ( 6 | copy_and_run_script, 7 | gen_cmd_fail_message, 8 | requires_gpu_count, 9 | ) 10 | 11 | 12 | @pytest.fixture 13 | def example_dir() -> str: 14 | return "examples/compressed_inference" 15 | 16 | 17 | @pytest.mark.example 18 | @requires_gpu_count(1) 19 | class TestCompressedInference: 20 | """ 21 | Tests for examples in the "compressed_inference" example folder. 22 | """ 23 | 24 | def test_fp8_example_script(self, example_dir: str, tmp_path: Path): 25 | """ 26 | Test for the "fp8_compressed_inference.py" script in the folder. 
27 | """ 28 | script_filename = "fp8_compressed_inference.py" 29 | command, result = copy_and_run_script(tmp_path, example_dir, script_filename) 30 | 31 | assert result.returncode == 0, gen_cmd_fail_message(command, result) 32 | -------------------------------------------------------------------------------- /tests/examples/test_quantization_kv_cache.py: -------------------------------------------------------------------------------- 1 | import shlex 2 | from pathlib import Path 3 | 4 | import pytest 5 | 6 | from tests.examples.utils import ( 7 | ReadMe, 8 | copy_and_run_command, 9 | gen_cmd_fail_message, 10 | requires_gpu_count, 11 | ) 12 | 13 | 14 | @pytest.fixture 15 | def example_dir() -> str: 16 | return "examples/quantization_kv_cache" 17 | 18 | 19 | @pytest.mark.example 20 | @requires_gpu_count(1) 21 | class TestQuantizationKVCache: 22 | """ 23 | Tests for examples in the "quantization_kv_cache" example folder. 24 | """ 25 | 26 | def test_doc_example_command(self, example_dir: str, tmp_path: Path): 27 | """ 28 | Test for the example command in the README. 29 | """ 30 | readme_path = Path.cwd() / example_dir / "README.md" 31 | readme = ReadMe(readme_path) 32 | 33 | command = readme.get_code_block_content(position=2, lang="shell") 34 | assert command.startswith("python") 35 | 36 | command = shlex.split(command) 37 | result = copy_and_run_command(tmp_path, example_dir, command) 38 | 39 | assert result.returncode == 0, gen_cmd_fail_message(command, result) 40 | -------------------------------------------------------------------------------- /tests/examples/test_quantization_w4a16.py: -------------------------------------------------------------------------------- 1 | import shlex 2 | from pathlib import Path 3 | 4 | import pytest 5 | 6 | from tests.examples.utils import ( 7 | ReadMe, 8 | copy_and_run_command, 9 | gen_cmd_fail_message, 10 | requires_gpu_count, 11 | ) 12 | 13 | 14 | @pytest.fixture 15 | def example_dir() -> str: 16 | return "examples/quantization_w4a16" 17 | 18 | 19 | @pytest.mark.example 20 | @requires_gpu_count(1) 21 | class TestQuantizationW4A16: 22 | """ 23 | Tests for examples in the "quantization_w4a16" example folder. 24 | """ 25 | 26 | def test_doc_example_command(self, example_dir: str, tmp_path: Path): 27 | """ 28 | Test for the example command in the README. 29 | """ 30 | readme_path = Path.cwd() / example_dir / "README.md" 31 | readme = ReadMe(readme_path) 32 | 33 | command = readme.get_code_block_content(position=2, lang="shell") 34 | assert command.startswith("python") 35 | 36 | command = shlex.split(command) 37 | result = copy_and_run_command(tmp_path, example_dir, command) 38 | 39 | assert result.returncode == 0, gen_cmd_fail_message(command, result) 40 | -------------------------------------------------------------------------------- /tests/examples/test_quantization_w8a8_fp8.py: -------------------------------------------------------------------------------- 1 | import shlex 2 | from pathlib import Path 3 | 4 | import pytest 5 | 6 | from tests.examples.utils import ( 7 | ReadMe, 8 | copy_and_run_command, 9 | copy_and_run_script, 10 | gen_cmd_fail_message, 11 | requires_gpu_count, 12 | ) 13 | 14 | 15 | @pytest.fixture 16 | def example_dir() -> str: 17 | return "examples/quantization_w8a8_fp8" 18 | 19 | 20 | @pytest.mark.example 21 | @requires_gpu_count(1) 22 | class TestQuantizationW8A8_FP8: 23 | """ 24 | Tests for examples in the "quantization_w8a8_fp8" example folder. 
25 | """ 26 | 27 | def test_doc_example_command(self, example_dir: str, tmp_path: Path): 28 | """ 29 | Test for the example command in the README. 30 | """ 31 | readme_path = Path.cwd() / example_dir / "README.md" 32 | readme = ReadMe(readme_path) 33 | 34 | command = readme.get_code_block_content(position=2, lang="shell") 35 | assert command.startswith("python") 36 | 37 | command = shlex.split(command) 38 | result = copy_and_run_command(tmp_path, example_dir, command) 39 | 40 | assert result.returncode == 0, gen_cmd_fail_message(command, result) 41 | 42 | def test_gemma2_example_script(self, example_dir: str, tmp_path: Path): 43 | """ 44 | Test for the "gemma2_example.py" script in the folder. 45 | """ 46 | script_filename = "gemma2_example.py" 47 | command, result = copy_and_run_script(tmp_path, example_dir, script_filename) 48 | 49 | assert result.returncode == 0, gen_cmd_fail_message(command, result) 50 | -------------------------------------------------------------------------------- /tests/examples/test_quantization_w8a8_int8.py: -------------------------------------------------------------------------------- 1 | import shlex 2 | from pathlib import Path 3 | 4 | import pytest 5 | 6 | from tests.examples.utils import ( 7 | ReadMe, 8 | copy_and_run_command, 9 | copy_and_run_script, 10 | gen_cmd_fail_message, 11 | requires_gpu_count, 12 | ) 13 | 14 | 15 | @pytest.fixture 16 | def example_dir() -> str: 17 | return "examples/quantization_w8a8_int8" 18 | 19 | 20 | @pytest.mark.example 21 | @requires_gpu_count(1) 22 | class TestQuantizationW8A8_Int8: 23 | """ 24 | Tests for examples in the "quantization_w8a8_int8" example folder. 25 | """ 26 | 27 | def test_doc_example_command(self, example_dir: str, tmp_path: Path): 28 | """ 29 | Test for the example command in the README. 30 | """ 31 | readme_path = Path.cwd() / example_dir / "README.md" 32 | readme = ReadMe(readme_path) 33 | 34 | command = readme.get_code_block_content(position=2, lang="shell") 35 | assert command.startswith("python") 36 | 37 | command = shlex.split(command) 38 | result = copy_and_run_command(tmp_path, example_dir, command) 39 | 40 | assert result.returncode == 0, gen_cmd_fail_message(command, result) 41 | 42 | def test_gemma2_example_script(self, example_dir: str, tmp_path: Path): 43 | """ 44 | Test for the "gemma2_example.py" script in the folder. 45 | """ 46 | script_filename = "gemma2_example.py" 47 | command, result = copy_and_run_script(tmp_path, example_dir, script_filename) 48 | 49 | assert result.returncode == 0, gen_cmd_fail_message(command, result) 50 | -------------------------------------------------------------------------------- /tests/examples/test_quantizing_moe.py: -------------------------------------------------------------------------------- 1 | import shlex 2 | from pathlib import Path 3 | 4 | import pytest 5 | 6 | from tests.examples.utils import ( 7 | ReadMe, 8 | copy_and_run_command, 9 | copy_and_run_script, 10 | gen_cmd_fail_message, 11 | requires_gpu_count, 12 | ) 13 | 14 | 15 | @pytest.fixture 16 | def example_dir() -> str: 17 | return "examples/quantizing_moe" 18 | 19 | 20 | @pytest.mark.example 21 | class TestQuantizingMOE: 22 | """ 23 | Tests for examples in the "quantizing_moe" example folder. 24 | """ 25 | 26 | @pytest.mark.multi_gpu 27 | @requires_gpu_count(2) 28 | def test_doc_example_command(self, example_dir: str, tmp_path: Path): 29 | """ 30 | Test for the example command in the README. 
31 | """ 32 | readme_path = Path.cwd() / example_dir / "README.md" 33 | readme = ReadMe(readme_path) 34 | 35 | command = readme.get_code_block_content(position=2, lang="shell") 36 | assert command.startswith("python") 37 | 38 | command = shlex.split(command) 39 | result = copy_and_run_command(tmp_path, example_dir, command) 40 | 41 | assert result.returncode == 0, gen_cmd_fail_message(command, result) 42 | 43 | @pytest.mark.parametrize( 44 | "script_filename", 45 | [ 46 | pytest.param( 47 | "deepseek_moe_w4a16.py", 48 | marks=[ 49 | pytest.mark.multi_gpu, 50 | pytest.mark.skip(reason="exceptionally long run time"), 51 | ], 52 | ), 53 | pytest.param("deepseek_moe_w8a8_fp8.py"), 54 | pytest.param("deepseek_moe_w8a8_int8.py", marks=pytest.mark.multi_gpu), 55 | ], 56 | ) 57 | def test_deepseek_example_script( 58 | self, script_filename: str, example_dir: str, tmp_path: Path 59 | ): 60 | """ 61 | Test for the other example scripts in the folder. 62 | """ 63 | command, result = copy_and_run_script(tmp_path, example_dir, script_filename) 64 | 65 | assert result.returncode == 0, gen_cmd_fail_message(command, result) 66 | -------------------------------------------------------------------------------- /tests/examples/test_sparse_2of4_quantization_fp8.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | from tests.examples.utils import ( 6 | copy_and_run_script, 7 | gen_cmd_fail_message, 8 | requires_gpu_count, 9 | ) 10 | 11 | 12 | @pytest.fixture 13 | def example_dir() -> str: 14 | return "examples/sparse_2of4_quantization_fp8" 15 | 16 | 17 | @requires_gpu_count(1) 18 | class TestSparse2of4QuantizationFP8: 19 | """ 20 | Tests for examples in the "sparse_2of4_quantization_fp8" example folder. 21 | """ 22 | 23 | @pytest.mark.parametrize(("flags"), [[], ["--fp8"]]) 24 | def test_2of4_example_script( 25 | self, example_dir: str, tmp_path: Path, flags: list[str] 26 | ): 27 | """ 28 | Tests for the "llama3_8b_2of4.py" example script. 29 | """ 30 | script_filename = "llama3_8b_2of4.py" 31 | command, result = copy_and_run_script( 32 | tmp_path, example_dir, script_filename, flags=flags 33 | ) 34 | 35 | assert result.returncode == 0, gen_cmd_fail_message(command, result) 36 | -------------------------------------------------------------------------------- /tests/examples/test_trl_mixin.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | from tests.examples.utils import ( 6 | copy_and_run_script, 7 | gen_cmd_fail_message, 8 | requires_gpu_count, 9 | ) 10 | 11 | 12 | @pytest.fixture 13 | def example_dir() -> str: 14 | return "examples/trl_mixin" 15 | 16 | 17 | @pytest.mark.example 18 | @requires_gpu_count(1) 19 | class TestTRLMixin: 20 | """ 21 | Tests for examples in the "trl_mixin" example folder. 22 | """ 23 | 24 | @pytest.mark.parametrize( 25 | "script_filename", 26 | [ 27 | "ex_trl_constant.py", 28 | # ex_trl_distillation.py hits CUDA OOM on 1x H100 (80 GiB VRAM) 29 | pytest.param("ex_trl_distillation.py", marks=pytest.mark.multi_gpu), 30 | ], 31 | ) 32 | def test_example_scripts( 33 | self, example_dir: str, script_filename: str, tmp_path: Path 34 | ): 35 | """ 36 | Test for the example scripts in the folder. 
37 | """ 38 | command, result = copy_and_run_script(tmp_path, example_dir, script_filename) 39 | 40 | assert result.returncode == 0, gen_cmd_fail_message(command, result) 41 | -------------------------------------------------------------------------------- /tests/llmcompressor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/metrics/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/metrics/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/metrics/utils/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/modifiers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/modifiers/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/modifiers/awq/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/modifiers/awq/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/modifiers/calibration/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/modifiers/calibration/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/modifiers/calibration/test_frozen.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, 10 | # software distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from compressed_tensors.quantization.lifecycle.initialize import ( 16 | initialize_module_for_quantization, 17 | ) 18 | from compressed_tensors.quantization.quant_args import QuantizationArgs 19 | from compressed_tensors.quantization.quant_config import QuantizationStatus 20 | from compressed_tensors.quantization.quant_scheme import QuantizationScheme 21 | from torch.nn import Linear 22 | 23 | from llmcompressor.modifiers.quantization.calibration import ( 24 | freeze_module_quantization, 25 | initialize_observer, 26 | ) 27 | 28 | 29 | def test_set_module_for_calibration(): 30 | num_bits = 8 31 | quantization_scheme = QuantizationScheme( 32 | targets=["*"], 33 | weights=QuantizationArgs(num_bits=num_bits, symmetric=True), 34 | input_activations=QuantizationArgs(num_bits=num_bits, symmetric=False), 35 | ) 36 | 37 | layer = Linear(4, 4) 38 | 39 | initialize_module_for_quantization(layer, quantization_scheme) 40 | layer.quantization_status = QuantizationStatus("calibration") 41 | initialize_observer(layer, "weight") 42 | 43 | # should have a weight observer after initializing 44 | assert hasattr(layer, "weight_observer") 45 | 46 | # observers should get deleted after freezing 47 | freeze_module_quantization(layer) 48 | assert not hasattr(layer, "input_observer") 49 | assert not hasattr(layer, "weight_observer") 50 | 51 | assert layer.quantization_status == QuantizationStatus("frozen") 52 | -------------------------------------------------------------------------------- /tests/llmcompressor/modifiers/calibration/test_observers.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | from compressed_tensors.quantization import ( 4 | QuantizationArgs, 5 | QuantizationScheme, 6 | initialize_module_for_quantization, 7 | ) 8 | 9 | from llmcompressor.modifiers.quantization.calibration import initialize_observer 10 | 11 | 12 | @pytest.mark.parametrize( 13 | "shape,group_size,actorder", 14 | [ 15 | ((1, 1), None, False), 16 | ((1, 1), 128, False), 17 | ((1, 1), 128, True), 18 | ((64, 64), None, False), 19 | ((64, 64), 128, False), 20 | ((64, 64), 128, True), 21 | ((1792, 4096), None, False), 22 | ((1792, 4096), 128, False), 23 | ((1792, 4096), 128, True), 24 | ((3420, 64), None, False), 25 | ((3420, 64), 128, False), 26 | ((3420, 64), 128, True), 27 | ], 28 | ) 29 | def test_observers_update(shape, group_size, actorder): 30 | module = torch.nn.Linear(*shape) 31 | scheme = QuantizationScheme( 32 | targets=["Linear"], 33 | weights=QuantizationArgs(group_size=group_size, actorder=actorder), 34 | input_activations=QuantizationArgs(), 35 | output_activations=QuantizationArgs(), 36 | ) 37 | 38 | input = torch.empty(module.in_features, dtype=module.weight.dtype) 39 | output = torch.empty(module.out_features, dtype=module.weight.dtype) 40 | 41 | initialize_module_for_quantization(module, scheme) 42 | initialize_observer(module, "weight") 43 | initialize_observer(module, "input") 44 | initialize_observer(module, "output") 45 | 46 | for location, value in ( 47 | ("weight", module.weight), 48 | ("input", input), 49 | ("output", output), 50 | ): 51 | observer = getattr(module, f"{location}_observer") 52 | g_idx = getattr(module, "g_idx", None) 53 | updated_scale, updated_zero_point = observer(value, g_idx=g_idx) 54 | 55 | assert_alike(updated_scale, getattr(module, f"{location}_scale")) 56 | assert_alike(updated_zero_point, getattr(module, f"{location}_zero_point")) 57 | 58 | 59 | def assert_alike(a, b): 60 | assert a.dtype ==
b.dtype 61 | assert a.shape == b.shape 62 | -------------------------------------------------------------------------------- /tests/llmcompressor/modifiers/conf.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock 2 | 3 | from torch.utils.data import DataLoader 4 | 5 | from llmcompressor.core import Event, EventType, State 6 | from llmcompressor.modifiers.factory import ModifierFactory 7 | 8 | 9 | def setup_modifier_factory(): 10 | ModifierFactory.refresh() 11 | assert ModifierFactory._loaded, "ModifierFactory not loaded" 12 | 13 | 14 | class LifecyleTestingHarness: 15 | def __init__( 16 | self, 17 | model=None, 18 | optimizer=None, 19 | device="cpu", 20 | start=0, 21 | ): 22 | self.state = State() 23 | self.state.update( 24 | model=model, 25 | device=device, 26 | optimizer=optimizer, 27 | start=start, 28 | steps_per_epoch=1, 29 | calib_data=DataLoader(MagicMock(__len__=lambda _: 0, column_names=[])), 30 | ) 31 | 32 | def update_modifier(self, modifier, event_type): 33 | event = Event(event_type=event_type) 34 | modifier.update_event(self.state, event=event) 35 | 36 | def get_state(self): 37 | return self.state 38 | 39 | def trigger_modifier_for_epochs(self, modifier, num_epochs): 40 | for _ in range(num_epochs): 41 | self.update_modifier(modifier, EventType.BATCH_START) 42 | self.update_modifier(modifier, EventType.LOSS_CALCULATED) 43 | self.update_modifier(modifier, EventType.OPTIM_PRE_STEP) 44 | self.update_modifier(modifier, EventType.OPTIM_POST_STEP) 45 | self.update_modifier(modifier, EventType.BATCH_END) 46 | -------------------------------------------------------------------------------- /tests/llmcompressor/modifiers/logarithmic_equalization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/modifiers/logarithmic_equalization/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/modifiers/logarithmic_equalization/test_base.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pytest 4 | 5 | from llmcompressor.modifiers.factory import ModifierFactory 6 | from llmcompressor.modifiers.logarithmic_equalization.base import ( 7 | LogarithmicEqualizationModifier, 8 | ) 9 | from llmcompressor.modifiers.smoothquant.base import SmoothQuantModifier 10 | from tests.llmcompressor.modifiers.conf import setup_modifier_factory 11 | 12 | 13 | @pytest.mark.unit 14 | class TestLogarithmicEqualizationIsRegistered(unittest.TestCase): 15 | def setUp(self): 16 | self.kwargs = dict( 17 | smoothing_strength=0.3, 18 | mappings=[(["layer1", "layer2"], "layer3")], 19 | ) 20 | setup_modifier_factory() 21 | 22 | def test_log_equalization_is_registered(self): 23 | modifier = ModifierFactory.create( 24 | type_="LogarithmicEqualizationModifier", 25 | allow_experimental=False, 26 | allow_registered=True, 27 | **self.kwargs, 28 | ) 29 | 30 | self.assertIsInstance( 31 | modifier, 32 | LogarithmicEqualizationModifier, 33 | "PyTorch LogarithmicEqualizationModifier not registered", 34 | ) 35 | 36 | self.assertIsInstance(modifier, SmoothQuantModifier) 37 | self.assertEqual(modifier.smoothing_strength, self.kwargs["smoothing_strength"]) 38 | self.assertEqual(modifier.mappings, self.kwargs["mappings"]) 39 | 
-------------------------------------------------------------------------------- /tests/llmcompressor/modifiers/pruning/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/modifiers/pruning/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/modifiers/pruning/sparsegpt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/modifiers/pruning/sparsegpt/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/modifiers/pruning/sparsegpt/test_base.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pytest 4 | 5 | from llmcompressor.modifiers.factory import ModifierFactory 6 | from llmcompressor.modifiers.obcq.base import SparseGPTModifier 7 | from tests.llmcompressor.modifiers.conf import setup_modifier_factory 8 | 9 | 10 | @pytest.mark.unit 11 | class TestSparseGPTIsRegistered(unittest.TestCase): 12 | def setUp(self): 13 | self.kwargs = dict( 14 | sparsity=0.5, 15 | targets="__ALL_PRUNABLE__", 16 | ) 17 | setup_modifier_factory() 18 | 19 | def test_sparsegpt_is_registered(self): 20 | type_ = ModifierFactory.create( 21 | type_="SparseGPTModifier", 22 | allow_experimental=False, 23 | allow_registered=True, 24 | **self.kwargs, 25 | ) 26 | 27 | self.assertIsInstance( 28 | type_, 29 | SparseGPTModifier, 30 | "PyTorch SparseGPTModifier not registered", 31 | ) 32 | -------------------------------------------------------------------------------- /tests/llmcompressor/modifiers/pruning/wanda/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/modifiers/pruning/wanda/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/modifiers/pruning/wanda/test_base.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pytest 4 | 5 | from llmcompressor.modifiers.factory import ModifierFactory 6 | from llmcompressor.modifiers.pruning.wanda.base import WandaPruningModifier 7 | from tests.llmcompressor.modifiers.conf import setup_modifier_factory 8 | 9 | 10 | @pytest.mark.unit 11 | class TestWandaIsRegistered(unittest.TestCase): 12 | def setUp(self): 13 | self.kwargs = dict( 14 | sparsity=0.5, 15 | targets="__ALL_PRUNABLE__", 16 | ) 17 | setup_modifier_factory() 18 | 19 | def test_wanda_is_registered(self): 20 | type_ = ModifierFactory.create( 21 | type_="WandaPruningModifier", 22 | allow_experimental=False, 23 | allow_registered=True, 24 | **self.kwargs, 25 | ) 26 | 27 | self.assertIsInstance( 28 | type_, 29 | WandaPruningModifier, 30 | "PyTorch WandaPruningModifier not registered", 31 | ) 32 | -------------------------------------------------------------------------------- /tests/llmcompressor/modifiers/quantization/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/modifiers/quantization/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/modifiers/smoothquant/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/modifiers/smoothquant/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/modifiers/smoothquant/test_base.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pytest 4 | 5 | from llmcompressor.modifiers.factory import ModifierFactory 6 | from llmcompressor.modifiers.smoothquant.base import SmoothQuantModifier 7 | from tests.llmcompressor.modifiers.conf import setup_modifier_factory 8 | 9 | 10 | @pytest.mark.unit 11 | class TestSmoothQuantIsRegistered(unittest.TestCase): 12 | def setUp(self): 13 | self.kwargs = dict( 14 | smoothing_strength=0.3, 15 | mappings=[(["layer1", "layer2"], "layer3")], 16 | ) 17 | setup_modifier_factory() 18 | 19 | def test_smooth_quant_is_registered(self): 20 | modifier = ModifierFactory.create( 21 | type_="SmoothQuantModifier", 22 | allow_experimental=False, 23 | allow_registered=True, 24 | **self.kwargs, 25 | ) 26 | 27 | self.assertIsInstance( 28 | modifier, 29 | SmoothQuantModifier, 30 | "PyTorch SmoothQuant not registered", 31 | ) 32 | 33 | self.assertEqual(modifier.smoothing_strength, self.kwargs["smoothing_strength"]) 34 | self.assertEqual(modifier.mappings, self.kwargs["mappings"]) 35 | 36 | 37 | @pytest.mark.unit 38 | class TestSmoothQuantDefaults(unittest.TestCase): 39 | def setUp(self): 40 | setup_modifier_factory() 41 | 42 | def test_defaults(self): 43 | default_sq = SmoothQuantModifier() 44 | assert default_sq.smoothing_strength == 0.5 45 | 46 | def test_override_defaults(self): 47 | strength = 0.7 48 | dummy_map = [(["layer1", "layer2"], "layer3")] 49 | non_default_sq = SmoothQuantModifier( 50 | smoothing_strength=strength, mappings=dummy_map 51 | ) 52 | 53 | assert non_default_sq.smoothing_strength == strength 54 | assert non_default_sq.mappings == dummy_map 55 | -------------------------------------------------------------------------------- /tests/llmcompressor/modifiers/smoothquant/test_utils.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | 3 | import pytest 4 | 5 | from llmcompressor.modifiers.smoothquant.utils import ( 6 | get_layer_mappings_from_architecture, 7 | handle_mapping_resolution_errors, 8 | ) 9 | 10 | smoothquant_utils = "llmcompressor.modifiers.smoothquant.utils" 11 | 12 | 13 | @pytest.mark.unit 14 | def test_handle_mapping_resolution_errors(): 15 | README_LOCATION = ( 16 | "https://github.com/vllm-project/llm-compressor/tree/main/" 17 | "src/llmcompressor/modifiers/smoothquant" 18 | ) 19 | 20 | @handle_mapping_resolution_errors 21 | def func_that_raises_exception(): 22 | raise ValueError("An error occurred") 23 | 24 | with pytest.raises(RuntimeError) as excinfo: 25 | func_that_raises_exception() 26 | 27 | assert "Error resolving mappings for given architecture." 
in str(excinfo.value) 28 | assert "Please refer to the README at" in str(excinfo.value) 29 | assert README_LOCATION in str(excinfo.value) 30 | 31 | 32 | @pytest.mark.unit 33 | @patch( 34 | f"{smoothquant_utils}.MAPPINGS_REGISTRY", {"arch1": "mapping1", "arch2": "mapping2"} 35 | ) 36 | @patch(f"{smoothquant_utils}.DEFAULT_SMOOTHQUANT_MAPPINGS", "default_mapping") 37 | def test_get_layer_mappings_from_architecture(): 38 | # Test when architecture is in MAPPINGS_REGISTRY 39 | assert get_layer_mappings_from_architecture("arch1") == "mapping1" 40 | 41 | # Test when architecture is not in MAPPINGS_REGISTRY 42 | assert get_layer_mappings_from_architecture("arch3") == "default_mapping" 43 | -------------------------------------------------------------------------------- /tests/llmcompressor/observers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, 10 | # software distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tests/llmcompressor/observers/test_mse.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, 10 | # software distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | import pytest 17 | import torch 18 | from compressed_tensors.quantization.quant_args import QuantizationArgs 19 | 20 | from llmcompressor.observers import MovingAverageMSEObserver, Observer 21 | 22 | 23 | @pytest.mark.parametrize( 24 | "symmetric,expected_scale,expected_zero_point", 25 | [ 26 | (True, 0.0078, 0), 27 | (False, 0.0039, -128), 28 | ], 29 | ) 30 | def test_mse_observer(symmetric, expected_scale, expected_zero_point): 31 | tensor = torch.tensor([1.0, 1.0, 1.0, 1.0, 1.0]) 32 | num_bits = 8 33 | weights = QuantizationArgs(num_bits=num_bits, symmetric=symmetric, observer="mse") 34 | 35 | observer = weights.observer 36 | observer = Observer.load_from_registry(observer, quantization_args=weights) 37 | scale, zero_point = observer(tensor) 38 | 39 | assert isinstance(observer, MovingAverageMSEObserver) 40 | assert round(scale.item(), 4) == expected_scale 41 | assert round(zero_point.item(), 4) == expected_zero_point 42 | 43 | 44 | def test_mse_observer_symmetric_scale_range(): 45 | tensor = torch.rand(4, 4) 46 | tensor *= 127 47 | 48 | num_bits = 8 49 | weights = QuantizationArgs(num_bits=num_bits, symmetric=True, observer="mse") 50 | 51 | observer = weights.observer 52 | observer = Observer.load_from_registry(observer, quantization_args=weights) 53 | scale, zero_point = observer(tensor) 54 | 55 | # if symmetric, max symmetric_range = abs(-128) / 255 56 | assert round(scale.item(), 4) <= 1.0039 57 | assert round(zero_point.item(), 4) == 0 58 | -------------------------------------------------------------------------------- /tests/llmcompressor/pipelines/sequential/test_helpers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from llmcompressor.pipelines.sequential.helpers import get_sequential_ancestors 4 | 5 | 6 | class DummyModel(torch.nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | self.seq = torch.nn.Sequential(torch.nn.Linear(10, 20), torch.nn.ReLU()) 10 | self.fc = torch.nn.Linear(20, 5) 11 | 12 | def forward(self, x): 13 | x = self.seq(x) 14 | return self.fc(x) 15 | 16 | 17 | def test_get_sequential_ancestors(): 18 | model = DummyModel() 19 | 20 | assert get_sequential_ancestors(model, set()) == set() 21 | assert get_sequential_ancestors(model, {model}) == set() 22 | assert get_sequential_ancestors(model, {model.fc}) == {model} 23 | assert get_sequential_ancestors(model, {model.seq[0]}) == {model, model.seq} 24 | assert get_sequential_ancestors(model, {model.seq[1]}) == {model, model.seq} 25 | -------------------------------------------------------------------------------- /tests/llmcompressor/pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | -------------------------------------------------------------------------------- /tests/llmcompressor/pytorch/modifiers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/pytorch/modifiers/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/pytorch/modifiers/logarithmic_equalization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/pytorch/modifiers/logarithmic_equalization/__init__.py 
-------------------------------------------------------------------------------- /tests/llmcompressor/pytorch/modifiers/logarithmic_equalization/test_pytorch.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pytest 4 | from torch.nn import Linear 5 | 6 | from llmcompressor.core import State 7 | from llmcompressor.modifiers.logarithmic_equalization import ( 8 | LogarithmicEqualizationModifier, 9 | ) 10 | from tests.llmcompressor.pytorch.helpers import LinearNet 11 | 12 | 13 | @pytest.mark.unit 14 | class TestLogEqualizationMapping(unittest.TestCase): 15 | def setUp(self): 16 | self.model = LinearNet() 17 | self.state = State(model=self.model) 18 | 19 | def test_successful_map(self): 20 | mappings = [(["seq.fc2"], "seq.block1.fc1")] 21 | modifier = LogarithmicEqualizationModifier(mappings=mappings) 22 | 23 | modifier.ignore = [] 24 | modifier.resolved_mappings_ = modifier._resolve_mappings(self.state.model) 25 | 26 | self.assertEqual(len(modifier.resolved_mappings_), len(mappings)) 27 | 28 | mapping = modifier.resolved_mappings_[0] 29 | self.assertEqual(mapping.smooth_name, mappings[0][1]) 30 | self.assertIsInstance(mapping.smooth_layer, Linear) 31 | self.assertIsInstance(mapping.balance_layers[0], Linear) 32 | -------------------------------------------------------------------------------- /tests/llmcompressor/pytorch/modifiers/pruning/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/pytorch/modifiers/pruning/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/pytorch/modifiers/pruning/constant/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/pytorch/modifiers/pruning/constant/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/pytorch/modifiers/pruning/sparsegpt/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, 10 | # software distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tests/llmcompressor/pytorch/modifiers/pruning/wanda/test_pytorch.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pytest 4 | 5 | from llmcompressor.modifiers.factory import ModifierFactory 6 | from tests.llmcompressor.modifiers.conf import setup_modifier_factory 7 | 8 | 9 | @pytest.mark.unit 10 | class TestWandaPytorchIsRegistered(unittest.TestCase): 11 | def setUp(self): 12 | self.kwargs = dict( 13 | sparsity=0.5, 14 | targets="__ALL_PRUNABLE__", 15 | ) 16 | setup_modifier_factory() 17 | 18 | def test_wanda_pytorch_is_registered(self): 19 | from llmcompressor.modifiers.pruning.wanda import WandaPruningModifier 20 | 21 | type_ = ModifierFactory.create( 22 | type_="WandaPruningModifier", 23 | allow_experimental=False, 24 | allow_registered=True, 25 | **self.kwargs, 26 | ) 27 | 28 | self.assertIsInstance( 29 | type_, 30 | WandaPruningModifier, 31 | "PyTorch WandaPruningModifier not registered", 32 | ) 33 | -------------------------------------------------------------------------------- /tests/llmcompressor/pytorch/modifiers/smoothquant/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/pytorch/modifiers/smoothquant/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/pytorch/modifiers/smoothquant/test_pytorch.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pytest 4 | from torch.nn import Linear 5 | 6 | from llmcompressor.core import State 7 | from llmcompressor.modifiers.smoothquant import SmoothQuantModifier 8 | from tests.llmcompressor.pytorch.helpers import LinearNet 9 | 10 | 11 | @pytest.mark.unit 12 | class TestSmoothQuantMapping(unittest.TestCase): 13 | def setUp(self): 14 | self.model = LinearNet() 15 | self.state = State(model=self.model) 16 | 17 | def test_successful_map(self): 18 | mappings = [(["seq.fc1"], "seq.fc2")] 19 | modifier = SmoothQuantModifier(mappings=mappings) 20 | 21 | modifier.ignore = [] 22 | modifier.resolved_mappings_ = modifier._resolve_mappings(self.state.model) 23 | 24 | self.assertEqual(len(modifier.resolved_mappings_), len(mappings)) 25 | 26 | mapping = modifier.resolved_mappings_[0] 27 | self.assertEqual(mapping.smooth_name, mappings[0][1]) 28 | self.assertIsInstance(mapping.smooth_layer, Linear) 29 | self.assertIsInstance(mapping.balance_layers[0], Linear) 30 | -------------------------------------------------------------------------------- /tests/llmcompressor/pytorch/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | -------------------------------------------------------------------------------- /tests/llmcompressor/recipe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/recipe/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/recipe/recipe.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | SmoothQuantModifier: 4 |
smoothing_strength: 0.8 5 | mappings: 6 | - - ['re:.*q_proj', 're:.*k_proj', 're:.*v_proj'] 7 | - re:.*input_layernorm 8 | - - ['re:.*gate_proj', 're:.*up_proj'] 9 | - re:.*post_attention_layernorm 10 | GPTQModifier: 11 | targets: ["Linear"] 12 | ignore: [lm_head] 13 | scheme: W8A8 14 | -------------------------------------------------------------------------------- /tests/llmcompressor/test_sentinel.py: -------------------------------------------------------------------------------- 1 | from llmcompressor.sentinel import Sentinel 2 | 3 | 4 | def test_sentinel(): 5 | assert Sentinel("MISSING") == Sentinel("MISSING") 6 | assert Sentinel("MISSING", "module_one") != Sentinel("MISSING", "module_two") 7 | -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/transformers/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/transformers/compression/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/configs/actorder_group_1.1b.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" 4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_actorder_group.yaml" 5 | ppl_threshold: 20 -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/configs/actorder_weight_1.1b.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" 4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_actorder_weight.yaml" 5 | ppl_threshold: 20 -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/configs/channelwise_1.1b.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" 4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_channel.yaml" 5 | ppl_threshold: 20 -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/configs/channelwise_15m.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "regression" 3 | model_stub: "nm-testing/llama2.c-stories15M" 4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_channel.yaml" -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/configs/fp8_1.1b.yaml: 
-------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" 4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_fp8.yaml" 5 | ppl_threshold: 20 -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/configs/fp8_15m.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "regression" 3 | model_stub: "nm-testing/llama2.c-stories15M" 4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_fp8.yaml" -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/configs/group_1.1b.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" 4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_group.yaml" 5 | ppl_threshold: 20 -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/configs/inputs_1.1b.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" 4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_full.yaml" 5 | ppl_threshold: 20 -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/configs/inputs_15m.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "regression" 3 | model_stub: "nm-testing/llama2.c-stories15M" 4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_full.yaml" -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/configs/weights_only_1.1b.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" 4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_weight.yaml" -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/configs/weights_only_15m.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "regression" 3 | model_stub: "nm-testing/llama2.c-stories15M" 4 | new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_weight.yaml" -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/decompression_configs/fp8_dynamic.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "regression" 3 | compressed_model_stub: "nm-testing/tinyllama-fp8-dynamic-compressed" 4 | skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" -------------------------------------------------------------------------------- 
/tests/llmcompressor/transformers/compression/decompression_configs/w4a16.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | compressed_model_stub: "nm-testing/tinyllama-w4a16-compressed" 4 | skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/decompression_configs/w8a16_dense.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | compressed_model_stub: "nm-testing/tinyllama-w8a16-dense" 4 | skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/decompression_configs_skipped/w8a8.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "regression" 3 | compressed_model_stub: "nm-testing/tinyllama-w8a8-compressed" 4 | skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/recipes/new_quant_actorder_group.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | quant_modifiers: 3 | QuantizationModifier: 4 | ignore: ["lm_head", "model.layers.0.mlp.down_proj"] 5 | config_groups: 6 | group_0: 7 | weights: 8 | num_bits: 4 9 | type: "int" 10 | symmetric: False 11 | strategy: "group" 12 | group_size: 128 13 | actorder: "group" 14 | input_activations: null 15 | output_activations: null 16 | targets: ["Linear"] 17 | GPTQModifier: 18 | block_size: 128 -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/recipes/new_quant_actorder_weight.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | quant_modifiers: 3 | QuantizationModifier: 4 | ignore: ["lm_head", "model.layers.0.mlp.down_proj"] 5 | config_groups: 6 | group_0: 7 | weights: 8 | num_bits: 4 9 | type: "int" 10 | symmetric: False 11 | strategy: "group" 12 | group_size: 128 13 | actorder: "weight" 14 | input_activations: null 15 | output_activations: null 16 | targets: ["Linear"] 17 | GPTQModifier: 18 | block_size: 128 -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/recipes/new_quant_channel.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | quant_modifiers: 3 | QuantizationModifier: 4 | ignore: ["lm_head", "model.layers.0.mlp.down_proj"] 5 | config_groups: 6 | group_0: 7 | weights: 8 | num_bits: 4 9 | type: "int" 10 | symmetric: False 11 | strategy: "channel" 12 | input_activations: null 13 | output_activations: null 14 | targets: ["Linear"] 15 | GPTQModifier: 16 | block_size: 128 -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/recipes/new_quant_fp8.yaml: -------------------------------------------------------------------------------- 1 | quant_stage: 2 | quant_modifiers: 3 | QuantizationModifier: 4 | ignore: ["lm_head"] 5 | config_groups: 6 | group_0: 7 | weights: 8 | num_bits: 8 9 | 
type: "float" 10 | symmetric: true 11 | strategy: channel 12 | targets: ["Linear"] -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/recipes/new_quant_full.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | quant_modifiers: 3 | GPTQModifier: 4 | block_size: 128 5 | ignore: ["lm_head", "model.layers.0.mlp.down_proj"] 6 | config_groups: 7 | group_0: 8 | weights: 9 | num_bits: 8 10 | type: "int" 11 | symmetric: false 12 | strategy: "channel" 13 | input_activations: 14 | num_bits: 8 15 | type: "int" 16 | symmetric: false 17 | strategy: "tensor" 18 | output_activations: null 19 | targets: ["Linear"] -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/recipes/new_quant_group.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | quant_modifiers: 3 | QuantizationModifier: 4 | ignore: ["lm_head", "model.layers.0.mlp.down_proj"] 5 | config_groups: 6 | group_0: 7 | weights: 8 | num_bits: 4 9 | type: "int" 10 | symmetric: False 11 | strategy: "group" 12 | group_size: 128 13 | input_activations: null 14 | output_activations: null 15 | targets: ["Linear"] 16 | GPTQModifier: 17 | block_size: 128 -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/recipes/new_quant_simple.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | quant_modifiers: 3 | QuantizationModifier: 4 | ignore: ["lm_head"] 5 | config_groups: 6 | group_0: 7 | weights: 8 | num_bits: 8 9 | type: "int" 10 | symmetric: true 11 | strategy: "tensor" 12 | input_activations: 13 | num_bits: 8 14 | type: "int" 15 | symmetric: false 16 | strategy: "tensor" 17 | output_activations: null 18 | targets: ["Linear"] 19 | group_1: 20 | weights: 21 | num_bits: 8 22 | type: "int" 23 | symmetric: true 24 | strategy: "tensor" 25 | input_activations: null 26 | output_activations: null 27 | targets: ["Embedding"] 28 | -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/recipes/new_quant_weight.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | quant_modifiers: 3 | QuantizationModifier: 4 | ignore: ["lm_head", "model.layers.0.mlp.down_proj"] 5 | config_groups: 6 | group_0: 7 | weights: 8 | num_bits: 8 9 | type: "int" 10 | symmetric: true 11 | strategy: "tensor" 12 | input_activations: null 13 | output_activations: null 14 | targets: ["Linear", "Embedding"] 15 | GPTQModifier: 16 | block_size: 128 17 | targets: ["re:model.layers.\\d+$"] -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/recipes/sparse_24.yaml: -------------------------------------------------------------------------------- 1 | pruning_stage: 2 | obcq_modifiers: 3 | SparseGPTModifier: 4 | sparsity: 0.5 5 | mask_structure: "2:4" 6 | targets: ["Linear"] 7 | ignore: ["re:.*lm_head"] -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/recipes/sparse_24_fp8.yaml: -------------------------------------------------------------------------------- 1 | pruning_stage: 2 | obcq_modifiers: 3 | SparseGPTModifier: 4 | sparsity: 0.5 5 | mask_structure: "2:4" 
6 | targets: ["Linear"] 7 | ignore: ["re:.*lm_head"] 8 | quant_stage: 9 | quant_modifiers: 10 | QuantizationModifier: 11 | ignore: ["lm_head"] 12 | config_groups: 13 | group_0: 14 | weights: 15 | num_bits: 8 16 | type: float 17 | strategy: channel 18 | dynamic: false 19 | symmetric: true 20 | input_activations: 21 | num_bits: 8 22 | type: float 23 | strategy: token 24 | dynamic: true 25 | symmetric: true 26 | targets: ["Linear"] -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "regression" 3 | compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-Dynamic-compressed 4 | uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-Dynamic-uncompressed -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-compressed 4 | uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-uncompressed -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "regression" 3 | compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A16-G128-compressed 4 | uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A16-G128-uncompressed -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/run_compressed_configs_skipped/w8a8.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Per-Token-compressed 4 | uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Per-Token-uncompressed -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/test_has_gpu.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | import torch 5 | 6 | 7 | @pytest.mark.skipif(os.getenv("GITHUB_ACTIONS") != "true", reason="Only run for GHA") 8 | def test_has_gpu(): 9 | """ 10 | This test exists purely to raise an error if 11 | a runner performs transformers tests without a GPU 12 | """ 13 | assert torch.cuda.is_available() 14 | -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/compression/test_infer_quant_format.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from compressed_tensors.quantization import preset_name_to_scheme 3 | 4 | from llmcompressor.transformers.compression.quantization_format import ( 5 | infer_quantization_format, 6 | ) 7 | from tests.llmcompressor.pytorch.helpers import LinearNet 8 | 9 | 10 | @pytest.mark.parametrize( 11 | "preset,sparsity_structure,expected_format", 12 | [ 13 | ["W8A8", "unstructured", 
"int-quantized"], 14 | ["W8A16", "unstructured", "pack-quantized"], 15 | ["W8A16", "2:4", "marlin-24"], 16 | ["W4A16", "unstructured", "pack-quantized"], 17 | ["W4A16", "2:4", "marlin-24"], 18 | ["FP8", "unstructured", "float-quantized"], 19 | ], 20 | ) 21 | def test_infer_quant_format(preset, sparsity_structure, expected_format): 22 | quant_scheme = preset_name_to_scheme(preset, targets=["Linear"]) 23 | 24 | dummy_model = LinearNet() 25 | for _, module in dummy_model.named_modules(): 26 | module.quantization_scheme = quant_scheme 27 | 28 | inferred_format = infer_quantization_format( 29 | dummy_model, save_compressed=True, sparsity_structure=sparsity_structure 30 | ) 31 | assert inferred_format.value == expected_format 32 | -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | 6 | @pytest.fixture(autouse=True) 7 | def run_before_and_after_tests(tmp_path): 8 | os.environ["TRANSFORMERS_CACHE"] = str(tmp_path / "transformers") 9 | os.environ["HF_DATASETS_CACHE"] = str(tmp_path / "datasets") 10 | yield 11 | -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/finetune/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/transformers/finetune/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/finetune/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/transformers/finetune/data/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/finetune/data/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from transformers import AutoTokenizer 3 | 4 | from llmcompressor.args import ModelArguments 5 | 6 | 7 | @pytest.fixture 8 | def tiny_llama_path(): 9 | return "nm-testing/llama2.c-stories15M" 10 | 11 | 12 | @pytest.fixture 13 | def tiny_llama_model_args(tiny_llama_path): 14 | return ModelArguments(model=tiny_llama_path) 15 | 16 | 17 | @pytest.fixture 18 | def tiny_llama_tokenizer(tiny_llama_model_args): 19 | tokenizer = AutoTokenizer.from_pretrained( 20 | tiny_llama_model_args.model, 21 | cache_dir=tiny_llama_model_args.cache_dir, 22 | use_fast=True, 23 | revision=tiny_llama_model_args.model_revision, 24 | use_auth_token=True if tiny_llama_model_args.use_auth_token else None, 25 | ) 26 | return tokenizer 27 | -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/finetune/data/test_dataset_helpers.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from llmcompressor.args import DatasetArguments 4 | from llmcompressor.datasets import make_dataset_splits 5 | from llmcompressor.transformers.finetune.data.data_helpers import get_raw_dataset 6 | 7 | 8 | @pytest.mark.unit 9 | def test_combined_datasets(): 10 | dataset_args = DatasetArguments( 11 | dataset="wikitext", 
dataset_config_name="wikitext-2-raw-v1" 12 | ) 13 | raw_wikitext2 = get_raw_dataset(dataset_args) 14 | datasets = {"all": raw_wikitext2} 15 | split_datasets = make_dataset_splits(datasets, do_train=True) 16 | assert split_datasets.get("train") is not None 17 | 18 | split_datasets = make_dataset_splits(datasets, do_train=True) 19 | assert split_datasets.get("train") is not None 20 | 21 | 22 | @pytest.mark.unit 23 | def test_separate_datasets(): 24 | splits = {"train": "train[:5%]", "validation": "train[10%:20%]"} 25 | dataset_args = DatasetArguments( 26 | dataset="wikitext", dataset_config_name="wikitext-2-raw-v1" 27 | ) 28 | datasets = {} 29 | for split_name, split_str in splits.items(): 30 | raw_wikitext2 = get_raw_dataset(dataset_args, split=split_str) 31 | datasets[split_name] = raw_wikitext2 32 | 33 | split_datasets = make_dataset_splits(datasets, do_train=True) 34 | assert split_datasets.get("train") is not None 35 | 36 | with pytest.raises(ValueError): 37 | # fails due to no train split specified after it is popped 38 | 39 | datasets.pop("train") 40 | split_datasets = make_dataset_splits(datasets, do_train=True) 41 | -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/finetune/finetune_custom/config1.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "sanity" 3 | model: "nm-testing/llama2.c-stories15M" 4 | file_extension: json 5 | num_train_epochs: 1 -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/finetune/finetune_custom/config2.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "sanity" 3 | model: "nm-testing/llama2.c-stories15M" 4 | file_extension: csv 5 | num_train_epochs: 1 -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/finetune/finetune_custom/gpu/gpu_config.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: "neuralmagic/Llama-2-7b-ultrachat200k" 4 | file_extension: json 5 | num_train_epochs: 0.5 -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/finetune/finetune_generic/config1.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: "nm-testing/llama2.c-stories15M" 4 | dataset: open_platypus -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/finetune/finetune_oneshot_configs/config.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "sanity" 3 | model: "nm-testing/llama2.c-stories15M" 4 | dataset: wikitext 5 | dataset_config_name: "wikitext-2-raw-v1" 6 | recipe: "tests/llmcompressor/transformers/finetune/test_alternate_recipe.yaml" 7 | num_train_epochs: 0.25 8 | concat_txt: False -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/finetune/finetune_oneshot_configs/gpu/gpu_config.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: "neuralmagic/Llama-2-7b-ultrachat200k" 4 | dataset: "ultrachat-200k" 5 | recipe:
"tests/llmcompressor/transformers/finetune/test_alternate_recipe.yaml" 6 | num_train_epochs: 0.05 7 | concat_txt: False 8 | -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/finetune/finetune_tokenizer/config1.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: "TinyLlama/TinyLlama-1.1B-Chat-v1.0" 4 | dataset_config_name: wikitext-2-raw-v1 5 | dataset: wikitext -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/finetune/test_alternate_recipe.yaml: -------------------------------------------------------------------------------- 1 | test_oneshot_stage: 2 | obcq_modifiers: 3 | SparseGPTModifier: 4 | sparsity: 0.7 5 | block_size: 128 6 | percdamp: 0.01 7 | mask_structure: "0:0" 8 | targets: ["Linear"] 9 | ignore: ["re:.*lm_head"] 10 | test_train_stage: 11 | pruning_modifiers: 12 | ConstantPruningModifier: 13 | targets: [ 14 | "re:.*self_attn.q_proj", 15 | "re:.*self_attn.k_proj", 16 | "re:.*self_attn.v_proj", 17 | "re:.*self_attn.o_proj", 18 | "re:.*mlp.down_proj", 19 | "re:.*mlp.gate_proj", 20 | "re:.*mlp.up_proj" 21 | ] 22 | start: 0 -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/finetune/test_finetune_recipe.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | pruning_modifiers: 3 | ConstantPruningModifier: 4 | targets: [ 5 | "re:.*self_attn.q_proj", 6 | "re:.*self_attn.k_proj", 7 | "re:.*self_attn.v_proj", 8 | "re:.*self_attn.o_proj", 9 | "re:.*mlp.gate_proj", 10 | "re:.*mlp.up_proj" 11 | ] 12 | start: 0 13 | distillation_modifiers: 14 | OutputDistillationModifier: 15 | targets: ["re:model.layers.\\d+$"] 16 | comparison: "square_head" 17 | start: 0 18 | orig_scale: 1.0 19 | distill_scale: 1.0 -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/finetune/test_finetune_without_recipe.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import unittest 4 | 5 | import pytest 6 | from parameterized import parameterized_class 7 | 8 | from tests.testing_utils import parse_params, requires_gpu 9 | 10 | CONFIGS_DIRECTORY = "tests/llmcompressor/transformers/finetune/finetune_generic" 11 | 12 | 13 | @pytest.mark.integration 14 | @requires_gpu 15 | @parameterized_class(parse_params(CONFIGS_DIRECTORY)) 16 | class TestFinetuneWithoutRecipe(unittest.TestCase): 17 | model = None 18 | dataset = None 19 | 20 | def setUp(self): 21 | self.output = "./finetune_output" 22 | 23 | def test_finetune_without_recipe(self): 24 | from llmcompressor import train 25 | 26 | recipe_str = None 27 | device = "cuda:0" 28 | 29 | concatenate_data = False 30 | max_steps = 50 31 | splits = "train" 32 | 33 | train( 34 | model=self.model, 35 | dataset=self.dataset, 36 | output_dir=self.output, 37 | recipe=recipe_str, 38 | max_steps=max_steps, 39 | concatenate_data=concatenate_data, 40 | splits=splits, 41 | oneshot_device=device, 42 | ) 43 | 44 | def tearDown(self): 45 | if os.path.isdir(self.output): 46 | shutil.rmtree(self.output) 47 | -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/finetune/test_quantization.yaml: -------------------------------------------------------------------------------- 
1 | test_stage: 2 | quant_modifiers: 3 | QuantizationModifier: 4 | ignore: 5 | - model.layers.0.mlp.down_proj 6 | - model.layers.1.mlp.down_proj 7 | - model.layers.2.mlp.down_proj 8 | - model.layers.3.mlp.down_proj 9 | - model.layers.4.mlp.down_proj 10 | - model.layers.5.mlp.down_proj 11 | config_groups: 12 | group_0: 13 | weights: 14 | num_bits: 8 15 | type: "int" 16 | symmetric: False 17 | strategy: "tensor" 18 | input_activations: null 19 | output_activations: null 20 | targets: ["Linear"] 21 | pruning_modifiers: 22 | ConstantPruningModifier: 23 | targets: [ 24 | "re:.*self_attn.q_proj", 25 | "re:.*self_attn.k_proj", 26 | "re:.*self_attn.v_proj", 27 | "re:.*self_attn.o_proj", 28 | "re:.*mlp.gate_proj", 29 | "re:.*mlp.up_proj" 30 | ] 31 | start: 0 32 | -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/finetune/test_safetensors.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import unittest 4 | from pathlib import Path 5 | 6 | import pytest 7 | from parameterized import parameterized_class 8 | 9 | from tests.testing_utils import parse_params, requires_gpu 10 | 11 | CONFIGS_DIRECTORY = "tests/llmcompressor/transformers/finetune/finetune_generic" 12 | 13 | 14 | @pytest.mark.integration 15 | @requires_gpu 16 | @parameterized_class(parse_params(CONFIGS_DIRECTORY)) 17 | class TestSafetensors(unittest.TestCase): 18 | model = None 19 | dataset = None 20 | 21 | def setUp(self): 22 | self.output = Path("./finetune_output") 23 | 24 | def test_safetensors(self): 25 | from llmcompressor import train 26 | 27 | device = "cuda:0" 28 | output_dir = self.output / "output1" 29 | max_steps = 10 30 | splits = {"train": "train[:10%]"} 31 | 32 | train( 33 | model=self.model, 34 | dataset=self.dataset, 35 | output_dir=output_dir, 36 | max_steps=max_steps, 37 | splits=splits, 38 | oneshot_device=device, 39 | ) 40 | 41 | assert os.path.exists(output_dir / "model.safetensors") 42 | assert not os.path.exists(output_dir / "pytorch_model.bin") 43 | 44 | # test we can also load 45 | new_output_dir = self.output / "output2" 46 | train( 47 | model=output_dir, 48 | dataset=self.dataset, 49 | output_dir=new_output_dir, 50 | max_steps=max_steps, 51 | splits=splits, 52 | oneshot_device=device, 53 | ) 54 | 55 | def tearDown(self): 56 | if os.path.isdir(self.output): 57 | shutil.rmtree(self.output) 58 | -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, 10 | # software distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/obcq_configs/completion/gpu/llama_7b_quant.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: "meta-llama/Llama-2-7b-hf" 4 | dataset: open_platypus 5 | recipe: "tests/llmcompressor/transformers/obcq/recipes/quant.yaml" 6 | device: "cuda:0" 7 | num_samples: 512 8 | perplexity: 20 9 | -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/obcq_configs/completion/gpu/llama_7b_quant_and_sparse.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: "meta-llama/Llama-2-7b-hf" 4 | dataset: open_platypus 5 | recipe: "tests/llmcompressor/transformers/obcq/recipes/quant_and_sparse.yaml" 6 | device: "cuda:0" 7 | num_samples: 512 8 | perplexity: 20 9 | -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/obcq_configs/completion/gpu/llama_7b_sparse.yml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: "meta-llama/Llama-2-7b-hf" 4 | dataset: open_platypus 5 | recipe: "tests/llmcompressor/transformers/obcq/recipes/sparse.yaml" 6 | device: "cuda:0" 7 | num_samples: 512 8 | perplexity: 20 9 | -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/obcq_configs/completion/tiny_llama_quant.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "sanity" 3 | model: "nm-testing/llama2.c-stories15M" 4 | dataset: open_platypus 5 | recipe: "tests/llmcompressor/transformers/obcq/recipes/quant.yaml" 6 | num_samples: 32 7 | perplexity: 5000 -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/obcq_configs/completion/tiny_llama_quant_and_sparse.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "sanity" 3 | model: "nm-testing/llama2.c-stories15M" 4 | dataset: open_platypus 5 | recipe: "tests/llmcompressor/transformers/obcq/recipes/quant_and_sparse.yaml" 6 | num_samples: 32 7 | perplexity: 5000 -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/obcq_configs/consec_runs/gpu/llama_consec_runs.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: "meta-llama/Llama-2-7b-hf" 4 | dataset: open_platypus 5 | first_recipe: "tests/llmcompressor/transformers/obcq/recipes/quant_and_sparse.yaml" 6 | second_recipe: "tests/llmcompressor/transformers/obcq/recipes/additional_sparsity.yaml" 7 | device: "cuda:0" -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/obcq_configs/consec_runs/tiny_llama_consec_runs.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "sanity" 3 | model: "nm-testing/llama2.c-stories15M" 4 | dataset: open_platypus 5 | first_recipe: "tests/llmcompressor/transformers/obcq/recipes/quant_and_sparse.yaml" 6 | 
second_recipe: "tests/llmcompressor/transformers/obcq/recipes/additional_sparsity.yaml" -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/obcq_configs/mask_structure/tiny_llama_mask_structure_preservation.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "sanity" 3 | model: "nm-testing/llama2.c-stories15M" 4 | dataset: open_platypus 5 | initial_pruning_only_recipe: "tests/llmcompressor/transformers/obcq/recipes/sparse_with_mask_structure.yaml" 6 | initial_sparsity: 0.5 7 | recipe_mask_structure: "2:4" 8 | subsequent_prune_and_quant_recipe: "tests/llmcompressor/transformers/obcq/recipes/additional_sparsity_with_quant.yaml" 9 | final_sparsity: 0.7 -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/obcq_configs/sparse/gpu/llama_7b_sparse.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: "meta-llama/Llama-2-7b-hf" 4 | dataset: open_platypus 5 | recipe: "tests/llmcompressor/transformers/obcq/recipes/sparse.yaml" 6 | sparsity: 0.3 7 | device: "cuda:0" -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/obcq_configs/sparse/tiny_llama_sparse.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "sanity" 3 | model: "nm-testing/llama2.c-stories15M" 4 | dataset: open_platypus 5 | recipe: "tests/llmcompressor/transformers/obcq/recipes/sparse.yaml" 6 | sparsity: 0.3 -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/obcq_configs/sparsity_generic/config.yaml: -------------------------------------------------------------------------------- 1 | cadence: "nightly" 2 | test_type: "regression" 3 | model: "nm-testing/llama2.c-stories15M" 4 | dataset: open_platypus -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/recipes/additional_sparsity.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | obcq_modifiers: 3 | SparseGPTModifier: 4 | sparsity: 0.7 5 | block_size: 128 6 | percdamp: 0.01 7 | mask_structure: "0:0" 8 | targets: ["re:.*model.layers.0$"] 9 | preserve_sparsity_mask: True -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/recipes/additional_sparsity_with_quant.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | obcq_modifiers: 3 | SparseGPTModifier: 4 | sparsity: 0.7 5 | block_size: 128 6 | percdamp: 0.01 7 | mask_structure: "0:0" 8 | targets: [ 9 | "re:.*model.layers.0$", 10 | ] 11 | preserve_sparsity_mask: True 12 | GPTQModifier: 13 | config_groups: 14 | group_0: 15 | weights: 16 | num_bits: 8 17 | type: "int" 18 | strategy: "channel" 19 | targets: [ 20 | "re:.*model.layers.0.self_attn.q_proj", 21 | ] -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/recipes/quant.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | obcq_modifiers: 3 | SmoothQuantModifier: 4 | smoothing_strength: 0.6 5 | GPTQModifier: 6 | block_size: 128 7 
| percdamp: 0.01 8 | config_groups: 9 | group_0: 10 | weights: 11 | num_bits: 8 12 | input_activations: 13 | num_bits: 8 14 | targets: ["Linear"] -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/recipes/quant_and_sparse.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | obcq_modifiers: 3 | GPTQModifier: 4 | ignore: [lm_head] 5 | config_groups: 6 | group_0: 7 | weights: 8 | num_bits: 8 9 | type: "int" 10 | strategy: "channel" 11 | targets: [Linear] 12 | SparseGPTModifier: 13 | sparsity: 0.5 14 | block_size: 128 15 | percdamp: 0.01 16 | mask_structure: "0:0" 17 | targets: ["re:.*model.layers.0$"] -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/recipes/sparse.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | obcq_modifiers: 3 | SparseGPTModifier: 4 | sparsity: 0.3 5 | block_size: 128 6 | percdamp: 0.01 7 | targets: ["model.layers.0", "model.layers.1"] 8 | mask_structure: "0:0" -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/recipes/sparse_with_mask_structure.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | obcq_modifiers: 3 | SparseGPTModifier: 4 | sparsity: 0.5 5 | block_size: 128 6 | percdamp: 0.01 7 | mask_structure: "2:4" 8 | targets: [ 9 | "re:.*model.layers.0$", 10 | ] -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/recipes/test_tiny2.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | obcq_modifiers: 3 | SparseGPTModifier: 4 | sparsity: 0.5 5 | block_size: 128 6 | percdamp: 0.01 7 | mask_structure: "0:0" 8 | targets: [ 9 | "model.layers.0", 10 | "model.layers.1", 11 | "model.layers.2", 12 | "model.layers.3", 13 | "model.layers.4", 14 | "model.layers.5" 15 | ] -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/test_obcq_infer_targets.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from accelerate import init_empty_weights 3 | from transformers import AutoModelForCausalLM 4 | 5 | from llmcompressor.modifiers.obcq import SparseGPTModifier 6 | 7 | 8 | @pytest.mark.integration 9 | def test_infer_targets(): 10 | modifier = SparseGPTModifier(sparsity=0.0) 11 | with init_empty_weights(): 12 | model = AutoModelForCausalLM.from_pretrained("nm-testing/llama2.c-stories15M") 13 | 14 | inferred = modifier._infer_sequential_targets(model) 15 | assert inferred == ["LlamaDecoderLayer"] 16 | -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/test_obcq_lm_head.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import MagicMock 3 | 4 | import pytest 5 | 6 | from llmcompressor.core.state import State 7 | from llmcompressor.modifiers.obcq import SparseGPTModifier 8 | 9 | 10 | @pytest.mark.integration 11 | class TestLMHead(unittest.TestCase): 12 | def setUp(self): 13 | import torch 14 | from transformers import AutoModelForCausalLM 15 | 16 | self.device = "cuda:0" if torch.cuda.is_available() else "cpu" 17 | 18 | self.model = 
AutoModelForCausalLM.from_pretrained( 19 | "nm-testing/llama2.c-stories15M", device_map=self.device 20 | ) 21 | 22 | self.kwargs = { 23 | "sparsity": 0.5, 24 | "block_size": 128, 25 | "quantize": False, 26 | "targets": [ 27 | "model.layers.0", 28 | "model.layers.1", 29 | "model.layers.2", 30 | "model.layers.3", 31 | "model.layers.4", 32 | "model.layers.5", 33 | ], 34 | } 35 | 36 | dataset = MagicMock() 37 | dataset.column_names = [] 38 | self.dataloader = MagicMock() 39 | self.dataloader.dataset = dataset 40 | self.dataloader.__iter__.return_value = iter([]) 41 | 42 | def test_no_lm_head_target(self): 43 | modifier = SparseGPTModifier(**self.kwargs) 44 | 45 | state = State() 46 | state.update(model=self.model, device=self.device, calib_data=self.dataloader) 47 | modifier.initialize(state) 48 | modifier.on_start(state, None) 49 | 50 | assert len(self.model.lm_head._forward_hooks) <= 0 51 | 52 | modifier.finalize(state) 53 | 54 | def test_lm_head_target(self): 55 | self.kwargs["targets"].append("lm_head") 56 | modifier = SparseGPTModifier(**self.kwargs) 57 | 58 | state = State() 59 | state.update(model=self.model, device=self.device, calib_data=self.dataloader) 60 | modifier.initialize(state) 61 | modifier.on_start(state, None) 62 | 63 | assert len(self.model.lm_head._forward_hooks) == 1 64 | 65 | modifier.finalize(state) 66 | -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/test_obcq_owl.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | from datasets import Dataset 4 | from transformers import AutoModelForCausalLM 5 | 6 | from llmcompressor.core.session_functions import create_session 7 | from llmcompressor.datasets import format_calibration_data 8 | from llmcompressor.modifiers.obcq import SparseGPTModifier 9 | from llmcompressor.utils.pytorch.module import get_layers 10 | 11 | 12 | @pytest.mark.integration 13 | def test_infer_owl_layer_sparsity(): 14 | target_sparsity = 0.7 15 | vocab_size = 512 16 | seq_len = 2048 17 | ds_size = 16 18 | 19 | with create_session() as session: 20 | session.initialize() 21 | modifier = SparseGPTModifier( 22 | sparsity=0.7, sparsity_profile="owl", owl_m=5, owl_lmbda=0.05 23 | ) 24 | model = AutoModelForCausalLM.from_pretrained("nm-testing/llama2.c-stories15M") 25 | 26 | dataset = Dataset.from_dict( 27 | {"input_ids": torch.randint(0, vocab_size, (ds_size, seq_len))} 28 | ) 29 | dataloader = format_calibration_data(dataset) 30 | 31 | sequential_targets = modifier._infer_sequential_targets(model) 32 | layers = get_layers(sequential_targets, model) 33 | sparsities = modifier._infer_owl_layer_sparsity(model, layers, dataloader) 34 | assert sparsities.keys() == layers.keys() 35 | 36 | for sparsity in sparsities.values(): 37 | assert sparsity == pytest.approx(target_sparsity, abs=0.1) 38 | -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/obcq/test_oneshot_with_modifier.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import unittest 4 | from pathlib import Path 5 | 6 | import pytest 7 | from parameterized import parameterized_class 8 | 9 | from tests.testing_utils import parse_params, requires_gpu 10 | 11 | CONFIGS_DIRECTORY = ( 12 | "tests/llmcompressor/transformers/obcq/obcq_configs/sparsity_generic" 13 | ) 14 | 15 | 16 | @pytest.mark.integration 17 | @requires_gpu 18 | 
@parameterized_class(parse_params(CONFIGS_DIRECTORY)) 19 | class TestOneshotWithModifierObject(unittest.TestCase): 20 | model = None 21 | dataset = None 22 | 23 | def setUp(self): 24 | self.output = Path("./finetune_output") 25 | 26 | def test_oneshot_with_modifier_object(self): 27 | from llmcompressor import oneshot 28 | from llmcompressor.modifiers.obcq.base import SparseGPTModifier 29 | 30 | recipe_str = [ 31 | SparseGPTModifier(sparsity=0.5, targets=[r"re:model.layers.\d+$"]) 32 | ] 33 | 34 | device = "cuda:0" 35 | concatenate_data = False 36 | num_calibration_samples = 64 37 | output_dir = self.output / "oneshot_out" 38 | splits = {"calibration": "train[:10%]"} 39 | 40 | oneshot( 41 | model=self.model, 42 | dataset=self.dataset, 43 | output_dir=output_dir, 44 | num_calibration_samples=num_calibration_samples, 45 | recipe=recipe_str, 46 | concatenate_data=concatenate_data, 47 | splits=splits, 48 | oneshot_device=device, 49 | ) 50 | 51 | def tearDown(self): 52 | if os.path.isdir(self.output): 53 | shutil.rmtree(self.output) 54 | -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/oneshot/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/transformers/oneshot/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/oneshot/oneshot_configs/recipes/recipe.yaml: -------------------------------------------------------------------------------- 1 | test_stage: 2 | obcq_modifiers: 3 | SparseGPTModifier: 4 | sparsity: 0.5 5 | block_size: 128 6 | targets: [ 7 | 're:model.layers.3.mlp.gate_proj.weight' 8 | ] -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_stories_conf1.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "smoke" 3 | tokenize: False 4 | model: "nm-testing/llama2.c-stories15M" 5 | dataset: open_platypus 6 | recipe: | 7 | test_stage: 8 | obcq_modifiers: 9 | SparseGPTModifier: 10 | sparsity: 0.5 11 | block_size: 128 12 | targets: [ 13 | 're:model.layers.3.mlp.gate_proj.weight' 14 | ] -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_stories_conf2.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "smoke" 3 | tokenize: False 4 | model: "nm-testing/llama2.c-stories15M" 5 | dataset: open_platypus 6 | recipe: "tests/llmcompressor/transformers/oneshot/oneshot_configs/recipes/recipe.yaml" -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_stories_conf3.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "smoke" 3 | tokenize: False 4 | model: "nm-testing/llama2.c-stories15M" 5 | dataset: "gsm8k" 6 | dataset_config_name: "main" 7 | recipe: "tests/llmcompressor/transformers/oneshot/oneshot_configs/recipes/recipe.yaml" -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_stories_conf4.yaml: 
-------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "smoke" 3 | tokenize: False 4 | model: "nm-testing/llama2.c-stories15M" 5 | dataset: "gsm8k" 6 | dataset_config_name: "main" 7 | recipe: | 8 | test_stage: 9 | obcq_modifiers: 10 | SparseGPTModifier: 11 | sparsity: 0.5 12 | block_size: 128 13 | targets: [ 14 | 're:model.layers.3.mlp.gate_proj.weight' 15 | ] -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_stories_conf5.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "smoke" 3 | tokenize: True 4 | model: "nm-testing/llama2.c-stories15M" 5 | dataset: open_platypus 6 | recipe: "tests/llmcompressor/transformers/oneshot/oneshot_configs/recipes/recipe.yaml" -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_stories_conf6.yaml: -------------------------------------------------------------------------------- 1 | cadence: "commit" 2 | test_type: "smoke" 3 | tokenize: True 4 | model: "nm-testing/llama2.c-stories15M" 5 | dataset: "gsm8k" 6 | recipe: "tests/llmcompressor/transformers/oneshot/oneshot_configs/recipes/recipe.yaml" -------------------------------------------------------------------------------- /tests/llmcompressor/transformers/sparsification/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, 10 | # software distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tests/llmcompressor/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/utils/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/utils/pytorch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/llmcompressor/utils/pytorch/__init__.py -------------------------------------------------------------------------------- /tests/llmcompressor/utils/pytorch/test_module.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch.nn as nn 3 | 4 | from llmcompressor.utils.pytorch import get_layer_by_name 5 | 6 | 7 | @pytest.fixture 8 | def example_nested_module() -> nn.Module: 9 | return nn.Sequential( 10 | nn.Linear(10, 20), 11 | nn.Sequential(nn.ReLU(), nn.Linear(20, 10)), 12 | nn.Sequential(nn.SiLU(), nn.Linear(20, 10)), 13 | nn.Softmax(dim=1), 14 | ) 15 | 16 | 17 | @pytest.mark.unit 18 | def test_get_layer_by_name(example_nested_module): 19 | # Test getting nested layers by their dotted names 20 | layer = get_layer_by_name("0", example_nested_module) 21 | assert layer == example_nested_module[0] 22 | 23 | layer = get_layer_by_name("1.1", example_nested_module) 24 | assert layer == example_nested_module[1][1] 25 | 26 | layer = get_layer_by_name("2.0", example_nested_module) 27 | assert layer == example_nested_module[2][0] 28 | 29 | layer = get_layer_by_name("2.1", example_nested_module) 30 | assert layer == example_nested_module[2][1] 31 | 32 | # Test that requesting a non-existent layer raises 33 | with pytest.raises(AttributeError): 34 | get_layer_by_name("non_existent_layer", example_nested_module) 35 | -------------------------------------------------------------------------------- /tests/lmeval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/lmeval/__init__.py -------------------------------------------------------------------------------- /tests/lmeval/configs/fp8_dynamic_per_token.yaml: -------------------------------------------------------------------------------- 1 | cadence: "weekly" 2 | model: meta-llama/Meta-Llama-3-8B-Instruct 3 | scheme: FP8_DYNAMIC 4 | lmeval: 5 | metrics: 6 | exact_match,flexible-extract: 0.75 7 | exact_match,strict-match: 0.75 8 | -------------------------------------------------------------------------------- /tests/lmeval/configs/fp8_static_per_tensor.yaml: -------------------------------------------------------------------------------- 1 | cadence: "weekly" 2 | model: meta-llama/Meta-Llama-3-8B-Instruct 3 | scheme: FP8 4 | dataset_id: HuggingFaceH4/ultrachat_200k 5 | dataset_split: train_sft 6 | lmeval: 7 | metrics: 8 | exact_match,flexible-extract: 0.75 9 | exact_match,strict-match: 0.75 10 | -------------------------------------------------------------------------------- /tests/lmeval/configs/int8_w8a8_dynamic_per_token.yaml: -------------------------------------------------------------------------------- 1 | cadence: "weekly" 2 | model: meta-llama/Meta-Llama-3-8B-Instruct 3 | scheme:
INT8_dyn_per_token 4 | recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_dynamic_per_token.yaml 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | lmeval: 8 | metrics: 9 | exact_match,flexible-extract: 0.77 10 | exact_match,strict-match: 0.76 -------------------------------------------------------------------------------- /tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml: -------------------------------------------------------------------------------- 1 | cadence: weekly 2 | model: Qwen/Qwen2.5-VL-7B-Instruct 3 | model_class: Qwen2_5_VLForConditionalGeneration 4 | scheme: FP8_DYNAMIC 5 | lmeval: 6 | model: "hf-multimodal" 7 | model_args: 8 | dtype: bfloat16 9 | add_bos_token: True 10 | convert_img_format: True 11 | task: mmmu_val_literature 12 | num_fewshot: 0 13 | batch_size: 8 14 | # dense model achieves accuracy of 0.9 +/- 0.0557 15 | metrics: 16 | acc,none: 0.8667 17 | acc_stderr,none: 0.0557 18 | -------------------------------------------------------------------------------- /tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml: -------------------------------------------------------------------------------- 1 | cadence: "weekly" 2 | model: Qwen/Qwen2.5-VL-7B-Instruct 3 | model_class: Qwen2_5_VLForConditionalGeneration 4 | scheme: INT8_dyn_per_token 5 | recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_dynamic_per_token.yaml 6 | dataset_id: lmms-lab/flickr30k 7 | dataset_split: "test[:512]" 8 | lmeval: 9 | model: "hf-multimodal" 10 | model_args: 11 | dtype: bfloat16 12 | add_bos_token: True 13 | convert_img_format: True 14 | task: mmmu_val_literature 15 | num_fewshot: 0 16 | batch_size: 8 17 | # dense model achieves accuracy of 0.9 +/- 0.0557 18 | metrics: 19 | acc,none: 0.833 20 | acc_stderr,none: 0.0557 -------------------------------------------------------------------------------- /tests/lmeval/configs/vl_w4a16_actorder_weight.yaml: -------------------------------------------------------------------------------- 1 | cadence: "weekly" 2 | model: Qwen/Qwen2.5-VL-7B-Instruct 3 | model_class: Qwen2_5_VLForConditionalGeneration 4 | scheme: W4A16_actorder_weight 5 | recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml 6 | dataset_id: lmms-lab/flickr30k 7 | dataset_split: "test[:512]" 8 | lmeval: 9 | model: "hf-multimodal" 10 | model_args: 11 | dtype: bfloat16 12 | add_bos_token: True 13 | convert_img_format: True 14 | task: mmmu_val_literature 15 | num_fewshot: 0 16 | batch_size: 8 17 | # dense model achieves accuracy of 0.9 +/- 0.0557 18 | metrics: 19 | acc,none: 0.8333 20 | acc_stderr,none: 0.0557 -------------------------------------------------------------------------------- /tests/lmeval/configs/w4a16_actorder_group.yaml: -------------------------------------------------------------------------------- 1 | cadence: "weekly" 2 | model: meta-llama/Meta-Llama-3-8B-Instruct 3 | scheme: W4A16_actorder_group 4 | recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_group.yaml 5 | dataset_id: HuggingFaceH4/ultrachat_200k 6 | dataset_split: train_sft 7 | lmeval: 8 | metrics: 9 | exact_match,flexible-extract: 0.72 10 | exact_match,strict-match: 0.72 11 | -------------------------------------------------------------------------------- /tests/lmeval/configs/w4a16_actorder_weight.yaml: -------------------------------------------------------------------------------- 1 | cadence: "weekly" 2 | model: meta-llama/Meta-Llama-3-8B-Instruct 3 | scheme: W4A16_actorder_weight 4 | recipe:
--------------------------------------------------------------------------------
/tests/lmeval/configs/w4a16_actorder_weight.yaml:
--------------------------------------------------------------------------------
 1 | cadence: "weekly"
 2 | model: meta-llama/Meta-Llama-3-8B-Instruct
 3 | scheme: W4A16_actorder_weight
 4 | recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml
 5 | dataset_id: HuggingFaceH4/ultrachat_200k
 6 | dataset_split: train_sft
 7 | lmeval:
 8 |   metrics:
 9 |     exact_match,flexible-extract: 0.72
10 |     exact_match,strict-match: 0.72
11 | 
--------------------------------------------------------------------------------
/tests/lmeval/configs/w4a16_grouped_quant.yaml:
--------------------------------------------------------------------------------
 1 | cadence: "weekly"
 2 | model: meta-llama/Meta-Llama-3-8B-Instruct
 3 | scheme: W4A16
 4 | dataset_id: HuggingFaceH4/ultrachat_200k
 5 | dataset_split: train_sft
 6 | quant_type: "GPTQ"
 7 | lmeval:
 8 |   metrics:
 9 |     exact_match,flexible-extract: 0.72
10 |     exact_match,strict-match: 0.72
11 | 
--------------------------------------------------------------------------------
/tests/test_timer/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | 
3 | from .timer import Timer
4 | 
--------------------------------------------------------------------------------
/tests/test_timer/timer_utils.py:
--------------------------------------------------------------------------------
 1 | from functools import wraps
 2 | 
 3 | from tests.test_timer import Timer
 4 | 
 5 | __all__ = ["log_time", "get_singleton_manager"]
 6 | 
 7 | 
 8 | def get_singleton_manager(enable_logging: bool = True):
 9 |     """
10 |     Return the Timer. If it has not yet been initialized, initialize and
11 |     return it. If it has, return the existing Timer.
12 |     """
13 |     if Timer._instance is None:
14 |         Timer._instance = Timer(enable_logging=enable_logging)
15 |     return Timer._instance
16 | 
17 | 
18 | def log_time(func):
19 |     """
20 |     Decorator to time functions. Times for the function are stored using
21 |     the class and function names.
22 |     """
23 | 
24 |     @wraps(func)
25 |     def wrapper(*args, **kwargs):
26 |         TIMER_MANAGER = get_singleton_manager()
27 |         func_name = func.__name__
28 | 
29 |         if not TIMER_MANAGER.enable_logging:
30 |             return func(*args, **kwargs)
31 | 
32 |         with TIMER_MANAGER.time(func_name):
33 |             return func(*args, **kwargs)
34 | 
35 |     return wrapper
36 | 
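
A short usage sketch for the decorator defined above, assuming Timer.time(name) is a context manager that records elapsed wall-clock time under the given name (as the wrapper implies) and that logging is enabled; the decorated function is illustrative:

    from tests.test_timer.timer_utils import get_singleton_manager, log_time

    @log_time
    def run_calibration():
        # stand-in for real work; its duration is recorded under "run_calibration"
        return sum(i * i for i in range(10_000))

    run_calibration()
    timer = get_singleton_manager()  # the same singleton the decorator used to record the timing
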
--------------------------------------------------------------------------------
/tests/unit/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/unit/__init__.py
--------------------------------------------------------------------------------
/tests/unit/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/unit/core/__init__.py
--------------------------------------------------------------------------------
/tests/unit/core/events/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vllm-project/llm-compressor/341e27c52a3523193babed1b2c546f1d66b0f3e5/tests/unit/core/events/__init__.py
--------------------------------------------------------------------------------
/tests/unit/core/events/test_event.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from llmcompressor.core import Event, EventType
 4 | 
 5 | 
 6 | @pytest.mark.smoke
 7 | def test_event_epoch_based():
 8 |     event = Event(steps_per_epoch=10)
 9 |     assert event.epoch_based is True
10 | 
11 | 
12 | @pytest.mark.smoke
13 | def test_event_epoch():
14 |     event = Event(steps_per_epoch=10, global_step=25)
15 |     assert event.epoch == 2
16 | 
17 | 
18 | @pytest.mark.smoke
19 | def test_event_epoch_full():
20 |     event = Event(steps_per_epoch=10, global_step=25)
21 |     assert event.epoch_full == 2.5
22 | 
23 | 
24 | @pytest.mark.smoke
25 | def test_event_epoch_step():
26 |     event = Event(steps_per_epoch=10, global_step=25)
27 |     assert event.epoch_step == 5
28 | 
29 | 
30 | @pytest.mark.smoke
31 | def test_event_epoch_batch():
32 |     event = Event(
33 |         steps_per_epoch=10, global_step=25, batches_per_step=2, global_batch=50
34 |     )
35 |     assert event.epoch_batch == 10
36 | 
37 | 
38 | @pytest.mark.smoke
39 | def test_event_current_index():
40 |     event = Event(steps_per_epoch=10, global_step=25)
41 |     assert event.current_index == 2.5
42 | 
43 | 
44 | @pytest.mark.smoke
45 | def test_event_should_update():
46 |     event = Event(steps_per_epoch=10, global_step=25)
47 |     assert event.should_update(start=0, end=30, update=2.5) is True
48 |     assert event.should_update(start=0, end=20, update=5) is False
49 |     assert event.should_update(start=0, end=30, update=0) is True
50 | 
51 | 
52 | @pytest.mark.smoke
53 | def test_event_new_instance():
54 |     event = Event(type_=EventType.INITIALIZE, global_step=25)
55 |     new_event = event.new_instance(global_step=30)
56 |     assert new_event.global_step == 30
57 |     assert new_event.type_ == EventType.INITIALIZE
58 | 
--------------------------------------------------------------------------------
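
The assertions above pin down the epoch arithmetic on Event: with steps_per_epoch=10 and global_step=25, epoch is the integer quotient, epoch_full the real-valued quotient, and epoch_step the remainder; epoch_batch follows the same pattern over batches. A small sketch reproducing those numbers directly, on the assumption that the properties are computed with exactly this arithmetic:

    steps_per_epoch, global_step = 10, 25
    batches_per_step, global_batch = 2, 50

    epoch = global_step // steps_per_epoch                             # 2
    epoch_full = global_step / steps_per_epoch                         # 2.5
    epoch_step = global_step % steps_per_epoch                         # 5
    epoch_batch = global_batch % (steps_per_epoch * batches_per_step)  # 10

    assert (epoch, epoch_full, epoch_step, epoch_batch) == (2, 2.5, 5, 10)
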