├── .DS_Store ├── .gitattributes ├── .gitignore ├── LICENSE ├── Layer_Drop.svg ├── README.md ├── SECURITY.md ├── requirements.txt ├── scripts ├── .DS_Store ├── benchmark │ ├── benchmark_lm_eval.sh │ └── benchmark_speed.sh ├── dropping │ ├── block_drop.sh │ ├── layer_drop.sh │ ├── layer_drop_iterative.sh │ └── layer_drop_joint.sh └── quantization │ ├── awq.sh │ └── gptq.sh ├── setup.py └── src ├── .DS_Store ├── __init__.py ├── benchmark_speed.py ├── compress.py └── llmtuner ├── .DS_Store ├── __init__.py ├── compression ├── .DS_Store ├── __init__.py ├── prune │ ├── __init__.py │ ├── block_drop.py │ ├── io.py │ ├── layer_drop.py │ ├── models │ │ ├── __init__.py │ │ ├── configuration_deepseek.py │ │ ├── configuration_dropped_baichuan.py │ │ ├── configuration_dropped_gemma2.py │ │ ├── configuration_dropped_llama.py │ │ ├── configuration_dropped_mistral.py │ │ ├── modeling_dropped_baichuan.py │ │ ├── modeling_dropped_deepseek.py │ │ ├── modeling_dropped_gemma2.py │ │ ├── modeling_dropped_llama.py │ │ └── modeling_dropped_mistral.py │ ├── utils.py │ ├── workflow.py │ └── wrapper.py ├── quantization │ ├── AutoAWQ │ │ ├── AutoAWQ_kernels │ │ │ ├── LICENSE │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── awq_ext │ │ │ │ ├── attention │ │ │ │ │ ├── cuda_bf16_fallbacks.cuh │ │ │ │ │ ├── cuda_bf16_wrapper.h │ │ │ │ │ ├── decoder_masked_multihead_attention.cu │ │ │ │ │ ├── decoder_masked_multihead_attention.h │ │ │ │ │ ├── decoder_masked_multihead_attention_template.hpp │ │ │ │ │ ├── decoder_masked_multihead_attention_utils.h │ │ │ │ │ ├── ft_attention.cpp │ │ │ │ │ └── ft_attention.h │ │ │ │ ├── exllama │ │ │ │ │ ├── cu_compat.cuh │ │ │ │ │ ├── cuda_buffers.cu │ │ │ │ │ ├── cuda_buffers.cuh │ │ │ │ │ ├── cuda_func │ │ │ │ │ │ ├── column_remap.cu │ │ │ │ │ │ ├── column_remap.cuh │ │ │ │ │ │ ├── q4_matmul.cu │ │ │ │ │ │ ├── q4_matmul.cuh │ │ │ │ │ │ ├── q4_matrix.cu │ │ │ │ │ │ └── q4_matrix.cuh │ │ │ │ │ ├── exllama_ext.cpp │ │ │ │ │ ├── hip_compat.cuh │ │ │ │ │ ├── matrix.cuh │ │ │ │ │ ├── tuning.h │ │ │ │ │ └── util.cuh │ │ │ │ ├── exllamav2 │ │ │ │ │ ├── config.h │ │ │ │ │ ├── cpp │ │ │ │ │ │ └── util.h │ │ │ │ │ ├── cuda │ │ │ │ │ │ ├── compat.cuh │ │ │ │ │ │ ├── compat_gemm.cuh │ │ │ │ │ │ ├── matrix_view.cuh │ │ │ │ │ │ ├── q_gemm.cu │ │ │ │ │ │ ├── q_gemm.cuh │ │ │ │ │ │ ├── q_gemm_kernel.cuh │ │ │ │ │ │ ├── q_gemm_kernel_gptq.cuh │ │ │ │ │ │ ├── q_matrix.cu │ │ │ │ │ │ ├── q_matrix.cuh │ │ │ │ │ │ ├── quant │ │ │ │ │ │ │ ├── qdq_2.cuh │ │ │ │ │ │ │ ├── qdq_3.cuh │ │ │ │ │ │ │ ├── qdq_4.cuh │ │ │ │ │ │ │ ├── qdq_5.cuh │ │ │ │ │ │ │ ├── qdq_6.cuh │ │ │ │ │ │ │ ├── qdq_8.cuh │ │ │ │ │ │ │ └── qdq_util.cuh │ │ │ │ │ │ └── util.cuh │ │ │ │ │ └── ext.cpp │ │ │ │ ├── layernorm │ │ │ │ │ ├── layernorm.cu │ │ │ │ │ ├── layernorm.h │ │ │ │ │ └── reduction.cuh │ │ │ │ ├── position_embedding │ │ │ │ │ ├── pos_encoding.h │ │ │ │ │ └── pos_encoding_kernels.cu │ │ │ │ ├── pybind_awq.cpp │ │ │ │ ├── pybind_awq_ft.cpp │ │ │ │ ├── pybind_awq_v2.cpp │ │ │ │ ├── quantization │ │ │ │ │ ├── dequantize.cuh │ │ │ │ │ ├── gemm_cuda.h │ │ │ │ │ ├── gemm_cuda_gen.cu │ │ │ │ │ ├── gemv_cuda.cu │ │ │ │ │ └── gemv_cuda.h │ │ │ │ ├── quantization_new │ │ │ │ │ ├── dequantize.cuh │ │ │ │ │ ├── gemm │ │ │ │ │ │ ├── gemm_cuda.cu │ │ │ │ │ │ ├── gemm_cuda.h │ │ │ │ │ │ └── semaphore.h │ │ │ │ │ └── gemv │ │ │ │ │ │ ├── gemv_cuda.cu │ │ │ │ │ │ └── gemv_cuda.h │ │ │ │ └── vllm │ │ │ │ │ ├── activation.cu │ │ │ │ │ ├── activation.h │ │ │ │ │ ├── moe_alig_block.cu │ │ │ │ │ ├── moe_alig_block.h │ │ │ │ │ ├── 
topk_softmax_kernels.cu │ │ │ │ │ └── topk_softmax_kernels.h │ │ │ ├── scripts │ │ │ │ └── download_wheels.sh │ │ │ └── setup.py │ │ ├── LICENSE │ │ ├── README.md │ │ ├── __init__.py │ │ ├── awq │ │ │ ├── __init__.py │ │ │ ├── evaluation │ │ │ │ ├── __init__.py │ │ │ │ ├── eval_utils.py │ │ │ │ ├── humaneval_utils.py │ │ │ │ └── kl_divergence.py │ │ │ ├── models │ │ │ │ ├── __init__.py │ │ │ │ ├── _config.py │ │ │ │ ├── aquila.py │ │ │ │ ├── auto.py │ │ │ │ ├── baichuan.py │ │ │ │ ├── base.py │ │ │ │ ├── bloom.py │ │ │ │ ├── deepseek.py │ │ │ │ ├── deepseek_moe │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── configuration_deepseek.py │ │ │ │ │ └── modeling_deepseek.py │ │ │ │ ├── falcon.py │ │ │ │ ├── gemma.py │ │ │ │ ├── gpt_bigcode.py │ │ │ │ ├── gpt_neox.py │ │ │ │ ├── gptj.py │ │ │ │ ├── llama.py │ │ │ │ ├── llava.py │ │ │ │ ├── mistral.py │ │ │ │ ├── mixtral.py │ │ │ │ ├── mpt.py │ │ │ │ ├── opt.py │ │ │ │ ├── qwen.py │ │ │ │ ├── qwen2.py │ │ │ │ ├── stablelm.py │ │ │ │ ├── starcoder2.py │ │ │ │ └── yi.py │ │ │ ├── modules │ │ │ │ ├── __init__.py │ │ │ │ ├── act.py │ │ │ │ ├── fused │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── attn.py │ │ │ │ │ ├── block.py │ │ │ │ │ ├── cache.py │ │ │ │ │ ├── mlp.py │ │ │ │ │ ├── model.py │ │ │ │ │ ├── moe.py │ │ │ │ │ └── norm.py │ │ │ │ └── linear │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── exllama.py │ │ │ │ │ ├── exllamav2.py │ │ │ │ │ ├── gemm.py │ │ │ │ │ ├── gemv.py │ │ │ │ │ ├── gemv_fast.py │ │ │ │ │ └── marlin.py │ │ │ ├── quantize │ │ │ │ ├── __init__.py │ │ │ │ ├── quantizer.py │ │ │ │ └── scale.py │ │ │ └── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── calib_data.py │ │ │ │ ├── fused_utils.py │ │ │ │ ├── module.py │ │ │ │ ├── packing_utils.py │ │ │ │ ├── parallel.py │ │ │ │ ├── quant_utils.py │ │ │ │ └── utils.py │ │ ├── quantize.py │ │ └── setup.py │ ├── AutoGPTQ │ │ ├── LICENSE │ │ ├── README.md │ │ ├── __init__.py │ │ ├── auto_gptq │ │ │ ├── __init__.py │ │ │ ├── eval_tasks │ │ │ │ ├── __init__.py │ │ │ │ ├── _base.py │ │ │ │ ├── _utils │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── classification_utils.py │ │ │ │ │ └── generation_utils.py │ │ │ │ ├── language_modeling_task.py │ │ │ │ ├── sequence_classification_task.py │ │ │ │ └── text_summarization_task.py │ │ │ ├── modeling │ │ │ │ ├── __init__.py │ │ │ │ ├── _base.py │ │ │ │ ├── _const.py │ │ │ │ ├── _utils.py │ │ │ │ ├── auto.py │ │ │ │ ├── baichuan.py │ │ │ │ ├── bloom.py │ │ │ │ ├── codegen.py │ │ │ │ ├── decilm.py │ │ │ │ ├── deepseek.py │ │ │ │ ├── gemma.py │ │ │ │ ├── gpt2.py │ │ │ │ ├── gpt_bigcode.py │ │ │ │ ├── gpt_neox.py │ │ │ │ ├── gptj.py │ │ │ │ ├── internlm.py │ │ │ │ ├── llama.py │ │ │ │ ├── longllama.py │ │ │ │ ├── mistral.py │ │ │ │ ├── mixtral.py │ │ │ │ ├── moss.py │ │ │ │ ├── opt.py │ │ │ │ ├── qwen.py │ │ │ │ ├── qwen2.py │ │ │ │ ├── rw.py │ │ │ │ ├── stablelmepoch.py │ │ │ │ ├── xverse.py │ │ │ │ └── yi.py │ │ │ ├── nn_modules │ │ │ │ ├── __init__.py │ │ │ │ ├── _fused_base.py │ │ │ │ ├── fused_gptj_attn.py │ │ │ │ ├── fused_llama_attn.py │ │ │ │ ├── fused_llama_mlp.py │ │ │ │ ├── qlinear │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── qlinear_cuda.py │ │ │ │ │ ├── qlinear_cuda_old.py │ │ │ │ │ ├── qlinear_exllama.py │ │ │ │ │ ├── qlinear_exllamav2.py │ │ │ │ │ ├── qlinear_marlin.py │ │ │ │ │ ├── qlinear_qigen.py │ │ │ │ │ └── qlinear_triton.py │ │ │ │ └── triton_utils │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── custom_autotune.py │ │ │ │ │ ├── kernels.py │ │ │ │ │ └── mixin.py │ │ │ ├── quantization │ │ │ │ ├── __init__.py │ │ │ │ ├── gptq.py │ │ │ │ └── quantizer.py │ │ │ └── utils │ │ │ │ ├── 
__init__.py │ │ │ │ ├── data_utils.py │ │ │ │ ├── exllama_utils.py │ │ │ │ ├── import_utils.py │ │ │ │ ├── marlin_utils.py │ │ │ │ ├── modeling_utils.py │ │ │ │ ├── peft_utils.py │ │ │ │ └── perplexity_utils.py │ │ ├── datautils.py │ │ ├── gptq.py │ │ ├── modelutils.py │ │ ├── quant.py │ │ ├── quantize.py │ │ ├── setup_cuda.py │ │ └── test_kernel.py │ └── __init__.py ├── tuner.py └── utils.py ├── data ├── __init__.py ├── aligner.py ├── c4_demo.json ├── c4_train.json ├── c4_val.json ├── dataset_info.json ├── formatter.py ├── loader.py ├── parser.py ├── preprocess.py ├── template.py ├── test_data.py └── utils.py ├── extras ├── __init__.py ├── callbacks.py ├── constants.py ├── logging.py ├── misc.py ├── packages.py ├── patches │ ├── __init__.py │ ├── llama_patch.py │ └── mixtral_patch.py └── ploting.py ├── hparams ├── __init__.py ├── data_args.py ├── evaluation_args.py ├── finetuning_args.py ├── generating_args.py ├── model_args.py ├── parser.py └── pruning_args.py └── model ├── __init__.py ├── adapter.py ├── loader.py ├── patcher.py └── utils.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/.DS_Store -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Supported Versions 4 | 5 | Use this section to tell people about which versions of your project are 6 | currently being supported with security updates. 7 | 8 | | Version | Supported | 9 | | ------- | ------------------ | 10 | | 5.1.x | :white_check_mark: | 11 | | 5.0.x | :x: | 12 | | 4.0.x | :white_check_mark: | 13 | | < 4.0 | :x: | 14 | 15 | ## Reporting a Vulnerability 16 | 17 | Use this section to tell people how to report a vulnerability. 18 | 19 | Tell them where to go, how often they can expect to get an update on a 20 | reported vulnerability, what to expect if the vulnerability is accepted or 21 | declined, etc. 
22 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.13.1 2 | transformers>=4.38.1 3 | datasets>=2.14.3 4 | accelerate>=0.21.0 5 | peft>=0.8.2 6 | trl>=0.7.6 7 | gradio>=3.38.0,<4.0.0 8 | scipy 9 | einops 10 | sentencepiece 11 | protobuf 12 | jieba 13 | rouge-chinese 14 | nltk 15 | uvicorn 16 | pydantic 17 | fastapi 18 | sse-starlette 19 | matplotlib 20 | -------------------------------------------------------------------------------- /scripts/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/scripts/.DS_Store -------------------------------------------------------------------------------- /scripts/benchmark/benchmark_lm_eval.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | port="21804" 4 | GPUs="0,1,2,3,4,5,6,7" 5 | 6 | # Taking mistralai/Mistral-7B-v0.1 as an example. 7 | model_names=("mistral") # The model to be compressed. 8 | drop_modules=("mlp" "attn" "block") # The modules to be dropped. 9 | drop_nums=("4" "8") # The number of dropped modules. 10 | 11 | tasks=("boolq" "rte" "openbookqa" "piqa" "mmlu" "winogrande" "gsm8k" "hellaswag" "arc_challenge") 12 | num_fewshots=("0" "0" "0" "0" "5" "5" "5" "10" "25") 13 | 14 | for model_name in "${model_names[@]}" 15 | do 16 | # Download the model to a local directory. 17 | git lfs install 18 | git clone https://huggingface.co/mistralai/Mistral-7B-v0.1 19 | mv Mistral-7B-v0.1 ./"$model_name"_model 20 | 21 | for drop_module in "${drop_modules[@]}" 22 | do 23 | for drop_num in "${drop_nums[@]}" 24 | do 25 | cfg_path=./"$model_name"_drop"$drop_num"_"$drop_module"/config.json # PATH to the corresponding config.json file. 26 | cp -f "$cfg_path" ./"$model_name"_model/config.json # Replace the original config.json file. 27 | cp ./"$model_name"_drop"$drop_num"_"$drop_module"/*.py ./"$model_name"_model/ # Build the configuration and modeling files for remote code. 
28 | echo "Eval the config of:" 29 | echo $cfg_path 30 | 31 | num_tasks=${#tasks[@]} 32 | for ((i=0; i<$num_tasks; i++)); do 33 | CUDA_VISIBLE_DEVICES=$GPUs accelerate launch --main_process_port $port -m lm_eval \ 34 | --model hf \ 35 | --model_args pretrained=./${model_name}_model,trust_remote_code=True,dtype="bfloat16" \ 36 | --tasks ${tasks[$i]} \ 37 | --num_fewshot ${num_fewshots[$i]} \ 38 | --batch_size 1 \ 39 | --output_path ./${num_fewshots[$i]}shot_${tasks[$i]}_"$model_name"_drop"$drop_num"_"$drop_module".json >> output_"$model_name"_drop"$drop_num"_"$drop_module".out 40 | done 41 | done 42 | done 43 | done -------------------------------------------------------------------------------- /scripts/benchmark/benchmark_speed.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | model_path="########PATH_TO_HUGGING_FACE_CHECKPOINT########" 4 | save_file="########PATH_TO_SAVE_THE_RESULTS########/speed.csv" 5 | model_type="normal" # normal or quantized 6 | 7 | python src/benchmark_speed.py \ 8 | --model_path $model_path \ 9 | --model_type ${model_type} \ 10 | --save_file ${save_file} \ 11 | --pretrained -------------------------------------------------------------------------------- /scripts/dropping/block_drop.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | port="21304" 3 | GPUs="0,1,2,3" 4 | 5 | dataset="c4_val" 6 | prune_data_type="pt" 7 | n_calibration_samples=256 8 | seq_len=2048 9 | 10 | prune_method="block_drop" 11 | block_drop_method="discrete" 12 | drop_n=8 13 | 14 | model_name=mistral-base 15 | model_name_or_path=mistralai/Mistral-7B-v0.1 16 | 17 | folder_name="${model_name}-${prune_method}-${block_drop_method}-drop${drop_n}" 18 | similarity_cache_file="../results_prune/cache/${model_name}-${prune_method}-${dataset}-${n_calibration_samples}samples.pt" 19 | 20 | echo ${folder_name} 21 | 22 | output_dir=../results_prune/${folder_name} 23 | prune_model_save_path=${output_dir}/checkpoint 24 | 25 | CUDA_VISIBLE_DEVICES=$GPUs accelerate launch --main_process_port $port \ 26 | src/compress.py \ 27 | --stage prune \ 28 | --model_name_or_path ${model_name_or_path} \ 29 | --dataset ${dataset} \ 30 | --dataset_dir ./src/llmtuner/data \ 31 | --split "train" \ 32 | --prune_data_type ${prune_data_type} \ 33 | --cutoff_len ${seq_len} \ 34 | --output_dir ${output_dir} \ 35 | --logging_steps 10 \ 36 | --bf16 \ 37 | --n_calibration_samples ${n_calibration_samples} \ 38 | --prune_method ${prune_method} \ 39 | --block_drop_method ${block_drop_method} \ 40 | --drop_n ${drop_n} \ 41 | --similarity_cache_file ${similarity_cache_file} \ 42 | --prune_model_save_path ${prune_model_save_path} 43 | 44 | block_drop_method="post_dropping" 45 | # set only_update_config to True to save the disk memory 46 | only_update_config=False 47 | 48 | python \ 49 | src/compress.py \ 50 | --stage prune \ 51 | --model_name_or_path ${model_name_or_path} \ 52 | --dataset ${dataset} \ 53 | --dataset_dir ./src/llmtuner/data \ 54 | --split "train" \ 55 | --only_update_config $only_update_config \ 56 | --prune_data_type ${prune_data_type} \ 57 | --cutoff_len ${seq_len} \ 58 | --output_dir ${output_dir} \ 59 | --logging_steps 10 \ 60 | --bf16 \ 61 | --n_calibration_samples ${n_calibration_samples} \ 62 | --prune_method ${prune_method} \ 63 | --block_drop_method ${block_drop_method} \ 64 | --drop_n ${drop_n} \ 65 | --similarity_cache_file ${similarity_cache_file} \ 66 | --prune_model_save_path 
${prune_model_save_path} -------------------------------------------------------------------------------- /scripts/dropping/layer_drop.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | port="21304" 4 | GPUs="0,1,2,3" 5 | 6 | dataset="c4_val" 7 | prune_data_type="pt" 8 | n_calibration_samples=256 9 | seq_len=2048 10 | 11 | prune_method="layer_drop" 12 | layer_drop_method="discrete" 13 | target_layer="attn" 14 | drop_n=8 15 | 16 | model_name=mistral-base 17 | model_name_or_path=mistralai/Mistral-7B-v0.1 18 | 19 | folder_name="${model_name}-${prune_method}_${target_layer}-${layer_drop_method}-drop${drop_n}" 20 | similarity_cache_file="../results_prune/cache/${model_name}-${prune_method}_${target_layer}-${dataset}-${n_calibration_samples}samples.pt" 21 | 22 | echo ${folder_name} 23 | 24 | output_dir=../results_prune/${folder_name} 25 | prune_model_save_path=${output_dir}/checkpoint 26 | 27 | CUDA_VISIBLE_DEVICES=$GPUs accelerate launch --main_process_port $port \ 28 | src/compress.py \ 29 | --stage prune \ 30 | --model_name_or_path ${model_name_or_path} \ 31 | --dataset ${dataset} \ 32 | --dataset_dir ./src/llmtuner/data \ 33 | --split "train" \ 34 | --layer_drop_norm True \ 35 | --target_layer ${target_layer} \ 36 | --only_update_config True \ 37 | --prune_data_type ${prune_data_type} \ 38 | --cutoff_len ${seq_len} \ 39 | --output_dir ${output_dir} \ 40 | --logging_steps 10 \ 41 | --bf16 \ 42 | --n_calibration_samples ${n_calibration_samples} \ 43 | --prune_method ${prune_method} \ 44 | --layer_drop_method ${layer_drop_method} \ 45 | --drop_n ${drop_n} \ 46 | --similarity_cache_file ${similarity_cache_file} \ 47 | --prune_model_save_path ${prune_model_save_path} 48 | 49 | 50 | layer_drop_method="post_dropping" 51 | # set only_update_config to True to save the disk memory 52 | only_update_config=False 53 | 54 | python src/compress.py \ 55 | --stage prune \ 56 | --model_name_or_path ${model_name_or_path} \ 57 | --dataset ${dataset} \ 58 | --dataset_dir ./src/llmtuner/data \ 59 | --split "train" \ 60 | --only_update_config $only_update_config \ 61 | --layer_drop_norm True \ 62 | --target_layer ${target_layer} \ 63 | --prune_data_type ${prune_data_type} \ 64 | --cutoff_len ${seq_len} \ 65 | --output_dir ${output_dir} \ 66 | --logging_steps 10 \ 67 | --bf16 \ 68 | --n_calibration_samples ${n_calibration_samples} \ 69 | --prune_method ${prune_method} \ 70 | --layer_drop_method ${layer_drop_method} \ 71 | --drop_n ${drop_n} \ 72 | --similarity_cache_file ${similarity_cache_file} \ 73 | --prune_model_save_path ${prune_model_save_path} -------------------------------------------------------------------------------- /scripts/dropping/layer_drop_iterative.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | port="21304" 3 | GPUs="0,1,2,3" 4 | 5 | dataset="c4_val" 6 | prune_data_type="pt" 7 | n_calibration_samples=128 8 | seq_len=2048 9 | 10 | prune_method="layer_drop" 11 | layer_drop_method="discrete" 12 | target_layer="all" 13 | 14 | drop_n=1 15 | num_epochs=8 16 | 17 | model_name=mistral-base 18 | model_name_or_path=mistralai/Mistral-7B-v0.1 19 | 20 | for ((epoch=1; epoch<=num_epochs; epoch++)) do 21 | layer_drop_method="discrete" 22 | folder_name="Iterative-epoch${epoch}-${model_name}-${prune_method}-${target_layer}-${layer_drop_method}-drop${drop_n}PerEpoch" 23 | 
similarity_cache_file="../results_prune/cache/Iterative-epoch${epoch}-${model_name}-drop_${target_layer}-${dataset}-${n_calibration_samples}samples.pt" 24 | echo ${folder_name} 25 | echo ${model_name_or_path} 26 | output_dir=./results_prune/Iterative/${folder_name} 27 | prune_model_save_path=${output_dir}/checkpoint 28 | 29 | CUDA_VISIBLE_DEVICES=$GPUs accelerate launch --main_process_port $port \ 30 | src/compress.py \ 31 | --stage prune \ 32 | --model_name_or_path ${model_name_or_path} \ 33 | --dataset ${dataset} \ 34 | --dataset_dir ./src/llmtuner/data \ 35 | --split "train" \ 36 | --prune_data_type ${prune_data_type} \ 37 | --cutoff_len ${seq_len} \ 38 | --layer_drop_norm True \ 39 | --target_layer ${target_layer} \ 40 | --output_dir ${output_dir} \ 41 | --logging_steps 10 \ 42 | --bf16 \ 43 | --n_calibration_samples ${n_calibration_samples} \ 44 | --prune_method ${prune_method} \ 45 | --layer_drop_method ${layer_drop_method} \ 46 | --drop_n ${drop_n} \ 47 | --similarity_cache_file ${similarity_cache_file} \ 48 | --prune_model_save_path ${prune_model_save_path} 49 | 50 | # Save the converted the model without DeepSpeed 51 | layer_drop_method="post_dropping" 52 | # set only_update_config to True to save the disk memory 53 | only_update_config=False 54 | 55 | python src/compress.py \ 56 | --stage prune \ 57 | --model_name_or_path ${model_name_or_path} \ 58 | --dataset ${dataset} \ 59 | --dataset_dir ./src/llmtuner/data \ 60 | --split "train" \ 61 | --only_update_config $only_update_config \ 62 | --layer_drop_norm True \ 63 | --target_layer ${target_layer} \ 64 | --prune_data_type ${prune_data_type} \ 65 | --cutoff_len ${seq_len} \ 66 | --output_dir ${output_dir} \ 67 | --logging_steps 10 \ 68 | --bf16 \ 69 | --n_calibration_samples ${n_calibration_samples} \ 70 | --prune_method ${prune_method} \ 71 | --layer_drop_method ${layer_drop_method} \ 72 | --drop_n ${drop_n} \ 73 | --similarity_cache_file ${similarity_cache_file} \ 74 | --prune_model_save_path ${prune_model_save_path} 75 | model_name_or_path=$prune_model_save_path 76 | done -------------------------------------------------------------------------------- /scripts/dropping/layer_drop_joint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | port="21304" 3 | GPUs="0,1,2,3" 4 | 5 | dataset="c4_val" 6 | prune_data_type="pt" 7 | n_calibration_samples=256 8 | seq_len=2048 9 | 10 | prune_method="layer_drop" 11 | layer_drop_method="discrete" 12 | target_layer="all" 13 | drop_n=64 14 | 15 | model_name=llama2-13b-base 16 | model_name_or_path=mistralai/Mistral-7B-v0.1 17 | 18 | folder_name="${model_name}-${prune_method}_${target_layer}-${layer_drop_method}-drop${drop_n}" 19 | similarity_cache_file="../results_prune/cache/${model_name}-${prune_method}_${target_layer}-${dataset}-${n_calibration_samples}samples.pt" 20 | 21 | echo ${folder_name} 22 | 23 | output_dir=./results_prune/${folder_name} 24 | prune_model_save_path=${output_dir}/checkpoint 25 | 26 | CUDA_VISIBLE_DEVICES=$GPUs accelerate launch --main_process_port $port \ 27 | src/compress.py \ 28 | --stage prune \ 29 | --model_name_or_path ${model_name_or_path} \ 30 | --dataset ${dataset} \ 31 | --dataset_dir ./src/llmtuner/data \ 32 | --split "train" \ 33 | --layer_drop_norm True \ 34 | --target_layer ${target_layer} \ 35 | --only_update_config True \ 36 | --prune_data_type ${prune_data_type} \ 37 | --cutoff_len ${seq_len} \ 38 | --output_dir ${output_dir} \ 39 | --logging_steps 10 \ 40 | --bf16 \ 41 | --n_calibration_samples 
${n_calibration_samples} \ 42 | --prune_method ${prune_method} \ 43 | --layer_drop_method ${layer_drop_method} \ 44 | --drop_n ${drop_n} \ 45 | --similarity_cache_file ${similarity_cache_file} \ 46 | --prune_model_save_path ${prune_model_save_path} 47 | 48 | 49 | layer_drop_method="post_dropping" 50 | # set only_update_config to True to save the disk memory 51 | only_update_config=False 52 | 53 | python \ 54 | src/compress.py \ 55 | --stage prune \ 56 | --model_name_or_path ${model_name_or_path} \ 57 | --dataset ${dataset} \ 58 | --dataset_dir ./src/llmtuner/data \ 59 | --split "train" \ 60 | --only_update_config $only_update_config \ 61 | --layer_drop_norm True \ 62 | --target_layer ${target_layer} \ 63 | --prune_data_type ${prune_data_type} \ 64 | --cutoff_len ${seq_len} \ 65 | --output_dir ${output_dir} \ 66 | --logging_steps 10 \ 67 | --bf16 \ 68 | --n_calibration_samples ${n_calibration_samples} \ 69 | --prune_method ${prune_method} \ 70 | --layer_drop_method ${layer_drop_method} \ 71 | --drop_n ${drop_n} \ 72 | --similarity_cache_file ${similarity_cache_file} \ 73 | --prune_model_save_path ${prune_model_save_path} -------------------------------------------------------------------------------- /scripts/quantization/awq.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | model_path="########PATH_TO_HUGGING_FACE_CHECKPOINT#########" 4 | quant_path="########PATH_TO_SAVE_THE_QUANTIZED_MODEL########" 5 | bits=4 6 | 7 | python AutoAWQ/quantize.py \ 8 | $model_path \ 9 | $quant_path \ 10 | $bits -------------------------------------------------------------------------------- /scripts/quantization/gptq.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | model_path="########PATH_TO_HUGGING_FACE_CHECKPOINT#########" 4 | quant_path="########PATH_TO_SAVE_THE_QUANTIZED_MODEL########" 5 | 6 | bits=4 7 | seed=0 8 | num_samples=16 9 | calibration_template=default 10 | 11 | python AutoGPTQ/quantize.py \ 12 | --pretrained_model_dir $model_path \ 13 | --quantized_model_dir $quant_path \ 14 | --bits $bits \ 15 | --save_and_reload \ 16 | --desc_act \ 17 | --seed $seed \ 18 | --num_samples $num_samples \ 19 | --calibration-template $calibration_template \ 20 | --trust_remote_code \ 21 | --use_triton -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | from setuptools import setup, find_packages 4 | 5 | 6 | def get_version(): 7 | with open(os.path.join("src", "llmtuner", "__init__.py"), "r", encoding="utf-8") as f: 8 | file_content = f.read() 9 | pattern = r"{0}\W*=\W*\"([^\"]+)\"".format("__version__") 10 | version, = re.findall(pattern, file_content) 11 | return version 12 | 13 | 14 | def get_requires(): 15 | with open("requirements.txt", "r", encoding="utf-8") as f: 16 | file_content = f.read() 17 | lines = [line.strip() for line in file_content.strip().split("\n") if not line.startswith("#")] 18 | return lines 19 | 20 | 21 | def main(): 22 | 23 | setup( 24 | name="llmtuner", 25 | version=get_version(), 26 | author="hiyouga", 27 | author_email="hiyouga" "@" "buaa.edu.cn", 28 | description="Easy-to-use LLM fine-tuning framework", 29 | long_description=open("README.md", "r", encoding="utf-8").read(), 30 | long_description_content_type="text/markdown", 31 | keywords=["LLaMA", "BLOOM", "Falcon", "LLM", "ChatGPT", "transformer", "pytorch", 
"deep learning"], 32 | license="Apache 2.0 License", 33 | url="https://github.com/hiyouga/LLaMA-Factory", 34 | package_dir={"": "src"}, 35 | packages=find_packages("src"), 36 | python_requires=">=3.8.0", 37 | install_requires=get_requires(), 38 | classifiers=[ 39 | "Development Status :: 3 - Alpha", 40 | "Intended Audience :: Developers", 41 | "Intended Audience :: Education", 42 | "Intended Audience :: Science/Research", 43 | "License :: OSI Approved :: Apache Software License", 44 | "Operating System :: OS Independent", 45 | "Programming Language :: Python :: 3", 46 | "Programming Language :: Python :: 3.8", 47 | "Programming Language :: Python :: 3.9", 48 | "Programming Language :: Python :: 3.10", 49 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 50 | ] 51 | ) 52 | 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /src/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/.DS_Store -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/__init__.py -------------------------------------------------------------------------------- /src/compress.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | sys.path = [os.getcwd()] + sys.path 5 | 6 | from llmtuner import run_exp 7 | 8 | 9 | def main(): 10 | run_exp() 11 | 12 | 13 | def _mp_fn(index): 14 | # For xla_spawn (TPUs) 15 | main() 16 | 17 | 18 | if __name__ == "__main__": 19 | main() 20 | -------------------------------------------------------------------------------- /src/llmtuner/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/llmtuner/.DS_Store -------------------------------------------------------------------------------- /src/llmtuner/__init__.py: -------------------------------------------------------------------------------- 1 | # Level: api, webui > chat, eval, compression > data, model > extras, hparams 2 | 3 | from .compression import export_model, run_exp 4 | 5 | __version__ = "0.5.2" 6 | __all__ = ["export_model", "run_exp",] 7 | -------------------------------------------------------------------------------- /src/llmtuner/compression/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/llmtuner/compression/.DS_Store -------------------------------------------------------------------------------- /src/llmtuner/compression/__init__.py: -------------------------------------------------------------------------------- 1 | from .tuner import export_model, run_exp 2 | 3 | 4 | __all__ = ["export_model", "run_exp"] 5 | -------------------------------------------------------------------------------- /src/llmtuner/compression/prune/__init__.py: -------------------------------------------------------------------------------- 1 | from .workflow import run_prune 2 | 3 | 4 | __all__ = ["run_prune"] 5 | 
-------------------------------------------------------------------------------- /src/llmtuner/compression/prune/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/llmtuner/compression/prune/models/__init__.py -------------------------------------------------------------------------------- /src/llmtuner/compression/prune/wrapper.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger = logging.getLogger(__name__) 4 | 5 | 6 | """For recording weights""" 7 | class HiddenStatesRecordWrapper: 8 | def __init__(self, layer, layer_name="none", record_input=True, record_output=True): 9 | self.layer = layer 10 | self.layer_name = layer_name 11 | 12 | self.record_input = record_input 13 | self.record_output = record_output 14 | 15 | if record_input: 16 | self.input_hidden_states = [] 17 | if record_output: 18 | self.output_hidden_states = [] 19 | 20 | def record(self, input, output): 21 | # input: (1, seq_len, hidden_size) 22 | 23 | if self.record_input: 24 | self.input_hidden_states.append(input.squeeze(0).clone().cpu()) 25 | if self.record_output: 26 | self.output_hidden_states.append(output.squeeze(0).clone().cpu()) 27 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Casper 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/README.md: -------------------------------------------------------------------------------- 1 | # AutoAWQ Kernels 2 | 3 | AutoAWQ Kernels is a new package that is split up from the [main repository](https://github.com/casper-hansen/AutoAWQ) in order to avoid compilation times. 4 | 5 | ## Requirements 6 | 7 | - Windows: Must use WSL2. 8 | 9 | - NVIDIA: 10 | - GPU: Must be compute capability 7.5 or higher. 11 | - CUDA Toolkit: Must be 11.8 or higher. 12 | - AMD: 13 | - ROCm: Must be 5.6 or higher. 
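
The NVIDIA requirements listed above (compute capability 7.5+, CUDA Toolkit 11.8+) can be verified before attempting an install. A small sanity-check sketch, assuming an existing PyTorch CUDA build is available in the environment (this snippet is not part of the AutoAWQ_kernels README itself):

```python
# Quick check of the GPU compute capability and CUDA toolkit version
# reported by the local PyTorch build.
import torch

if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability(0)
    print(f"compute capability: {major}.{minor} (need >= 7.5)")
    print(f"CUDA version seen by torch: {torch.version.cuda} (need >= 11.8)")
else:
    print("No CUDA device visible to PyTorch.")
```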
14 | 15 | ## Install 16 | 17 | ### Install from PyPi 18 | 19 | The package is available on PyPi with CUDA 12.1.1 wheels: 20 | 21 | ``` 22 | pip install autoawq-kernels 23 | ``` 24 | 25 | ### Install release wheels 26 | 27 | For ROCm and other CUDA versions, you can use the wheels published at each [release](https://github.com/casper-hansen/AutoAWQ_kernels/releases/): 28 | 29 | ``` 30 | pip install https://github.com/casper-hansen/AutoAWQ_kernels/releases/download/v0.0.2/autoawq_kernels-0.0.2+rocm561-cp310-cp310-linux_x86_64.whl 31 | ``` 32 | 33 | ### Build from source 34 | You can also build from source: 35 | 36 | ``` 37 | git clone https://github.com/casper-hansen/AutoAWQ_kernels 38 | cd AutoAWQ_kernels 39 | pip install -e . 40 | ``` 41 | 42 | To build for ROCm, you need to first install the following packages `rocsparse-dev hipsparse-dev rocthrust-dev rocblas-dev hipblas-dev`. -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/__init__.py -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/attention/cuda_bf16_wrapper.h: -------------------------------------------------------------------------------- 1 | // Downloaded from from FasterTransformer v5.2.1 2 | // https://github.com/NVIDIA/FasterTransformer/blob/release/v5.2.1_tag/src/fastertransformer/utils/cuda_bf16_wrapper.h 3 | /* 4 | * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | 19 | #pragma once 20 | 21 | #ifdef ENABLE_BF16 22 | #include 23 | #endif 24 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/attention/ft_attention.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | 5 | torch::Tensor single_query_attention(const torch::Tensor q, 6 | const torch::Tensor k, 7 | const torch::Tensor v, 8 | torch::Tensor k_cache, 9 | torch::Tensor v_cache, 10 | c10::optional length_per_sample_, 11 | c10::optional alibi_slopes_, 12 | const int timestep, 13 | const int rotary_embedding_dim = 0, 14 | const float rotary_base = 10000.0f, 15 | const bool neox_rotary_style=true); -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllama/cu_compat.cuh: -------------------------------------------------------------------------------- 1 | // Adapted from turboderp exllama: https://github.com/turboderp/exllama 2 | 3 | #ifndef _cuda_compat_cuh 4 | #define _cuda_compat_cuh 5 | 6 | // atomicAdd for half types, to support CC < 7.x 7 | 8 | __device__ __forceinline__ void atomicAdd_half(half* address, half val) 9 | { 10 | unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2)); 11 | unsigned int old = *address_as_ui; 12 | unsigned int assumed; 13 | 14 | do 15 | { 16 | assumed = old; 17 | __half_raw hsum; 18 | hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); 19 | half tmpres = __hadd(hsum, val); 20 | hsum = __half_raw(tmpres); 21 | old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x; 22 | old = atomicCAS(address_as_ui, assumed, old); 23 | } 24 | while (assumed != old); 25 | } 26 | 27 | // atomicAdd for half2 types 28 | 29 | __device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val) 30 | { 31 | unsigned int* address_as_ui = (unsigned int*)address; 32 | unsigned int old = *address_as_ui; 33 | unsigned int assumed; 34 | do 35 | { 36 | assumed = old; 37 | half2 old_val = *((half2*)&old); 38 | half2 new_val = __hadd2(old_val, val); 39 | old = atomicCAS(address_as_ui, assumed, *((unsigned int*)&new_val)); 40 | } 41 | while (assumed != old); 42 | } 43 | 44 | // 45 | 46 | #if defined(__CUDA_ARCH__) || defined(USE_ROCM) 47 | #if __CUDA_ARCH__ < 700 || defined(USE_ROCM) 48 | 49 | __device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); } 50 | 51 | #if __CUDA_ARCH__ < 600 || defined(USE_ROCM) 52 | __device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); } 53 | #endif 54 | 55 | #endif 56 | #endif 57 | 58 | #endif 59 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllama/cuda_buffers.cu: -------------------------------------------------------------------------------- 1 | // Adapted from turboderp exllama: https://github.com/turboderp/exllama 2 | 3 | #define _cuda_buffers_cu 4 | #include "cuda_buffers.cuh" 5 | 6 | CudaBuffers* g_buffers[CUDA_MAX_DEVICES] = {NULL}; 7 | // __constant__ half2 q4_table[16][256]; 8 | // half2 q4_table_host[16][256]; 9 | // bool q4_table_init = false; 10 | 11 | CudaBuffers::CudaBuffers 12 | ( 13 | int _device, 14 | int _temp_state_size, 15 | half* _temp_state, 16 | half* _temp_dq 17 | ) : 18 | 
device(_device), 19 | temp_state_size(_temp_state_size), 20 | temp_state(_temp_state), 21 | temp_dq(_temp_dq) 22 | { 23 | cudaSetDevice(_device); 24 | 25 | cudaStreamCreate(&alt_stream_1); 26 | cudaStreamCreate(&alt_stream_2); 27 | cudaStreamCreate(&alt_stream_3); 28 | cudaEventCreate(&alt_stream_1_done); 29 | cudaEventCreate(&alt_stream_2_done); 30 | cudaEventCreate(&alt_stream_3_done); 31 | } 32 | 33 | CudaBuffers::~CudaBuffers() 34 | { 35 | cudaStreamDestroy(alt_stream_1); 36 | cudaStreamDestroy(alt_stream_2); 37 | cudaStreamDestroy(alt_stream_3); 38 | cudaEventDestroy(alt_stream_1_done); 39 | cudaEventDestroy(alt_stream_2_done); 40 | cudaEventDestroy(alt_stream_3_done); 41 | } 42 | 43 | CudaBuffers* get_buffers(const int device_index) 44 | { 45 | return g_buffers[device_index]; 46 | } 47 | 48 | void prepare_buffers_cuda 49 | ( 50 | int _device, 51 | int _temp_state_size, 52 | half* _temp_state, 53 | half* _temp_dq 54 | ) 55 | { 56 | CudaBuffers* buffers = new CudaBuffers 57 | ( 58 | _device, 59 | _temp_state_size, 60 | _temp_state, 61 | _temp_dq 62 | ); 63 | 64 | g_buffers[_device] = buffers; 65 | } 66 | 67 | void cleanup_buffers_cuda() 68 | { 69 | for (int i = 0; i < CUDA_MAX_DEVICES; i++) 70 | { 71 | if (!g_buffers[i]) continue; 72 | delete g_buffers[i]; 73 | g_buffers[i] = NULL; 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllama/cuda_buffers.cuh: -------------------------------------------------------------------------------- 1 | // Adapted from turboderp exllama: https://github.com/turboderp/exllama 2 | 3 | #ifndef _cuda_buffers_cuh 4 | #define _cuda_buffers_cuh 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | const int CUDA_MAX_DEVICES = 16; 12 | 13 | // #ifndef _cuda_buffers_cu 14 | // extern __constant__ half2 q4_table[16][256]; 15 | // #endif 16 | 17 | class CudaBuffers 18 | { 19 | public: 20 | int device; 21 | 22 | half* temp_state; // [max_hidden_rows * intermediate_size] 23 | int temp_state_size; 24 | half* temp_dq; // size of largest quant tensor * 8 25 | 26 | cudaStream_t alt_stream_1; 27 | cudaStream_t alt_stream_2; 28 | cudaStream_t alt_stream_3; 29 | cudaEvent_t alt_stream_1_done; 30 | cudaEvent_t alt_stream_2_done; 31 | cudaEvent_t alt_stream_3_done; 32 | 33 | CudaBuffers 34 | ( 35 | int _device, 36 | int _temp_state_size, 37 | half* _temp_state, 38 | half* _temp_dq 39 | ); 40 | ~CudaBuffers(); 41 | }; 42 | 43 | CudaBuffers* get_buffers(const int device_index); 44 | 45 | void prepare_buffers_cuda 46 | ( 47 | int _device, 48 | int _temp_state_size, 49 | half* _temp_state, 50 | half* _temp_dq 51 | ); 52 | 53 | void cleanup_buffers_cuda(); 54 | 55 | #endif 56 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllama/cuda_func/column_remap.cu: -------------------------------------------------------------------------------- 1 | // Adapted from turboderp exllama: https://github.com/turboderp/exllama 2 | 3 | #include "column_remap.cuh" 4 | #include "../util.cuh" 5 | 6 | const int SHUF_BLOCKSIZE_X = 256; 7 | const int SHUF_BLOCKSIZE_Y = 16; 8 | 9 | __global__ void column_remap_kernel 10 | ( 11 | const half* __restrict__ x, 12 | half* __restrict__ x_new, 13 | const int x_width, 14 | const int x_height, 15 | const uint32_t* x_map 16 | ) 17 | { 18 | int x_column = SHUF_BLOCKSIZE_X * blockIdx.x + threadIdx.x; 19 | int x_row = 
SHUF_BLOCKSIZE_Y * blockIdx.y; 20 | if (x_column >= x_width) return; 21 | //if (x_row >= x_height) return; 22 | 23 | int x_stride = x_width; 24 | int x_idx = x_row * x_stride + x_column; 25 | 26 | int x_row_end = min(x_row + SHUF_BLOCKSIZE_Y, x_height); 27 | int x_idx_end = x_row_end * x_stride + x_column; 28 | 29 | int s_column = x_map[x_column]; 30 | int s_idx = x_row * x_stride + s_column; 31 | 32 | while (x_idx < x_idx_end) 33 | { 34 | x_new[x_idx] = x[s_idx]; 35 | x_idx += x_stride; 36 | s_idx += x_stride; 37 | } 38 | } 39 | 40 | // Remap columns in x to correspond to sequential group index before matmul 41 | // 42 | // perform x -> seq_x such that seq_x @ seq_w == x @ w 43 | 44 | void column_remap_cuda 45 | ( 46 | const half* x, 47 | half* x_new, 48 | const int x_height, 49 | const int x_width, 50 | const uint32_t* x_map 51 | ) 52 | { 53 | dim3 threads(SHUF_BLOCKSIZE_X, 1, 1); 54 | 55 | dim3 blocks 56 | ( 57 | (x_width + SHUF_BLOCKSIZE_X - 1) / SHUF_BLOCKSIZE_X, 58 | (x_height + SHUF_BLOCKSIZE_Y - 1) / SHUF_BLOCKSIZE_Y, 59 | 1 60 | ); 61 | 62 | column_remap_kernel<<>>(x, x_new, x_width, x_height, x_map); 63 | } 64 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllama/cuda_func/column_remap.cuh: -------------------------------------------------------------------------------- 1 | // Adapted from turboderp exllama: https://github.com/turboderp/exllama 2 | 3 | #ifndef _column_remap_cuh 4 | #define _column_remap_cuh 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | void column_remap_cuda 11 | ( 12 | const half* x, 13 | half* x_new, 14 | const int x_height, 15 | const int x_width, 16 | const uint32_t* x_map 17 | ); 18 | 19 | #endif -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllama/cuda_func/q4_matmul.cuh: -------------------------------------------------------------------------------- 1 | // Adapted from turboderp exllama: https://github.com/turboderp/exllama 2 | 3 | #ifndef _q4_matmul_cuh 4 | #define _q4_matmul_cuh 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "q4_matrix.cuh" 13 | #include "../tuning.h" 14 | 15 | // Workaround for hipify_python using rocblas instead of hipblas. 
16 | #if defined(USE_ROCM) 17 | #include 18 | #define rocblas_handle hipblasHandle_t 19 | #endif 20 | 21 | void q4_matmul_cuda 22 | ( 23 | ExLlamaTuning* tuningParams, 24 | const half* x, 25 | const int x_height, 26 | const Q4Matrix* w, 27 | half* out, 28 | bool no_zero = false, 29 | cudaStream_t alt_stream = NULL 30 | ); 31 | 32 | void q4_matmul_recons_cuda 33 | ( 34 | ExLlamaTuning* tuningParams, 35 | const half* x, 36 | const int x_height, 37 | Q4Matrix* w, 38 | half* out, 39 | const cublasHandle_t handle, 40 | bool no_zero = false 41 | ); 42 | 43 | #endif 44 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllama/cuda_func/q4_matrix.cuh: -------------------------------------------------------------------------------- 1 | // Adapted from turboderp exllama: https://github.com/turboderp/exllama 2 | 3 | #ifndef _q4_matrix_cuh 4 | #define _q4_matrix_cuh 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | class Q4Matrix 11 | { 12 | public: 13 | 14 | int device; 15 | 16 | int height; 17 | int width; 18 | int groups; 19 | int groupsize; 20 | 21 | uint32_t* cuda_qweight = NULL; 22 | uint32_t* cuda_qzeros = NULL; 23 | half* cuda_scales = NULL; 24 | uint32_t* cuda_x_map = NULL; 25 | 26 | Q4Matrix 27 | ( 28 | const int _height, 29 | const int _width, 30 | const int _groups, 31 | 32 | uint32_t* _qweight, 33 | uint32_t* _qzeros, 34 | half* _scales, 35 | uint32_t* _g_idx, 36 | 37 | const int _device 38 | ); 39 | 40 | ~Q4Matrix(); 41 | 42 | void reconstruct(half* out); 43 | 44 | private: 45 | 46 | void make_sequential(const uint32_t* cpu_g_idx); 47 | 48 | }; 49 | 50 | void g_q4_keep_matrix(Q4Matrix* m); 51 | void g_q4_free_matrices(); 52 | 53 | #endif -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllama/hip_compat.cuh: -------------------------------------------------------------------------------- 1 | // Adapted from turboderp exllama: https://github.com/turboderp/exllama 2 | 3 | #ifndef _hip_compat_cuh 4 | #define _hip_compat_cuh 5 | 6 | // Workaround for a bug in hipamd, backported from upstream, this is fixed in ROCm 5.6. 7 | __device__ __forceinline__ __half __compat_hrcp(__half x) { 8 | return __half_raw{ 9 | static_cast<_Float16>(__builtin_amdgcn_rcph(static_cast<__half_raw>(x).data))}; 10 | } 11 | 12 | // ROCm 6.0 compatible from: /opt/rocm-6.0.0/include/hip/amd_detail/amd_hip_fp16.h:1708 13 | __device__ __forceinline__ __half2 __compat_h2rcp(__half2 x) { 14 | return _Float16_2{_Float16_2{static_cast<_Float16>(1.0f), static_cast<_Float16>(1.0f)} / x.data}; 15 | } 16 | 17 | #define hrcp __compat_hrcp 18 | #define h2rcp __compat_h2rcp 19 | 20 | // Automatic conversion of hipblasHgemm doesn't convert half to hipblasHalf. 
21 | __host__ __forceinline__ hipblasStatus_t __compat_hipblasHgemm(hipblasHandle_t handle, 22 | hipblasOperation_t transA, 23 | hipblasOperation_t transB, 24 | int m, 25 | int n, 26 | int k, 27 | const half* alpha, 28 | const half* AP, 29 | int lda, 30 | const half* BP, 31 | int ldb, 32 | const half* beta, 33 | half* CP, 34 | int ldc) { 35 | return hipblasHgemm(handle, transA, transB, m, n, k, 36 | reinterpret_cast(alpha), 37 | reinterpret_cast(AP), lda, 38 | reinterpret_cast(BP), ldb, 39 | reinterpret_cast(beta), 40 | reinterpret_cast(CP), ldc); 41 | } 42 | #define hipblasHgemm __compat_hipblasHgemm 43 | 44 | // Previous version of PyTorch were converting to rocBLAS instead of hipBLAS. 45 | #define rocblas_handle hipblasHandle_t 46 | #define rocblas_operation_none HIPBLAS_OP_N 47 | #define rocblas_get_stream hipblasGetStream 48 | #define rocblas_set_stream hipblasSetStream 49 | #define rocblas_hgemm __compat_hipblasHgemm 50 | 51 | #endif 52 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllama/tuning.h: -------------------------------------------------------------------------------- 1 | // Adapted from turboderp exllama: https://github.com/turboderp/exllama 2 | 3 | #ifndef _tuning_h 4 | #define _tuning_h 5 | 6 | struct ExLlamaTuning 7 | { 8 | int matmul_recons_thd; 9 | bool matmul_fused_remap; 10 | bool matmul_no_half2; 11 | }; 12 | 13 | #endif 14 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllama/util.cuh: -------------------------------------------------------------------------------- 1 | // Adapted from turboderp exllama: https://github.com/turboderp/exllama 2 | 3 | #ifndef _util_cuh 4 | #define _util_cuh 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #if defined(USE_ROCM) 12 | #define cudaUnspecified hipErrorUnknown 13 | #else 14 | #define cudaUnspecified cudaErrorApiFailureBase 15 | #endif 16 | 17 | // React to failure on return code != cudaSuccess 18 | 19 | #define _cuda_check(fn) \ 20 | do { \ 21 | {_cuda_err = fn;} \ 22 | if (_cuda_err != cudaSuccess) goto _cuda_fail; \ 23 | } while(false) 24 | 25 | // React to failure on return code == 0 26 | 27 | #define _alloc_check(fn) \ 28 | do { \ 29 | if (!(fn)) { _cuda_err = cudaUnspecified; goto _cuda_fail; } \ 30 | else _cuda_err = cudaSuccess; \ 31 | } while(false) 32 | 33 | #endif 34 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllamav2/config.h: -------------------------------------------------------------------------------- 1 | #ifndef _config_h 2 | #define _config_h 3 | 4 | #define MAX_Q_GEMM_ROWS 50 5 | 6 | #define QMODE_2BIT 1 7 | #define QMODE_3BIT 1 8 | #define QMODE_4BIT 1 9 | #define QMODE_5BIT 1 10 | #define QMODE_6BIT 0 11 | #define QMODE_8BIT 0 12 | 13 | #endif 14 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllamav2/cpp/util.h: -------------------------------------------------------------------------------- 1 | #ifndef _util_h 2 | #define _util_h 3 | 4 | #define DBGS(__x) printf("%s\n", __x) 5 | #define DBGI(__x) printf("%s: %i\n", #__x, __x) 6 | #define DBGI2(__x, __y) printf("%s, %s: %i, %i\n", #__x, #__y, __x, __y) 7 | #define DBGI3(__x, __y, __z) printf("%s, %s, %s: 
%i, %i, %i\n", #__x, #__y, #__z, __x, __y, __z) 8 | #define DBGF(__x) printf("%s: %f\n", #__x, __x) 9 | #define DBGF2(__x, __y) printf("%s, %s: %f, %f\n", #__x, #__y, __x, __y) 10 | #define DBGF3(__x, __y, __z) printf("%s, %s, %s: %f, %f, %f\n", #__x, #__y, #__z, __x, __y, __z) 11 | 12 | #endif 13 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllamav2/cuda/compat.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _compat_cuh 2 | #define _compat_cuh 3 | 4 | // atomicAdd for half types, to support CC < 7.x 5 | 6 | __device__ __forceinline__ void atomicAdd_half(half* address, half val) 7 | { 8 | unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2)); 9 | unsigned int old = *address_as_ui; 10 | unsigned int assumed; 11 | 12 | do 13 | { 14 | assumed = old; 15 | __half_raw hsum; 16 | hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); 17 | half tmpres = __hadd(hsum, val); 18 | hsum = __half_raw(tmpres); 19 | old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x; 20 | old = atomicCAS(address_as_ui, assumed, old); 21 | } 22 | while (assumed != old); 23 | } 24 | 25 | // atomicAdd for half2 types 26 | 27 | __device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val) 28 | { 29 | unsigned int* address_as_ui = (unsigned int*)address; 30 | unsigned int old = *address_as_ui; 31 | unsigned int assumed; 32 | do 33 | { 34 | assumed = old; 35 | half2 old_val = *((half2*)&old); 36 | half2 new_val = __hadd2(old_val, val); 37 | old = atomicCAS(address_as_ui, assumed, *((unsigned int*)&new_val)); 38 | } 39 | while (assumed != old); 40 | } 41 | 42 | // 43 | 44 | #if defined(__CUDA_ARCH__) || defined(USE_ROCM) 45 | #if __CUDA_ARCH__ < 700 || defined(USE_ROCM) 46 | 47 | __device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); } 48 | 49 | #if __CUDA_ARCH__ < 600 || defined(USE_ROCM) 50 | __device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); } 51 | #endif 52 | 53 | #endif 54 | #endif 55 | 56 | #endif 57 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllamav2/cuda/compat_gemm.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _compat_gemm_cuh 2 | #define _compat_gemm_cuh 3 | 4 | #if defined(USE_ROCM) 5 | 6 | // For some reason this include is not present anywhere in exllama_v2 codebase, but it is required 7 | // for symbols as hipblasHalf. 8 | #include 9 | 10 | __host__ __forceinline__ hipblasStatus_t __compat_hipblasHgemm(hipblasHandle_t handle, 11 | hipblasOperation_t transA, 12 | hipblasOperation_t transB, 13 | int m, 14 | int n, 15 | int k, 16 | const half* alpha, 17 | const half* AP, 18 | int lda, 19 | const half* BP, 20 | int ldb, 21 | const half* beta, 22 | half* CP, 23 | int ldc) { 24 | return hipblasHgemm(handle, transA, transB, m, n, k, 25 | reinterpret_cast(alpha), 26 | reinterpret_cast(AP), lda, 27 | reinterpret_cast(BP), ldb, 28 | reinterpret_cast(beta), 29 | reinterpret_cast(CP), ldc); 30 | } 31 | #define hipblasHgemm __compat_hipblasHgemm 32 | 33 | // Previous version of PyTorch were converting to rocBLAS instead of hipBLAS. 
34 | #define rocblas_operation_none HIPBLAS_OP_N 35 | #define rocblas_hgemm __compat_hipblasHgemm 36 | #endif 37 | 38 | #endif 39 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllamav2/cuda/q_gemm.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _q_gemm_cuh 2 | #define _q_gemm_cuh 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "q_matrix.cuh" 11 | 12 | void gemm_half_q_half_cuda 13 | ( 14 | cublasHandle_t cublas_handle, 15 | const half* a, 16 | QMatrix* b, 17 | half* c, 18 | int size_m, 19 | int size_n, 20 | int size_k, 21 | bool clear = false, 22 | half* reconstruct = NULL, 23 | bool force_cuda = false 24 | ); 25 | 26 | void clear_tensor_cuda 27 | ( 28 | half* c, 29 | int size_m, 30 | int size_n 31 | ); 32 | 33 | #endif -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllamav2/cuda/q_matrix.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _q_matrix_cuh 2 | #define _q_matrix_cuh 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #define MAX_SUPERGROUPS 16 10 | 11 | class QMatrix 12 | { 13 | public: 14 | 15 | int device; 16 | bool is_gptq; 17 | 18 | int height; 19 | int width; 20 | int groups; 21 | int groupsize; 22 | 23 | int rows_8; 24 | int rows_6; 25 | int rows_5; 26 | int rows_4; 27 | int rows_3; 28 | int rows_2; 29 | 30 | uint32_t* cuda_q_weight = NULL; 31 | uint16_t* cuda_q_perm = NULL; 32 | uint16_t* cuda_q_invperm = NULL; 33 | uint32_t* cuda_q_scale = NULL; 34 | half* cuda_q_scale_max = NULL; 35 | uint16_t* cuda_q_groups = NULL; 36 | uint32_t* cuda_gptq_qzeros = NULL; 37 | half* cuda_gptq_scales = NULL; 38 | 39 | half* temp_dq; 40 | 41 | bool failed; 42 | 43 | QMatrix 44 | ( 45 | const int _device, 46 | const int _height, 47 | const int _width, 48 | const int _groups, 49 | 50 | uint32_t* _q_weight, 51 | uint16_t* _q_perm, 52 | uint16_t* _q_invperm, 53 | uint32_t* _q_scale, 54 | half* _q_scale_max, 55 | uint16_t* _q_groups, 56 | 57 | uint32_t* _gptq_qzeros, 58 | half* _gptq_scales, 59 | uint32_t* _gptq_g_idx, 60 | 61 | half* _temp_dq 62 | ); 63 | 64 | ~QMatrix(); 65 | 66 | void reconstruct(half* out); 67 | bool make_sequential(const uint32_t* cpu_g_idx); 68 | 69 | private: 70 | 71 | }; 72 | 73 | #endif 74 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllamav2/cuda/quant/qdq_2.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _qdq_2_cuh 2 | #define _qdq_2_cuh 3 | 4 | #include "qdq_util.cuh" 5 | #include "../../config.h" 6 | 7 | #if QMODE_2BIT == 1 8 | 9 | // Permutation: 10 | // 11 | // ffddbb99 77553311 eeccaa88 66442200 12 | 13 | __forceinline__ __device__ void shuffle_2bit_16 14 | ( 15 | uint32_t* q, 16 | int stride 17 | ) 18 | { 19 | uint32_t qa = q[0]; 20 | uint32_t qb = 0; 21 | 22 | #pragma unroll 23 | for (int i = 0; i < 8; i++) 24 | { 25 | uint32_t qa0 = qa & 0x03; 26 | uint32_t qa1 = (qa & 0x0c) >> 2; 27 | qa >>= 4; 28 | qb |= (qa1 << (i * 2 + 16)); 29 | qb |= (qa0 << (i * 2)); 30 | } 31 | q[0] = qb; 32 | } 33 | 34 | __forceinline__ __device__ void dequant_2bit_16 35 | ( 36 | const uint32_t q_0, 37 | half2 (&dq)[8], 38 | int stride 39 | ) 40 | { 41 | const uint32_t 
c0 = 0x64006400; 42 | const half y4_ = __float2half_rn(1.0f / 4.0f); 43 | const half y16_ = __float2half_rn(1.0f / 16.0f); 44 | const half y64_ = __float2half_rn(1.0f / 64.0f); 45 | const half2 y4 = __halves2half2(y4_, y4_); 46 | const half2 y16 = __halves2half2(y16_, y16_); 47 | const half2 y64 = __halves2half2(y64_, y64_); 48 | const half z1_ = __float2half_rn(-1024.0f - 2.0f); 49 | const half z4_ = __float2half_rn(-1024.0f / 4.0f - 2.0f); 50 | const half z16_ = __float2half_rn(-1024.0f / 16.0f - 2.0f); 51 | const half z64_ = __float2half_rn(-1024.0f / 64.0f - 2.0f); 52 | const half2 z1 = __halves2half2(z1_, z1_); 53 | const half2 z4 = __halves2half2(z4_, z4_); 54 | const half2 z16 = __halves2half2(z16_, z16_); 55 | const half2 z64 = __halves2half2(z64_, z64_); 56 | 57 | uint32_t qa = q_0; 58 | half2_uint32 q0((qa & 0x00030003) | c0); // half2(q[ 0], q[ 1]) + 1024 59 | half2_uint32 q1((qa & 0x000c000c) | c0); // half2(q[ 2], q[ 3]) * 4 + 1024 60 | half2_uint32 q2((qa & 0x00300030) | c0); // half2(q[ 4], q[ 5]) * 16 + 1024 61 | half2_uint32 q3((qa & 0x00c000c0) | c0); // half2(q[ 6], q[ 7]) * 64 + 1024 62 | qa >>= 8; 63 | half2_uint32 q4((qa & 0x00030003) | c0); // half2(q[ 8], q[ 8]) + 1024 64 | half2_uint32 q5((qa & 0x000c000c) | c0); // half2(q[10], q[11]) * 4 + 1024 65 | half2_uint32 q6((qa & 0x00300030) | c0); // half2(q[12], q[13]) * 16 + 1024 66 | half2_uint32 q7((qa & 0x00c000c0) | c0); // half2(q[14], q[15]) * 64 + 1024 67 | 68 | dq[0] = __hadd2(q0.as_half2, z1); 69 | dq[1] = __hfma2(q1.as_half2, y4, z4); 70 | dq[2] = __hfma2(q2.as_half2, y16, z16); 71 | dq[3] = __hfma2(q3.as_half2, y64, z64); 72 | dq[4] = __hadd2(q4.as_half2, z1); 73 | dq[5] = __hfma2(q5.as_half2, y4, z4); 74 | dq[6] = __hfma2(q6.as_half2, y16, z16); 75 | dq[7] = __hfma2(q7.as_half2, y64, z64); 76 | } 77 | 78 | #else 79 | 80 | __forceinline__ __device__ void shuffle_2bit_16 81 | ( 82 | uint32_t* q, 83 | int stride 84 | ) 85 | { 86 | } 87 | 88 | __forceinline__ __device__ void dequant_2bit_16 89 | ( 90 | const uint32_t q_0, 91 | half2 (&dq)[8], 92 | int stride 93 | ) 94 | { 95 | half dqh[16]; 96 | for (int i = 0; i < 16; i++) dqh[i] = dq_ns(exb(q_0, i * 2, 0x03), 2); 97 | 98 | for (int i = 0; i < 8; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]); 99 | } 100 | 101 | #endif 102 | 103 | #endif -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllamav2/cuda/quant/qdq_6.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _qdq_6_cuh 2 | #define _qdq_6_cuh 3 | 4 | #include "qdq_util.cuh" 5 | #include "../../config.h" 6 | 7 | #if QMODE_6BIT == 1 8 | 9 | // Not implemented 10 | 11 | #else 12 | 13 | __forceinline__ __device__ void shuffle_6bit_16 14 | ( 15 | uint32_t* q, 16 | int stride 17 | ) 18 | { 19 | } 20 | 21 | __forceinline__ __device__ void dequant_6bit_16 22 | ( 23 | const uint32_t q_0, 24 | const uint32_t q_1, 25 | const uint32_t q_2, 26 | half2 (&dq)[8], 27 | int stride 28 | ) 29 | { 30 | half dqh[16]; 31 | for (int i = 0; i < 5; i++) dqh[ i] = dq_ns(exb( q_0, i * 6 , 0x3f), 32); 32 | dqh[ 5 ] = dq_ns(exb(q_1, q_0, 30, 0x3f), 32); 33 | for (int i = 0; i < 4; i++) dqh[ 6 + i] = dq_ns(exb( q_1, i * 6 + 4, 0x3f), 32); 34 | dqh[10 ] = dq_ns(exb(q_2, q_1, 28, 0x3f), 32); 35 | for (int i = 0; i < 5; i++) dqh[11 + i] = dq_ns(exb( q_2, i * 6 + 2, 0x3f), 32); 36 | 37 | for (int i = 0; i < 8; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]); 38 | } 39 | 40 | 
#endif 41 | 42 | #endif 43 | 44 | 45 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllamav2/cuda/quant/qdq_8.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _qdq_8_cuh 2 | #define _qdq_8_cuh 3 | 4 | #include "qdq_util.cuh" 5 | #include "../../config.h" 6 | 7 | #if QMODE_8BIT == 1 8 | 9 | // Not implemented 10 | 11 | #else 12 | 13 | __forceinline__ __device__ void shuffle_8bit_4 14 | ( 15 | uint32_t* q, 16 | int stride 17 | ) 18 | { 19 | } 20 | 21 | __forceinline__ __device__ void dequant_8bit_8 22 | ( 23 | const uint32_t q_0, 24 | const uint32_t q_1, 25 | half2 (&dq)[4], 26 | int stride 27 | ) 28 | { 29 | half dqh[8]; 30 | for (int i = 0; i < 4; i++) dqh[i ] = dq_ns(exb(q_0, i * 8, 0xff), 128); 31 | for (int i = 0; i < 4; i++) dqh[i + 4] = dq_ns(exb(q_1, i * 8, 0xff), 128); 32 | 33 | for (int i = 0; i < 4; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]); 34 | } 35 | 36 | #endif 37 | 38 | #endif -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllamav2/cuda/quant/qdq_util.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _qdq_util_cuh 2 | #define _qdq_util_cuh 3 | 4 | union half2_uint32 5 | { 6 | uint32_t as_uint32; 7 | half2 as_half2; 8 | __device__ half2_uint32(uint32_t val) : as_uint32(val) {} 9 | __device__ half2_uint32(half2 val) : as_half2(val) {} 10 | }; 11 | 12 | union half_uint16 13 | { 14 | uint16_t as_uint16; 15 | half as_half; 16 | __device__ half_uint16(uint16_t val) : as_uint16(val) {} 17 | __device__ half_uint16(half val) : as_half(val) {} 18 | }; 19 | 20 | // Max_scale premultiplied by 1/256 21 | 22 | __forceinline__ __device__ half dq_scale(const int qs, const half max_scale) 23 | { 24 | int qs_i = qs + 1; 25 | half qs_h = __int2half_rn(qs_i * qs_i); 26 | qs_h = __hmul(qs_h, max_scale); 27 | return qs_h; 28 | } 29 | 30 | __forceinline__ __device__ half dq(const int q, const int qzero, const half scale) 31 | { 32 | return __hmul(__int2half_rn(q - qzero), scale); 33 | } 34 | 35 | __forceinline__ __device__ half dq_ns(const int q, const int qzero) 36 | { 37 | //return __hsub(__int2half_rn(q), __int2half_rn(qzero)); 38 | return __int2half_rn(q - qzero); 39 | } 40 | 41 | __forceinline__ __device__ int exb(const uint32_t q, const int shift, const int mask) 42 | { 43 | return (int)((q >> shift) & mask); 44 | } 45 | 46 | __forceinline__ __device__ int exb(const uint32_t q1, const uint32_t q0, const int shift, const int mask) 47 | { 48 | return (int)(__funnelshift_rc(q0, q1, shift) & mask); 49 | } 50 | 51 | #endif 52 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/exllamav2/cuda/util.cuh: -------------------------------------------------------------------------------- 1 | 2 | #define DIVIDE(x, size) (((x) + (size) - 1) / (size)) 3 | 4 | #define DBGS(__x) printf("%s\n", __x) 5 | #define DBGI(__x) printf("%s: %i\n", #__x, __x) 6 | #define DBGI2(__x, __y) printf("%s, %s: %i, %i\n", #__x, #__y, __x, __y) 7 | #define DBGI3(__x, __y, __z) printf("%s, %s, %s: %i, %i, %i\n", #__x, #__y, #__z, __x, __y, __z) 8 | #define DBGX(__x) printf("%s: %x\n", #__x, __x) 9 | #define DBGX2(__x, __y) printf("%s, %s: %x, %x\n", #__x, #__y, __x, __y) 10 | #define DBGX3(__x, __y, __z) 
printf("%s, %s, %s: %x, %x, %x\n", #__x, #__y, #__z, __x, __y, __z) 11 | #define DBGF(__x) printf("%s: %f\n", #__x, __x) 12 | #define DBGF2(__x, __y) printf("%s, %s: %f, %f\n", #__x, #__y, __x, __y) 13 | #define DBGF3(__x, __y, __z) printf("%s, %s, %s: %f, %f, %f\n", #__x, #__y, #__z, __x, __y, __z) 14 | #define DBGH(__x) printf("%s: %f\n", #__x, __half2float(__x)) 15 | #define DBGH2(__x, __y) printf("%s, %s: %f, %f\n", #__x, #__y, __half2float(__x), __half2float(__y)) 16 | #define DBGH3(__x, __y, __z) printf("%s, %s, %s: %f, %f, %f\n", #__x, #__y, #__z, __half2float(__x), __half2float(__y), __half2float(__z)) 17 | 18 | #define DBGIH(__x, __y) printf("%s, %s: %i, %f\n", #__x, #__y, __x, __half2float(__y)) 19 | #define DBGIH2(__x, __y, __z) printf("%s, %s, %s: %i, %f, %f\n", #__x, #__y, #__z, __x, __half2float(__y), __half2float(__z)) 20 | 21 | __forceinline__ __device__ half dq_scale_(const int qs, const half max_scale) 22 | { 23 | half qs_h = __hmul(__int2half_rn(qs + 1), __float2half_rn(1.0f / 16.0f)); 24 | qs_h = __hmul(qs_h, qs_h); 25 | qs_h = __hmul(qs_h, max_scale); 26 | return qs_h; 27 | } 28 | 29 | __forceinline__ __device__ float clamp(float x, float a, float b) 30 | { 31 | return fmaxf(a, fminf(b, x)); 32 | } 33 | 34 | #define cuda_check(ans) { gpu_assert((ans), __FILE__, __LINE__); } 35 | inline void gpu_assert(cudaError_t code, const char *file, int line, bool abort=true) 36 | { 37 | if (code != cudaSuccess) 38 | { 39 | fprintf(stderr,"CUDA error: %s %s %d\n", cudaGetErrorString(code), file, line); 40 | if (abort) exit(code); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/layernorm/layernorm.cu: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Adapted from NVIDIA FasterTransformer: 4 | https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/kernels/layernorm_kernels.cu 5 | 6 | */ 7 | 8 | #include 9 | #include 10 | #include "reduction.cuh" 11 | #include "layernorm.h" 12 | #include 13 | #include 14 | 15 | static inline __device__ float to_float(half src) 16 | { 17 | return __half2float(src); 18 | } 19 | 20 | static inline __device__ float to_float(float src) 21 | { 22 | return src; 23 | } 24 | 25 | template 26 | __global__ void generalT5LayerNorm( 27 | const T* __restrict input, const T* __restrict gamma, T* output, const float layernorm_eps, int m, int n) 28 | { 29 | // layernorm module in the T5 style No bias and no subtraction of mean. 
30 | const int tid = threadIdx.x; 31 | 32 | __shared__ float s_variance; 33 | float variance = 0.0f; 34 | 35 | float local_var_sum = 0.0f; 36 | for (int i = tid; i < n; i += blockDim.x) { 37 | float diff = to_float(__ldg(&input[blockIdx.x * n + i])); 38 | local_var_sum += diff * diff; 39 | } 40 | variance = blockReduceSum(local_var_sum); 41 | 42 | if (threadIdx.x == 0) { 43 | s_variance = rsqrtf(variance / (float)n + layernorm_eps); 44 | } 45 | __syncthreads(); 46 | 47 | for (int i = tid; i < n; i += blockDim.x) { 48 | output[blockIdx.x * n + i] = 49 | clamp_inf_for_half((to_float(input[blockIdx.x * n + i]) * s_variance) * to_float(__ldg(&gamma[i]))); 50 | } 51 | } 52 | 53 | 54 | template 55 | void invokeGeneralT5LayerNorm(T* out, 56 | const T* input, 57 | const T* gamma, 58 | // const T* beta, 59 | const float layernorm_eps, 60 | const int m, 61 | const int n) 62 | { 63 | dim3 grid(m); 64 | dim3 block(min(n, 1024)); 65 | 66 | /* For general cases, n is equal to hidden_units, e.g., 512/1024. 67 | Since we have warp shuffle inside the code, block.x % 32 should be 0. 68 | */ 69 | if (n % 32 != 0) { 70 | block.x = 1024; 71 | } 72 | 73 | block.x = block.x / (4 / sizeof(T)); // if using half, only need half of block.x 74 | 75 | /* should pay attention to the rsqrt precision*/ 76 | generalT5LayerNorm<<>>(input, gamma, out, layernorm_eps, m, n); // For gpt-3 77 | } 78 | 79 | template void invokeGeneralT5LayerNorm(half* out, 80 | const half* input, 81 | const half* gamma, 82 | // const half* beta, 83 | const float layernorm_eps, 84 | const int m, 85 | const int n); 86 | 87 | template void invokeGeneralT5LayerNorm(float* out, 88 | const float* input, 89 | const float* gamma, 90 | // const half* beta, 91 | const float layernorm_eps, 92 | const int m, 93 | const int n); 94 | 95 | 96 | 97 | // input b, n, c 98 | void layernorm_forward_cuda( 99 | torch::Tensor _input, 100 | torch::Tensor _gamma, 101 | torch::Tensor _out, 102 | float eps) 103 | { 104 | int m = _input.size(0) * _input.size(1); 105 | int n = _input.size(2); 106 | const at::cuda::OptionalCUDAGuard device_guard(device_of(_input)); 107 | 108 | auto input = reinterpret_cast(_input.data_ptr()); 109 | auto gamma = reinterpret_cast(_gamma.data_ptr()); 110 | auto out = reinterpret_cast(_out.data_ptr()); 111 | 112 | invokeGeneralT5LayerNorm(out, input, gamma, eps, m, n); 113 | } 114 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/layernorm/layernorm.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void layernorm_forward_cuda(torch::Tensor _input, torch::Tensor _gamma, torch::Tensor _out, float eps); 4 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/layernorm/reduction.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Adapted from NVIDIA FasterTransformer: 4 | https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/kernels/reduce_kernel_utils.cuh 5 | */ 6 | 7 | #pragma once 8 | #include 9 | #if ((__CUDACC_VER_MAJOR__ > 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0)) 10 | #include 11 | #else 12 | #include 13 | #endif 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #define HALF_FLT_MAX 65504.F 20 | #define FINAL_MASK 0xffffffff 21 | 22 | 23 | template 24 | inline __device__ T add(T 
a, T b) { 25 | return a + b; 26 | } 27 | 28 | template<> 29 | inline __device__ half2 add(half2 a, half2 b) { 30 | return __hadd2(a, b); 31 | } 32 | 33 | template<> 34 | inline __device__ half add(half a, half b) { 35 | return __hadd(a, b); 36 | } 37 | 38 | template 39 | __inline__ __device__ T warpReduceSum(T val) 40 | { 41 | #pragma unroll 42 | for (int mask = 16; mask > 0; mask >>= 1) 43 | val = add(val, __shfl_xor_sync(FINAL_MASK, val, mask, 32)); //__shfl_sync bf16 return float when sm < 80 44 | return val; 45 | } 46 | 47 | /* Calculate the sum of all elements in a block */ 48 | template 49 | __inline__ __device__ T blockReduceSum(T val) 50 | { 51 | static __shared__ T shared[32]; 52 | int lane = threadIdx.x & 0x1f; 53 | int wid = threadIdx.x >> 5; 54 | 55 | val = warpReduceSum(val); 56 | 57 | if (lane == 0) 58 | shared[wid] = val; 59 | 60 | __syncthreads(); 61 | 62 | // Modify from blockDim.x << 5 to blockDim.x / 32. to prevent 63 | // blockDim.x is not divided by 32 64 | val = (threadIdx.x < (blockDim.x / 32.f)) ? shared[lane] : (T)(0.0f); 65 | val = warpReduceSum(val); 66 | 67 | return val; 68 | } 69 | 70 | 71 | template 72 | __device__ __forceinline__ T clamp_inf_for_half(const float input) 73 | { 74 | return input; 75 | } 76 | 77 | template<> 78 | __device__ __forceinline__ half clamp_inf_for_half(const float input) 79 | { 80 | // clamp inf values to enable fp16 training 81 | return input > 0.0f ? __float2half(min(input, HALF_FLT_MAX - 1000)) : __float2half(max(input, -HALF_FLT_MAX + 1000)); 82 | } 83 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/position_embedding/pos_encoding.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | void rotary_embedding_neox( 5 | torch::Tensor& positions, 6 | torch::Tensor& query, 7 | torch::Tensor& key, 8 | int head_size, 9 | torch::Tensor& cos_sin_cache); -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/position_embedding/pos_encoding_kernels.cu: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Adapted from the VLLM project: 4 | https://github.com/vllm-project/vllm/blob/main/csrc/pos_encoding_kernels.cu 5 | 6 | */ 7 | 8 | #include 9 | #include 10 | #include "pos_encoding.h" 11 | 12 | template 13 | __global__ void rotary_embedding_neox_kernel( 14 | const int64_t* __restrict__ positions, // [num_tokens] 15 | scalar_t* __restrict__ query, // [num_tokens, num_heads, head_size] 16 | scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size] 17 | const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2] 18 | const int rot_dim, 19 | const int stride, 20 | const int num_heads, 21 | const int head_size) { 22 | // Each thread block is responsible for one token. 
23 | const int token_idx = blockIdx.x; 24 | int64_t pos = positions[token_idx]; 25 | const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim; 26 | 27 | const int embed_dim = rot_dim / 2; 28 | const int n = num_heads * embed_dim; 29 | for (int i = threadIdx.x; i < n; i += blockDim.x) { 30 | const int head_idx = i / embed_dim; 31 | const int token_head = token_idx * stride + head_idx * head_size; 32 | 33 | const int rot_offset = i % embed_dim; 34 | const int x_index = rot_offset; 35 | const int y_index = embed_dim + rot_offset; 36 | 37 | const int out_x = token_idx * stride + head_idx * head_size + x_index; 38 | const int out_y = token_idx * stride + head_idx * head_size + y_index; 39 | 40 | const scalar_t cos = __ldg(cache_ptr + x_index); 41 | const scalar_t sin = __ldg(cache_ptr + y_index); 42 | 43 | const scalar_t q_x = query[token_head + x_index]; 44 | const scalar_t q_y = query[token_head + y_index]; 45 | query[out_x] = q_x * cos - q_y * sin; 46 | query[out_y] = q_y * cos + q_x * sin; 47 | 48 | const scalar_t k_x = key[token_head + x_index]; 49 | const scalar_t k_y = key[token_head + y_index]; 50 | key[out_x] = k_x * cos - k_y * sin; 51 | key[out_y] = k_y * cos + k_x * sin; 52 | } 53 | } 54 | 55 | void rotary_embedding_neox( 56 | torch::Tensor& positions, // [b, num_tokens] 57 | torch::Tensor& query, // [b, num_tokens, 1, num_heads, head_size] 58 | torch::Tensor& key, // [b, num_tokens, 1, num_heads, head_size] 59 | int head_size, 60 | torch::Tensor& cos_sin_cache) // [max_position, rot_dim] 61 | { 62 | int num_tokens = query.size(0) * query.size(1); 63 | int rot_dim = cos_sin_cache.size(1); 64 | int num_heads = query.size(-2); 65 | int stride = num_heads * head_size; 66 | // TORCH_CHECK(stride == key.stride(0)); 67 | 68 | dim3 grid(num_tokens); 69 | dim3 block(std::min(num_heads * rot_dim / 2, 512)); 70 | const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 71 | AT_DISPATCH_FLOATING_TYPES_AND2( 72 | at::ScalarType::Half, 73 | at::ScalarType::BFloat16, 74 | query.scalar_type(), 75 | "rotary_embedding_neox", 76 | [&] { 77 | rotary_embedding_neox_kernel<<>>( 78 | positions.data_ptr(), 79 | query.data_ptr(), 80 | key.data_ptr(), 81 | cos_sin_cache.data_ptr(), 82 | rot_dim, 83 | stride, 84 | num_heads, 85 | head_size); 86 | }); 87 | } 88 | 89 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/pybind_awq.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "layernorm/layernorm.h" 4 | #include "quantization/gemm_cuda.h" 5 | #include "quantization/gemv_cuda.h" 6 | #include "position_embedding/pos_encoding.h" 7 | #include "vllm/moe_alig_block.h" 8 | #include "vllm/activation.h" 9 | #include "vllm/topk_softmax_kernels.h" 10 | 11 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) 12 | { 13 | m.def("layernorm_forward_cuda", &layernorm_forward_cuda, "FasterTransformer layernorm kernel"); 14 | m.def("gemm_forward_cuda", &gemm_forward_cuda, "Quantized GEMM kernel."); 15 | m.def("grouped_gemm_forward", &grouped_gemm_forward, "Quantized grouped GEMM kernel."); 16 | m.def("gemmv2_forward_cuda", &gemmv2_forward_cuda, "Quantized v2 GEMM kernel."); 17 | m.def("gemv_forward_cuda", &gemv_forward_cuda, "Quantized GEMV kernel."); 18 | m.def("rotary_embedding_neox", &rotary_embedding_neox, "Apply GPT-NeoX style rotary embedding to query and key"); 19 | m.def("dequantize_weights_cuda", &dequantize_weights_cuda, "Dequantize weights."); 20 
| m.def("moe_alig_block_size", &moe_alig_block_size, "Aligning the number of tokens to be processed by each expert such that it is divisible by the block size."); 21 | m.def("silu_and_mul", &silu_and_mul, "Activation function used in SwiGLU."); 22 | m.def("topk_softmax", &topk_softmax, "Computes fused topk and softmax operation."); 23 | } -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/pybind_awq_ft.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "attention/ft_attention.h" 4 | 5 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) 6 | { 7 | m.def("single_query_attention", &single_query_attention, "Attention with a single query", 8 | py::arg("q"), py::arg("k"), py::arg("v"), py::arg("k_cache"), py::arg("v_cache"), 9 | py::arg("length_per_sample_"), py::arg("alibi_slopes_"), py::arg("timestep"), py::arg("rotary_embedding_dim")=0, 10 | py::arg("rotary_base")=10000.0f, py::arg("neox_rotary_style")=true); 11 | } -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/pybind_awq_v2.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "quantization_new/gemm/gemm_cuda.h" 4 | #include "quantization_new/gemv/gemv_cuda.h" 5 | 6 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) 7 | { 8 | m.def("gemm_forward_cuda_prefill", &gemm_forward_cuda_prefill, "New quantized GEMM kernel."); 9 | m.def("gemv_forward_cuda_decode", &gemv_forward_cuda_decode, "New quantized GEMM kernel."); 10 | } -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/quantization/dequantize.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | Modified from NVIDIA FasterTransformer: https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h 3 | 4 | @article{lin2023awq, 5 | title={AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration}, 6 | author={Lin, Ji and Tang, Jiaming and Tang, Haotian and Yang, Shang and Dang, Xingyu and Han, Song}, 7 | journal={arXiv}, 8 | year={2023} 9 | } 10 | */ 11 | 12 | #pragma once 13 | 14 | 15 | __device__ uint4 dequantize_s4_to_fp16x2(uint32_t const& source) 16 | { 17 | uint4 result; 18 | 19 | uint32_t* h = reinterpret_cast(&result); 20 | uint32_t const i4s = reinterpret_cast(source); 21 | 22 | // First, we extract the i4s and construct an intermediate fp16 number. 23 | static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa; 24 | static constexpr uint32_t BOTTOM_MASK = 0x000f000f; 25 | static constexpr uint32_t TOP_MASK = 0x00f000f0; 26 | static constexpr uint32_t I4s_TO_F16s_MAGIC_NUM = 0x64006400; 27 | 28 | // Note that the entire sequence only requires 1 shift instruction. This is thanks to the register packing 29 | // format and the fact that we force our integers to be unsigned, and account for this in the fp16 subtractions. 30 | // In addition, I exploit the fact that sub and fma have the same throughput in order to convert elt_23 and 31 | // elt_67 to fp16 without having to shift them to the bottom bits before hand. 32 | 33 | // Shift right by 8 to now consider elt_45 and elt_67. 
Issue first to hide RAW dependency if we issue 34 | // immediately before required. 35 | const uint32_t top_i4s = i4s >> 8; 36 | // Extract elt_01 - (i4s & 0x000f000f) | 0x64006400 37 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 38 | : "=r"(h[0]) 39 | : "r"(i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 40 | // Extract elt_23 (i4s & 0x00f000f0) | 0x64006400 41 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 42 | : "=r"(h[1]) 43 | : "r"(i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 44 | // Extract elt_45 (top_i4s & 0x000f000f) | 0x64006400 45 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 46 | : "=r"(h[2]) 47 | : "r"(top_i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 48 | // Extract elt_67 (top_i4s & 0x00f000f0) | 0x64006400 49 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 50 | : "=r"(h[3]) 51 | : "r"(top_i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 52 | 53 | // I use inline PTX below because I am not sure if the compiler will emit float2half instructions if I use the 54 | // half2 ctor. In this case, I chose performance reliability over code readability. 55 | 56 | // This is the half2 {1032, 1032} represented as an integer. 57 | // static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64086408; 58 | // Haotian: subtract {1024, 1024} instead, we do not need to map to [-8, 7] 59 | static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64006400; 60 | // This is the half2 {1 / 16, 1 / 16} represented as an integer. 61 | static constexpr uint32_t ONE_SIXTEENTH = 0x2c002c00; 62 | // This is the half2 {-72, -72} represented as an integer. 63 | // static constexpr uint32_t NEG_72 = 0xd480d480; 64 | // Haotian: Let's use {-64, -64}. 65 | static constexpr uint32_t NEG_64 = 0xd400d400; 66 | 67 | // Finally, we construct the output numbers. 
68 | // Convert elt_01 69 | asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[0]) : "r"(h[0]), "r"(FP16_TOP_MAGIC_NUM)); 70 | // Convert elt_23 71 | asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[1]) : "r"(h[1]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); 72 | // Convert elt_45 73 | asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[2]) : "r"(h[2]), "r"(FP16_TOP_MAGIC_NUM)); 74 | // Convert elt_67 75 | asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[3]) : "r"(h[3]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); 76 | 77 | return result; 78 | } 79 | 80 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/quantization/gemm_cuda.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | torch::Tensor gemm_forward_cuda(torch::Tensor _in_feats, torch::Tensor _kernel, 4 | torch::Tensor _scaling_factors, torch::Tensor _zeros, int split_k_iters); 5 | 6 | torch::Tensor grouped_gemm_forward( 7 | torch::Tensor _in_feats, 8 | torch::Tensor _kernel, 9 | torch::Tensor _scaling_factors, 10 | torch::Tensor _zeros, 11 | torch::Tensor _topk_weights, 12 | torch::Tensor _sorted_token_ids_ptr, 13 | torch::Tensor _expert_ids_ptr, 14 | torch::Tensor _num_tokens_post_padded, 15 | bool mul_weights, 16 | int split_k_iters); 17 | 18 | torch::Tensor gemmv2_forward_cuda(torch::Tensor _in_feats, torch::Tensor _kernel, 19 | torch::Tensor _scaling_factors, torch::Tensor _zeros, int group_size, int split_k_iters); 20 | 21 | // Source - https://github.com/compressa-ai/AutoAWQ/blob/6673333456b8871522b11a7fb110de612edfdf95/awq_cuda/quantization/gemm_cuda.h#L9C1-L10C106 22 | torch::Tensor dequantize_weights_cuda(torch::Tensor _kernel, 23 | torch::Tensor _scaling_factors, torch::Tensor _zeros, int split_k_iters, int thx, int thy, bool dbg); -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/quantization/gemv_cuda.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | torch::Tensor gemv_forward_cuda( 5 | torch::Tensor _in_feats, 6 | torch::Tensor _kernel, 7 | torch::Tensor _scaling_factors, 8 | torch::Tensor _zeros, 9 | int group_size); 10 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/quantization_new/dequantize.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | Modified from NVIDIA FasterTransformer: https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h 3 | 4 | @article{lin2023awq, 5 | title={AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration}, 6 | author={Lin, Ji and Tang, Jiaming and Tang, Haotian and Yang, Shang and Dang, Xingyu and Han, Song}, 7 | journal={arXiv}, 8 | year={2023} 9 | } 10 | */ 11 | #include 12 | #pragma once 13 | 14 | __inline__ __device__ void dequantize_s4_to_fp16x2(half2 const &source, uint4 *result) 15 | { 16 | // uint4 result; 17 | 18 | uint32_t *h = reinterpret_cast(result); 19 | uint32_t const i4s = reinterpret_cast(source); 20 | 21 | // First, we extract the i4s and construct an intermediate fp16 number. 
22 | static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa; 23 | static constexpr uint32_t BOTTOM_MASK = 0x000f000f; 24 | static constexpr uint32_t TOP_MASK = 0x00f000f0; 25 | static constexpr uint32_t I4s_TO_F16s_MAGIC_NUM = 0x64006400; 26 | 27 | // Note that the entire sequence only requires 1 shift instruction. This is thanks to the register packing 28 | // format and the fact that we force our integers to be unsigned, and account for this in the fp16 subtractions. 29 | // In addition, I exploit the fact that sub and fma have the same throughput in order to convert elt_23 and 30 | // elt_67 to fp16 without having to shift them to the bottom bits before hand. 31 | 32 | // Shift right by 8 to now consider elt_45 and elt_67. Issue first to hide RAW dependency if we issue 33 | // immediately before required. 34 | const uint32_t top_i4s = i4s >> 8; 35 | // Extract elt_01 - (i4s & 0x000f000f) | 0x64006400 36 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 37 | : "=r"(h[0]) 38 | : "r"(i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 39 | // Extract elt_23 (i4s & 0x00f000f0) | 0x64006400 40 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 41 | : "=r"(h[1]) 42 | : "r"(i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 43 | // Extract elt_45 (top_i4s & 0x000f000f) | 0x64006400 44 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 45 | : "=r"(h[2]) 46 | : "r"(top_i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 47 | // Extract elt_67 (top_i4s & 0x00f000f0) | 0x64006400 48 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 49 | : "=r"(h[3]) 50 | : "r"(top_i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 51 | 52 | // I use inline PTX below because I am not sure if the compiler will emit float2half instructions if I use the 53 | // half2 ctor. In this case, I chose performance reliability over code readability. 54 | 55 | // This is the half2 {1032, 1032} represented as an integer. 56 | // static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64086408; 57 | // Haotian: subtract {1024, 1024} instead, we do not need to map to [-8, 7] 58 | static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64006400; 59 | // This is the half2 {1 / 16, 1 / 16} represented as an integer. 60 | static constexpr uint32_t ONE_SIXTEENTH = 0x2c002c00; 61 | // This is the half2 {-72, -72} represented as an integer. 62 | // static constexpr uint32_t NEG_72 = 0xd480d480; 63 | // Haotian: Let's use {-64, -64}. 64 | static constexpr uint32_t NEG_64 = 0xd400d400; 65 | 66 | // Finally, we construct the output numbers. 
67 | // Convert elt_01 68 | asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[0]) : "r"(h[0]), "r"(FP16_TOP_MAGIC_NUM)); 69 | // Convert elt_23 70 | asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[1]) : "r"(h[1]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); 71 | // Convert elt_45 72 | asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[2]) : "r"(h[2]), "r"(FP16_TOP_MAGIC_NUM)); 73 | // Convert elt_67 74 | asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[3]) : "r"(h[3]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); 75 | 76 | // return result; 77 | } -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/quantization_new/gemm/gemm_cuda.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | torch::Tensor gemm_forward_cuda_prefill(torch::Tensor _in_feats, torch::Tensor _kernel, torch::Tensor _scales, torch::Tensor _zeros); 4 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/quantization_new/gemv/gemv_cuda.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | torch::Tensor gemv_forward_cuda_decode( 5 | torch::Tensor _in_feats, 6 | torch::Tensor _kernel, 7 | torch::Tensor _scaling_factors, 8 | torch::Tensor _zeros, 9 | int m, 10 | int n, 11 | int k, 12 | int group_size); 13 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/vllm/activation.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #define VLLM_LDG(arg) *(arg) 6 | 7 | #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ 8 | AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ 9 | AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ 10 | AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) 11 | 12 | #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) 
\ 13 | AT_DISPATCH_SWITCH( \ 14 | TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) 15 | 16 | template 17 | __device__ __forceinline__ T silu(const T& x) { 18 | // x * sigmoid(x) 19 | return (T) (((float) x) / (1.0f + expf((float) -x))); 20 | } 21 | 22 | template 23 | __global__ void silu_and_mul_kernel( 24 | scalar_t* __restrict__ out, // [..., d] 25 | const scalar_t* __restrict__ input, // [..., 2, d] 26 | const int d) { 27 | const int64_t token_idx = blockIdx.x; 28 | for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { 29 | const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]); 30 | const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]); 31 | out[token_idx * d + idx] = silu(x) * y; 32 | } 33 | } 34 | 35 | 36 | void silu_and_mul( 37 | torch::Tensor& out, // [..., d] 38 | torch::Tensor& input) // [..., 2 * d] 39 | { 40 | int64_t num_tokens = input.numel() / input.size(-1); 41 | int d = input.size(-1) / 2; 42 | 43 | dim3 grid(num_tokens); 44 | dim3 block(std::min(d, 1024)); 45 | const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); 46 | const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 47 | VLLM_DISPATCH_FLOATING_TYPES( 48 | input.scalar_type(), 49 | "silu_and_mul_kernel", 50 | [&] { 51 | silu_and_mul_kernel<<>>( 52 | out.data_ptr(), 53 | input.data_ptr(), 54 | d); 55 | }); 56 | } -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/vllm/activation.h: -------------------------------------------------------------------------------- 1 | void silu_and_mul( 2 | torch::Tensor& out, 3 | torch::Tensor& input); -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/vllm/moe_alig_block.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | const static size_t NUM_MAX_EXPERTS = 64; 9 | 10 | #define VLLM_DISPATCH_CASE_INTEGRAL_TYPES(...) \ 11 | AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) \ 12 | AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) \ 13 | AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__) \ 14 | AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__) \ 15 | AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__) 16 | 17 | #define VLLM_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) 
\ 18 | AT_DISPATCH_SWITCH( \ 19 | TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__)) 20 | 21 | template 22 | __global__ void moe_alig_block_size_kernel(scalar_t *__restrict__ topk_ids, 23 | int32_t *sorted_token_ids, 24 | int32_t *expert_ids, 25 | int32_t *total_tokens_post_pad, 26 | int32_t num_experts, 27 | int32_t block_size, 28 | size_t numel) { 29 | const size_t tokens_per_thread = ((numel + blockDim.x - 1) / blockDim.x); 30 | const size_t start_idx = threadIdx.x * tokens_per_thread; 31 | __shared__ int32_t tokens_cnts[NUM_MAX_EXPERTS + 1][NUM_MAX_EXPERTS]; 32 | __shared__ int32_t cumsum[NUM_MAX_EXPERTS + 1]; 33 | for(int i = 0;i < num_experts;i++){ 34 | tokens_cnts[threadIdx.x + 1][i] = 0; 35 | } 36 | 37 | for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) { 38 | ++tokens_cnts[threadIdx.x + 1][topk_ids[i]]; 39 | } 40 | 41 | __syncthreads(); 42 | 43 | tokens_cnts[0][threadIdx.x] = 0; 44 | for(int i=1;i<=blockDim.x;++i){ 45 | tokens_cnts[i][threadIdx.x] += tokens_cnts[i-1][threadIdx.x]; 46 | } 47 | 48 | __syncthreads(); 49 | 50 | if(threadIdx.x ==0){ 51 | cumsum[0] = 0; 52 | for(int i=1;i<=num_experts;++i){ 53 | cumsum[i] = cumsum[i-1] + (tokens_cnts[blockDim.x][i - 1] + block_size - 1) / block_size * block_size; 54 | } 55 | *total_tokens_post_pad = cumsum[num_experts]; 56 | } 57 | 58 | __syncthreads(); 59 | 60 | for(int i= cumsum[threadIdx.x];i<<<1, num_experts, 0, stream>>>( 88 | topk_ids.data_ptr(), 89 | sorted_token_ids.data_ptr(), 90 | experts_ids.data_ptr(), 91 | num_tokens_post_pad.data_ptr(), 92 | num_experts, 93 | block_size, 94 | topk_ids.numel()); 95 | }); 96 | } -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/vllm/moe_alig_block.h: -------------------------------------------------------------------------------- 1 | void moe_alig_block_size( 2 | torch::Tensor topk_ids, 3 | int num_experts, 4 | int block_size, 5 | torch::Tensor sorted_token_ids, 6 | torch::Tensor experts_ids, 7 | torch::Tensor num_tokens_post_pad 8 | ); -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/awq_ext/vllm/topk_softmax_kernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | void topk_softmax( 6 | torch::Tensor& topk_weights, 7 | torch::Tensor& topk_indices, 8 | torch::Tensor& token_expert_indices, 9 | torch::Tensor& gating_output); -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/AutoAWQ_kernels/scripts/download_wheels.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Set variables 4 | AWQ_KERNELS_VERSION="0.0.6" 5 | RELEASE_URL="https://api.github.com/repos/casper-hansen/AutoAWQ_kernels/releases/tags/v${AWQ_KERNELS_VERSION}" 6 | 7 | # Create a directory to download the wheels 8 | mkdir -p dist 9 | cd dist 10 | 11 | # Download all the wheel files from the GitHub release 12 | # excluding ones with '+cu' (%2B is + but encoded) 13 | curl -s $RELEASE_URL | \ 14 | jq -r ".assets[].browser_download_url" | \ 15 | grep '\.whl' | \ 16 | grep -v '%2Bcu' | \ 17 | grep -v '%2Brocm' | \ 18 | xargs -n 1 -P 4 wget 19 | 20 | # Rename the wheels from 'linux_x86_64' to 'manylinux_x86_64' 21 | for file in *linux_x86_64.whl; do 22 | mv "$file" "$(echo $file | sed 
's/linux_x86_64/manylinux2014_x86_64/')" 23 | done 24 | 25 | cd .. 26 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 MIT HAN Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/llmtuner/compression/quantization/AutoAWQ/__init__.py -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.2.4" 2 | 3 | import sys 4 | 5 | transformers_path = "/mnt/petrelfs/dongdaize.d/workspace/compression/src" 6 | sys.path = [transformers_path] + sys.path 7 | 8 | from awq.models.auto import AutoAWQForCausalLM 9 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from awq.evaluation.eval_utils import ( 2 | evaluate_perplexity, 3 | eval_librispeech, 4 | eval_mmlu, 5 | ) 6 | from awq.evaluation.humaneval_utils import eval_humaneval 7 | from awq.evaluation.kl_divergence import eval_kl_divergence 8 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .mpt import MptAWQForCausalLM 2 | from .llama import LlamaAWQForCausalLM 3 | from .opt import OptAWQForCausalLM 4 | from .falcon import FalconAWQForCausalLM 5 | from .bloom import BloomAWQForCausalLM 6 | from .gptj import GPTJAWQForCausalLM 7 | from .gpt_bigcode import GptBigCodeAWQForCausalLM 8 | from .mistral import MistralAWQForCausalLM 9 | from .gpt_neox import GPTNeoXAWQForCausalLM 10 | from .aquila import AquilaAWQForCausalLM 11 | from .yi import YiAWQForCausalLM 12 | from .qwen import QwenAWQForCausalLM 13 | from .baichuan import 
BaichuanAWQForCausalLM 14 | from .llava import LlavaAWQForCausalLM 15 | from .mixtral import MixtralAWQForCausalLM 16 | from .qwen2 import Qwen2AWQForCausalLM 17 | from .gemma import GemmaAWQForCausalLM 18 | from .stablelm import StableLmAWQForCausalLM 19 | from .deepseek import DeepseekAWQForCausalLM 20 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/models/_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from typing import Dict, Optional, List 4 | from dataclasses import dataclass, field 5 | from transformers.utils.hub import PushToHubMixin, cached_file 6 | 7 | 8 | @dataclass 9 | class AwqConfig(PushToHubMixin): 10 | quant_method: str = field(default="awq") 11 | zero_point: bool = field(default=True) 12 | q_group_size: int = field(default=128) 13 | w_bit: int = field(default=4) 14 | version: str = field(default="gemm") 15 | config_file_name = "config.json" 16 | modules_to_not_convert: Optional[List] = None 17 | 18 | @classmethod 19 | def from_dict(cls, quant_config: Dict = {}): 20 | if not quant_config: 21 | quant_config = cls() 22 | else: 23 | quant_config = cls(**quant_config) 24 | quant_config.version = quant_config.version.lower() 25 | 26 | return quant_config 27 | 28 | @classmethod 29 | def from_pretrained(cls, save_dir: str, **kwargs): 30 | cache_dir = kwargs.pop("cache_dir", None) 31 | force_download = kwargs.pop("force_download", False) 32 | resume_download = kwargs.pop("resume_download", False) 33 | proxies = kwargs.pop("proxies", None) 34 | local_files_only = kwargs.pop("local_files_only", False) 35 | use_auth_token = kwargs.pop("use_auth_token", None) 36 | revision = kwargs.pop("revision", None) 37 | subfolder = kwargs.pop("subfolder", None) 38 | commit_hash = kwargs.pop("_commit_hash", None) 39 | 40 | if os.path.isdir(save_dir): # Local 41 | resolved_config_file = os.path.join(save_dir, cls.config_file_name) 42 | else: # Remote 43 | resolved_config_file = cached_file( 44 | save_dir, 45 | cls.config_file_name, 46 | cache_dir=cache_dir, 47 | force_download=force_download, 48 | resume_download=resume_download, 49 | proxies=proxies, 50 | use_auth_token=use_auth_token, 51 | revision=revision, 52 | local_files_only=local_files_only, 53 | subfolder=subfolder, 54 | _raise_exceptions_for_missing_entries=False, 55 | _raise_exceptions_for_connection_errors=False, 56 | _commit_hash=commit_hash, 57 | ) 58 | 59 | quant_config = None 60 | if os.path.exists(resolved_config_file): 61 | with open(resolved_config_file, "r", encoding="utf-8") as file: 62 | loaded_config = json.loads(file.read()) 63 | 64 | quant_config = loaded_config.get("quantization_config") 65 | 66 | if quant_config is not None: 67 | awq_config = cls.from_transformers_dict(cls, quant_config) 68 | quant_config = cls(**awq_config) 69 | 70 | if quant_config is None: 71 | quant_config = cls() 72 | 73 | return quant_config 74 | 75 | def to_dict(self): 76 | return { 77 | "zero_point": self.zero_point, 78 | "q_group_size": self.q_group_size, 79 | "w_bit": self.w_bit, 80 | "version": self.version, 81 | "modules_to_not_convert": self.modules_to_not_convert, 82 | } 83 | 84 | def to_transformers_dict(self): 85 | return { 86 | "quant_method": self.quant_method, 87 | "zero_point": self.zero_point, 88 | "group_size": self.q_group_size, 89 | "bits": self.w_bit, 90 | "version": self.version.lower(), 91 | "modules_to_not_convert": self.modules_to_not_convert, 92 | } 93 | 94 | def 
from_transformers_dict(self, transformers_dict: Dict): 95 | return { 96 | "quant_method": transformers_dict.get("quant_method"), 97 | "zero_point": transformers_dict.get("zero_point"), 98 | "q_group_size": transformers_dict.get("group_size"), 99 | "w_bit": transformers_dict.get("bits"), 100 | "version": transformers_dict.get("version"), 101 | "modules_to_not_convert": transformers_dict.get("modules_to_not_convert"), 102 | } 103 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/models/bloom.py: -------------------------------------------------------------------------------- 1 | from .base import BaseAWQForCausalLM 2 | from transformers.models.bloom.modeling_bloom import BloomForCausalLM, BloomBlock 3 | 4 | 5 | class BloomAWQForCausalLM(BaseAWQForCausalLM): 6 | layer_type = "BloomBlock" 7 | 8 | @staticmethod 9 | def get_model_layers(model: BloomForCausalLM): 10 | return model.transformer.h 11 | 12 | @staticmethod 13 | def get_act_for_scaling(module: BloomBlock): 14 | return dict( 15 | is_scalable=True, 16 | scale_name="mlp.gelu_impl", 17 | scale_layer=module.mlp.gelu_impl, 18 | scale_shape=module.mlp.dense_h_to_4h.out_features, 19 | ) 20 | 21 | @staticmethod 22 | def move_embed(model: BloomForCausalLM, device: str): 23 | model.transformer.word_embeddings = model.transformer.word_embeddings.to(device) 24 | model.transformer.word_embeddings_layernorm = ( 25 | model.transformer.word_embeddings_layernorm.to(device) 26 | ) 27 | 28 | @staticmethod 29 | def get_layers_for_scaling(module: BloomBlock, input_feat, module_kwargs): 30 | layers = [] 31 | 32 | # attention input 33 | layers.append( 34 | dict( 35 | prev_op=module.input_layernorm, 36 | layers=[module.self_attention.query_key_value], 37 | inp=input_feat["self_attention.query_key_value"], 38 | module2inspect=module, 39 | kwargs=module_kwargs, 40 | ) 41 | ) 42 | # attention out 43 | # Please refer to https://github.com/mit-han-lab/llm-awq/issues/2#issuecomment-1606297469 44 | """ 45 | scales_list.append(_auto_get_scale( 46 | prev_op=module.self_attention.query_key_value, 47 | layers=[module.self_attention.dense], 48 | inp=input_feat['self_attention.dense'], 49 | )) 50 | """ 51 | # linear 1 52 | layers.append( 53 | dict( 54 | prev_op=module.post_attention_layernorm, 55 | layers=[module.mlp.dense_h_to_4h], 56 | inp=input_feat["mlp.dense_h_to_4h"], 57 | module2inspect=module, 58 | kwargs=module_kwargs, 59 | ) 60 | ) 61 | # linear 2 62 | layers.append( 63 | dict( 64 | prev_op=module.mlp.gelu_impl, 65 | layers=[module.mlp.dense_4h_to_h], 66 | inp=input_feat["mlp.dense_4h_to_h"], 67 | ) 68 | ) 69 | 70 | return layers 71 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/models/deepseek_moe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/llmtuner/compression/quantization/AutoAWQ/awq/models/deepseek_moe/__init__.py -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/models/gpt_bigcode.py: -------------------------------------------------------------------------------- 1 | from .base import BaseAWQForCausalLM 2 | from transformers.models.gpt_bigcode.modeling_gpt_bigcode import ( 3 | GPTBigCodeForCausalLM, 4 | GPTBigCodeBlock as OldGptBigCodeBlock, 5 | ) 6 | 7 | 8 | 
class GptBigCodeAWQForCausalLM(BaseAWQForCausalLM): 9 | layer_type = "GPTBigCodeBlock" 10 | max_seq_len_key = "n_positions" 11 | 12 | @staticmethod 13 | def get_model_layers(model: GPTBigCodeForCausalLM): 14 | return model.transformer.h 15 | 16 | @staticmethod 17 | def get_act_for_scaling(module: OldGptBigCodeBlock): 18 | return dict( 19 | is_scalable=True, 20 | scale_name="mlp.act", 21 | scale_layer=module.mlp.act, 22 | scale_shape=module.mlp.c_fc.out_features, 23 | ) 24 | 25 | @staticmethod 26 | def move_embed(model: GPTBigCodeForCausalLM, device): 27 | model.transformer.wte = model.transformer.wte.to(device) 28 | model.transformer.wpe = model.transformer.wpe.to(device) 29 | model.transformer.drop = model.transformer.drop.to(device) 30 | 31 | @staticmethod 32 | def get_layers_for_scaling(module: OldGptBigCodeBlock, input_feat, module_kwargs): 33 | layers = [] 34 | 35 | # attention input 36 | layers.append( 37 | dict( 38 | prev_op=module.ln_1, 39 | layers=[module.attn.c_attn], 40 | inp=input_feat["attn.c_attn"], 41 | module2inspect=module.attn, 42 | kwargs=module_kwargs, 43 | ) 44 | ) 45 | 46 | # linear 1 47 | layers.append( 48 | dict( 49 | prev_op=module.ln_2, 50 | layers=[module.mlp.c_fc], 51 | inp=input_feat["mlp.c_fc"], 52 | module2inspect=module.mlp, 53 | ) 54 | ) 55 | 56 | # linear 2 57 | layers.append( 58 | dict( 59 | prev_op=module.mlp.act, 60 | layers=[module.mlp.c_proj], 61 | inp=input_feat["mlp.c_proj"], 62 | ) 63 | ) 64 | 65 | return layers 66 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/models/gpt_neox.py: -------------------------------------------------------------------------------- 1 | from .base import BaseAWQForCausalLM 2 | from transformers.models.gpt_neox.modeling_gpt_neox import ( 3 | GPTNeoXLayer, 4 | GPTNeoXForCausalLM, 5 | ) 6 | 7 | 8 | class GPTNeoXAWQForCausalLM(BaseAWQForCausalLM): 9 | layer_type = "GPTNeoXDecoderLayer" 10 | max_seq_len_key = "max_position_embeddings" 11 | 12 | @staticmethod 13 | def get_model_layers(model: GPTNeoXForCausalLM): 14 | return model.gpt_neox.layers 15 | 16 | @staticmethod 17 | def get_act_for_scaling(module: GPTNeoXLayer): 18 | return dict( 19 | is_scalable=True, 20 | scale_name="mlp.act", 21 | scale_layer=module.mlp.act, 22 | scale_shape=module.mlp.dense_h_to_4h.out_features, 23 | ) 24 | 25 | @staticmethod 26 | def move_embed(model: GPTNeoXForCausalLM, device: str): 27 | model.gpt_neox.embed_in = model.gpt_neox.embed_in.to(device) 28 | 29 | @staticmethod 30 | def get_layers_for_scaling(module: GPTNeoXLayer, input_feat, module_kwargs): 31 | layers = [] 32 | 33 | # attention input 34 | layers.append( 35 | dict( 36 | prev_op=module.input_layernorm, 37 | layers=[module.attention.query_key_value], 38 | inp=input_feat["attention.query_key_value"], 39 | ) 40 | ) 41 | 42 | # attention out 43 | # Please refer to https://github.com/mit-han-lab/llm-awq/issues/2#issuecomment-1606297469 44 | """ 45 | layers.append(dict( 46 | prev_op=module.attention.query_key_value, 47 | layers=[module.attention.dense], 48 | inp=input_feat['attention.dense'], 49 | )) 50 | """ 51 | 52 | # linear 1 53 | layers.append( 54 | dict( 55 | prev_op=module.post_attention_layernorm, 56 | layers=[module.mlp.dense_h_to_4h], 57 | inp=input_feat["mlp.dense_h_to_4h"], 58 | ) 59 | ) 60 | 61 | # linear 2 62 | layers.append( 63 | dict( 64 | prev_op=module.mlp.act, 65 | layers=[module.mlp.dense_4h_to_h], 66 | inp=input_feat["mlp.dense_4h_to_h"], 67 | ) 68 | ) 69 | 70 | return layers 71 | 
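[Editor's note] The `get_layers_for_scaling` hooks in the model files above each pair the linears to be quantized (`layers`) with the op that feeds them (`prev_op`) and a captured activation sample (`inp`). The sketch below is a minimal, standalone PyTorch illustration of the identity these groups rely on — it is not AutoAWQ code, and the scale rule shown is a simplified stand-in for the activation-aware search AWQ actually performs: dividing the feeding op's output by a per-channel scale s while multiplying the next layer's input channels by s leaves the computed function unchanged.

import torch

torch.manual_seed(0)
n_tokens, d_in, d_out = 3, 8, 4

x = torch.randn(n_tokens, d_in)          # activations produced by prev_op (the captured `inp`)
w = torch.randn(d_out, d_in)             # weight of a linear listed in `layers`
s = x.abs().mean(dim=0).clamp(min=1e-5)  # illustrative per-input-channel scale (AWQ derives s from activation statistics)

y_ref = x @ w.t()                        # original output
y_folded = (x / s) @ (w * s).t()         # prev_op output folded by 1/s, weight columns folded by s

print("max abs diff:", (y_ref - y_folded).abs().max().item())
assert torch.allclose(y_ref, y_folded, atol=1e-4)

In the actual method, the search keeps the scale whose folded weight `w * s` quantizes with the least output error on the calibration activations, and the `1/s` factor is absorbed into `prev_op` (a norm or preceding linear), so inference cost is unchanged.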
-------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/models/gptj.py: -------------------------------------------------------------------------------- 1 | from .base import BaseAWQForCausalLM 2 | from transformers.models.gptj.modeling_gptj import GPTJForCausalLM, GPTJBlock 3 | 4 | 5 | class GPTJAWQForCausalLM(BaseAWQForCausalLM): 6 | layer_type = "GPTJBlock" 7 | max_seq_len_key = "n_positions" 8 | 9 | @staticmethod 10 | def get_model_layers(model: GPTJForCausalLM): 11 | return model.transformer.h 12 | 13 | @staticmethod 14 | def get_act_for_scaling(module: GPTJBlock): 15 | return dict( 16 | is_scalable=True, 17 | scale_name="mlp.act", 18 | scale_layer=module.mlp.act, 19 | scale_shape=module.mlp.fc_in.out_features, 20 | ) 21 | 22 | @staticmethod 23 | def move_embed(model: GPTJForCausalLM, device: str): 24 | model.transformer.wte = model.transformer.wte.to(device) 25 | 26 | @staticmethod 27 | def get_layers_for_scaling(module: GPTJBlock, input_feat, module_kwargs): 28 | layers = [] 29 | 30 | # attention input + linear 1 31 | layers.append( 32 | dict( 33 | prev_op=module.ln_1, 34 | layers=[ 35 | module.attn.q_proj, 36 | module.attn.k_proj, 37 | module.attn.v_proj, 38 | module.mlp.fc_in, 39 | ], 40 | inp=input_feat["attn.q_proj"], 41 | module2inspect=module, 42 | kwargs=module_kwargs, 43 | ) 44 | ) 45 | 46 | # attention out 47 | layers.append( 48 | dict( 49 | prev_op=module.attn.v_proj, 50 | layers=[module.attn.out_proj], 51 | inp=input_feat["attn.out_proj"], 52 | ) 53 | ) 54 | 55 | # linear 2 56 | layers.append( 57 | dict( 58 | prev_op=module.mlp.act, 59 | layers=[module.mlp.fc_out], 60 | inp=input_feat["mlp.fc_out"], 61 | ) 62 | ) 63 | 64 | return layers 65 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/models/mpt.py: -------------------------------------------------------------------------------- 1 | from .base import BaseAWQForCausalLM 2 | from transformers.models.mpt.modeling_mpt import MptBlock as OldMptBlock, MptForCausalLM 3 | 4 | 5 | class MptAWQForCausalLM(BaseAWQForCausalLM): 6 | layer_type = "MPTBlock" 7 | max_seq_len_key = "max_seq_len" 8 | 9 | @staticmethod 10 | def fuse_layers(model: MptForCausalLM): 11 | fuser = MptFuser(model) 12 | fuser.fuse_transformer() 13 | 14 | @staticmethod 15 | def get_model_layers(model: MptForCausalLM): 16 | return model.transformer.blocks 17 | 18 | @staticmethod 19 | def get_act_for_scaling(module: OldMptBlock): 20 | return dict( 21 | is_scalable=True, 22 | scale_name="ffn.act", 23 | scale_layer=module.ffn.act, 24 | scale_shape=module.ffn.up_proj.out_features, 25 | ) 26 | 27 | @staticmethod 28 | def move_embed(model: MptForCausalLM, device: str): 29 | model.transformer.wte = model.transformer.wte.to(device) 30 | model.transformer.emb_drop = model.transformer.emb_drop.to(device) 31 | 32 | @staticmethod 33 | def get_layers_for_scaling(module: OldMptBlock, input_feat, module_kwargs): 34 | layers = [] 35 | 36 | if module_kwargs.get("output_attentions") is not None: 37 | module_kwargs.pop("output_attentions") 38 | 39 | # attention input 40 | layers.append( 41 | dict( 42 | prev_op=module.norm_1, 43 | layers=[module.attn.Wqkv], 44 | inp=input_feat["attn.Wqkv"], 45 | module2inspect=module.attn, 46 | kwargs=module_kwargs, 47 | ) 48 | ) 49 | 50 | # attention output 51 | layers.append( 52 | dict( 53 | prev_op=module.attn.Wqkv, 54 | layers=[module.attn.out_proj], 55 | inp=input_feat["attn.out_proj"], 
56 | ) 57 | ) 58 | 59 | # linear 1 60 | layers.append( 61 | dict( 62 | prev_op=module.norm_2, 63 | layers=[module.ffn.up_proj], 64 | inp=input_feat["ffn.up_proj"], 65 | module2inspect=module.ffn, 66 | ) 67 | ) 68 | 69 | # linear 2 70 | layers.append( 71 | dict( 72 | prev_op=module.ffn.act, 73 | layers=[module.ffn.down_proj], 74 | inp=input_feat["ffn.down_proj"], 75 | ) 76 | ) 77 | 78 | return layers 79 | 80 | 81 | from typing import List, Tuple 82 | from awq.utils.utils import set_module_name 83 | from awq.modules.fused.block import MPTBlock 84 | from awq.modules.fused.model import MPTModel 85 | 86 | 87 | class MptFuser: 88 | def __init__(self, model: MptForCausalLM): 89 | self.model = model 90 | 91 | self.mpt_blocks: List[Tuple[str, OldMptBlock]] = [ 92 | (name, module) 93 | for name, module in self.model.named_modules() 94 | if "mptblock" in module.__class__.__name__.lower() 95 | ] 96 | 97 | def fuse_transformer(self): 98 | blocks = [] 99 | 100 | module: OldMptBlock 101 | for module in self.model.transformer.blocks: 102 | blocks.append( 103 | MPTBlock( 104 | self.model.config.d_model, 105 | self.model.config.n_heads, 106 | module.attn.Wqkv, 107 | module.attn.out_proj, 108 | module.ffn, 109 | module.norm_1, 110 | module.norm_2, 111 | next(iter(module.state_dict().values())).device, 112 | self.model.config.max_seq_len, 113 | ) 114 | ) 115 | 116 | self.model.transformer = MPTModel( 117 | self.model.config.vocab_size, 118 | blocks, 119 | self.model.transformer.wte, 120 | self.model.transformer.norm_f, 121 | ) 122 | 123 | setattr(self.model.transformer, "blocks", self.model.transformer.blocks) 124 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/models/opt.py: -------------------------------------------------------------------------------- 1 | from .base import BaseAWQForCausalLM 2 | from transformers.models.opt.modeling_opt import OPTForCausalLM, OPTDecoderLayer 3 | 4 | 5 | class OptAWQForCausalLM(BaseAWQForCausalLM): 6 | layer_type = "OPTDecoderLayer" 7 | max_seq_len_key = "max_position_embeddings" 8 | 9 | @staticmethod 10 | def get_model_layers(model: OPTForCausalLM): 11 | return model.model.decoder.layers 12 | 13 | @staticmethod 14 | def get_act_for_scaling(module: OPTDecoderLayer): 15 | return dict(is_scalable=False) 16 | 17 | @staticmethod 18 | def move_embed(model: OPTForCausalLM, device: str): 19 | model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(device) 20 | model.model.decoder.embed_positions = model.model.decoder.embed_positions.to( 21 | device 22 | ) 23 | 24 | @staticmethod 25 | def get_layers_for_scaling(module: OPTDecoderLayer, input_feat, module_kwargs): 26 | layers = [] 27 | 28 | # attention input 29 | layers.append( 30 | dict( 31 | prev_op=module.self_attn_layer_norm, 32 | layers=[ 33 | module.self_attn.q_proj, 34 | module.self_attn.k_proj, 35 | module.self_attn.v_proj, 36 | ], 37 | inp=input_feat["self_attn.q_proj"], 38 | module2inspect=module.self_attn, 39 | kwargs=module_kwargs, 40 | ) 41 | ) 42 | 43 | # attention out 44 | layers.append( 45 | dict( 46 | prev_op=module.self_attn.v_proj, 47 | layers=[module.self_attn.out_proj], 48 | inp=input_feat["self_attn.out_proj"], 49 | ) 50 | ) 51 | 52 | # linear 1 53 | layers.append( 54 | dict( 55 | prev_op=module.final_layer_norm, 56 | layers=[module.fc1], 57 | inp=input_feat["fc1"], 58 | ) 59 | ) 60 | 61 | # linear 2 62 | layers.append( 63 | dict( 64 | prev_op=module.fc1, 65 | layers=[module.fc2], 66 | inp=input_feat["fc2"], 67 
| ) 68 | ) 69 | 70 | return layers 71 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/models/qwen.py: -------------------------------------------------------------------------------- 1 | from .base import BaseAWQForCausalLM 2 | 3 | 4 | class QwenAWQForCausalLM(BaseAWQForCausalLM): 5 | layer_type = "QWenBlock" 6 | max_seq_len_key = "seq_length" 7 | 8 | @staticmethod 9 | def get_model_layers(model): 10 | return model.transformer.h 11 | 12 | @staticmethod 13 | def get_act_for_scaling(module): 14 | return dict(is_scalable=False) 15 | 16 | @staticmethod 17 | def move_embed(model, device: str): 18 | model.transformer.wte = model.transformer.wte.to(device) 19 | model.transformer.rotary_emb = model.transformer.rotary_emb.to(device) 20 | 21 | @staticmethod 22 | def get_layers_for_scaling(module, input_feat, module_kwargs): 23 | layers = [] 24 | 25 | # attention 26 | layers.append( 27 | dict( 28 | prev_op=module.ln_1, 29 | layers=[module.attn.c_attn], 30 | inp=input_feat["attn.c_attn"], 31 | module2inspect=module.attn, 32 | kwargs=module_kwargs, 33 | ) 34 | ) 35 | 36 | # mlp 37 | layers.append( 38 | dict( 39 | prev_op=module.ln_2, 40 | layers=[module.mlp.w2, module.mlp.w1], 41 | inp=input_feat["mlp.w2"], 42 | module2inspect=module.mlp, 43 | ) 44 | ) 45 | 46 | # linear 2 47 | layers.append( 48 | dict( 49 | prev_op=module.mlp.w1, 50 | layers=[module.mlp.c_proj], 51 | inp=input_feat["mlp.c_proj"], 52 | ) 53 | ) 54 | 55 | return layers 56 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/llmtuner/compression/quantization/AutoAWQ/awq/modules/__init__.py -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/modules/act.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | class ScaledActivation(nn.Module): 5 | def __init__(self, module, scales): 6 | super().__init__() 7 | self.act = module 8 | self.scales = nn.Parameter(scales.data) 9 | 10 | def forward(self, x): 11 | return self.act(x) / self.scales.view(1, 1, -1).to(x.device) 12 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/modules/fused/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/llmtuner/compression/quantization/AutoAWQ/awq/modules/fused/__init__.py -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/modules/fused/cache.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class WindowedCache: 5 | def __init__(self, cache_v_shape, cache_k_shape, max_seq_len, device): 6 | """ 7 | The window size is the same as the max_seq_len. The window will 8 | automatically roll once max_seq_len is exceeded. 
9 | """ 10 | # [batch_size, n_kv_heads, max_seq_len, head_dim] 11 | self.v = torch.zeros(cache_v_shape).to(device).half() 12 | # [batch_size, n_kv_heads, head_dim // pack_factor, max_seq_len, pack_factor] 13 | self.k = torch.zeros(cache_k_shape).to(device).half() 14 | self.max_seq_len = max_seq_len 15 | 16 | def get_kv(self, batch_size, start_pos, seqlen, head_dim): 17 | """ 18 | Gets the key-value store in correct shapes. 19 | """ 20 | xv = ( 21 | self.v[:batch_size, :, : start_pos + seqlen, :].transpose(1, 2).contiguous() 22 | ) 23 | xk = ( 24 | self.k[:batch_size, :, :, : start_pos + seqlen, :] 25 | .transpose(2, 3) 26 | .contiguous() 27 | ) 28 | xk = xk.reshape(xk.shape[:-2] + (head_dim,)).transpose(1, 2).contiguous() 29 | 30 | return xv, xk 31 | 32 | def update_kv(self, values_store, keys_store, batch_size, start_pos, seqlen): 33 | """ 34 | Updates the values in the key-value store. 35 | """ 36 | self.v[:batch_size, :, start_pos : start_pos + seqlen, :] = values_store 37 | self.k[:batch_size, :, :, start_pos : start_pos + seqlen, :] = keys_store 38 | 39 | def roll_kv_n_steps(self, start_pos, n=100): 40 | """ 41 | Roll cache n to the left. 42 | """ 43 | n = min(n, self.max_seq_len) 44 | # Roll cache to the left 45 | self.v = torch.roll(self.v, shifts=-n, dims=2) 46 | self.k = torch.roll(self.k, shifts=-n, dims=3) 47 | 48 | # Zero out the new part 49 | self.v[:, :, -n:, :] = 0 50 | self.k[:, :, :, -n:, :] = 0 51 | 52 | return start_pos - n 53 | 54 | def to(self, device): 55 | self.k = self.k.to(device) 56 | self.v = self.v.to(device) 57 | 58 | def increase_batch_size(self, to_bsz): 59 | """Dynamically allocate new kv when batch size changes.""" 60 | self.v = torch.zeros( 61 | to_bsz, *self.v.shape[1:], dtype=self.v.dtype, device=self.v.device 62 | ) 63 | self.k = torch.zeros( 64 | to_bsz, *self.k.shape[1:], dtype=self.k.dtype, device=self.k.device 65 | ) 66 | 67 | def decrease_batch_size(self, to_bsz): 68 | """Dynamically remove part of cache if batch size changes.""" 69 | self.v = self.v[:to_bsz, :, :, :] 70 | self.k = self.k[:to_bsz, :, :, :, :] 71 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/modules/fused/mlp.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | from awq.modules.linear.gemm import WQLinear_GEMM 4 | from awq.modules.linear.gemv import WQLinear_GEMV 5 | 6 | try: 7 | import awq_ext # with CUDA kernels 8 | 9 | AWQ_INSTALLED = True 10 | except: 11 | AWQ_INSTALLED = False 12 | 13 | 14 | class QuantFusedMLP(nn.Module): 15 | def __init__( 16 | self, 17 | gate_proj, 18 | down_proj, 19 | up_proj, 20 | activation=F.silu, 21 | ): 22 | super().__init__() 23 | 24 | self.register_buffer("gate_proj_qweight", gate_proj.qweight) 25 | self.register_buffer("gate_proj_scales", gate_proj.scales) 26 | self.register_buffer("gate_proj_qzeros", gate_proj.qzeros) 27 | self.register_buffer("up_proj_qweight", up_proj.qweight) 28 | self.register_buffer("up_proj_scales", up_proj.scales) 29 | self.register_buffer("up_proj_qzeros", up_proj.qzeros) 30 | 31 | self.in_features = gate_proj.in_features 32 | self.intermediate_size = gate_proj.out_features 33 | self.out_features = down_proj.out_features 34 | self.w_bit = gate_proj.w_bit 35 | self.down_proj = down_proj 36 | 37 | if isinstance(down_proj, WQLinear_GEMV): 38 | self.linear = awq_ext.gemv_forward_cuda 39 | self.group_size = down_proj.group_size 40 | else: 41 | 
self.linear = awq_ext.gemm_forward_cuda 42 | self.group_size = 8 43 | 44 | self.activation = activation 45 | 46 | def forward(self, x, routing_weights=None): 47 | out_shape = x.shape[:-1] + (self.intermediate_size,) 48 | x = x.reshape(-1, x.shape[-1]) 49 | gate_output = self.linear( 50 | x, 51 | self.gate_proj_qweight, 52 | self.gate_proj_scales, 53 | self.gate_proj_qzeros, 54 | self.group_size, 55 | ) 56 | up_output = self.linear( 57 | x, 58 | self.up_proj_qweight, 59 | self.up_proj_scales, 60 | self.up_proj_qzeros, 61 | self.group_size, 62 | ) 63 | x = self.activation(gate_output) * up_output 64 | x = x.reshape(out_shape) 65 | x = self.down_proj(x) 66 | 67 | if routing_weights is not None: 68 | x = routing_weights * x 69 | 70 | return x 71 | 72 | 73 | class QuantLlamaMLP(QuantFusedMLP): 74 | r""" 75 | QuantLlamaMLP class kept for backward compatibilty, in the future, users 76 | should always use `QuantFusedMLP` class instead. 77 | """ 78 | 79 | def __init__(self, gate_proj, down_proj, up_proj): 80 | super().__init__(gate_proj, down_proj, up_proj) 81 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/modules/fused/norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | try: 5 | import awq_ext # with CUDA kernels 6 | 7 | AWQ_INSTALLED = True 8 | except: 9 | AWQ_INSTALLED = False 10 | 11 | 12 | class FasterTransformerRMSNorm(nn.Module): 13 | def __init__(self, weight, eps=1e-6): 14 | super().__init__() 15 | self.weight = weight 16 | self.variance_epsilon = eps 17 | 18 | def forward(self, x): 19 | assert AWQ_INSTALLED, ( 20 | "AWQ kernels could not be loaded. " 21 | "Please install them from https://github.com/casper-hansen/AutoAWQ_kernels" 22 | ) 23 | 24 | output = torch.empty_like(x) 25 | awq_ext.layernorm_forward_cuda(x, self.weight, output, self.variance_epsilon) 26 | 27 | return output 28 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/modules/linear/__init__.py: -------------------------------------------------------------------------------- 1 | from .exllama import WQLinear_Exllama, exllama_post_init 2 | from .exllamav2 import WQLinear_ExllamaV2, exllamav2_post_init 3 | from .gemm import WQLinear_GEMM 4 | from .gemv import WQLinear_GEMV 5 | from .marlin import WQLinear_Marlin, marlin_post_init 6 | from .gemv_fast import WQLinear_GEMVFast 7 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/quantize/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/llmtuner/compression/quantization/AutoAWQ/awq/quantize/__init__.py -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/llmtuner/compression/quantization/AutoAWQ/awq/utils/__init__.py -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/utils/calib_data.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import logging 3 | from typing import List, Union 4 | from datasets import load_dataset 5 | 6 | 7 | def get_calib_dataset( 8 | data: Union[str, List[str], List[List[int]]] = "pileval", 9 | tokenizer=None, 10 | n_samples=512, 11 | block_size=512, 12 | split="compression", 13 | text_column="text", 14 | ): 15 | if isinstance(data, str): 16 | if data == "pileval": 17 | dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation") 18 | else: 19 | dataset = load_dataset(data, split=split) 20 | 21 | dataset = dataset.shuffle(seed=42) 22 | 23 | elif isinstance(data, list): 24 | if isinstance(data[0], str): 25 | dataset = [{text_column: text} for text in data] 26 | elif isinstance(data[0][0], int): 27 | dataset = data 28 | else: 29 | raise NotImplementedError( 30 | "Either pass a string to a huggingface dataset or a list" 31 | "that is preprocessed with one sample of text per element" 32 | " or a list of list of int for tokenized words." 33 | ) 34 | else: 35 | raise NotImplementedError( 36 | "Either pass a string to a huggingface dataset or a list" 37 | "that is preprocessed with one sample of text per element" 38 | " or a list of list of int for tokenized words." 39 | ) 40 | 41 | samples = [] 42 | n_run = 0 43 | for data in dataset: 44 | if isinstance(data, list): 45 | line_encoded = data 46 | else: 47 | line = data[text_column] 48 | line = line.strip() 49 | line_encoded = tokenizer.encode(line) 50 | if len(line_encoded) > 512: 51 | continue 52 | sample = torch.tensor([line_encoded]) 53 | if sample.numel() == 0: 54 | continue 55 | samples.append(sample) 56 | n_run += 1 57 | if n_run == n_samples: 58 | break 59 | # now concatenate all samples and split according to block size 60 | cat_samples = torch.cat(samples, dim=1) 61 | n_split = cat_samples.shape[1] // block_size 62 | logging.debug(f" * Split into {n_split} blocks") 63 | return [ 64 | cat_samples[:, i * block_size : (i + 1) * block_size] for i in range(n_split) 65 | ] 66 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/utils/module.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | def get_named_linears(module): 5 | return {name: m for name, m in module.named_modules() if isinstance(m, nn.Linear)} 6 | 7 | 8 | def get_op_by_name(module, op_name): 9 | # get the op by its name relative to the module 10 | for name, m in module.named_modules(): 11 | if name == op_name: 12 | return m 13 | raise ValueError(f"Cannot find op {op_name} in module {module}") 14 | 15 | 16 | def set_op_by_name(layer, name, new_module): 17 | levels = name.split(".") 18 | if len(levels) > 1: 19 | mod_ = layer 20 | for l_idx in range(len(levels) - 1): 21 | if levels[l_idx].isdigit(): 22 | mod_ = mod_[int(levels[l_idx])] 23 | else: 24 | mod_ = getattr(mod_, levels[l_idx]) 25 | setattr(mod_, levels[-1], new_module) 26 | else: 27 | setattr(layer, name, new_module) 28 | 29 | 30 | def get_op_name(module, op): 31 | # get the name of the op relative to the module 32 | for name, m in module.named_modules(): 33 | if m is op: 34 | return name 35 | raise ValueError(f"Cannot find op {op} in module {module}") 36 | 37 | 38 | def append_str_prefix(x, prefix): 39 | if isinstance(x, str): 40 | return prefix + x 41 | elif isinstance(x, tuple): 42 | return tuple([append_str_prefix(y, prefix) for y in x]) 43 | elif isinstance(x, 
list): 44 | return [append_str_prefix(y, prefix) for y in x] 45 | else: 46 | return x 47 | 48 | 49 | def exclude_layers_to_not_quantize(linear_layers, modules_to_not_convert): 50 | if modules_to_not_convert is None: 51 | return linear_layers 52 | 53 | filtered_layers = {} 54 | for name, linear_layer in linear_layers.items(): 55 | if not any(key in name for key in modules_to_not_convert): 56 | filtered_layers[name] = linear_layer 57 | elif "gate_proj" in name: # 🔍 add gate_proj to filtered_layers. 58 | filtered_layers[name] = linear_layer 59 | return filtered_layers 60 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/utils/packing_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | AWQ_ORDER = [0, 2, 4, 6, 1, 3, 5, 7] 5 | AWQ_REVERSE_ORDER = [0, 4, 1, 5, 2, 6, 3, 7] 6 | 7 | 8 | def unpack_awq(qweight: torch.Tensor, qzeros: torch.Tensor, bits: int): 9 | shifts = torch.arange(0, 32, bits, device=qzeros.device) 10 | 11 | # unpacking columnwise 12 | iweights = torch.bitwise_right_shift(qweight[:, :, None], shifts[None, None, :]).to( 13 | torch.int8 # smallest dtype available 14 | ) 15 | iweights = iweights.view(iweights.shape[0], -1) 16 | 17 | # unpacking columnwise 18 | izeros = torch.bitwise_right_shift(qzeros[:, :, None], shifts[None, None, :]).to( 19 | torch.int8 # smallest dtype available 20 | ) 21 | izeros = izeros.view(izeros.shape[0], -1) 22 | 23 | return iweights, izeros 24 | 25 | 26 | def reverse_awq_order(iweights: torch.Tensor, izeros: torch.Tensor, bits: int): 27 | reverse_order_tensor = torch.arange( 28 | izeros.shape[-1], 29 | dtype=torch.int32, 30 | device=izeros.device, 31 | ) 32 | reverse_order_tensor = reverse_order_tensor.view(-1, 32 // bits) 33 | reverse_order_tensor = reverse_order_tensor[:, AWQ_REVERSE_ORDER] 34 | reverse_order_tensor = reverse_order_tensor.view(-1) 35 | 36 | izeros = izeros[:, reverse_order_tensor] 37 | iweights = iweights[:, reverse_order_tensor] 38 | 39 | return iweights, izeros 40 | 41 | 42 | def pack_exllama(iweights: torch.Tensor, izeros: torch.Tensor, bits: int): 43 | shifts = torch.arange(0, 32, bits, device=iweights.device) 44 | 45 | # packing rowwise 46 | iweights = iweights.view(iweights.shape[0] // (32 // bits), 32 // bits, -1) 47 | qweight = ( 48 | torch.bitwise_left_shift(iweights, shifts[None, :, None]) 49 | .sum(dim=1) 50 | .to(torch.int32) 51 | ) 52 | 53 | # packing columnwise 54 | izeros = izeros.view(-1, izeros.shape[1] // (32 // bits), 32 // bits) 55 | qzeros = ( 56 | torch.bitwise_left_shift(izeros, shifts[None, None, :]) 57 | .sum(dim=-1) 58 | .to(torch.int32) 59 | ) 60 | 61 | return qweight, qzeros 62 | 63 | 64 | def unpack_reorder_pack(qweight, qzeros, bits): 65 | # Unpack the qweight and qzeros tensors 66 | iweight, izeros = unpack_awq(qweight, qzeros, bits) 67 | # Reverse the order of the iweight and izeros tensors 68 | iweight, izeros = reverse_awq_order(iweight, izeros, bits) 69 | 70 | # overflow checks 71 | iweight = torch.bitwise_and(iweight, (2**bits) - 1) 72 | izeros = torch.bitwise_and(izeros, (2**bits) - 1) 73 | 74 | # Subtract 1 from the izeros tensor (exllama adds 1 during inference) 75 | # We can remove it if we remove the +1 in the exllama code 76 | izeros = izeros - 1 77 | # Pack the qweight and qzeros tensors 78 | qweight, qzeros = pack_exllama(iweight, izeros, bits) 79 | 80 | return qweight, qzeros 81 | 82 | 83 | def dequantize_gemm(qweight, qzeros, scales, bits, 
group_size): 84 | # Unpack the qweight and qzeros tensors 85 | iweight, izeros = unpack_awq(qweight, qzeros, bits) 86 | # Reverse the order of the iweight and izeros tensors 87 | iweight, izeros = reverse_awq_order(iweight, izeros, bits) 88 | 89 | # overflow checks 90 | iweight = torch.bitwise_and(iweight, (2**bits) - 1) 91 | izeros = torch.bitwise_and(izeros, (2**bits) - 1) 92 | 93 | # fp16 weights 94 | scales = scales.repeat_interleave(group_size, dim=0) 95 | izeros = izeros.repeat_interleave(group_size, dim=0) 96 | iweight = (iweight - izeros) * scales 97 | 98 | return iweight 99 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/utils/parallel.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import gc 4 | import logging 5 | 6 | 7 | def auto_parallel(args): 8 | model_size = args.model_path.split("-")[-1] 9 | if model_size.endswith("m"): 10 | model_gb = 1 11 | else: 12 | model_gb = float(model_size[:-1]) 13 | if model_gb < 20: 14 | n_gpu = 1 15 | elif model_gb < 50: 16 | n_gpu = 4 17 | else: 18 | n_gpu = 8 19 | args.parallel = n_gpu > 1 20 | cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None) 21 | if isinstance(cuda_visible_devices, str): 22 | cuda_visible_devices = cuda_visible_devices.split(",") 23 | else: 24 | cuda_visible_devices = list(range(8)) 25 | os.environ["CUDA_VISIBLE_DEVICES"] = ",".join( 26 | [str(dev) for dev in cuda_visible_devices[:n_gpu]] 27 | ) 28 | logging.debug("CUDA_VISIBLE_DEVICES: ", os.environ["CUDA_VISIBLE_DEVICES"]) 29 | return cuda_visible_devices 30 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/awq/utils/utils.py: -------------------------------------------------------------------------------- 1 | import gc 2 | import torch 3 | import accelerate 4 | 5 | 6 | def get_module_by_name_suffix(model, module_name: str): 7 | for name, module in model.named_modules(): 8 | if name.endswith(module_name): 9 | return module 10 | 11 | 12 | def simple_dispatch_model(model, device_map): 13 | from accelerate.hooks import add_hook_to_module, AlignDevicesHook 14 | 15 | if "" in device_map: 16 | d = device_map[""] 17 | model = model.to(torch.device(d)) 18 | model.hf_device_map = device_map 19 | return model 20 | 21 | tied_params = accelerate.utils.modeling.find_tied_parameters(model) 22 | if set(device_map.values()) == {"cpu"} or set(device_map.values()) == { 23 | "cpu", 24 | "disk", 25 | }: 26 | main_device = "cpu" 27 | else: 28 | main_device = [d for d in device_map.values() if d not in ["cpu", "disk"]][0] 29 | 30 | cpu_offload_group = [(n, d) for n, d in device_map.items() if d == "cpu"] 31 | prev_hook = None 32 | for idx, (n, d) in enumerate(cpu_offload_group): 33 | m = get_module_by_name_suffix(model, n) 34 | _, prev_hook = accelerate.cpu_offload_with_hook( 35 | m, execution_device=main_device, prev_module_hook=prev_hook 36 | ) 37 | # set first cpu offload module's prev_module_hook to the last cpu offload module's hook 38 | if len(cpu_offload_group) > 1: 39 | get_module_by_name_suffix( 40 | model, cpu_offload_group[0][0] 41 | )._hf_hook.prev_module_hook = prev_hook 42 | 43 | for n, d in device_map.items(): 44 | m = get_module_by_name_suffix(model, n) 45 | if d != "cpu": 46 | d = torch.device(d) 47 | hook = AlignDevicesHook(d, io_same_device=True, place_submodules=True) 48 | add_hook_to_module(m, hook) 49 | 
accelerate.utils.modeling.retie_parameters(model, tied_params) 50 | model.hf_device_map = device_map 51 | 52 | return model 53 | 54 | 55 | def set_module_name(model, name, value): 56 | if "." in name: 57 | parent_name = name.rsplit(".", 1)[0] 58 | child_name = name[len(parent_name) + 1 :] 59 | parent = model.get_submodule(parent_name) 60 | else: 61 | parent_name = "" 62 | parent = model 63 | child_name = name 64 | 65 | setattr(parent, child_name, value) 66 | 67 | 68 | def clear_memory(weight=None): 69 | if weight is not None: 70 | del weight 71 | gc.collect() 72 | torch.cuda.empty_cache() 73 | 74 | 75 | def compute_memory_used_pct(device): 76 | memory_used = torch.cuda.max_memory_allocated(device) / (1024**3) 77 | memory_pct = ( 78 | memory_used 79 | / (torch.cuda.get_device_properties(device).total_memory / (1024**3)) 80 | * 100 81 | ) 82 | return memory_pct 83 | 84 | 85 | def get_best_device(): 86 | if torch.backends.mps.is_available(): 87 | return "mps" 88 | elif torch.cuda.is_available(): 89 | return "cuda:0" 90 | else: 91 | return "cpu" 92 | 93 | 94 | def get_lowest_memory_device_index(): 95 | device = None 96 | curr_device_memory_pct = 0 97 | for device_index in range(torch.cuda.device_count()): 98 | device_memory_pct = compute_memory_used_pct(device_index) 99 | if device is None or device_memory_pct < curr_device_memory_pct: 100 | device = device_index 101 | curr_device_memory_pct = device_memory_pct 102 | 103 | return device 104 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoAWQ/quantize.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | import os 4 | 5 | from transformers import AutoTokenizer 6 | from llmtuner.compression.quantization.AutoAWQ.awq import AutoAWQForCausalLM 7 | 8 | 9 | model_path = sys.argv[1] 10 | quant_path = sys.argv[2] 11 | bits = sys.argv[3] 12 | q_group_size = 128  # group size for AWQ quantization (128 is the usual AutoAWQ default; adjust as needed) 13 | quant_config = { 14 | "zero_point": True, 15 | "q_group_size": q_group_size, 16 | "w_bit": int(bits), 17 | "version": "GEMM", 18 | } 19 | 20 | print(f"quant_config: {quant_config}") 21 | # Load model 22 | model = AutoAWQForCausalLM.from_pretrained( 23 | model_path, **{"low_cpu_mem_usage": True, "use_cache": False} 24 | ) 25 | 26 | try: 27 | tokenizer = AutoTokenizer.from_pretrained( 28 | model_path, 29 | use_fast=False, 30 | trust_remote_code=True, 31 | ) 32 | except Exception:  # fall back to the fast tokenizer if the slow one cannot be loaded 33 | tokenizer = AutoTokenizer.from_pretrained( 34 | model_path, 35 | use_fast=True, 36 | trust_remote_code=True, 37 | ) 38 | 39 | # Quantize 40 | model.quantize(tokenizer, quant_config=quant_config) 41 | 42 | # Save quantized model 43 | model.save_quantized(quant_path) 44 | tokenizer.save_pretrained(quant_path) 45 | f = open(os.path.join(quant_path, "quantize_config.json"), 'w') 46 | config_to_save = json.dumps(quant_config, indent=2, sort_keys=True) 47 | f.write(config_to_save) 48 | f.close() 49 | print(f'Model is quantized and saved at "{quant_path}"') -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/llmtuner/compression/quantization/AutoGPTQ/__init__.py -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/__init__.py:
-------------------------------------------------------------------------------- 1 | from .modeling import AutoGPTQForCausalLM, BaseQuantizeConfig 2 | from .utils.exllama_utils import exllama_set_max_input_length 3 | from .utils.peft_utils import get_gptq_peft_model 4 | 5 | __version__ = "0.7.1" 6 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/eval_tasks/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_modeling_task import LanguageModelingTask 2 | from .sequence_classification_task import SequenceClassificationTask, get_predictions 3 | from .text_summarization_task import TextSummarizationTask 4 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/eval_tasks/_base.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from typing import Any, Dict, List, Optional, Union 3 | 4 | import torch 5 | from transformers import PreTrainedModel, PreTrainedTokenizer 6 | 7 | from ..modeling import BaseGPTQForCausalLM 8 | from ..utils.data_utils import get_dataloader 9 | 10 | 11 | class BaseTask: 12 | def __init__( 13 | self, 14 | model: Union[BaseGPTQForCausalLM, PreTrainedModel], 15 | tokenizer: PreTrainedTokenizer, 16 | data_name_or_path: str, 17 | prompt_col_name: str, 18 | label_col_name: str, 19 | device: Optional[str] = None, 20 | **kwargs, 21 | ): 22 | self.model = model 23 | self.tokenizer = tokenizer 24 | if self.tokenizer.pad_token_id is None: 25 | self.tokenizer.pad_token = self.tokenizer.eos_token 26 | self.tokenizer.pad_token_id = self.tokenizer.eos_token_id 27 | self.model.config.pad_token_id = self.tokenizer.eos_token_id 28 | self.dl = get_dataloader( 29 | data_name_or_path, 30 | prompt_col_name=prompt_col_name, 31 | label_col_name=label_col_name, 32 | tokenizer=tokenizer, 33 | **kwargs, 34 | ) 35 | 36 | self.device = device 37 | if not self.device: 38 | self.device = self.model.device 39 | if isinstance(self.device, str): 40 | self.device = torch.device(self.device) 41 | 42 | @abstractmethod 43 | def _predict(self, batch_data: Dict[str, Any], **kwargs) -> List[Any]: 44 | pass 45 | 46 | @abstractmethod 47 | def _parse_labels(self, label_ids: torch.LongTensor) -> List[Any]: 48 | pass 49 | 50 | @abstractmethod 51 | def _metric(self, pred: List[Any], label: List[Any]) -> Dict[str, float]: 52 | pass 53 | 54 | def run(self, **predict_kwargs) -> Dict[str, float]: 55 | with torch.inference_mode(), torch.amp.autocast(device_type=self.device.type): 56 | predictions = [] 57 | labels = [] 58 | for batch_data in self.dl: 59 | for k, v in batch_data.items(): 60 | if isinstance(v, torch.Tensor): 61 | batch_data[k] = v.to(self.device) 62 | labels += self._parse_labels(batch_data["labels"]) 63 | predictions += self._predict(batch_data, **predict_kwargs) 64 | 65 | return self._metric(predictions, labels) 66 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/eval_tasks/_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/eval_tasks/_utils/__init__.py 
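BaseTask above encapsulates the whole evaluation loop: the constructor builds a dataloader via get_dataloader, and run() moves each batch to the target device, collects the outputs of _parse_labels and _predict, and hands them to _metric. A minimal usage sketch for the LanguageModelingTask re-exported by eval_tasks/__init__.py might look like the following; the model directory, dataset path, and column names are placeholders, the imports use the upstream auto_gptq package name (inside this repository the vendored llmtuner.compression.quantization.AutoGPTQ.auto_gptq path applies), and additional keyword arguments may be needed depending on what utils.data_utils.get_dataloader accepts.

# Illustrative only: driving the eval-task API with placeholder paths/columns.
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
from auto_gptq.eval_tasks import LanguageModelingTask

model_dir = "path/to/quantized-model"               # placeholder
tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=True)
model = AutoGPTQForCausalLM.from_quantized(model_dir, device="cuda:0")

task = LanguageModelingTask(
    model=model,
    tokenizer=tokenizer,
    data_name_or_path="path/to/eval_data.json",     # placeholder dataset
    prompt_col_name="prompt",                       # placeholder column names
    label_col_name="output",
    device="cuda:0",
)
print(task.run())  # LanguageModelingTask._metric reports {"ppl": ...}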
-------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/eval_tasks/_utils/classification_utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from typing import List, Sequence 3 | 4 | import numpy as np 5 | 6 | 7 | def levenshtein_distance(seq1: Sequence, seq2: Sequence): 8 | if seq1 == seq2: 9 | return 0 10 | num_rows = len(seq1) + 1 11 | num_cols = len(seq2) + 1 12 | dp_matrix = np.empty((num_rows, num_cols)) 13 | dp_matrix[0, :] = range(num_cols) 14 | dp_matrix[:, 0] = range(num_rows) 15 | 16 | for i in range(1, num_rows): 17 | for j in range(1, num_cols): 18 | if seq1[i - 1] == seq2[j - 1]: 19 | dp_matrix[i, j] = dp_matrix[i - 1, j - 1] 20 | else: 21 | dp_matrix[i, j] = ( 22 | min( 23 | dp_matrix[i - 1, j - 1], 24 | dp_matrix[i - 1, j], 25 | dp_matrix[i, j - 1], 26 | ) 27 | + 1 28 | ) 29 | 30 | return dp_matrix[num_rows - 1, num_cols - 1] 31 | 32 | 33 | def get_closest_label(pred: Sequence, classes: List[Sequence]) -> int: 34 | min_id = sys.maxsize 35 | min_edit_distance = sys.maxsize 36 | for i, class_label in enumerate(classes): 37 | edit_distance = levenshtein_distance(pred, class_label) 38 | if edit_distance < min_edit_distance: 39 | min_id = i 40 | min_edit_distance = edit_distance 41 | return min_id 42 | 43 | 44 | __all__ = ["levenshtein_distance", "get_closest_label"] 45 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/eval_tasks/_utils/generation_utils.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Union 2 | 3 | from torch import LongTensor 4 | from transformers import PreTrainedTokenizer 5 | 6 | 7 | def postprocess_generation_ids( 8 | input_ids: LongTensor, 9 | output_ids: LongTensor, 10 | num_return_sequences: int, 11 | tokenizer: Optional[PreTrainedTokenizer] = None, 12 | pad_token_ids: Optional[int] = None, 13 | ) -> List[List[Union[str, List[int]]]]: 14 | outputs = [] 15 | for idx, start in enumerate(range(0, len(output_ids), num_return_sequences)): 16 | sub_output_ids = output_ids[start : start + num_return_sequences] 17 | sub_generated_ids = sub_output_ids[..., input_ids[idx].size(0) :] 18 | if tokenizer: 19 | decoded_bach = ( 20 | generated_text 21 | for generated_text in tokenizer.batch_decode(sub_generated_ids, clean_up_tokenization_spaces=True) 22 | ) 23 | decoded_bach = list(decoded_bach) 24 | outputs.append(decoded_bach) 25 | else: 26 | sub_generated_ids = sub_output_ids.cpu().numpy().tolist() 27 | for i, one_sub_generated_ids in enumerate(sub_generated_ids): 28 | if pad_token_ids is not None and pad_token_ids in one_sub_generated_ids: 29 | one_sub_generated_ids = one_sub_generated_ids[: one_sub_generated_ids.index(pad_token_ids)] 30 | sub_generated_ids[i] = one_sub_generated_ids 31 | outputs.append(sub_generated_ids) 32 | 33 | return outputs 34 | 35 | 36 | __all__ = ["postprocess_generation_ids"] 37 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/eval_tasks/language_modeling_task.py: -------------------------------------------------------------------------------- 1 | import math 2 | from typing import Any, Dict, List, Optional 3 | 4 | from torch import LongTensor 5 | 6 | from ._base import BaseTask 7 | 8 | 9 | class LanguageModelingTask(BaseTask): 10 | def __init__( 11 
| self, 12 | model, 13 | tokenizer, 14 | data_name_or_path: str, 15 | prompt_col_name: str, 16 | label_col_name: str, 17 | device: Optional[str] = None, 18 | **kwargs, 19 | ): 20 | kwargs["merge_prompt_label"] = True 21 | super().__init__( 22 | model=model, 23 | tokenizer=tokenizer, 24 | data_name_or_path=data_name_or_path, 25 | prompt_col_name=prompt_col_name, 26 | label_col_name=label_col_name, 27 | device=device, 28 | **kwargs, 29 | ) 30 | 31 | def _predict(self, batch_data: Dict[str, Any], *args, **kwargs) -> List[float]: 32 | outputs = self.model(**batch_data) 33 | loss = outputs.loss.cpu().item() 34 | 35 | return [loss] 36 | 37 | def _parse_labels(self, label_ids: LongTensor) -> List[Any]: 38 | return [] 39 | 40 | def _metric(self, pred: List[Any], label: List[Any]) -> Dict[str, float]: 41 | return {"ppl": math.exp(sum(pred) / len(pred))} 42 | 43 | def run(self) -> Dict[str, float]: 44 | return super().run() 45 | 46 | 47 | __all__ = ["LanguageModelingTask"] 48 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/eval_tasks/sequence_classification_task.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from typing import Any, Dict, List, Optional 3 | 4 | import numpy as np 5 | from torch import LongTensor 6 | from transformers import GenerationConfig, PreTrainedTokenizer 7 | 8 | from ._base import BaseTask 9 | from ._utils.classification_utils import get_closest_label 10 | from ._utils.generation_utils import postprocess_generation_ids 11 | 12 | 13 | def get_predictions( 14 | input_ids: LongTensor, 15 | output_ids: LongTensor, 16 | num_return_sequences: int, 17 | tokenizer: PreTrainedTokenizer, 18 | classes: List[str], 19 | ) -> List[int]: 20 | predictions = [] 21 | generated_texts = postprocess_generation_ids( 22 | input_ids=input_ids, 23 | output_ids=output_ids, 24 | num_return_sequences=num_return_sequences, 25 | tokenizer=tokenizer, 26 | ) 27 | for sub_generated_texts in generated_texts: 28 | sub_predictions = [] 29 | for gen_text in sub_generated_texts: 30 | sub_predictions.append(get_closest_label(gen_text.lower().strip(), classes)) 31 | predictions.append(Counter(sub_predictions).most_common(1)[0][0]) 32 | return predictions 33 | 34 | 35 | class SequenceClassificationTask(BaseTask): 36 | def __init__( 37 | self, 38 | model, 39 | tokenizer: PreTrainedTokenizer, 40 | classes: List[str], 41 | data_name_or_path: str, 42 | prompt_col_name: str, 43 | label_col_name: str, 44 | device: Optional[str] = None, 45 | **kwargs, 46 | ): 47 | kwargs["merge_prompt_label"] = False 48 | super().__init__( 49 | model=model, 50 | tokenizer=tokenizer, 51 | data_name_or_path=data_name_or_path, 52 | prompt_col_name=prompt_col_name, 53 | label_col_name=label_col_name, 54 | device=device, 55 | **kwargs, 56 | ) 57 | self.classes = [each.lower().strip() for each in classes] 58 | classes_ids = self.tokenizer(classes) 59 | self.max_new_tokens = max([len(each) for each in classes_ids]) 60 | 61 | def _predict(self, batch_data: Dict[str, Any], *args, **kwargs) -> List[int]: 62 | generation_config = kwargs["generation_config"] 63 | output_ids = self.model.generate( 64 | input_ids=batch_data["input_ids"], 65 | attention_mask=batch_data["attention_mask"], 66 | generation_config=generation_config, 67 | ) 68 | return get_predictions( 69 | batch_data["input_ids"], 70 | output_ids, 71 | generation_config.num_return_sequences, 72 | self.tokenizer, 73 | 
self.classes, 74 | ) 75 | 76 | def _parse_labels(self, label_ids: LongTensor) -> List[int]: 77 | labels = [] 78 | for one_label_ids in label_ids: 79 | one_label_ids = one_label_ids[(one_label_ids == -100).sum() :] 80 | label = self.tokenizer.decode(one_label_ids, clean_up_tokenization_spaces=True).lower().strip() 81 | label = get_closest_label(label, self.classes) 82 | labels.append(label) 83 | 84 | return labels 85 | 86 | def _metric(self, pred: List[int], label: List[int]) -> Dict[str, float]: 87 | pred = np.array(pred) 88 | label = np.array(label) 89 | 90 | acc = (pred == label).mean() 91 | 92 | return {"acc": acc} 93 | 94 | def run(self, generation_config: Optional[GenerationConfig] = None) -> Dict[str, float]: 95 | if not generation_config: 96 | generation_config = GenerationConfig(num_beams=1, do_sample=False, num_return_sequences=1) 97 | generation_config.max_new_tokens = self.max_new_tokens 98 | generation_config.eos_token_id = self.tokenizer.eos_token_id 99 | generation_config.pad_token_id = self.tokenizer.pad_token_id 100 | return super().run(generation_config=generation_config) 101 | 102 | 103 | __all__ = ["SequenceClassificationTask"] 104 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/eval_tasks/text_summarization_task.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List, Optional 2 | 3 | import rouge 4 | from torch import LongTensor 5 | from transformers import GenerationConfig 6 | 7 | from ._base import BaseTask 8 | from ._utils.generation_utils import postprocess_generation_ids 9 | 10 | 11 | class TextSummarizationTask(BaseTask): 12 | def __init__( 13 | self, 14 | model, 15 | tokenizer, 16 | data_name_or_path: str, 17 | prompt_col_name: str, 18 | label_col_name: str, 19 | device: Optional[str] = None, 20 | **kwargs, 21 | ): 22 | kwargs["merge_prompt_label"] = False 23 | super().__init__( 24 | model=model, 25 | tokenizer=tokenizer, 26 | data_name_or_path=data_name_or_path, 27 | prompt_col_name=prompt_col_name, 28 | label_col_name=label_col_name, 29 | device=device, 30 | **kwargs, 31 | ) 32 | 33 | def _predict(self, batch_data: Dict[str, Any], *args, **kwargs) -> List[str]: 34 | generation_config = kwargs["generation_config"] 35 | output_ids = self.model.generate( 36 | input_ids=batch_data["input_ids"], 37 | attention_mask=batch_data["attention_mask"], 38 | generation_config=generation_config, 39 | ) 40 | return [ 41 | each[0].lower().strip() 42 | for each in postprocess_generation_ids( 43 | input_ids=batch_data["input_ids"], 44 | output_ids=output_ids, 45 | num_return_sequences=generation_config.num_return_sequences, 46 | tokenizer=self.tokenizer, 47 | ) 48 | ] 49 | 50 | def _parse_labels(self, label_ids: LongTensor) -> List[str]: 51 | labels = [] 52 | for one_label_ids in label_ids: 53 | one_label_ids = one_label_ids[(one_label_ids == -100).sum() :] 54 | label = self.tokenizer.decode(one_label_ids).lower().strip() 55 | labels.append(label) 56 | 57 | return labels 58 | 59 | def _metric(self, pred: List[Any], label: List[Any]) -> Dict[str, Dict[str, float]]: 60 | metric = rouge.Rouge() 61 | return metric.get_scores(hyps=pred, refs=label, avg=True) 62 | 63 | def run(self, generation_config: Optional[GenerationConfig] = None) -> Dict[str, float]: 64 | if not generation_config: 65 | generation_config = GenerationConfig(num_beams=1, do_sample=False, max_new_tokens=128) 66 | generation_config.num_return_sequences = 1 67 
| generation_config.eos_token_id = self.tokenizer.eos_token_id 68 | generation_config.pad_token_id = self.tokenizer.pad_token_id 69 | return super().run(generation_config=generation_config) 70 | 71 | 72 | __all__ = ["TextSummarizationTask"] 73 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | from ._base import BaseGPTQForCausalLM, BaseQuantizeConfig 2 | from .auto import GPTQ_CAUSAL_LM_MODEL_MAP, AutoGPTQForCausalLM 3 | from .baichuan import BaiChuanGPTQForCausalLM 4 | from .bloom import BloomGPTQForCausalLM 5 | from .codegen import CodeGenGPTQForCausalLM 6 | from .decilm import DeciLMGPTQForCausalLM 7 | from .gemma import GemmaGPTQForCausalLM 8 | from .gpt2 import GPT2GPTQForCausalLM 9 | from .gpt_bigcode import GPTBigCodeGPTQForCausalLM 10 | from .gpt_neox import GPTNeoXGPTQForCausalLM 11 | from .gptj import GPTJGPTQForCausalLM 12 | from .internlm import InternLMGPTQForCausalLM 13 | from .llama import LlamaGPTQForCausalLM 14 | from .longllama import LongLlamaGPTQForCausalLM 15 | from .mistral import MistralGPTQForCausalLM 16 | from .mixtral import MixtralGPTQForCausalLM 17 | from .moss import MOSSGPTQForCausalLM 18 | from .opt import OPTGPTQForCausalLM 19 | from .qwen import QwenGPTQForCausalLM 20 | from .qwen2 import Qwen2GPTQForCausalLM 21 | from .rw import RWGPTQForCausalLM 22 | from .stablelmepoch import StableLMEpochGPTQForCausalLM 23 | from .xverse import XverseGPTQForCausalLM 24 | from .yi import YiGPTQForCausalLM 25 | from .deepseek import DeepseekGPTQForCausalLM -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/_const.py: -------------------------------------------------------------------------------- 1 | from torch import device 2 | 3 | from ..utils.import_utils import compare_transformers_version 4 | 5 | 6 | CPU = device("cpu") 7 | CUDA_0 = device("cuda:0") 8 | 9 | SUPPORTED_MODELS = [ 10 | "bloom", 11 | "gptj", 12 | "gpt2", 13 | "gpt_neox", 14 | "opt", 15 | "moss", 16 | "gpt_bigcode", 17 | "codegen", 18 | "RefinedWebModel", 19 | "RefinedWeb", 20 | "baichuan", 21 | "internlm", 22 | "qwen", 23 | "xverse", 24 | "deci", 25 | "stablelm_epoch", 26 | "deepseek", 27 | ] 28 | if compare_transformers_version("v4.28.0", op="ge"): 29 | SUPPORTED_MODELS.append("llama") 30 | if compare_transformers_version("v4.30.0", op="ge"): 31 | SUPPORTED_MODELS.append("longllama") 32 | if compare_transformers_version("v4.33.0", op="ge"): 33 | SUPPORTED_MODELS.append("falcon") 34 | if compare_transformers_version("v4.34.0", op="ge"): 35 | SUPPORTED_MODELS.append("mistral") 36 | SUPPORTED_MODELS.append("Yi") 37 | if compare_transformers_version("v4.36.0", op="ge"): 38 | SUPPORTED_MODELS.append("mixtral") 39 | if compare_transformers_version("v4.37.0", op="ge"): 40 | SUPPORTED_MODELS.append("qwen2") 41 | if compare_transformers_version("v4.38.0", op="ge"): 42 | SUPPORTED_MODELS.append("gemma") 43 | 44 | 45 | EXLLAMA_DEFAULT_MAX_INPUT_LENGTH = 2048 46 | 47 | __all__ = ["CPU", "CUDA_0", "SUPPORTED_MODELS", "EXLLAMA_DEFAULT_MAX_INPUT_LENGTH"] 48 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/baichuan.py: -------------------------------------------------------------------------------- 1 | from ._base import 
BaseGPTQForCausalLM 2 | 3 | 4 | class BaiChuanGPTQForCausalLM(BaseGPTQForCausalLM): 5 | layer_type = "DecoderLayer" 6 | layers_block_name = "model.layers" 7 | outside_layer_modules = ["model.embed_tokens", "model.norm"] 8 | inside_layer_modules = [ 9 | ["self_attn.W_pack"], 10 | ["self_attn.o_proj"], 11 | ["mlp.up_proj", "mlp.gate_proj"], 12 | ["mlp.down_proj"], 13 | ] 14 | 15 | 16 | __all__ = ["BaiChuanGPTQForCausalLM"] 17 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/bloom.py: -------------------------------------------------------------------------------- 1 | from ._base import BaseGPTQForCausalLM 2 | 3 | 4 | class BloomGPTQForCausalLM(BaseGPTQForCausalLM): 5 | layer_type = "BloomBlock" 6 | layers_block_name = "transformer.h" 7 | outside_layer_modules = [ 8 | "transformer.word_embeddings", 9 | "transformer.word_embeddings_layernorm", 10 | "transformer.ln_f", 11 | ] 12 | inside_layer_modules = [ 13 | ["self_attention.query_key_value"], 14 | ["self_attention.dense"], 15 | ["mlp.dense_h_to_4h"], 16 | ["mlp.dense_4h_to_h"], 17 | ] 18 | 19 | 20 | __all__ = ["BloomGPTQForCausalLM"] 21 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/codegen.py: -------------------------------------------------------------------------------- 1 | from ._base import BaseGPTQForCausalLM 2 | 3 | 4 | class CodeGenGPTQForCausalLM(BaseGPTQForCausalLM): 5 | layer_type = "CodeGenBlock" 6 | layers_block_name = "transformer.h" 7 | outside_layer_modules = ["transformer.wte", "transformer.ln_f"] 8 | inside_layer_modules = [ 9 | ["attn.qkv_proj"], 10 | ["attn.out_proj"], 11 | ["mlp.fc_in"], 12 | ["mlp.fc_out"], 13 | ] 14 | 15 | 16 | __all__ = ["CodeGenGPTQForCausalLM"] 17 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/decilm.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | 3 | from ..utils.import_utils import compare_transformers_version 4 | from ._base import BaseGPTQForCausalLM 5 | 6 | 7 | if compare_transformers_version("v4.28.0", op="ge"): 8 | from ..nn_modules.fused_llama_attn import FusedLlamaAttentionForQuantizedModel 9 | from ..nn_modules.fused_llama_mlp import FusedLlamaMLPForQuantizedModel 10 | else: 11 | FusedLlamaAttentionForQuantizedModel = None 12 | FusedLlamaMLPForQuantizedModel = None 13 | 14 | logger = getLogger(__name__) 15 | 16 | 17 | class DeciLMGPTQForCausalLM(BaseGPTQForCausalLM): 18 | layer_type = "DeciLMDecoderLayer" 19 | layers_block_name = "model.layers" 20 | outside_layer_modules = ["model.embed_tokens", "model.norm"] 21 | inside_layer_modules = [ 22 | ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], 23 | ["self_attn.o_proj"], 24 | ["mlp.up_proj", "mlp.gate_proj"], 25 | ["mlp.down_proj"], 26 | ] 27 | 28 | fused_attn_module_type = FusedLlamaAttentionForQuantizedModel 29 | fused_mlp_module_type = FusedLlamaMLPForQuantizedModel 30 | 31 | 32 | __all__ = ["DeciLMGPTQForCausalLM"] 33 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/deepseek.py: -------------------------------------------------------------------------------- 1 | from ._base import BaseGPTQForCausalLM 2 | 3 | 4 | n_shared_experts = 2 5 | 
n_routed_experts = 64 6 | 7 | 8 | class DeepseekGPTQForCausalLM(BaseGPTQForCausalLM): 9 | layer_type = "DeepseekDecoderLayer" 10 | layers_block_name = "model.layers" 11 | outside_layer_modules = ["model.embed_tokens", "model.norm"] 12 | inside_layer_modules = [ 13 | ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], 14 | ["self_attn.o_proj"], 15 | [f"mlp.experts.{i}.gate_proj" for i in range(n_routed_experts)] + [f"mlp.experts.{i}.up_proj" for i in range(n_routed_experts)], 16 | [f"mlp.experts.{i}.down_proj" for i in range(n_routed_experts)], 17 | ["mlp.shared_experts.gate_proj"] + ["mlp.shared_experts.up_proj"], 18 | ["mlp.shared_experts.down_proj"], 19 | ] 20 | 21 | 22 | __all__ = ["DeepseekGPTQForCausalLM"] 23 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/gemma.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | 3 | from ._base import BaseGPTQForCausalLM 4 | 5 | 6 | logger = getLogger(__name__) 7 | 8 | 9 | class GemmaGPTQForCausalLM(BaseGPTQForCausalLM): 10 | layer_type = "GemmaDecoderLayer" 11 | layers_block_name = "model.layers" 12 | outside_layer_modules = ["model.embed_tokens", "model.norm"] 13 | inside_layer_modules = [ 14 | ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], 15 | ["self_attn.o_proj"], 16 | ["mlp.up_proj", "mlp.gate_proj"], 17 | ["mlp.down_proj"], 18 | ] 19 | 20 | 21 | __all__ = ["GemmaGPTQForCausalLM"] 22 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/gpt2.py: -------------------------------------------------------------------------------- 1 | from ._base import BaseGPTQForCausalLM 2 | 3 | 4 | class GPT2GPTQForCausalLM(BaseGPTQForCausalLM): 5 | layer_type = "GPT2Block" 6 | layers_block_name = "transformer.h" 7 | outside_layer_modules = ["transformer.wte", "transformer.wpe", "transformer.ln_f"] 8 | inside_layer_modules = [ 9 | ["attn.c_attn"], 10 | ["attn.c_proj"], 11 | ["mlp.c_fc"], 12 | ["mlp.c_proj"], 13 | ] 14 | 15 | 16 | __all__ = ["GPT2GPTQForCausalLM"] 17 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/gpt_bigcode.py: -------------------------------------------------------------------------------- 1 | from ._base import BaseGPTQForCausalLM 2 | 3 | 4 | class GPTBigCodeGPTQForCausalLM(BaseGPTQForCausalLM): 5 | layer_type = "GPTBigCodeBlock" 6 | layers_block_name = "transformer.h" 7 | outside_layer_modules = ["transformer.wpe", "transformer.wte", "transformer.ln_f"] 8 | inside_layer_modules = [ 9 | ["attn.c_attn"], 10 | ["attn.c_proj"], 11 | ["mlp.c_fc"], 12 | ["mlp.c_proj"], 13 | ] 14 | 15 | 16 | __all__ = ["GPTBigCodeGPTQForCausalLM"] 17 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/gpt_neox.py: -------------------------------------------------------------------------------- 1 | from ._base import BaseGPTQForCausalLM 2 | 3 | 4 | class GPTNeoXGPTQForCausalLM(BaseGPTQForCausalLM): 5 | layer_type = "GPTNeoXLayer" 6 | layers_block_name = "gpt_neox.layers" 7 | outside_layer_modules = ["gpt_neox.embed_in", "gpt_neox.final_layer_norm"] 8 | inside_layer_modules = [ 9 | ["attention.query_key_value"], 10 | ["attention.dense"], 11 | ["mlp.dense_h_to_4h"], 12 |
["mlp.dense_4h_to_h"], 13 | ] 14 | lm_head_name = "embed_out" 15 | 16 | 17 | __all__ = ["GPTNeoXGPTQForCausalLM"] 18 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/gptj.py: -------------------------------------------------------------------------------- 1 | from ..nn_modules.fused_gptj_attn import FusedGPTJAttentionForQuantizedModel 2 | from ._base import BaseGPTQForCausalLM 3 | 4 | 5 | class GPTJGPTQForCausalLM(BaseGPTQForCausalLM): 6 | layer_type = "GPTJBlock" 7 | layers_block_name = "transformer.h" 8 | outside_layer_modules = ["transformer.wte", "transformer.ln_f"] 9 | inside_layer_modules = [ 10 | ["attn.k_proj", "attn.v_proj", "attn.q_proj"], 11 | ["attn.out_proj"], 12 | ["mlp.fc_in"], 13 | ["mlp.fc_out"], 14 | ] 15 | 16 | fused_attn_module_type = FusedGPTJAttentionForQuantizedModel 17 | 18 | 19 | __all__ = ["GPTJGPTQForCausalLM"] 20 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/internlm.py: -------------------------------------------------------------------------------- 1 | from ._base import BaseGPTQForCausalLM 2 | 3 | 4 | class InternLMGPTQForCausalLM(BaseGPTQForCausalLM): 5 | layer_type = "InternLMDecoderLayer" 6 | layers_block_name = "model.layers" 7 | outside_layer_modules = ["model.embed_tokens", "model.norm"] 8 | inside_layer_modules = [ 9 | ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], 10 | ["self_attn.o_proj"], 11 | ["mlp.up_proj", "mlp.gate_proj"], 12 | ["mlp.down_proj"], 13 | ] 14 | 15 | 16 | __all__ = ["InternLMGPTQForCausalLM"] 17 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/llama.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | 3 | from ..utils.import_utils import compare_transformers_version 4 | from ._base import BaseGPTQForCausalLM 5 | 6 | 7 | if compare_transformers_version("v4.28.0", op="ge"): 8 | from ..nn_modules.fused_llama_attn import FusedLlamaAttentionForQuantizedModel 9 | from ..nn_modules.fused_llama_mlp import FusedLlamaMLPForQuantizedModel 10 | else: 11 | FusedLlamaAttentionForQuantizedModel = None 12 | FusedLlamaMLPForQuantizedModel = None 13 | 14 | logger = getLogger(__name__) 15 | 16 | 17 | class LlamaGPTQForCausalLM(BaseGPTQForCausalLM): 18 | layer_type = "LlamaDecoderLayer" 19 | layers_block_name = "model.layers" 20 | outside_layer_modules = ["model.embed_tokens", "model.norm"] 21 | inside_layer_modules = [ 22 | ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], 23 | ["self_attn.o_proj"], 24 | ["mlp.up_proj", "mlp.gate_proj"], 25 | ["mlp.down_proj"], 26 | ] 27 | 28 | fused_attn_module_type = FusedLlamaAttentionForQuantizedModel 29 | fused_mlp_module_type = FusedLlamaMLPForQuantizedModel 30 | 31 | 32 | __all__ = ["LlamaGPTQForCausalLM"] 33 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/longllama.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | 3 | from ..utils.import_utils import compare_transformers_version 4 | from ._base import BaseGPTQForCausalLM 5 | 6 | 7 | if compare_transformers_version("v4.28.0", op="ge"): 8 | from ..nn_modules.fused_llama_attn import 
FusedLlamaAttentionForQuantizedModel 9 | from ..nn_modules.fused_llama_mlp import FusedLlamaMLPForQuantizedModel 10 | else: 11 | FusedLlamaAttentionForQuantizedModel = None 12 | FusedLlamaMLPForQuantizedModel = None 13 | 14 | logger = getLogger(__name__) 15 | 16 | 17 | class LongLlamaGPTQForCausalLM(BaseGPTQForCausalLM): 18 | layer_type = "LongLlamaDecoderLayer" 19 | layers_block_name = "model.layers" 20 | outside_layer_modules = ["model.embed_tokens", "model.norm"] 21 | inside_layer_modules = [ 22 | ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], 23 | ["self_attn.o_proj"], 24 | ["mlp.up_proj", "mlp.gate_proj"], 25 | ["mlp.down_proj"], 26 | ] 27 | 28 | fused_attn_module_type = FusedLlamaAttentionForQuantizedModel 29 | fused_mlp_module_type = FusedLlamaMLPForQuantizedModel 30 | 31 | 32 | __all__ = ["LongLlamaGPTQForCausalLM"] 33 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/mistral.py: -------------------------------------------------------------------------------- 1 | from ._base import BaseGPTQForCausalLM 2 | from ..utils.import_utils import compare_transformers_version 3 | 4 | if compare_transformers_version("v4.28.0", op="ge"): 5 | from ..nn_modules.fused_llama_attn import FusedLlamaAttentionForQuantizedModel 6 | from ..nn_modules.fused_llama_mlp import FusedLlamaMLPForQuantizedModel 7 | else: 8 | FusedLlamaAttentionForQuantizedModel = None 9 | FusedLlamaMLPForQuantizedModel = None 10 | 11 | 12 | class MistralGPTQForCausalLM(BaseGPTQForCausalLM): 13 | layer_type = "MistralDecoderLayer" 14 | layers_block_name = "model.layers" 15 | outside_layer_modules = ["model.embed_tokens", "model.norm"] 16 | inside_layer_modules = [ 17 | ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], 18 | ["self_attn.o_proj"], 19 | ["mlp.up_proj", "mlp.gate_proj"], 20 | ["mlp.down_proj"], 21 | ] 22 | 23 | fused_attn_module_type = FusedLlamaAttentionForQuantizedModel 24 | fused_mlp_module_type = FusedLlamaMLPForQuantizedModel 25 | 26 | __all__ = ["MistralGPTQForCausalLM"] 27 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/mixtral.py: -------------------------------------------------------------------------------- 1 | from ._base import BaseGPTQForCausalLM 2 | 3 | 4 | class MixtralGPTQForCausalLM(BaseGPTQForCausalLM): 5 | layer_type = "MixtralDecoderLayer" 6 | layers_block_name = "model.layers" 7 | outside_layer_modules = ["model.embed_tokens", "model.norm"] 8 | inside_layer_modules = [ 9 | ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], 10 | ["self_attn.o_proj"], 11 | [ 12 | "block_sparse_moe.experts.0.w1", 13 | "block_sparse_moe.experts.1.w1", 14 | "block_sparse_moe.experts.2.w1", 15 | "block_sparse_moe.experts.3.w1", 16 | "block_sparse_moe.experts.4.w1", 17 | "block_sparse_moe.experts.5.w1", 18 | "block_sparse_moe.experts.6.w1", 19 | "block_sparse_moe.experts.7.w1", 20 | "block_sparse_moe.experts.0.w3", 21 | "block_sparse_moe.experts.1.w3", 22 | "block_sparse_moe.experts.2.w3", 23 | "block_sparse_moe.experts.3.w3", 24 | "block_sparse_moe.experts.4.w3", 25 | "block_sparse_moe.experts.5.w3", 26 | "block_sparse_moe.experts.6.w3", 27 | "block_sparse_moe.experts.7.w3", 28 | ], 29 | [ 30 | "block_sparse_moe.experts.0.w2", 31 | "block_sparse_moe.experts.1.w2", 32 | "block_sparse_moe.experts.2.w2", 33 | "block_sparse_moe.experts.3.w2", 34 | "block_sparse_moe.experts.4.w2", 35 | 
"block_sparse_moe.experts.5.w2", 36 | "block_sparse_moe.experts.6.w2", 37 | "block_sparse_moe.experts.7.w2", 38 | ], 39 | ] 40 | 41 | 42 | __all__ = ["MixtralGPTQForCausalLM"] 43 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/moss.py: -------------------------------------------------------------------------------- 1 | from ._base import BaseGPTQForCausalLM 2 | 3 | 4 | class MOSSGPTQForCausalLM(BaseGPTQForCausalLM): 5 | layer_type = "MossBlock" 6 | layers_block_name = "transformer.h" 7 | outside_layer_modules = ["transformer.wte", "transformer.ln_f"] 8 | inside_layer_modules = [ 9 | ["attn.qkv_proj"], 10 | ["attn.out_proj"], 11 | ["mlp.fc_in"], 12 | ["mlp.fc_out"], 13 | ] 14 | 15 | 16 | __all__ = ["MOSSGPTQForCausalLM"] 17 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/opt.py: -------------------------------------------------------------------------------- 1 | from ._base import BaseGPTQForCausalLM 2 | 3 | 4 | class OPTGPTQForCausalLM(BaseGPTQForCausalLM): 5 | layer_type = "OPTDecoderLayer" 6 | layers_block_name = "model.decoder.layers" 7 | outside_layer_modules = [ 8 | "model.decoder.embed_tokens", 9 | "model.decoder.embed_positions", 10 | "model.decoder.project_out", 11 | "model.decoder.project_in", 12 | "model.decoder.final_layer_norm", 13 | ] 14 | inside_layer_modules = [ 15 | ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], 16 | ["self_attn.out_proj"], 17 | ["fc1"], 18 | ["fc2"], 19 | ] 20 | 21 | 22 | __all__ = ["OPTGPTQForCausalLM"] 23 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/qwen.py: -------------------------------------------------------------------------------- 1 | from ._base import BaseGPTQForCausalLM 2 | 3 | 4 | class QwenGPTQForCausalLM(BaseGPTQForCausalLM): 5 | layer_type = "QWenBlock" 6 | layers_block_name = "transformer.h" 7 | outside_layer_modules = [ 8 | "transformer.wte", 9 | "transformer.wpe", 10 | "transformer.ln_f", 11 | "transformer.visual", 12 | ] 13 | inside_layer_modules = [ 14 | ["attn.c_attn"], 15 | ["attn.c_proj"], 16 | ["mlp.w1", "mlp.w2"], 17 | ["mlp.c_proj"], 18 | ] 19 | 20 | 21 | __all__ = ["QwenGPTQForCausalLM"] 22 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/qwen2.py: -------------------------------------------------------------------------------- 1 | from ._base import BaseGPTQForCausalLM 2 | 3 | 4 | class Qwen2GPTQForCausalLM(BaseGPTQForCausalLM): 5 | layer_type = "Qwen2DecoderLayer" 6 | layers_block_name = "model.layers" 7 | outside_layer_modules = ["model.embed_tokens", "model.norm"] 8 | inside_layer_modules = [ 9 | ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], 10 | ["self_attn.o_proj"], 11 | ["mlp.up_proj", "mlp.gate_proj"], 12 | ["mlp.down_proj"], 13 | ] 14 | 15 | 16 | __all__ = ["Qwen2GPTQForCausalLM"] 17 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/rw.py: -------------------------------------------------------------------------------- 1 | from ._base import BaseGPTQForCausalLM 2 | 3 | 4 | class RWGPTQForCausalLM(BaseGPTQForCausalLM): 5 | layer_type = "DecoderLayer" 6 | layers_block_name = "transformer.h" 
7 | outside_layer_modules = ["transformer.word_embeddings", "transformer.ln_f"] 8 | inside_layer_modules = [ 9 | ["self_attention.query_key_value"], 10 | ["self_attention.dense"], 11 | ["mlp.dense_h_to_4h"], 12 | ["mlp.dense_4h_to_h"], 13 | ] 14 | 15 | 16 | __all__ = ["RWGPTQForCausalLM"] 17 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/stablelmepoch.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | 3 | from ..utils.import_utils import compare_transformers_version 4 | from ._base import BaseGPTQForCausalLM 5 | 6 | 7 | if compare_transformers_version("v4.28.0", op="ge"): 8 | from ..nn_modules.fused_llama_attn import FusedLlamaAttentionForQuantizedModel 9 | from ..nn_modules.fused_llama_mlp import FusedLlamaMLPForQuantizedModel 10 | else: 11 | FusedLlamaAttentionForQuantizedModel = None 12 | FusedLlamaMLPForQuantizedModel = None 13 | 14 | logger = getLogger(__name__) 15 | 16 | 17 | class StableLMEpochGPTQForCausalLM(BaseGPTQForCausalLM): 18 | layer_type = "DecoderLayer" 19 | layers_block_name = "model.layers" 20 | outside_layer_modules = ["model.embed_tokens", "model.norm"] 21 | inside_layer_modules = [ 22 | ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], 23 | ["self_attn.o_proj"], 24 | ["mlp.up_proj", "mlp.gate_proj"], 25 | ["mlp.down_proj"], 26 | ] 27 | 28 | fused_attn_module_type = FusedLlamaAttentionForQuantizedModel 29 | fused_mlp_module_type = FusedLlamaMLPForQuantizedModel 30 | 31 | 32 | __all__ = ["StableLMEpochGPTQForCausalLM"] 33 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/xverse.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | 3 | from ..utils.import_utils import compare_transformers_version 4 | from ._base import BaseGPTQForCausalLM 5 | 6 | 7 | if compare_transformers_version("v4.28.0", op="ge"): 8 | from ..nn_modules.fused_llama_attn import FusedLlamaAttentionForQuantizedModel 9 | from ..nn_modules.fused_llama_mlp import FusedLlamaMLPForQuantizedModel 10 | else: 11 | FusedLlamaAttentionForQuantizedModel = None 12 | FusedLlamaMLPForQuantizedModel = None 13 | 14 | logger = getLogger(__name__) 15 | 16 | 17 | class XverseGPTQForCausalLM(BaseGPTQForCausalLM): 18 | layer_type = "XverseDecoderLayer" 19 | layers_block_name = "model.layers" 20 | outside_layer_modules = ["model.embed_tokens", "model.norm"] 21 | inside_layer_modules = [ 22 | ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], 23 | ["self_attn.o_proj"], 24 | ["mlp.up_proj", "mlp.gate_proj"], 25 | ["mlp.down_proj"], 26 | ] 27 | 28 | fused_attn_module_type = FusedLlamaAttentionForQuantizedModel 29 | fused_mlp_module_type = FusedLlamaMLPForQuantizedModel 30 | 31 | 32 | __all__ = ["XverseGPTQForCausalLM"] 33 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/modeling/yi.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | 3 | from ..utils.import_utils import compare_transformers_version 4 | from ._base import BaseGPTQForCausalLM 5 | 6 | 7 | if compare_transformers_version("v4.28.0", op="ge"): 8 | from ..nn_modules.fused_llama_attn import FusedLlamaAttentionForQuantizedModel 9 | from 
..nn_modules.fused_llama_mlp import FusedLlamaMLPForQuantizedModel 10 | else: 11 | FusedLlamaAttentionForQuantizedModel = None 12 | FusedLlamaMLPForQuantizedModel = None 13 | 14 | logger = getLogger(__name__) 15 | 16 | 17 | class YiGPTQForCausalLM(BaseGPTQForCausalLM): 18 | layer_type = "YiDecoderLayer" 19 | layers_block_name = "model.layers" 20 | outside_layer_modules = ["model.embed_tokens", "model.norm"] 21 | inside_layer_modules = [ 22 | ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], 23 | ["self_attn.o_proj"], 24 | ["mlp.up_proj", "mlp.gate_proj"], 25 | ["mlp.down_proj"], 26 | ] 27 | 28 | fused_attn_module_type = FusedLlamaAttentionForQuantizedModel 29 | fused_mlp_module_type = FusedLlamaMLPForQuantizedModel 30 | 31 | 32 | __all__ = ["YiGPTQForCausalLM"] 33 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/nn_modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/nn_modules/__init__.py -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/nn_modules/_fused_base.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from logging import getLogger 3 | 4 | import torch.nn as nn 5 | 6 | from .triton_utils.mixin import TritonModuleMixin 7 | 8 | 9 | logger = getLogger(__name__) 10 | 11 | 12 | class FusedBaseModule(nn.Module, TritonModuleMixin): 13 | @classmethod 14 | @abstractmethod 15 | def inject_to_model(cls, *args, **kwargs): 16 | raise NotImplementedError() 17 | 18 | 19 | class FusedBaseAttentionModule(FusedBaseModule): 20 | @classmethod 21 | @abstractmethod 22 | def inject_to_model( 23 | cls, model, use_triton=False, group_size=-1, use_cuda_fp16=True, desc_act=False, trainable=False, **kwargs 24 | ): 25 | raise NotImplementedError() 26 | 27 | @classmethod 28 | def warmup(cls, model, transpose=False, seqlen=2048): 29 | pass 30 | 31 | 32 | class FusedBaseMLPModule(FusedBaseModule): 33 | @classmethod 34 | @abstractmethod 35 | def inject_to_model(cls, model, use_triton=False, **kwargs): 36 | raise NotImplementedError() 37 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/nn_modules/qlinear/__init__.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | class GeneralQuantLinear(nn.Linear): 5 | def __init__(self, quant_linear_module): 6 | super().__init__( 7 | in_features=quant_linear_module.infeatures, 8 | out_features=quant_linear_module.outfeatures, 9 | bias=True, 10 | ) 11 | self.infeatures = quant_linear_module.infeatures 12 | self.outfeatures = quant_linear_module.outfeatures 13 | self.bits = quant_linear_module.bits 14 | self.group_size = quant_linear_module.group_size 15 | self.maxq = quant_linear_module.maxq 16 | 17 | self.weight.requires_grad = False 18 | 19 | self.weight.data = quant_linear_module.qweight 20 | self.register_buffer("qweight", quant_linear_module.qweight) 21 | self.bias.data = quant_linear_module.bias 22 | 23 | self.qweight.requires_grad = False 24 | self.bias.requires_grad = False 25 | 26 | self.register_buffer("qzeros", quant_linear_module.qzeros) 27 | 
self.register_buffer("scales", quant_linear_module.scales) 28 | self.register_buffer("g_idx", quant_linear_module.g_idx) 29 | 30 | if hasattr(quant_linear_module, "wf"): 31 | self.wf = quant_linear_module.wf 32 | if hasattr(quant_linear_module, "kernel_switch_threshold"): 33 | self.kernel_switch_threshold = quant_linear_module.kernel_switch_threshold 34 | if hasattr(quant_linear_module, "autogptq_cuda_available"): 35 | self.autogptq_cuda_available = quant_linear_module.autogptq_cuda_available 36 | 37 | self.trainable = quant_linear_module.trainable 38 | 39 | self.forward = quant_linear_module.forward 40 | 41 | @classmethod 42 | def inject_to_model(cls, model, target_module_type): 43 | for name, m in model.named_modules(): 44 | if not isinstance(m, target_module_type): 45 | continue 46 | new_m = cls(m) 47 | if "." in name: 48 | parent_name = name.rsplit(".", 1)[0] 49 | child_name = name[len(parent_name) + 1 :] 50 | parent = model.get_submodule(parent_name) 51 | else: 52 | parent_name = "" 53 | parent = model 54 | child_name = name 55 | 56 | setattr(parent, child_name, new_m) 57 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/nn_modules/triton_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/nn_modules/triton_utils/__init__.py -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/nn_modules/triton_utils/mixin.py: -------------------------------------------------------------------------------- 1 | class TritonModuleMixin: 2 | @classmethod 3 | def warmup(cls, model, transpose=False, seqlen=2048): 4 | pass 5 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/quantization/__init__.py: -------------------------------------------------------------------------------- 1 | from .gptq import GPTQ 2 | from .quantizer import Quantizer, quantize 3 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .perplexity_utils import Perplexity 2 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/utils/exllama_utils.py: -------------------------------------------------------------------------------- 1 | import gc 2 | 3 | import torch 4 | 5 | from ..nn_modules.qlinear.qlinear_exllama import QuantLinear as ExllamaQuantLinear 6 | 7 | 8 | def exllama_set_max_input_length(model, max_input_length: int): 9 | """ 10 | This method does not necessarily require `model` to inherit from BaseGPTQForCausalLM. 11 | 12 | When using the exllama backend with act-order, it is necessary to initialize a buffer that depends on the maximum expected input length. In case the 13 | default used (EXLLAMA_DEFAULT_MAX_INPUT_LENGTH) is too short, this method can be called to extend the buffer size without reloading the whole model. 14 | """ 15 | 16 | # The import is set here to avoid a global import. Arguably this is quite ugly, it would be better to have lazy loading. 
17 | from exllama_kernels import cleanup_buffers_cuda, prepare_buffers 18 | 19 | if not model.quantize_config.desc_act: 20 | raise ValueError( 21 | "The method exllama_set_max_input_length should be called only when using the exllama backend **with act-order**." 22 | ) 23 | 24 | uses_exllama = False 25 | for name, submodule in model.named_modules(): 26 | if isinstance(submodule, ExllamaQuantLinear): 27 | uses_exllama = True 28 | 29 | if not uses_exllama: 30 | raise ValueError( 31 | f"The function exllama_set_max_input_length was called, but the model (instance of {model.__class__.__name__}) does not use the exllama backend for GPTQ. An other implementation is used (exllamav2, cuda, cuda-old, triton) and that the call to exllama_set_max_input_length is unnecessary. Please remove the call to exllama_set_max_input_length or use the exllama v1 backend." 32 | ) 33 | 34 | device_to_buffers_size = {} 35 | for device, buffers in model.device_to_buffers.items(): 36 | device_to_buffers_size[device] = { 37 | "max_dq_buffer_size": buffers["max_dq_buffer_size"], 38 | "max_inner_outer_dim": buffers["max_inner_outer_dim"], 39 | } 40 | 41 | # For an unknown reason calling just `del model.device_to_buffers` raises an AttributeError. 42 | for key in list(model.device_to_buffers.keys()): 43 | del model.device_to_buffers[key] 44 | model.device_to_buffers = None 45 | del model.device_to_buffers 46 | 47 | gc.collect() 48 | torch.cuda.empty_cache() 49 | cleanup_buffers_cuda() 50 | 51 | device_to_buffers = {} 52 | for device, buffers_size in device_to_buffers_size.items(): 53 | # The temp_state buffer is required to reorder X in the act-order case. 54 | # The temp_dq buffer is required to dequantize weights when using cuBLAS, typically for the prefill. 55 | device_to_buffers[device] = { 56 | "temp_state": torch.zeros( 57 | (max_input_length, buffers_size["max_inner_outer_dim"]), 58 | dtype=torch.float16, 59 | device=device, 60 | ), 61 | "temp_dq": torch.zeros( 62 | (1, buffers_size["max_dq_buffer_size"]), 63 | dtype=torch.float16, 64 | device=device, 65 | ), 66 | "max_dq_buffer_size": buffers_size["max_dq_buffer_size"], 67 | "max_inner_outer_dim": buffers_size["max_inner_outer_dim"], 68 | } 69 | 70 | prepare_buffers( 71 | device, 72 | device_to_buffers[device]["temp_state"], 73 | device_to_buffers[device]["temp_dq"], 74 | ) 75 | 76 | # Buffers need to be persistent to avoid any bug. 
77 | model.device_to_buffers = device_to_buffers 78 | 79 | return model 80 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/utils/import_utils.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | from typing import Optional 3 | 4 | import torch 5 | from packaging.version import parse as parse_version 6 | 7 | 8 | try: 9 | import triton # noqa: F401 10 | 11 | TRITON_AVAILABLE = True 12 | except ImportError: 13 | TRITON_AVAILABLE = False 14 | 15 | try: 16 | import autogptq_cuda_64 # noqa: F401 17 | 18 | AUTOGPTQ_CUDA_AVAILABLE = True 19 | except Exception: 20 | AUTOGPTQ_CUDA_AVAILABLE = False 21 | 22 | 23 | try: 24 | import exllama_kernels # noqa: F401 25 | 26 | EXLLAMA_KERNELS_AVAILABLE = True 27 | except Exception: 28 | EXLLAMA_KERNELS_AVAILABLE = False 29 | 30 | try: 31 | import exllamav2_kernels # noqa: F401 32 | 33 | EXLLAMAV2_KERNELS_AVAILABLE = True 34 | except Exception: 35 | EXLLAMAV2_KERNELS_AVAILABLE = False 36 | 37 | try: 38 | import cQIGen # noqa: F401 39 | 40 | QIGEN_AVAILABLE = True 41 | QIGEN_EXCEPTION = None 42 | except Exception as e: 43 | QIGEN_AVAILABLE = False 44 | QIGEN_EXCEPTION = e 45 | 46 | try: 47 | import autogptq_marlin_cuda # noqa: F401 48 | 49 | MARLIN_AVAILABLE = True 50 | MARLIN_EXCEPTION = None 51 | except Exception as e: 52 | MARLIN_AVAILABLE = False 53 | MARLIN_EXCEPTION = e 54 | 55 | 56 | logger = getLogger(__name__) 57 | 58 | 59 | def dynamically_import_QuantLinear( 60 | use_triton: bool, 61 | desc_act: bool, 62 | group_size: int, 63 | bits: int, 64 | disable_exllama: Optional[bool] = None, 65 | disable_exllamav2: bool = False, 66 | use_qigen: bool = False, 67 | disable_marlin: bool = True, 68 | ): 69 | if use_qigen: 70 | if not QIGEN_AVAILABLE: 71 | raise ValueError( 72 | f"QIGen appears to be not available with the error: {QIGEN_EXCEPTION}. Please check your installation or use `use_qigen=False`." 73 | ) 74 | from ..nn_modules.qlinear.qlinear_qigen import QuantLinear 75 | else: 76 | if use_triton: 77 | if torch.version.hip: 78 | logger.warning( 79 | "Running GPTQ triton version on AMD GPUs is untested and may result in errors or wrong predictions. Please use use_triton=False." 80 | ) 81 | 82 | from ..nn_modules.qlinear.qlinear_triton import QuantLinear 83 | else: 84 | # If disable_exllamav2 is True, we want to fall back on the exllama kernel and not the cuda/cuda_old ones. 
85 | if disable_exllama is None: 86 | if disable_exllamav2: 87 | disable_exllama = False 88 | else: 89 | disable_exllama = True 90 | if bits == 4 and not disable_marlin: 91 | from ..nn_modules.qlinear.qlinear_marlin import QuantLinear 92 | elif bits == 4 and not disable_exllamav2 and EXLLAMAV2_KERNELS_AVAILABLE: 93 | from ..nn_modules.qlinear.qlinear_exllamav2 import QuantLinear 94 | elif bits == 4 and not disable_exllama and EXLLAMA_KERNELS_AVAILABLE: 95 | from ..nn_modules.qlinear.qlinear_exllama import QuantLinear 96 | elif not desc_act or group_size == -1: 97 | from ..nn_modules.qlinear.qlinear_cuda_old import QuantLinear 98 | else: 99 | from ..nn_modules.qlinear.qlinear_cuda import QuantLinear 100 | 101 | return QuantLinear 102 | 103 | 104 | def compare_transformers_version(version: str = "v4.28.0", op: str = "eq"): 105 | assert op in ["eq", "lt", "le", "gt", "ge"] 106 | 107 | from transformers import __version__ 108 | 109 | return getattr(parse_version(__version__), f"__{op}__")(parse_version(version)) 110 | 111 | 112 | def compare_pytorch_version(version: str = "v2.0.0", op: str = "eq"): 113 | assert op in ["eq", "lt", "le", "gt", "ge"] 114 | 115 | from torch import __version__ 116 | 117 | return getattr(parse_version(__version__), f"__{op}__")(parse_version(version)) 118 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/auto_gptq/utils/modeling_utils.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | 4 | def recurse_getattr(obj, attr: str): 5 | """ 6 | Recursive `getattr`. 7 | 8 | Args: 9 | obj: 10 | A class instance holding the attribute. 11 | attr (`str`): 12 | The attribute that is to be retrieved, e.g. 'attribute1.attribute2'. 13 | """ 14 | 15 | def _getattr(obj, attr): 16 | return getattr(obj, attr) 17 | 18 | return functools.reduce(_getattr, [obj] + attr.split(".")) 19 | 20 | 21 | def recurse_setattr(module, name, value): 22 | """A function to recursively set attributes to a module.""" 23 | if "." not in name: 24 | setattr(module, name, value) 25 | else: 26 | name, rest = name.split(".", 1) 27 | recurse_setattr(getattr(module, name), rest, value) 28 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/modelutils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | DEV = torch.device('cuda:0') 6 | 7 | 8 | def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''): 9 | if type(module) in layers: 10 | return {name: module} 11 | res = {} 12 | for name1, child in module.named_children(): 13 | res.update(find_layers( 14 | child, layers=layers, name=name + '.' 
+ name1 if name != '' else name1 15 | )) 16 | return res 17 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/setup_cuda.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, Extension 2 | from torch.utils import cpp_extension 3 | 4 | setup( 5 | name='quant_cuda', 6 | ext_modules=[cpp_extension.CUDAExtension( 7 | 'quant_cuda', ['quant_cuda.cpp', 'quant_cuda_kernel.cu'] 8 | )], 9 | cmdclass={'build_ext': cpp_extension.BuildExtension} 10 | ) 11 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/AutoGPTQ/test_kernel.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | import quant_cuda 5 | 6 | torch.backends.cuda.matmul.allow_tf32 = False 7 | torch.backends.cudnn.allow_tf32 = False 8 | 9 | print('Benchmarking OPT-175B FC2 matvec ...') 10 | 11 | DEV = torch.device('cuda:0') 12 | 13 | M = 12288 * 4 14 | N = 12288 15 | 16 | DTYPE = torch.half 17 | mat = torch.randn((M, N), device=DEV, dtype=DTYPE) 18 | vec = torch.randn((1, M), device=DEV, dtype=DTYPE) 19 | mul = torch.zeros((1, N), device=DEV, dtype=DTYPE) 20 | 21 | COUNT = 1000 22 | import time 23 | tick = time.time() 24 | for _ in range(COUNT): 25 | torch.matmul(vec, mat, out=mul) 26 | torch.cuda.synchronize() 27 | print('FP16:', (time.time() - tick) / COUNT) 28 | 29 | DTYPE = torch.float 30 | mat = mat.to(DTYPE) 31 | vec = vec.to(DTYPE) 32 | mul = mul.to(DTYPE) 33 | 34 | mat = torch.randint(-1000000000, 1000000000, (M // 1024 * 96, N), device=DEV, dtype=torch.int) 35 | scales = torch.randn(N, device=DEV, dtype=DTYPE) 36 | zeros = torch.randn(N, device=DEV, dtype=DTYPE) 37 | 38 | COUNT = 1000 39 | import time 40 | tick = time.time() 41 | for _ in range(COUNT): 42 | quant_cuda.vecquant3matmul(vec, mat, mul, scales, zeros) 43 | torch.cuda.synchronize() 44 | print('3bit:', (time.time() - tick) / COUNT) 45 | 46 | COUNT = 1000 47 | import time 48 | tick = time.time() 49 | for _ in range(COUNT): 50 | quant_cuda.vecquant3matmul_faster(vec, mat, mul, scales, zeros) 51 | torch.cuda.synchronize() 52 | print('3bit:', (time.time() - tick) / COUNT, '(faster)') 53 | 54 | print('Verifiying kernel correctness ...') 55 | 56 | M = 4 * 4096 57 | N = 4096 58 | 59 | layer = nn.Linear(M, N) 60 | vec = torch.randn(M).to(DEV) 61 | 62 | from quant import * 63 | quantizer = Quantizer() 64 | quantizer.configure(3, perchannel=True, sym=False, mse=False) 65 | quantizer.find_params(layer.weight.data, weight=True) 66 | layer.weight.data = quantize( 67 | layer.weight.data, quantizer.scale, quantizer.zero, quantizer.maxq 68 | ) 69 | 70 | qlayer = Quant3Linear(layer.in_features, layer.out_features) 71 | qlayer.pack(layer, quantizer.scale, quantizer.zero) 72 | 73 | qlayer = qlayer.to(DEV) 74 | layer = layer.to(DEV) 75 | 76 | with torch.no_grad(): 77 | print('Simu:', layer.to(DEV)(vec)) 78 | print('Kern:', qlayer(vec)) 79 | qlayer.faster = True 80 | print('Kern:', qlayer(vec.half()), '(faster)') 81 | -------------------------------------------------------------------------------- /src/llmtuner/compression/quantization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/llmtuner/compression/quantization/__init__.py 
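A note on the two standalone scripts above: test_kernel.py imports the quant_cuda extension (and a quant module with Quantizer/Quant3Linear that is not part of this dump), so the extension built by setup_cuda.py must be installed first. A minimal sketch of that sequence, run from the AutoGPTQ directory shown here and expressed in Python via subprocess purely for illustration:

# Hypothetical build-and-run sequence; equivalent to typing the two commands in a shell.
import subprocess

subprocess.run(["python", "setup_cuda.py", "install"], check=True)  # compiles and installs the quant_cuda extension
subprocess.run(["python", "test_kernel.py"], check=True)            # runs the matvec benchmark and correctness check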
-------------------------------------------------------------------------------- /src/llmtuner/compression/tuner.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING, Any, Dict, List, Optional 2 | 3 | import torch 4 | from transformers import PreTrainedModel 5 | # from .dpo import run_dpo 6 | # from .ppo import run_ppo 7 | from llmtuner.compression.prune import run_prune 8 | # from .pt import run_pt 9 | # from .rm import run_rm 10 | # from .sft import run_sft 11 | from ..extras.callbacks import LogCallback 12 | from ..extras.logging import get_logger 13 | from ..hparams import get_infer_args, get_train_sparse_args 14 | from ..model import load_model_and_tokenizer 15 | 16 | if TYPE_CHECKING: 17 | from transformers import TrainerCallback 18 | 19 | logger = get_logger(__name__) 20 | 21 | 22 | def run_exp(args: Optional[Dict[str, Any]] = None, callbacks: Optional[List["TrainerCallback"]] = None): 23 | # model_args, data_args, training_args, finetuning_args, generating_args = get_train_args(args) 24 | model_args, data_args, training_args, finetuning_args, generating_args, pruning_args = get_train_sparse_args(args) 25 | callbacks = [LogCallback()] if callbacks is None else callbacks 26 | if finetuning_args.stage == "prune": # 🔍 27 | run_prune(model_args, data_args, training_args, finetuning_args, pruning_args, callbacks) 28 | 29 | def export_model(args: Optional[Dict[str, Any]] = None): 30 | model_args, _, finetuning_args, _ = get_infer_args(args) 31 | 32 | if model_args.export_dir is None: 33 | raise ValueError("Please specify `export_dir`.") 34 | 35 | if model_args.adapter_name_or_path is not None and model_args.export_quantization_bit is not None: 36 | raise ValueError("Please merge adapters before quantizing the model.") 37 | 38 | model, tokenizer = load_model_and_tokenizer(model_args, finetuning_args) 39 | 40 | if getattr(model, "quantization_method", None) and model_args.adapter_name_or_path is not None: 41 | raise ValueError("Cannot merge adapters to a quantized model.") 42 | 43 | if not isinstance(model, PreTrainedModel): 44 | raise ValueError("The model is not a `PreTrainedModel`, export aborted.") 45 | 46 | if getattr(model, "quantization_method", None): 47 | model = model.to("cpu") 48 | elif hasattr(model.config, "torch_dtype"): 49 | model = model.to(getattr(model.config, "torch_dtype")).to("cpu") 50 | else: 51 | model = model.to(torch.float16).to("cpu") 52 | setattr(model.config, "torch_dtype", torch.float16) 53 | 54 | model.save_pretrained( 55 | save_directory=model_args.export_dir, 56 | max_shard_size="{}GB".format(model_args.export_size), 57 | safe_serialization=(not model_args.export_legacy_format), 58 | ) 59 | if model_args.export_hub_model_id is not None: 60 | model.push_to_hub( 61 | model_args.export_hub_model_id, 62 | token=model_args.hf_hub_token, 63 | max_shard_size="{}GB".format(model_args.export_size), 64 | safe_serialization=(not model_args.export_legacy_format), 65 | ) 66 | 67 | try: 68 | tokenizer.padding_side = "left" # restore padding side 69 | tokenizer.init_kwargs["padding_side"] = "left" 70 | tokenizer.save_pretrained(model_args.export_dir) 71 | if model_args.export_hub_model_id is not None: 72 | tokenizer.push_to_hub(model_args.export_hub_model_id, token=model_args.hf_hub_token) 73 | except Exception: 74 | logger.warning("Cannot save tokenizer, please copy the files manually.") 75 | 76 | 77 | if __name__ == "__main__": 78 | run_exp() 79 | 
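To make the entry point above concrete, here is a minimal, hypothetical way to drive run_exp for the pruning/dropping stage. The argument keys are assumptions drawn from the dataclasses parsed by get_train_sparse_args (ModelArguments, DataArguments, TrainingArguments, FinetuningArguments, PruningArguments); the shell scripts under scripts/dropping/ remain the authoritative reference for the real flags.

# Hypothetical invocation of run_exp() for the "prune" stage (layer/block dropping).
# Key names and values below are illustrative guesses, not a verified configuration.
from llmtuner.compression.tuner import run_exp

run_exp(
    args={
        "stage": "prune",                                  # routed to run_prune() above
        "model_name_or_path": "meta-llama/Llama-2-7b-hf",  # placeholder model id
        "dataset": "c4",                                   # placeholder calibration dataset name
        "output_dir": "./outputs/llama2-7b-dropped",
        "do_train": False,
    }
)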
-------------------------------------------------------------------------------- /src/llmtuner/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .loader import get_dataset 2 | from .template import get_template_and_fix_tokenizer, templates 3 | from .utils import Role, split_dataset 4 | 5 | 6 | __all__ = ["get_dataset", "get_template_and_fix_tokenizer", "templates", "Role", "split_dataset"] 7 | -------------------------------------------------------------------------------- /src/llmtuner/data/test_data.py: -------------------------------------------------------------------------------- 1 | from loader import load_single_dataset 2 | 3 | load_single_dataset(dataset_attr, model_args, data_args) -------------------------------------------------------------------------------- /src/llmtuner/data/utils.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | from enum import Enum, unique 3 | from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union 4 | 5 | from ..extras.logging import get_logger 6 | 7 | 8 | if TYPE_CHECKING: 9 | from datasets import Dataset, IterableDataset 10 | from transformers import TrainingArguments 11 | 12 | from llmtuner.hparams import DataArguments 13 | 14 | 15 | logger = get_logger(__name__) 16 | 17 | 18 | @unique 19 | class Role(str, Enum): 20 | USER = "user" 21 | ASSISTANT = "assistant" 22 | SYSTEM = "system" 23 | FUNCTION = "function" 24 | OBSERVATION = "observation" 25 | 26 | 27 | def checksum(data_files: List[str], file_sha1: Optional[str] = None) -> None: 28 | if file_sha1 is None: 29 | logger.warning("Checksum failed: missing SHA-1 hash value in dataset_info.json.") 30 | return 31 | 32 | if len(data_files) != 1: 33 | logger.warning("Checksum failed: too many files.") 34 | return 35 | 36 | with open(data_files[0], "rb") as f: 37 | sha1 = hashlib.sha1(f.read()).hexdigest() 38 | if sha1 != file_sha1: 39 | logger.warning("Checksum failed: mismatched SHA-1 hash value at {}.".format(data_files[0])) 40 | 41 | 42 | def infer_max_len(source_len: int, target_len: int, max_len: int, reserved_label_len: int) -> Tuple[int, int]: 43 | max_target_len = int(max_len * (target_len / (source_len + target_len))) 44 | max_target_len = max(max_target_len, reserved_label_len) 45 | max_source_len = max_len - max_target_len 46 | return max_source_len, max_target_len 47 | 48 | 49 | def split_dataset( 50 | dataset: Union["Dataset", "IterableDataset"], data_args: "DataArguments", training_args: "TrainingArguments" 51 | ) -> Dict[str, "Dataset"]: 52 | if training_args.do_train: 53 | if data_args.val_size > 1e-6: # Split the dataset 54 | if data_args.streaming: 55 | val_set = dataset.take(int(data_args.val_size)) 56 | train_set = dataset.skip(int(data_args.val_size)) 57 | dataset = dataset.shuffle(buffer_size=data_args.buffer_size, seed=training_args.seed) 58 | return {"train_dataset": train_set, "eval_dataset": val_set} 59 | else: 60 | val_size = int(data_args.val_size) if data_args.val_size > 1 else data_args.val_size 61 | dataset = dataset.train_test_split(test_size=val_size, seed=training_args.seed) 62 | return {"train_dataset": dataset["train"], "eval_dataset": dataset["test"]} 63 | else: 64 | if data_args.streaming: 65 | dataset = dataset.shuffle(buffer_size=data_args.buffer_size, seed=training_args.seed) 66 | return {"train_dataset": dataset} 67 | else: # do_eval or do_predict 68 | return {"eval_dataset": dataset} 69 | 
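A short usage sketch of infer_max_len from utils.py above, showing how the cutoff budget is split: the target share is proportional to its raw length, floored at reserved_label_len, and the source gets the remainder. The import path assumes the package layout shown in this repo.

# Worked example for infer_max_len(): 512 * (100 / 400) = 128 tokens go to the target,
# which is above the reserved floor of 16, leaving 512 - 128 = 384 tokens for the source.
from llmtuner.data.utils import infer_max_len

max_source_len, max_target_len = infer_max_len(
    source_len=300, target_len=100, max_len=512, reserved_label_len=16
)
assert (max_source_len, max_target_len) == (384, 128)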
-------------------------------------------------------------------------------- /src/llmtuner/extras/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/llmtuner/extras/__init__.py -------------------------------------------------------------------------------- /src/llmtuner/extras/logging.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | 4 | 5 | class LoggerHandler(logging.Handler): 6 | r""" 7 | Logger handler used in Web UI. 8 | """ 9 | 10 | def __init__(self): 11 | super().__init__() 12 | self.log = "" 13 | 14 | def reset(self): 15 | self.log = "" 16 | 17 | def emit(self, record): 18 | if record.name == "httpx": 19 | return 20 | log_entry = self.format(record) 21 | self.log += log_entry 22 | self.log += "\n\n" 23 | 24 | 25 | def get_logger(name: str) -> logging.Logger: 26 | r""" 27 | Gets a standard logger with a stream hander to stdout. 28 | """ 29 | formatter = logging.Formatter( 30 | fmt="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S" 31 | ) 32 | handler = logging.StreamHandler(sys.stdout) 33 | handler.setFormatter(formatter) 34 | 35 | logger = logging.getLogger(name) 36 | logger.setLevel(logging.INFO) 37 | logger.addHandler(handler) 38 | 39 | return logger 40 | 41 | 42 | def reset_logging() -> None: 43 | r""" 44 | Removes basic config of root logger. (unused in script) 45 | """ 46 | root = logging.getLogger() 47 | list(map(root.removeHandler, root.handlers)) 48 | list(map(root.removeFilter, root.filters)) 49 | -------------------------------------------------------------------------------- /src/llmtuner/extras/packages.py: -------------------------------------------------------------------------------- 1 | import importlib.metadata 2 | import importlib.util 3 | 4 | 5 | def _is_package_available(name: str) -> bool: 6 | return importlib.util.find_spec(name) is not None 7 | 8 | 9 | def _get_package_version(name: str) -> str: 10 | try: 11 | return importlib.metadata.version(name) 12 | except Exception: 13 | return "0.0.0" 14 | 15 | 16 | def is_fastapi_availble(): 17 | return _is_package_available("fastapi") 18 | 19 | 20 | def is_flash_attn2_available(): 21 | return _is_package_available("flash_attn") and _get_package_version("flash_attn").startswith("2") 22 | 23 | 24 | def is_jieba_available(): 25 | return _is_package_available("jieba") 26 | 27 | 28 | def is_matplotlib_available(): 29 | return _is_package_available("matplotlib") 30 | 31 | 32 | def is_nltk_available(): 33 | return _is_package_available("nltk") 34 | 35 | 36 | def is_requests_available(): 37 | return _is_package_available("requests") 38 | 39 | 40 | def is_rouge_available(): 41 | return _is_package_available("rouge_chinese") 42 | 43 | 44 | def is_starlette_available(): 45 | return _is_package_available("sse_starlette") 46 | 47 | 48 | def is_unsloth_available(): 49 | return _is_package_available("unsloth") 50 | 51 | 52 | def is_uvicorn_available(): 53 | return _is_package_available("uvicorn") 54 | -------------------------------------------------------------------------------- /src/llmtuner/extras/patches/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/llmtuner/extras/patches/__init__.py 
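Typical usage of the two helper modules above (extras.logging and extras.packages); a minimal sketch, with the warning text being illustrative only.

# Minimal sketch combining get_logger() with one of the availability probes above.
from llmtuner.extras.logging import get_logger
from llmtuner.extras.packages import is_flash_attn2_available

logger = get_logger(__name__)

if not is_flash_attn2_available():
    logger.warning("flash_attn 2.x not detected; the default attention implementation will be used.")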
-------------------------------------------------------------------------------- /src/llmtuner/extras/patches/llama_patch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/llmtuner/extras/patches/llama_patch.py -------------------------------------------------------------------------------- /src/llmtuner/extras/patches/mixtral_patch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CASE-Lab-UMD/LLM-Drop/8cbe3e117e26172ce6c0fea8db131053b10e8327/src/llmtuner/extras/patches/mixtral_patch.py -------------------------------------------------------------------------------- /src/llmtuner/extras/ploting.py: -------------------------------------------------------------------------------- 1 | import json 2 | import math 3 | import os 4 | from typing import List, Optional 5 | 6 | from transformers.trainer import TRAINER_STATE_NAME 7 | 8 | from .logging import get_logger 9 | from .packages import is_matplotlib_available 10 | 11 | 12 | if is_matplotlib_available(): 13 | import matplotlib.pyplot as plt 14 | 15 | 16 | logger = get_logger(__name__) 17 | 18 | 19 | def smooth(scalars: List[float]) -> List[float]: 20 | r""" 21 | EMA implementation according to TensorBoard. 22 | """ 23 | last = scalars[0] 24 | smoothed = list() 25 | weight = 1.8 * (1 / (1 + math.exp(-0.05 * len(scalars))) - 0.5) # a sigmoid function 26 | for next_val in scalars: 27 | smoothed_val = last * weight + (1 - weight) * next_val 28 | smoothed.append(smoothed_val) 29 | last = smoothed_val 30 | return smoothed 31 | 32 | 33 | def plot_loss(save_dictionary: os.PathLike, keys: Optional[List[str]] = ["loss"]) -> None: 34 | with open(os.path.join(save_dictionary, TRAINER_STATE_NAME), "r", encoding="utf-8") as f: 35 | data = json.load(f) 36 | 37 | for key in keys: 38 | steps, metrics = [], [] 39 | for i in range(len(data["log_history"])): 40 | if key in data["log_history"][i]: 41 | steps.append(data["log_history"][i]["step"]) 42 | metrics.append(data["log_history"][i][key]) 43 | 44 | if len(metrics) == 0: 45 | logger.warning(f"No metric {key} to plot.") 46 | continue 47 | 48 | plt.figure() 49 | plt.plot(steps, metrics, alpha=0.4, label="original") 50 | plt.plot(steps, smooth(metrics), label="smoothed") 51 | plt.title("training {} of {}".format(key, save_dictionary)) 52 | plt.xlabel("step") 53 | plt.ylabel(key) 54 | plt.legend() 55 | plt.savefig(os.path.join(save_dictionary, "training_{}.png".format(key)), format="png", dpi=100) 56 | print("Figure saved:", os.path.join(save_dictionary, "training_{}.png".format(key))) 57 | -------------------------------------------------------------------------------- /src/llmtuner/hparams/__init__.py: -------------------------------------------------------------------------------- 1 | from .data_args import DataArguments 2 | from .evaluation_args import EvaluationArguments 3 | from .finetuning_args import FinetuningArguments 4 | from .generating_args import GeneratingArguments 5 | from .model_args import ModelArguments 6 | from .pruning_args import PruningArguments 7 | 8 | from .parser import get_eval_args, get_infer_args, get_train_args, get_eval_sparse_args, get_train_sparse_args 9 | 10 | 11 | __all__ = [ 12 | "DataArguments", 13 | "EvaluationArguments", 14 | "FinetuningArguments", 15 | "GeneratingArguments", 16 | "ModelArguments", 17 | "PruningArguments", 18 | "get_eval_args", 19 | "get_infer_args", 20 | 
"get_train_args", 21 | "get_train_sparse_args" 22 | "get_eval_sparse_args", 23 | ] 24 | -------------------------------------------------------------------------------- /src/llmtuner/hparams/evaluation_args.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass, field 3 | from typing import Literal, Optional 4 | 5 | from datasets import DownloadMode 6 | 7 | 8 | @dataclass 9 | class EvaluationArguments: 10 | r""" 11 | Arguments pertaining to specify the evaluation parameters. 12 | """ 13 | 14 | task: str = field( 15 | metadata={"help": "Name of the evaluation task."}, 16 | ) 17 | task_dir: Optional[str] = field( 18 | default="evaluation", 19 | metadata={"help": "Path to the folder containing the evaluation datasets."}, 20 | ) 21 | batch_size: Optional[int] = field( 22 | default=4, 23 | metadata={"help": "The batch size per GPU for evaluation."}, 24 | ) 25 | seed: Optional[int] = field( 26 | default=42, 27 | metadata={"help": "Random seed to be used with data loaders."}, 28 | ) 29 | lang: Optional[Literal["en", "zh"]] = field( 30 | default="en", 31 | metadata={"help": "Language used at evaluation."}, 32 | ) 33 | n_shot: Optional[int] = field( 34 | default=5, 35 | metadata={"help": "Number of examplars for few-shot learning."}, 36 | ) 37 | save_dir: Optional[str] = field( 38 | default=None, 39 | metadata={"help": "Path to save the evaluation results."}, 40 | ) 41 | download_mode: Optional[DownloadMode] = field( 42 | default=DownloadMode.REUSE_DATASET_IF_EXISTS, 43 | metadata={"help": "Download mode used for the evaluation datasets."}, 44 | ) 45 | 46 | def __post_init__(self): 47 | if self.save_dir is not None and os.path.exists(self.save_dir): 48 | raise ValueError("`save_dir` already exists, use another one.") 49 | -------------------------------------------------------------------------------- /src/llmtuner/hparams/generating_args.py: -------------------------------------------------------------------------------- 1 | from dataclasses import asdict, dataclass, field 2 | from typing import Any, Dict, Optional 3 | 4 | 5 | @dataclass 6 | class GeneratingArguments: 7 | r""" 8 | Arguments pertaining to specify the decoding parameters. 9 | """ 10 | 11 | do_sample: Optional[bool] = field( 12 | default=True, 13 | metadata={"help": "Whether or not to use sampling, use greedy decoding otherwise."}, 14 | ) 15 | temperature: Optional[float] = field( 16 | default=0.95, 17 | metadata={"help": "The value used to modulate the next token probabilities."}, 18 | ) 19 | top_p: Optional[float] = field( 20 | default=0.7, 21 | metadata={ 22 | "help": "The smallest set of most probable tokens with probabilities that add up to top_p or higher are kept." 23 | }, 24 | ) 25 | top_k: Optional[int] = field( 26 | default=50, 27 | metadata={"help": "The number of highest probability vocabulary tokens to keep for top-k filtering."}, 28 | ) 29 | num_beams: Optional[int] = field( 30 | default=1, 31 | metadata={"help": "Number of beams for beam search. 1 means no beam search."}, 32 | ) 33 | max_length: Optional[int] = field( 34 | default=512, 35 | metadata={"help": "The maximum length the generated tokens can have. 
It can be overridden by max_new_tokens."}, 36 | ) 37 | max_new_tokens: Optional[int] = field( 38 | default=512, 39 | metadata={"help": "The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt."}, 40 | ) 41 | repetition_penalty: Optional[float] = field( 42 | default=1.0, 43 | metadata={"help": "The parameter for repetition penalty. 1.0 means no penalty."}, 44 | ) 45 | length_penalty: Optional[float] = field( 46 | default=1.0, 47 | metadata={"help": "Exponential penalty to the length that is used with beam-based generation."}, 48 | ) 49 | 50 | def to_dict(self) -> Dict[str, Any]: 51 | args = asdict(self) 52 | if args.get("max_new_tokens", -1) > 0: 53 | args.pop("max_length", None) 54 | else: 55 | args.pop("max_new_tokens", None) 56 | return args 57 | -------------------------------------------------------------------------------- /src/llmtuner/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .loader import load_model_and_tokenizer, load_tokenizer 2 | from .utils import dispatch_model, load_valuehead_params 3 | 4 | 5 | __all__ = ["load_model_and_tokenizer", "load_tokenizer", "dispatch_model", "load_valuehead_params"] 6 | --------------------------------------------------------------------------------
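Closing note on the AutoGPTQ modeling files earlier in this dump (deepseek.py, gemma.py, llama.py, mistral.py, and the rest): they all follow one pattern, so supporting a new decoder-only architecture is mostly a matter of filling in four class attributes. A hypothetical sketch follows, where "MyModel" and its module names are placeholders; the new class typically also has to be registered in modeling/auto.py, whose contents are not shown here.

# Hypothetical template following the pattern of llama.py / mistral.py above.
# "MyModel" and the module names are placeholders, not a real architecture.
from ._base import BaseGPTQForCausalLM


class MyModelGPTQForCausalLM(BaseGPTQForCausalLM):
    layer_type = "MyModelDecoderLayer"      # class name of a single decoder block
    layers_block_name = "model.layers"      # attribute path to the stack of decoder blocks
    outside_layer_modules = ["model.embed_tokens", "model.norm"]  # modules outside the block stack
    inside_layer_modules = [                # linear groups inside each block, quantized in this order
        ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
        ["self_attn.o_proj"],
        ["mlp.up_proj", "mlp.gate_proj"],
        ["mlp.down_proj"],
    ]


__all__ = ["MyModelGPTQForCausalLM"]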